├── .github ├── CODEOWNERS └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .gitmodules ├── CONTRIBUTING.md ├── MLPERF4.0 ├── Inference │ ├── functions.sh │ ├── llama │ │ ├── README.md │ │ ├── SUT.py │ │ ├── configs │ │ │ └── fp8.conf │ │ ├── evaluation.py │ │ ├── hqt │ │ │ └── llama2-70b-8x │ │ │ │ ├── config_meas_maxabs_quant_MAXABS_HW.json │ │ │ │ ├── measure_hooks_maxabs_0_8.json │ │ │ │ ├── measure_hooks_maxabs_0_8.npz │ │ │ │ ├── measure_hooks_maxabs_0_8_mod_list.json │ │ │ │ ├── measure_hooks_maxabs_1_8.json │ │ │ │ ├── measure_hooks_maxabs_1_8.npz │ │ │ │ ├── measure_hooks_maxabs_1_8_mod_list.json │ │ │ │ ├── measure_hooks_maxabs_2_8.json │ │ │ │ ├── measure_hooks_maxabs_2_8.npz │ │ │ │ ├── measure_hooks_maxabs_2_8_mod_list.json │ │ │ │ ├── measure_hooks_maxabs_3_8.json │ │ │ │ ├── measure_hooks_maxabs_3_8.npz │ │ │ │ ├── measure_hooks_maxabs_3_8_mod_list.json │ │ │ │ ├── measure_hooks_maxabs_4_8.json │ │ │ │ ├── measure_hooks_maxabs_4_8.npz │ │ │ │ ├── measure_hooks_maxabs_4_8_mod_list.json │ │ │ │ ├── measure_hooks_maxabs_5_8.json │ │ │ │ ├── measure_hooks_maxabs_5_8.npz │ │ │ │ ├── measure_hooks_maxabs_5_8_mod_list.json │ │ │ │ ├── measure_hooks_maxabs_6_8.json │ │ │ │ ├── measure_hooks_maxabs_6_8.npz │ │ │ │ ├── measure_hooks_maxabs_6_8_mod_list.json │ │ │ │ ├── measure_hooks_maxabs_7_8.json │ │ │ │ ├── measure_hooks_maxabs_7_8.npz │ │ │ │ ├── measure_hooks_maxabs_7_8_mod_list.json │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_0_8.json │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_0_8.npz │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_1_8.json │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_1_8.npz │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_2_8.json │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_2_8.npz │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_3_8.json │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_3_8.npz │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_4_8.json │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_4_8.npz │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_5_8.json │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_5_8.npz │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_6_8.json │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_6_8.npz │ │ │ │ ├── measure_hooks_maxabs_MAXABS_HW_7_8.json │ │ │ │ └── measure_hooks_maxabs_MAXABS_HW_7_8.npz │ │ ├── llama_greedy.py │ │ ├── main.py │ │ ├── mlperf.conf │ │ ├── processorca.py │ │ ├── quantization_config │ │ │ ├── act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json │ │ │ ├── maxabs_measure.json │ │ │ ├── maxabs_pcq_measure.json │ │ │ ├── maxabs_quant.json │ │ │ ├── shape_measure.json │ │ │ ├── unit_scale_quant.json │ │ │ └── without_scale_quant.json │ │ ├── requirements.txt │ │ ├── run_tgi_server.sh │ │ ├── setup_tgi.sh │ │ └── utils.py │ ├── prepare_and_check_submission.py │ ├── run_mlperf_scenarios.py │ ├── scenarios.yaml │ └── stable-diffusion-xl │ │ ├── README.md │ │ ├── attention_processor.py │ │ ├── backend.py │ │ ├── backend_debug.py │ │ ├── backend_pytorch.py │ │ ├── coco.py │ │ ├── coco2014 │ │ ├── calibration │ │ │ └── coco_cal_captions_list.txt │ │ └── captions │ │ │ └── captions_source.tsv │ │ ├── configs │ │ └── user.conf │ │ ├── dataset.py │ │ ├── evaluation.py │ │ ├── hpu_multicard.py │ │ ├── main.py │ │ ├── mlperf.conf │ │ ├── pipeline_stable_diffusion_xl_hpu.py │ │ ├── requirements.txt │ │ ├── scheduling_euler_discrete_hpu.py │ │ ├── tools │ │ ├── accuracy_coco.py │ │ ├── check_latents.py │ │ ├── clip │ │ │ └── clip_encoder.py │ │ ├── coco.py │ │ ├── coco_calibration.py │ │ ├── coco_generate_calibration.py │ │ ├── download-coco-2014-calibration.sh │ │ ├── 
download-coco-2014.sh │ │ ├── fid │ │ │ ├── README.md │ │ │ ├── fid_score.py │ │ │ └── inception.py │ │ ├── generate_fp32_weights.py │ │ ├── latent.py │ │ ├── latents.npy │ │ ├── latents.pt │ │ ├── measure.sh │ │ ├── quantize │ │ │ ├── measure_all │ │ │ │ ├── fp8_hooks_maxabs.json │ │ │ │ ├── fp8_hooks_maxabs.npz │ │ │ │ ├── fp8_hooks_maxabs_0_8.npz │ │ │ │ ├── fp8_hooks_maxabs_1_8.npz │ │ │ │ ├── fp8_hooks_maxabs_2_8.npz │ │ │ │ ├── fp8_hooks_maxabs_3_8.npz │ │ │ │ ├── fp8_hooks_maxabs_4_8.npz │ │ │ │ ├── fp8_hooks_maxabs_5_8.npz │ │ │ │ ├── fp8_hooks_maxabs_6_8.npz │ │ │ │ └── fp8_hooks_maxabs_7_8.npz │ │ │ ├── measure_config.json │ │ │ ├── quant_config.json │ │ │ └── quant_config_bmm.json │ │ ├── sample_ids.py │ │ └── sample_ids.txt │ │ └── unet_2d_condition_hpu.py └── Training │ └── benchmarks │ ├── gpt3 │ ├── CODEOWNERS │ ├── LICENSE │ ├── MANIFEST.in │ ├── README.md │ ├── SECURITY.md │ ├── dataset │ │ ├── README.md │ │ ├── download_books.sh │ │ ├── download_ckpt.sh │ │ └── download_vocab.sh │ ├── examples │ │ ├── MoE │ │ │ ├── ds_config_gpt_TEMPLATE.json │ │ │ ├── ds_config_gpt_Zero2_TEMPLATE.json │ │ │ ├── ds_evalharness.sh │ │ │ ├── ds_pretrain_gpt_1.3B_MoE128.sh │ │ │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128.sh │ │ │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128_MoS.sh │ │ │ ├── ds_pretrain_gpt_1.3B_dense.sh │ │ │ ├── ds_pretrain_gpt_1.3B_dense_cl.sh │ │ │ ├── ds_pretrain_gpt_125M_MoE64.sh │ │ │ ├── ds_pretrain_gpt_125M_dense_cl.sh │ │ │ ├── ds_pretrain_gpt_350M_MoE128.sh │ │ │ ├── ds_pretrain_gpt_350M_PR-MoE32or64.sh │ │ │ ├── ds_pretrain_gpt_350M_PR-MoE32or64_MoS.sh │ │ │ ├── ds_pretrain_gpt_350M_dense.sh │ │ │ ├── ds_pretrain_gpt_6.7B_dense.sh │ │ │ └── readme_evalharness.md │ │ ├── README.md │ │ ├── azure │ │ │ ├── README.md │ │ │ ├── run-175b.sh │ │ │ ├── run-1t.sh │ │ │ └── run-benchmark-model.sh │ │ ├── azureml │ │ │ ├── Dockerfile.dockerfile │ │ │ ├── README.md │ │ │ ├── aml_submit.py │ │ │ └── prepare_dataset.py │ │ ├── compression │ │ │ ├── 125M-Int8-test-64gpu-distilled-group48.sh │ │ │ ├── 125M-L10-Int8-test-64gpu-distilled-group48.sh │ │ │ ├── 125M-L12-Int8-test-64gpu-distilled-group48.sh │ │ │ ├── ds_config_gpt_TEMPLATE.json │ │ │ ├── ds_config_gpt_TEMPLATE_compression.json │ │ │ ├── ds_evalharness.sh │ │ │ ├── ds_pretrain_gpt_1.3B_dense_cl_kd.sh │ │ │ ├── ds_pretrain_gpt_125M_dense_cl_kd.sh │ │ │ ├── ds_pretrain_gpt_125M_dense_kd.sh │ │ │ └── ds_pretrain_gpt_350M_dense_kd.sh │ │ ├── create_embeddings.sh │ │ ├── curriculum_learning │ │ │ ├── README.md │ │ │ ├── ds_pretrain_gpt2.sh │ │ │ ├── ds_train.sh │ │ │ ├── ds_zero_stage_1_config_baseline.json │ │ │ └── ds_zero_stage_1_config_curriculum_fixed_linear.json │ │ ├── evaluate_ict_zeroshot_nq.sh │ │ ├── evaluate_zeroshot_gpt.sh │ │ ├── finetune_mnli_distributed.sh │ │ ├── finetune_race_distributed.sh │ │ ├── generate_text.sh │ │ ├── merge_mp_bert.sh │ │ ├── pretrain_bert.sh │ │ ├── pretrain_bert_distributed.sh │ │ ├── pretrain_bert_distributed_with_mp.sh │ │ ├── pretrain_gpt.sh │ │ ├── pretrain_gpt3_175B.sh │ │ ├── pretrain_gpt_distributed.sh │ │ ├── pretrain_gpt_distributed_with_mp.sh │ │ ├── pretrain_ict.sh │ │ ├── pretrain_t5.sh │ │ ├── pretrain_t5_distributed.sh │ │ ├── pretrain_t5_distributed_with_mp.sh │ │ └── run_deepspeed_example.sh │ ├── images │ │ └── cases_april2021.png │ ├── megatron │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── checkpointing.py │ │ ├── data │ │ │ ├── Makefile │ │ │ ├── __init__.py │ │ │ ├── autoaugment.py │ │ │ ├── bert_dataset.py │ │ │ ├── biencoder_dataset_utils.py │ │ │ ├── 
blendable_dataset.py │ │ │ ├── data_samplers.py │ │ │ ├── dataset_utils.py │ │ │ ├── gpt_dataset.py │ │ │ ├── helpers.cpp │ │ │ ├── ict_dataset.py │ │ │ ├── indexed_dataset.py │ │ │ ├── orqa_wiki_dataset.py │ │ │ ├── realm_dataset_utils.py │ │ │ ├── realm_index.py │ │ │ ├── t5_dataset.py │ │ │ ├── test │ │ │ │ ├── test_indexed_dataset.py │ │ │ │ └── test_preprocess_data.sh │ │ │ └── vit_dataset.py │ │ ├── enums.py │ │ ├── fp16_deprecated │ │ │ └── loss_scaler.py │ │ ├── fused_kernels │ │ │ ├── __init__.py │ │ │ ├── compat.h │ │ │ ├── layer_norm_cuda.cpp │ │ │ ├── layer_norm_cuda_kernel.cu │ │ │ ├── scaled_masked_softmax.cpp │ │ │ ├── scaled_masked_softmax.h │ │ │ ├── scaled_masked_softmax_cuda.cu │ │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ │ ├── scaled_upper_triang_masked_softmax.h │ │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ │ │ └── type_shim.h │ │ ├── global_vars.py │ │ ├── indexer.py │ │ ├── initialize.py │ │ ├── learning_rates.py │ │ ├── memory.py │ │ ├── microbatches.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── bert_model.py │ │ │ ├── biencoder_model.py │ │ │ ├── classification.py │ │ │ ├── distributed.py │ │ │ ├── enums.py │ │ │ ├── fused_bias_gelu.py │ │ │ ├── fused_layer_norm.py │ │ │ ├── fused_softmax.py │ │ │ ├── gpt_model.py │ │ │ ├── language_model.py │ │ │ ├── llama_model.py │ │ │ ├── module.py │ │ │ ├── multiple_choice.py │ │ │ ├── positional_embeddings.py │ │ │ ├── realm_model.py │ │ │ ├── rmsnorm.py │ │ │ ├── t5_model.py │ │ │ ├── transformer.py │ │ │ ├── utils.py │ │ │ └── vit_model.py │ │ ├── mpu │ │ │ ├── __init__.py │ │ │ ├── cross_entropy.py │ │ │ ├── data.py │ │ │ ├── initialize.py │ │ │ ├── layers.py │ │ │ ├── mappings.py │ │ │ ├── random.py │ │ │ ├── tests │ │ │ │ ├── __init__.py │ │ │ │ ├── commons.py │ │ │ │ ├── test_cross_entropy.py │ │ │ │ ├── test_data.py │ │ │ │ ├── test_initialize.py │ │ │ │ ├── test_layers.py │ │ │ │ └── test_random.py │ │ │ └── utils.py │ │ ├── optimizer │ │ │ ├── __init__.py │ │ │ ├── clip_grads.py │ │ │ ├── grad_scaler.py │ │ │ └── optimizer.py │ │ ├── p2p_communication.py │ │ ├── package_info.py │ │ ├── profiler.py │ │ ├── schedules.py │ │ ├── testing_utils.py │ │ ├── text_generation_utils.py │ │ ├── tokenizer │ │ │ ├── __init__.py │ │ │ ├── bert_tokenization.py │ │ │ ├── gpt2_tokenization.py │ │ │ ├── sentencepiece_tokenization.py │ │ │ └── tokenizer.py │ │ ├── training.py │ │ └── utils.py │ ├── pretrain_bert.py │ ├── pretrain_gpt.py │ ├── pretrain_ict.py │ ├── pretrain_t5.py │ ├── pretrain_vit.py │ ├── requirements.txt │ ├── run_gpt.sh │ ├── setup.py │ ├── tasks │ │ ├── data_utils.py │ │ ├── detok.py │ │ ├── ensemble_classifier.py │ │ ├── eval_harness │ │ │ ├── download.py │ │ │ ├── evaluate.py │ │ │ └── report-to-csv.py │ │ ├── eval_utils.py │ │ ├── finetune_utils.py │ │ ├── glue │ │ │ ├── data.py │ │ │ ├── finetune.py │ │ │ ├── mnli.py │ │ │ └── qqp.py │ │ ├── main.py │ │ ├── main_3d.py │ │ ├── orqa │ │ │ ├── evaluate_orqa.py │ │ │ ├── evaluate_utils.py │ │ │ └── natural_questions │ │ │ │ ├── nq.py │ │ │ │ ├── qa_utils.py │ │ │ │ └── tokenizers.py │ │ ├── race │ │ │ ├── data.py │ │ │ └── finetune.py │ │ ├── tasks_args.py │ │ ├── vision │ │ │ ├── classification.py │ │ │ ├── eval_utils.py │ │ │ ├── finetune_utils.py │ │ │ └── main.py │ │ └── zeroshot_gpt │ │ │ ├── datasets.py │ │ │ ├── detokenizer.py │ │ │ └── evaluate.py │ ├── tests │ │ ├── ds_config_bf16.json │ │ ├── test_basic.py │ │ ├── test_checkpoints.py │ │ └── test_training.py │ └── tools │ │ ├── __init__.py │ │ ├── convert_checkpoint │ │ ├── README.md │ 
│ ├── __init__.py │ │ ├── common_bf16.json │ │ ├── convert_paxml_optimizer.py │ │ ├── deepspeed_checkpoint.py │ │ ├── deepspeed_to_megatron.py │ │ ├── deepspeed_to_transformers.py │ │ ├── ds_to_universal.py │ │ ├── inspect_checkpoint.py │ │ ├── inspect_deepspeed_checkpoint.py │ │ ├── megatron_optim_merge.py │ │ ├── megatron_optim_merged_to_ds_universal_convert.py │ │ └── verify_checkpoint_non_tp_consistency.py │ │ ├── create_doc_index.py │ │ ├── create_synthetic_dataset.py │ │ ├── generate_samples_gpt.py │ │ ├── linter.py │ │ ├── merge_mp_partitions.py │ │ ├── openwebtext │ │ ├── README.md │ │ ├── add_id.py │ │ ├── blacklist_urls.py │ │ ├── cleanup_dataset.py │ │ ├── cleanup_fix_dataset.py │ │ ├── filter_ngrams.py │ │ ├── find_duplicates.py │ │ ├── group_duplicate_url.py │ │ ├── merge_jsons.py │ │ └── remove_group_duplicates.py │ │ └── preprocess_data.py │ └── llm_finetune │ ├── LICENSE.md │ ├── README.md │ ├── config.json │ ├── configs │ └── ds_zero3.json │ ├── ops_bf16.txt │ ├── requirements.txt │ ├── run_llama_70B_fp8_submission.sh │ └── scripts │ ├── create_warmup_data.py │ ├── gaudi_spawn.py │ ├── mlperf_logging_utils.py │ ├── train.py │ └── utils.py ├── PyTorch ├── __init__.py ├── audio │ └── wav2vec2 │ │ └── inference │ │ ├── LICENSE │ │ ├── README.md │ │ ├── librispeech_asr_test_clean.py │ │ ├── requirements.txt │ │ └── wav2vec.py ├── computer_vision │ ├── classification │ │ ├── ViT │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── img │ │ │ │ ├── figure1.png │ │ │ │ ├── figure2.png │ │ │ │ └── figure3.png │ │ │ ├── models │ │ │ │ ├── configs.py │ │ │ │ ├── modeling.py │ │ │ │ └── modeling_resnet.py │ │ │ ├── ops_bf16.txt │ │ │ ├── ops_fp32.txt │ │ │ ├── requirements.txt │ │ │ ├── train.py │ │ │ ├── visualize_attention_map.ipynb │ │ │ └── vit_utils │ │ │ │ ├── data_utils.py │ │ │ │ ├── dist_util.py │ │ │ │ └── scheduler.py │ │ └── torchvision │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── data_loaders.py │ │ │ ├── googlenet_utils.py │ │ │ ├── inference.py │ │ │ ├── main.py │ │ │ ├── media_pipe_settings.py │ │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── optimizer.py │ │ │ ├── resnet.py │ │ │ └── utils.py │ │ │ ├── ops_bf16_Resnet.txt │ │ │ ├── ops_fp32_Resnet.txt │ │ │ ├── requirements.txt │ │ │ ├── requirements_u24.txt │ │ │ ├── resnet_media_pipe.py │ │ │ ├── train.py │ │ │ └── utils.py │ ├── detection │ │ └── yolox │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── assets │ │ │ ├── demo.png │ │ │ ├── dog.jpg │ │ │ ├── git_fig.png │ │ │ └── logo.png │ │ │ ├── demo │ │ │ ├── MegEngine │ │ │ │ ├── cpp │ │ │ │ │ ├── README.md │ │ │ │ │ ├── build.sh │ │ │ │ │ └── yolox.cpp │ │ │ │ └── python │ │ │ │ │ ├── README.md │ │ │ │ │ ├── build.py │ │ │ │ │ ├── convert_weights.py │ │ │ │ │ ├── demo.py │ │ │ │ │ ├── dump.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── darknet.py │ │ │ │ │ ├── network_blocks.py │ │ │ │ │ ├── yolo_fpn.py │ │ │ │ │ ├── yolo_head.py │ │ │ │ │ ├── yolo_pafpn.py │ │ │ │ │ └── yolox.py │ │ │ ├── ONNXRuntime │ │ │ │ ├── README.md │ │ │ │ └── onnx_inference.py │ │ │ ├── OpenVINO │ │ │ │ ├── README.md │ │ │ │ ├── cpp │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ ├── README.md │ │ │ │ │ └── yolox_openvino.cpp │ │ │ │ └── python │ │ │ │ │ ├── README.md │ │ │ │ │ └── openvino_inference.py │ │ │ ├── TensorRT │ │ │ │ ├── cpp │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ ├── README.md │ │ │ │ │ ├── logging.h │ │ │ │ │ └── yolox.cpp │ │ │ │ └── python │ │ │ │ │ └── README.md │ │ │ └── ncnn │ │ │ │ ├── android │ │ │ │ ├── README.md │ │ │ │ ├── app │ │ │ │ │ ├── build.gradle │ │ │ │ 
│ └── src │ │ │ │ │ │ └── main │ │ │ │ │ │ ├── AndroidManifest.xml │ │ │ │ │ │ ├── assets │ │ │ │ │ │ └── yolox.param │ │ │ │ │ │ ├── java │ │ │ │ │ │ └── com │ │ │ │ │ │ │ └── megvii │ │ │ │ │ │ │ └── yoloXncnn │ │ │ │ │ │ │ ├── MainActivity.java │ │ │ │ │ │ │ ├── YOLOXncnn.java │ │ │ │ │ │ │ └── yoloXncnn.java │ │ │ │ │ │ ├── jni │ │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ │ └── yoloXncnn_jni.cpp │ │ │ │ │ │ └── res │ │ │ │ │ │ ├── layout │ │ │ │ │ │ └── main.xml │ │ │ │ │ │ └── values │ │ │ │ │ │ └── strings.xml │ │ │ │ ├── build.gradle │ │ │ │ ├── gradle │ │ │ │ │ └── wrapper │ │ │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ │ │ └── gradle-wrapper.properties │ │ │ │ ├── gradlew │ │ │ │ ├── gradlew.bat │ │ │ │ └── settings.gradle │ │ │ │ └── cpp │ │ │ │ ├── README.md │ │ │ │ └── yolox.cpp │ │ │ ├── docs │ │ │ ├── .gitignore │ │ │ ├── Makefile │ │ │ ├── _static │ │ │ │ └── css │ │ │ │ │ └── custom.css │ │ │ ├── conf.py │ │ │ ├── demo │ │ │ │ ├── megengine_cpp_readme.md │ │ │ │ ├── megengine_py_readme.md │ │ │ │ ├── ncnn_android_readme.md │ │ │ │ ├── ncnn_cpp_readme.md │ │ │ │ ├── onnx_readme.md │ │ │ │ ├── openvino_cpp_readme.md │ │ │ │ ├── openvino_py_readme.md │ │ │ │ ├── trt_cpp_readme.md │ │ │ │ └── trt_py_readme.md │ │ │ ├── index.rst │ │ │ ├── manipulate_training_image_size.md │ │ │ ├── model_zoo.md │ │ │ ├── quick_run.md │ │ │ ├── requirements-doc.txt │ │ │ ├── train_custom_data.md │ │ │ └── updates_note.md │ │ │ ├── download_dataset.sh │ │ │ ├── exps │ │ │ ├── default │ │ │ │ ├── __init__.py │ │ │ │ ├── yolov3.py │ │ │ │ ├── yolox_l.py │ │ │ │ ├── yolox_m.py │ │ │ │ ├── yolox_nano.py │ │ │ │ ├── yolox_s.py │ │ │ │ ├── yolox_tiny.py │ │ │ │ └── yolox_x.py │ │ │ └── example │ │ │ │ ├── custom │ │ │ │ ├── nano.py │ │ │ │ └── yolox_s.py │ │ │ │ └── yolox_voc │ │ │ │ └── yolox_voc_s.py │ │ │ ├── hubconf.py │ │ │ ├── ops_bf16_yolox.txt │ │ │ ├── ops_fp32_yolox.txt │ │ │ ├── requirements.txt │ │ │ ├── setup.cfg │ │ │ ├── setup.py │ │ │ ├── tests │ │ │ ├── __init__.py │ │ │ └── utils │ │ │ │ └── test_model_utils.py │ │ │ ├── tools │ │ │ ├── __init__.py │ │ │ ├── demo.py │ │ │ ├── eval.py │ │ │ ├── export_onnx.py │ │ │ ├── export_torchscript.py │ │ │ ├── train.py │ │ │ └── trt.py │ │ │ └── yolox │ │ │ ├── __init__.py │ │ │ ├── core │ │ │ ├── __init__.py │ │ │ ├── launch.py │ │ │ └── trainer.py │ │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── data_augment.py │ │ │ ├── data_prefetcher.py │ │ │ ├── dataloading.py │ │ │ ├── datasets │ │ │ │ ├── __init__.py │ │ │ │ ├── coco.py │ │ │ │ ├── coco_classes.py │ │ │ │ ├── datasets_wrapper.py │ │ │ │ ├── mosaicdetection.py │ │ │ │ ├── voc.py │ │ │ │ └── voc_classes.py │ │ │ └── samplers.py │ │ │ ├── evaluators │ │ │ ├── __init__.py │ │ │ ├── coco_evaluator.py │ │ │ ├── voc_eval.py │ │ │ └── voc_evaluator.py │ │ │ ├── exp │ │ │ ├── __init__.py │ │ │ ├── base_exp.py │ │ │ ├── build.py │ │ │ ├── default │ │ │ │ └── __init__.py │ │ │ └── yolox_base.py │ │ │ ├── layers │ │ │ ├── __init__.py │ │ │ ├── csrc │ │ │ │ ├── cocoeval │ │ │ │ │ ├── cocoeval.cpp │ │ │ │ │ └── cocoeval.h │ │ │ │ └── vision.cpp │ │ │ └── fast_coco_eval_api.py │ │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── darknet.py │ │ │ ├── losses.py │ │ │ ├── network_blocks.py │ │ │ ├── yolo_fpn.py │ │ │ ├── yolo_head.py │ │ │ ├── yolo_head_script.py │ │ │ ├── yolo_pafpn.py │ │ │ └── yolox.py │ │ │ ├── tools │ │ │ └── __init__.py │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── allreduce_norm.py │ │ │ ├── boxes.py │ │ │ ├── checkpoint.py │ │ │ ├── compat.py │ │ │ ├── demo_utils.py │ │ │ 
├── dist.py │ │ │ ├── ema.py │ │ │ ├── logger.py │ │ │ ├── lr_scheduler.py │ │ │ ├── metric.py │ │ │ ├── model_utils.py │ │ │ ├── setup_env.py │ │ │ └── visualize.py │ └── segmentation │ │ └── Unet │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ ├── config │ │ ├── ops_bf16_unet.txt │ │ └── ops_fp32_unet.txt │ │ ├── data_loading │ │ ├── dali_loader.py │ │ └── data_module.py │ │ ├── data_preprocessing │ │ ├── configs.py │ │ └── preprocessor.py │ │ ├── download.py │ │ ├── evaluate.py │ │ ├── images │ │ └── unet3d.png │ │ ├── lightning_trainer │ │ ├── __init__.py │ │ └── ptl.py │ │ ├── main.py │ │ ├── models │ │ ├── dice.py │ │ ├── layers.py │ │ ├── loss.py │ │ ├── metrics.py │ │ ├── monai_sliding_window_inference.py │ │ ├── nn_unet.py │ │ ├── pl_metric.py │ │ └── unet.py │ │ ├── preprocess.py │ │ ├── pytorch │ │ ├── early_stopping_unet.py │ │ ├── misc.py │ │ ├── npt.py │ │ └── trainer.py │ │ ├── requirements.txt │ │ ├── requirements_u22.txt │ │ ├── requirements_u24.txt │ │ ├── scripts │ │ ├── benchmark.py │ │ ├── inference.py │ │ └── train.py │ │ └── utils │ │ ├── __init__.py │ │ ├── early_stopping_unet.py │ │ ├── gpu_affinity.py │ │ ├── logger.py │ │ └── utils.py ├── examples │ ├── DeepSpeed │ │ └── cifar_example │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── cifar10_deepspeed.py │ │ │ ├── ds_config.json │ │ │ ├── requirements.txt │ │ │ └── run_ds_habanax8.sh │ ├── bucketing │ │ ├── README.md │ │ ├── brute_force_min_pad_waste.py │ │ ├── bucket.py │ │ ├── bucket_analysis.svg │ │ ├── bucket_analysis_bar_gaussian.svg │ │ ├── bucket_analysis_bar_squad.svg │ │ ├── bucket_analysis_num_steps_gaussian.svg │ │ ├── datasets_library.py │ │ ├── gaussian.svg │ │ ├── lloyd_max_bucket.py │ │ ├── plotting.py │ │ ├── requirements.txt │ │ ├── run_demo_bucketing_gaussian.py │ │ ├── run_demo_bucketing_squad.py │ │ ├── run_demo_controlling_num_steps.py │ │ ├── run_demo_gaussian.py │ │ ├── run_demo_squad.py │ │ ├── squad.svg │ │ └── test.py │ ├── computer_vision │ │ └── hello_world │ │ │ ├── README.md │ │ │ ├── example.py │ │ │ ├── mnist.py │ │ │ └── utils.py │ ├── custom_op │ │ ├── custom_fusedsdpa │ │ │ ├── README.md │ │ │ └── custom_fusedsdpa_op.patch │ │ ├── legacy_custom_op_API │ │ │ ├── custom_relu │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── custom_relu.py │ │ │ │ ├── custom_relu_op.patch │ │ │ │ ├── hpu_custom_op_relu_test.py │ │ │ │ ├── hpu_custom_relu.cpp │ │ │ │ └── setup.py │ │ │ └── custom_topk │ │ │ │ ├── README.md │ │ │ │ ├── hpu_custom_op_topk_test.py │ │ │ │ ├── hpu_custom_topk.cpp │ │ │ │ └── setup.py │ │ └── pt2_custom_op_API │ │ │ ├── custom_relu │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── custom_relu.py │ │ │ ├── custom_relu_op.patch │ │ │ ├── hpu_custom_op_relu_test.py │ │ │ ├── hpu_custom_relu.cpp │ │ │ └── setup.py │ │ │ └── custom_topk │ │ │ ├── README.md │ │ │ ├── hpu_custom_topk.cpp │ │ │ ├── setup.py │ │ │ └── test_hpu_custom_op_topk.py │ ├── gpu_migration │ │ ├── README.md │ │ ├── computer_vision │ │ │ └── classification │ │ │ │ └── torchvision │ │ │ │ ├── LICENSE │ │ │ │ ├── README.md │ │ │ │ ├── gpu_migration_logs │ │ │ │ └── gpu_migration_958.log │ │ │ │ ├── patches │ │ │ │ ├── lr_scheduler.diff │ │ │ │ ├── minimal_changes.diff │ │ │ │ └── performance_improvements.diff │ │ │ │ ├── presets.py │ │ │ │ ├── sampler.py │ │ │ │ ├── train.py │ │ │ │ ├── train_quantization.py │ │ │ │ ├── transforms.py │ │ │ │ └── utils.py │ │ ├── generative_models │ │ │ └── stable-diffusion │ │ │ │ ├── LICENSE │ │ │ │ ├── README.md │ │ │ │ ├── configs │ │ │ │ ├── autoencoder 
│ │ │ │ │ ├── autoencoder_kl_16x16x16.yaml │ │ │ │ │ ├── autoencoder_kl_32x32x4.yaml │ │ │ │ │ ├── autoencoder_kl_64x64x3.yaml │ │ │ │ │ └── autoencoder_kl_8x8x64.yaml │ │ │ │ ├── latent-diffusion │ │ │ │ │ ├── celebahq-ldm-vq-4.yaml │ │ │ │ │ ├── cin-ldm-vq-f8.yaml │ │ │ │ │ ├── cin256-v2.yaml │ │ │ │ │ ├── ffhq-ldm-vq-4.yaml │ │ │ │ │ ├── lsun_bedrooms-ldm-vq-4.yaml │ │ │ │ │ ├── lsun_churches-ldm-kl-8.yaml │ │ │ │ │ └── txt2img-1p4B-eval.yaml │ │ │ │ └── stable-diffusion │ │ │ │ │ ├── dev.yaml │ │ │ │ │ ├── dev_mn.yaml │ │ │ │ │ ├── dev_mn_dummy.yaml │ │ │ │ │ ├── inpainting │ │ │ │ │ ├── v1-edgeinpainting.yaml │ │ │ │ │ ├── v1-finetune-for-inpainting-laion-aesthetic-larger-masks-and-ucfg.yaml │ │ │ │ │ ├── v1-finetune-for-inpainting-laion-aesthetic-larger-masks.yaml │ │ │ │ │ └── v1-finetune-for-inpainting-laion-iaesthe.yaml │ │ │ │ │ ├── txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml │ │ │ │ │ ├── txt2img-1p4B-multinode-clip-encoder.yaml │ │ │ │ │ ├── txt2img-1p4B-multinode-t5-encoder.yaml │ │ │ │ │ ├── txt2img-1p4B-multinode.yaml │ │ │ │ │ ├── txt2img-clip-encoder-dev.yaml │ │ │ │ │ ├── txt2img-ldm-frozen-dev.yaml │ │ │ │ │ ├── txt2img-ldm-unfrozen-dev.yaml │ │ │ │ │ ├── txt2img-ldm-vae-f8.yaml │ │ │ │ │ ├── txt2img-multinode-clip-encoder-f16-1024-laion-hr.yaml │ │ │ │ │ ├── txt2img-multinode-clip-encoder-f16-256-pretraining.yaml │ │ │ │ │ ├── txt2img-multinode-clip-encoder-f16-768-laion-hr-inference.yaml │ │ │ │ │ ├── txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml │ │ │ │ │ ├── txt2img-multinode-clip-encoder-f16-768.yaml │ │ │ │ │ ├── txt2img-t5-encoder-dev.yaml │ │ │ │ │ ├── txt2img-upscale-clip-encoder-f16-1024-dev.yaml │ │ │ │ │ ├── txt2img-upscale-clip-encoder-f16-1024.yaml │ │ │ │ │ ├── txt2img-v2-clip-encoder-improved_aesthetics-256-dev.yaml │ │ │ │ │ ├── txt2img-v2-clip-encoder-improved_aesthetics-256.yaml │ │ │ │ │ ├── txt2img-v2-clip-encoder-improved_aesthetics-512-dev.yaml │ │ │ │ │ ├── txt2img-v2-clip-encoder-improved_aesthetics-512.yaml │ │ │ │ │ ├── upscaling │ │ │ │ │ └── upscale-v1-with-f16.yaml │ │ │ │ │ ├── v1-inference.yaml │ │ │ │ │ ├── v1_improvedaesthetics.yaml │ │ │ │ │ ├── v1_laionhr.yaml │ │ │ │ │ ├── v2_laionhr1024.yaml │ │ │ │ │ ├── v2_laionhr1024_2.yaml │ │ │ │ │ ├── v2_pretraining.yaml │ │ │ │ │ └── v3_pretraining.yaml │ │ │ │ ├── environment.yaml │ │ │ │ ├── gpu_migration_logs │ │ │ │ └── gpu_migration_1762.log │ │ │ │ ├── hpu_graph_utils.py │ │ │ │ ├── ldm │ │ │ │ ├── data │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── coco.py │ │ │ │ │ ├── dummy.py │ │ │ │ │ ├── imagenet.py │ │ │ │ │ ├── inpainting │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── synthetic_mask.py │ │ │ │ │ ├── laion.py │ │ │ │ │ └── lsun.py │ │ │ │ ├── lr_scheduler.py │ │ │ │ ├── models │ │ │ │ │ ├── autoencoder.py │ │ │ │ │ └── diffusion │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── classifier.py │ │ │ │ │ │ ├── ddim.py │ │ │ │ │ │ ├── ddpm.py │ │ │ │ │ │ ├── plms.py │ │ │ │ │ │ └── sampling_util.py │ │ │ │ ├── modules │ │ │ │ │ ├── attention.py │ │ │ │ │ ├── diffusionmodules │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── model.py │ │ │ │ │ │ ├── openaimodel.py │ │ │ │ │ │ └── util.py │ │ │ │ │ ├── distributions │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── distributions.py │ │ │ │ │ ├── ema.py │ │ │ │ │ ├── encoders │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── modules.py │ │ │ │ │ ├── evaluate │ │ │ │ │ │ ├── adm_evaluator.py │ │ │ │ │ │ ├── evaluate_perceptualsim.py │ │ │ │ │ │ ├── frechet_video_distance.py │ │ │ │ │ │ ├── ssim.py │ │ │ │ │ │ └── 
torch_frechet_video_distance.py │ │ │ │ │ ├── image_degradation │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── bsrgan.py │ │ │ │ │ │ ├── bsrgan_light.py │ │ │ │ │ │ ├── utils │ │ │ │ │ │ │ └── test.png │ │ │ │ │ │ └── utils_image.py │ │ │ │ │ ├── losses │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── contperceptual.py │ │ │ │ │ │ └── vqperceptual.py │ │ │ │ │ └── x_transformer.py │ │ │ │ └── util.py │ │ │ │ ├── main.py │ │ │ │ ├── models │ │ │ │ ├── first_stage_models │ │ │ │ │ ├── kl-f16 │ │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── kl-f32 │ │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── kl-f4 │ │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── kl-f8 │ │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── vq-f16 │ │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── vq-f4-noattn │ │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── vq-f4 │ │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── vq-f8-n256 │ │ │ │ │ │ └── config.yaml │ │ │ │ │ └── vq-f8 │ │ │ │ │ │ └── config.yaml │ │ │ │ └── ldm │ │ │ │ │ ├── bsr_sr │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── celeba256 │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── cin256 │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── ffhq256 │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── inpainting_big │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── layout2img-openimages256 │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── lsun_beds256 │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── lsun_churches256 │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── semantic_synthesis256 │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── semantic_synthesis512 │ │ │ │ │ └── config.yaml │ │ │ │ │ └── text2img256 │ │ │ │ │ └── config.yaml │ │ │ │ ├── notebook_helpers.py │ │ │ │ ├── patches │ │ │ │ ├── hpu_graph.diff │ │ │ │ ├── minimal_changes.diff │ │ │ │ └── randn_to_cpu.diff │ │ │ │ ├── requirements.txt │ │ │ │ ├── scripts │ │ │ │ ├── autoencoder-eval.py │ │ │ │ ├── checker.py │ │ │ │ ├── cmd_on_new_ckpt.py │ │ │ │ ├── demo │ │ │ │ │ └── inpainting.py │ │ │ │ ├── download_first_stages.sh │ │ │ │ ├── download_models.sh │ │ │ │ ├── img2img.py │ │ │ │ ├── inpaint.py │ │ │ │ ├── logging_template.py │ │ │ │ ├── mnist-distributed.py │ │ │ │ ├── printckpt.py │ │ │ │ ├── prompts │ │ │ │ │ ├── aesthetic-prompts-plain.txt │ │ │ │ │ ├── aesthetic-prompts-surrealism.txt │ │ │ │ │ ├── prompts-with-wings.txt │ │ │ │ │ ├── six-prompts │ │ │ │ │ ├── weird-dalle-prompts.txt │ │ │ │ │ ├── wings1.txt │ │ │ │ │ ├── wings2.txt │ │ │ │ │ ├── wings3.txt │ │ │ │ │ └── wings4.txt │ │ │ │ ├── prune-ckpt.py │ │ │ │ ├── sample_diffusion.py │ │ │ │ ├── slurm │ │ │ │ │ ├── README.md │ │ │ │ │ ├── resume_512 │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── resume_512_improvedaesthetic │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── resume_768_hr │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1-upscaling-f16-pretraining-512-aesthetics │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1_edgeinpainting │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1_iahr_torch111 │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1_iahr_torch111_ucg │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1_improvedaesthetics │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1_improvedaesthetics_torch111 │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1_inpainting_aesthetics-larger-masks-ucg │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1_inpainting_aesthetics-larger-masks │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1_inpainting_improvedaesthetics_torch111 
│ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v1_laionhr_torch111 │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v2_laionhr1024 │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v2_laionhr1024_2 │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ ├── v2_pretraining │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ │ └── v3_pretraining │ │ │ │ │ │ ├── launcher.sh │ │ │ │ │ │ └── sbatch.sh │ │ │ │ ├── test_gpu.py │ │ │ │ ├── test_gpu.sh │ │ │ │ ├── txt2img.py │ │ │ │ └── vqgan_codebook_visualizer.py │ │ │ │ └── setup.py │ │ ├── nlp │ │ │ ├── DeepSpeedExamples │ │ │ │ └── Megatron-DeepSpeed │ │ │ │ │ ├── CODEOWNERS │ │ │ │ │ ├── LICENSE │ │ │ │ │ ├── MANIFEST.in │ │ │ │ │ ├── README.md │ │ │ │ │ ├── SECURITY.md │ │ │ │ │ ├── dataset │ │ │ │ │ ├── README.md │ │ │ │ │ ├── download_books.sh │ │ │ │ │ ├── download_ckpt.sh │ │ │ │ │ └── download_vocab.sh │ │ │ │ │ ├── examples │ │ │ │ │ ├── MoE │ │ │ │ │ │ ├── ds_config_gpt_TEMPLATE.json │ │ │ │ │ │ ├── ds_config_gpt_Zero2_TEMPLATE.json │ │ │ │ │ │ ├── ds_evalharness.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_1.3B_MoE128.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_1.3B_PR-MoE64or128_MoS.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_1.3B_dense.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_1.3B_dense_cl.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_125M_MoE64.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_125M_dense_cl.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_350M_MoE128.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_350M_PR-MoE32or64.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_350M_PR-MoE32or64_MoS.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_350M_dense.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_6.7B_dense.sh │ │ │ │ │ │ └── readme_evalharness.md │ │ │ │ │ ├── README.md │ │ │ │ │ ├── azure │ │ │ │ │ │ ├── README.md │ │ │ │ │ │ ├── run-175b.sh │ │ │ │ │ │ ├── run-1t.sh │ │ │ │ │ │ └── run-benchmark-model.sh │ │ │ │ │ ├── azureml │ │ │ │ │ │ ├── Dockerfile.dockerfile │ │ │ │ │ │ ├── README.md │ │ │ │ │ │ ├── aml_submit.py │ │ │ │ │ │ └── prepare_dataset.py │ │ │ │ │ ├── compression │ │ │ │ │ │ ├── 125M-Int8-test-64gpu-distilled-group48.sh │ │ │ │ │ │ ├── 125M-L10-Int8-test-64gpu-distilled-group48.sh │ │ │ │ │ │ ├── 125M-L12-Int8-test-64gpu-distilled-group48.sh │ │ │ │ │ │ ├── ds_config_gpt_TEMPLATE.json │ │ │ │ │ │ ├── ds_config_gpt_TEMPLATE_compression.json │ │ │ │ │ │ ├── ds_evalharness.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_1.3B_dense_cl_kd.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_125M_dense_cl_kd.sh │ │ │ │ │ │ ├── ds_pretrain_gpt_125M_dense_kd.sh │ │ │ │ │ │ └── ds_pretrain_gpt_350M_dense_kd.sh │ │ │ │ │ ├── create_embeddings.sh │ │ │ │ │ ├── curriculum_learning │ │ │ │ │ │ ├── README.md │ │ │ │ │ │ ├── ds_pretrain_gpt2.sh │ │ │ │ │ │ ├── ds_train.sh │ │ │ │ │ │ ├── ds_zero_stage_1_config_baseline.json │ │ │ │ │ │ └── ds_zero_stage_1_config_curriculum_fixed_linear.json │ │ │ │ │ ├── evaluate_ict_zeroshot_nq.sh │ │ │ │ │ ├── evaluate_zeroshot_gpt.sh │ │ │ │ │ ├── finetune_mnli_distributed.sh │ │ │ │ │ ├── finetune_race_distributed.sh │ │ │ │ │ ├── generate_text.sh │ │ │ │ │ ├── merge_mp_bert.sh │ │ │ │ │ ├── pretrain_bert.sh │ │ │ │ │ ├── pretrain_bert_distributed.sh │ │ │ │ │ ├── pretrain_bert_distributed_with_mp.sh │ │ │ │ │ ├── pretrain_gpt.sh │ │ │ │ │ ├── pretrain_gpt3_175B.sh │ │ │ │ │ ├── pretrain_gpt_distributed.sh │ │ │ │ │ ├── pretrain_gpt_distributed_with_mp.sh │ │ │ │ │ ├── pretrain_ict.sh │ │ │ │ │ ├── pretrain_t5.sh │ │ │ │ │ ├── pretrain_t5_distributed.sh │ │ │ │ │ ├── pretrain_t5_distributed_with_mp.sh │ │ │ 
│ │ └── run_deepspeed_example.sh │ │ │ │ │ ├── gpu_migration_logs │ │ │ │ │ └── gpu_migration_424488.log │ │ │ │ │ ├── images │ │ │ │ │ └── cases_april2021.png │ │ │ │ │ ├── megatron │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── arguments.py │ │ │ │ │ ├── checkpointing.py │ │ │ │ │ ├── data │ │ │ │ │ │ ├── Makefile │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── autoaugment.py │ │ │ │ │ │ ├── bert_dataset.py │ │ │ │ │ │ ├── biencoder_dataset_utils.py │ │ │ │ │ │ ├── blendable_dataset.py │ │ │ │ │ │ ├── data_samplers.py │ │ │ │ │ │ ├── dataset_utils.py │ │ │ │ │ │ ├── gpt_dataset.py │ │ │ │ │ │ ├── helpers.cpp │ │ │ │ │ │ ├── ict_dataset.py │ │ │ │ │ │ ├── indexed_dataset.py │ │ │ │ │ │ ├── orqa_wiki_dataset.py │ │ │ │ │ │ ├── realm_dataset_utils.py │ │ │ │ │ │ ├── realm_index.py │ │ │ │ │ │ ├── t5_dataset.py │ │ │ │ │ │ ├── test │ │ │ │ │ │ │ ├── test_indexed_dataset.py │ │ │ │ │ │ │ └── test_preprocess_data.sh │ │ │ │ │ │ └── vit_dataset.py │ │ │ │ │ ├── enums.py │ │ │ │ │ ├── fp16_deprecated │ │ │ │ │ │ └── loss_scaler.py │ │ │ │ │ ├── fused_kernels │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── compat.h │ │ │ │ │ │ ├── layer_norm_cuda.cpp │ │ │ │ │ │ ├── layer_norm_cuda_kernel.cu │ │ │ │ │ │ ├── scaled_masked_softmax.cpp │ │ │ │ │ │ ├── scaled_masked_softmax.h │ │ │ │ │ │ ├── scaled_masked_softmax_cuda.cu │ │ │ │ │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ │ │ │ │ ├── scaled_upper_triang_masked_softmax.h │ │ │ │ │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ │ │ │ │ │ └── type_shim.h │ │ │ │ │ ├── global_vars.py │ │ │ │ │ ├── indexer.py │ │ │ │ │ ├── initialize.py │ │ │ │ │ ├── learning_rates.py │ │ │ │ │ ├── memory.py │ │ │ │ │ ├── microbatches.py │ │ │ │ │ ├── model │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── bert_model.py │ │ │ │ │ │ ├── biencoder_model.py │ │ │ │ │ │ ├── classification.py │ │ │ │ │ │ ├── distributed.py │ │ │ │ │ │ ├── enums.py │ │ │ │ │ │ ├── fused_bias_gelu.py │ │ │ │ │ │ ├── fused_layer_norm.py │ │ │ │ │ │ ├── fused_softmax.py │ │ │ │ │ │ ├── gpt_model.py │ │ │ │ │ │ ├── language_model.py │ │ │ │ │ │ ├── module.py │ │ │ │ │ │ ├── multiple_choice.py │ │ │ │ │ │ ├── realm_model.py │ │ │ │ │ │ ├── t5_model.py │ │ │ │ │ │ ├── transformer.py │ │ │ │ │ │ ├── utils.py │ │ │ │ │ │ └── vit_model.py │ │ │ │ │ ├── mpu │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── cross_entropy.py │ │ │ │ │ │ ├── data.py │ │ │ │ │ │ ├── initialize.py │ │ │ │ │ │ ├── layers.py │ │ │ │ │ │ ├── mappings.py │ │ │ │ │ │ ├── random.py │ │ │ │ │ │ ├── tests │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── commons.py │ │ │ │ │ │ │ ├── test_cross_entropy.py │ │ │ │ │ │ │ ├── test_data.py │ │ │ │ │ │ │ ├── test_initialize.py │ │ │ │ │ │ │ ├── test_layers.py │ │ │ │ │ │ │ └── test_random.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── optimizer │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── clip_grads.py │ │ │ │ │ │ ├── grad_scaler.py │ │ │ │ │ │ └── optimizer.py │ │ │ │ │ ├── p2p_communication.py │ │ │ │ │ ├── package_info.py │ │ │ │ │ ├── schedules.py │ │ │ │ │ ├── text_generation_utils.py │ │ │ │ │ ├── tokenizer │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── bert_tokenization.py │ │ │ │ │ │ ├── gpt2_tokenization.py │ │ │ │ │ │ └── tokenizer.py │ │ │ │ │ ├── training.py │ │ │ │ │ └── utils.py │ │ │ │ │ ├── patches │ │ │ │ │ ├── functional_changes.diff │ │ │ │ │ └── performance_patch_1.diff │ │ │ │ │ ├── pretrain_bert.py │ │ │ │ │ ├── pretrain_gpt.py │ │ │ │ │ ├── pretrain_ict.py │ │ │ │ │ ├── pretrain_t5.py │ │ │ │ │ ├── pretrain_vit.py │ │ │ │ │ ├── requirements.txt │ │ │ │ │ ├── scripts │ │ │ │ │ └── 
run_bloom13b.sh │ │ │ │ │ ├── setup.py │ │ │ │ │ ├── tasks │ │ │ │ │ ├── data_utils.py │ │ │ │ │ ├── ensemble_classifier.py │ │ │ │ │ ├── eval_harness │ │ │ │ │ │ ├── download.py │ │ │ │ │ │ ├── evaluate.py │ │ │ │ │ │ └── report-to-csv.py │ │ │ │ │ ├── eval_utils.py │ │ │ │ │ ├── finetune_utils.py │ │ │ │ │ ├── glue │ │ │ │ │ │ ├── data.py │ │ │ │ │ │ ├── finetune.py │ │ │ │ │ │ ├── mnli.py │ │ │ │ │ │ └── qqp.py │ │ │ │ │ ├── main.py │ │ │ │ │ ├── orqa │ │ │ │ │ │ ├── evaluate_orqa.py │ │ │ │ │ │ ├── evaluate_utils.py │ │ │ │ │ │ └── natural_questions │ │ │ │ │ │ │ ├── nq.py │ │ │ │ │ │ │ ├── qa_utils.py │ │ │ │ │ │ │ └── tokenizers.py │ │ │ │ │ ├── race │ │ │ │ │ │ ├── data.py │ │ │ │ │ │ └── finetune.py │ │ │ │ │ ├── vision │ │ │ │ │ │ ├── classification.py │ │ │ │ │ │ ├── eval_utils.py │ │ │ │ │ │ ├── finetune_utils.py │ │ │ │ │ │ └── main.py │ │ │ │ │ └── zeroshot_gpt │ │ │ │ │ │ ├── datasets.py │ │ │ │ │ │ ├── detokenizer.py │ │ │ │ │ │ └── evaluate.py │ │ │ │ │ ├── tests │ │ │ │ │ └── test_basic.py │ │ │ │ │ └── tools │ │ │ │ │ ├── convert_checkpoint │ │ │ │ │ ├── README.md │ │ │ │ │ ├── deepspeed_checkpoint.py │ │ │ │ │ ├── deepspeed_to_megatron.py │ │ │ │ │ ├── deepspeed_to_transformers.py │ │ │ │ │ ├── inspect_checkpoint.py │ │ │ │ │ └── inspect_deepspeed_checkpoint.py │ │ │ │ │ ├── create_doc_index.py │ │ │ │ │ ├── generate_samples_gpt.py │ │ │ │ │ ├── linter.py │ │ │ │ │ ├── merge_mp_partitions.py │ │ │ │ │ ├── openwebtext │ │ │ │ │ ├── README.md │ │ │ │ │ ├── add_id.py │ │ │ │ │ ├── blacklist_urls.py │ │ │ │ │ ├── cleanup_dataset.py │ │ │ │ │ ├── cleanup_fix_dataset.py │ │ │ │ │ ├── filter_ngrams.py │ │ │ │ │ ├── find_duplicates.py │ │ │ │ │ ├── group_duplicate_url.py │ │ │ │ │ ├── merge_jsons.py │ │ │ │ │ └── remove_group_duplicates.py │ │ │ │ │ └── preprocess_data.py │ │ │ └── bert │ │ │ │ ├── Dockerfile │ │ │ │ ├── LICENSE │ │ │ │ ├── NOTICE │ │ │ │ ├── README.md │ │ │ │ ├── bert_config.json │ │ │ │ ├── bind.sh │ │ │ │ ├── bind_pyt.py │ │ │ │ ├── checkpoints │ │ │ │ └── .keep │ │ │ │ ├── configurations.yml │ │ │ │ ├── create_pretraining_data.py │ │ │ │ ├── data │ │ │ │ ├── BooksDownloader.py │ │ │ │ ├── BookscorpusTextFormatting.py │ │ │ │ ├── Downloader.py │ │ │ │ ├── GLUEDownloader.py │ │ │ │ ├── GooglePretrainedWeightDownloader.py │ │ │ │ ├── NVIDIAPretrainedWeightDownloader.py │ │ │ │ ├── SquadDownloader.py │ │ │ │ ├── TextSharding.py │ │ │ │ ├── WikiDownloader.py │ │ │ │ ├── WikicorpusTextFormatting.py │ │ │ │ ├── __init__.py │ │ │ │ ├── bertPrep.py │ │ │ │ ├── create_datasets_from_start.sh │ │ │ │ └── squad │ │ │ │ │ └── squad_download.sh │ │ │ │ ├── extract_features.py │ │ │ │ ├── file_utils.py │ │ │ │ ├── gpu_migration_logs │ │ │ │ └── gpu_migration_5494.log │ │ │ │ ├── inference.py │ │ │ │ ├── modeling.py │ │ │ │ ├── optimization.py │ │ │ │ ├── patches │ │ │ │ ├── minimal_changes.diff │ │ │ │ ├── performance_improvements.diff │ │ │ │ └── use_packed_dataset.diff │ │ │ │ ├── processors │ │ │ │ ├── __init__.py │ │ │ │ └── glue.py │ │ │ │ ├── requirements.txt │ │ │ │ ├── results │ │ │ │ ├── .keep │ │ │ │ ├── checkpoints │ │ │ │ │ └── lddl_log │ │ │ │ │ │ ├── node-0.txt │ │ │ │ │ │ └── node-0_local-0.txt │ │ │ │ └── dllogger.json │ │ │ │ ├── run.sub │ │ │ │ ├── run_glue.py │ │ │ │ ├── run_pretraining.py │ │ │ │ ├── run_squad.py │ │ │ │ ├── run_swag.py │ │ │ │ ├── schedulers.py │ │ │ │ ├── scripts │ │ │ │ ├── configs │ │ │ │ │ ├── glue_config.sh │ │ │ │ │ ├── pretrain_config.sh │ │ │ │ │ └── squad_config.sh │ │ │ │ ├── data_download.sh │ │ │ │ ├── docker │ │ │ │ │ ├── 
build.sh │ │ │ │ │ └── launch.sh │ │ │ │ ├── run_glue.sh │ │ │ │ ├── run_pretraining.sh │ │ │ │ ├── run_squad.sh │ │ │ │ └── run_swag.sh │ │ │ │ ├── tokenization.py │ │ │ │ ├── utils.py │ │ │ │ └── vocab │ │ │ │ └── vocab │ │ └── simple_examples │ │ │ └── mnist │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── gpu_migration_logs │ │ │ └── gpu_migration_66.log │ │ │ └── main.py │ └── multi_tenants │ │ ├── README.md │ │ └── multi_tenants_resnet_pt.sh ├── generative_models │ └── stable-diffusion │ │ ├── LICENSE │ │ ├── README.md │ │ ├── configs │ │ ├── autoencoder │ │ │ ├── autoencoder_kl_16x16x16.yaml │ │ │ ├── autoencoder_kl_32x32x4.yaml │ │ │ ├── autoencoder_kl_64x64x3.yaml │ │ │ └── autoencoder_kl_8x8x64.yaml │ │ ├── latent-diffusion │ │ │ ├── celebahq-ldm-vq-4.yaml │ │ │ ├── cin-ldm-vq-f8.yaml │ │ │ ├── cin256-v2.yaml │ │ │ ├── ffhq-ldm-vq-4.yaml │ │ │ ├── lsun_bedrooms-ldm-vq-4.yaml │ │ │ ├── lsun_churches-ldm-kl-8.yaml │ │ │ └── txt2img-1p4B-eval.yaml │ │ └── stable-diffusion │ │ │ ├── dev.yaml │ │ │ ├── dev_mn.yaml │ │ │ ├── dev_mn_dummy.yaml │ │ │ ├── inpainting │ │ │ ├── v1-edgeinpainting.yaml │ │ │ ├── v1-finetune-for-inpainting-laion-aesthetic-larger-masks-and-ucfg.yaml │ │ │ ├── v1-finetune-for-inpainting-laion-aesthetic-larger-masks.yaml │ │ │ └── v1-finetune-for-inpainting-laion-iaesthe.yaml │ │ │ ├── txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml │ │ │ ├── txt2img-1p4B-multinode-clip-encoder.yaml │ │ │ ├── txt2img-1p4B-multinode-t5-encoder.yaml │ │ │ ├── txt2img-1p4B-multinode.yaml │ │ │ ├── txt2img-clip-encoder-dev.yaml │ │ │ ├── txt2img-ldm-frozen-dev.yaml │ │ │ ├── txt2img-ldm-unfrozen-dev.yaml │ │ │ ├── txt2img-ldm-vae-f8.yaml │ │ │ ├── txt2img-multinode-clip-encoder-f16-1024-laion-hr.yaml │ │ │ ├── txt2img-multinode-clip-encoder-f16-256-pretraining.yaml │ │ │ ├── txt2img-multinode-clip-encoder-f16-768-laion-hr-inference.yaml │ │ │ ├── txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml │ │ │ ├── txt2img-multinode-clip-encoder-f16-768.yaml │ │ │ ├── txt2img-t5-encoder-dev.yaml │ │ │ ├── txt2img-upscale-clip-encoder-f16-1024-dev.yaml │ │ │ ├── txt2img-upscale-clip-encoder-f16-1024.yaml │ │ │ ├── txt2img-v2-clip-encoder-improved_aesthetics-256-dev.yaml │ │ │ ├── txt2img-v2-clip-encoder-improved_aesthetics-256.yaml │ │ │ ├── txt2img-v2-clip-encoder-improved_aesthetics-512-dev.yaml │ │ │ ├── txt2img-v2-clip-encoder-improved_aesthetics-512.yaml │ │ │ ├── upscaling │ │ │ └── upscale-v1-with-f16.yaml │ │ │ ├── v1-inference.yaml │ │ │ ├── v1_improvedaesthetics.yaml │ │ │ ├── v1_laionhr.yaml │ │ │ ├── v2_laionhr1024.yaml │ │ │ ├── v2_laionhr1024_2.yaml │ │ │ ├── v2_pretraining.yaml │ │ │ └── v3_pretraining.yaml │ │ ├── cpu_config.yaml │ │ ├── environment.yaml │ │ ├── hpu_config_web_dataset.yaml │ │ ├── ldm │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── coco.py │ │ │ ├── dummy.py │ │ │ ├── imagenet.py │ │ │ ├── inpainting │ │ │ │ ├── __init__.py │ │ │ │ └── synthetic_mask.py │ │ │ ├── laion.py │ │ │ └── lsun.py │ │ ├── lr_scheduler.py │ │ ├── models │ │ │ ├── autoencoder.py │ │ │ └── diffusion │ │ │ │ ├── __init__.py │ │ │ │ ├── classifier.py │ │ │ │ ├── ddim.py │ │ │ │ ├── ddpm.py │ │ │ │ ├── plms.py │ │ │ │ └── sampling_util.py │ │ ├── modules │ │ │ ├── attention.py │ │ │ ├── diffusionmodules │ │ │ │ ├── __init__.py │ │ │ │ ├── model.py │ │ │ │ ├── openaimodel.py │ │ │ │ └── util.py │ │ │ ├── distributions │ │ │ │ ├── __init__.py │ │ │ │ └── distributions.py │ │ │ ├── ema.py │ │ │ ├── encoders │ │ │ │ ├── __init__.py │ │ │ │ └── modules.py │ │ │ ├── evaluate 
│ │ │ │ ├── adm_evaluator.py │ │ │ │ ├── evaluate_perceptualsim.py │ │ │ │ ├── frechet_video_distance.py │ │ │ │ ├── ssim.py │ │ │ │ └── torch_frechet_video_distance.py │ │ │ ├── image_degradation │ │ │ │ ├── __init__.py │ │ │ │ ├── bsrgan.py │ │ │ │ ├── bsrgan_light.py │ │ │ │ ├── utils │ │ │ │ │ └── test.png │ │ │ │ └── utils_image.py │ │ │ ├── losses │ │ │ │ ├── __init__.py │ │ │ │ ├── contperceptual.py │ │ │ │ └── vqperceptual.py │ │ │ └── x_transformer.py │ │ └── util.py │ │ ├── main.py │ │ ├── models │ │ ├── first_stage_models │ │ │ ├── kl-f16 │ │ │ │ └── config.yaml │ │ │ ├── kl-f32 │ │ │ │ └── config.yaml │ │ │ ├── kl-f4 │ │ │ │ └── config.yaml │ │ │ ├── kl-f8 │ │ │ │ └── config.yaml │ │ │ ├── vq-f16 │ │ │ │ └── config.yaml │ │ │ ├── vq-f4-noattn │ │ │ │ └── config.yaml │ │ │ ├── vq-f4 │ │ │ │ └── config.yaml │ │ │ ├── vq-f8-n256 │ │ │ │ └── config.yaml │ │ │ └── vq-f8 │ │ │ │ └── config.yaml │ │ └── ldm │ │ │ ├── bsr_sr │ │ │ └── config.yaml │ │ │ ├── celeba256 │ │ │ └── config.yaml │ │ │ ├── cin256 │ │ │ └── config.yaml │ │ │ ├── ffhq256 │ │ │ └── config.yaml │ │ │ ├── inpainting_big │ │ │ └── config.yaml │ │ │ ├── layout2img-openimages256 │ │ │ └── config.yaml │ │ │ ├── lsun_beds256 │ │ │ └── config.yaml │ │ │ ├── lsun_churches256 │ │ │ └── config.yaml │ │ │ ├── semantic_synthesis256 │ │ │ └── config.yaml │ │ │ ├── semantic_synthesis512 │ │ │ └── config.yaml │ │ │ └── text2img256 │ │ │ └── config.yaml │ │ ├── notebook_helpers.py │ │ ├── ops_bf16.txt │ │ ├── ops_fp32.txt │ │ ├── requirements.txt │ │ ├── scripts │ │ ├── autoencoder-eval.py │ │ ├── checker.py │ │ ├── cmd_on_new_ckpt.py │ │ ├── demo │ │ │ └── inpainting.py │ │ ├── img2img.py │ │ ├── inpaint.py │ │ ├── inpaint_sd.py │ │ ├── logging_template.py │ │ ├── mnist-distributed.py │ │ ├── printckpt.py │ │ ├── prompts │ │ │ ├── aesthetic-prompts-plain.txt │ │ │ ├── aesthetic-prompts-surrealism.txt │ │ │ ├── prompts-with-wings.txt │ │ │ ├── six-prompts │ │ │ ├── weird-dalle-prompts.txt │ │ │ ├── wings1.txt │ │ │ ├── wings2.txt │ │ │ ├── wings3.txt │ │ │ └── wings4.txt │ │ ├── prune-ckpt.py │ │ ├── sample_diffusion.py │ │ ├── slurm │ │ │ ├── README.md │ │ │ ├── eval_inpainting │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── resume_512 │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── resume_512_improvedaesthetic │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── resume_768_hr │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1-upscaling-f16-pretraining-512-aesthetics │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1_edgeinpainting │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1_iahr_torch111 │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1_iahr_torch111_ucg │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1_improvedaesthetics │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1_improvedaesthetics_torch111 │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1_inpainting_aesthetics-larger-masks-ucg │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1_inpainting_aesthetics-larger-masks │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1_inpainting_improvedaesthetics_torch111 │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v1_laionhr_torch111 │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v2_laionhr1024 │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v2_laionhr1024_2 │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ ├── v2_pretraining │ │ │ │ ├── launcher.sh │ │ │ │ └── sbatch.sh │ │ │ └── v3_pretraining │ │ │ │ ├── 
launcher.sh │ │ │ │ └── sbatch.sh │ │ ├── test_gpu.py │ │ ├── test_gpu.sh │ │ ├── txt2img.py │ │ └── vqgan_codebook_visualizer.py │ │ └── setup.py └── nlp │ ├── DeepSpeedExamples │ └── deepspeed-bert │ │ ├── LICENSE │ │ ├── README.md │ │ ├── create_pretraining_data.py │ │ ├── data │ │ ├── BooksDownloader.py │ │ ├── BookscorpusTextFormatting.py │ │ ├── Downloader.py │ │ ├── GooglePretrainedWeightDownloader.py │ │ ├── TextSharding.py │ │ ├── WikiDownloader.py │ │ ├── WikicorpusTextFormatting.py │ │ ├── __init__.py │ │ ├── bertPrep.py │ │ └── create_datasets_from_start.sh │ │ ├── file_utils.py │ │ ├── lamb.py │ │ ├── lamb_exp.py │ │ ├── lans.py │ │ ├── modeling.py │ │ ├── requirements.txt │ │ ├── run_pretraining.py │ │ ├── schedulers.py │ │ ├── scripts │ │ ├── bert_1.5b_config.json │ │ ├── bert_5b_config.json │ │ ├── deepspeed_config_bert_1.5b.json │ │ ├── deepspeed_config_bert_5b_lans.json │ │ ├── hostsfile │ │ ├── run_bert_1.5b_8x.sh │ │ └── run_bert_5b_32x_lans.sh │ │ ├── tokenization.py │ │ └── utils.py │ └── bert │ ├── LICENSE │ ├── README.md │ ├── bert_config.json │ ├── bert_config_1.2B.json │ ├── create_pretraining_data.py │ ├── data │ ├── BooksDownloader.py │ ├── BookscorpusTextFormatting.py │ ├── Downloader.py │ ├── GooglePretrainedWeightDownloader.py │ ├── TextSharding.py │ ├── WikiDownloader.py │ ├── WikicorpusTextFormatting.py │ ├── __init__.py │ ├── bertPrep.py │ ├── create_datasets_from_start.sh │ └── squad │ │ └── squad_download.sh │ ├── file_utils.py │ ├── lamb.py │ ├── modeling.py │ ├── optimization.py │ ├── pack_pretraining_data_pytorch.py │ ├── pytorch_packed_data_checker.py │ ├── requirements.txt │ ├── run_pretraining.py │ ├── run_squad.py │ ├── schedulers.py │ ├── scripts │ └── run_pretraining.sh │ ├── tokenization.py │ └── utils.py └── README.md /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | > :memo: Please include a summary of the changes. 4 | > 5 | > * List any dependencies that are required for the changes. 6 | 7 | ## Type of changes 8 | 9 | Please specify the type of changes, and delete the options that are not relevant. 10 | 11 | - [ ] Documentation update 12 | - [ ] Bug fix (changes which fix an issue) 13 | - [ ] Others (please specify) 14 | 15 | ## Tests 16 | 17 | > :memo: Please describe the tests that you ran to verify your changes. 18 | > 19 | > * Provide the instructions so that we can reproduce. 20 | > * Please also list any relevant details for your test configuration. 21 | 22 | ## Checklist 23 | 24 | - [ ] I agree with the [Developer Certificate of Origin](https://developercertificate.org/). 25 | - [ ] My code conforms to the following coding guidelines: 26 | - [ ] Use Python 3 27 | - [ ] Python code follows [PEP 8 Coding Styles](https://www.python.org/dev/peps/pep-0008/) 28 | - [ ] I have performed a self code review. 29 | - [ ] I have made corresponding changes to the documentation. 30 | - [ ] I have added tests that prove my fix is effective or that my feature works. 
31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.h5 2 | *__pycache__* 3 | *.used 4 | *.log 5 | *.pbtxt 6 | *.pyc 7 | *.raw 8 | .falsh* 9 | graphs/* 10 | *.vscode 11 | build/ 12 | *.graph_dumps 13 | .idea/ 14 | events.out.tfevents* 15 | *.whl 16 | .gitreview 17 | !PyTorch/examples/gpu_migration/** -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "MLPERF4.0/Inference/llama/tgi-gaudi"] 2 | path = MLPERF4.0/Inference/llama/tgi-gaudi 3 | url = https://github.com/huggingface/tgi-gaudi.git 4 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/configs/fp8.conf: -------------------------------------------------------------------------------- 1 | *.Offline.min_query_count = 98304 2 | 3 | *.Server.target_qps = 21.1 4 | *.Server.min_query_count = 24576 5 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/config_meas_maxabs_quant_MAXABS_HW.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "QUANTIZE", 4 | "observer": "maxabs", 5 | "scale_method": "maxabs_hw", 6 | "allowlist": {"types": [], "names": []}, 7 | "blocklist": {"types": [], "names": []}, 8 | "dump_stats_path": "./hqt/llama2-70b-8x/measure", 9 | "dump_stats_xlsx_path": "./hqt/fp8stats.xlsx" 10 | } 11 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_0_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_0_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_1_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_1_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_2_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_2_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_3_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_3_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_4_8.npz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_4_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_5_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_5_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_6_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_6_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_7_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_7_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_0_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_0_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_1_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_1_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_2_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_2_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_3_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_3_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_4_8.npz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_4_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_5_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_5_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_6_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_6_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_7_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/llama/hqt/llama2-70b-8x/measure_hooks_maxabs_MAXABS_HW_7_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "QUANTIZE", 4 | "observer": "maxabs_per_channel", 5 | "scale_method": "ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2", 6 | "whitelist": {"types": [], "names": []}, 7 | "blacklist": {"types": [], "names": ["lm_head"]}, 8 | "dump_stats_path": "./llama_output", 9 | "dump_stats_xlsx_path": "./run_outputs/fp8stats.xlsx" 10 | } 11 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/quantization_config/maxabs_measure.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "MEASURE", 4 | "observer": "maxabs", 5 | "whitelist": {"types": [], "names": []}, 6 | "blacklist": {"types": [], "names": []}, 7 | "dump_stats_path": "./llama_output/7b_measure", 8 | "dump_stats_xlsx_path": "./llama_output/7b_measure/7b_fp8stats.xlsx" 9 | } -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/quantization_config/maxabs_pcq_measure.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "MEASURE", 4 | "observer": "maxabs_per_channel", 5 | "blacklist": {"types": [], "names": ["lm_head"]}, 6 | "dump_stats_path": "./llama_output", 7 | "dump_stats_xlsx_path": "./run_outputs/fp8stats.xlsx" 8 | } 9 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/quantization_config/maxabs_quant.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "QUANTIZE", 4 | "observer": "maxabs", 5 | "scale_method": "maxabs_hw", 6 | "whitelist": {"types": [], "names": []}, 7 | "blacklist": {"types": [], "names": ["lm_head"]}, 8 
| "dump_stats_path": "./llama_output/7b_measure", 9 | "dump_stats_xlsx_path": "./llama_output/7b_measure/7b_fp8stats.xlsx" 10 | } -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/quantization_config/shape_measure.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "MEASURE", 4 | "observer": "shape", 5 | "blacklist": {"types": [], "names": ["lm_head"]}, 6 | "dump_stats_path": "./llama_output", 7 | "dump_stats_xlsx_path": "./run_outputs/fp8stats.xlsx" 8 | } 9 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/quantization_config/unit_scale_quant.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "QUANTIZE", 4 | "observer": "maxabs", 5 | "scale_method": "unit_scale", 6 | "whitelist": {"types": [], "names": []}, 7 | "blacklist": {"types": [], "names": []}, 8 | "dump_stats_path": "./llama_output", 9 | "dump_stats_xlsx_path": "./run_outputs/fp8stats.xlsx" 10 | } 11 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/quantization_config/without_scale_quant.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "QUANTIZE", 4 | "observer": "maxabs", 5 | "scale_method": "without_scale", 6 | "whitelist": {"types": [], "names": []}, 7 | "blacklist": {"types": [], "names": []}, 8 | "dump_stats_path": "./llama_output", 9 | "dump_stats_xlsx_path": "./run_outputs/fp8stats.xlsx" 10 | } 11 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/requirements.txt: -------------------------------------------------------------------------------- 1 | evaluate 2 | rouge_score 3 | accelerate 4 | pandas 5 | git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 6 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/llama/setup_tgi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | apt update -y && apt-get install -y psmisc 4 | script_dir=$(dirname "$(realpath "${BASH_SOURCE[0]}")") 5 | pushd $HOME 6 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 7 | source "$HOME/.cargo/env" 8 | # install protobuf 9 | PROTOC_ZIP=protoc-21.12-linux-x86_64.zip 10 | curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP 11 | unzip -o $PROTOC_ZIP -d /usr/local bin/protoc 12 | unzip -o $PROTOC_ZIP -d /usr/local 'include/*' 13 | rm -f $PROTOC_ZIP 14 | # prepare TGI with Gaudi support 15 | cd "$script_dir/tgi-gaudi/" 16 | pushd $HOME 17 | mkdir repos 18 | cp -r "$script_dir/tgi-gaudi/" repos/ 19 | # build server 20 | cd repos/tgi-gaudi/server 21 | make gen-server 22 | pip install pip --upgrade 23 | # don't try to overwrite torch 24 | grep -v "torch==" requirements.txt | pip install --no-deps -r /dev/stdin 25 | pip install -e . 26 | # this stoped to be installed by TGI but is still required: 27 | pip install outlines==0.0.36 28 | cd .. 29 | # build router 30 | cd router 31 | cargo install --locked --path . 32 | cd .. 33 | # build launcher 34 | cd launcher 35 | cargo install --locked --path . 36 | cd .. 
37 | popd 38 | pip list 39 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/backend.py: -------------------------------------------------------------------------------- 1 | """ 2 | abstract backend class 3 | """ 4 | 5 | 6 | class Backend: 7 | def __init__(self): 8 | self.inputs = [] 9 | self.outputs = [] 10 | 11 | def version(self): 12 | raise NotImplementedError("Backend:version") 13 | 14 | def name(self): 15 | raise NotImplementedError("Backend:name") 16 | 17 | def load(self, model_path, inputs=None, outputs=None): 18 | raise NotImplementedError("Backend:load") 19 | 20 | def predict(self, feed): 21 | raise NotImplementedError("Backend:predict") 22 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/backend_debug.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import backend 3 | 4 | 5 | class BackendDebug(backend.Backend): 6 | def __init__(self, image_size=[3, 1024, 1024], **kwargs): 7 | super(BackendDebug, self).__init__() 8 | self.image_size = image_size 9 | 10 | def version(self): 11 | return torch.__version__ 12 | 13 | def name(self): 14 | return "debug-SUT" 15 | 16 | def image_format(self): 17 | return "NCHW" 18 | 19 | def load(self): 20 | return self 21 | 22 | def predict(self, prompts): 23 | images = [] 24 | with torch.no_grad(): 25 | for prompt in prompts: 26 | image = torch.randn(self.image_size) 27 | images.append(image) 28 | return images 29 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/configs/user.conf: -------------------------------------------------------------------------------- 1 | # The format of this config file is 'key = value'. 2 | # The key has the format 'model.scenario.key'. Value is mostly int64_t. 3 | # Model maybe '*' as wildcard. In that case the value applies to all models. 
4 | # All times are in milli seconds 5 | stable-diffusion-xl.Offline.min_query_count = 5000 6 | stable-diffusion-xl.Server.min_query_count = 5000 7 | stable-diffusion-xl.Server.target_qps = 6.38 8 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import argparse 5 | 6 | def get_args(): 7 | """Parse commandline.""" 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--mlperf-accuracy-file", default="build/logs/results.json", help="path to results.json") 10 | args = parser.parse_args() 11 | return args 12 | 13 | def main(): 14 | args = get_args() 15 | with open(args.mlperf_accuracy_file, "r") as file: 16 | data = json.load(file) 17 | 18 | acc_results = data.get("accuracy_results", {"CLIP_SCORE": 0.0, "FID_SCORE": 0.0}) 19 | args = data.get("args", {}) 20 | 21 | acc_results["gen_num"] = args["count"] 22 | print("\nResults\n") 23 | print(acc_results) 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/requirements.txt: -------------------------------------------------------------------------------- 1 | diffusers==0.21.2 2 | transformers==4.42.3 3 | accelerate==0.23.0 4 | open-clip-torch==2.7.0 5 | opencv-python==4.8.1.78 6 | pycocotools==2.0.7 7 | torchmetrics[image]==1.2.0 8 | scipy==1.14.0 9 | flask==3.0.1 10 | numpy==1.26.3 11 | huggingface-hub==0.25.2 12 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/check_latents.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--torch-input", type=str, default="latents.pt") 9 | parser.add_argument("--numpy-input", type=str, default="latents.npy") 10 | args = parser.parse_args() 11 | return args 12 | 13 | 14 | if __name__ == "__main__": 15 | args = get_args() 16 | torch_latents = torch.load(args.torch_input) 17 | numpy_latents = torch.Tensor(np.load(args.numpy_input)) 18 | print(f"Torch Latents: {torch_latents}\nShape: {torch_latents.shape}") 19 | print(f"Numpy Latents: {numpy_latents}\nShape: {numpy_latents.shape}") 20 | assert torch_latents.shape == numpy_latents.shape 21 | assert (numpy_latents == torch_latents).all().item() 22 | print("All tests passed!") 23 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/download-coco-2014.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | : "${DOWNLOAD_PATH:=../coco2014}" 4 | : "${MAX_IMAGES:=5000}" 5 | : "${NUM_WORKERS:=1}" 6 | 7 | while [ "$1" != "" ]; do 8 | case $1 in 9 | -d | --download-path ) shift 10 | DOWNLOAD_PATH=$1 11 | ;; 12 | esac 13 | case $1 in 14 | -m | --max-images ) shift 15 | MAX_IMAGES=$1 16 | ;; 17 | esac 18 | case $1 in 19 | -n | --num-workers ) shift 20 | NUM_WORKERS=$1 21 | ;; 22 | esac 23 | shift 24 | done 25 | 26 | if [ -z ${MAX_IMAGES} ]; 27 | then 28 | python3 coco.py \ 29 | --dataset-dir ${DOWNLOAD_PATH} \ 30 | --num-workers ${NUM_WORKERS} 31 | else 32 | python3 coco.py \ 33 | --dataset-dir ${DOWNLOAD_PATH} \ 34 | --max-images ${MAX_IMAGES} 
\ 35 | --num-workers ${NUM_WORKERS} 36 | fi -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/fid/README.md: -------------------------------------------------------------------------------- 1 | This is a copy from https://github.com/mseitzer/pytorch-fid/ with the modifications made here https://github.com/ahmadki/mlperf_sd_inference and some additional modifications for taking as dataset of tensors as input -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/latents.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/latents.npy -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/latents.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/latents.pt -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_0_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_0_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_1_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_1_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_2_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_2_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_3_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_3_8.npz 
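The latents.pt / latents.npy files listed above appear to hold a fixed initial noise tensor so that SDXL runs start denoising from identical latents, and tools/check_latents.py simply verifies that the two serialized copies agree. Below is a minimal sketch of how such a pair could be produced and then validated the same way the checker does; the tensor shape and seed are illustrative assumptions, not values taken from the repository's latent.py.

```python
# Illustrative only: produce a fixed latent tensor and save it in both formats
# compared by tools/check_latents.py. Shape and seed here are assumptions.
import numpy as np
import torch

torch.manual_seed(0)                      # any fixed seed gives reproducible noise
latents = torch.randn(1, 4, 128, 128)     # assumed SDXL latent shape for 1024x1024 output

torch.save(latents, "latents.pt")         # read back with torch.load
np.save("latents.npy", latents.numpy())   # read back with np.load

# The same consistency check performed by check_latents.py:
restored = torch.Tensor(np.load("latents.npy"))
assert restored.shape == latents.shape
assert (restored == torch.load("latents.pt")).all().item()
print("latents.pt and latents.npy are identical")
```

Keeping both serializations lets NumPy-only tooling and the PyTorch pipeline consume the same noise without converting at load time.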
-------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_4_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_4_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_5_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_5_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_6_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_6_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_7_8.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_all/fp8_hooks_maxabs_7_8.npz -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/measure_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "MEASURE", 4 | "dump_stats_path": "../tools/quantize/measure_all/fp8" 5 | } 6 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/quant_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "QUANTIZE", 4 | "scale_method": "maxabs_hw_opt_weight", 5 | "dump_stats_path": "tools/quantize/measure_all/fp8" 6 | } -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/quantize/quant_config_bmm.json: -------------------------------------------------------------------------------- 1 | { 2 | "method": "HOOKS", 3 | "mode": "QUANTIZE", 4 | "scale_method": "maxabs_hw_opt_weight", 5 | "dump_stats_path": "tools/quantize/measure_all/fp8", 6 | "blocklist": {"types": ["Linear", "Conv2d", "LoRACompatibleLinear", "LoRACompatibleConv"]} 7 | } 8 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/sample_ids.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import numpy as np 4 | import pandas as pd 5 | 6 | def get_args(): 7 | """Parse commandline.""" 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument( 10 | "--tsv-path", default="../coco2014/captions/captions_source.tsv", help="Dataset 
download location" 11 | ) 12 | parser.add_argument( 13 | "--output-path", default="sample_ids.txt", help="Dataset download location" 14 | ) 15 | parser.add_argument( 16 | "--n", type=int, default=10, help="Dataset download location" 17 | ) 18 | parser.add_argument( 19 | "--seed", "-s", type=int, default=926019364, help="Dataset download location" 20 | ) 21 | args = parser.parse_args() 22 | return args 23 | 24 | 25 | if __name__ == "__main__": 26 | args = get_args() 27 | np.random.seed(args.seed) 28 | df_annotations = pd.read_csv(f"{args.tsv_path}", sep="\t") 29 | sample_ids = list(np.random.choice(df_annotations.shape[0], args.n)) 30 | with open(args.output_path, "w+") as f: 31 | for i, sample in enumerate(sample_ids): 32 | if i != (len(sample_ids)-1): 33 | f.write(str(sample) + "\n") 34 | else: 35 | f.write(str(sample)) 36 | -------------------------------------------------------------------------------- /MLPERF4.0/Inference/stable-diffusion-xl/tools/sample_ids.txt: -------------------------------------------------------------------------------- 1 | 4459 2 | 4015 3 | 2705 4 | 1682 5 | 4048 6 | 4683 7 | 3757 8 | 1578 9 | 3319 10 | 95 -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @duli2012 @mrwyattii @yaozhewei @arashb @xiaoxiawu-microsoft @guanhuawang 2 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/dataset/README.md: -------------------------------------------------------------------------------- 1 | # Run the scripts below to setup dataset 2 | 3 | bash download_books.sh 4 | 5 | bash download_vocab.sh 6 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/dataset/download_books.sh: -------------------------------------------------------------------------------- 1 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin 2 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/dataset/download_ckpt.sh: -------------------------------------------------------------------------------- 1 | mkdir -p checkpoints/gpt2_345m 2 | 3 | cd checkpoints/gpt2_345m 4 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip 5 | unzip megatron_lm_345m_v0.0.zip 6 | rm megatron_lm_345m_v0.0.zip 7 | cd ../.. 
8 | 9 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/dataset/download_vocab.sh: -------------------------------------------------------------------------------- 1 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 2 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/MoE/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false 39 | } 40 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/MoE/ds_config_gpt_Zero2_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": 2 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": false, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 | "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes and Scripts 2 | 3 | ### Azure 4 | 5 | We strongly recommend to start with AzureML recipe in the ```azureml``` folder. 6 | 7 | If you have a custom infrastructure (e.g. HPC clusters) or Azure VM and VMSS based environments, please refer to the bash scripts in the ```azure``` folder. 8 | 9 | ### MoE 10 | 11 | Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-expert based models. 12 | 13 | ### Curriculum Learning 14 | 15 | Curriculum learning recipes are in the ```curriculum_learning``` folder. 
Please refer to the detailed tutorials linked inside. 16 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/azureml/Dockerfile.dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/aifx/stable-ubuntu2004-cu115-py38-torch1110 2 | 3 | USER root:root 4 | 5 | RUN pip install pybind11 6 | 7 | RUN pip install git+https://github.com/microsoft/DeepSpeed.git 8 | 9 | # add a100-topo.xml 10 | RUN mkdir -p /opt/microsoft/ 11 | RUN wget -O /opt/microsoft/a100-topo.xml https://hpcbenchmarks.blob.core.windows.net/bookcorpus/data/a100-topo.xml 12 | 13 | # to use on A100, enable env var below in your job 14 | ENV NCCL_TOPO_FILE="/opt/microsoft/a100-topo.xml" 15 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/azureml/README.md: -------------------------------------------------------------------------------- 1 | ## Megatron-DeepSpeed on AzureML 2 | Example script for running Megatron-DeepSpeed using Azure Machine Learning. 3 | 4 | ------ 5 | 6 | # Workspace Setup 7 | Setup an AML workspace. Refer to: [set-up doc](https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up). 8 | 9 | # Dataset Preparation 10 | Create AML Dataset. To run remote AML job, you need to provide AML FileDataset. 11 | Refer to [prepare_dataset script](prepare_dataset.py) to upload .bin and .idx files to blob store and on how to create FileDataset. 12 | 13 | # Training 14 | Run Megatron-DeepSpeed on Azure ML. Refer to [aml_submit script](aml_submit.py). 15 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/compression/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false 39 | } 40 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/create_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute embeddings for each entry of a given dataset (e.g. 
Wikipedia) 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | # Wikipedia data can be downloaded from the following link: 9 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 10 | EVIDENCE_DATA_DIR= 11 | EMBEDDING_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | python tools/create_doc_index.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 128 \ 20 | --checkpoint-activations \ 21 | --seq-length 512 \ 22 | --retriever-seq-length 256 \ 23 | --max-position-embeddings 512 \ 24 | --load ${CHECKPOINT_PATH} \ 25 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 26 | --embedding-path ${EMBEDDING_PATH} \ 27 | --indexer-log-interval 1000 \ 28 | --indexer-batch-size 128 \ 29 | --vocab-file bert-vocab.txt \ 30 | --num-workers 2 \ 31 | --fp16 32 | 33 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/curriculum_learning/README.md: -------------------------------------------------------------------------------- 1 | This is an example of how to use DeepSpeed's curriculum learning (CL) feature which provides faster and more stable language model pre-training. Currently it is only integrated for GPT pre-training. Note that there are two curriculum learning examples in two different repos for Megatron-LM GPT-2 pre-training. Both of them have some unique features and limitations. See details in our [tutorial](https://www.deepspeed.ai/tutorials/curriculum-learning/). For technical details please refer to our [paper](https://arxiv.org/abs/2108.06084). -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/curriculum_learning/ds_train.sh: -------------------------------------------------------------------------------- 1 | # # baseline 2 | # CONFIG=baseline 3 | # TAG=baseline 4 | # MODEL_SIZE=1558 5 | # LR=1.5e-4 6 | # BSZ=512 7 | # SEQ_LEN=1024 8 | # MP_SIZE=1 9 | # SEED=1234 10 | # SAVE_INTERVAL=5000 11 | # NUM_ITER=600000 12 | # NUM_TOKEN=157286400000 13 | # LR_DECAY_TOKEN=157286400000 14 | # LR_WARMUP_ITER=3000 15 | # CONFIG_TEMPLATE=false 16 | # CURRICULUM_STEP=0 17 | # CURRICULUM_MIN=0 18 | 19 | # curriculum learning 20 | CONFIG=curriculum_fixed_linear 21 | MODEL_SIZE=1558 22 | LR=6e-4 23 | BSZ=4096 24 | SEQ_LEN=1024 25 | MP_SIZE=1 26 | SEED=1234 27 | SAVE_INTERVAL=1000 28 | NUM_ITER=75000 29 | NUM_TOKEN=157286400000 30 | LR_DECAY_TOKEN=157286400000 31 | LR_WARMUP_ITER=3000 32 | CONFIG_TEMPLATE=true 33 | CURRICULUM_STEP=45000 34 | CURRICULUM_MIN=64 35 | TAG="${CONFIG}_s${CURRICULUM_MIN}to${SEQ_LEN}_step${CURRICULUM_STEP}" 36 | 37 | bash ds_pretrain_gpt2.sh $CONFIG $TAG $MODEL_SIZE $LR $BSZ $SEQ_LEN $MP_SIZE $SEED $SAVE_INTERVAL $NUM_ITER $NUM_TOKEN $LR_DECAY_TOKEN $LR_WARMUP_ITER $CONFIG_TEMPLATE $CURRICULUM_STEP $CURRICULUM_MIN 38 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/curriculum_learning/ds_zero_stage_1_config_baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | 
"loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false 26 | } 27 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/curriculum_learning/ds_zero_stage_1_config_curriculum_fixed_linear.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false, 26 | "curriculum_learning": { 27 | "enabled": true, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/evaluate_ict_zeroshot_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model 5 | 6 | # Datasets can be downloaded from the following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task ICT-ZEROSHOT-NQ \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | --hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --checkpoint-activations \ 24 | --seq-length 512 \ 25 | --max-position-embeddings 512 \ 26 | --load ${CHECKPOINT_PATH} \ 27 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 28 | --embedding-path ${EMBEDDING_PATH} \ 29 | --retriever-seq-length 256 \ 30 | --vocab-file bert-vocab.txt\ 31 | --qa-data-test ${QA_FILE} \ 32 | --num-workers 2 \ 33 | --faiss-use-gpu \ 34 | --retriever-report-topk-accuracies 1 5 20 100 \ 35 | --fp16 36 | 37 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- 
/MLPERF4.0/Training/benchmarks/gpt3/examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH=_text_sentence 6 | CHECKPOINT_PATH= 7 | 8 | python pretrain_bert.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --num-attention-heads 16 \ 12 | --micro-batch-size 4 \ 13 | --global-batch-size 8 \ 14 | --seq-length 512 \ 15 | --max-position-embeddings 512 \ 16 | --train-iters 2000000 \ 17 | --lr-decay-iters 990000 \ 18 | --save $CHECKPOINT_PATH \ 19 | --load $CHECKPOINT_PATH \ 20 | --data-path $DATA_PATH \ 21 | --vocab-file bert-vocab.txt \ 22 | --data-impl mmap \ 23 | --split 949,50,1 \ 24 | --lr 0.0001 \ 25 | --min-lr 0.00001 \ 26 | --lr-decay-style linear \ 27 | --lr-warmup-fraction .01 \ 28 | --weight-decay 1e-2 \ 29 | --clip-grad 1.0 \ 30 | --log-interval 100 \ 31 | --save-interval 10000 \ 32 | --eval-interval 1000 \ 33 | --eval-iters 10 \ 34 | --fp16 35 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=_text_document 9 | CHECKPOINT_PATH= 10 | 11 | 12 | python pretrain_gpt.py \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --micro-batch-size 4 \ 17 | --global-batch-size 8 \ 18 | --seq-length 1024 \ 19 | --max-position-embeddings 1024 \ 20 | --train-iters 500000 \ 21 | --lr-decay-iters 320000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file gpt2-vocab.json \ 26 | --merge-file gpt2-merges.txt \ 27 | --data-impl mmap \ 28 | --split 949,50,1 \ 29 | --distributed-backend nccl \ 30 | --lr 0.00015 \ 31 | --min-lr 1.0e-5 \ 32 | --lr-decay-style cosine \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --checkpoint-activations \ 37 | --log-interval 100 \ 38 | --save-interval 10000 \ 39 | --eval-interval 1000 \ 40 | --eval-iters 10 \ 41 | --fp16 42 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH= 6 | VOCAB_FILE= 7 | CHECKPOINT_PATH= 8 | 9 | python pretrain_t5.py \ 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --micro-batch-size 16 \ 18 | --global-batch-size 2048 \ 19 | --max-position-embeddings 512 \ 20 | --train-iters 1000000 \ 21 | --lr-decay-iters 1000000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file $VOCAB_FILE \ 26 | --data-impl mmap \ 27 | --split 949,50,1 \ 28 | --lr 0.0001 \ 29 | --min-lr 0.00001 \ 30 | --lr-decay-style linear \ 31 | --lr-warmup-fraction .01 \ 32 | --weight-decay 1e-2 \ 33 | --clip-grad 1.0 \ 34 | --log-interval 100 \ 35 | --save-interval 10000 \ 36 | --eval-interval 1000 \ 37 | --eval-iters 10 \ 38 | --fp16 39 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/images/cases_april2021.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Training/benchmarks/gpt3/images/cases_april2021.png -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Habana Labs, Ltd. an Intel Company. 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import enum 18 | 19 | class LayerType(enum.Enum): 20 | encoder = 1 21 | decoder = 2 22 | 23 | class AttnType(enum.Enum): 24 | self_attn = 1 25 | cross_attn = 2 26 | 27 | class AttnMaskType(enum.Enum): 28 | padding = 1 29 | causal = 2 30 | prefix = 3 31 | 32 | class PositionEmbeddingType(enum.Enum): 33 | rotary = 1 34 | absolute = 2 35 | alibi = 3 36 | learnable = 4 -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied fron NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | 22 | 23 | #ifndef TORCH_CHECK 24 | #define TORCH_CHECK AT_CHECK 25 | #endif 26 | 27 | #ifdef VERSION_GE_1_3 28 | #define DATA_PTR data_ptr 29 | #else 30 | #define DATA_PTR data 31 | #endif 32 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Habana Labs, Ltd. an Intel Company. 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import torch 18 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 19 | from .rmsnorm import RMSNorm 20 | from .utils import CrossEntropy 21 | 22 | from .distributed import DistributedDataParallel 23 | from .bert_model import BertModel 24 | from .gpt_model import GPTModel, GPTModelPipe 25 | from .llama_model import LLaMAModel, LLaMAModelPipe 26 | from .t5_model import T5Model 27 | from .language_model import get_language_model 28 | from .module import Float16Module 29 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/model/rmsnorm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023, Habana Labs, Ltd. an Intel Company 3 | 4 | import torch 5 | from torch.nn import init 6 | from torch.nn.parameter import Parameter 7 | 8 | class RMSNorm(torch.nn.Module): 9 | def __init__(self, dim, eps=1e-6, sequence_parallel=False): 10 | super().__init__() 11 | self.epsilon = eps 12 | self.weight = Parameter(torch.Tensor(dim)) 13 | init.ones_(self.weight) 14 | 15 | if sequence_parallel: 16 | # set sequence parallelism flag on weight parameter 17 | setattr(self.weight, 'sequence_parallel', True) 18 | 19 | def forward(self, x): 20 | dtype = x.dtype 21 | x = x.float() 22 | norm = torch.mean(x**2, -1, keepdim=True) 23 | norm = x.mul(norm.add_(self.epsilon).rsqrt_()) 24 | return self.weight * norm.to(dtype) 25 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Training/benchmarks/gpt3/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/mlcommons/logging.git@2.1.0-rc1 2 | git+https://github.com/NVIDIA/mlperf-common.git 3 | pybind11 4 | torch 5 | six 6 | regex 7 | numpy 8 | sentencepiece 9 | torchmetrics>=0.8.0 10 | einops -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/tasks/eval_harness/download.py: -------------------------------------------------------------------------------- 1 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 2 | # under the license https://huggingface.co/spaces/bigscience/license 3 | 4 | # Downloads the specified taks in the evaluation harness 5 | # This is particularly useful when running in environments where the GPU nodes 6 | # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. 7 | 8 | from lm_eval import tasks 9 | from lm_eval.tasks import ALL_TASKS 10 | import argparse 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) 15 | parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.') 16 | args = parser.parse_args() 17 | 18 | def main(): 19 | task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') 20 | tasks.get_task_dict(task_list) 21 | 22 | if __name__ == '__main__': 23 | main() 24 | 25 | 26 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/tests/ds_config_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "train_batch_size": 16, 4 | "gradient_clipping": 1.0, 5 | "zero_optimization": { 6 | "stage": 0 7 | }, 8 | "bf16": { 9 | "enabled": true 10 | }, 11 | "zero_allow_untested_optimizer": true, 12 | "steps_per_print": 2000, 13 | "wall_clock_breakdown": false 14 | } 15 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/MLPERF4.0/Training/benchmarks/gpt3/tools/__init__.py -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/tools/convert_checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | from .verify_checkpoint_non_tp_consistency import verify_checkpoint -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/tools/convert_checkpoint/common_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "optimizer": { 4 | "param_groups": [ 5 | { 6 | 
"wd_mult": 1.0, 7 | "lr_mult": 1.0, 8 | "lr": 5.984178321979609e-05, 9 | "bias_correction": true, 10 | "betas": [ 11 | 0.9, 12 | 0.95 13 | ], 14 | "eps": 1e-08, 15 | "weight_decay": 0.1, 16 | "step": 4000 17 | } 18 | ] 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/tools/convert_checkpoint/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os 4 | from collections import OrderedDict 5 | 6 | 7 | def dump_data(datum, name_list=[]): 8 | if type(datum) in (dict, OrderedDict): 9 | for k, v in datum.items(): 10 | dump_data(v, name_list+[str(k)]) 11 | elif type(datum) in (list, tuple): 12 | for v in datum: 13 | dump_data(v, name_list) 14 | elif torch.is_tensor(datum): 15 | prefix = '.'.join(name_list) 16 | print(f'[tensor] {prefix} = {datum.shape}') 17 | else: 18 | #pass 19 | prefix = '.'.join(name_list) 20 | print(f'[other] {prefix} = {datum}') 21 | 22 | def main(): 23 | if len(sys.argv) < 2: 24 | print(f'Usage: {sys.argv[0]} ') 25 | exit(1) 26 | 27 | ckpt_file = sys.argv[1] 28 | if not os.path.isfile(ckpt_file): 29 | print(f'{ckpt_file} is not a valid file') 30 | exit(1) 31 | 32 | print(f'loading checkpoint file: {ckpt_file}') 33 | sd = torch.load(ckpt_file, map_location=torch.device('cpu')) 34 | dump_data(sd) 35 | 36 | quit() 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/gpt3/tools/create_doc_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 4 | os.path.pardir))) 5 | 6 | from megatron import print_rank_0 7 | from megatron.indexer import IndexBuilder 8 | from megatron.initialize import initialize_megatron 9 | 10 | 11 | def main(): 12 | """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset 13 | - Include all args needed for initial model specification 14 | 15 | Other key args: 16 | --block-data-path: path to write to 17 | --ict-load or --realm-load: path to checkpoint with which to embed 18 | --data-path and --titles-data-path: paths for dataset 19 | --indexer-log-interval: reporting interval 20 | --indexer-batch-size: size specific for indexer jobs 21 | 22 | Check README.md for example script 23 | """ 24 | 25 | initialize_megatron(extra_args_provider=None, 26 | args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) 27 | index_builder = IndexBuilder() 28 | index_builder.build_and_save_index() 29 | print_rank_0("Build and save indices: done!") 30 | 31 | if __name__ == "__main__": 32 | main() 33 | 34 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/llm_finetune/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "regisss/llama2-70b-fused-qkv-mlperf", 3 | "architectures": [ 4 | "LlamaForCausalLM" 5 | ], 6 | "fused_qkv": true, 7 | "attention_bias": false, 8 | "attention_dropout": 0.0, 9 | "bos_token_id": 1, 10 | "eos_token_id": 2, 11 | "hidden_act": "silu", 12 | "hidden_size": 8192, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 28672, 15 | "max_position_embeddings": 8192, 16 | "model_type": "llama", 17 | "num_attention_heads": 64, 18 | "num_hidden_layers": 80, 19 | 
"num_key_value_heads": 8, 20 | "pretraining_tp": 1, 21 | "rms_norm_eps": 1e-05, 22 | "rope_scaling": null, 23 | "rope_theta": 10000.0, 24 | "tie_word_embeddings": false, 25 | "torch_dtype": "float32", 26 | "transformers_version": "4.37.2", 27 | "use_cache": false, 28 | "vocab_size": 32000 29 | } 30 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/llm_finetune/configs/ds_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "steps_per_print": 1, 3 | "train_batch_size": "auto", 4 | "train_micro_batch_size_per_gpu": "auto", 5 | "gradient_accumulation_steps": "auto", 6 | "bf16": { 7 | "enabled": true 8 | }, 9 | "gradient_clipping": "auto", 10 | "zero_optimization": { 11 | "stage": 3, 12 | "overlap_comm": false, 13 | "contiguous_gradients": false, 14 | "stage3_gather_16bit_weights_on_model_save": true 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/llm_finetune/ops_bf16.txt: -------------------------------------------------------------------------------- 1 | addmm 2 | addbmm 3 | batch_norm 4 | baddbmm 5 | bmm 6 | conv1d 7 | conv2d 8 | conv3d 9 | conv_transpose1d 10 | conv_transpose2d 11 | conv_transpose3d 12 | dot 13 | dropout 14 | feature_dropout 15 | group_norm 16 | instance_norm 17 | layer_norm 18 | leaky_relu 19 | linear 20 | matmul 21 | mean 22 | mm 23 | mul 24 | mv 25 | softmax 26 | log_softmax 27 | sin 28 | cos 29 | add 30 | div 31 | gather 32 | embedding 33 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/llm_finetune/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/mlcommons/logging.git 2 | datasets==2.17.1 3 | torch >= 1.3 4 | datasets >= 2.4.0 5 | sentencepiece != 0.1.92 6 | protobuf 7 | evaluate 8 | scikit-learn 9 | peft >= 0.10.0 10 | -------------------------------------------------------------------------------- /MLPERF4.0/Training/benchmarks/llm_finetune/scripts/create_warmup_data.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company 3 | ############################################################################### 4 | 5 | import json 6 | 7 | # Opening JSON file 8 | import numpy as np 9 | # returns JSON object as 10 | # a dictionary 11 | 12 | for i in range(160): 13 | train_dict = {} 14 | train_dict["input_ids"]=np.random.randint(8192, size=(8192)).tolist() 15 | train_dict["labels"]=np.random.randint(8192, size=(8192)).tolist() 16 | tweets = [] 17 | with open("./train_warmup.json", "a") as outfile: 18 | json.dump(train_dict, outfile) 19 | outfile.write('\n') 20 | 21 | for i in range(173): 22 | train_dict = {} 23 | train_dict["input_ids"]=np.random.randint(8192, size=(8192)).tolist() 24 | train_dict["labels"]=[-100] * 8192 25 | tweets = [] 26 | with open("./eval_warmup.json", "a") as outfile: 27 | json.dump(train_dict, outfile) 28 | outfile.write('\n') 29 | 30 | # Closing file 31 | -------------------------------------------------------------------------------- /PyTorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/__init__.py -------------------------------------------------------------------------------- /PyTorch/audio/wav2vec2/inference/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.41.1 2 | datasets>=2.8.0,<=2.10.0 3 | soundfile==0.11.0 4 | librosa==0.9.2 5 | jiwer 6 | fsspec==2023.9.2 7 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/ViT/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Habana Labs, Ltd. an Intel Company 4 | Copyright (c) 2020 jeonsworld 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/ViT/img/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/classification/ViT/img/figure1.png -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/ViT/img/figure2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/classification/ViT/img/figure2.png -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/ViT/img/figure3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/classification/ViT/img/figure3.png -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/ViT/ops_bf16.txt: -------------------------------------------------------------------------------- 1 | add 2 | addmm 3 | bmm 4 | dot 5 | iadd 6 | layer_norm 7 | matmul 8 | mm 9 | rsub 10 | softmax 11 | mul 12 | mean 13 | dropout 14 | linear 15 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/ViT/ops_fp32.txt: -------------------------------------------------------------------------------- 1 | cross_entropy 2 | log_softmax 3 | embedding 4 | binary_cross_entropy 5 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/ViT/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | tqdm 3 | scipy 4 | ml-collections 5 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/ViT/vit_utils/dist_util.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | 3 | def get_rank(): 4 | if not dist.is_available(): 5 | return 0 6 | if not dist.is_initialized(): 7 | return 0 8 | return dist.get_rank() 9 | 10 | def get_world_size(): 11 | if not dist.is_available(): 12 | return 1 13 | if not dist.is_initialized(): 14 | return 1 15 | return dist.get_world_size() 16 | 17 | def is_main_process(): 18 | return get_rank() == 0 19 | 20 | def format_step(step): 21 | if isinstance(step, str): 22 | return step 23 | s = "" 24 | if len(step) > 0: 25 | s += "Training Epoch: {} ".format(step[0]) 26 | if len(step) > 1: 27 | s += "Training Iteration: {} ".format(step[1]) 28 | if len(step) > 2: 29 | s += "Validation Iteration: {} ".format(step[2]) 30 | return s 31 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/torchvision/media_pipe_settings.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2022 Habana Labs, Ltd. 
an Intel Company 2 | 3 | TRAIN_RESIZE_DIM = 224 4 | EVAL_RESIZE_DIM = 256 5 | CROP_DIM = 224 6 | 7 | DECODER_SCALE_MIN = 0.08 8 | DECODER_SCALE_MAX = 1.0 9 | DECODER_RATIO_MIN = 0.75 10 | DECODER_RATIO_MAX = 1.3333333333333333 11 | 12 | USE_HORIZONTAL_FLIP = 1 13 | FLIP_PROBABILITY = 0.5 14 | 15 | RGB_MEAN_VALUES = [0.485, 0.456, 0.406] 16 | RGB_STD_VALUES = [0.229, 0.224, 0.225] 17 | RGB_MULTIPLIER = 255 18 | 19 | EVAL_CROP_X = 0.5 20 | EVAL_CROP_Y = 0.5 21 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/torchvision/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import * 2 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/torchvision/model/utils.py: -------------------------------------------------------------------------------- 1 | try: 2 | from torch.hub import load_state_dict_from_url 3 | except ImportError: 4 | from torch.utils.model_zoo import load_url as load_state_dict_from_url 5 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/torchvision/ops_bf16_Resnet.txt: -------------------------------------------------------------------------------- 1 | addmm 2 | avg_pool2d 3 | bmm 4 | conv2d 5 | dot 6 | max_pool2d 7 | mm 8 | mv 9 | relu 10 | t 11 | linear -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/torchvision/ops_fp32_Resnet.txt: -------------------------------------------------------------------------------- 1 | cross_entropy 2 | log_softmax 3 | nll_loss 4 | softmax 5 | topk 6 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/torchvision/requirements.txt: -------------------------------------------------------------------------------- 1 | mpi4py>=3.0.3 2 | scipy>=1.7.1 3 | colorlog==6.6.0 4 | numpy==1.23.5 5 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/classification/torchvision/requirements_u24.txt: -------------------------------------------------------------------------------- 1 | mpi4py>=3.0.3 2 | scipy>=1.7.1 3 | colorlog==6.6.0 4 | numpy==1.26.4 5 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/assets/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/detection/yolox/assets/demo.png -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/assets/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/detection/yolox/assets/dog.jpg -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/assets/git_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/detection/yolox/assets/git_fig.png 
-------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/detection/yolox/assets/logo.png -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/MegEngine/python/README.md: -------------------------------------------------------------------------------- 1 | # YOLOX-Python-MegEngine 2 | 3 | Python version of YOLOX object detection based on [MegEngine](https://github.com/MegEngine/MegEngine). 4 | 5 | ## Tutorial 6 | 7 | ### Step1: install requirements 8 | 9 | ``` 10 | python3 -m pip install megengine -f https://megengine.org.cn/whl/mge.html 11 | ``` 12 | 13 | ### Step2: convert checkpoint weights from torch's .pth file 14 | 15 | ``` 16 | python3 convert_weights.py -w yolox_s.pth -o yolox_s_mge.pkl 17 | ``` 18 | 19 | ### Step3: run demo 20 | 21 | This part is the same as torch's python demo, but there is no need to specify a device. 22 | 23 | ``` 24 | python3 demo.py image -n yolox-s -c yolox_s_mge.pkl --path ../../../assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result 25 | ``` 26 | 27 | ### [Optional]Step4: dump model for cpp inference 28 | 29 | > **Note**: the resulting model is dumped with `optimize_for_inference` and `enable_fuse_conv_bias_nonlinearity`. 30 | 31 | ``` 32 | python3 dump.py -n yolox-s -c yolox_s_mge.pkl --dump_path yolox_s.mge 33 | ``` 34 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/MegEngine/python/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | from .darknet import CSPDarknet, Darknet 6 | from .yolo_fpn import YOLOFPN 7 | from .yolo_head import YOLOXHead 8 | from .yolo_pafpn import YOLOPAFPN 9 | from .yolox import YOLOX 10 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/MegEngine/python/models/yolox.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | import megengine.module as M 6 | 7 | from .yolo_head import YOLOXHead 8 | from .yolo_pafpn import YOLOPAFPN 9 | 10 | 11 | class YOLOX(M.Module): 12 | """ 13 | YOLOX model module. The module list is defined by create_yolov3_modules function. 14 | The network returns loss values from three YOLO layers during training 15 | and detection results during test.
16 | """ 17 | 18 | def __init__(self, backbone=None, head=None): 19 | super().__init__() 20 | if backbone is None: 21 | backbone = YOLOPAFPN() 22 | if head is None: 23 | head = YOLOXHead(80) 24 | 25 | self.backbone = backbone 26 | self.head = head 27 | 28 | def forward(self, x): 29 | # fpn output content features of [dark3, dark4, dark5] 30 | fpn_outs = self.backbone(x) 31 | assert not self.training 32 | outputs = self.head(fpn_outs) 33 | 34 | return outputs 35 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/OpenVINO/README.md: -------------------------------------------------------------------------------- 1 | ## YOLOX for OpenVINO 2 | 3 | * [C++ Demo](./cpp) 4 | * [Python Demo](./python) -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/OpenVINO/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.4.1) 2 | set(CMAKE_CXX_STANDARD 14) 3 | 4 | project(yolox_openvino_demo) 5 | 6 | find_package(OpenCV REQUIRED) 7 | find_package(InferenceEngine REQUIRED) 8 | find_package(ngraph REQUIRED) 9 | 10 | include_directories( 11 | ${OpenCV_INCLUDE_DIRS} 12 | ${CMAKE_CURRENT_SOURCE_DIR} 13 | ${CMAKE_CURRENT_BINARY_DIR} 14 | ) 15 | 16 | add_executable(yolox_openvino yolox_openvino.cpp) 17 | 18 | target_link_libraries( 19 | yolox_openvino 20 | ${InferenceEngine_LIBRARIES} 21 | ${NGRAPH_LIBRARIES} 22 | ${OpenCV_LIBS} 23 | ) -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/README.md: -------------------------------------------------------------------------------- 1 | # YOLOX-Android-ncnn 2 | 3 | Andoird app of YOLOX object detection base on [ncnn](https://github.com/Tencent/ncnn) 4 | 5 | 6 | ## Tutorial 7 | 8 | ### Step1 9 | 10 | Download ncnn-android-vulkan.zip from [releases of ncnn](https://github.com/Tencent/ncnn/releases). This repo uses 11 | [20210525 release](https://github.com/Tencent/ncnn/releases/download/20210525/ncnn-20210525-android-vulkan.zip) for building. 12 | 13 | ### Step2 14 | 15 | After downloading, please extract your zip file. Then, there are two ways to finish this step: 16 | * put your extracted directory into **app/src/main/jni** 17 | * change the **ncnn_DIR** path in **app/src/main/jni/CMakeLists.txt** to your extracted directory 18 | 19 | ### Step3 20 | Download example param and bin file from [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ESXBH_GSSmFMszWJ6YG2VkQB5cWDfqVWXgk0D996jH0rpQ?e=qzEqUh) or [github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s_ncnn.tar.gz). Unzip the file to **app/src/main/assets**. 21 | 22 | ### Step4 23 | Open this project with Android Studio, build it and enjoy! 
24 | 25 | ## Reference 26 | 27 | * [ncnn-android-yolov5](https://github.com/nihui/ncnn-android-yolov5) 28 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/app/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'com.android.application' 2 | 3 | android { 4 | compileSdkVersion 24 5 | buildToolsVersion "29.0.2" 6 | 7 | defaultConfig { 8 | applicationId "com.megvii.yoloXncnn" 9 | archivesBaseName = "$applicationId" 10 | 11 | ndk { 12 | moduleName "ncnn" 13 | abiFilters "armeabi-v7a", "arm64-v8a" 14 | } 15 | minSdkVersion 24 16 | } 17 | 18 | externalNativeBuild { 19 | cmake { 20 | version "3.10.2" 21 | path file('src/main/jni/CMakeLists.txt') 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/app/src/main/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/YOLOXncnn.java: -------------------------------------------------------------------------------- 1 | // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved. 2 | 3 | package com.megvii.yoloXncnn; 4 | 5 | import android.content.res.AssetManager; 6 | import android.graphics.Bitmap; 7 | 8 | public class YOLOXncnn 9 | { 10 | public native boolean Init(AssetManager mgr); 11 | 12 | public class Obj 13 | { 14 | public float x; 15 | public float y; 16 | public float w; 17 | public float h; 18 | public String label; 19 | public float prob; 20 | } 21 | 22 | public native Obj[] Detect(Bitmap bitmap, boolean use_gpu); 23 | 24 | static { 25 | System.loadLibrary("yoloXncnn"); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/yoloXncnn.java: -------------------------------------------------------------------------------- 1 | // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved. 
2 | 3 | package com.megvii.yoloXncnn; 4 | 5 | import android.content.res.AssetManager; 6 | import android.graphics.Bitmap; 7 | 8 | public class YOLOXncnn 9 | { 10 | public native boolean Init(AssetManager mgr); 11 | 12 | public class Obj 13 | { 14 | public float x; 15 | public float y; 16 | public float w; 17 | public float h; 18 | public String label; 19 | public float prob; 20 | } 21 | 22 | public native Obj[] Detect(Bitmap bitmap, boolean use_gpu); 23 | 24 | static { 25 | System.loadLibrary("yoloXncnn"); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/app/src/main/jni/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(yoloXncnn) 2 | 3 | cmake_minimum_required(VERSION 3.4.1) 4 | 5 | set(ncnn_DIR ${CMAKE_SOURCE_DIR}/ncnn-20210525-android-vulkan/${ANDROID_ABI}/lib/cmake/ncnn) 6 | find_package(ncnn REQUIRED) 7 | 8 | add_library(yoloXncnn SHARED yoloXncnn_jni.cpp) 9 | 10 | target_link_libraries(yoloXncnn 11 | ncnn 12 | 13 | jnigraphics 14 | ) 15 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/app/src/main/res/values/strings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | yoloXncnn 4 | 5 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/build.gradle: -------------------------------------------------------------------------------- 1 | // Top-level build file where you can add configuration options common to all sub-projects/modules. 2 | buildscript { 3 | repositories { 4 | jcenter() 5 | google() 6 | } 7 | dependencies { 8 | classpath 'com.android.tools.build:gradle:3.5.0' 9 | } 10 | } 11 | 12 | allprojects { 13 | repositories { 14 | jcenter() 15 | google() 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/detection/yolox/demo/ncnn/android/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Sun Aug 25 10:34:48 CST 2019 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-5.4.1-all.zip 7 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/demo/ncnn/android/settings.gradle: -------------------------------------------------------------------------------- 1 | include ':app' 2 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build -------------------------------------------------------------------------------- 
/PyTorch/computer_vision/detection/yolox/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * some extra css to make markdown look similar between github/sphinx 4 | */ 5 | 6 | /* 7 | * Below is for install.md: 8 | */ 9 | .rst-content code { 10 | white-space: pre; 11 | border: 0px; 12 | } 13 | 14 | .rst-content th { 15 | border: 1px solid #e1e4e5; 16 | } 17 | 18 | .rst-content th p { 19 | /* otherwise will be default 24px for regular paragraph */ 20 | margin-bottom: 0px; 21 | } 22 | 23 | .rst-content .line-block { 24 | /* otherwise will be 24px */ 25 | margin-bottom: 0px; 26 | } 27 | 28 | div.section > details { 29 | padding-bottom: 1em; 30 | } 31 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/demo/megengine_cpp_readme.md: -------------------------------------------------------------------------------- 1 | ../../demo/MegEngine/cpp/README.md -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/demo/megengine_py_readme.md: -------------------------------------------------------------------------------- 1 | ../../demo/MegEngine/python/README.md -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/demo/ncnn_android_readme.md: -------------------------------------------------------------------------------- 1 | ../../demo/ncnn/android/README.md -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/demo/ncnn_cpp_readme.md: -------------------------------------------------------------------------------- 1 | ../../demo/ncnn/cpp/README.md -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/demo/onnx_readme.md: -------------------------------------------------------------------------------- 1 | ../../demo/ONNXRuntime/README.md -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/demo/openvino_cpp_readme.md: -------------------------------------------------------------------------------- 1 | ../../demo/OpenVINO/cpp/README.md -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/demo/openvino_py_readme.md: 
-------------------------------------------------------------------------------- 1 | ../../demo/OpenVINO/python/README.md -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/demo/trt_cpp_readme.md: -------------------------------------------------------------------------------- 1 | ../../demo/TensorRT/cpp/README.md -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/demo/trt_py_readme.md: -------------------------------------------------------------------------------- 1 | ../../demo/TensorRT/python/README.md -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Welcome to YOLOX's documentation! 3 | ====================================== 4 | 5 | .. image:: ../assets/logo.png 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Quick Run 10 | 11 | quick_run 12 | model_zoo 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | :caption: Tutorials 17 | 18 | train_custom_data 19 | 20 | .. toctree:: 21 | :maxdepth: 2 22 | :caption: Demployment 23 | 24 | demo/trt_py_readme 25 | demo/trt_cpp_readme 26 | demo/megengine_cpp_readme 27 | demo/megengine_py_readme 28 | demo/ncnn_android_readme 29 | demo/ncnn_cpp_readme 30 | demo/onnx_readme 31 | demo/openvino_py_readme 32 | demo/openvino_cpp_readme -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/docs/requirements-doc.txt: -------------------------------------------------------------------------------- 1 | docutils==0.16 2 | # https://github.com/sphinx-doc/sphinx/commit/7acd3ada3f38076af7b2b5c9f3b60bb9c2587a3d 3 | sphinx==3.2.0 4 | recommonmark==0.6.0 5 | sphinx_rtd_theme 6 | omegaconf>=2.1.0.dev24 7 | hydra-core>=1.1.0.dev5 8 | sphinx-markdown-tables==0.0.15 9 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/download_dataset.sh: -------------------------------------------------------------------------------- 1 | # Get COCO 2017 data sets 2 | dir=$(pwd) 3 | mkdir -p /data/COCO; cd /data/COCO 4 | curl -O http://images.cocodataset.org/zips/train2017.zip; unzip train2017.zip 5 | curl -O http://images.cocodataset.org/zips/val2017.zip; unzip val2017.zip 6 | curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip; unzip annotations_trainval2017.zip 7 | cd $dir 8 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/exps/default/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/exps/default/yolov3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 
4 | 5 | import os 6 | 7 | import torch.nn as nn 8 | 9 | from yolox.exp import Exp as MyExp 10 | 11 | 12 | class Exp(MyExp): 13 | def __init__(self): 14 | super(Exp, self).__init__() 15 | self.depth = 1.0 16 | self.width = 1.0 17 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 18 | 19 | def get_model(self, sublinear=False): 20 | def init_yolo(M): 21 | for m in M.modules(): 22 | if isinstance(m, nn.BatchNorm2d): 23 | m.eps = 1e-3 24 | m.momentum = 0.03 25 | if "model" not in self.__dict__: 26 | from yolox.models import YOLOX, YOLOFPN, YOLOXHead 27 | backbone = YOLOFPN() 28 | head = YOLOXHead(self.num_classes, self.width, in_channels=[128, 256, 512], act="lrelu") 29 | self.model = YOLOX(backbone, head) 30 | self.model.apply(init_yolo) 31 | self.model.head.initialize_biases(1e-2) 32 | 33 | return self.model 34 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/exps/default/yolox_l.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 1.0 14 | self.width = 1.0 15 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 16 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/exps/default/yolox_m.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 0.67 14 | self.width = 0.75 15 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 16 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/exps/default/yolox_s.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 0.33 14 | self.width = 0.50 15 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 16 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/exps/default/yolox_tiny.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 
4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 0.33 14 | self.width = 0.375 15 | self.input_size = (416, 416) 16 | self.mosaic_scale = (0.5, 1.5) 17 | self.random_size = (10, 20) 18 | self.test_size = (416, 416) 19 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 20 | self.enable_mixup = False 21 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/exps/default/yolox_x.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | import os 6 | 7 | from yolox.exp import Exp as MyExp 8 | 9 | 10 | class Exp(MyExp): 11 | def __init__(self): 12 | super(Exp, self).__init__() 13 | self.depth = 1.33 14 | self.width = 1.25 15 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 16 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/exps/example/custom/yolox_s.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | import os 5 | 6 | from yolox.exp import Exp as MyExp 7 | 8 | 9 | class Exp(MyExp): 10 | def __init__(self): 11 | super(Exp, self).__init__() 12 | self.depth = 0.33 13 | self.width = 0.50 14 | self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] 15 | 16 | # Define yourself dataset path 17 | self.data_dir = "datasets/coco128" 18 | self.train_ann = "instances_train2017.json" 19 | self.val_ann = "instances_val2017.json" 20 | 21 | self.num_classes = 71 22 | 23 | self.max_epoch = 300 24 | self.data_num_workers = 4 25 | self.eval_interval = 1 26 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/hubconf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | """ 5 | Usage example: 6 | import torch 7 | model = torch.hub.load("Megvii-BaseDetection/YOLOX", "yolox_s") 8 | """ 9 | dependencies = ["torch"] 10 | 11 | from yolox.models import ( # isort:skip # noqa: F401, E402 12 | yolox_tiny, 13 | yolox_nano, 14 | yolox_s, 15 | yolox_m, 16 | yolox_l, 17 | yolox_x, 18 | yolov3, 19 | ) 20 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/ops_bf16_yolox.txt: -------------------------------------------------------------------------------- 1 | max_pool2d_with_indices 2 | max_pool2d 3 | conv2d 4 | bmm 5 | mul 6 | mm 7 | mv 8 | div 9 | batch_norm 10 | sigmoid 11 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/ops_fp32_yolox.txt: -------------------------------------------------------------------------------- 1 | cross_entropy 2 | log_softmax 3 | nll_loss 4 | softmax 5 | binary_cross_entropy 6 | l1_loss 7 | view 8 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/requirements.txt: -------------------------------------------------------------------------------- 1 | # TODO: Update with exact module version 2 | opencv_python 3 | 
loguru 4 | scikit-image 5 | pillow==10.3.0 6 | thop 7 | ninja 8 | tabulate 9 | cython 10 | 11 | # verified versions 12 | # pycocotools corresponds to https://github.com/ppwwyyxx/cocoapi 13 | pycocotools>=2.0.2 14 | 15 | # no need onnox for hpu enablement 16 | # latest version of thop require onnx 17 | #onnx==1.8.1 18 | onnxruntime==1.18.1 19 | #onnx-simplifier==0.3.5 20 | 21 | 22 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 100 3 | multi_line_output = 3 4 | balanced_wrapping = True 5 | known_standard_library = setuptools 6 | known_third_party = tqdm,loguru 7 | known_data_processing = cv2,numpy,scipy,PIL,matplotlib,scikit_image 8 | known_datasets = pycocotools 9 | known_deeplearning = torch,torchvision,caffe2,onnx,apex,timm,thop,torch2trt,tensorrt,openvino,onnxruntime 10 | known_myself = yolox 11 | sections = FUTURE,STDLIB,THIRDPARTY,data_processing,datasets,deeplearning,myself,FIRSTPARTY,LOCALFOLDER 12 | no_lines_before=STDLIB,THIRDPARTY,datasets 13 | default_section = FIRSTPARTY 14 | 15 | [flake8] 16 | max-line-length = 100 17 | max-complexity = 18 18 | exclude = __init__.py 19 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/tools/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | from .utils import configure_module 5 | 6 | configure_module() 7 | 8 | __version__ = "0.2.0" 9 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/core/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | from .launch import launch 6 | from .trainer import Trainer 7 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/data/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 
4 | 5 | from .data_augment import TrainTransform, ValTransform 6 | from .data_prefetcher import DataPrefetcher 7 | from .dataloading import DataLoader, get_yolox_datadir, worker_init_reset_seed 8 | from .datasets import * 9 | from .samplers import InfiniteSampler, YoloBatchSampler 10 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | from .coco import COCODataset 6 | from .coco_classes import COCO_CLASSES 7 | from .datasets_wrapper import ConcatDataset, Dataset, MixConcatDataset 8 | from .mosaicdetection import MosaicDetection 9 | from .voc import VOCDetection 10 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/data/datasets/voc_classes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | # VOC_CLASSES = ( '__background__', # always index 0 6 | VOC_CLASSES = ( 7 | "aeroplane", 8 | "bicycle", 9 | "bird", 10 | "boat", 11 | "bottle", 12 | "bus", 13 | "car", 14 | "cat", 15 | "chair", 16 | "cow", 17 | "diningtable", 18 | "dog", 19 | "horse", 20 | "motorbike", 21 | "person", 22 | "pottedplant", 23 | "sheep", 24 | "sofa", 25 | "train", 26 | "tvmonitor", 27 | ) 28 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii, Inc. and its affiliates. 4 | 5 | from .coco_evaluator import COCOEvaluator 6 | from .voc_evaluator import VOCEvaluator 7 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/exp/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii Inc. All rights reserved. 4 | 5 | from .base_exp import BaseExp 6 | from .build import get_exp 7 | from .yolox_base import Exp 8 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/exp/default/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii Inc. All rights reserved. 
4 | 5 | # This file is used for package installation and find default exp file 6 | 7 | import importlib 8 | import sys 9 | from pathlib import Path 10 | 11 | _EXP_PATH = Path(__file__).resolve().parent.parent.parent.parent / "exps" / "default" 12 | 13 | if _EXP_PATH.is_dir(): 14 | # This is true only for in-place installation (pip install -e, setup.py develop), 15 | # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230 16 | 17 | class _ExpFinder(importlib.abc.MetaPathFinder): 18 | 19 | def find_spec(self, name, path, target=None): 20 | if not name.startswith("yolox.exp.default"): 21 | return 22 | project_name = name.split(".")[-1] + ".py" 23 | target_file = _EXP_PATH / project_name 24 | if not target_file.is_file(): 25 | return 26 | return importlib.util.spec_from_file_location(name, target_file) 27 | 28 | sys.meta_path.append(_ExpFinder()) 29 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/layers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | from .fast_coco_eval_api import COCOeval_opt 6 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/layers/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | #include "cocoeval/cocoeval.h" 2 | 3 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 4 | m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate"); 5 | m.def( 6 | "COCOevalEvaluateImages", 7 | &COCOeval::EvaluateImages, 8 | "COCOeval::EvaluateImages"); 9 | pybind11::class_(m, "InstanceAnnotation") 10 | .def(pybind11::init()); 11 | pybind11::class_(m, "ImageEvaluation") 12 | .def(pybind11::init<>()); 13 | } 14 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii Inc. All rights reserved. 4 | 5 | from .build import * 6 | from .darknet import CSPDarknet, Darknet 7 | from .losses import IOUloss 8 | from .yolo_fpn import YOLOFPN 9 | from .yolo_head import YOLOXHead 10 | from .yolo_pafpn import YOLOPAFPN 11 | from .yolox import YOLOX 12 | try: 13 | from .yolo_head_script import YOLOXHeadScript 14 | except RuntimeError: 15 | pass -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/tools/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 4 | 5 | # This file is used for package installation. Script of train/eval/export will be available. 
6 | 7 | import importlib 8 | import sys 9 | from pathlib import Path 10 | 11 | _TOOLS_PATH = Path(__file__).resolve().parent.parent.parent / "tools" 12 | 13 | if _TOOLS_PATH.is_dir(): 14 | # This is true only for in-place installation (pip install -e, setup.py develop), 15 | # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230 16 | 17 | class _PathFinder(importlib.abc.MetaPathFinder): 18 | 19 | def find_spec(self, name, path, target=None): 20 | if not name.startswith("yolox.tools."): 21 | return 22 | project_name = name.split(".")[-1] + ".py" 23 | target_file = _TOOLS_PATH / project_name 24 | if not target_file.is_file(): 25 | return 26 | return importlib.util.spec_from_file_location(name, target_file) 27 | 28 | sys.meta_path.append(_PathFinder()) 29 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # Copyright (c) Megvii Inc. All rights reserved. 4 | 5 | from .allreduce_norm import * 6 | from .boxes import * 7 | from .checkpoint import load_ckpt, save_checkpoint 8 | from .compat import meshgrid 9 | from .demo_utils import * 10 | from .dist import * 11 | from .ema import * 12 | from .logger import WandbLogger, setup_logger 13 | from .lr_scheduler import LRScheduler 14 | from .metric import * 15 | from .model_utils import * 16 | from .setup_env import * 17 | from .visualize import * 18 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/detection/yolox/yolox/utils/compat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | import torch 5 | 6 | _TORCH_VER = [int(x) for x in torch.__version__.split(".")[:2]] 7 | 8 | __all__ = ["meshgrid"] 9 | 10 | 11 | def meshgrid(*tensors): 12 | if _TORCH_VER >= [1, 10]: 13 | return torch.meshgrid(*tensors, indexing="ij") 14 | else: 15 | return torch.meshgrid(*tensors) 16 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/segmentation/Unet/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3 2 | FROM ${FROM_IMAGE_NAME} 3 | 4 | ADD . 
/workspace/nnunet_pyt 5 | WORKDIR /workspace/nnunet_pyt 6 | 7 | RUN pip install --upgrade pip 8 | RUN pip install --disable-pip-version-check -r requirements.txt 9 | RUN pip install pytorch-lightning==1.0.0 --no-dependencies 10 | RUN pip install monai==0.4.0 --no-dependencies 11 | RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/ nvidia-dali-cuda110==0.30.0 12 | RUN pip install torch_optimizer==0.0.1a15 --no-dependencies 13 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 14 | RUN unzip -qq awscliv2.zip 15 | RUN ./aws/install 16 | RUN rm -rf awscliv2.zip aws 17 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/segmentation/Unet/config/ops_bf16_unet.txt: -------------------------------------------------------------------------------- 1 | addmm 2 | avg_pool2d 3 | bmm 4 | conv_transpose1d 5 | conv_transpose2d 6 | conv_transpose3d 7 | conv1d 8 | conv2d 9 | conv3d 10 | dot 11 | leaky_relu 12 | linear 13 | matmul 14 | max_pool2d 15 | mm 16 | mv 17 | relu 18 | t -------------------------------------------------------------------------------- /PyTorch/computer_vision/segmentation/Unet/config/ops_fp32_unet.txt: -------------------------------------------------------------------------------- 1 | cross_entropy 2 | log_softmax 3 | nll_loss 4 | softmax 5 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/segmentation/Unet/images/unet3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/segmentation/Unet/images/unet3d.png -------------------------------------------------------------------------------- /PyTorch/computer_vision/segmentation/Unet/lightning_trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/segmentation/Unet/lightning_trainer/__init__.py -------------------------------------------------------------------------------- /PyTorch/computer_vision/segmentation/Unet/requirements.txt: -------------------------------------------------------------------------------- 1 | # The extra-index-url is for downloading the nvidia package \ 2 | # It is used only as a fallback mechanism if download fails from \ 3 | # primary source 4 | --extra-index-url https://developer.download.nvidia.com/compute/redist 5 | git+https://github.com/NVIDIA/dllogger 6 | nibabel==3.2.1 7 | scikit-learn==1.2.1 ; python_version <= '3.8' 8 | scikit-learn==1.5.0 ; python_version > '3.8' 9 | pynvml==12.0.0 10 | pytorch_ranger==0.1.1 11 | dropblock==0.3.0 12 | monai==1.4.0 13 | nvidia-dali-cuda110==1.32.0 14 | torch-optimizer==0.0.1a15 15 | scikit-image==0.19.3 16 | awscli 17 | lightning==2.5.1 18 | lightning-habana==1.6.0 19 | numpy==1.24.0 20 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/segmentation/Unet/requirements_u22.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/NVIDIA/dllogger 2 | nibabel==3.2.1 3 | scikit-learn==1.5.0 4 | pynvml==12.0.0 5 | pytorch_ranger==0.1.1 6 | dropblock==0.3.0 7 | monai==1.4.0 8 | nvidia-dali-cuda110==1.32.0 9 | torch-optimizer==0.0.1a15 10 | 
scikit-image==0.19.3 11 | awscli 12 | lightning==2.5.1 13 | lightning-habana==1.6.0 14 | numpy==1.24.0 15 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/segmentation/Unet/requirements_u24.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/NVIDIA/dllogger 2 | nibabel==3.2.1 3 | scikit-learn==1.5.0 4 | pynvml==12.0.0 5 | pytorch_ranger==0.1.1 6 | dropblock==0.3.0 7 | monai==1.4.0 8 | nvidia-dali-cuda110==1.34.0 9 | torch-optimizer==0.0.1a15 10 | scikit-image==0.24.0 11 | awscli 12 | lightning==2.5.1 13 | lightning-habana==1.6.0 14 | numpy==1.26.4 15 | -------------------------------------------------------------------------------- /PyTorch/computer_vision/segmentation/Unet/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/computer_vision/segmentation/Unet/utils/__init__.py -------------------------------------------------------------------------------- /PyTorch/examples/DeepSpeed/cifar_example/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "steps_per_print": 2000, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.001, 8 | "betas": [ 9 | 0.8, 10 | 0.999 11 | ], 12 | "eps": 1e-8, 13 | "weight_decay": 3e-7 14 | } 15 | }, 16 | "scheduler": { 17 | "type": "WarmupLR", 18 | "params": { 19 | "warmup_min_lr": 0, 20 | "warmup_max_lr": 0.001, 21 | "warmup_num_steps": 1000 22 | } 23 | }, 24 | "gradient_clipping": 1.0, 25 | "prescale_gradients": false, 26 | "bf16": {"enabled": true}, 27 | "fp16": {"enabled": false}, 28 | "wall_clock_breakdown": false, 29 | "zero_optimization": { 30 | "stage": 1, 31 | "allgather_partitions": true, 32 | "reduce_scatter": true, 33 | "allgather_bucket_size": 50000000, 34 | "reduce_bucket_size": 50000000, 35 | "overlap_comm": true, 36 | "contiguous_gradients": true, 37 | "cpu_offload": false 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /PyTorch/examples/DeepSpeed/cifar_example/requirements.txt: -------------------------------------------------------------------------------- 1 | pillow>=7.1.0 2 | matplotlib 3 | torchmetrics>=0.8.0 4 | -------------------------------------------------------------------------------- /PyTorch/examples/DeepSpeed/cifar_example/run_ds_habanax8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PT_HPU_LAZY_MODE=0 deepspeed --num_nodes=1 --num_gpus=8 cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json $@ 4 | -------------------------------------------------------------------------------- /PyTorch/examples/bucketing/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | tqdm 3 | datasets 4 | transformers 5 | pulp 6 | scipy 7 | pytest -------------------------------------------------------------------------------- /PyTorch/examples/bucketing/run_demo_bucketing_gaussian.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2023 Habana Labs, Ltd. 
an Intel Company 2 | 3 | import itertools 4 | from plotting import plot_bucket_analysis_results 5 | from bucket import bucket_analysis, lp_bucket, const_bucket, uniform_bucket, percentile_bucket, lloyd_max_bucketing, brute_force_min_pad_waste 6 | from datasets_library import generate_random_gaussian 7 | 8 | shapes = list(itertools.islice(generate_random_gaussian(), 1000)) 9 | results = bucket_analysis(shapes, [("lp_bucket", lp_bucket), ("const_bucket", const_bucket), ("uniform_bucket", uniform_bucket), \ 10 | ("percentile_bucket", percentile_bucket), ("lloyd_max_bucketing", lloyd_max_bucketing), \ 11 | ("brute_force_min_pad_waste", brute_force_min_pad_waste)], [2,3,4,5,6,10,20]) 12 | plot_bucket_analysis_results(results, 'bucket_analysis_bar_gaussian.svg') 13 | 14 | 15 | -------------------------------------------------------------------------------- /PyTorch/examples/bucketing/run_demo_bucketing_squad.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2023 Habana Labs, Ltd. an Intel Company 2 | 3 | import itertools 4 | from plotting import plot_bucket_analysis_results 5 | from bucket import bucket_analysis, lp_bucket, const_bucket, uniform_bucket, percentile_bucket, lloyd_max_bucketing, brute_force_min_pad_waste 6 | from datasets_library import squad 7 | 8 | shapes = squad(4) 9 | results = bucket_analysis(shapes, [("const_bucket", const_bucket), ("uniform_bucket", uniform_bucket), \ 10 | ("percentile_bucket", percentile_bucket), ("lloyd_max_bucketing", lloyd_max_bucketing), \ 11 | ("brute_force_min_pad_waste", brute_force_min_pad_waste)], [2,3,4,5,6,10,20]) 12 | 13 | plot_bucket_analysis_results(results, "bucket_analysis_bar_squad.svg") 14 | 15 | 16 | -------------------------------------------------------------------------------- /PyTorch/examples/bucketing/run_demo_controlling_num_steps.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2023 Habana Labs, Ltd. an Intel Company 2 | 3 | import itertools 4 | from plotting import plot_bucket_analysis_results 5 | from bucket import bucket_analysis, lloyd_max_bucketing, brute_force_min_pad_waste 6 | from datasets_library import generate_random_gaussian 7 | 8 | shapes = list(itertools.islice(generate_random_gaussian(), 10000)) 9 | 10 | lloyd_max_set_step = lambda step : (lambda shp, num_buckets : lloyd_max_bucketing(shp, num_buckets, step)) 11 | bruteforce_set_threshold = lambda th : (lambda shp, num_buckets : brute_force_min_pad_waste(shp, num_buckets, th)) 12 | 13 | 14 | results = bucket_analysis(shapes, [("lloyd_max_02", lloyd_max_set_step(2)), ("lloyd_max_10", lloyd_max_set_step(10)), ("lloyd_max_20", lloyd_max_set_step(20)), \ 15 | ("lloyd_max_30", lloyd_max_set_step(30)), ("bruteforce_100k", bruteforce_set_threshold(100000)), ("bruteforce_1M", bruteforce_set_threshold(1000000))], [6, 8, 10]) 16 | plot_bucket_analysis_results(results, 'bucket_analysis_num_steps_gaussian.svg') 17 | -------------------------------------------------------------------------------- /PyTorch/examples/bucketing/run_demo_gaussian.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2023 Habana Labs, Ltd. 
an Intel Company 2 | 3 | from datasets_library import gaussian, batched_gaussian, batch_by_formula, sample_from_pdf 4 | from plotting import plotter 5 | 6 | 7 | if __name__ == '__main__': 8 | print("Plotting gaussian") 9 | num_samples = 100000 10 | bs = 4 11 | gs = gaussian(num_samples) 12 | orig = batched_gaussian(gs, 1, max) 13 | max_batch4 = batched_gaussian(gs, bs, max) 14 | min_batch4 = batched_gaussian(gs, bs, min) 15 | max_formula_batch4 = sample_from_pdf(batch_by_formula(gs, bs, 'max'), num_samples) 16 | min_formula_batch4 = sample_from_pdf(batch_by_formula(gs, bs, 'min'), num_samples) 17 | max_batch32 = batched_gaussian(gs, 8*bs, max) 18 | min_batch32 = batched_gaussian(gs, 8*bs, min) 19 | max_formula_batch32 = sample_from_pdf(batch_by_formula(gs, 8*bs, 'max'), num_samples) 20 | min_formula_batch32 = sample_from_pdf(batch_by_formula(gs, 8*bs, 'min'), num_samples) 21 | plotter([orig, max_batch4, max_formula_batch4, min_batch4, min_formula_batch4, max_batch32, max_formula_batch32, min_batch32, min_formula_batch32], 'gaussian.svg', ['original', 'bs4_max', 'bs4_max_formula', 'bs4_min', 'bs4_min_formula', 'bs32_max', 'bs32_max_formula', 'bs32_min', 'bs32_min_formula']) -------------------------------------------------------------------------------- /PyTorch/examples/bucketing/run_demo_squad.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2023 Habana Labs, Ltd. an Intel Company 2 | 3 | from datasets_library import squad 4 | from plotting import plotter 5 | 6 | 7 | if __name__ == '__main__': 8 | print("Plotting squad, takes 2-3 mins to run") 9 | plotter([squad(1), squad(4), squad(16), squad(64), squad(256), squad(512)], 'squad.svg', ['bs='+str(bs) for bs in [1,4,16,64,256,512]]) -------------------------------------------------------------------------------- /PyTorch/examples/custom_op/custom_fusedsdpa/README.md: -------------------------------------------------------------------------------- 1 | ## Table of Contents 2 | 3 | * [Model-References](../../../../README.md) 4 | * [Applying SDPA CustomOp to Bert NV](#applying-customops-to-a-real-training-model-example) 5 | 6 | A brief description of Scaled Dot Product Attention (SDPA) kernel is provided in 7 | [FusedSDPA section](https://docs.habana.ai/en/latest/PyTorch/Python_Packages.html#hpex-kernels-fusedsdpa). 8 | 9 | The usage of the SDPA is demonstrated through the BERT Fine tuning training model. 10 | The changes required to invoke SDPA are available in `custom_fusedsdpa_op.patch`. 11 | The BERT FT model can be patched with `custom_fusedsdpa_op.patch` and trained using SDPA. 12 | 13 | Below are the steps to patch and run the BERT FT training script. The commands to run the 14 | training remain unmodified. 15 | 16 | ## Applying SDPA CustomOp to BERT Fine-Tuning 17 | 18 | 1. Apply the patch `custom_fusedsdpa_op.patch` to PyTorch/nlp/bert/modeling.py: 19 | - Go to the main directory in the repository. 20 | - Run `git apply --verbose PyTorch/examples/custom_op/custom_fusedsdpa/custom_fusedsdpa_op.patch` 21 | 2. Run the model. 22 | -------------------------------------------------------------------------------- /PyTorch/examples/custom_op/legacy_custom_op_API/custom_relu/__init__.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright (C) 2020-2021 Habana Labs, Ltd. an Intel Company 3 | # All Rights Reserved. 
4 | # 5 | # Unauthorized copying of this file or any element(s) within it, via any medium 6 | # is strictly prohibited. 7 | # This file contains Habana Labs, Ltd. proprietary and confidential information 8 | # and is subject to the confidentiality and license agreements under which it 9 | # was provided. 10 | # 11 | # ****************************************************************************** 12 | 13 | from .custom_relu import CustomReLU 14 | 15 | __all__ = [CustomReLU] 16 | 17 | -------------------------------------------------------------------------------- /PyTorch/examples/custom_op/pt2_custom_op_API/custom_relu/__init__.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company 3 | # All Rights Reserved. 4 | # 5 | # Unauthorized copying of this file or any element(s) within it, via any medium 6 | # is strictly prohibited. 7 | # This file contains Habana Labs, Ltd. proprietary and confidential information 8 | # and is subject to the confidentiality and license agreements under which it 9 | # was provided. 10 | # 11 | # ****************************************************************************** 12 | 13 | from .custom_relu import CustomReLU 14 | 15 | __all__ = [CustomReLU] 16 | 17 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (C) 2021 Habana Labs Ltd. an Intel Company 4 | Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | 24 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/environment.yaml: -------------------------------------------------------------------------------- 1 | name: ldm 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.8.5 7 | - pip=20.3 8 | - cudatoolkit=11.0 9 | - pytorch=1.7.0 10 | - torchvision=0.8.1 11 | - numpy=1.19.2 12 | - pip: 13 | - albumentations==0.4.3 14 | - opencv-python==4.1.2.30 15 | - pudb==2019.2 16 | - imageio==2.9.0 17 | - imageio-ffmpeg==0.4.2 18 | - pytorch-lightning==1.4.2 19 | - omegaconf==2.1.1 20 | - test-tube>=0.7.5 21 | - streamlit>=0.73.1 22 | - einops==0.3.0 23 | - torch-fidelity==0.3.0 24 | - transformers==4.3.1 25 | - webdataset==0.2.5 26 | - kornia==0.6 27 | - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers 28 | - -e git+https://github.com/openai/CLIP.git@main#egg=clip 29 | - -e . 30 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/data/__init__.py -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/data/dummy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import string 4 | from torch.utils.data import Dataset, Subset 5 | 6 | class DummyData(Dataset): 7 | def __init__(self, length, size): 8 | self.length = length 9 | self.size = size 10 | 11 | def __len__(self): 12 | return self.length 13 | 14 | def __getitem__(self, i): 15 | x = np.random.randn(*self.size) 16 | letters = string.ascii_lowercase 17 | y = ''.join(random.choice(string.ascii_lowercase) for i in range(10)) 18 | return {"jpg": x, "txt": y} 19 | 20 | 21 | class DummyDataWithEmbeddings(Dataset): 22 | def __init__(self, length, size, emb_size): 23 | self.length = length 24 | self.size = size 25 | self.emb_size = emb_size 26 | 27 | def __len__(self): 28 | return self.length 29 | 30 | def __getitem__(self, i): 31 | x = np.random.randn(*self.size) 32 | y = np.random.randn(*self.emb_size).astype(np.float32) 33 | return {"jpg": x, "txt": y} 34 | 35 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/data/inpainting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/data/inpainting/__init__.py -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/models/diffusion/__init__.py 
-------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/models/first_stage_models/kl-f16/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 16 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 16 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 1 23 | - 2 24 | - 2 25 | - 4 26 | num_res_blocks: 2 27 | attn_resolutions: 28 | - 16 29 | dropout: 
0.0 30 | data: 31 | target: main.DataModuleFromConfig 32 | params: 33 | batch_size: 6 34 | wrap: true 35 | train: 36 | target: ldm.data.openimages.FullOpenImagesTrain 37 | params: 38 | size: 384 39 | crop_size: 256 40 | validation: 41 | target: ldm.data.openimages.FullOpenImagesValidation 42 | params: 43 | size: 384 44 | crop_size: 256 45 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/models/first_stage_models/kl-f32/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 64 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 64 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 1 23 | - 2 24 | - 2 25 | - 4 26 | - 4 27 | num_res_blocks: 2 28 | attn_resolutions: 29 | - 16 30 | - 8 31 | dropout: 0.0 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 6 36 | wrap: true 37 | train: 38 | target: ldm.data.openimages.FullOpenImagesTrain 39 | params: 40 | size: 384 41 | crop_size: 256 42 | validation: 43 | target: ldm.data.openimages.FullOpenImagesValidation 44 | params: 45 | size: 384 46 | crop_size: 256 47 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/models/first_stage_models/kl-f4/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 3 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 3 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 2 23 | - 4 24 | num_res_blocks: 2 25 | attn_resolutions: [] 26 | dropout: 0.0 27 | data: 28 | target: main.DataModuleFromConfig 29 | params: 30 | batch_size: 10 31 | wrap: true 32 | train: 33 | target: ldm.data.openimages.FullOpenImagesTrain 34 | params: 35 | size: 384 36 | crop_size: 256 37 | validation: 38 | target: ldm.data.openimages.FullOpenImagesValidation 39 | params: 40 | size: 384 41 | crop_size: 256 42 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/models/first_stage_models/kl-f8/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 4 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 4 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 2 23 | - 4 24 | - 4 25 | num_res_blocks: 2 26 | attn_resolutions: [] 27 | dropout: 0.0 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | 
batch_size: 4 32 | wrap: true 33 | train: 34 | target: ldm.data.openimages.FullOpenImagesTrain 35 | params: 36 | size: 384 37 | crop_size: 256 38 | validation: 39 | target: ldm.data.openimages.FullOpenImagesValidation 40 | params: 41 | size: 384 42 | crop_size: 256 43 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/models/first_stage_models/vq-f4-noattn/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 3 6 | n_embed: 8192 7 | monitor: val/rec_loss 8 | 9 | ddconfig: 10 | attn_type: none 11 | double_z: false 12 | z_channels: 3 13 | resolution: 256 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: 18 | - 1 19 | - 2 20 | - 4 21 | num_res_blocks: 2 22 | attn_resolutions: [] 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_start: 11 30 | disc_weight: 0.75 31 | codebook_weight: 1.0 32 | 33 | data: 34 | target: main.DataModuleFromConfig 35 | params: 36 | batch_size: 8 37 | num_workers: 12 38 | wrap: true 39 | train: 40 | target: ldm.data.openimages.FullOpenImagesTrain 41 | params: 42 | crop_size: 256 43 | validation: 44 | target: ldm.data.openimages.FullOpenImagesValidation 45 | params: 46 | crop_size: 256 47 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/models/first_stage_models/vq-f4/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 3 6 | n_embed: 8192 7 | monitor: val/rec_loss 8 | 9 | ddconfig: 10 | double_z: false 11 | z_channels: 3 12 | resolution: 256 13 | in_channels: 3 14 | out_ch: 3 15 | ch: 128 16 | ch_mult: 17 | - 1 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: [] 22 | dropout: 0.0 23 | lossconfig: 24 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 25 | params: 26 | disc_conditional: false 27 | disc_in_channels: 3 28 | disc_start: 0 29 | disc_weight: 0.75 30 | codebook_weight: 1.0 31 | 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 8 36 | num_workers: 16 37 | wrap: true 38 | train: 39 | target: ldm.data.openimages.FullOpenImagesTrain 40 | params: 41 | crop_size: 256 42 | validation: 43 | target: ldm.data.openimages.FullOpenImagesValidation 44 | params: 45 | crop_size: 256 46 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/patches/minimal_changes.diff: -------------------------------------------------------------------------------- 1 | diff --git a/scripts/txt2img.py b/scripts/txt2img.py 2 | index ef52ee0..d7791b6 100644 3 | --- a/scripts/txt2img.py 4 | +++ b/scripts/txt2img.py 5 | @@ -263,6 +263,8 @@ 6 | base_count += 1 7 | all_samples.append(x_samples_ddim) 8 | 9 | + import habana_frameworks.torch.core as htcore 10 | + htcore.mark_step() 11 | if not opt.skip_grid: 12 | # additionally, save as grid 13 | grid = torch.stack(all_samples, 0) 14 | -------------------------------------------------------------------------------- 
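The `minimal_changes.diff` patch above inserts an `htcore.mark_step()` call after each sampling iteration of `scripts/txt2img.py`. As a rough, self-contained sketch of the same pattern outside this repository — the toy model, data, and hyperparameters below are placeholders, and an HPU device with the Habana PyTorch bridge installed is assumed — a lazy-mode loop typically calls `mark_step()` once per step so the accumulated graph is flushed and executed:

```python
import torch
import habana_frameworks.torch.core as htcore  # assumes the Habana PyTorch bridge is installed

# Hypothetical toy model and random data, used only to illustrate where mark_step() goes.
device = torch.device("hpu")
model = torch.nn.Linear(16, 4).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for _ in range(3):
    x = torch.randn(8, 16, device=device)
    target = torch.randint(0, 4, (8,), device=device)
    loss = torch.nn.functional.cross_entropy(model(x), target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # In lazy mode, mark_step() cuts the graph accumulated so far and launches execution;
    # the patch above adds the same call once per image batch in the sampling loop.
    htcore.mark_step()
```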
/PyTorch/examples/gpu_migration/generative_models/stable-diffusion/patches/randn_to_cpu.diff: -------------------------------------------------------------------------------- 1 | diff --git a/ldm/models/diffusion/ddim.py b/ldm/models/diffusion/ddim.py 2 | index aa3fbec..894f258 100644 3 | --- a/ldm/models/diffusion/ddim.py 4 | +++ b/ldm/models/diffusion/ddim.py 5 | @@ -125,7 +125,8 @@ class DDIMSampler(object): 6 | device = self.model.betas.device 7 | b = shape[0] 8 | if x_T is None: 9 | - img = torch.randn(shape, device=device) 10 | + img = torch.randn(shape, device=torch.device("cpu")) 11 | + img = torch.tensor(img, device=device).clone().detach() 12 | else: 13 | img = x_T 14 | 15 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | albumentations==0.4.3 2 | opencv-python 3 | pudb==2019.2 4 | imageio==2.9.0 5 | imageio-ffmpeg==0.4.2 6 | omegaconf==2.1.1 7 | test-tube>=0.7.5 8 | streamlit>=0.73.1 9 | einops==0.3.0 10 | torch-fidelity==0.3.0 11 | transformers==4.38.0 12 | webdataset==0.2.5 13 | -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers 14 | -e git+https://github.com/openai/CLIP.git@main#egg=clip 15 | -e . 16 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/cmd_on_new_ckpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | import fire 5 | 6 | 7 | class Checker(object): 8 | def __init__(self, filename, interval=60): 9 | self._cached_stamp = 0 10 | self.filename = filename 11 | self.interval = interval 12 | 13 | def check(self, cmd): 14 | while True: 15 | stamp = os.stat(self.filename).st_mtime 16 | if stamp != self._cached_stamp: 17 | self._cached_stamp = stamp 18 | print(f"{self.__class__.__name__}: Detected a new file at {self.filename}, running evaluation commands on it.") 19 | subprocess.run(cmd, shell=True) 20 | else: 21 | time.sleep(self.interval) 22 | 23 | 24 | def run(filename, cmd): 25 | checker = Checker(filename, interval=60) 26 | checker.check(cmd) 27 | 28 | 29 | if __name__ == "__main__": 30 | fire.Fire(run) 31 | 32 | 33 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/printckpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import fire 4 | 5 | 6 | def printit(p): 7 | print(f"printin' in path: {p}") 8 | size_initial = os.path.getsize(p) 9 | nsd = dict() 10 | sd = torch.load(p, map_location="cpu") 11 | if "global_step" in sd: 12 | print(f"This is global step {sd['global_step']}.") 13 | if "model_ema.num_updates" in sd["state_dict"]: 14 | print(f"And we got {sd['state_dict']['model_ema.num_updates']} EMA updates.") 15 | 16 | 17 | if __name__ == "__main__": 18 | fire.Fire(printit) 19 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/prompts/six-prompts: -------------------------------------------------------------------------------- 1 | the Tower of Babel by J.M.W. 
Turner 2 | advertisement for a psychedelic virtual reality headset, 16 bit sprite pixel art 3 | the gateway between dreams, trending on ArtStation 4 | Humanity is killed by AI, by James Gurney 5 | A fantasy painting of a city in a deep valley by Ivan Aivazovsky 6 | Darth Vader at Woodstock (1969) 7 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/prompts/weird-dalle-prompts.txt: -------------------------------------------------------------------------------- 1 | # TODO, check out Twitter. 2 | Darth Vader at Woodstock (1969) 3 | Bunny Vikings 4 | The Demogorgon from Stranger Thinhs holding a basketball 5 | Hamster in my microwave 6 | a courtroom sketch of a Ford Transit van 7 | PS1 Hagrid at MCDonalds 8 | Karl Marx in KFC Logo 9 | Moai Statue giving a TED talk 10 | wahing machine trail cam 11 | minions at cross burning 12 | Hindenburg disaster in Fortnite -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/prompts/wings1.txt: -------------------------------------------------------------------------------- 1 | A portrait of Abraham Lincoln 2 | A portrait of Barack Obama 3 | A portrait of a nekomimi girl smiling 4 | a portrait of isaac newton the alchemist 5 | A portrait of Friedrich Nietzsche wearing an open double breasted suit with a bowtie 6 | Portrait of a cyberpunk cyborg man wearing alternate reality goggles 7 | Portrait of a woman screaming 8 | A portrait of a man in a flight jacket leaning against a biplane 9 | a cold landscape by Albert Bierstadt 10 | the monument of the ancients by van gogh 11 | the universal library 12 | a vision of paradise. 
unreal engine 13 | matte painting of cozy underground bunker wholefoods aisle, trending on artstation 14 | illustration of wooly mammoths reclaiming the arctic, trending on artstation 15 | a mountain range in the desert, Provia, Velvia 16 | the gateway between dreams, trending on ArtStation 17 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/prompts/wings2.txt: -------------------------------------------------------------------------------- 1 | a cityscape at night 2 | starry night by cyberpunk 3 | A fantasy painting of a city in a deep valley by Ivan Aivazovsky 4 | An oil painting of The New York City Skyline by Natalia Goncharova 5 | a rainy city street in the style of cyberpunk noir, trending on ArtStation 6 | an astral city in the style of cyberpunk noir art deco 7 | The Golden Gate Bridge in the style of art deco 8 | a city on a 70s science fiction novel cover 9 | An oil painting of A Vase Of Flowers 10 | still life oil painting of a smooth silver steel tungsten square cube box by Albrecht Dürer 11 | An oil painting of a bookshelf crammed with books, trending on artstation 12 | An N95 respirator mask in the style of art deco 13 | a surreal and organic stone monument to a plutonium atom 14 | oil painting of a candy dish of glass candies, mints, and other assorted sweets 15 | illustration of a ford model-t in pristine condition, trending on artstation 16 | illustration of DEC minicomputer console monitor retrocomputing teletype interdata PDP-11 univac, trending on artstation 17 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/prompts/wings3.txt: -------------------------------------------------------------------------------- 1 | The Rise Of Consciousness 2 | The Human Utility Function 3 | Revolution of the Souls 4 | a good amphetamine spirit 5 | Control The Soul 6 | The Lunatic, The Lover, and The Poet 7 | A Planet Ruled By Angels 8 | the Tower of Babel by J.M.W. Turner 9 | sketch of a 3D printer by Leonardo da Vinci 10 | In The Style Of M.C. 
Escher 11 | A cup of coffee by Picasso 12 | The US Capitol Building in the style of Kandinsky 13 | A Mysterious Orb by Andy Warhol 14 | The everlasting zero, a glimpse of a million, by Salvador Dali 15 | a painting of a haunted house with Halloween decorations by Giovanni Paolo Panini 16 | a painting of drops of Venus by Vincent van Gogh 17 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/prompts/wings4.txt: -------------------------------------------------------------------------------- 1 | ascii art of a man riding a bicycle 2 | cyberpunk noir art deco detective in space 3 | a cyborg angel in the style of ukiyo-e 4 | Hell in the style of pointillism 5 | Moloch in the style of socialist realism 6 | Metaphysics in the style of WPAP 7 | advertisement for a psychedelic virtual reality headset, 16 bit sprite pixel art 8 | a watercolor painting of a Christmas tree 9 | control room monitors televisions screens computers hacker lab, concept art, matte painting, trending on artstation 10 | a group of surgeons wait to cryonically suspend a patient 11 | technological singularity cult by James Gurney 12 | an autogyro flying car, trending on artstation 13 | illustration of airship zepplins in the skies, trending on artstation 14 | watercolor illustration of a martian colony geodesic dome aquaponics farming on the surface, trending on artstation 15 | humanity is killed by AI, by James Gurney 16 | the Vitruvian Man as a propaganda poster for transhumanism -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/slurm/README.md: -------------------------------------------------------------------------------- 1 | # Example 2 | 3 | Resume f8 @ 512 on Laion-HR 4 | 5 | ``` 6 | sbatch scripts/slurm/resume_512/sbatch.sh 7 | ``` 8 | 9 | # Reuse 10 | 11 | To reuse this as a template, copy `sbatch.sh` and `launcher.sh` somewhere. In 12 | `sbatch.sh`, adjust the lines 13 | 14 | ``` 15 | #SBATCH --job-name=stable-diffusion-512cont 16 | #SBATCH --nodes=24 17 | ``` 18 | 19 | and the path to your `launcher.sh` in the last line, 20 | 21 | ``` 22 | srun bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh 23 | ``` 24 | 25 | In `launcher.sh`, adjust `CONFIG` and `EXTRA`. Maybe give it a test run with 26 | debug flags uncommented and a reduced number of nodes. 
27 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NODE_RANK=${SLURM_NODEID} 3 | echo "##########################################" 4 | echo MASTER_ADDR=${MASTER_ADDR} 5 | echo MASTER_PORT=${MASTER_PORT} 6 | echo NODE_RANK=${NODE_RANK} 7 | echo WORLD_SIZE=${WORLD_SIZE} 8 | echo "##########################################" 9 | # debug environment worked great so we stick with it 10 | # no magic there, just a miniconda python=3.9, pytorch=1.12, cudatoolkit=11.3 11 | # env with pip dependencies from stable diffusion's requirements.txt 12 | eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)" 13 | conda activate stable 14 | cd /fsx/stable-diffusion/stable-diffusion 15 | 16 | CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-improvedaesthetic.yaml 17 | EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T11-06-38_txt2img-1p4B-multinode-clip-encoder-high-res-512_improvedaesthetic/checkpoints/last.ckpt" 18 | DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5" 19 | 20 | python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG 21 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/scripts/test_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)" 3 | conda activate stable 4 | cd /fsx/stable-diffusion/stable-diffusion 5 | python scripts/test_gpu.py 6 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/generative_models/stable-diffusion/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='latent-diffusion', 5 | version='0.0.1', 6 | description='', 7 | packages=find_packages(), 8 | install_requires=[ 9 | 'torch', 10 | 'numpy', 11 | 'tqdm', 12 | ], 13 | ) -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @duli2012 @mrwyattii @yaozhewei @arashb @xiaoxiawu-microsoft @guanhuawang 2 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/dataset/README.md: -------------------------------------------------------------------------------- 1 | # Run the scripts below to setup dataset 2 | 3 | bash download_books.sh 4 | 5 | bash download_vocab.sh 6 | 
-------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/dataset/download_books.sh: -------------------------------------------------------------------------------- 1 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin 2 | wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/dataset/download_ckpt.sh: -------------------------------------------------------------------------------- 1 | mkdir -p checkpoints/gpt2_345m 2 | 3 | cd checkpoints/gpt2_345m 4 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip 5 | unzip megatron_lm_345m_v0.0.zip 6 | rm megatron_lm_345m_v0.0.zip 7 | cd ../.. 8 | 9 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/dataset/download_vocab.sh: -------------------------------------------------------------------------------- 1 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json 2 | wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/MoE/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false 39 | } 40 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/MoE/ds_config_gpt_Zero2_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": 2 8 | }, 9 | 10 | "gradient_clipping": 1.0, 11 | "prescale_gradients": false, 12 | 13 | "fp16": { 14 | "enabled": CONFIG_FP16_ENABLED, 15 | "loss_scale": 0, 16 | "loss_scale_window": 500, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1, 19 | "initial_scale_power": 11 20 | }, 21 | 22 | "bf16": { 23 | "enabled": CONFIG_BF16_ENABLED 24 | }, 25 | "curriculum_learning": { 26 | "enabled": CONFIG_CL_ENABLED, 27 | "curriculum_type": "seqlen", 28 
| "min_difficulty": CONFIG_CL_MIN, 29 | "max_difficulty": CONFIG_CL_MAX, 30 | "schedule_type": "fixed_linear", 31 | "schedule_config": { 32 | "total_curriculum_step": CONFIG_CL_DURATION, 33 | "difficulty_step": 8 34 | } 35 | }, 36 | 37 | "wall_clock_breakdown" : false 38 | } 39 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/README.md: -------------------------------------------------------------------------------- 1 | ## Recipes and Scripts 2 | 3 | ### Azure 4 | 5 | We strongly recommend to start with AzureML recipe in the ```azureml``` folder. 6 | 7 | If you have a custom infrastructure (e.g. HPC clusters) or Azure VM and VMSS based environments, please refer to the bash scripts in the ```azure``` folder. 8 | 9 | ### MoE 10 | 11 | Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-expert based models. 12 | 13 | ### Curriculum Learning 14 | 15 | Curriculum learning recipes are in the ```curriculum_learning``` folder. Please refer to the detailed tutorials linked inside. 16 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/azureml/Dockerfile.dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/aifx/stable-ubuntu2004-cu115-py38-torch1110 2 | 3 | USER root:root 4 | 5 | RUN pip install pybind11 6 | 7 | RUN pip install git+https://github.com/microsoft/DeepSpeed.git 8 | 9 | # add a100-topo.xml 10 | RUN mkdir -p /opt/microsoft/ 11 | RUN wget -O /opt/microsoft/a100-topo.xml https://hpcbenchmarks.blob.core.windows.net/bookcorpus/data/a100-topo.xml 12 | 13 | # to use on A100, enable env var below in your job 14 | ENV NCCL_TOPO_FILE="/opt/microsoft/a100-topo.xml" 15 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/azureml/README.md: -------------------------------------------------------------------------------- 1 | ## Megatron-DeepSpeed on AzureML 2 | Example script for running Megatron-DeepSpeed using Azure Machine Learning. 3 | 4 | ------ 5 | 6 | # Workspace Setup 7 | Setup an AML workspace. Refer to: [set-up doc](https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up). 8 | 9 | # Dataset Preparation 10 | Create AML Dataset. To run remote AML job, you need to provide AML FileDataset. 11 | Refer to [prepare_dataset script](prepare_dataset.py) to upload .bin and .idx files to blob store and on how to create FileDataset. 12 | 13 | # Training 14 | Run Megatron-DeepSpeed on Azure ML. Refer to [aml_submit script](aml_submit.py). 
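For orientation, the sketch below shows the general shape of the dataset-preparation step using the Azure ML v1 Python SDK: upload the preprocessed `.bin`/`.idx` files to the workspace datastore and register them as a FileDataset that remote jobs can consume. It is an illustrative outline only — the folder, datastore, and dataset names are placeholders, and `prepare_dataset.py` in this directory remains the authoritative script:

```python
from azureml.core import Workspace, Dataset

# Illustrative outline; see prepare_dataset.py in this folder for the actual workflow.
ws = Workspace.from_config()            # assumes an AML config.json in the working directory
datastore = ws.get_default_datastore()

# Hypothetical local folder holding e.g. BookCorpusDataset_text_document.bin / .idx
datastore.upload(src_dir="./dataset", target_path="megatron_data", overwrite=False)

# Register a FileDataset pointing at the uploaded files so a remote run can mount or download it.
file_ds = Dataset.File.from_files(path=(datastore, "megatron_data/**"))
file_ds.register(workspace=ws, name="megatron_bookcorpus", create_new_version=True)
```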
15 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/compression/ds_config_gpt_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : CONFIG_BATCH_SIZE, 3 | "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, 4 | "steps_per_print": LOG_INTERVAL, 5 | 6 | "zero_optimization": { 7 | "stage": ZERO_STAGE, 8 | "elastic_checkpoint": true 9 | }, 10 | 11 | "gradient_clipping": 1.0, 12 | "prescale_gradients": PRESCALE_GRAD, 13 | 14 | "fp16": { 15 | "enabled": CONFIG_FP16_ENABLED, 16 | "loss_scale": 0, 17 | "loss_scale_window": 500, 18 | "hysteresis": 2, 19 | "min_loss_scale": 1, 20 | "initial_scale_power": 11 21 | }, 22 | 23 | "bf16": { 24 | "enabled": CONFIG_BF16_ENABLED 25 | }, 26 | "curriculum_learning": { 27 | "enabled": CONFIG_CL_ENABLED, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | }, 37 | 38 | "wall_clock_breakdown" : false 39 | } 40 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/create_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute embeddings for each entry of a given dataset (e.g. Wikipedia) 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | # Wikipedia data can be downloaded from the following link: 9 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 10 | EVIDENCE_DATA_DIR= 11 | EMBEDDING_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | python tools/create_doc_index.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 128 \ 20 | --checkpoint-activations \ 21 | --seq-length 512 \ 22 | --retriever-seq-length 256 \ 23 | --max-position-embeddings 512 \ 24 | --load ${CHECKPOINT_PATH} \ 25 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 26 | --embedding-path ${EMBEDDING_PATH} \ 27 | --indexer-log-interval 1000 \ 28 | --indexer-batch-size 128 \ 29 | --vocab-file bert-vocab.txt \ 30 | --num-workers 2 \ 31 | --fp16 32 | 33 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/curriculum_learning/README.md: -------------------------------------------------------------------------------- 1 | This is an example of how to use DeepSpeed's curriculum learning (CL) feature which provides faster and more stable language model pre-training. Currently it is only integrated for GPT pre-training. Note that there are two curriculum learning examples in two different repos for Megatron-LM GPT-2 pre-training. Both of them have some unique features and limitations. See details in our [tutorial](https://www.deepspeed.ai/tutorials/curriculum-learning/). For technical details please refer to our [paper](https://arxiv.org/abs/2108.06084). 
-------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/curriculum_learning/ds_train.sh: -------------------------------------------------------------------------------- 1 | # # baseline 2 | # CONFIG=baseline 3 | # TAG=baseline 4 | # MODEL_SIZE=1558 5 | # LR=1.5e-4 6 | # BSZ=512 7 | # SEQ_LEN=1024 8 | # MP_SIZE=1 9 | # SEED=1234 10 | # SAVE_INTERVAL=5000 11 | # NUM_ITER=600000 12 | # NUM_TOKEN=157286400000 13 | # LR_DECAY_TOKEN=157286400000 14 | # LR_WARMUP_ITER=3000 15 | # CONFIG_TEMPLATE=false 16 | # CURRICULUM_STEP=0 17 | # CURRICULUM_MIN=0 18 | 19 | # curriculum learning 20 | CONFIG=curriculum_fixed_linear 21 | MODEL_SIZE=1558 22 | LR=6e-4 23 | BSZ=4096 24 | SEQ_LEN=1024 25 | MP_SIZE=1 26 | SEED=1234 27 | SAVE_INTERVAL=1000 28 | NUM_ITER=75000 29 | NUM_TOKEN=157286400000 30 | LR_DECAY_TOKEN=157286400000 31 | LR_WARMUP_ITER=3000 32 | CONFIG_TEMPLATE=true 33 | CURRICULUM_STEP=45000 34 | CURRICULUM_MIN=64 35 | TAG="${CONFIG}_s${CURRICULUM_MIN}to${SEQ_LEN}_step${CURRICULUM_STEP}" 36 | 37 | bash ds_pretrain_gpt2.sh $CONFIG $TAG $MODEL_SIZE $LR $BSZ $SEQ_LEN $MP_SIZE $SEED $SAVE_INTERVAL $NUM_ITER $NUM_TOKEN $LR_DECAY_TOKEN $LR_WARMUP_ITER $CONFIG_TEMPLATE $CURRICULUM_STEP $CURRICULUM_MIN 38 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/curriculum_learning/ds_zero_stage_1_config_baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false 26 | } 27 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/curriculum_learning/ds_zero_stage_1_config_curriculum_fixed_linear.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false, 26 | "curriculum_learning": { 27 | "enabled": true, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": CONFIG_CL_MIN, 30 | "max_difficulty": CONFIG_CL_MAX, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": CONFIG_CL_DURATION, 34 | "difficulty_step": 8 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- 
/PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH=_text_sentence 6 | CHECKPOINT_PATH= 7 | 8 | python pretrain_bert.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --num-attention-heads 16 \ 12 | --micro-batch-size 4 \ 13 | --global-batch-size 8 \ 14 | --seq-length 512 \ 15 | --max-position-embeddings 512 \ 16 | --train-iters 2000000 \ 17 | --lr-decay-iters 990000 \ 18 | --save $CHECKPOINT_PATH \ 19 | --load $CHECKPOINT_PATH \ 20 | --data-path $DATA_PATH \ 21 | --vocab-file bert-vocab.txt \ 22 | --data-impl mmap \ 23 | --split 949,50,1 \ 24 | --lr 0.0001 \ 25 | --min-lr 0.00001 \ 26 | --lr-decay-style linear \ 27 | --lr-warmup-fraction .01 \ 28 | --weight-decay 1e-2 \ 29 | --clip-grad 1.0 \ 30 | --log-interval 100 \ 31 | --save-interval 10000 \ 32 | --eval-interval 1000 \ 33 | --eval-iters 10 \ 34 | --fp16 35 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=_text_document 9 | CHECKPOINT_PATH= 10 | 11 | 12 | python pretrain_gpt.py \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --micro-batch-size 4 \ 17 | --global-batch-size 8 \ 18 | --seq-length 1024 \ 19 | --max-position-embeddings 1024 \ 20 | --train-iters 500000 \ 21 | --lr-decay-iters 320000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file gpt2-vocab.json \ 26 | --merge-file gpt2-merges.txt \ 27 | --data-impl mmap \ 28 | --split 949,50,1 \ 29 | --distributed-backend nccl \ 30 | --lr 0.00015 \ 31 | --min-lr 1.0e-5 \ 32 | --lr-decay-style cosine \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --checkpoint-activations \ 37 | --log-interval 100 \ 38 | --save-interval 10000 \ 39 | --eval-interval 1000 \ 40 | --eval-iters 10 \ 41 | --fp16 42 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH= 6 | VOCAB_FILE= 7 | CHECKPOINT_PATH= 8 | 9 | python pretrain_t5.py \ 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --micro-batch-size 16 \ 18 | --global-batch-size 2048 \ 19 | --max-position-embeddings 512 \ 20 | --train-iters 1000000 \ 21 | --lr-decay-iters 1000000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file $VOCAB_FILE \ 26 | --data-impl mmap \ 27 | --split 949,50,1 \ 28 | --lr 0.0001 \ 29 | --min-lr 0.00001 \ 30 | --lr-decay-style linear \ 31 | --lr-warmup-fraction .01 \ 32 | --weight-decay 1e-2 \ 33 | --clip-grad 1.0 \ 34 | --log-interval 100 \ 35 | --save-interval 10000 \ 36 | --eval-interval 1000 \ 37 | --eval-iters 10 \ 38 | --fp16 39 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/images/cases_april2021.png -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import indexed_dataset 2 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | prefix = 3 30 | 31 | class PositionEmbeddingType(enum.Enum): 32 | rotary = 1 33 | absolute = 2 34 | alibi = 3 35 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied fron NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | 22 | 23 | #ifndef TORCH_CHECK 24 | #define TORCH_CHECK AT_CHECK 25 | #endif 26 | 27 | #ifdef VERSION_GE_1_3 28 | #define DATA_PTR data_ptr 29 | #else 30 | #define DATA_PTR data 31 | #endif 32 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from apex.normalization import MixedFusedLayerNorm as LayerNorm 17 | 18 | from .distributed import DistributedDataParallel 19 | from .bert_model import BertModel 20 | from .gpt_model import GPTModel, GPTModelPipe 21 | from .t5_model import T5Model 22 | from .language_model import get_language_model 23 | from .module import Float16Module 24 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 29 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/requirements.txt: -------------------------------------------------------------------------------- 1 | pybind11 2 | torch 3 | six 4 | regex 5 | numpy 6 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/tasks/eval_harness/download.py: -------------------------------------------------------------------------------- 1 | # This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed 2 | # under the license https://huggingface.co/spaces/bigscience/license 3 | 4 | # Downloads the specified taks in the evaluation harness 5 | # This is particularly useful when running in environments where the GPU nodes 6 | # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. 7 | 8 | from lm_eval import tasks 9 | from lm_eval.tasks import ALL_TASKS 10 | import argparse 11 | import os 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) 15 | parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.') 16 | args = parser.parse_args() 17 | 18 | def main(): 19 | task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') 20 | tasks.get_task_dict(task_list) 21 | 22 | if __name__ == '__main__': 23 | main() 24 | 25 | 26 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/DeepSpeedExamples/Megatron-DeepSpeed/tools/convert_checkpoint/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os 4 | from collections import OrderedDict 5 | 6 | 7 | def dump_data(datum, name_list=[]): 8 | if type(datum) in (dict, OrderedDict): 9 | for k, v in datum.items(): 10 | dump_data(v, name_list+[str(k)]) 11 | elif type(datum) in (list, tuple): 12 | for v in datum: 13 | dump_data(v, name_list) 14 | elif torch.is_tensor(datum): 15 | prefix = '.'.join(name_list) 16 | print(f'[tensor] {prefix} = {datum.shape}') 17 | else: 18 | #pass 19 | prefix = '.'.join(name_list) 20 | print(f'[other] {prefix} = {datum}') 21 | 22 | def main(): 23 | if len(sys.argv) < 2: 24 | print(f'Usage: {sys.argv[0]} ') 25 | exit(1) 26 | 27 | ckpt_file = sys.argv[1] 28 | if not os.path.isfile(ckpt_file): 29 | print(f'{ckpt_file} is not a valid file') 30 | exit(1) 31 | 32 | print(f'loading checkpoint file: {ckpt_file}') 33 | sd = torch.load(ckpt_file) 34 | dump_data(sd) 35 | 36 | quit() 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/NOTICE: -------------------------------------------------------------------------------- 1 | BERT 
PyTorch 2 | 3 | This repository includes software from https://github.com/huggingface/pytorch-pretrained-BERT 4 | licensed under the Apache License 2.0. 5 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 1024, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 4096, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 16, 10 | "num_hidden_layers": 24, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/checkpoints/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/nlp/bert/checkpoints/.keep -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/data/BooksDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import subprocess 15 | 16 | class BooksDownloader: 17 | def __init__(self, save_path): 18 | self.save_path = save_path 19 | pass 20 | 21 | 22 | def download(self): 23 | bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out' 24 | bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus' 25 | bookscorpus_download_command += ' --trash-bad-count' 26 | bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True) 27 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/data/NVIDIAPretrainedWeightDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | import os 15 | 16 | class NVIDIAPretrainedWeightDownloader: 17 | def __init__(self, save_path): 18 | self.save_path = save_path + '/nvidia_pretrained_weights' 19 | 20 | if not os.path.exists(self.save_path): 21 | os.makedirs(self.save_path) 22 | 23 | pass 24 | 25 | 26 | def download(self): 27 | assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.' -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/processors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/nlp/bert/processors/__init__.py -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/requirements.txt: -------------------------------------------------------------------------------- 1 | # progress bars in model download and training scripts 2 | tqdm 3 | # Accessing files from S3 directly. 
4 | boto3 5 | # Used for downloading models over HTTP 6 | requests 7 | six 8 | ipdb 9 | #Data processing 10 | h5py 11 | html2text 12 | nltk 13 | progressbar 14 | #Others 15 | git+https://github.com/NVIDIA/dllogger 16 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/results/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/nlp/bert/results/.keep -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/results/checkpoints/lddl_log/node-0.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/nlp/bert/results/checkpoints/lddl_log/node-0.txt -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/results/checkpoints/lddl_log/node-0_local-0.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/nlp/bert/results/checkpoints/lddl_log/node-0_local-0.txt -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/results/dllogger.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/examples/gpu_migration/nlp/bert/results/dllogger.json -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/scripts/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build --network=host . --rm --pull --no-cache -t bert 3 | -------------------------------------------------------------------------------- /PyTorch/examples/gpu_migration/nlp/bert/scripts/docker/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CMD=${1:-/bin/bash} 4 | NV_VISIBLE_DEVICES=${2:-"all"} 5 | DOCKER_BRIDGE=${3:-"host"} 6 | 7 | docker run -it --rm \ 8 | --gpus device=$NV_VISIBLE_DEVICES \ 9 | --net=$DOCKER_BRIDGE \ 10 | --shm-size=1g \ 11 | --ulimit memlock=-1 \ 12 | --ulimit stack=67108864 \ 13 | -e LD_LIBRARY_PATH='/workspace/install/lib/' \ 14 | -v $PWD:/workspace/bert \ 15 | -v $PWD/results:/results \ 16 | bert $CMD 17 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | Copyright (C) 2023 Habana Labs, Ltd. 
an Intel Company 3 | Copyright (c) 2022 Machine Vision and Learning Group, LMU Munich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/environment.yaml: -------------------------------------------------------------------------------- 1 | name: ldm 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.8.5 7 | - pip=20.3 8 | - cudatoolkit=11.3 9 | - pytorch=1.11.0 10 | - torchvision=0.12.0 11 | - numpy=1.19.2 12 | - pip: 13 | - albumentations==0.4.3 14 | - diffusers 15 | - opencv-python==4.1.2.30 16 | - pudb==2019.2 17 | - invisible-watermark 18 | - imageio==2.9.0 19 | - imageio-ffmpeg==0.4.2 20 | - pytorch-lightning==1.4.2 21 | - omegaconf==2.1.1 22 | - test-tube>=0.7.5 23 | - streamlit>=0.73.1 24 | - einops==0.3.0 25 | - torch-fidelity==0.3.0 26 | - transformers==4.19.2 27 | - torchmetrics==0.6.0 28 | - kornia==0.6 29 | - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers 30 | - -e git+https://github.com/openai/CLIP.git@main#egg=clip 31 | - -e . 
32 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/generative_models/stable-diffusion/ldm/data/__init__.py -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/data/dummy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import string 4 | from torch.utils.data import Dataset, Subset 5 | 6 | class DummyData(Dataset): 7 | def __init__(self, length, size): 8 | self.length = length 9 | self.size = size 10 | 11 | def __len__(self): 12 | return self.length 13 | 14 | def __getitem__(self, i): 15 | x = np.random.randn(*self.size) 16 | letters = string.ascii_lowercase 17 | y = ''.join(random.choice(string.ascii_lowercase) for i in range(10)) 18 | return {"jpg": x, "txt": y} 19 | 20 | 21 | class DummyDataWithEmbeddings(Dataset): 22 | def __init__(self, length, size, emb_size): 23 | self.length = length 24 | self.size = size 25 | self.emb_size = emb_size 26 | 27 | def __len__(self): 28 | return self.length 29 | 30 | def __getitem__(self, i): 31 | x = np.random.randn(*self.size) 32 | y = np.random.randn(*self.emb_size).astype(np.float32) 33 | return {"jpg": x, "txt": y} 34 | 35 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/data/inpainting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/generative_models/stable-diffusion/ldm/data/inpainting/__init__.py -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/generative_models/stable-diffusion/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/generative_models/stable-diffusion/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/generative_models/stable-diffusion/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/generative_models/stable-diffusion/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/modules/image_degradation/utils/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HabanaAI/Model-References/da5f246c18f41063a89a04f19ce306d1e61f6345/PyTorch/generative_models/stable-diffusion/ldm/modules/image_degradation/utils/test.png -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/models/first_stage_models/kl-f16/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 16 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 16 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 1 23 | - 2 24 | - 2 25 | - 4 26 | num_res_blocks: 2 27 | attn_resolutions: 28 | - 16 29 | dropout: 0.0 30 | data: 31 | target: main.DataModuleFromConfig 32 | params: 33 | batch_size: 6 34 | wrap: true 35 | train: 36 | target: ldm.data.openimages.FullOpenImagesTrain 37 | params: 38 | size: 384 39 | crop_size: 256 40 | validation: 41 | target: ldm.data.openimages.FullOpenImagesValidation 42 | params: 43 | size: 384 44 | crop_size: 256 45 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/models/first_stage_models/kl-f32/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 64 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 64 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 1 23 | - 2 24 | - 2 25 | - 4 26 | - 4 27 | num_res_blocks: 2 28 | attn_resolutions: 29 | - 16 30 | - 8 31 | dropout: 0.0 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 6 36 | wrap: true 37 | train: 38 | target: ldm.data.openimages.FullOpenImagesTrain 39 | params: 40 | 
size: 384 41 | crop_size: 256 42 | validation: 43 | target: ldm.data.openimages.FullOpenImagesValidation 44 | params: 45 | size: 384 46 | crop_size: 256 47 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/models/first_stage_models/kl-f4/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 3 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 3 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 2 23 | - 4 24 | num_res_blocks: 2 25 | attn_resolutions: [] 26 | dropout: 0.0 27 | data: 28 | target: main.DataModuleFromConfig 29 | params: 30 | batch_size: 10 31 | wrap: true 32 | train: 33 | target: ldm.data.openimages.FullOpenImagesTrain 34 | params: 35 | size: 384 36 | crop_size: 256 37 | validation: 38 | target: ldm.data.openimages.FullOpenImagesValidation 39 | params: 40 | size: 384 41 | crop_size: 256 42 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/models/first_stage_models/kl-f8/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.AutoencoderKL 4 | params: 5 | monitor: val/rec_loss 6 | embed_dim: 4 7 | lossconfig: 8 | target: ldm.modules.losses.LPIPSWithDiscriminator 9 | params: 10 | disc_start: 50001 11 | kl_weight: 1.0e-06 12 | disc_weight: 0.5 13 | ddconfig: 14 | double_z: true 15 | z_channels: 4 16 | resolution: 256 17 | in_channels: 3 18 | out_ch: 3 19 | ch: 128 20 | ch_mult: 21 | - 1 22 | - 2 23 | - 4 24 | - 4 25 | num_res_blocks: 2 26 | attn_resolutions: [] 27 | dropout: 0.0 28 | data: 29 | target: main.DataModuleFromConfig 30 | params: 31 | batch_size: 4 32 | wrap: true 33 | train: 34 | target: ldm.data.openimages.FullOpenImagesTrain 35 | params: 36 | size: 384 37 | crop_size: 256 38 | validation: 39 | target: ldm.data.openimages.FullOpenImagesValidation 40 | params: 41 | size: 384 42 | crop_size: 256 43 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/models/first_stage_models/vq-f4-noattn/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 3 6 | n_embed: 8192 7 | monitor: val/rec_loss 8 | 9 | ddconfig: 10 | attn_type: none 11 | double_z: false 12 | z_channels: 3 13 | resolution: 256 14 | in_channels: 3 15 | out_ch: 3 16 | ch: 128 17 | ch_mult: 18 | - 1 19 | - 2 20 | - 4 21 | num_res_blocks: 2 22 | attn_resolutions: [] 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_start: 11 30 | disc_weight: 0.75 31 | codebook_weight: 1.0 32 | 33 | data: 34 | target: main.DataModuleFromConfig 35 | params: 36 | batch_size: 8 37 | num_workers: 12 38 | wrap: true 39 | train: 40 | target: ldm.data.openimages.FullOpenImagesTrain 41 | params: 42 | crop_size: 256 43 | validation: 44 | 
target: ldm.data.openimages.FullOpenImagesValidation 45 | params: 46 | crop_size: 256 47 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/models/first_stage_models/vq-f4/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 3 6 | n_embed: 8192 7 | monitor: val/rec_loss 8 | 9 | ddconfig: 10 | double_z: false 11 | z_channels: 3 12 | resolution: 256 13 | in_channels: 3 14 | out_ch: 3 15 | ch: 128 16 | ch_mult: 17 | - 1 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: [] 22 | dropout: 0.0 23 | lossconfig: 24 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 25 | params: 26 | disc_conditional: false 27 | disc_in_channels: 3 28 | disc_start: 0 29 | disc_weight: 0.75 30 | codebook_weight: 1.0 31 | 32 | data: 33 | target: main.DataModuleFromConfig 34 | params: 35 | batch_size: 8 36 | num_workers: 16 37 | wrap: true 38 | train: 39 | target: ldm.data.openimages.FullOpenImagesTrain 40 | params: 41 | crop_size: 256 42 | validation: 43 | target: ldm.data.openimages.FullOpenImagesValidation 44 | params: 45 | crop_size: 256 46 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/models/first_stage_models/vq-f8-n256/config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | base_learning_rate: 4.5e-06 3 | target: ldm.models.autoencoder.VQModel 4 | params: 5 | embed_dim: 4 6 | n_embed: 256 7 | monitor: val/rec_loss 8 | ddconfig: 9 | double_z: false 10 | z_channels: 4 11 | resolution: 256 12 | in_channels: 3 13 | out_ch: 3 14 | ch: 128 15 | ch_mult: 16 | - 1 17 | - 2 18 | - 2 19 | - 4 20 | num_res_blocks: 2 21 | attn_resolutions: 22 | - 32 23 | dropout: 0.0 24 | lossconfig: 25 | target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator 26 | params: 27 | disc_conditional: false 28 | disc_in_channels: 3 29 | disc_start: 250001 30 | disc_weight: 0.75 31 | codebook_weight: 1.0 32 | 33 | data: 34 | target: main.DataModuleFromConfig 35 | params: 36 | batch_size: 10 37 | num_workers: 20 38 | wrap: true 39 | train: 40 | target: ldm.data.openimages.FullOpenImagesTrain 41 | params: 42 | size: 384 43 | crop_size: 256 44 | validation: 45 | target: ldm.data.openimages.FullOpenImagesValidation 46 | params: 47 | size: 384 48 | crop_size: 256 49 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ops_bf16.txt: -------------------------------------------------------------------------------- 1 | addmm 2 | addcmul 3 | bmm 4 | conv2d 5 | layer_norm 6 | batch_norm 7 | linear 8 | dot 9 | mm 10 | matmul 11 | mv 12 | conv_transpose2d 13 | dropout 14 | gelu 15 | t 16 | div 17 | truediv 18 | softmax 19 | einsum 20 | group_norm 21 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/ops_fp32.txt: -------------------------------------------------------------------------------- 1 | cross_entropy 2 | log_softmax 3 | nll_loss 4 | topk 5 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/requirements.txt: -------------------------------------------------------------------------------- 1 | albumentations==0.4.3 2 | 
opencv-python 3 | pudb==2019.2 4 | imageio==2.9.0 5 | imageio-ffmpeg==0.4.2 6 | lightning==2.5.1 7 | lightning-habana==1.6.0 8 | torchmetrics==0.10.3 9 | omegaconf==2.1.1 10 | test-tube>=0.7.5 11 | streamlit>=0.73.1 12 | einops==0.3.0 13 | torch-fidelity==0.3.0 14 | transformers==4.48.0 15 | kornia==0.6 16 | webdataset==0.2.5 17 | -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers 18 | -e git+https://github.com/openai/CLIP.git@main#egg=clip 19 | -e . 20 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/cmd_on_new_ckpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | import fire 5 | 6 | 7 | class Checker(object): 8 | def __init__(self, filename, interval=60): 9 | self._cached_stamp = 0 10 | self.filename = filename 11 | self.interval = interval 12 | 13 | def check(self, cmd): 14 | while True: 15 | stamp = os.stat(self.filename).st_mtime 16 | if stamp != self._cached_stamp: 17 | self._cached_stamp = stamp 18 | print(f"{self.__class__.__name__}: Detected a new file at {self.filename}, running evaluation commands on it.") 19 | subprocess.run(cmd, shell=True) 20 | else: 21 | time.sleep(self.interval) 22 | 23 | 24 | def run(filename, cmd): 25 | checker = Checker(filename, interval=60) 26 | checker.check(cmd) 27 | 28 | 29 | if __name__ == "__main__": 30 | fire.Fire(run) 31 | 32 | 33 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/printckpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import fire 4 | 5 | 6 | def printit(p): 7 | print(f"printin' in path: {p}") 8 | size_initial = os.path.getsize(p) 9 | nsd = dict() 10 | sd = torch.load(p, map_location="cpu") 11 | if "global_step" in sd: 12 | print(f"This is global step {sd['global_step']}.") 13 | if "model_ema.num_updates" in sd["state_dict"]: 14 | print(f"And we got {sd['state_dict']['model_ema.num_updates']} EMA updates.") 15 | 16 | 17 | if __name__ == "__main__": 18 | fire.Fire(printit) 19 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/prompts/six-prompts: -------------------------------------------------------------------------------- 1 | the Tower of Babel by J.M.W. Turner 2 | advertisement for a psychedelic virtual reality headset, 16 bit sprite pixel art 3 | the gateway between dreams, trending on ArtStation 4 | Humanity is killed by AI, by James Gurney 5 | A fantasy painting of a city in a deep valley by Ivan Aivazovsky 6 | Darth Vader at Woodstock (1969) 7 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/prompts/weird-dalle-prompts.txt: -------------------------------------------------------------------------------- 1 | # TODO, check out Twitter. 
2 | Darth Vader at Woodstock (1969) 3 | Bunny Vikings 4 | The Demogorgon from Stranger Thinhs holding a basketball 5 | Hamster in my microwave 6 | a courtroom sketch of a Ford Transit van 7 | PS1 Hagrid at MCDonalds 8 | Karl Marx in KFC Logo 9 | Moai Statue giving a TED talk 10 | wahing machine trail cam 11 | minions at cross burning 12 | Hindenburg disaster in Fortnite -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/prompts/wings1.txt: -------------------------------------------------------------------------------- 1 | A portrait of Abraham Lincoln 2 | A portrait of Barack Obama 3 | A portrait of a nekomimi girl smiling 4 | a portrait of isaac newton the alchemist 5 | A portrait of Friedrich Nietzsche wearing an open double breasted suit with a bowtie 6 | Portrait of a cyberpunk cyborg man wearing alternate reality goggles 7 | Portrait of a woman screaming 8 | A portrait of a man in a flight jacket leaning against a biplane 9 | a cold landscape by Albert Bierstadt 10 | the monument of the ancients by van gogh 11 | the universal library 12 | a vision of paradise. unreal engine 13 | matte painting of cozy underground bunker wholefoods aisle, trending on artstation 14 | illustration of wooly mammoths reclaiming the arctic, trending on artstation 15 | a mountain range in the desert, Provia, Velvia 16 | the gateway between dreams, trending on ArtStation 17 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/prompts/wings2.txt: -------------------------------------------------------------------------------- 1 | a cityscape at night 2 | starry night by cyberpunk 3 | A fantasy painting of a city in a deep valley by Ivan Aivazovsky 4 | An oil painting of The New York City Skyline by Natalia Goncharova 5 | a rainy city street in the style of cyberpunk noir, trending on ArtStation 6 | an astral city in the style of cyberpunk noir art deco 7 | The Golden Gate Bridge in the style of art deco 8 | a city on a 70s science fiction novel cover 9 | An oil painting of A Vase Of Flowers 10 | still life oil painting of a smooth silver steel tungsten square cube box by Albrecht Dürer 11 | An oil painting of a bookshelf crammed with books, trending on artstation 12 | An N95 respirator mask in the style of art deco 13 | a surreal and organic stone monument to a plutonium atom 14 | oil painting of a candy dish of glass candies, mints, and other assorted sweets 15 | illustration of a ford model-t in pristine condition, trending on artstation 16 | illustration of DEC minicomputer console monitor retrocomputing teletype interdata PDP-11 univac, trending on artstation 17 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/prompts/wings3.txt: -------------------------------------------------------------------------------- 1 | The Rise Of Consciousness 2 | The Human Utility Function 3 | Revolution of the Souls 4 | a good amphetamine spirit 5 | Control The Soul 6 | The Lunatic, The Lover, and The Poet 7 | A Planet Ruled By Angels 8 | the Tower of Babel by J.M.W. Turner 9 | sketch of a 3D printer by Leonardo da Vinci 10 | In The Style Of M.C. 
Escher 11 | A cup of coffee by Picasso 12 | The US Capitol Building in the style of Kandinsky 13 | A Mysterious Orb by Andy Warhol 14 | The everlasting zero, a glimpse of a million, by Salvador Dali 15 | a painting of a haunted house with Halloween decorations by Giovanni Paolo Panini 16 | a painting of drops of Venus by Vincent van Gogh 17 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/prompts/wings4.txt: -------------------------------------------------------------------------------- 1 | ascii art of a man riding a bicycle 2 | cyberpunk noir art deco detective in space 3 | a cyborg angel in the style of ukiyo-e 4 | Hell in the style of pointillism 5 | Moloch in the style of socialist realism 6 | Metaphysics in the style of WPAP 7 | advertisement for a psychedelic virtual reality headset, 16 bit sprite pixel art 8 | a watercolor painting of a Christmas tree 9 | control room monitors televisions screens computers hacker lab, concept art, matte painting, trending on artstation 10 | a group of surgeons wait to cryonically suspend a patient 11 | technological singularity cult by James Gurney 12 | an autogyro flying car, trending on artstation 13 | illustration of airship zepplins in the skies, trending on artstation 14 | watercolor illustration of a martian colony geodesic dome aquaponics farming on the surface, trending on artstation 15 | humanity is killed by AI, by James Gurney 16 | the Vitruvian Man as a propaganda poster for transhumanism -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/slurm/README.md: -------------------------------------------------------------------------------- 1 | # Example 2 | 3 | Resume f8 @ 512 on Laion-HR 4 | 5 | ``` 6 | sbatch scripts/slurm/resume_512/sbatch.sh 7 | ``` 8 | 9 | # Reuse 10 | 11 | To reuse this as a template, copy `sbatch.sh` and `launcher.sh` somewhere. In 12 | `sbatch.sh`, adjust the lines 13 | 14 | ``` 15 | #SBATCH --job-name=stable-diffusion-512cont 16 | #SBATCH --nodes=24 17 | ``` 18 | 19 | and the path to your `launcher.sh` in the last line, 20 | 21 | ``` 22 | srun bash /fsx/stable-diffusion/stable-diffusion/scripts/slurm/resume_512/launcher.sh 23 | ``` 24 | 25 | In `launcher.sh`, adjust `CONFIG` and `EXTRA`. Maybe give it a test run with 26 | debug flags uncommented and a reduced number of nodes. 
27 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/slurm/resume_512_improvedaesthetic/launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NODE_RANK=${SLURM_NODEID} 3 | echo "##########################################" 4 | echo MASTER_ADDR=${MASTER_ADDR} 5 | echo MASTER_PORT=${MASTER_PORT} 6 | echo NODE_RANK=${NODE_RANK} 7 | echo WORLD_SIZE=${WORLD_SIZE} 8 | echo "##########################################" 9 | # debug environment worked great so we stick with it 10 | # no magic there, just a miniconda python=3.9, pytorch=1.12, cudatoolkit=11.3 11 | # env with pip dependencies from stable diffusion's requirements.txt 12 | eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)" 13 | conda activate stable 14 | cd /fsx/stable-diffusion/stable-diffusion 15 | 16 | CONFIG=configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512-improvedaesthetic.yaml 17 | EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T11-06-38_txt2img-1p4B-multinode-clip-encoder-high-res-512_improvedaesthetic/checkpoints/last.ckpt" 18 | DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5" 19 | 20 | python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG 21 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/slurm/resume_768_hr/launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NODE_RANK=${SLURM_NODEID} 3 | echo "##########################################" 4 | echo MASTER_ADDR=${MASTER_ADDR} 5 | echo MASTER_PORT=${MASTER_PORT} 6 | echo NODE_RANK=${NODE_RANK} 7 | echo WORLD_SIZE=${WORLD_SIZE} 8 | echo "##########################################" 9 | # debug environment worked great so we stick with it 10 | # no magic there, just a miniconda python=3.9, pytorch=1.12, cudatoolkit=11.3 11 | # env with pip dependencies from stable diffusion's requirements.txt 12 | eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)" 13 | conda activate stable 14 | cd /fsx/stable-diffusion/stable-diffusion 15 | 16 | CONFIG=configs/stable-diffusion/txt2img-multinode-clip-encoder-f16-768-laion-hr.yaml 17 | # EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/checkpoints/f16-33k+12k-hr_pruned.ckpt" 18 | EXTRA="model.params.ckpt_path=/fsx/stable-diffusion/stable-diffusion/logs/2022-07-09T20-06-38_txt2img-multinode-clip-encoder-f16-768-laion-hr/checkpoints/last.ckpt" 19 | DEBUG="-d True lightning.callbacks.image_logger.params.batch_frequency=5" 20 | 21 | python main.py --base $CONFIG --gpus 0,1,2,3,4,5,6,7 -t --num_nodes ${WORLD_SIZE} --scale_lr False $EXTRA #$DEBUG 22 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/scripts/test_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | eval "$(/fsx/stable-diffusion/debug/miniconda3/bin/conda shell.bash hook)" 3 | conda activate stable 4 | cd /fsx/stable-diffusion/stable-diffusion 5 | python scripts/test_gpu.py 6 | -------------------------------------------------------------------------------- /PyTorch/generative_models/stable-diffusion/setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='latent-diffusion', 5 | version='0.0.1', 6 | description='', 7 | packages=find_packages(), 8 | install_requires=[ 9 | 'torch', 10 | 'numpy', 11 | 'tqdm', 12 | ], 13 | ) -------------------------------------------------------------------------------- /PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | -------------------------------------------------------------------------------- /PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/requirements.txt: -------------------------------------------------------------------------------- 1 | # Used for downloading models over HTTP 2 | ipdb==0.13.9 3 | #Data processing 4 | h5py==3.9.0 ; python_version < '3.12' 5 | h5py==3.13.0 ; python_version >= '3.12' 6 | html2text==2020.1.16 7 | nltk>=3.6.7 8 | progressbar==2.5 9 | #Others 10 | git+https://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc 11 | torchmetrics>=0.8.0 12 | -------------------------------------------------------------------------------- /PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts/bert_1.5b_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "hidden_size": 1600, 3 | "hidden_act": "gelu", 4 | "initializer_range": 0.02, 5 | "vocab_size": 30522, 6 | "hidden_dropout_prob": 0.1, 7 | "num_attention_heads": 25, 8 | "type_vocab_size": 2, 9 | "max_position_embeddings": 512, 10 | "num_hidden_layers": 48, 11 | "intermediate_size": 6400, 12 | "attention_probs_dropout_prob": 0.1, 13 | "layer_norm_large_model": true 14 | } 15 | -------------------------------------------------------------------------------- /PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts/bert_5b_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "hidden_size": 2560, 3 | "hidden_act": "gelu", 4 | "initializer_range": 0.02, 5 | "vocab_size": 512035, 6 | "hidden_dropout_prob": 0.1, 7 | "num_attention_heads": 40, 8 | "type_vocab_size": 2, 9 | "max_position_embeddings": 512, 10 | "num_hidden_layers": 63, 11 | "intermediate_size": 10240, 12 | "attention_probs_dropout_prob": 0.1, 13 | "layer_norm_large_model": true 14 | } 15 | -------------------------------------------------------------------------------- /PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts/deepspeed_config_bert_1.5b.json: -------------------------------------------------------------------------------- 1 | { 2 | "steps_per_print": 48, 3 | "train_batch_size": 12288, 4 | "train_micro_batch_size_per_gpu": 32, 5 | "tensorboard": { 6 | "enabled": true, 7 | "output_path": "./results/bert_1.5b/tensorboard", 8 | "job_name": "bert_1.5b_lans_zero1_bf16" 9 | }, 
10 | "bf16": { "enabled": true }, 11 | "gradient_clipping": 1.0, 12 | "zero_optimization": { 13 | "stage": 1, 14 | "contiguous_gradients": false 15 | }, 16 | "zero_allow_untested_optimizer": true, 17 | "timers": { 18 | "throughput": { 19 | "enabled": true, 20 | "synchronized": false 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts/deepspeed_config_bert_5b_lans.json: -------------------------------------------------------------------------------- 1 | { 2 | "steps_per_print": 4, 3 | "train_batch_size": 12288, 4 | "train_micro_batch_size_per_gpu": 32, 5 | "tensorboard": { 6 | "enabled": false, 7 | "output_path": "./results/bert_5b_lans/tensorboard", 8 | "job_name": "bert_5b_lans_zero2_bf16" 9 | }, 10 | "bf16": { "enabled": true }, 11 | "gradient_clipping": 1.0, 12 | 13 | "zero_optimization": { "stage": 2, 14 | "overlap_comm": false, 15 | "reduce_scatter" : false, 16 | "contiguous_gradients" : false, 17 | "reduce_bucket_size": 150000000 18 | }, 19 | "zero_allow_untested_optimizer": true, 20 | "timers": { 21 | "throughput": { 22 | "enabled": true, 23 | "synchronized": false 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts/hostsfile: -------------------------------------------------------------------------------- 1 | 10.10.100.101 slots=8 2 | 10.10.100.102 slots=8 3 | 10.10.100.103 slots=8 4 | 10.10.100.104 slots=8 -------------------------------------------------------------------------------- /PyTorch/nlp/bert/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 1024, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 4096, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 16, 10 | "num_hidden_layers": 24, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /PyTorch/nlp/bert/bert_config_1.2B.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 1536, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 6144, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 16, 10 | "num_hidden_layers": 40, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /PyTorch/nlp/bert/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | -------------------------------------------------------------------------------- /PyTorch/nlp/bert/requirements.txt: -------------------------------------------------------------------------------- 1 | # Accessing files from S3 directly. 2 | boto3==1.26.75 3 | # Used for downloading models over HTTP 4 | ipdb==0.13.9 5 | #Data processing 6 | h5py==3.9.0 7 | html2text==2020.1.16 8 | nltk==3.8.1 9 | progressbar==2.5 10 | #Others 11 | git+https://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc 12 | --------------------------------------------------------------------------------