├── .bazelrc ├── .bazelversion ├── .buckconfig.oss ├── .ci ├── caffe2 │ ├── README.md │ ├── common.sh │ └── test.sh ├── docker │ ├── README.md │ ├── android │ │ ├── AndroidManifest.xml │ │ └── build.gradle │ ├── build.sh │ ├── build_docker.sh │ ├── centos-rocm │ │ └── Dockerfile │ ├── ci_commit_pins │ │ ├── triton-rocm.txt │ │ └── triton.txt │ ├── common │ │ ├── common_utils.sh │ │ ├── install_android.sh │ │ ├── install_base.sh │ │ ├── install_cache.sh │ │ ├── install_clang.sh │ │ ├── install_cmake.sh │ │ ├── install_conda.sh │ │ ├── install_cudnn.sh │ │ ├── install_db.sh │ │ ├── install_devtoolset.sh │ │ ├── install_docs_reqs.sh │ │ ├── install_gcc.sh │ │ ├── install_glibc.sh │ │ ├── install_jni.sh │ │ ├── install_lcov.sh │ │ ├── install_linter.sh │ │ ├── install_ninja.sh │ │ ├── install_onnx.sh │ │ ├── install_openmpi.sh │ │ ├── install_openssl.sh │ │ ├── install_protobuf.sh │ │ ├── install_rocm.sh │ │ ├── install_rocm_magma.sh │ │ ├── install_swiftshader.sh │ │ ├── install_thrift.sh │ │ ├── install_triton.sh │ │ ├── install_ucc.sh │ │ ├── install_user.sh │ │ ├── install_vision.sh │ │ └── install_vulkan_sdk.sh │ ├── java │ │ └── jni.h │ ├── linter │ │ └── Dockerfile │ ├── requirements-ci.txt │ ├── triton_version.txt │ ├── ubuntu-cuda │ │ └── Dockerfile │ ├── ubuntu-rocm │ │ ├── .gitignore │ │ └── Dockerfile │ └── ubuntu │ │ └── Dockerfile ├── onnx │ ├── README.md │ ├── common.sh │ └── test.sh └── pytorch │ ├── .shellcheckrc │ ├── README.md │ ├── build-asan.sh │ ├── build-mobile.sh │ ├── build.sh │ ├── codegen-test.sh │ ├── common-build.sh │ ├── common.sh │ ├── common_utils.sh │ ├── create_test_cert.py │ ├── docker-build-test.sh │ ├── docs-test.sh │ ├── fake_numpy │ └── numpy.py │ ├── macos-build-test.sh │ ├── macos-build.sh │ ├── macos-common.sh │ ├── macos-test.sh │ ├── multigpu-test.sh │ ├── perf_test │ ├── common.sh │ ├── compare_with_baseline.py │ ├── get_stats.py │ ├── test_cpu_speed_mini_sequence_labeler.sh │ ├── test_cpu_speed_mnist.sh │ ├── test_cpu_speed_torch.sh │ ├── test_cpu_speed_torch_tensor.sh │ ├── test_gpu_speed_cudnn_lstm.sh │ ├── test_gpu_speed_lstm.sh │ ├── test_gpu_speed_mlstm.sh │ ├── test_gpu_speed_mnist.sh │ ├── test_gpu_speed_word_language_model.sh │ └── update_commit_hash.py │ ├── print_sccache_log.py │ ├── run_glootls_test.sh │ ├── short-perf-test-cpu.sh │ ├── short-perf-test-gpu.sh │ ├── test.sh │ ├── win-build.sh │ ├── win-test-helpers │ ├── build_pytorch.bat │ ├── choose_runtime_cuda_version.bat │ ├── installation-helpers │ │ ├── activate_miniconda3.bat │ │ ├── install_magma.bat │ │ ├── install_mkl.bat │ │ └── install_sccache.bat │ ├── run_python_nn_smoketests.py │ ├── setup_pytorch_env.bat │ ├── test_custom_backend.bat │ ├── test_custom_script_ops.bat │ ├── test_distributed.bat │ ├── test_libtorch.bat │ ├── test_python_jit_legacy.bat │ └── test_python_shard.bat │ └── win-test.sh ├── .clang-format ├── .clang-tidy ├── .cmakelintrc ├── .coveragerc ├── .ctags.d └── pytorch.ctags ├── .dockerignore ├── .flake8 ├── .git-blame-ignore-revs ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE.md ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── ci-sev.md │ ├── config.yml │ ├── disable-ci-jobs.md │ ├── documentation.yml │ ├── feature-request.yml │ └── pt2-bug-report.yml ├── PULL_REQUEST_TEMPLATE.md ├── actionlint.yaml ├── actions │ ├── build-android │ │ └── action.yml │ ├── calculate-docker-image │ │ └── action.yml │ ├── checkout-pytorch │ │ └── action.yml │ ├── chown-workspace │ │ └── action.yml │ ├── diskspace-cleanup │ │ └── action.yml │ ├── download-build-artifacts │ │ 
└── action.yml │ ├── filter-test-configs │ │ └── action.yml │ ├── get-workflow-job-id │ │ └── action.yml │ ├── setup-linux │ │ └── action.yml │ ├── setup-rocm │ │ └── action.yml │ ├── setup-win │ │ └── action.yml │ ├── teardown-rocm │ │ └── action.yml │ ├── teardown-win │ │ └── action.yml │ ├── test-pytorch-binary │ │ └── action.yml │ └── upload-test-artifacts │ │ └── action.yml ├── auto_request_review.yml ├── ci_commit_pins │ ├── audio.txt │ ├── huggingface.txt │ ├── multipy.txt │ ├── text.txt │ ├── timm.txt │ ├── torchbench.txt │ ├── triton.txt │ ├── vision.txt │ └── xla.txt ├── labeler.yml ├── merge_rules.yaml ├── pytorch-circleci-labels.yml ├── pytorch-probot.yml ├── regenerate.sh ├── requirements-gha-cache.txt ├── requirements │ ├── README.md │ ├── conda-env-Linux-X64 │ ├── conda-env-iOS │ ├── conda-env-macOS-ARM64 │ ├── conda-env-macOS-X64 │ ├── pip-requirements-iOS.txt │ ├── pip-requirements-macOS.txt │ └── regenerate-requirements.txt ├── scripts │ ├── README.md │ ├── build_triton_wheel.py │ ├── check_labels.py │ ├── collect_ciflow_labels.py │ ├── comment_on_pr.py │ ├── convert_lintrunner_annotations_to_github.py │ ├── ensure_actions_will_cancel.py │ ├── export_pytorch_labels.py │ ├── fetch_latest_green_commit.py │ ├── filter_test_configs.py │ ├── generate_binary_build_matrix.py │ ├── generate_ci_workflows.py │ ├── generate_pytorch_version.py │ ├── get_workflow_job_id.py │ ├── github_utils.py │ ├── gitutils.py │ ├── gql_mocks.json │ ├── kill_active_ssh_sessions.ps1 │ ├── label_utils.py │ ├── lint_native_functions.py │ ├── on_cancel_merge.py │ ├── parse_ref.py │ ├── pr-sanity-check.sh │ ├── report_git_status.sh │ ├── rockset_mocks.json │ ├── run_torchbench.py │ ├── stop_runner_service.sh │ ├── test_check_labels.py │ ├── test_fetch_latest_green_commit.py │ ├── test_filter_test_configs.py │ ├── test_gitutils.py │ ├── test_label_utils.py │ ├── test_trymerge.py │ ├── test_tryrebase.py │ ├── trymerge.py │ ├── trymerge_explainer.py │ ├── tryrebase.py │ ├── update_commit_hashes.py │ └── wait_for_ssh_to_drain.ps1 └── templates │ ├── common.yml.j2 │ ├── linux_binary_build_workflow.yml.j2 │ ├── macos_binary_build_workflow.yml.j2 │ ├── upload.yml.j2 │ └── windows_binary_build_workflow.yml.j2 ├── .gitignore ├── .gitmodules ├── .isort.cfg ├── .lintrunner.toml ├── .lldbinit ├── BUILD.bazel ├── LICENSE ├── README.md ├── SECURITY.md ├── benchmarks ├── README.md ├── compare-fastrnn-results.py ├── compare.sh ├── cpp │ ├── CMakeLists.txt │ ├── convolution.cpp │ └── tensorexpr │ │ ├── CMakeLists.txt │ │ ├── bench_approx.cpp │ │ ├── bench_batchnorm.cpp │ │ ├── bench_compile.cpp │ │ ├── bench_concat.cpp │ │ ├── bench_fuser_overhead.cpp │ │ ├── bench_gemm.cpp │ │ ├── bench_kernels.cpp │ │ ├── bench_ops.py │ │ ├── bench_parallel.cpp │ │ ├── bench_prefix_sum.cpp │ │ ├── bench_reduce.cpp │ │ ├── bench_signed_log1p.cpp │ │ └── main.cpp ├── distributed │ ├── ddp │ │ ├── README.md │ │ ├── benchmark.py │ │ └── diff.py │ ├── pipeline │ │ ├── benchmark_dataset.py │ │ └── pipe.py │ └── rpc │ │ ├── parameter_server │ │ ├── README.md │ │ ├── configurations │ │ │ ├── data_configurations.json │ │ │ └── model_configurations.json │ │ ├── data │ │ │ ├── DummyData.py │ │ │ └── __init__.py │ │ ├── launcher.py │ │ ├── metrics │ │ │ ├── CPUMetric.py │ │ │ ├── CUDAMetric.py │ │ │ ├── MetricBase.py │ │ │ ├── MetricsLogger.py │ │ │ └── ProcessedMetricsPrinter.py │ │ ├── models │ │ │ ├── DummyModel.py │ │ │ └── __init__.py │ │ ├── server │ │ │ ├── __init__.py │ │ │ └── server.py │ │ ├── trainer │ │ │ ├── __init__.py │ │ │ ├── 
criterions.py │ │ │ ├── ddp_models.py │ │ │ ├── hook_states.py │ │ │ ├── hooks.py │ │ │ ├── iteration_steps.py │ │ │ ├── preprocess_data.py │ │ │ └── trainer.py │ │ └── utils.py │ │ └── rl │ │ ├── README.md │ │ ├── agent.py │ │ ├── coordinator.py │ │ ├── launcher.py │ │ └── observer.py ├── dynamo │ ├── Makefile │ ├── README.md │ ├── __init__.py │ ├── all_torchbench_models_list.txt │ ├── benchmarks.py │ ├── check_accuracy.py │ ├── check_csv.py │ ├── check_graph_breaks.py │ ├── check_hf_bert_perf_csv.py │ ├── check_memory_compression_ratio.py │ ├── ci_expected_accuracy │ │ ├── inductor_huggingface_dynamic_inference.csv │ │ ├── inductor_huggingface_dynamic_training.csv │ │ ├── inductor_huggingface_inference.csv │ │ ├── inductor_huggingface_training.csv │ │ ├── inductor_timm_dynamic_inference.csv │ │ ├── inductor_timm_dynamic_training.csv │ │ ├── inductor_timm_inference.csv │ │ ├── inductor_timm_training.csv │ │ ├── inductor_torchbench_dynamic_inference.csv │ │ ├── inductor_torchbench_dynamic_training.csv │ │ ├── inductor_torchbench_inference.csv │ │ ├── inductor_torchbench_training.csv │ │ └── update_expected.py │ ├── combine_csv.py │ ├── common.py │ ├── dist_util.py │ ├── distributed.py │ ├── expected_ci_perf_inductor_torchbench.csv │ ├── huggingface.py │ ├── huggingface_models_list.txt │ ├── huggingface_models_list_cpu.txt │ ├── microbenchmarks │ │ ├── __init__.py │ │ ├── bench_mm_fusion.py │ │ ├── benchmark_helper.py │ │ ├── inductor_bmm.py │ │ ├── inductor_mm.py │ │ ├── matmul_relu.py │ │ ├── microbench.py │ │ ├── model.py │ │ ├── operator_inp_logs │ │ │ ├── hf_train │ │ │ │ ├── AlbertForMaskedLM_training.txt │ │ │ │ ├── AlbertForQuestionAnswering_training.txt │ │ │ │ ├── AllenaiLongformerBase_training.txt │ │ │ │ ├── BartForCausalLM_training.txt │ │ │ │ ├── BartForConditionalGeneration_training.txt │ │ │ │ ├── BertForMaskedLM_training.txt │ │ │ │ ├── BertForQuestionAnswering_training.txt │ │ │ │ ├── BigBird_training.txt │ │ │ │ ├── BlenderbotSmallForCausalLM_training.txt │ │ │ │ ├── BlenderbotSmallForConditionalGeneration_training.txt │ │ │ │ ├── CamemBert_training.txt │ │ │ │ ├── DebertaForMaskedLM_training.txt │ │ │ │ ├── DebertaForQuestionAnswering_training.txt │ │ │ │ ├── DebertaV2ForMaskedLM_training.txt │ │ │ │ ├── DebertaV2ForQuestionAnswering_training.txt │ │ │ │ ├── DistilBertForMaskedLM_training.txt │ │ │ │ ├── DistilBertForQuestionAnswering_training.txt │ │ │ │ ├── DistillGPT2_training.txt │ │ │ │ ├── ElectraForCausalLM_training.txt │ │ │ │ ├── ElectraForQuestionAnswering_training.txt │ │ │ │ ├── GPT2ForSequenceClassification_training.txt │ │ │ │ ├── GPTNeoForCausalLM_training.txt │ │ │ │ ├── GPTNeoForSequenceClassification_training.txt │ │ │ │ ├── GoogleFnet_training.txt │ │ │ │ ├── LayoutLMForMaskedLM_training.txt │ │ │ │ ├── LayoutLMForSequenceClassification_training.txt │ │ │ │ ├── M2M100ForConditionalGeneration_training.txt │ │ │ │ ├── MBartForCausalLM_training.txt │ │ │ │ ├── MBartForConditionalGeneration_training.txt │ │ │ │ ├── MegatronBertForCausalLM_training.txt │ │ │ │ ├── MegatronBertForQuestionAnswering_training.txt │ │ │ │ ├── MobileBertForMaskedLM_training.txt │ │ │ │ ├── MobileBertForQuestionAnswering_training.txt │ │ │ │ ├── OPTForCausalLM_training.txt │ │ │ │ ├── PLBartForCausalLM_training.txt │ │ │ │ ├── PLBartForConditionalGeneration_training.txt │ │ │ │ ├── PegasusForCausalLM_training.txt │ │ │ │ ├── PegasusForConditionalGeneration_training.txt │ │ │ │ ├── RobertaForCausalLM_training.txt │ │ │ │ ├── RobertaForQuestionAnswering_training.txt │ │ │ │ ├── 
Speech2Text2ForCausalLM_training.txt │ │ │ │ ├── TrOCRForCausalLM_training.txt │ │ │ │ ├── XGLMForCausalLM_training.txt │ │ │ │ ├── XLNetLMHeadModel_training.txt │ │ │ │ └── YituTechConvBert_training.txt │ │ │ ├── timm_train │ │ │ │ ├── adv_inception_v3_training.txt │ │ │ │ ├── beit_base_patch16_224_training.txt │ │ │ │ ├── botnet26t_256_training.txt │ │ │ │ ├── cait_m36_384_training.txt │ │ │ │ ├── coat_lite_mini_training.txt │ │ │ │ ├── convmixer_768_32_training.txt │ │ │ │ ├── convnext_base_training.txt │ │ │ │ ├── crossvit_9_240_training.txt │ │ │ │ ├── cspdarknet53_training.txt │ │ │ │ ├── deit_base_distilled_patch16_224_training.txt │ │ │ │ ├── densenet121_training.txt │ │ │ │ ├── dla102_training.txt │ │ │ │ ├── dm_nfnet_f0_training.txt │ │ │ │ ├── dpn107_training.txt │ │ │ │ ├── eca_botnext26ts_256_training.txt │ │ │ │ ├── eca_halonext26ts_training.txt │ │ │ │ ├── ecaresnet101d_training.txt │ │ │ │ ├── ese_vovnet19b_dw_training.txt │ │ │ │ ├── fbnetc_100_training.txt │ │ │ │ ├── fbnetv3_b_training.txt │ │ │ │ ├── gernet_l_training.txt │ │ │ │ ├── ghostnet_100_training.txt │ │ │ │ ├── gluon_inception_v3_training.txt │ │ │ │ ├── gluon_senet154_training.txt │ │ │ │ ├── gluon_xception65_training.txt │ │ │ │ ├── gmixer_24_224_training.txt │ │ │ │ ├── gmlp_s16_224_training.txt │ │ │ │ ├── hardcorenas_a_training.txt │ │ │ │ ├── hrnet_w18_training.txt │ │ │ │ ├── inception_v3_training.txt │ │ │ │ ├── jx_nest_base_training.txt │ │ │ │ ├── lcnet_050_training.txt │ │ │ │ ├── legacy_senet154_training.txt │ │ │ │ ├── levit_128_training.txt │ │ │ │ ├── mixer_b16_224_training.txt │ │ │ │ ├── mixnet_l_training.txt │ │ │ │ ├── mnasnet_100_training.txt │ │ │ │ ├── mobilenetv2_100_training.txt │ │ │ │ ├── mobilenetv3_large_100_training.txt │ │ │ │ ├── mobilevit_s_training.txt │ │ │ │ ├── nasnetalarge_training.txt │ │ │ │ ├── nfnet_l0_training.txt │ │ │ │ ├── pit_b_224_training.txt │ │ │ │ ├── pnasnet5large_training.txt │ │ │ │ ├── poolformer_m36_training.txt │ │ │ │ ├── regnety_002_training.txt │ │ │ │ ├── repvgg_a2_training.txt │ │ │ │ ├── res2net101_26w_4s_training.txt │ │ │ │ ├── res2net50_14w_8s_training.txt │ │ │ │ ├── res2next50_training.txt │ │ │ │ ├── resmlp_12_224_training.txt │ │ │ │ ├── resnest101e_training.txt │ │ │ │ ├── resnet18_training.txt │ │ │ │ ├── rexnet_100_training.txt │ │ │ │ ├── sebotnet33ts_256_training.txt │ │ │ │ ├── selecsls42b_training.txt │ │ │ │ ├── spnasnet_100_training.txt │ │ │ │ ├── swin_base_patch4_window7_224_training.txt │ │ │ │ ├── swsl_resnext101_32x16d_training.txt │ │ │ │ ├── tf_efficientnet_b0_training.txt │ │ │ │ ├── tf_mixnet_l_training.txt │ │ │ │ ├── tinynet_a_training.txt │ │ │ │ ├── tnt_s_patch16_224_training.txt │ │ │ │ ├── twins_pcpvt_base_training.txt │ │ │ │ ├── visformer_small_training.txt │ │ │ │ ├── vit_base_patch16_224_training.txt │ │ │ │ └── volo_d1_224_training.txt │ │ │ └── torchbench_train │ │ │ │ ├── BERT_pytorch_training.txt │ │ │ │ ├── Background_Matting_training.txt │ │ │ │ ├── LearningToPaint_training.txt │ │ │ │ ├── Super_SloMo_training.txt │ │ │ │ ├── alexnet_training.txt │ │ │ │ ├── attention_is_all_you_need_pytorch_training.txt │ │ │ │ ├── dcgan_training.txt │ │ │ │ ├── densenet121_training.txt │ │ │ │ ├── fambench_dlrm_training.txt │ │ │ │ ├── fastNLP_Bert_training.txt │ │ │ │ ├── hf_Albert_training.txt │ │ │ │ ├── hf_Bart_training.txt │ │ │ │ ├── hf_Bert_training.txt │ │ │ │ ├── hf_BigBird_training.txt │ │ │ │ ├── hf_DistilBert_training.txt │ │ │ │ ├── hf_GPT2_training.txt │ │ │ │ ├── hf_Longformer_training.txt │ │ │ │ ├── 
maml_omniglot_training.txt │ │ │ │ ├── mnasnet1_0_training.txt │ │ │ │ ├── mobilenet_v2_training.txt │ │ │ │ ├── mobilenet_v3_large_training.txt │ │ │ │ ├── nvidia_deeprecommender_training.txt │ │ │ │ ├── pytorch_CycleGAN_and_pix2pix_training.txt │ │ │ │ ├── pytorch_stargan_training.txt │ │ │ │ ├── pytorch_struct_training.txt │ │ │ │ ├── pytorch_unet_training.txt │ │ │ │ ├── resnet18_training.txt │ │ │ │ ├── resnet50_training.txt │ │ │ │ ├── resnext50_32x4d_training.txt │ │ │ │ ├── shufflenet_v2_x1_0_training.txt │ │ │ │ ├── speech_transformer_training.txt │ │ │ │ ├── squeezenet1_1_training.txt │ │ │ │ ├── timm_efficientdet_training.txt │ │ │ │ ├── timm_efficientnet_training.txt │ │ │ │ ├── timm_nfnet_training.txt │ │ │ │ ├── timm_regnet_training.txt │ │ │ │ ├── timm_resnest_training.txt │ │ │ │ ├── timm_vision_transformer_training.txt │ │ │ │ ├── timm_vovnet_training.txt │ │ │ │ ├── tts_angular_training.txt │ │ │ │ ├── vgg16_training.txt │ │ │ │ ├── vision_maskrcnn_training.txt │ │ │ │ └── yolov3_training.txt │ │ ├── operator_inp_utils.py │ │ ├── operatorbench.py │ │ └── utils.py │ ├── parse_logs.py │ ├── run_all.sh │ ├── run_delta.sh │ ├── runner.py │ ├── summarize_perf.py │ ├── test.py │ ├── timm_models.py │ ├── timm_models_list.txt │ ├── timm_models_list_cpu.txt │ ├── torchbench.py │ ├── torchbench_models_list.txt │ ├── torchbench_models_list_cpu.txt │ └── training_loss.py ├── fastrnns │ ├── README.md │ ├── __init__.py │ ├── bench.py │ ├── cells.py │ ├── conftest.py │ ├── custom_lstms.py │ ├── factory.py │ ├── fuser.py │ ├── profile.py │ ├── runner.py │ ├── scratch.py │ ├── test.py │ └── test_bench.py ├── framework_overhead_benchmark │ ├── C2Module.py │ ├── SimpleAddModule.py │ ├── framework_overhead_benchmark.py │ ├── pt_wrapper_module.py │ └── utils.py ├── functional_autograd_benchmark │ ├── README.md │ ├── audio_text_models.py │ ├── compare.py │ ├── functional_autograd_benchmark.py │ ├── ppl_models.py │ ├── torchaudio_models.py │ ├── torchvision_models.py │ ├── utils.py │ └── vision_models.py ├── fuser │ ├── plot_speedups.py │ └── run_benchmarks.py ├── instruction_counts │ ├── README.md │ ├── applications │ │ ├── __init__.py │ │ └── ci.py │ ├── core │ │ ├── __init__.py │ │ ├── api.py │ │ ├── expand.py │ │ ├── types.py │ │ └── utils.py │ ├── definitions │ │ ├── __init__.py │ │ ├── setup.py │ │ └── standard.py │ ├── execution │ │ ├── __init__.py │ │ ├── runner.py │ │ └── work.py │ ├── main.py │ └── worker │ │ ├── __init__.py │ │ └── main.py ├── nested │ └── nested_bmm_bench.py ├── operator_benchmark │ ├── README.md │ ├── __init__.py │ ├── benchmark_all_other_test.py │ ├── benchmark_all_quantized_test.py │ ├── benchmark_all_test.py │ ├── benchmark_caffe2.py │ ├── benchmark_core.py │ ├── benchmark_pytorch.py │ ├── benchmark_runner.py │ ├── benchmark_test_generator.py │ ├── benchmark_utils.py │ ├── c2 │ │ ├── __init__.py │ │ ├── add_test.py │ │ ├── batch_box_cox_test.py │ │ ├── batch_gather_test.py │ │ ├── clip_ranges_test.py │ │ ├── concat_test.py │ │ ├── matmul_test.py │ │ ├── quantile_op_test.py │ │ └── replace_nan_test.py │ ├── common │ │ ├── __init__.py │ │ ├── repeat_benchmark.py │ │ └── tests │ │ │ ├── add_ops_list_test.py │ │ │ ├── c2_cpu_gpu_forward_backward_test.py │ │ │ ├── jit_forward_test.py │ │ │ ├── pt_backward_test.py │ │ │ ├── pt_configs_list_test.py │ │ │ ├── pt_cpu_gpu_forward_backward_test.py │ │ │ └── random_sample_test.py │ ├── operator_benchmark.py │ ├── pt │ │ ├── __init__.py │ │ ├── add_test.py │ │ ├── ao_sparsifier_test.py │ │ ├── as_strided_test.py │ │ ├── 
batchnorm_test.py │ │ ├── binary_test.py │ │ ├── bmm_test.py │ │ ├── cat_test.py │ │ ├── channel_shuffle_test.py │ │ ├── chunk_test.py │ │ ├── clip_ranges_test.py │ │ ├── configs.py │ │ ├── conv_test.py │ │ ├── diag_test.py │ │ ├── embeddingbag_test.py │ │ ├── fill_test.py │ │ ├── gather_test.py │ │ ├── gelu_test.py │ │ ├── groupnorm_test.py │ │ ├── hardsigmoid_test.py │ │ ├── hardswish_test.py │ │ ├── index_select_test.py │ │ ├── instancenorm_test.py │ │ ├── interpolate_test.py │ │ ├── layernorm_test.py │ │ ├── linear_prepack_fp16_test.py │ │ ├── linear_test.py │ │ ├── linear_unpack_fp16_test.py │ │ ├── matmul_test.py │ │ ├── matrix_mult_test.py │ │ ├── nan_to_num_test.py │ │ ├── pool_test.py │ │ ├── qactivation_test.py │ │ ├── qarithmetic_test.py │ │ ├── qatembedding_ops_test.py │ │ ├── qbatchnorm_test.py │ │ ├── qcat_test.py │ │ ├── qcomparators_test.py │ │ ├── qconv_test.py │ │ ├── qembedding_bag_lookups_test.py │ │ ├── qembedding_pack_test.py │ │ ├── qembeddingbag_test.py │ │ ├── qgroupnorm_test.py │ │ ├── qinstancenorm_test.py │ │ ├── qinterpolate_test.py │ │ ├── qlayernorm_test.py │ │ ├── qlinear_test.py │ │ ├── qobserver_test.py │ │ ├── qpool_test.py │ │ ├── qrnn_test.py │ │ ├── qtensor_method_test.py │ │ ├── quantization_test.py │ │ ├── qunary_test.py │ │ ├── remainder_test.py │ │ ├── softmax_test.py │ │ ├── split_test.py │ │ ├── stack_test.py │ │ ├── sum_test.py │ │ ├── tensor_to_test.py │ │ └── unary_test.py │ └── pt_extension │ │ ├── cpp_extension_test.py │ │ ├── extension.cpp │ │ └── setup.py ├── overrides_benchmark │ ├── README.md │ ├── bench.py │ ├── common.py │ └── pyspybench.py ├── profiler_benchmark │ ├── profiler_bench.py │ └── resnet_memory_profiler.py ├── record_function_benchmark │ └── record_function_bench.py ├── serialization │ ├── nested_annotation_str.py │ └── simple_measurement.py ├── sparse │ ├── README.md │ ├── __init__.py │ ├── dlmc │ │ ├── README.md │ │ ├── __init__.py │ │ ├── matmul_bench.py │ │ ├── test.sh │ │ └── utils.py │ ├── spmm.py │ ├── spmv.py │ ├── test_csr.sh │ └── utils.py ├── static_runtime │ ├── CMakeLists.txt │ ├── deep_wide_pt.cc │ ├── deep_wide_pt.h │ ├── deep_wide_pt_bench.cc │ ├── test_cpu_fusion.cc │ ├── test_generated_ops.cc │ ├── test_static_module.cc │ ├── test_static_runtime.cc │ ├── test_utils.cc │ └── test_utils.h ├── tensorexpr │ ├── HowToRun.md │ ├── __main__.py │ ├── attention.py │ ├── benchmark.py │ ├── broadcast.py │ ├── concat.py │ ├── conv.py │ ├── elementwise.py │ ├── matmul.py │ ├── microbenchmarks.py │ ├── nnc.png │ ├── normalization.py │ ├── pooling.py │ ├── pt_engine.py │ ├── reduction.py │ ├── rnn_eltwise.py │ ├── softmax.py │ ├── swish.py │ └── tensor_engine.py ├── transformer │ ├── better_transformer_vs_mha_functional.py │ ├── sdp.py │ └── sdp_backwards.py └── upload_scribe.py ├── binaries ├── CMakeLists.txt ├── aot_model_compiler.cc ├── at_launch_benchmark.cc ├── bench_gen │ └── bench_gen.py ├── benchmark_args.h ├── benchmark_helper.cc ├── benchmark_helper.h ├── caffe2_benchmark.cc ├── compare_models_torch.cc ├── convert_and_benchmark.cc ├── convert_caffe_image_db.cc ├── convert_db.cc ├── convert_encoded_to_raw_leveldb.cc ├── convert_image_to_tensor.cc ├── core_overhead_benchmark.cc ├── core_overhead_benchmark_gpu.cc ├── db_throughput.cc ├── dump_operator_names.cc ├── inspect_gpu.cc ├── intra_inter_benchmark.cc ├── lite_interpreter_model_load.cc ├── load_benchmark_torch.cc ├── make_cifar_db.cc ├── make_image_db.cc ├── make_mnist_db.cc ├── optimize_for_mobile.cc ├── parallel_info.cc ├── predictor_verifier.cc ├── 
print_core_object_sizes_gpu.cc ├── print_registered_core_operators.cc ├── record_function_benchmark.cc ├── run_plan.cc ├── run_plan_mpi.cc ├── speed_benchmark.cc ├── speed_benchmark_torch.cc ├── split_db.cc ├── tsv_2_proto.cc ├── tutorial_blob.cc └── zmq_feeder.cc ├── third_party ├── BUCK.oss ├── BUILD ├── LICENSES_BUNDLED.txt ├── METADATA.bzl ├── README.md ├── build_bundled.py ├── cuda.BUILD ├── cudnn.BUILD ├── cutlass.BUILD ├── eigen.BUILD ├── fmt.BUILD ├── foxi.BUILD ├── generate-cpuinfo-wrappers.py ├── generate-xnnpack-wrappers.py ├── glog.buck.bzl ├── gloo.BUILD ├── ideep.BUILD ├── kineto.BUILD ├── kineto.buck.bzl ├── miniz-2.1.0 │ ├── BUILD.bazel │ ├── ChangeLog.md │ ├── LICENSE │ ├── examples │ │ ├── example1.c │ │ ├── example2.c │ │ ├── example3.c │ │ ├── example4.c │ │ ├── example5.c │ │ └── example6.c │ ├── miniz.c │ ├── miniz.h │ └── readme.md ├── mkl-dnn.BUILD ├── mkl.BUILD ├── mkl_headers.BUILD ├── nvfuser │ ├── CMakeLists.txt │ ├── benchmark │ │ ├── CMakeLists.txt │ │ ├── batch_norm_channels_first.cpp │ │ ├── batch_norm_channels_first_backward.cpp │ │ ├── batch_norm_channels_last.cpp │ │ ├── batch_norm_channels_last_backward.cpp │ │ ├── bert.cpp │ │ ├── broadcast.cpp │ │ ├── gelu_backward.cpp │ │ ├── heuristic_cache.cpp │ │ ├── heuristic_lookup.cpp │ │ ├── instance_norm.cpp │ │ ├── layer_norm.cpp │ │ ├── layer_norm_backward.cpp │ │ ├── lstm_cell.cpp │ │ ├── main.cpp │ │ ├── matmul.cpp │ │ ├── reduction.cpp │ │ ├── rms_norm.cpp │ │ ├── rms_norm_backward.cpp │ │ ├── scale_bias_relu.cpp │ │ ├── shape_inference.cpp │ │ ├── softmax.cpp │ │ ├── softmax_backward.cpp │ │ ├── softmax_dropout.cpp │ │ ├── timm.cpp │ │ ├── transpose.cpp │ │ ├── utils.cpp │ │ └── utils.h │ ├── csrc │ │ ├── arith.cpp │ │ ├── arith.h │ │ ├── codegen.cpp │ │ ├── codegen.h │ │ ├── compute_at.cpp │ │ ├── compute_at.h │ │ ├── compute_at_map.cpp │ │ ├── compute_at_map.h │ │ ├── contiguity.cpp │ │ ├── contiguity.h │ │ ├── disjoint_set.h │ │ ├── dispatch.cpp │ │ ├── dispatch.h │ │ ├── docs │ │ │ ├── .gitignore │ │ │ ├── documentation.h │ │ │ ├── fuser.doxygen │ │ │ ├── images │ │ │ │ └── ir_architecture.png │ │ │ └── main_page.md │ │ ├── dynamic_type.h │ │ ├── evaluator_common.cpp │ │ ├── evaluator_common.h │ │ ├── executor.cpp │ │ ├── executor.h │ │ ├── executor_kernel_arg.cpp │ │ ├── executor_kernel_arg.h │ │ ├── executor_launch_params.cpp │ │ ├── executor_launch_params.h │ │ ├── executor_utils.cpp │ │ ├── executor_utils.h │ │ ├── expr_evaluator.cpp │ │ ├── expr_evaluator.h │ │ ├── fusion.cpp │ │ ├── fusion.h │ │ ├── fusion_segmenter.cpp │ │ ├── fusion_segmenter.h │ │ ├── graph_fuser.cpp │ │ ├── grouped_reduction.cpp │ │ ├── grouped_reduction.h │ │ ├── index_compute.cpp │ │ ├── index_compute.h │ │ ├── inlining.cpp │ │ ├── inlining.h │ │ ├── instrumentation.cpp │ │ ├── instrumentation.h │ │ ├── ir_all_nodes.h │ │ ├── ir_base_nodes.cpp │ │ ├── ir_base_nodes.h │ │ ├── ir_builder.cpp │ │ ├── ir_builder.h │ │ ├── ir_cloner.cpp │ │ ├── ir_cloner.h │ │ ├── ir_container.cpp │ │ ├── ir_container.h │ │ ├── ir_graphviz.cpp │ │ ├── ir_graphviz.h │ │ ├── ir_interface_nodes.h │ │ ├── ir_internal_nodes.h │ │ ├── ir_iostream.cpp │ │ ├── ir_iostream.h │ │ ├── ir_nodes.cpp │ │ ├── ir_printer.h │ │ ├── ir_utils.cpp │ │ ├── ir_utils.h │ │ ├── iter_visitor.cpp │ │ ├── iter_visitor.h │ │ ├── kernel.cpp │ │ ├── kernel.h │ │ ├── kernel_cache.cpp │ │ ├── kernel_cache.h │ │ ├── kernel_expr_evaluator.cpp │ │ ├── kernel_expr_evaluator.h │ │ ├── kernel_ir.cpp │ │ ├── kernel_ir.h │ │ ├── kernel_ir_dispatch.cpp │ │ ├── kernel_ir_dispatch.h │ 
│ ├── lower2device.cpp │ │ ├── lower2device.h │ │ ├── lower_alias_memory.cpp │ │ ├── lower_alias_memory.h │ │ ├── lower_allocation.cpp │ │ ├── lower_allocation.h │ │ ├── lower_bank_conflict.cpp │ │ ├── lower_bank_conflict.h │ │ ├── lower_divisible_split.cpp │ │ ├── lower_divisible_split.h │ │ ├── lower_double_buffer.cpp │ │ ├── lower_double_buffer.h │ │ ├── lower_expr_sort.cpp │ │ ├── lower_expr_sort.h │ │ ├── lower_fused_reduction.cpp │ │ ├── lower_fused_reduction.h │ │ ├── lower_fusion_simplifier.cpp │ │ ├── lower_fusion_simplifier.h │ │ ├── lower_index.cpp │ │ ├── lower_index.h │ │ ├── lower_index_compute.cpp │ │ ├── lower_index_compute.h │ │ ├── lower_index_hoist.cpp │ │ ├── lower_index_hoist.h │ │ ├── lower_insert_syncs.cpp │ │ ├── lower_insert_syncs.h │ │ ├── lower_instrument.cpp │ │ ├── lower_instrument.h │ │ ├── lower_loops.cpp │ │ ├── lower_loops.h │ │ ├── lower_magic_zero.cpp │ │ ├── lower_magic_zero.h │ │ ├── lower_misaligned_vectorization.cpp │ │ ├── lower_misaligned_vectorization.h │ │ ├── lower_predicate.cpp │ │ ├── lower_predicate.h │ │ ├── lower_predicate_elimination.cpp │ │ ├── lower_predicate_elimination.h │ │ ├── lower_replace_size.cpp │ │ ├── lower_replace_size.h │ │ ├── lower_shift.cpp │ │ ├── lower_shift.h │ │ ├── lower_sync_information.cpp │ │ ├── lower_sync_information.h │ │ ├── lower_thread_predicate.cpp │ │ ├── lower_thread_predicate.h │ │ ├── lower_trivial_broadcast.cpp │ │ ├── lower_trivial_broadcast.h │ │ ├── lower_trivial_reductions.cpp │ │ ├── lower_trivial_reductions.h │ │ ├── lower_unroll.cpp │ │ ├── lower_unroll.h │ │ ├── lower_utils.cpp │ │ ├── lower_utils.h │ │ ├── lower_validation.cpp │ │ ├── lower_validation.h │ │ ├── lower_warp_reduce.cpp │ │ ├── lower_warp_reduce.h │ │ ├── manager.cpp │ │ ├── manager.h │ │ ├── maxinfo_propagator.cpp │ │ ├── maxinfo_propagator.h │ │ ├── mma_type.cpp │ │ ├── mma_type.h │ │ ├── mutator.cpp │ │ ├── mutator.h │ │ ├── non_divisible_split.cpp │ │ ├── non_divisible_split.h │ │ ├── ops │ │ │ ├── alias.cpp │ │ │ ├── alias.h │ │ │ ├── all_ops.h │ │ │ ├── composite.cpp │ │ │ ├── composite.h │ │ │ ├── normalization.cpp │ │ │ └── normalization.h │ │ ├── parallel_dimension_map.cpp │ │ ├── parallel_dimension_map.h │ │ ├── parallel_type_bitmap.cpp │ │ ├── parallel_type_bitmap.h │ │ ├── parser.cpp │ │ ├── parser.h │ │ ├── partial_split_map.cpp │ │ ├── partial_split_map.h │ │ ├── partition.cpp │ │ ├── partition.h │ │ ├── predicate_compute.cpp │ │ ├── predicate_compute.h │ │ ├── python_frontend │ │ │ ├── README.md │ │ │ ├── fusion_cache.cpp │ │ │ ├── fusion_cache.h │ │ │ ├── fusion_definition.cpp │ │ │ ├── fusion_definition.h │ │ │ ├── fusion_interface.cpp │ │ │ ├── fusion_interface.h │ │ │ ├── fusion_record.h │ │ │ ├── python_bindings.cpp │ │ │ ├── python_bindings.h │ │ │ ├── python_bindings_extension.cpp │ │ │ └── test │ │ │ │ ├── test_nvfuser_fusion_cache.cpp │ │ │ │ ├── test_nvfuser_fusion_definition.cpp │ │ │ │ └── test_nvfuser_fusion_record.cpp │ │ ├── register_interface.cpp │ │ ├── register_interface.h │ │ ├── root_domain_map.cpp │ │ ├── root_domain_map.h │ │ ├── scheduler │ │ │ ├── all_schedulers.h │ │ │ ├── compile_time_info.h │ │ │ ├── debug_utils.h │ │ │ ├── heuristic.h │ │ │ ├── matmul.cpp │ │ │ ├── matmul.h │ │ │ ├── mma_utils.cpp │ │ │ ├── mma_utils.h │ │ │ ├── normalization.cpp │ │ │ ├── normalization.h │ │ │ ├── pointwise.cpp │ │ │ ├── pointwise.h │ │ │ ├── pointwise_heuristic.h │ │ │ ├── pointwise_utils.cpp │ │ │ ├── pointwise_utils.h │ │ │ ├── reduction.cpp │ │ │ ├── reduction.h │ │ │ ├── reduction_heuristic.h │ │ │ 
├── reduction_utils.cpp │ │ │ ├── reduction_utils.h │ │ │ ├── registry.cpp │ │ │ ├── registry.h │ │ │ ├── transpose.cpp │ │ │ ├── transpose.h │ │ │ ├── transpose_heuristic.h │ │ │ ├── utils.cpp │ │ │ ├── utils.h │ │ │ ├── vectorize_helper.cpp │ │ │ └── vectorize_helper.h │ │ ├── tensor_view.cpp │ │ ├── transform_iter.cpp │ │ ├── transform_iter.h │ │ ├── transform_replay.cpp │ │ ├── transform_replay.h │ │ ├── transform_rfactor.cpp │ │ ├── transform_rfactor.h │ │ ├── transform_view.cpp │ │ ├── transform_view.h │ │ ├── type.cpp │ │ ├── type.h │ │ ├── type_inference.cpp │ │ ├── type_inference.h │ │ ├── type_promotion.cpp │ │ ├── type_promotion.h │ │ ├── utils.cpp │ │ ├── utils.h │ │ └── vectorization_info.h │ ├── examples │ │ ├── sinh_extension │ │ │ ├── README.md │ │ │ ├── main.cpp │ │ │ ├── setup.py │ │ │ └── test.py │ │ └── sinh_libtorch │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ └── main.cpp │ ├── python │ │ └── __init__.py │ ├── python_tests │ │ ├── __init__.py │ │ ├── test_dynamo.py │ │ ├── test_python_frontend.py │ │ └── test_torchscript.py │ ├── runtime │ │ ├── array.cu │ │ ├── array_rocm.cu │ │ ├── bf16_support.cu │ │ ├── bf16_support_rocm.cu │ │ ├── block_reduction.cu │ │ ├── block_sync_atomic.cu │ │ ├── block_sync_default.cu │ │ ├── block_sync_default_rocm.cu │ │ ├── broadcast.cu │ │ ├── fp16_support.cu │ │ ├── fused_reduction.cu │ │ ├── fused_welford_helper.cu │ │ ├── fused_welford_impl.cu │ │ ├── grid_broadcast.cu │ │ ├── grid_reduction.cu │ │ ├── grid_sync.cu │ │ ├── helpers.cu │ │ ├── index_utils.cu │ │ ├── memory.cu │ │ ├── random_numbers.cu │ │ ├── swizzle.cu │ │ ├── tensor.cu │ │ ├── tensorcore.cu │ │ ├── tuple.cu │ │ ├── type_traits.cu │ │ ├── warp.cu │ │ ├── warp_rocm.cu │ │ └── welford.cu │ ├── test │ │ ├── test_gpu1.cpp │ │ ├── test_gpu2.cpp │ │ ├── test_gpu3.cpp │ │ ├── test_gpu_fused_reduction.cpp │ │ ├── test_gpu_rng.cu │ │ ├── test_gpu_shift.cpp │ │ ├── test_gpu_tensor_factories.cpp │ │ ├── test_gpu_tensorcore.cpp │ │ ├── test_gpu_transpose.cpp │ │ ├── test_gpu_utils.cpp │ │ ├── test_gpu_validator.h │ │ ├── test_gpu_view.cpp │ │ └── test_utils.h │ └── tools │ │ └── stringify_file.py ├── onnx.BUILD ├── sleef.BUILD ├── sleef.bzl ├── substitution.bzl ├── tbb.BUILD ├── tbb.patch ├── tensorflow_cuda_bazel_build │ └── cuda │ │ └── build_defs.bzl ├── tensorpipe.BUILD ├── valgrind-headers │ ├── README.md │ ├── callgrind.h │ └── valgrind.h ├── xnnpack.buck.bzl ├── xnnpack_src_defs.bzl └── xnnpack_wrapper_defs.bzl └── torchgen ├── BUCK.oss ├── BUILD.bazel ├── __init__.py ├── api ├── __init__.py ├── autograd.py ├── cpp.py ├── dispatcher.py ├── functionalization.py ├── lazy.py ├── meta.py ├── native.py ├── python.py ├── structured.py ├── translate.py ├── types │ ├── __init__.py │ ├── signatures.py │ ├── types.py │ └── types_base.py ├── ufunc.py └── unboxing.py ├── build.bzl ├── code_template.py ├── context.py ├── decompositions └── gen_jit_decompositions.py ├── dest ├── __init__.py ├── lazy_ir.py ├── lazy_ts_lowering.py ├── native_functions.py ├── register_dispatch_key.py └── ufunc.py ├── executorch ├── __init__.py └── api │ ├── __init__.py │ ├── custom_ops.py │ ├── et_cpp.py │ ├── types │ ├── __init__.py │ ├── signatures.py │ └── types.py │ └── unboxing.py ├── gen.py ├── gen_backend_stubs.py ├── gen_executorch.py ├── gen_functionalization_type.py ├── gen_lazy_tensor.py ├── gen_vmap_plumbing.py ├── local.py ├── model.py ├── native_function_generation.py ├── operator_versions ├── __init__.py ├── gen_mobile_upgraders.py └── gen_mobile_upgraders_constant.py ├── 
selective_build ├── __init__.py ├── operator.py └── selector.py ├── shape_functions └── gen_jit_shape_functions.py ├── static_runtime ├── __init__.py ├── config.py ├── gen_static_runtime_ops.py └── generator.py └── utils.py /.bazelversion: -------------------------------------------------------------------------------- 1 | 6.1.1 2 |
-------------------------------------------------------------------------------- /.buckconfig.oss: -------------------------------------------------------------------------------- 1 | [pt] 2 | is_oss=1 3 | 4 | [buildfile] 5 | name = BUCK.oss 6 | includes = //tools/build_defs/select.bzl 7 | 8 | [repositories] 9 | bazel_skylib = third_party/bazel-skylib/ 10 | ovr_config = . 11 | 12 | [download] 13 | in_build = true 14 | 15 | [cxx] 16 | cxxflags = -std=c++17 17 | ldflags = -Wl,--no-undefined 18 | should_remap_host_platform = true 19 | cpp = /usr/bin/clang 20 | cc = /usr/bin/clang 21 | cxx = /usr/bin/clang++ 22 | cxxpp = /usr/bin/clang++ 23 | ld = /usr/bin/clang++ 24 | 25 | [project] 26 | default_flavors_mode=all 27 |
-------------------------------------------------------------------------------- /.ci/caffe2/README.md: -------------------------------------------------------------------------------- 1 | # Jenkins 2 | 3 | The scripts in this directory are the entrypoint for testing Caffe2. 4 | 5 | The environment variable `BUILD_ENVIRONMENT` is expected to be set to 6 | the build environment you intend to test. It is a hint for the build 7 | and test scripts to configure Caffe2 a certain way and include/exclude 8 | tests. For Docker images, the build environment equals the name of the 9 | image itself. For example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker 10 | images that are built on Jenkins and are used in triggered builds 11 | already have this environment variable set in their manifest. Also see 12 | `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. 13 | 14 | Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. 15 |
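16 | A minimal sketch of a local invocation (run from the repository root; the image name is just the example above, since CI images already have `BUILD_ENVIRONMENT` set in their manifest): 17 | 18 | ```bash 19 | # Hypothetical local run; in CI the image manifest provides BUILD_ENVIRONMENT. 20 | export BUILD_ENVIRONMENT=py2-cuda9.0-cudnn7-ubuntu16.04 21 | .ci/caffe2/test.sh 22 | ```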
-------------------------------------------------------------------------------- /.ci/caffe2/common.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 4 | ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) 5 | TEST_DIR="$ROOT_DIR/test" 6 | gtest_reports_dir="${TEST_DIR}/test-reports/cpp" 7 | pytest_reports_dir="${TEST_DIR}/test-reports/python" 8 | 9 | # Figure out which Python to use 10 | PYTHON="$(which python)" 11 | if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then 12 | PYTHON=$(which "python${BASH_REMATCH[1]}") 13 | fi 14 | 15 | if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then 16 | # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors 17 | unset HIP_PLATFORM 18 | if which sccache > /dev/null; then 19 | # Save sccache logs to file 20 | sccache --stop-server || true 21 | rm -f ~/sccache_error.log || true 22 | SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server 23 | 24 | # Report sccache stats for easier debugging 25 | sccache --zero-stats 26 | fi 27 | fi 28 | 29 | # /usr/local/caffe2 is where the cpp bits are installed to in cmake-only 30 | # builds. In +python builds the cpp tests are copied to /usr/local/caffe2 so 31 | # that the test code in .ci/test.sh is the same 32 | INSTALL_PREFIX="/usr/local/caffe2" 33 | 34 | mkdir -p "$gtest_reports_dir" || true 35 | mkdir -p "$pytest_reports_dir" || true 36 | mkdir -p "$INSTALL_PREFIX" || true 37 |
-------------------------------------------------------------------------------- /.ci/docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker images for Jenkins 2 | 3 | This directory contains everything needed to build the Docker images 4 | that are used in our CI. 5 | 6 | The Dockerfiles located in subdirectories are parameterized to 7 | conditionally run build stages depending on build arguments passed to 8 | `docker build`. This lets us use only a few Dockerfiles for many 9 | images. The different configurations are identified by a freeform 10 | string that we call a _build environment_. This string is persisted in 11 | each image as the `BUILD_ENVIRONMENT` environment variable. 12 | 13 | See `build.sh` for valid build environments (it's the giant switch). 14 | 15 | Docker builds are now defined with `.circleci/cimodel/data/simple/docker_definitions.py`. 16 | 17 | ## Contents 18 | 19 | * `build.sh` -- dispatch script to launch all builds 20 | * `common` -- scripts used to execute individual Docker build stages 21 | * `ubuntu-cuda` -- Dockerfile for Ubuntu image with CUDA support for nvidia-docker 22 | 23 | ## Usage 24 | 25 | ```bash 26 | # Build a specific image 27 | ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest 28 | 29 | # Set flags (see build.sh) and build image 30 | sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest' 31 | ``` 32 |
-------------------------------------------------------------------------------- /.ci/docker/android/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 2 |
-------------------------------------------------------------------------------- /.ci/docker/ci_commit_pins/triton-rocm.txt: -------------------------------------------------------------------------------- 1 | de3f5436247e391b062a7dd7fd42d2a55c2cd524 2 |
-------------------------------------------------------------------------------- /.ci/docker/ci_commit_pins/triton.txt: -------------------------------------------------------------------------------- 1 | 46672772b46b103db7341c9e10fbad7f643557d4 2 |
-------------------------------------------------------------------------------- /.ci/docker/common/common_utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Work around bug where devtoolset replaces sudo and breaks it. 4 | if [ -n "$DEVTOOLSET_VERSION" ]; then 5 | export SUDO=/bin/sudo 6 | else 7 | export SUDO=sudo 8 | fi 9 | 10 | as_jenkins() { 11 | # NB: unsetting the environment variables works around a conda bug 12 | # https://github.com/conda/conda/issues/6576 13 | # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation 14 | # NB: This must be run from a directory that jenkins has access to, 15 | # works around https://github.com/conda/conda-package-handling/pull/34 16 | $SUDO -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $* 17 | } 18 | 19 | conda_install() { 20 | # Ensure that the install command doesn't upgrade/downgrade Python 21 | # This should be called as 22 | # conda_install pkg1 pkg2 ...
[-c channel] 23 | as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $* 24 | } 25 | 26 | conda_run() { 27 | as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $* 28 | } 29 | 30 | pip_install() { 31 | as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $* 32 | } 33 | 34 | get_pinned_commit() { 35 | cat "${1}".txt 36 | } 37 | -------------------------------------------------------------------------------- /.ci/docker/common/install_cmake.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$CMAKE_VERSION" ] 6 | 7 | # Remove system cmake install so it won't get used instead 8 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 9 | case "$ID" in 10 | ubuntu) 11 | apt-get remove cmake -y 12 | ;; 13 | centos) 14 | yum remove cmake -y 15 | ;; 16 | *) 17 | echo "Unable to determine OS..." 18 | exit 1 19 | ;; 20 | esac 21 | 22 | # Turn 3.6.3 into v3.6 23 | path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/') 24 | file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz" 25 | 26 | # Download and install specific CMake version in /usr/local 27 | pushd /tmp 28 | curl -Os --retry 3 "https://cmake.org/files/${path}/${file}" 29 | tar -C /usr/local --strip-components 1 --no-same-owner -zxf cmake-*.tar.gz 30 | rm -f cmake-*.tar.gz 31 | popd 32 | -------------------------------------------------------------------------------- /.ci/docker/common/install_cudnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ ${CUDNN_VERSION} == 8 ]]; then 4 | # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement 5 | mkdir tmp_cudnn && cd tmp_cudnn 6 | CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive" 7 | if [[ ${CUDA_VERSION:0:4} == "11.7" ]]; then 8 | CUDNN_NAME="cudnn-linux-x86_64-8.5.0.96_cuda11-archive" 9 | curl --retry 3 -OLs https://ossci-linux.s3.amazonaws.com/${CUDNN_NAME}.tar.xz 10 | elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then 11 | CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive" 12 | curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz 13 | else 14 | curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz 15 | fi 16 | 17 | tar xf ${CUDNN_NAME}.tar.xz 18 | cp -a ${CUDNN_NAME}/include/* /usr/include/ 19 | cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/ 20 | cp -a ${CUDNN_NAME}/include/* /usr/include/x86_64-linux-gnu/ 21 | 22 | cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/ 23 | cp -a ${CUDNN_NAME}/lib/* /usr/lib/x86_64-linux-gnu/ 24 | cd .. 25 | rm -rf tmp_cudnn 26 | ldconfig 27 | fi 28 | -------------------------------------------------------------------------------- /.ci/docker/common/install_db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | install_ubuntu() { 6 | apt-get update 7 | apt-get install -y --no-install-recommends \ 8 | libhiredis-dev \ 9 | libleveldb-dev \ 10 | liblmdb-dev \ 11 | libsnappy-dev 12 | 13 | # Cleanup 14 | apt-get autoclean && apt-get clean 15 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 16 | } 17 | 18 | install_centos() { 19 | # Need EPEL for many packages we depend on. 
20 | # See http://fedoraproject.org/wiki/EPEL 21 | yum --enablerepo=extras install -y epel-release 22 | 23 | yum install -y \ 24 | hiredis-devel \ 25 | leveldb-devel \ 26 | lmdb-devel \ 27 | snappy-devel 28 | 29 | # Cleanup 30 | yum clean all 31 | rm -rf /var/cache/yum 32 | rm -rf /var/lib/yum/yumdb 33 | rm -rf /var/lib/yum/history 34 | } 35 | 36 | # Install base packages depending on the base OS 37 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 38 | case "$ID" in 39 | ubuntu) 40 | install_ubuntu 41 | ;; 42 | centos) 43 | install_centos 44 | ;; 45 | *) 46 | echo "Unable to determine OS..." 47 | exit 1 48 | ;; 49 | esac 50 | -------------------------------------------------------------------------------- /.ci/docker/common/install_devtoolset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$DEVTOOLSET_VERSION" ] 6 | 7 | yum install -y centos-release-scl 8 | yum install -y devtoolset-$DEVTOOLSET_VERSION 9 | 10 | echo "source scl_source enable devtoolset-$DEVTOOLSET_VERSION" > "/etc/profile.d/devtoolset-$DEVTOOLSET_VERSION.sh" 11 | -------------------------------------------------------------------------------- /.ci/docker/common/install_docs_reqs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | if [ -n "$KATEX" ]; then 6 | apt-get update 7 | # Ignore error if gpg-agent doesn't exist (for Ubuntu 16.04) 8 | apt-get install -y gpg-agent || : 9 | 10 | curl --retry 3 -sL https://deb.nodesource.com/setup_12.x | sudo -E bash - 11 | sudo apt-get install -y nodejs 12 | 13 | curl --retry 3 -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add - 14 | echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list 15 | 16 | apt-get update 17 | apt-get install -y --no-install-recommends yarn 18 | yarn global add katex --prefix /usr/local 19 | 20 | sudo apt-get -y install doxygen 21 | 22 | apt-get autoclean && apt-get clean 23 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 24 | 25 | fi 26 | -------------------------------------------------------------------------------- /.ci/docker/common/install_gcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | if [ -n "$GCC_VERSION" ]; then 6 | 7 | # Need the official toolchain repo to get alternate packages 8 | add-apt-repository ppa:ubuntu-toolchain-r/test 9 | apt-get update 10 | if [[ "$UBUNTU_VERSION" == "16.04" && "${GCC_VERSION:0:1}" == "5" ]]; then 11 | apt-get install -y g++-5=5.4.0-6ubuntu1~16.04.12 12 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 13 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50 14 | update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-5 50 15 | else 16 | apt-get install -y g++-$GCC_VERSION 17 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50 18 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50 19 | update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50 20 | fi 21 | 22 | 23 | # Cleanup package manager 24 | apt-get autoclean && apt-get clean 25 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 26 | 27 | fi 28 | -------------------------------------------------------------------------------- /.ci/docker/common/install_glibc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 
2 | 3 | set -ex 4 | 5 | [ -n "$GLIBC_VERSION" ] 6 | if [[ -n "$CENTOS_VERSION" ]]; then 7 | [ -n "$DEVTOOLSET_VERSION" ] 8 | fi 9 | 10 | yum install -y wget sed 11 | 12 | mkdir -p /packages && cd /packages 13 | wget -q http://ftp.gnu.org/gnu/glibc/glibc-$GLIBC_VERSION.tar.gz 14 | tar xzf glibc-$GLIBC_VERSION.tar.gz 15 | if [[ "$GLIBC_VERSION" == "2.26" ]]; then 16 | cd glibc-$GLIBC_VERSION 17 | sed -i 's/$name ne "nss_test1"/$name ne "nss_test1" \&\& $name ne "nss_test2"/' scripts/test-installation.pl 18 | cd .. 19 | fi 20 | mkdir -p glibc-$GLIBC_VERSION-build && cd glibc-$GLIBC_VERSION-build 21 | 22 | if [[ -n "$CENTOS_VERSION" ]]; then 23 | export PATH=/opt/rh/devtoolset-$DEVTOOLSET_VERSION/root/usr/bin:$PATH 24 | fi 25 | 26 | ../glibc-$GLIBC_VERSION/configure --prefix=/usr CFLAGS='-Wno-stringop-truncation -Wno-format-overflow -Wno-restrict -Wno-format-truncation -g -O2' 27 | make -j$(nproc) 28 | make install 29 | 30 | # Cleanup 31 | rm -rf /packages 32 | rm -rf /var/cache/yum/* 33 | rm -rf /var/lib/rpm/__db.* 34 | yum clean all 35 | -------------------------------------------------------------------------------- /.ci/docker/common/install_jni.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | mkdir -p /usr/local/include 6 | cp jni.h /usr/local/include 7 | -------------------------------------------------------------------------------- /.ci/docker/common/install_lcov.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | git clone --branch v1.15 https://github.com/linux-test-project/lcov.git 6 | pushd lcov 7 | sudo make install # will be installed in /usr/local/bin/lcov 8 | popd 9 | -------------------------------------------------------------------------------- /.ci/docker/common/install_linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" 6 | 7 | if [ -n "${UBUNTU_VERSION}" ]; then 8 | apt update 9 | apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5 10 | fi 11 | 12 | # Do shallow clone of PyTorch so that we can init lintrunner in Docker build context 13 | git clone https://github.com/pytorch/pytorch.git --depth 1 14 | chown -R jenkins pytorch 15 | 16 | pushd pytorch 17 | # Install all linter dependencies 18 | pip_install -r requirements.txt 19 | conda_run lintrunner init 20 | 21 | # Cache .lintbin directory as part of the Docker image 22 | cp -r .lintbin /tmp 23 | popd 24 | 25 | # Node dependencies required by toc linter job 26 | npm install -g markdown-toc 27 | 28 | # Cleaning up 29 | rm -rf pytorch 30 | -------------------------------------------------------------------------------- /.ci/docker/common/install_ninja.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$NINJA_VERSION" ] 6 | 7 | url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip" 8 | 9 | pushd /tmp 10 | wget --no-verbose --output-document=ninja-linux.zip "$url" 11 | unzip ninja-linux.zip -d /usr/local/bin 12 | rm -f ninja-linux.zip 13 | popd 14 | -------------------------------------------------------------------------------- /.ci/docker/common/install_openmpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get update 4 | # also install ssh 
to avoid error of: 5 | # -------------------------------------------------------------------------- 6 | # The value of the MCA parameter "plm_rsh_agent" was set to a path 7 | # that could not be found: 8 | # plm_rsh_agent: ssh : rsh 9 | sudo apt-get install -y ssh 10 | sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev 11 | -------------------------------------------------------------------------------- /.ci/docker/common/install_openssl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | OPENSSL=openssl-1.1.1k 6 | 7 | wget -q -O "${OPENSSL}.tar.gz" "https://ossci-linux.s3.amazonaws.com/${OPENSSL}.tar.gz" 8 | tar xf "${OPENSSL}.tar.gz" 9 | cd "${OPENSSL}" 10 | ./config --prefix=/opt/openssl -d '-Wl,--enable-new-dtags,-rpath,$(LIBRPATH)' 11 | # NOTE: openssl install errors out when built with the -j option 12 | make -j6; make install_sw 13 | # Link the ssl libraries to the /usr/lib folder. 14 | sudo ln -s /opt/openssl/lib/lib* /usr/lib 15 | cd .. 16 | rm -rf "${OPENSSL}" 17 | -------------------------------------------------------------------------------- /.ci/docker/common/install_swiftshader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "${SWIFTSHADER}" ] 6 | 7 | retry () { 8 | $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) 9 | } 10 | 11 | _https_amazon_aws=https://ossci-android.s3.amazonaws.com 12 | 13 | # SwiftShader 14 | _swiftshader_dir=/var/lib/jenkins/swiftshader 15 | _swiftshader_file_targz=swiftshader-abe07b943-prebuilt.tar.gz 16 | mkdir -p $_swiftshader_dir 17 | _tmp_swiftshader_targz="/tmp/${_swiftshader_file_targz}" 18 | 19 | curl --silent --show-error --location --fail --retry 3 \ 20 | --output "${_tmp_swiftshader_targz}" "$_https_amazon_aws/${_swiftshader_file_targz}" 21 | 22 | tar -C "${_swiftshader_dir}" -xzf "${_tmp_swiftshader_targz}" 23 | 24 | export VK_ICD_FILENAMES="${_swiftshader_dir}/build/Linux/vk_swiftshader_icd.json" 25 | -------------------------------------------------------------------------------- /.ci/docker/common/install_thrift.sh: -------------------------------------------------------------------------------- 1 | apt-get update 2 | apt-get install -y sudo wget libboost-dev libboost-test-dev libboost-program-options-dev libboost-filesystem-dev libboost-thread-dev libevent-dev automake libtool flex bison pkg-config g++ libssl-dev 3 | wget https://www-us.apache.org/dist/thrift/0.12.0/thrift-0.12.0.tar.gz 4 | tar -xvf thrift-0.12.0.tar.gz 5 | cd thrift-0.12.0 6 | for file in ./compiler/cpp/Makefile*; do 7 | sed -i 's/\-Werror//' $file 8 | done 9 | ./bootstrap.sh 10 | ./configure --without-php --without-java --without-python --without-nodejs --without-go --without-ruby 11 | sudo make 12 | sudo make install 13 | cd .. 
14 | rm thrift-0.12.0.tar.gz 15 | -------------------------------------------------------------------------------- /.ci/docker/common/install_ucc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | if [[ -d "/usr/local/cuda/" ]]; then 6 | with_cuda=/usr/local/cuda/ 7 | else 8 | with_cuda=no 9 | fi 10 | 11 | function install_ucx() { 12 | set -ex 13 | git clone --recursive https://github.com/openucx/ucx.git 14 | pushd ucx 15 | git checkout ${UCX_COMMIT} 16 | git submodule update --init --recursive 17 | 18 | ./autogen.sh 19 | ./configure --prefix=$UCX_HOME \ 20 | --enable-mt \ 21 | --with-cuda=$with_cuda \ 22 | --enable-profiling \ 23 | --enable-stats 24 | time make -j 25 | sudo make install 26 | 27 | popd 28 | rm -rf ucx 29 | } 30 | 31 | function install_ucc() { 32 | set -ex 33 | git clone --recursive https://github.com/openucx/ucc.git 34 | pushd ucc 35 | git checkout ${UCC_COMMIT} 36 | git submodule update --init --recursive 37 | 38 | ./autogen.sh 39 | ./configure --prefix=$UCC_HOME --with-ucx=$UCX_HOME --with-cuda=$with_cuda 40 | time make -j 41 | sudo make install 42 | 43 | popd 44 | rm -rf ucc 45 | } 46 | 47 | install_ucx 48 | install_ucc 49 | -------------------------------------------------------------------------------- /.ci/docker/common/install_user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # Mirror jenkins user in container 6 | # jenkins user as ec2-user should have the same user-id 7 | echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd 8 | echo "jenkins:x:1000:" >> /etc/group 9 | # Needed on focal or newer 10 | echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow 11 | 12 | # Create $HOME 13 | mkdir -p /var/lib/jenkins 14 | chown jenkins:jenkins /var/lib/jenkins 15 | mkdir -p /var/lib/jenkins/.ccache 16 | chown jenkins:jenkins /var/lib/jenkins/.ccache 17 | 18 | # Allow writing to /usr/local (for make install) 19 | chown jenkins:jenkins /usr/local 20 | 21 | # Allow sudo 22 | # TODO: Maybe we shouldn't 23 | echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins 24 | 25 | # Work around bug where devtoolset replaces sudo and breaks it. 26 | if [ -n "$DEVTOOLSET_VERSION" ]; then 27 | SUDO=/bin/sudo 28 | else 29 | SUDO=sudo 30 | fi 31 | 32 | # Test that sudo works 33 | $SUDO -u jenkins $SUDO -v 34 | -------------------------------------------------------------------------------- /.ci/docker/common/install_vision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | install_ubuntu() { 6 | apt-get update 7 | apt-get install -y --no-install-recommends \ 8 | libopencv-dev \ 9 | libavcodec-dev 10 | 11 | # Cleanup 12 | apt-get autoclean && apt-get clean 13 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 14 | } 15 | 16 | install_centos() { 17 | # Need EPEL for many packages we depend on. 18 | # See http://fedoraproject.org/wiki/EPEL 19 | yum --enablerepo=extras install -y epel-release 20 | 21 | yum install -y \ 22 | opencv-devel \ 23 | ffmpeg-devel 24 | 25 | # Cleanup 26 | yum clean all 27 | rm -rf /var/cache/yum 28 | rm -rf /var/lib/yum/yumdb 29 | rm -rf /var/lib/yum/history 30 | } 31 | 32 | # Install base packages depending on the base OS 33 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 34 | case "$ID" in 35 | ubuntu) 36 | install_ubuntu 37 | ;; 38 | centos) 39 | install_centos 40 | ;; 41 | *) 42 | echo "Unable to determine OS..." 
43 | exit 1 44 | ;; 45 | esac 46 |
-------------------------------------------------------------------------------- /.ci/docker/common/install_vulkan_sdk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "${VULKAN_SDK_VERSION}" ] 6 | 7 | retry () { 8 | $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) 9 | } 10 | 11 | _vulkansdk_dir=/var/lib/jenkins/vulkansdk 12 | _tmp_vulkansdk_targz=/tmp/vulkansdk.tar.gz 13 | 14 | curl \ 15 | --silent \ 16 | --show-error \ 17 | --location \ 18 | --fail \ 19 | --retry 3 \ 20 | --output "${_tmp_vulkansdk_targz}" "https://ossci-android.s3.amazonaws.com/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz" 21 | 22 | mkdir -p "${_vulkansdk_dir}" 23 | tar -C "${_vulkansdk_dir}" -xzf "${_tmp_vulkansdk_targz}" --strip-components 1 24 | rm -rf "${_tmp_vulkansdk_targz}" 25 |
-------------------------------------------------------------------------------- /.ci/docker/linter/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION 2 | 3 | FROM ubuntu:${UBUNTU_VERSION} 4 | 5 | ARG UBUNTU_VERSION 6 | 7 | ENV DEBIAN_FRONTEND noninteractive 8 | 9 | # Install common dependencies (so that this step can be cached separately) 10 | COPY ./common/install_base.sh install_base.sh 11 | RUN bash ./install_base.sh && rm install_base.sh 12 | 13 | # Install user 14 | COPY ./common/install_user.sh install_user.sh 15 | RUN bash ./install_user.sh && rm install_user.sh 16 | 17 | # Install conda and other packages (e.g., numpy, pytest) 18 | ARG ANACONDA_PYTHON_VERSION 19 | ARG CONDA_CMAKE 20 | ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION 21 | ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH 22 | COPY requirements-ci.txt /opt/conda/requirements-ci.txt 23 | COPY ./common/install_conda.sh install_conda.sh 24 | COPY ./common/common_utils.sh common_utils.sh 25 | RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt 26 | 27 | # Note that Docker build forbids copying files from outside the build context 28 | COPY ./common/install_linter.sh install_linter.sh 29 | COPY ./common/common_utils.sh common_utils.sh 30 | RUN bash ./install_linter.sh 31 | RUN rm install_linter.sh common_utils.sh 32 | 33 | USER jenkins 34 | CMD ["bash"] 35 |
-------------------------------------------------------------------------------- /.ci/docker/triton_version.txt: -------------------------------------------------------------------------------- 1 | 2.1.0 2 |
-------------------------------------------------------------------------------- /.ci/docker/ubuntu-rocm/.gitignore: -------------------------------------------------------------------------------- 1 | *.sh 2 |
-------------------------------------------------------------------------------- /.ci/onnx/README.md: -------------------------------------------------------------------------------- 1 | # Jenkins 2 | 3 | The scripts in this directory are the entrypoint for testing the ONNX exporter. 4 | 5 | The environment variable `BUILD_ENVIRONMENT` is expected to be set to 6 | the build environment you intend to test. It is a hint for the build 7 | and test scripts to configure Caffe2 a certain way and include/exclude 8 | tests. For Docker images, the build environment equals the name of the 9 | image itself. For example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker 10 | images that are built on Jenkins and are used in triggered builds 11 | already have this environment variable set in their manifest. Also see 12 | `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. 13 | 14 | Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. 15 |
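16 | A minimal sketch of a local invocation (the image name below is hypothetical; `test.sh` only runs the ONNX suite when `BUILD_ENVIRONMENT` contains `onnx`): 17 | 18 | ```bash 19 | # Hypothetical local run; in CI the image manifest provides BUILD_ENVIRONMENT. 20 | export BUILD_ENVIRONMENT=py3-clang9-onnx-ubuntu20.04 21 | .ci/onnx/test.sh 22 | ```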
The Docker images that are 10 | built on Jenkins and are used in triggered builds already have this 11 | environment variable set in their manifest. Also see 12 | `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. 13 | 14 | Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. 15 | -------------------------------------------------------------------------------- /.ci/onnx/common.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 4 | ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) 5 | TEST_DIR="$ROOT_DIR/test" 6 | pytest_reports_dir="${TEST_DIR}/test-reports/python" 7 | 8 | # Figure out which Python to use 9 | PYTHON="$(which python)" 10 | if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then 11 | PYTHON=$(which "python${BASH_REMATCH[1]}") 12 | fi 13 | 14 | if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then 15 | # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors 16 | unset HIP_PLATFORM 17 | fi 18 | 19 | mkdir -p "$pytest_reports_dir" || true 20 | -------------------------------------------------------------------------------- /.ci/onnx/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # shellcheck source=./common.sh 4 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 5 | 6 | # Used to retry the ONNX test; a failing command is attempted at most twice 7 | retry () { 8 | "$@" || (sleep 60 && "$@") 9 | } 10 | 11 | if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then 12 | pip -q install --user "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx" 13 | # TODO: This can be removed later once vision is also part of the Docker image 14 | pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" 15 | # JIT C++ extensions require ninja, so put it into PATH. 16 | export PATH="/var/lib/jenkins/.local/bin:$PATH" 17 | # NB: The ONNX test is fast (~15m), so it's ok to retry it a few more times to avoid flaky failures. We 18 | # need to bring this into the standard PyTorch run_test eventually. The issue is tracked in 19 | # https://github.com/pytorch/pytorch/issues/98626 20 | retry "$ROOT_DIR/scripts/onnx/test.sh" 21 | fi 22 | -------------------------------------------------------------------------------- /.ci/pytorch/.shellcheckrc: -------------------------------------------------------------------------------- 1 | source-path=SCRIPTDIR 2 | 3 | # we'd like to enable --external-sources here but can't 4 | # https://github.com/koalaman/shellcheck/issues/1818 5 | -------------------------------------------------------------------------------- /.ci/pytorch/build-asan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Required environment variable: $BUILD_ENVIRONMENT 4 | # (This is set by default in the Docker images we build, so you don't 5 | # need to set it yourself.)
6 | 7 | # shellcheck source=./common.sh 8 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 9 | # shellcheck source=./common-build.sh 10 | source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" 11 | 12 | echo "Clang version:" 13 | clang --version 14 | 15 | python tools/stats/export_test_times.py 16 | 17 | if [ -n "$(which conda)" ]; then 18 | export CMAKE_PREFIX_PATH=/opt/conda 19 | fi 20 | 21 | CC="clang" CXX="clang++" LDSHARED="clang --shared" \ 22 | USE_ASAN=1 USE_CUDA=0 USE_MKLDNN=0 \ 23 | UBSAN_FLAGS="-fno-sanitize-recover=all" \ 24 | python setup.py bdist_wheel 25 | pip_install_whl "$(echo dist/*.whl)" 26 | 27 | # Test building via the sdist source tarball 28 | python setup.py sdist 29 | mkdir -p /tmp/tmp 30 | pushd /tmp/tmp 31 | tar zxf "$(dirname "${BASH_SOURCE[0]}")/../../dist/"*.tar.gz 32 | cd torch-* 33 | python setup.py build --cmake-only 34 | popd 35 | 36 | print_sccache_stats 37 | 38 | assert_git_not_dirty 39 | -------------------------------------------------------------------------------- /.ci/pytorch/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Common setup for all Jenkins scripts 4 | # shellcheck source=./common_utils.sh 5 | source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" 6 | set -ex 7 | 8 | # Required environment variables: 9 | # $BUILD_ENVIRONMENT (should be set by your Docker image) 10 | 11 | # Figure out which Python to use for ROCm 12 | if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then 13 | # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors 14 | unset HIP_PLATFORM 15 | export PYTORCH_TEST_WITH_ROCM=1 16 | # temporary, to help locate kernel issues on the CI nodes 17 | export HSAKMT_DEBUG_LEVEL=4 18 | # improve rccl performance for distributed tests 19 | export HSA_FORCE_FINE_GRAIN_PCIE=1 20 | fi 21 | 22 | # TODO: Re-enable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598 23 | # shellcheck disable=SC2034 24 | BUILD_TEST_LIBTORCH=0 25 | -------------------------------------------------------------------------------- /.ci/pytorch/docker-build-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # shellcheck source=./common.sh 4 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 5 | 6 | docker build -t pytorch .
7 | -------------------------------------------------------------------------------- /.ci/pytorch/docs-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # shellcheck source=./common.sh 4 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 5 | 6 | echo "Testing pytorch docs" 7 | 8 | cd docs 9 | pip_install -r requirements.txt 10 | make doctest 11 | -------------------------------------------------------------------------------- /.ci/pytorch/fake_numpy/numpy.py: -------------------------------------------------------------------------------- 1 | raise ModuleNotFoundError("Sorry PyTorch, but our NumPy is in the other folder") 2 | -------------------------------------------------------------------------------- /.ci/pytorch/macos-build-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "${BUILD_ENVIRONMENT}" ] || [[ "${BUILD_ENVIRONMENT}" == *-build* ]]; then 4 | # shellcheck source=./macos-build.sh 5 | source "$(dirname "${BASH_SOURCE[0]}")/macos-build.sh" 6 | fi 7 | 8 | if [ -z "${BUILD_ENVIRONMENT}" ] || [[ "${BUILD_ENVIRONMENT}" == *-test* ]]; then 9 | # shellcheck source=./macos-test.sh 10 | source "$(dirname "${BASH_SOURCE[0]}")/macos-test.sh" 11 | fi 12 | -------------------------------------------------------------------------------- /.ci/pytorch/macos-common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Common prelude for macos-build.sh and macos-test.sh 4 | 5 | # shellcheck source=./common.sh 6 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 7 | 8 | sysctl -a | grep machdep.cpu 9 | 10 | # These are required for both the build job and the test job. 11 | # In the latter, they are needed to test cpp extensions. 12 | export MACOSX_DEPLOYMENT_TARGET=10.9 13 | export CXX=clang++ 14 | export CC=clang 15 | 16 | print_cmake_info() { 17 | CMAKE_EXEC=$(which cmake) 18 | echo "$CMAKE_EXEC" 19 | 20 | CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC") 21 | # Print all libraries under cmake rpath for debugging 22 | ls -la "$CONDA_INSTALLATION_DIR/../lib" 23 | 24 | export CMAKE_EXEC 25 | # Explicitly add conda env lib folder to cmake rpath to address the flaky issue 26 | # where cmake dependencies couldn't be found. This seems to point to how conda 27 | # links $CMAKE_EXEC to its package cache when cloning a new environment 28 | install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true 29 | # Adding the rpath will invalidate the cmake signature, so sign it again here 30 | # to trust the executable; otherwise it fails with EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid)) 31 | # and an exit code of 137 32 | codesign -f -s - "${CMAKE_EXEC}" || true 33 | } 34 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | run_test () { 5 | rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/ 6 | "$@" 7 | cd ..
&& rm -rf test_tmp/ 8 | } 9 | 10 | get_runtime_of_command () { 11 | TIMEFORMAT=%R 12 | 13 | # runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null) 14 | runtime=$( { time "$@"; } 2>&1 1>/dev/null) 15 | if [[ $runtime == *"Error"* ]]; then 16 | exit 1 17 | fi 18 | runtime=${runtime#+++ $@} 19 | runtime=$(python -c "print($runtime)") 20 | 21 | echo "$runtime" 22 | } 23 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/get_stats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import numpy 4 | 5 | sample_data_list = sys.argv[1:] 6 | sample_data_list = [float(v.strip()) for v in sample_data_list] 7 | 8 | sample_mean = numpy.mean(sample_data_list) 9 | sample_sigma = numpy.std(sample_data_list) 10 | 11 | data = { 12 | 'mean': sample_mean, 13 | 'sigma': sample_sigma, 14 | } 15 | 16 | print(json.dumps(data)) 17 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_cpu_speed_mini_sequence_labeler () { 7 | echo "Testing: mini sequence labeler, CPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/benchmark.git 13 | 14 | cd benchmark/ 15 | 16 | git checkout 726567a455edbfda6199445922a8cfee82535664 17 | 18 | cd scripts/mini_sequence_labeler 19 | 20 | SAMPLE_ARRAY=() 21 | NUM_RUNS=$1 22 | 23 | for (( i=1; i<=NUM_RUNS; i++ )) do 24 | runtime=$(get_runtime_of_command python main.py) 25 | SAMPLE_ARRAY+=("${runtime}") 26 | done 27 | 28 | cd ../../.. 29 | 30 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 31 | echo "Runtime stats in seconds:" 32 | echo "$stats" 33 | 34 | if [ "$2" == "compare_with_baseline" ]; then 35 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 36 | elif [ "$2" == "compare_and_update" ]; then 37 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 38 | fi 39 | } 40 | 41 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 42 | run_test test_cpu_speed_mini_sequence_labeler "$@" 43 | fi 44 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_cpu_speed_mnist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_cpu_speed_mnist () { 7 | echo "Testing: MNIST, CPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/examples.git -b perftests 13 | 14 | cd examples/mnist 15 | 16 | conda install -c pytorch torchvision-cpu 17 | 18 | # Download data 19 | python main.py --epochs 0 20 | 21 | SAMPLE_ARRAY=() 22 | NUM_RUNS=$1 23 | 24 | for (( i=1; i<=NUM_RUNS; i++ )) do 25 | runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log) 26 | echo "$runtime" 27 | SAMPLE_ARRAY+=("${runtime}") 28 | done 29 | 30 | cd ../.. 
31 | 32 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 33 | echo "Runtime stats in seconds:" 34 | echo "$stats" 35 | 36 | if [ "$2" == "compare_with_baseline" ]; then 37 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 38 | elif [ "$2" == "compare_and_update" ]; then 39 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 40 | fi 41 | } 42 | 43 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 44 | run_test test_cpu_speed_mnist "$@" 45 | fi 46 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_cpu_speed_torch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./common.sh 4 | 5 | test_cpu_speed_torch () { 6 | echo "Testing: torch.*, CPU" 7 | 8 | export OMP_NUM_THREADS=4 9 | export MKL_NUM_THREADS=4 10 | 11 | git clone https://github.com/yf225/perf-tests.git 12 | 13 | if [ "$1" == "compare_with_baseline" ]; then 14 | export ARGS=(--compare ../cpu_runtime.json) 15 | elif [ "$1" == "compare_and_update" ]; then 16 | export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json) 17 | elif [ "$1" == "update_only" ]; then 18 | export ARGS=(--update ../new_cpu_runtime.json) 19 | fi 20 | 21 | if ! python perf-tests/modules/test_cpu_torch.py "${ARGS[@]}"; then 22 | echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change." 23 | exit 1 24 | fi 25 | } 26 | 27 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 28 | run_test test_cpu_speed_torch "$@" 29 | fi 30 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./common.sh 4 | 5 | test_cpu_speed_torch_tensor () { 6 | echo "Testing: torch.Tensor.*, CPU" 7 | 8 | export OMP_NUM_THREADS=4 9 | export MKL_NUM_THREADS=4 10 | 11 | git clone https://github.com/yf225/perf-tests.git 12 | 13 | if [ "$1" == "compare_with_baseline" ]; then 14 | export ARGS=(--compare ../cpu_runtime.json) 15 | elif [ "$1" == "compare_and_update" ]; then 16 | export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json) 17 | elif [ "$1" == "update_only" ]; then 18 | export ARGS=(--update ../new_cpu_runtime.json) 19 | fi 20 | 21 | if ! python perf-tests/modules/test_cpu_torch_tensor.py "${ARGS[@]}"; then 22 | echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change." 23 | exit 1 24 | fi 25 | } 26 | 27 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 28 | run_test test_cpu_speed_torch_tensor "$@" 29 | fi 30 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . 
./common.sh 5 | 6 | test_gpu_speed_cudnn_lstm () { 7 | echo "Testing: CuDNN LSTM, GPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/benchmark.git 13 | 14 | cd benchmark/ 15 | 16 | git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 17 | 18 | cd scripts/ 19 | 20 | SAMPLE_ARRAY=() 21 | NUM_RUNS=$1 22 | 23 | for (( i=1; i<=NUM_RUNS; i++ )) do 24 | runtime=$(get_runtime_of_command python cudnn_lstm.py --skip-cpu-governor-check) 25 | echo "$runtime" 26 | SAMPLE_ARRAY+=("${runtime}") 27 | done 28 | 29 | cd ../.. 30 | 31 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 32 | echo "Runtime stats in seconds:" 33 | echo "$stats" 34 | 35 | if [ "$2" == "compare_with_baseline" ]; then 36 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 37 | elif [ "$2" == "compare_and_update" ]; then 38 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 39 | fi 40 | } 41 | 42 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 43 | run_test test_gpu_speed_cudnn_lstm "$@" 44 | fi 45 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_gpu_speed_lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_gpu_speed_lstm () { 7 | echo "Testing: LSTM, GPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/benchmark.git 13 | 14 | cd benchmark/ 15 | 16 | git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 17 | 18 | cd scripts/ 19 | 20 | SAMPLE_ARRAY=() 21 | NUM_RUNS=$1 22 | 23 | for (( i=1; i<=NUM_RUNS; i++ )) do 24 | runtime=$(get_runtime_of_command python lstm.py --skip-cpu-governor-check) 25 | echo "$runtime" 26 | SAMPLE_ARRAY+=("${runtime}") 27 | done 28 | 29 | cd ../.. 30 | 31 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 32 | echo "Runtime stats in seconds:" 33 | echo "$stats" 34 | 35 | if [ "$2" == "compare_with_baseline" ]; then 36 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 37 | elif [ "$2" == "compare_and_update" ]; then 38 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 39 | fi 40 | } 41 | 42 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 43 | run_test test_gpu_speed_lstm "$@" 44 | fi 45 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_gpu_speed_mlstm () { 7 | echo "Testing: MLSTM, GPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/benchmark.git 13 | 14 | cd benchmark/ 15 | 16 | git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 17 | 18 | cd scripts/ 19 | 20 | SAMPLE_ARRAY=() 21 | NUM_RUNS=$1 22 | 23 | for (( i=1; i<=NUM_RUNS; i++ )) do 24 | runtime=$(get_runtime_of_command python mlstm.py --skip-cpu-governor-check) 25 | echo "$runtime" 26 | SAMPLE_ARRAY+=("${runtime}") 27 | done 28 | 29 | cd ../.. 
30 | 31 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 32 | echo "Runtime stats in seconds:" 33 | echo "$stats" 34 | 35 | if [ "$2" == "compare_with_baseline" ]; then 36 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 37 | elif [ "$2" == "compare_and_update" ]; then 38 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 39 | fi 40 | } 41 | 42 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 43 | run_test test_gpu_speed_mlstm "$@" 44 | fi 45 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_gpu_speed_mnist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_gpu_speed_mnist () { 7 | echo "Testing: MNIST, GPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/examples.git -b perftests 13 | 14 | cd examples/mnist 15 | 16 | conda install -c pytorch torchvision 17 | 18 | # Download data 19 | python main.py --epochs 0 20 | 21 | SAMPLE_ARRAY=() 22 | NUM_RUNS=$1 23 | 24 | # Needs a warm-up run to get accurate numbers 25 | python main.py --epochs 1 --no-log 26 | 27 | for (( i=1; i<=NUM_RUNS; i++ )) do 28 | runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log) 29 | echo "$runtime" 30 | SAMPLE_ARRAY+=("${runtime}") 31 | done 32 | 33 | cd ../.. 34 | 35 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 36 | echo "Runtime stats in seconds:" 37 | echo "$stats" 38 | 39 | if [ "$2" == "compare_with_baseline" ]; then 40 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 41 | elif [ "$2" == "compare_and_update" ]; then 42 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 43 | fi 44 | } 45 | 46 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 47 | run_test test_gpu_speed_mnist "$@" 48 | fi 49 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/update_commit_hash.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | data_file_path = sys.argv[1] 5 | commit_hash = sys.argv[2] 6 | 7 | with open(data_file_path) as data_file: 8 | data = json.load(data_file) 9 | 10 | data['commit'] = commit_hash 11 | 12 | with open(data_file_path, 'w') as data_file: 13 | json.dump(data, data_file) 14 | -------------------------------------------------------------------------------- /.ci/pytorch/print_sccache_log.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | log_file_path = sys.argv[1] 4 | 5 | with open(log_file_path) as f: 6 | lines = f.readlines() 7 | 8 | for line in lines: 9 | # Ignore errors from CPU instruction-set checks, symbol-existence tests, 10 | # or compilation-error formatting 11 | ignored_keywords = [ 12 | 'src.c', 13 | 'CheckSymbolExists.c', 14 | 'test_compilation_error_formatting', 15 | ] 16 | if all([keyword not in line for keyword in ignored_keywords]): 17 | print(line) 18 | -------------------------------------------------------------------------------- /.ci/pytorch/run_glootls_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py" 4 | TMP_CERT_DIR=$(python "$CREATE_TEST_CERT") 5 | 6 | openssl verify
-CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem" 7 | 8 | export GLOO_DEVICE_TRANSPORT=TCP_TLS 9 | export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key 10 | export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem 11 | export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem 12 | 13 | time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest 14 | 15 | unset GLOO_DEVICE_TRANSPORT 16 | unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY 17 | unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT 18 | unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE 19 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/choose_runtime_cuda_version.bat: -------------------------------------------------------------------------------- 1 | REM The first argument should be the CUDA version 2 | echo %PATH% 3 | echo %CUDA_PATH% 4 | set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%1\bin;%PATH% 5 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat: -------------------------------------------------------------------------------- 1 | if "%BUILD_ENVIRONMENT%"=="" ( 2 | set CONDA_PARENT_DIR=%CD% 3 | ) else ( 4 | set CONDA_PARENT_DIR=C:\Jenkins 5 | ) 6 | 7 | 8 | :: Be conservative here when rolling out the new AMI with conda. This will try 9 | :: to install conda as before if it couldn't find the conda installation. This 10 | :: can be removed eventually after we gain enough confidence in the AMI 11 | if not exist %CONDA_PARENT_DIR%\Miniconda3 ( 12 | set INSTALL_FRESH_CONDA=1 13 | ) 14 | 15 | if "%INSTALL_FRESH_CONDA%"=="1" ( 16 | curl --retry 3 --retry-all-errors -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe 17 | if errorlevel 1 exit /b 18 | if not errorlevel 0 exit /b 19 | 20 | %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 21 | if errorlevel 1 exit /b 22 | if not errorlevel 0 exit /b 23 | ) 24 | 25 | :: Activate conda so that we can use its commands, i.e. conda, python, pip 26 | call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 27 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat: -------------------------------------------------------------------------------- 1 | if "%CUDA_VERSION%" == "cpu" ( 2 | echo skip magma installation for cpu builds 3 | exit /b 0 4 | ) 5 | 6 | rem remove the dot in cuda_version, for example 11.1 to 111 7 | 8 | if not "%USE_CUDA%"=="1" ( 9 | exit /b 0 10 | ) 11 | 12 | if x%CUDA_VERSION:.=%==x%CUDA_VERSION% ( 13 | echo CUDA version %CUDA_VERSION% format is incorrect: it does not contain '.'
14 | exit /b 1 15 | ) 16 | 17 | set VERSION_SUFFIX=%CUDA_VERSION:.=% 18 | set CUDA_SUFFIX=cuda%VERSION_SUFFIX% 19 | 20 | if "%CUDA_SUFFIX%" == "" ( 21 | echo unknown CUDA version, please set `CUDA_VERSION` higher than 10.2 22 | exit /b 1 23 | ) 24 | 25 | if "%REBUILD%"=="" ( 26 | if "%BUILD_ENVIRONMENT%"=="" ( 27 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z 28 | ) else ( 29 | aws s3 cp s3://ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet 30 | ) 31 | if errorlevel 1 exit /b 32 | if not errorlevel 0 exit /b 33 | 7z x -aoa %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma 34 | if errorlevel 1 exit /b 35 | if not errorlevel 0 exit /b 36 | ) 37 | set MAGMA_HOME=%TMP_DIR_WIN%\magma 38 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat: -------------------------------------------------------------------------------- 1 | if "%REBUILD%"=="" ( 2 | if "%BUILD_ENVIRONMENT%"=="" ( 3 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z --output %TMP_DIR_WIN%\mkl.7z 4 | ) else ( 5 | aws s3 cp s3://ossci-windows/mkl_2020.2.254.7z %TMP_DIR_WIN%\mkl.7z --quiet 6 | ) 7 | if errorlevel 1 exit /b 8 | if not errorlevel 0 exit /b 9 | 7z x -aoa %TMP_DIR_WIN%\mkl.7z -o%TMP_DIR_WIN%\mkl 10 | if errorlevel 1 exit /b 11 | if not errorlevel 0 exit /b 12 | ) 13 | set CMAKE_INCLUDE_PATH=%TMP_DIR_WIN%\mkl\include 14 | set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB% 15 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat: -------------------------------------------------------------------------------- 1 | mkdir %TMP_DIR_WIN%\bin 2 | 3 | if "%REBUILD%"=="" ( 4 | :check_sccache 5 | %TMP_DIR_WIN%\bin\sccache.exe --show-stats || ( 6 | taskkill /im sccache.exe /f /t || ver > nul 7 | del %TMP_DIR_WIN%\bin\sccache.exe || ver > nul 8 | del %TMP_DIR_WIN%\bin\sccache-cl.exe || ver > nul 9 | if "%BUILD_ENVIRONMENT%"=="" ( 10 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %TMP_DIR_WIN%\bin\sccache.exe 11 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output %TMP_DIR_WIN%\bin\sccache-cl.exe 12 | ) else ( 13 | aws s3 cp s3://ossci-windows/sccache.exe %TMP_DIR_WIN%\bin\sccache.exe 14 | aws s3 cp s3://ossci-windows/sccache-cl.exe %TMP_DIR_WIN%\bin\sccache-cl.exe 15 | ) 16 | goto :check_sccache 17 | ) 18 | ) 19 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/test_custom_backend.bat: -------------------------------------------------------------------------------- 1 | call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat 2 | 3 | git submodule update --init --recursive third_party/pybind11 4 | cd test\custom_backend 5 | 6 | :: Build the custom backend library. 7 | mkdir build 8 | pushd build 9 | 10 | echo "Executing CMake for custom_backend test..." 11 | 12 | :: Note: Caffe2 does not support MSVC + CUDA + Debug mode (has to be Release mode) 13 | cmake -DCMAKE_PREFIX_PATH=%TMP_DIR_WIN%\build\torch -DCMAKE_BUILD_TYPE=Release -GNinja .. 14 | if ERRORLEVEL 1 exit /b 1 15 | 16 | echo "Executing Ninja for custom_backend test..." 
17 | 18 | ninja -v 19 | if ERRORLEVEL 1 exit /b 1 20 | 21 | echo "Ninja succeeded for custom_backend test." 22 | 23 | popd 24 | 25 | :: Run tests Python-side and export a script module. 26 | python test_custom_backend.py -v 27 | if ERRORLEVEL 1 exit /b 1 28 | 29 | python backend.py --export-module-to="build/model.pt" 30 | if ERRORLEVEL 1 exit /b 1 31 | 32 | :: Run tests C++-side and load the exported script module. 33 | cd build 34 | set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH% 35 | test_custom_backend.exe model.pt 36 | if ERRORLEVEL 1 exit /b 1 37 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/test_distributed.bat: -------------------------------------------------------------------------------- 1 | REM The first argument should be the path to the Python interpreter 2 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_common 3 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 4 | 5 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_gloo 6 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 7 | 8 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_nccl 9 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 10 | 11 | %1\python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo 12 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 13 | 14 | %1\python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl 15 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 16 | 17 | %1\python.exe test/run_test.py --verbose -i distributed/test_data_parallel 18 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 19 | 20 | %1\python.exe test/run_test.py --verbose -i distributed/test_store 21 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 22 | 23 | %1\python.exe test/run_test.py --verbose -i distributed/test_pg_wrapper 24 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 25 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/test_python_jit_legacy.bat: -------------------------------------------------------------------------------- 1 | call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat 2 | 3 | echo Copying over test times file 4 | copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%" 5 | 6 | pushd test 7 | 8 | echo Run jit legacy tests 9 | python run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose 10 | if ERRORLEVEL 1 exit /b 1 11 | 12 | popd 13 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/test_python_shard.bat: -------------------------------------------------------------------------------- 1 | call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat 2 | :: exit the batch once there's an error 3 | if not errorlevel 0 ( 4 | echo "setup pytorch env failed" 5 | echo %errorlevel% 6 | exit /b 7 | ) 8 | 9 | pushd test 10 | 11 | set GFLAGS_EXE="C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\gflags.exe" 12 | if "%SHARD_NUMBER%" == "1" ( 13 | if exist %GFLAGS_EXE% ( 14 | echo Some smoke tests 15 | %GFLAGS_EXE% /i python.exe +sls 16 | python %SCRIPT_HELPERS_DIR%\run_python_nn_smoketests.py 17 | if ERRORLEVEL 1 goto fail 18 | 19 | %GFLAGS_EXE% /i python.exe -sls 20 | if ERRORLEVEL 1 goto fail 21 | ) 22 | ) 23 | 24 | echo Copying over test times file 25 | copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%" 26 | 27 | echo Run python tests 28 | python
run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose 29 | if ERRORLEVEL 1 goto fail 30 | 31 | popd 32 | 33 | :eof 34 | exit /b 0 35 | 36 | :fail 37 | exit /b 1 38 | -------------------------------------------------------------------------------- /.cmakelintrc: -------------------------------------------------------------------------------- 1 | filter=-convention/filename,-linelength,-package/consistency,-readability/logic,-readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | plugins = 3 | coverage_plugins.jit_plugin 4 | omit = 5 | */tmp* 6 | */Temp/* 7 | */usr/local/lib* 8 | *test/* 9 | 10 | [report] 11 | omit = 12 | */tmp* 13 | */Temp/* 14 | */usr/local/lib* 15 | *test/* 16 | -------------------------------------------------------------------------------- /.ctags.d/pytorch.ctags: -------------------------------------------------------------------------------- 1 | --exclude=build/* 2 | --exclude=include/* 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .gitignore -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bat text eol=crlf 2 | .circleci/config.yml linguist-generated=true 3 | .github/workflows/generated-*.yml linguist-generated=true 4 | .github/generated-* linguist-generated=true 5 | .github/scripts/gql_mocks.json linguist-generated=true 6 | third_party/LICENSES_BUNDLED.txt linguist-generated=true 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ci-sev.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "⚠️ CI SEV" 3 | about: Tracking incidents for PyTorch's CI infra. 4 | labels: "ci: sev" 5 | --- 6 | 7 | > NOTE: Remember to label this issue with "`ci: sev`" 8 | 9 | **MERGE BLOCKING** 10 | 11 | ## Current Status 12 | *Status could be: preemptive, ongoing, mitigated, closed. Also tell people if they need to take action to fix it (i.e. rebase)*. 13 | 14 | ## Error looks like 15 | *Provide some way users can tell that this SEV is causing their issue.* 16 | 17 | ## Incident timeline (all times pacific) 18 | *Include when the incident began, when it was detected, mitigated, root caused, and finally closed.* 19 | 20 |
<details> 21 | <summary>Click for example</summary> 22 | 23 | e.g. 24 | - 10/30 7:27a incident began 25 | - 10/30 8:30a detected by <method> 26 | - 10/30 9:00 pm root caused as… 27 | - 10/30 9:10 pm mitigated by… 28 | - 10/31 10:00 am closed by… 29 | 30 | </details>
31 | 32 | ## User impact 33 | *How does this affect users of PyTorch CI?* 34 | 35 | ## Root cause 36 | *What was the root cause of this issue?* 37 | 38 | ## Mitigation 39 | *How did we mitigate the issue?* 40 | 41 | ## Prevention/followups 42 | *How do we prevent issues like this in the future?* 43 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Questions 4 | url: https://discuss.pytorch.org/ 5 | about: Ask questions and discuss with other PyTorch community members 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/disable-ci-jobs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Disable CI jobs (PyTorch Dev Infra only) 3 | about: Use this template to disable CI jobs 4 | title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]" 5 | labels: "module: ci" 6 | --- 7 | 8 | > For example, DISABLED pull / win-vs2019-cpu-py3 / test (default). Once 9 | > created, the job will be disabled within 15 minutes. You can check the 10 | > list of disabled jobs at https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json 11 | 12 | > If you need to get this out ASAP instead of waiting for 15 minutes, 13 | > you can manually trigger the workflow at https://github.com/pytorch/test-infra/actions/workflows/update_disabled_tests.yml 14 | > once the issue is created to update the above JSON list right away. 15 | 16 | > Note: you need to have write access to the PyTorch repo to disable CI 17 | > jobs. The issue will be rejected otherwise. 18 | 19 | ## Reason 20 | *Provide a reason why this is needed and when this can be resolved*. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://pytorch.org/docs/stable/index.html 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 📚 The doc issue 8 | description: > 9 | A clear and concise description of what content in https://pytorch.org/docs/stable/index.html is an issue. If this has to do with the general https://pytorch.org website, please file an issue at https://github.com/pytorch/pytorch.github.io/issues/new/choose instead. If this has to do with https://pytorch.org/tutorials, please file an issue at https://github.com/pytorch/tutorials/issues/new. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Suggest a potential alternative/fix 15 | description: > 16 | Tell us how we could improve the documentation in this regard. 17 | - type: markdown 18 | attributes: 19 | value: > 20 | Thanks for contributing 🎉! 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new PyTorch feature 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 🚀 The feature, motivation and pitch 8 | description: > 9 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal.
Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Alternatives 15 | description: > 16 | A description of any alternative solutions or features you've considered, if any. 17 | - type: textarea 18 | attributes: 19 | label: Additional context 20 | description: > 21 | Add any other context or screenshots about the feature request. 22 | - type: markdown 23 | attributes: 24 | value: > 25 | Thanks for contributing 🎉! 26 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes #ISSUE_NUMBER 2 | -------------------------------------------------------------------------------- /.github/actionlint.yaml: -------------------------------------------------------------------------------- 1 | self-hosted-runner: 2 | labels: 3 | - linux.20_04.4x 4 | - linux.20_04.16x 5 | - linux.large 6 | - linux.2xlarge 7 | - linux.4xlarge 8 | - linux.12xlarge 9 | - linux.24xlarge 10 | - linux.4xlarge.nvidia.gpu 11 | - linux.8xlarge.nvidia.gpu 12 | - linux.16xlarge.nvidia.gpu 13 | - linux.g5.4xlarge.nvidia.gpu 14 | - windows.4xlarge 15 | - windows.8xlarge.nvidia.gpu 16 | - windows.g5.4xlarge.nvidia.gpu 17 | - bm-runner 18 | - linux.rocm.gpu 19 | - macos-m1-12 20 | - macos-m1-13 21 | - macos-12-xl 22 | - macos-12 23 | - macos12.3-m1 24 | -------------------------------------------------------------------------------- /.github/actions/chown-workspace/action.yml: -------------------------------------------------------------------------------- 1 | name: Chown workspace 2 | 3 | description: Ensure that the working directory gets chowned back to the current user 4 | 5 | runs: 6 | using: composite 7 | steps: 8 | - run: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 9 | shell: bash 10 | env: 11 | ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" 12 | -------------------------------------------------------------------------------- /.github/actions/diskspace-cleanup/action.yml: -------------------------------------------------------------------------------- 1 | name: Cleans up diskspace 2 | 3 | description: Cleans up diskspace if the root directory has used more than the cutoff percentage of your diskspace (seventy percent by default). 4 | 5 | inputs: 6 | diskspace-cutoff: 7 | description: The percent amount after which docker prune is run. 8 | required: true 9 | default: 70 10 | 11 | runs: 12 | using: composite 13 | steps: 14 | - name: Cleans up diskspace 15 | shell: bash 16 | run: | 17 | diskspace_cutoff=${{ inputs.diskspace-cutoff }} 18 | diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //') 19 | msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified" 20 | if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then 21 | docker system prune -af 22 | diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //') 23 | if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then 24 | echo "Error: Used diskspace is still above $diskspace_cutoff percent after pruning. Not enough diskspace."
25 | echo "$msg" 26 | exit 1 27 | else 28 | difference=$((diskspace - diskspace_new)) 29 | echo "Diskspace saved: $difference percent" 30 | fi 31 | fi 32 | -------------------------------------------------------------------------------- /.github/actions/download-build-artifacts/action.yml: -------------------------------------------------------------------------------- 1 | name: Download PyTorch Build Artifacts 2 | 3 | description: Download and unzip artifacts from a previous PyTorch build. 4 | 5 | inputs: 6 | name: 7 | description: Name of what artifact to download 8 | required: true 9 | use-gha: 10 | description: If set to any value, use GHA to download the artifact. Otherwise use s3. 11 | required: false 12 | 13 | runs: 14 | using: composite 15 | steps: 16 | - name: Download PyTorch Build Artifacts from S3 17 | if: ${{ !inputs.use-gha }} 18 | uses: seemethere/download-artifact-s3@v4 19 | with: 20 | name: ${{ inputs.name }} 21 | 22 | - name: Download PyTorch Build Artifacts from GHA 23 | if: inputs.use-gha 24 | uses: actions/download-artifact@v3 25 | with: 26 | name: ${{ inputs.name }} 27 | 28 | - name: Unzip artifacts 29 | shell: bash 30 | run: unzip -o artifacts.zip 31 | 32 | - name: Output disk space left 33 | shell: bash 34 | run: df -H 35 | -------------------------------------------------------------------------------- /.github/actions/get-workflow-job-id/action.yml: -------------------------------------------------------------------------------- 1 | name: Get workflow job id 2 | 3 | description: Get the ID of the workflow job that is currently running. 4 | 5 | inputs: 6 | github-token: 7 | description: GITHUB_TOKEN 8 | required: true 9 | 10 | outputs: 11 | job-id: 12 | description: The retrieved workflow job id 13 | value: ${{ steps.get-job-id.outputs.job-id }} 14 | 15 | runs: 16 | using: composite 17 | steps: 18 | - name: Get jobid or fail 19 | # timeout-minutes is unsupported for composite workflows, see https://github.com/actions/runner/issues/1979 20 | # timeout-minutes: 10 21 | shell: bash 22 | id: get-job-id 23 | run: | 24 | set -eux 25 | GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}") 26 | echo "job-id=${GHA_WORKFLOW_JOB_ID}" >> "${GITHUB_OUTPUT}" 27 | env: 28 | GITHUB_TOKEN: ${{ inputs.github-token }} 29 | -------------------------------------------------------------------------------- /.github/actions/teardown-rocm/action.yml: -------------------------------------------------------------------------------- 1 | name: Teardown ROCm host 2 | 3 | description: Tear down ROCm host for CI 4 | 5 | runs: 6 | using: composite 7 | steps: 8 | - name: Teardown ROCm 9 | if: always() 10 | shell: bash 11 | run: | 12 | # ignore expansion of "docker ps -q" since it could be empty 13 | # shellcheck disable=SC2046 14 | docker stop $(docker ps -q) || true 15 | # Prune all stopped containers. 
16 | docker container prune -f 17 | - name: Runner diskspace health check 18 | uses: ./.github/actions/diskspace-cleanup 19 | if: always() 20 | -------------------------------------------------------------------------------- /.github/actions/teardown-win/action.yml: -------------------------------------------------------------------------------- 1 | name: Teardown Windows 2 | 3 | description: Tear down Windows workspace for CI 4 | 5 | inputs: 6 | extra-delete-dir: 7 | description: If set, cleaning up the workspace will delete this too 8 | required: false 9 | default: "" 10 | 11 | runs: 12 | using: composite 13 | steps: 14 | - name: Wait until all sessions have drained 15 | shell: powershell 16 | if: always() 17 | run: | 18 | .github\scripts\wait_for_ssh_to_drain.ps1 19 | 20 | - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) 21 | shell: powershell 22 | if: always() 23 | run: | 24 | .github\scripts\kill_active_ssh_sessions.ps1 25 | 26 | - name: Cleanup workspace 27 | if: always() 28 | shell: bash 29 | env: 30 | EXTRA_DELETE_DIR: ${{ inputs.extra-delete-dir }} 31 | run: | 32 | [ -z "${EXTRA_DELETE_DIR}" ] || rm -rf "${EXTRA_DELETE_DIR}" 33 | rm -rf ./* 34 | -------------------------------------------------------------------------------- /.github/auto_request_review.yml: -------------------------------------------------------------------------------- 1 | # Documented at https://github.com/necojackarc/auto-request-review 2 | reviewers: 3 | groups: 4 | symbolic-shapes: 5 | - ezyang 6 | - albanD 7 | - miladm 8 | - bdhirsh 9 | - voznesenskym 10 | - jbschlosser 11 | 12 | per_author: 13 | symbolic-shapes: 14 | - symbolic-shapes 15 | - antoniojkim 16 | - wconstab 17 | - SherlockNoMad 18 | Chillee: 19 | - ezyang 20 | 21 | files: 22 | # none yet, TODO: migrate CODEOWNERS here 23 | 24 | options: 25 | ignore_draft: true 26 | ignored_keywords: 27 | - DO NOT REVIEW 28 | # Just manually setup a self-referential per_author rule if you 29 | # want group assignment 30 | enable_group_assignment: false 31 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/audio.txt: -------------------------------------------------------------------------------- 1 | a8f4e97bd5356a7a77510cdf6a3a62e25a5dc602 -------------------------------------------------------------------------------- /.github/ci_commit_pins/huggingface.txt: -------------------------------------------------------------------------------- 1 | ebee0a27940adfbb30444d83387b9ea0f1173f40 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/multipy.txt: -------------------------------------------------------------------------------- 1 | 7dd29931fa8e9bb7c970f05f8c0dc13b69e17494 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/text.txt: -------------------------------------------------------------------------------- 1 | 5b78d074bd303eb230d30567646fcf0358ee2dd4 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/timm.txt: -------------------------------------------------------------------------------- 1 | 6635bc3f7d06c6a0d0481803b24d6ad0004b61ac 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/torchbench.txt: -------------------------------------------------------------------------------- 1 | 159e58f0b36ee22e2b89d74bd7dc8a79376de01d 2 |
-------------------------------------------------------------------------------- /.github/ci_commit_pins/triton.txt: -------------------------------------------------------------------------------- 1 | ../../.ci/docker/ci_commit_pins/triton.txt-------------------------------------------------------------------------------- /.github/ci_commit_pins/vision.txt: -------------------------------------------------------------------------------- 1 | b78d98bb152ffb9c0c0f5365f59f475c70b1784e 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/xla.txt: -------------------------------------------------------------------------------- 1 | f235d4da06905b35d75879a0a9bc3034ab7385ac 2 | -------------------------------------------------------------------------------- /.github/pytorch-circleci-labels.yml: -------------------------------------------------------------------------------- 1 | # For documentation concerning this configuration please refer to, 2 | # https://github.com/pytorch/pytorch-probot#trigger-circleci-workflows 3 | labels_to_circle_params: 4 | ci/binaries: 5 | parameter: run_binary_tests 6 | default_true_on: 7 | branches: 8 | - nightly 9 | - release/.* 10 | tags: 11 | - v[0-9]+(\.[0-9]+)*-rc[0-9]+ 12 | set_to_false: 13 | - run_build 14 | ci/master: 15 | parameter: run_master_build 16 | set_to_false: 17 | - run_build 18 | ci/slow-gradcheck: 19 | parameter: run_slow_gradcheck_build 20 | set_to_false: 21 | - run_build 22 | -------------------------------------------------------------------------------- /.github/pytorch-probot.yml: -------------------------------------------------------------------------------- 1 | tracking_issue: 24422 2 | ciflow_tracking_issue: 64124 3 | ciflow_push_tags: 4 | - ciflow/binaries 5 | - ciflow/binaries_conda 6 | - ciflow/binaries_libtorch 7 | - ciflow/binaries_wheel 8 | - ciflow/inductor 9 | - ciflow/inductor-perf-compare 10 | - ciflow/mps 11 | - ciflow/nightly 12 | - ciflow/periodic 13 | - ciflow/slow 14 | - ciflow/trunk 15 | - ciflow/unstable 16 | retryable_workflows: 17 | - lint 18 | - pull 19 | - trunk 20 | - linux-binary 21 | - windows-binary 22 | -------------------------------------------------------------------------------- /.github/regenerate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Allows this script to be invoked from any directory: 4 | cd "$(dirname "$0")" 5 | 6 | python3 scripts/generate_ci_workflows.py 7 | -------------------------------------------------------------------------------- /.github/requirements-gha-cache.txt: -------------------------------------------------------------------------------- 1 | # This file is used to cache other dependencies not specified elsewhere in: 2 | # requirements.txt 3 | # requirements-flake8.txt 4 | # docs/requirements.txt 5 | # docs/cpp/requirements.txt 6 | # functorch/docs/requirements.txt 7 | # .ci/docker/requirements-ci.txt 8 | boto3==1.19.12 9 | jinja2==3.0.1 10 | lintrunner==0.10.7 11 | ninja==1.10.0.post1 12 | nvidia-ml-py==11.525.84 13 | pyyaml==6.0 14 | requests==2.26 15 | rich==10.9.0 16 | rockset==1.0.3 17 | -------------------------------------------------------------------------------- /.github/requirements/conda-env-Linux-X64: -------------------------------------------------------------------------------- 1 | cmake=3.22.* 2 | mkl=2022.1.0 3 | mkl-include=2022.1.0 4 | ninja=1.10.2 5 | numpy=1.23.3 6 | pyyaml=6.0 7 | requests=2.28.1 8 | setuptools=65.5.0 9 |
typing-extensions=4.3.0 10 | -------------------------------------------------------------------------------- /.github/requirements/conda-env-iOS: -------------------------------------------------------------------------------- 1 | blas=1.0 2 | cmake=3.22.1 3 | mkl=2022.1.0 4 | mkl-include=2022.1.0 5 | ninja=1.10.2 6 | numpy=1.23.3 7 | pyyaml=6.0 8 | requests=2.28.1 9 | setuptools=63.4.1 10 | typing-extensions=4.3.0 11 | -------------------------------------------------------------------------------- /.github/requirements/conda-env-macOS-ARM64: -------------------------------------------------------------------------------- 1 | numpy=1.22.3 2 | pyyaml=6.0 3 | setuptools=61.2.0 4 | cmake=3.22.* 5 | typing-extensions=4.3.0 6 | dataclasses=0.8 7 | pip=22.2.2 8 | pillow=9.2.0 9 | pkg-config=0.29.2 10 | wheel=0.37.1 11 | expecttest=0.1.3 12 | 13 | # Not pinning certifi so that we can always get the latest certificates 14 | certifi 15 | 16 | # Cross-compiling arm64 from x86-64 picks up 1.40.0 while testing on arm64 17 | # itself only has up to 1.39.0 from upstream conda. Both work though 18 | libuv>=1.39.0,<=1.40.0 19 | -------------------------------------------------------------------------------- /.github/requirements/conda-env-macOS-X64: -------------------------------------------------------------------------------- 1 | mkl=2021.2.0 2 | mkl-include=2021.2.0 3 | numpy=1.21.2 4 | pyyaml=5.3 5 | setuptools=46.0.0 6 | cmake=3.22.* 7 | typing-extensions=4.3.0 8 | dataclasses=0.8 9 | pip=22.2.2 10 | pillow=9.2.0 11 | libuv=1.40.0 12 | pkg-config=0.29.2 13 | wheel=0.37.1 14 | 15 | # Not pinning certifi so that we can always get the latest certificates 16 | certifi 17 | -------------------------------------------------------------------------------- /.github/requirements/pip-requirements-iOS.txt: -------------------------------------------------------------------------------- 1 | # iOS simulator requirements 2 | coremltools==5.0b5 3 | protobuf==3.20.2 4 | -------------------------------------------------------------------------------- /.github/requirements/pip-requirements-macOS.txt: -------------------------------------------------------------------------------- 1 | boto3==1.19.12 2 | hypothesis==6.56.4 3 | expecttest==0.1.3 4 | librosa>=0.6.2 5 | mpmath==1.2.1 6 | networkx==2.8.7 7 | # Use numba-0.49.1 or older on Intel Macs, but 0.56.0 on M1 machines, as older numba is not available 8 | numba==0.56.0; platform_machine == "arm64" 9 | numba<=0.49.1; platform_machine != "arm64" 10 | opt-einsum>=3.3 11 | psutil==5.9.1 12 | nvidia-ml-py==11.525.84 13 | pygments==2.12.0 14 | pytest==7.2.0 15 | pytest-xdist==3.0.2 16 | pytest-rerunfailures==10.3 17 | pytest-flakefinder==1.1.0 18 | pytest-shard==0.1.2 19 | scipy==1.9.0 20 | sympy==1.11.1 21 | unittest-xml-reporting<=3.2.0,>=2.0.0 22 | xdoctest==1.1.0 23 | filelock==3.6.0 24 | -------------------------------------------------------------------------------- /.github/requirements/regenerate-requirements.txt: -------------------------------------------------------------------------------- 1 | typing-extensions 2 | jinja2 3 | -------------------------------------------------------------------------------- /.github/scripts/comment_on_pr.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any 3 | 4 | from github_utils import gh_post_pr_comment 5 | from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo 6 | from trymerge_explainer import BOT_COMMANDS_WIKI 7 | 8
| 9 | def parse_args() -> Any: 10 | from argparse import ArgumentParser 11 | 12 | parser = ArgumentParser("Comment on a PR") 13 | parser.add_argument("pr_num", type=int) 14 | parser.add_argument("action", type=str) 15 | return parser.parse_args() 16 | 17 | 18 | def main() -> None: 19 | args = parse_args() 20 | repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=True) 21 | org, project = repo.gh_owner_and_name() 22 | run_url = os.environ.get("GH_RUN_URL") 23 | 24 | job_link = f"[job]({run_url})" if run_url is not None else "job" 25 | msg = ( 26 | f"The {args.action} {job_link} was canceled. If you believe this is a mistake, " 27 | + f"then you can re-trigger it through [pytorch-bot]({BOT_COMMANDS_WIKI})." 28 | ) 29 | 30 | gh_post_pr_comment(org, project, args.pr_num, msg) 31 | print(org, project, args.pr_num, msg) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /.github/scripts/export_pytorch_labels.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test ownership was introduced in https://github.com/pytorch/pytorch/issues/66232. 4 | 5 | As a part of enforcing test ownership, we want to maintain a list of existing PyTorch labels 6 | to verify the owners' existence. This script outputs a file containing a list of existing 7 | pytorch/pytorch labels so that the file could be uploaded to S3. 8 | 9 | This script assumes the correct env vars are set for AWS permissions. 10 | 11 | """ 12 | 13 | import json 14 | from typing import Any 15 | 16 | import boto3 # type: ignore[import] 17 | 18 | from label_utils import gh_get_labels 19 | 20 | 21 | def parse_args() -> Any: 22 | from argparse import ArgumentParser 23 | 24 | parser = ArgumentParser("Export PR labels") 25 | parser.add_argument("org", type=str) 26 | parser.add_argument("repo", type=str) 27 | 28 | return parser.parse_args() 29 | 30 | 31 | def main() -> None: 32 | args = parse_args() 33 | print(f"Exporting labels for {args.org}/{args.repo}") 34 | labels_file_name = "pytorch_labels.json" 35 | obj = boto3.resource("s3").Object("ossci-metrics", labels_file_name) 36 | obj.put(Body=json.dumps(gh_get_labels(args.org, args.repo)).encode()) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /.github/scripts/kill_active_ssh_sessions.ps1: -------------------------------------------------------------------------------- 1 | function Get-SSH-Sessions { 2 | Get-Process sshd -IncludeUserName | 3 | Where-Object UserName -notLike "*SYSTEM*" | 4 | Select-Object Id 5 | } 6 | 7 | $runningSessions = Get-SSH-Sessions 8 | 9 | foreach ($session in $runningSessions) { 10 | Stop-Process -id $session.Id 11 | } 12 | -------------------------------------------------------------------------------- /.github/scripts/on_cancel_merge.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Any 3 | 4 | from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo 5 | from trymerge import GitHubPR, MERGE_IN_PROGRESS_LABEL 6 | 7 | 8 | def parse_args() -> Any: 9 | parser = argparse.ArgumentParser( 10 | description="Perform actions when a merge workflow is cancelled" 11 | ) 12 | parser.add_argument( 13 | "--pr-num", 14 | type=int, 15 | required=True, 16 | help="The PR number to cancel the merge for", 17 | ) 18 | return parser.parse_args() 19 | 20
| 21 | def main() -> None: 22 | args = parse_args() 23 | repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=True) 24 | org, project = repo.gh_owner_and_name() 25 | pr_num = args.pr_num 26 | 27 | GitHubPR(org, project, pr_num).remove_label(MERGE_IN_PROGRESS_LABEL) 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /.github/scripts/parse_ref.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | 6 | 7 | def set_output(name: str, val: str) -> None: 8 | if os.getenv("GITHUB_OUTPUT"): 9 | with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: 10 | print(f"{name}={val}", file=env) 11 | else: 12 | print(f"::set-output name={name}::{val}") 13 | 14 | 15 | def main() -> None: 16 | ref = os.environ["GITHUB_REF"] 17 | m = re.match(r"^refs/(\w+)/(.*)$", ref) 18 | if m: 19 | category, stripped = m.groups() 20 | if category == "heads": 21 | set_output("branch", stripped) 22 | elif category == "pull": 23 | set_output("branch", "pull/" + stripped.split("/")[0]) 24 | elif category == "tags": 25 | set_output("tag", stripped) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /.github/scripts/report_git_status.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CHANGES=$(git status --porcelain "$1") 3 | echo "$CHANGES" 4 | git diff "$1" 5 | [ -z "$CHANGES" ] 6 | -------------------------------------------------------------------------------- /.github/scripts/stop_runner_service.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set +e 4 | set -x 5 | 6 | # Get the service name 7 | RUNNER_SERVICE=$(cat "${RUNNER_WORKSPACE}/../../.service") 8 | echo "GitHub self-hosted runner service: ${RUNNER_SERVICE}" 9 | 10 | if [[ -n "${RUNNER_SERVICE}" ]]; then 11 | echo "The self-hosted runner has encountered an unrecoverable error and will be shut down" 12 | 13 | pushd "${RUNNER_WORKSPACE}/../../" 14 | # Stop it to prevent the runner from receiving new jobs 15 | sudo ./svc.sh stop 16 | # Then uninstall the service 17 | sudo ./svc.sh uninstall 18 | # Finally, shut down the runner machine completely 19 | sudo shutdown -P now 20 | # NB: In my testing, cleaning up and shutting down the runner this way already 21 | # removes the runner from the list of registered runners, so calling config.sh remove 22 | # seems redundant; it would also require an org token, which I don't want to 23 | # add as yet another CI secret if there is no need 24 | fi 25 | -------------------------------------------------------------------------------- /.github/scripts/wait_for_ssh_to_drain.ps1: -------------------------------------------------------------------------------- 1 | function Get-SSH-Users { 2 | # Gets ssh sessions for all users not named SYSTEM 3 | Get-CimInstance -ClassName Win32_Process -Filter "Name = 'sshd.exe'" | 4 | Get-CimAssociatedInstance -Association Win32_SessionProcess | 5 | Get-CimAssociatedInstance -Association Win32_LoggedOnUser | 6 | Where-Object {$_.Name -ne 'SYSTEM'} | 7 | Measure-Object 8 | } 9 | 10 | $usersLoggedOn = Get-SSH-Users 11 | 12 | Write-Output "Holding runner until all ssh sessions have logged out" 13 | while ($usersLoggedOn.Count -gt 0) { 14 | $usersLoggedOn = Get-SSH-Users 15 | Write-Output "."
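    # Poll again after 5 seconds; the runner stays held until no non-SYSTEM sshd sessions remain.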
16 | Start-Sleep -s 5 17 | } 18 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | include_trailing_comma=True 3 | multi_line_output=3 4 | skip=third_party 5 | skip_gitignore=True 6 | use_parentheses=True 7 | -------------------------------------------------------------------------------- /.lldbinit: -------------------------------------------------------------------------------- 1 | # Automatically load the pytorch_lldb extension. 2 | # 3 | # lldb automatically tries to load this file whenever it is executed from the 4 | # root of the pytorch repo, but by default it is not allowed to do so for 5 | # security reasons. If you want to use pytorch_lldb, please add the following 6 | # line to your ~/.lldbinit (i.e., the .lldbinit file which is in your home 7 | # directory, NOT this file): 8 | # settings set target.load-cwd-lldbinit true 9 | # setting set escape-non-printables false 10 | # 11 | # Alternatively, you can manually load the pytorch_lldb commands into your 12 | # existing lldb session by doing the following: 13 | # (lldb) command script import tools/lldb/pytorch_lldb.py 14 | 15 | command script import tools/lldb/pytorch_lldb.py 16 | setting set escape-non-printables false 17 | type category enable torch 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is not a real branch. 2 | Please check out `main` 3 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Reporting Security Issues 2 | 3 | If you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. 4 | 5 | Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new 6 | 7 | Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: 8 | 9 | https://www.facebook.com/whitehat 10 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Benchmarks 2 | 3 | This folder contains scripts that produce reproducible timings of various PyTorch features. 4 | 5 | It also provides mechanisms to compare PyTorch with other frameworks. 6 | 7 | ## Setup environment 8 | Make sure you're on a machine with CUDA available, then install pytorch and torchvision in the following order: 9 | ``` 10 | # Install torchvision. It comes with the pytorch stable release binary 11 | conda install pytorch torchvision -c pytorch 12 | 13 | # Install the latest pytorch master from source. 14 | # It should supersede the installation from the release binary.
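# $PYTORCH_HOME is assumed to point at your pytorch checkout, e.g.:
# export PYTORCH_HOME="$HOME/pytorch"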
15 | cd $PYTORCH_HOME 16 | python setup.py build develop 17 | 18 | # Check the pytorch installation version 19 | python -c "import torch; print(torch.__version__)" 20 | ``` 21 | 22 | ## Benchmark List 23 | 24 | Please refer to each subfolder to discover each benchmark suite. 25 | 26 | * [Fast RNNs benchmarks](fastrnns/README.md) 27 | -------------------------------------------------------------------------------- /benchmarks/compare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m fastrnns.bench --fuser=old --group=rnns --print-json oss > old.json 3 | python -m fastrnns.bench --fuser=te --group=rnns --print-json oss > te.json 4 | python compare-fastrnn-results.py old.json te.json --format md 5 | -------------------------------------------------------------------------------- /benchmarks/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(convolution_bench convolution.cpp) 2 | target_link_libraries(convolution_bench PRIVATE torch_library benchmark) 3 | -------------------------------------------------------------------------------- /benchmarks/cpp/tensorexpr/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(AVX) 2 | 3 | add_executable( 4 | tensorexpr_bench 5 | bench_approx.cpp 6 | bench_batchnorm.cpp 7 | bench_concat.cpp 8 | bench_compile.cpp 9 | bench_signed_log1p.cpp 10 | bench_fuser_overhead.cpp 11 | bench_gemm.cpp 12 | bench_kernels.cpp 13 | bench_parallel.cpp 14 | bench_prefix_sum.cpp 15 | bench_reduce.cpp 16 | main.cpp) 17 | 18 | if(C_AVX2_FOUND) 19 | message(STATUS "AVX2 compiler support found") 20 | target_compile_options(tensorexpr_bench PUBLIC -mavx2) 21 | target_compile_definitions(tensorexpr_bench PUBLIC USE_AVX2) 22 | endif() 23 | 24 | target_link_libraries(tensorexpr_bench PRIVATE torch_library benchmark) 25 | -------------------------------------------------------------------------------- /benchmarks/cpp/tensorexpr/main.cpp: -------------------------------------------------------------------------------- 1 | #include <benchmark/benchmark.h> 2 | 3 | BENCHMARK_MAIN(); 4 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/configurations/data_configurations.json: -------------------------------------------------------------------------------- 1 | { 2 | "DummyData": { 3 | "data_class": "DummyData", 4 | "configurations": { 5 | "max_val": 1024, 6 | "sample_count": 1024, 7 | "sample_length": 1024, 8 | "sparsity_percentage": 20 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/configurations/model_configurations.json: -------------------------------------------------------------------------------- 1 | { 2 | "DummyModel": { 3 | "model_class": "DummyModel", 4 | "configurations": { 5 | "num_embeddings": 1024, 6 | "embedding_dim": 1024, 7 | "dense_input_size": 1024, 8 | "dense_output_size": 1024, 9 | "dense_layers_count": 8, 10 | "sparse": false 11 | } 12 | }, 13 | "DummyModelSparse": { 14 | "model_class": "DummyModel", 15 | "configurations": { 16 | "num_embeddings": 1024, 17 | "embedding_dim": 1024, 18 | "dense_input_size": 1024, 19 | "dense_output_size": 1024, 20 | "dense_layers_count": 8, 21 | "sparse": true 22 | } 23 | } 24 | } 25 | --------------------------------------------------------------------------------
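The `configurations` blocks above are plain keyword arguments for the classes named by `model_class`/`data_class`. The benchmark's launcher (launcher.py, not reproduced here) presumably resolves those names through the `data_map`/`model_map` registries defined in the `__init__.py` files below. A minimal sketch of that lookup (the `build` helper and the hard-coded paths are illustrative, not part of the benchmark) could look like:

```python
import json

from data import data_map      # {"DummyData": DummyData}, defined below
from models import model_map   # {"DummyModel": DummyModel}, defined below


def build(model_name: str, data_name: str):
    # Load the JSON blocks shown above.
    with open("configurations/model_configurations.json") as f:
        model_cfg = json.load(f)[model_name]
    with open("configurations/data_configurations.json") as f:
        data_cfg = json.load(f)[data_name]
    # "configurations" is splatted straight into the class constructors.
    model = model_map[model_cfg["model_class"]](**model_cfg["configurations"])
    data = data_map[data_cfg["data_class"]](**data_cfg["configurations"])
    return model, data


# e.g. build("DummyModelSparse", "DummyData") yields a DummyModel with
# sparse=True and a DummyData instance with the settings above.
```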
/benchmarks/distributed/rpc/parameter_server/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .DummyData import DummyData 2 | 3 | data_map = { 4 | "DummyData": DummyData 5 | } 6 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/metrics/CPUMetric.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from .MetricBase import MetricBase 4 | 5 | 6 | class CPUMetric(MetricBase): 7 | def __init__(self, name: str): 8 | self.name = name 9 | self.start = None 10 | self.end = None 11 | 12 | def record_start(self): 13 | self.start = time.time() 14 | 15 | def record_end(self): 16 | self.end = time.time() 17 | 18 | def elapsed_time(self): 19 | if self.start is None: 20 | raise RuntimeError("start is None") 21 | if self.end is None: 22 | raise RuntimeError("end is None") 23 | return self.end - self.start 24 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/metrics/CUDAMetric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .MetricBase import MetricBase 4 | 5 | 6 | class CUDAMetric(MetricBase): 7 | def __init__(self, rank: int, name: str): 8 | self.rank = rank 9 | self.name = name 10 | self.start = None 11 | self.end = None 12 | 13 | def record_start(self): 14 | self.start = torch.cuda.Event(enable_timing=True) 15 | with torch.cuda.device(self.rank): 16 | self.start.record() 17 | 18 | def record_end(self): 19 | self.end = torch.cuda.Event(enable_timing=True) 20 | with torch.cuda.device(self.rank): 21 | self.end.record() 22 | 23 | def elapsed_time(self): 24 | if not self.start.query(): 25 | raise RuntimeError("start event did not complete") 26 | if not self.end.query(): 27 | raise RuntimeError("end event did not complete") 28 | return self.start.elapsed_time(self.end) 29 | 30 | def synchronize(self): 31 | self.start.synchronize() 32 | self.end.synchronize() 33 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/metrics/MetricBase.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class MetricBase(ABC): 5 | def __init__(self, name): 6 | self.name = name 7 | self.start = None 8 | self.end = None 9 | 10 | @abstractmethod 11 | def record_start(self): 12 | return 13 | 14 | @abstractmethod 15 | def record_end(self): 16 | return 17 | 18 | @abstractmethod 19 | def elapsed_time(self): 20 | return 21 | 22 | def get_name(self): 23 | return self.name 24 | 25 | def get_end(self): 26 | return self.end 27 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/models/DummyModel.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class DummyModel(nn.Module): 6 | def __init__( 7 | self, 8 | num_embeddings: int, 9 | embedding_dim: int, 10 | dense_input_size: int, 11 | dense_output_size: int, 12 | dense_layers_count: int, 13 | sparse: bool 14 | ): 15 | r""" 16 | A dummy model with an EmbeddingBag Layer and Dense Layer. 
17 | Args: 18 | num_embeddings (int): size of the dictionary of embeddings 19 | embedding_dim (int): the size of each embedding vector 20 | dense_input_size (int): size of each input sample 21 | dense_output_size (int): size of each output sample 22 | dense_layers_count: (int): number of dense layers in dense Sequential module 23 | sparse (bool): if True, gradient w.r.t. weight matrix will be a sparse tensor 24 | """ 25 | super().__init__() 26 | self.embedding = nn.EmbeddingBag( 27 | num_embeddings, embedding_dim, sparse=sparse 28 | ) 29 | self.dense = nn.Sequential(*[nn.Linear(dense_input_size, dense_output_size) for _ in range(dense_layers_count)]) 30 | 31 | def forward(self, x): 32 | x = self.embedding(x) 33 | return F.softmax(self.dense(x), dim=1) 34 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .DummyModel import DummyModel 2 | 3 | model_map = { 4 | "DummyModel": DummyModel 5 | } 6 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/server/__init__.py: -------------------------------------------------------------------------------- 1 | from .server import AverageBatchParameterServer, AverageParameterServer 2 | 3 | server_map = { 4 | "AverageParameterServer": AverageParameterServer, 5 | "AverageBatchParameterServer": AverageBatchParameterServer 6 | } 7 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .criterions import cel 2 | from .ddp_models import basic_ddp_model 3 | from .hook_states import BasicHookState 4 | from .hooks import allreduce_hook, hybrid_hook, rpc_hook, sparse_rpc_hook 5 | from .iteration_steps import basic_iteration_step 6 | from .preprocess_data import preprocess_dummy_data 7 | from .trainer import DdpTrainer 8 | 9 | criterion_map = { 10 | "cel": cel 11 | } 12 | 13 | ddp_hook_map = { 14 | "allreduce_hook": allreduce_hook, 15 | "hybrid_hook": hybrid_hook, 16 | "rpc_hook": rpc_hook, 17 | "sparse_rpc_hook": sparse_rpc_hook 18 | } 19 | 20 | ddp_model_map = { 21 | "basic_ddp_model": basic_ddp_model 22 | } 23 | 24 | iteration_step_map = { 25 | "basic_iteration_step": basic_iteration_step 26 | } 27 | 28 | preprocess_data_map = { 29 | "preprocess_dummy_data": preprocess_dummy_data 30 | } 31 | 32 | hook_state_map = { 33 | "BasicHookState": BasicHookState 34 | } 35 | 36 | trainer_map = { 37 | "DdpTrainer": DdpTrainer 38 | } 39 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/criterions.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def cel(rank): 5 | r"""A function that creates a CrossEntropyLoss 6 | criterion for training. 
7 | Args: 8 | rank (int): worker rank 9 | """ 10 | return nn.CrossEntropyLoss().cuda(rank) 11 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py: -------------------------------------------------------------------------------- 1 | from torch.nn.parallel import DistributedDataParallel as DDP 2 | 3 | 4 | def basic_ddp_model(self, rank, model, process_group, hook_state, hook): 5 | r""" 6 | A function that creates a ddp_model and hook_state objects. 7 | The ddp model is initialized with a single device id and 8 | the process group. The ddp_model also registers the communication 9 | hook. 10 | Args: 11 | rank (int): worker rank 12 | model (nn.Module): neural network model 13 | process_group (ProcessGroup): distributed process group 14 | hook_state (class): class that will be used to keep track of state 15 | during training. 16 | hook (function): ddp communication hook 17 | """ 18 | ddp_model = DDP( 19 | model, device_ids=[rank], process_group=process_group 20 | ) 21 | hook_state = hook_state(self, process_group) 22 | ddp_model.register_comm_hook(hook_state, hook) 23 | return ddp_model, hook_state 24 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/hook_states.py: -------------------------------------------------------------------------------- 1 | class BasicHookState: 2 | 3 | def __init__(self, cref, process_group): 4 | r""" 5 | A class that holds state information that is needed by the communication hook 6 | during the training algorithm. 7 | Args: 8 | cref (DdpTrainer): reference to the self keyword of the trainer instance 9 | process_group (ProcessGroup): distributed process group 10 | """ 11 | self.cref = cref 12 | self.process_group = process_group 13 | self.batch_number = -1 14 | 15 | def get_key(self, bucket_index): 16 | r""" 17 | A method that returns an encoded key that represents the current batch and 18 | bucket index. 19 | Args: 20 | bucket_index (int): index of the bucket being processed in backward 21 | """ 22 | return f"{self.batch_number},{bucket_index}" 23 | 24 | def next_batch(self): 25 | r""" 26 | A method that increments batch_number by 1. 27 | """ 28 | self.batch_number += 1 29 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/iteration_steps.py: -------------------------------------------------------------------------------- 1 | def basic_iteration_step(self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch): 2 | r""" 3 | A function that performs an iteration of training. 
4 | Args: 5 | ddp_model (nn.Module): distributed data parallel model 6 | criterion (nn.Module): loss function to measure model 7 | optimizer (optim.Optimizer): updates model parameters 8 | hook_state (object): ddp communication hook state object 9 | epoch (int): index of pass through the data 10 | index (int): iteration number - 1 in current batch 11 | batch (list): training examples 12 | """ 13 | hook_state.next_batch() 14 | self.record_batch_start(self.epoch_key(epoch, index)) 15 | optimizer.zero_grad() 16 | self.record_forward_start(self.epoch_key(epoch, index)) 17 | loss = criterion(ddp_model(batch[0]), batch[1]) 18 | self.record_forward_end(self.epoch_key(epoch, index)) 19 | self.record_backward_start(self.epoch_key(epoch, index)) 20 | loss.backward() 21 | self.record_backward_end(self.epoch_key(epoch, index)) 22 | optimizer.step() 23 | self.record_batch_end(self.epoch_key(epoch, index)) 24 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/preprocess_data.py: -------------------------------------------------------------------------------- 1 | def preprocess_dummy_data(rank, data): 2 | r""" 3 | A function that moves the data from CPU to GPU 4 | for DummyData class. 5 | Args: 6 | rank (int): worker rank 7 | data (list): training examples 8 | """ 9 | for i in range(len(data)): 10 | data[i][0] = data[i][0].cuda(rank) 11 | data[i][1] = data[i][1].cuda(rank) 12 | return data 13 | -------------------------------------------------------------------------------- /benchmarks/dynamo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/dynamo/__init__.py -------------------------------------------------------------------------------- /benchmarks/dynamo/check_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import textwrap 4 | 5 | import pandas as pd 6 | 7 | 8 | def check_csv(filename): 9 | """ 10 | Basic accuracy checking. 11 | """ 12 | 13 | df = pd.read_csv(filename) 14 | 15 | failed = [] 16 | for _, row in df.iterrows(): 17 | model_name = row["name"] 18 | status = row["accuracy"] 19 | if "pass" not in status: 20 | failed.append(model_name) 21 | 22 | print(f"{model_name:34} {status}") 23 | 24 | if failed: 25 | print( 26 | textwrap.dedent( 27 | f""" 28 | Error {len(failed)} models failed 29 | {' '.join(failed)} 30 | """ 31 | ) 32 | ) 33 | sys.exit(1) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--file", "-f", type=str, help="csv file name") 39 | args = parser.parse_args() 40 | check_csv(args.file) 41 | -------------------------------------------------------------------------------- /benchmarks/dynamo/check_hf_bert_perf_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import textwrap 4 | 5 | import pandas as pd 6 | 7 | 8 | def check_hf_bert_perf_csv(filename): 9 | """ 10 | Basic performance checking. 
11 | """ 12 | 13 | df = pd.read_csv(filename) 14 | 15 | failed = [] 16 | for _, row in df.iterrows(): 17 | model_name = row["name"] 18 | speedup = row["speedup"] 19 | # Reduce from 1.165 to 1.160, see https://github.com/pytorch/pytorch/issues/96530 20 | # Reduce from 1.160 to 1.140 after a transformer version upgrade, see https://github.com/pytorch/benchmark/pull/1406 21 | # The speedup is not backed to 1.16 after the extra graph break issue is fixed in transformer upstream 22 | if speedup < 1.150: 23 | failed.append(model_name) 24 | 25 | print(f"{model_name:34} {speedup}") 26 | 27 | if failed: 28 | print( 29 | textwrap.dedent( 30 | f""" 31 | Error {len(failed)} models performance regressed 32 | {' '.join(failed)} 33 | """ 34 | ) 35 | ) 36 | sys.exit(1) 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument("--file", "-f", type=str, help="csv file name") 42 | args = parser.parse_args() 43 | check_hf_bert_perf_csv(args.file) 44 | -------------------------------------------------------------------------------- /benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_dynamic_training.csv: -------------------------------------------------------------------------------- 1 | name,accuracy,graph_breaks 2 | AlbertForMaskedLM,pass,7 3 | AlbertForQuestionAnswering,pass,7 4 | BartForCausalLM,pass,7 5 | BertForMaskedLM,pass,7 6 | BertForQuestionAnswering,pass,7 7 | BlenderbotSmallForCausalLM,pass,7 8 | BlenderbotSmallForConditionalGeneration,pass,7 9 | CamemBert,pass,7 10 | DebertaForMaskedLM,pass,52 11 | DebertaForQuestionAnswering,pass,52 12 | DebertaV2ForMaskedLM,pass_due_to_skip,0 13 | DistilBertForMaskedLM,pass,7 14 | DistilBertForQuestionAnswering,pass,7 15 | DistillGPT2,pass,7 16 | ElectraForCausalLM,pass,7 17 | ElectraForQuestionAnswering,pass,7 18 | GPT2ForSequenceClassification,pass,9 19 | LayoutLMForMaskedLM,pass,7 20 | LayoutLMForSequenceClassification,pass,9 21 | MBartForCausalLM,pass,7 22 | MegatronBertForCausalLM,pass,7 23 | MegatronBertForQuestionAnswering,pass,7 24 | MobileBertForMaskedLM,pass,4 25 | MobileBertForQuestionAnswering,pass,4 26 | PLBartForCausalLM,pass,7 27 | PLBartForConditionalGeneration,pass,7 28 | PegasusForCausalLM,pass,7 29 | PegasusForConditionalGeneration,pass,4 30 | RobertaForCausalLM,pass,7 31 | RobertaForQuestionAnswering,pass,7 32 | Speech2Text2ForCausalLM,pass,7 33 | T5ForConditionalGeneration,pass,7 34 | T5Small,pass,7 35 | TrOCRForCausalLM,pass,7 36 | XLNetLMHeadModel,pass,7 37 | YituTechConvBert,pass,7 38 | -------------------------------------------------------------------------------- /benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv: -------------------------------------------------------------------------------- 1 | name,accuracy,graph_breaks 2 | AlbertForMaskedLM,pass,7 3 | AlbertForQuestionAnswering,pass,7 4 | BartForCausalLM,pass,7 5 | BertForMaskedLM,pass,7 6 | BertForQuestionAnswering,pass,7 7 | BlenderbotSmallForCausalLM,pass,7 8 | BlenderbotSmallForConditionalGeneration,pass,7 9 | CamemBert,pass,7 10 | DebertaForMaskedLM,pass,52 11 | DebertaForQuestionAnswering,pass,52 12 | DebertaV2ForMaskedLM,pass_due_to_skip,0 13 | DistilBertForMaskedLM,pass,7 14 | DistilBertForQuestionAnswering,pass,7 15 | DistillGPT2,pass,7 16 | ElectraForCausalLM,pass,7 17 | ElectraForQuestionAnswering,pass,7 18 | GPT2ForSequenceClassification,pass,9 19 | LayoutLMForMaskedLM,pass,7 20 | LayoutLMForSequenceClassification,pass,9 21 | MBartForCausalLM,pass,7 22 | MegatronBertForCausalLM,pass,7 23 
| MegatronBertForQuestionAnswering,pass,7 24 | MobileBertForMaskedLM,pass,4 25 | MobileBertForQuestionAnswering,pass,4 26 | PLBartForCausalLM,pass,7 27 | PLBartForConditionalGeneration,pass,7 28 | PegasusForCausalLM,pass,7 29 | PegasusForConditionalGeneration,pass,4 30 | RobertaForCausalLM,pass,7 31 | RobertaForQuestionAnswering,pass,7 32 | Speech2Text2ForCausalLM,pass,7 33 | T5ForConditionalGeneration,pass,7 34 | T5Small,pass,7 35 | TrOCRForCausalLM,pass,7 36 | XLNetLMHeadModel,pass,7 37 | YituTechConvBert,pass,7 38 | -------------------------------------------------------------------------------- /benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_dynamic_training.csv: -------------------------------------------------------------------------------- 1 | name,accuracy,graph_breaks 2 | BERT_pytorch,pass,11 3 | LearningToPaint,pass,9 4 | Super_SloMo,pass,9 5 | alexnet,pass,9 6 | attention_is_all_you_need_pytorch,pass,9 7 | dcgan,pass,9 8 | densenet121,pass,9 9 | drq,pass,8 10 | fastNLP_Bert,pass,14 11 | functorch_dp_cifar10,pass,9 12 | functorch_maml_omniglot,pass,9 13 | hf_Albert,pass,8 14 | hf_Bart,pass,8 15 | hf_Bert,pass,8 16 | hf_Bert_large,pass,8 17 | hf_DistilBert,pass,8 18 | hf_GPT2,pass,8 19 | hf_Reformer,pass,45 20 | hf_T5_large,pass_due_to_skip,0 21 | lennard_jones,pass,9 22 | maml_omniglot,pass,9 23 | mnasnet1_0,pass,9 24 | mobilenet_v2,pass,9 25 | nvidia_deeprecommender,pass,9 26 | phlippe_densenet,pass,9 27 | phlippe_resnet,pass,9 28 | pytorch_CycleGAN_and_pix2pix,pass,9 29 | pytorch_stargan,pass,9 30 | pytorch_unet,pass,9 31 | resnet152,pass,9 32 | resnet18,pass,9 33 | resnet50,pass,9 34 | resnext50_32x4d,pass,9 35 | shufflenet_v2_x1_0,pass,9 36 | soft_actor_critic,pass,8 37 | speech_transformer,pass,19 38 | squeezenet1_1,pass,9 39 | timm_efficientnet,pass,9 40 | timm_regnet,pass,9 41 | timm_resnest,pass,9 42 | timm_vision_transformer,pass,9 43 | timm_vision_transformer_large,pass_due_to_skip,0 44 | timm_vovnet,pass,9 45 | tts_angular,pass,11 46 | vgg16,pass,9 47 | yolov3,pass,13 48 | -------------------------------------------------------------------------------- /benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv: -------------------------------------------------------------------------------- 1 | name,accuracy,graph_breaks 2 | BERT_pytorch,pass,11 3 | LearningToPaint,pass,9 4 | Super_SloMo,pass,9 5 | alexnet,pass,9 6 | attention_is_all_you_need_pytorch,pass,9 7 | dcgan,pass,9 8 | densenet121,pass,9 9 | drq,pass,8 10 | fastNLP_Bert,pass,14 11 | functorch_dp_cifar10,pass,9 12 | functorch_maml_omniglot,pass,9 13 | hf_Albert,pass,8 14 | hf_Bart,pass,8 15 | hf_Bert,pass,8 16 | hf_Bert_large,pass,8 17 | hf_DistilBert,pass,8 18 | hf_GPT2,pass,8 19 | hf_Reformer,pass,67 20 | hf_T5_large,pass_due_to_skip,0 21 | lennard_jones,pass,9 22 | maml_omniglot,pass,9 23 | mnasnet1_0,pass,9 24 | mobilenet_v2,pass,9 25 | nvidia_deeprecommender,pass,9 26 | phlippe_densenet,pass,9 27 | phlippe_resnet,pass,9 28 | pytorch_CycleGAN_and_pix2pix,pass,9 29 | pytorch_stargan,pass,9 30 | pytorch_unet,pass,9 31 | resnet152,pass,9 32 | resnet18,pass,9 33 | resnet50,pass,9 34 | resnext50_32x4d,pass,9 35 | shufflenet_v2_x1_0,pass,9 36 | soft_actor_critic,pass,8 37 | speech_transformer,pass,19 38 | squeezenet1_1,pass,9 39 | timm_efficientnet,pass,9 40 | timm_regnet,pass,9 41 | timm_resnest,pass,9 42 | timm_vision_transformer,pass,9 43 | timm_vision_transformer_large,pass_due_to_skip,0 44 | timm_vovnet,pass,9 45 | tts_angular,pass,11 46 | vgg16,pass,9 47 | yolov3,pass,13 48 | 
-------------------------------------------------------------------------------- /benchmarks/dynamo/microbenchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/dynamo/microbenchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/dynamo/microbenchmarks/benchmark_helper.py: -------------------------------------------------------------------------------- 1 | from torch.utils.benchmark import Timer 2 | 3 | 4 | def time_with_torch_timer(fn, args, kwargs=None, iters=100): 5 | kwargs = kwargs or {} 6 | env = {"args": args, "kwargs": kwargs, "fn": fn} 7 | fn_call = "fn(*args, **kwargs)" 8 | 9 | # Measure end-to-end time 10 | timer = Timer(stmt=f"{fn_call}", globals=env) 11 | tt = timer.timeit(iters) 12 | 13 | return tt 14 | -------------------------------------------------------------------------------- /benchmarks/dynamo/microbenchmarks/model.py: -------------------------------------------------------------------------------- 1 | # resnet50 layer shape 2 | resnet50_layers = ( 3 | # IN_H, IN_W, IN_C, KERNEL_H, KERNEL_W, KERNEL_N, stride, padding 4 | (224, 224, 3, 7, 7, 64, (2, 2), (0, 0)), 5 | # conv2_x 6 | (56, 56, 64, 1, 1, 64, (1, 1), (0, 0)), 7 | (56, 56, 64, 3, 3, 64, (1, 1), (0, 0)), 8 | (56, 56, 64, 1, 1, 256, (1, 1), (0, 0)), 9 | # conv3_x 10 | (56, 56, 256, 1, 1, 128, (2, 2), (0, 0)), 11 | (28, 28, 128, 3, 3, 128, (1, 1), (0, 0)), 12 | (28, 28, 128, 1, 1, 512, (1, 1), (0, 0)), 13 | # conv4_x 14 | (28, 28, 512, 1, 1, 256, (2, 2), (0, 0)), 15 | (14, 14, 256, 3, 3, 256, (1, 1), (0, 0)), 16 | (14, 14, 256, 1, 1, 1024, (1, 1), (0, 0)), 17 | # conv5_x 18 | (14, 14, 1024, 1, 1, 512, (2, 2), (0, 0)), 19 | (7, 7, 512, 3, 3, 512, (1, 1), (0, 0)), 20 | (7, 7, 512, 1, 1, 2048, (1, 1), (0, 0)), 21 | ) 22 | 23 | alexnet_layers = ( 24 | # IN_H, IN_W, IN_C, KERNEL_H, KERNEL_W, KERNEL_N, stride, padding 25 | (224, 224, 3, 11, 11, 64, (4, 4), (2, 2)), 26 | ) 27 | -------------------------------------------------------------------------------- /benchmarks/dynamo/microbenchmarks/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def rounded_linspace(low, high, steps, div): 7 | ret = torch.linspace(low, high, steps) 8 | ret = (ret.int() + div - 1) // div * div 9 | ret = torch.unique(ret) 10 | return list(map(int, ret)) 11 | 12 | 13 | def powspace(start, stop, pow, step): 14 | start = math.log(start, pow) 15 | stop = math.log(stop, pow) 16 | steps = int((stop - start + 1) // step) 17 | ret = torch.pow(pow, torch.linspace(start, stop, steps)) 18 | ret = torch.unique(ret) 19 | return list(map(int, ret)) 20 | -------------------------------------------------------------------------------- /benchmarks/dynamo/run_delta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | # Some QoL for people running this script on Meta servers 6 | if getent hosts fwdproxy; then 7 | export https_proxy=http://fwdproxy:8080 http_proxy=http://fwdproxy:8080 no_proxy=.fbcdn.net,.facebook.com,.thefacebook.com,.tfbnw.net,.fb.com,.fburl.com,.facebook.net,.sb.fbsbx.com,localhost 8 | fi 9 | 10 | WORK="$PWD" 11 | 12 | cd "$(dirname "$BASH_SOURCE")"/../.. 
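# This script lives in benchmarks/dynamo, so ../.. is the repo root (captured as ROOT below).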
13 | 14 | ROOT="$PWD" 15 | 16 | mkdir -p "$WORK/sweep/static" 17 | mkdir -p "$WORK/sweep/dynamic" 18 | 19 | (cd "$WORK/sweep/static" && "$ROOT/benchmarks/dynamo/run_all.sh" "$@") 20 | (cd "$WORK/sweep/dynamic" && "$ROOT/benchmarks/dynamo/run_all.sh" "$@" --dynamic-shapes) 21 | python benchmarks/dynamo/combine_csv.py "$WORK/sweep/static/final.csv" "$WORK/sweep/dynamic/final.csv" > "$WORK/delta.csv" 22 | gh gist create "$WORK/delta.csv" 23 | -------------------------------------------------------------------------------- /benchmarks/dynamo/timm_models_list_cpu.txt: -------------------------------------------------------------------------------- 1 | adv_inception_v3,128 2 | beit_base_patch16_224,64 3 | botnet26t_256,128 4 | cait_m36_384,4 5 | coat_lite_mini,32 6 | convit_base,64 7 | convmixer_768_32,2 8 | convnext_base,64 9 | crossvit_9_240,32 10 | cspdarknet53,64 11 | deit_base_distilled_patch16_224,64 12 | dm_nfnet_f0,128 13 | dpn107,32 14 | eca_botnext26ts_256,128 15 | eca_halonext26ts,128 16 | ese_vovnet19b_dw,128 17 | fbnetc_100,32 18 | fbnetv3_b,32 19 | gernet_l,128 20 | ghostnet_100,128 21 | gluon_inception_v3,128 22 | gluon_xception65,32 23 | gmixer_24_224,16 24 | gmlp_s16_224,128 25 | hrnet_w18,128 26 | inception_v3,128 27 | jx_nest_base,32 28 | lcnet_050,64 29 | mixer_b16_224,128 30 | mixnet_l,128 31 | mnasnet_100,32 32 | mobilenetv2_100,32 33 | mobilenetv3_large_100,32 34 | mobilevit_s,256 35 | nfnet_l0,128 36 | pit_b_224,64 37 | pnasnet5large,16 38 | poolformer_m36,64 39 | regnety_002,128 40 | repvgg_a2,128 41 | res2net101_26w_4s,64 42 | res2net50_14w_8s,128 43 | res2next50,128 44 | resmlp_12_224,128 45 | resnest101e,64 46 | rexnet_100,128 47 | sebotnet33ts_256,64 48 | selecsls42b,128 49 | spnasnet_100,32 50 | swin_base_patch4_window7_224,64 51 | swsl_resnext101_32x16d,32 52 | tf_efficientnet_b0,128 53 | tf_mixnet_l,32 54 | tinynet_a,128 55 | tnt_s_patch16_224,32 56 | twins_pcpvt_base,64 57 | visformer_small,128 58 | vit_base_patch16_224,64 59 | volo_d1_224,64 60 | xcit_large_24_p8_224,5 61 | -------------------------------------------------------------------------------- /benchmarks/dynamo/torchbench_models_list.txt: -------------------------------------------------------------------------------- 1 | BERT_pytorch,128 2 | Background_Matting, 16 3 | LearningToPaint,1024 4 | alexnet,1024 5 | dcgan,1024 6 | densenet121,64 7 | hf_Albert,32 8 | hf_Bart,16 9 | hf_Bert,16 10 | hf_GPT2,16 11 | hf_T5,4 12 | mnasnet1_0,256 13 | mobilenet_v2,128 14 | mobilenet_v3_large,256 15 | nvidia_deeprecommender,1024 16 | pytorch_unet,8 17 | resnet18,512 18 | resnet50,128 19 | resnext50_32x4d,128 20 | shufflenet_v2_x1_0,512 21 | squeezenet1_1,512 22 | timm_nfnet,256 23 | timm_efficientnet,128 24 | timm_regnet,128 25 | timm_resnest,256 26 | timm_vision_transformer,256 27 | timm_vovnet,128 28 | vgg16,128 29 | -------------------------------------------------------------------------------- /benchmarks/dynamo/torchbench_models_list_cpu.txt: -------------------------------------------------------------------------------- 1 | alexnet,128 2 | attention_is_all_you_need_pytorch,64 3 | BERT_pytorch,32 4 | dcgan,256 5 | densenet121,512 6 | dlrm,2048 7 | fastNLP_Bert,8 8 | functorch_dp_cifar10,1024 9 | hf_Albert,8 10 | hf_Bart,8 11 | hf_Bert,8 12 | hf_Bert_large,8 13 | hf_DistilBert,8 14 | hf_GPT2,8 15 | hf_GPT2_large,1 16 | hf_Longformer,4 17 | hf_Reformer,8 18 | hf_T5,4 19 | hf_T5_base,1 20 | hf_T5_large,1 21 | LearningToPaint,96 22 | lennard_jones,1024 23 | mnasnet1_0,32 24 | mobilenet_v2,16 25 | 
mobilenet_v3_large,32 26 | nvidia_deeprecommender,256 27 | phlippe_densenet,128 28 | phlippe_resnet,512 29 | pytorch_unet,4 30 | resnet152,32 31 | resnet18,256 32 | resnet50,256 33 | resnext50_32x4d,256 34 | shufflenet_v2_x1_0,64 35 | speech_transformer,1024 36 | squeezenet1_1,16 37 | Super_SloMo,1024 38 | timm_efficientnet,64 39 | timm_nfnet,128 40 | timm_regnet,32 41 | timm_resnest,32 42 | timm_vision_transformer,16 43 | timm_vision_transformer_large,8 44 | timm_vovnet,32 45 | tts_angular,1024 46 | vgg16,64 47 | vision_maskrcnn,1 48 | yolov3,32 49 | -------------------------------------------------------------------------------- /benchmarks/fastrnns/README.md: -------------------------------------------------------------------------------- 1 | # Fast RNN benchmarks 2 | 3 | Benchmarks for TorchScript models 4 | 5 | For most stable results, do the following: 6 | - Set CPU Governor to performance mode (as opposed to energy save) 7 | - Turn off turbo for all CPUs (assuming Intel CPUs) 8 | - Shield cpus via `cset shield` when running benchmarks. 9 | 10 | Some of these scripts accept command line args but most of them do not because 11 | I was lazy. They will probably be added sometime in the future, but the default 12 | sizes are pretty reasonable. 13 | 14 | ## Test fastrnns (fwd + bwd) correctness 15 | 16 | Test the fastrnns benchmarking scripts with the following: 17 | `python -m fastrnns.test` 18 | or run the test independently: 19 | `python -m fastrnns.test --rnns jit` 20 | 21 | ## Run benchmarks 22 | 23 | `python -m fastrnns.bench` 24 | 25 | should give a good comparison, or you can specify the type of model to run 26 | 27 | `python -m fastrnns.bench --rnns cudnn aten jit --group rnns` 28 | 29 | ## Run model profiling, calls nvprof 30 | 31 | `python -m fastrnns.profile` 32 | 33 | should generate nvprof file for all models somewhere. 34 | you can also specify the models to generate nvprof files separately: 35 | 36 | `python -m fastrnns.profile --rnns aten jit` 37 | 38 | ### Caveats 39 | 40 | Use Linux for the most accurate timing. A lot of these tests only run 41 | on CUDA. 
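As a concrete example of the stability checklist at the top of this README, the three steps might look like this on an Intel Linux box (core ids are machine-specific, and `cset` flag syntax varies slightly between versions, so treat this as a sketch):

```
# Pin all cores to the performance governor (requires cpupower).
sudo cpupower frequency-set -g performance

# Disable turbo when the intel_pstate driver is in use.
echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo

# Reserve cores 2-5, move kernel threads off them, then benchmark inside the shield.
sudo cset shield --cpu 2-5 --kthread=on
sudo cset shield --exec -- python -m fastrnns.bench
```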
42 | -------------------------------------------------------------------------------- /benchmarks/fastrnns/__init__.py: -------------------------------------------------------------------------------- 1 | from .cells import * # noqa: F403 2 | from .factory import * # noqa: F403 3 | 4 | # (output, next_state) = cell(input, state) 5 | seqLength = 100 6 | numLayers = 2 7 | inputSize = 512 8 | hiddenSize = 512 9 | miniBatch = 64 10 | -------------------------------------------------------------------------------- /benchmarks/fastrnns/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest # noqa: F401 2 | 3 | default_rnns = ['cudnn', 'aten', 'jit', 'jit_premul', 'jit_premul_bias', 'jit_simple', 4 | 'jit_multilayer', 'py'] 5 | default_cnns = ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit'] 6 | all_nets = default_rnns + default_cnns 7 | 8 | def pytest_generate_tests(metafunc): 9 | # This creates lists of tests to generate, can be customized 10 | if metafunc.cls.__name__ == "TestBenchNetwork": 11 | metafunc.parametrize('net_name', all_nets, scope="class") 12 | metafunc.parametrize("executor", [metafunc.config.getoption("executor")], scope="class") 13 | metafunc.parametrize("fuser", [metafunc.config.getoption("fuser")], scope="class") 14 | 15 | def pytest_addoption(parser): 16 | parser.addoption("--fuser", default="old", help="fuser to use for benchmarks") 17 | parser.addoption("--executor", default="legacy", help="executor to use for benchmarks") 18 | -------------------------------------------------------------------------------- /benchmarks/fastrnns/scratch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | @torch.jit.script 5 | def fn(x, scale, shift): 6 | return scale * x / shift 7 | 8 | 9 | @torch.jit.script 10 | def recurrent(x, scale, shift): 11 | y = x 12 | for i in range(100): 13 | y = fn(y, scale, shift) 14 | return y 15 | 16 | 17 | x = torch.randn(2, 2, device='cuda') 18 | scale = torch.randn(2, 2, device='cuda', requires_grad=True) 19 | shift = torch.randn(2, 2, device='cuda', requires_grad=True) 20 | inputs = [x, scale, shift] 21 | 22 | 23 | out = recurrent(x, scale, shift) 24 | recurrent.graph_for(x, scale, shift) 25 | 26 | 27 | import torch 28 | 29 | 30 | @torch.jit.script 31 | def recurrent_scaleshift(x, scale, shift): 32 | y = x 33 | for i in range(64): 34 | y = scale * y + shift 35 | return y 36 | 37 | 38 | x = torch.randn(2, 2, device='cuda') 39 | scale = torch.randn(2, 2, device='cuda', requires_grad=True) 40 | shift = torch.randn(2, 2, device='cuda', requires_grad=True) 41 | inputs = [x, scale, shift] 42 | out = recurrent_scaleshift(x, scale, shift) 43 | recurrent_scaleshift.graph_for(x, scale, shift) 44 | 45 | 46 | import torch 47 | x = torch.tensor([]) 48 | x.requires_grad = True 49 | x.mean().backward() # no error triggered 50 | x = x.cuda() 51 | x.mean().backward() 52 | -------------------------------------------------------------------------------- /benchmarks/framework_overhead_benchmark/SimpleAddModule.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils import NUM_LOOP_ITERS 3 | 4 | def add_tensors_loop(x, y): 5 | z = torch.add(x, y) 6 | for i in range(NUM_LOOP_ITERS): 7 | z = torch.add(z, x) 8 | return z 9 | 10 | class SimpleAddModule(torch.nn.Module): 11 | def __init__(self, add_op): 12 | super().__init__() 13 | self.add_op = add_op 14 | 15 | def forward(self, x, y): 16 | return 
self.add_op(x, y) 17 | -------------------------------------------------------------------------------- /benchmarks/framework_overhead_benchmark/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import namedtuple 3 | from torch.utils import ThroughputBenchmark 4 | 5 | NUM_LOOP_ITERS = 1000 6 | BenchmarkConfig = namedtuple('BenchmarkConfig', 'num_warmup_iters num_iters') 7 | ModuleConfig = namedtuple('ModuleConfig', 'pt_fn c2_op num_params graph_mode') 8 | 9 | def ms_to_us(time_ms): 10 | return (time_ms * 1e3) 11 | 12 | def secs_to_us(time_s): 13 | return (time_s * 1e6) 14 | 15 | def secs_to_ms(time_s): 16 | return (time_s * 1e3) 17 | 18 | def benchmark_using_throughput_benchmark(config, module): 19 | print("Benchmarking via ThroughputBenchmark") 20 | bench = ThroughputBenchmark(module.module) 21 | bench.add_input(*module.tensor_inputs) 22 | stats = bench.benchmark(1, config.num_warmup_iters, config.num_iters) 23 | return stats.latency_avg_ms / NUM_LOOP_ITERS 24 | 25 | def benchmark_module(config, module, use_throughput_benchmark=False): 26 | if use_throughput_benchmark: 27 | return benchmark_using_throughput_benchmark(config, module) 28 | module.forward(config.num_warmup_iters) 29 | print("Running module for {} iterations".format(config.num_iters)) 30 | start = time.time() 31 | module.forward(config.num_iters) 32 | end = time.time() 33 | time_elapsed_s = (end - start) 34 | return (secs_to_ms(time_elapsed_s) / config.num_iters / NUM_LOOP_ITERS) 35 | -------------------------------------------------------------------------------- /benchmarks/fuser/plot_speedups.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | 3 | df = pandas.read_csv("perf.csv") 4 | 5 | ops = pandas.unique(df["operator"]) 6 | nops = len(ops) 7 | pivot_op_shape = df.pivot_table( 8 | values="time", index=["operator", "shape"], columns=["fuser"] 9 | ) 10 | pivot_speedups = (pivot_op_shape.T / pivot_op_shape["eager"]).T 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | plt.rcParams["figure.figsize"] = (20, 100) 15 | fig, axs = plt.subplots(nops) 16 | plt.subplots_adjust(hspace=0.5) 17 | for idx, op in enumerate(ops): 18 | op_speedups = pivot_speedups.T[op].T 19 | op_speedups.plot(ax=axs[idx], kind="bar", ylim=(0, 2), rot=45) 20 | axs[idx].set_title(op) 21 | axs[idx].set_xlabel("") 22 | plt.savefig("perf.png") 23 | -------------------------------------------------------------------------------- /benchmarks/instruction_counts/applications/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/applications/__init__.py -------------------------------------------------------------------------------- /benchmarks/instruction_counts/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/core/__init__.py -------------------------------------------------------------------------------- /benchmarks/instruction_counts/definitions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/definitions/__init__.py 
-------------------------------------------------------------------------------- /benchmarks/instruction_counts/execution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/execution/__init__.py -------------------------------------------------------------------------------- /benchmarks/instruction_counts/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/worker/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/benchmark_all_other_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | from pt import ( # noqa: F401 3 | add_test, as_strided_test, batchnorm_test, binary_test, cat_test, 4 | channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test, 5 | fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test, 6 | softmax_test, hardsigmoid_test, hardswish_test, layernorm_test, 7 | groupnorm_test, interpolate_test, instancenorm_test, remainder_test, 8 | split_test, sum_test, tensor_to_test 9 | ) 10 | from pt import ( # noqa: F401 11 | ao_sparsifier_test 12 | ) 13 | 14 | if __name__ == "__main__": 15 | op_bench.benchmark_runner.main() 16 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/benchmark_all_quantized_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | from pt import ( # noqa: F401 3 | qactivation_test, 4 | qarithmetic_test, 5 | qbatchnorm_test, 6 | qcat_test, 7 | qcomparators_test, 8 | qconv_test, 9 | qgroupnorm_test, 10 | qinstancenorm_test, 11 | qinterpolate_test, 12 | qlayernorm_test, 13 | qlinear_test, 14 | qobserver_test, 15 | qpool_test, 16 | qrnn_test, 17 | qtensor_method_test, 18 | quantization_test, 19 | qunary_test, 20 | qembedding_pack_test, 21 | qembeddingbag_test, 22 | qatembedding_ops_test, 23 | ) 24 | 25 | 26 | if __name__ == "__main__": 27 | op_bench.benchmark_runner.main() 28 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/benchmark_all_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | from pt import ( # noqa: F401 3 | unary_test, 4 | ) 5 | import benchmark_all_other_test # noqa: F401 6 | import benchmark_all_quantized_test # noqa: F401 7 | 8 | if __name__ == "__main__": 9 | op_bench.benchmark_runner.main() 10 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/c2/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/c2/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/c2/replace_nan_test.py: -------------------------------------------------------------------------------- 1 | import benchmark_caffe2 as op_bench_c2 2 | import operator_benchmark as op_bench 3 | from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401 4 | from caffe2.python import core 5 | 6 | 7 | """Microbenchmarks for element-wise ReplaceNaN operator.""" 8 | 9 | # Configs for C2 ReplaceNaN operator 10 | replace_nan_long_configs = op_bench.cross_product_configs( 11 | M=[32, 64, 128], N=range(32, 128, 32), dtype=["float", "double"], tags=["long"] 12 | ) 13 | 14 | 15 | replace_nan_short_configs = op_bench.config_list( 16 | attrs=[ 17 | [16, 16, "float"], 18 | [16, 16, "double"], 19 | [64, 64, "float"], 20 | [64, 64, "double"], 21 | ], 22 | attr_names=["M", "N", "dtype"], 23 | tags=["short"], 24 | ) 25 | 26 | 27 | class ReplaceNaNBenchmark(op_bench_c2.Caffe2BenchmarkBase): 28 | def init(self, M, N, dtype): 29 | self.input = self.tensor([M, N], dtype) 30 | self.set_module_name("replace_nan") 31 | 32 | def forward(self): 33 | op = core.CreateOperator("ReplaceNaN", self.input, self.input, value=1.0) 34 | return op 35 | 36 | 37 | op_bench_c2.generate_c2_test( 38 | replace_nan_long_configs + replace_nan_short_configs, ReplaceNaNBenchmark 39 | ) 40 | 41 | 42 | if __name__ == "__main__": 43 | op_bench.benchmark_runner.main() 44 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/common/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/add_ops_list_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | # Configs for pointwise unary ops 6 | unary_ops_configs = op_bench.config_list( 7 | attrs=[ 8 | [128, 128], 9 | ], 10 | attr_names=["M", "N"], 11 | tags=["short"] 12 | ) 13 | 14 | 15 | unary_ops_list = op_bench.op_list( 16 | attr_names=["op_name", "op_func"], 17 | attrs=[ 18 | ["abs", torch.abs], 19 | ["acos", torch.acos], 20 | ], 21 | ) 22 | 23 | 24 | class UnaryOpBenchmark(op_bench.TorchBenchmarkBase): 25 | def init(self, M, N, op_func): 26 | self.input_one = torch.rand(M, N) 27 | self.op_func = op_func 28 | 29 | def forward(self): 30 | return self.op_func(self.input_one) 31 | 32 | 33 | op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark) 34 | 35 | 36 | if __name__ == "__main__": 37 | op_bench.benchmark_runner.main() 38 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/jit_forward_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | intraop_bench_configs = op_bench.config_list( 5 | attrs=[ 6 | [8, 16], 7 | ], 8 | attr_names=["M", "N"], 9 | tags=["short"], 10 | ) 11 | 12 | @torch.jit.script 13 | def torch_sumall(a, iterations): 14 | # type: (Tensor, int) 15 | 
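    # Each pass sums all of a, then nudges a[0][0] so no two iterations see identical data.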
result = 0.0 16 | for _ in range(iterations): 17 | result += float(torch.sum(a)) 18 | a[0][0] += 0.01 19 | return result 20 | 21 | 22 | class TorchSumBenchmark(op_bench.TorchBenchmarkBase): 23 | def init(self, M, N): 24 | self.input_one = torch.rand(M, N) 25 | self.set_module_name("sum") 26 | 27 | # This is a very temporary method and will be removed soon, so 28 | # don't use this method in your benchmark 29 | # TODO(mingzhe): use one forward method for both JIT and Eager 30 | def jit_forward(self, iters): 31 | return torch_sumall(self.input_one, iters) 32 | 33 | op_bench.generate_pt_test(intraop_bench_configs, TorchSumBenchmark) 34 | 35 | 36 | if __name__ == "__main__": 37 | op_bench.benchmark_runner.main() 38 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/pt_backward_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | add_configs = op_bench.cross_product_configs( 6 | M=[8, 1], 7 | N=[8, 2], 8 | K=[8, 4], 9 | tags=["short"] 10 | ) 11 | 12 | # This benchmark uses the auto_set to automatically set requires_grad 13 | # for both inputs. The test name can also be used for filtering. 14 | class AddBenchmark(op_bench.TorchBenchmarkBase): 15 | def init(self, M, N, K): 16 | self.input_one = torch.rand(M, N, K, requires_grad=self.auto_set()) 17 | self.input_two = torch.rand(M, N, K, requires_grad=self.auto_set()) 18 | self.set_module_name("add") 19 | 20 | def forward(self): 21 | return torch.add(self.input_one, self.input_two) 22 | 23 | 24 | op_bench.generate_pt_test(add_configs, AddBenchmark) 25 | op_bench.generate_pt_gradient_test(add_configs, AddBenchmark) 26 | 27 | 28 | if __name__ == "__main__": 29 | op_bench.benchmark_runner.main() 30 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/pt_configs_list_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | """Microbenchmarks for element-wise Add operator. 
Supports both Caffe2/PyTorch.""" 5 | 6 | add_short_configs = op_bench.config_list( 7 | attr_names=['M', 'N', 'K'], 8 | attrs=[ 9 | [8, 16, 32], 10 | [16, 16, 64], 11 | [64, 64, 128], 12 | ], 13 | cross_product_configs={ 14 | 'device': ['cpu', 'cuda'], 15 | 'dtype': [torch.float, torch.float64], 16 | }, 17 | tags=['short'], 18 | ) 19 | 20 | 21 | class AddBenchmark(op_bench.TorchBenchmarkBase): 22 | def init(self, M, N, K, device, dtype): 23 | self.input_one = torch.rand(M, N, K, device=device, dtype=dtype, requires_grad=True) 24 | self.input_two = torch.rand(M, N, K, device=device, dtype=dtype) 25 | self.set_module_name('add') 26 | 27 | def forward(self): 28 | return torch.add(self.input_one, self.input_two) 29 | 30 | 31 | op_bench.generate_pt_test(add_short_configs, AddBenchmark) 32 | 33 | 34 | if __name__ == "__main__": 35 | op_bench.benchmark_runner.main() 36 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/pt_cpu_gpu_forward_backward_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | add_configs = op_bench.cross_product_configs( 6 | M=[8], 7 | N=[8], 8 | K=[8], 9 | device=["cuda", "cpu"], 10 | tags=["short"] 11 | ) 12 | 13 | 14 | class AddBenchmark(op_bench.TorchBenchmarkBase): 15 | def init(self, M, N, K, device): 16 | self.input_one = torch.rand(M, N, K, device=device, requires_grad=True) 17 | self.input_two = torch.rand(M, N, K, device=device, requires_grad=True) 18 | self.set_module_name("add") 19 | 20 | def forward(self): 21 | return torch.add(self.input_one, self.input_two) 22 | 23 | 24 | op_bench.generate_pt_test(add_configs, AddBenchmark) 25 | op_bench.generate_pt_gradient_test(add_configs, AddBenchmark) 26 | 27 | 28 | if __name__ == "__main__": 29 | op_bench.benchmark_runner.main() 30 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/random_sample_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | configs = op_bench.random_sample_configs( 6 | M=[1, 2, 3, 4, 5, 6], 7 | N=[7, 8, 9, 10, 11, 12], 8 | K=[13, 14, 15, 16, 17, 18], 9 | # probs saves the weights of each value 10 | probs=op_bench.attr_probs( 11 | M=[0.5, 0.2, 0.1, 0.05, 0.03, 0.1], 12 | N=[0.1, 0.3, 0.4, 0.02, 0.03, 0.04], 13 | K=[0.03, 0.6, 0.04, 0.02, 0.03, 0.01], 14 | ), 15 | # this is the number of returned inputs 16 | total_samples=10, 17 | tags=["short"], 18 | ) 19 | 20 | 21 | class AddBenchmark(op_bench.TorchBenchmarkBase): 22 | def init(self, M, N, K): 23 | self.input_one = torch.rand(M, N, K) 24 | self.input_two = torch.rand(M, N, K) 25 | self.set_module_name("add") 26 | 27 | def forward(self): 28 | return torch.add(self.input_one, self.input_two) 29 | 30 | 31 | op_bench.generate_pt_test(configs, AddBenchmark) 32 | 33 | 34 | if __name__ == "__main__": 35 | op_bench.benchmark_runner.main() 36 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/operator_benchmark.py: -------------------------------------------------------------------------------- 1 | # TODO (mingzhe09088): get rid of noqa 2 | import benchmark_runner # noqa: F401 3 | from benchmark_pytorch import TorchBenchmarkBase # noqa: F401 4 | from benchmark_test_generator import * # noqa: F401,F403 5 | from benchmark_utils import 
* # noqa: F401,F403 6 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/pt/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/bmm_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | """Microbenchmarks for the bmm and matmul operators.""" 5 | 6 | class BmmBenchmark(op_bench.TorchBenchmarkBase): 7 | def init(self, B, M, N, K, device, op): 8 | self.inputs = { 9 | "batch1": torch.rand((B, M, K), device=device, requires_grad=self.auto_set()), 10 | "batch2": torch.rand((B, K, N), device=device, requires_grad=self.auto_set()) 11 | } 12 | self.set_module_name(f"bmm (actual op={op})") 13 | self.op = torch.bmm if op == "bmm" else torch.matmul 14 | 15 | def forward(self, batch1, batch2): 16 | return self.op(batch1, batch2) 17 | 18 | bmm_configs = op_bench.cross_product_configs( 19 | B=[2, 100], 20 | M=[8, 256], 21 | N=[256, 16], 22 | K=[16, 32], 23 | device=['cpu'], 24 | tags=["short"], 25 | op=["bmm", "matmul"], 26 | ) 27 | 28 | op_bench.generate_pt_test(bmm_configs, BmmBenchmark) 29 | 30 | if __name__ == "__main__": 31 | op_bench.benchmark_runner.main() 32 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/chunk_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | """Microbenchmarks for Chunk operator""" 6 | 7 | 8 | # Configs for PT Chunk operator 9 | chunk_short_configs = op_bench.config_list( 10 | attr_names=["M", "N", "chunks"], 11 | attrs=[ 12 | [8, 8, 2], 13 | [256, 512, 2], 14 | [512, 512, 2], 15 | ], 16 | cross_product_configs={ 17 | 'device': ['cpu', 'cuda'], 18 | }, 19 | tags=["short"], 20 | ) 21 | 22 | chunks_long_configs = op_bench.cross_product_configs( 23 | M=[128, 1024], 24 | N=[128, 1024], 25 | chunks=[2, 4], 26 | device=['cpu', 'cuda'], 27 | tags=['long'] 28 | ) 29 | 30 | 31 | class ChunkBenchmark(op_bench.TorchBenchmarkBase): 32 | def init(self, M, N, chunks, device): 33 | self.inputs = { 34 | "input_one": torch.rand(M, N, device=device), 35 | "chunks": chunks 36 | } 37 | self.set_module_name("chunk") 38 | 39 | def forward(self, input_one, chunks: int): 40 | return torch.chunk(input_one, chunks) 41 | 42 | 43 | op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs, 44 | ChunkBenchmark) 45 | 46 | 47 | if __name__ == "__main__": 48 | op_bench.benchmark_runner.main() 49 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/diag_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | """Microbenchmarks for diag operator""" 6 | 7 | 8 | # Configs for PT diag operator 9 | diag_configs_short = op_bench.config_list( 10 | attr_names=['dim', 'M', 'N', 'diagonal', 'out'], 11 | attrs=[ 12 | [1, 64, 64, 0, True], 13 | [2, 128, 128, -10, False], 14 | [1, 256, 256, 20, True], 15 | ], 16 | cross_product_configs={ 17 | 'device': ['cpu', 'cuda'], 18 | }, 19 | tags=['short'], 20 |
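# Note: each row of `attrs` above is combined with every entry of
# `cross_product_configs`, so these three (dim, M, N, diagonal, out) rows
# expand to six benchmark cases (three rows x two devices).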
) 21 | 22 | 23 | class DiagBenchmark(op_bench.TorchBenchmarkBase): 24 | def init(self, dim, M, N, diagonal, out, device): 25 | self.inputs = { 26 | "input": torch.rand(M, N, device=device) if dim == 2 else torch.rand(M, device=device), 27 | "diagonal": diagonal, 28 | "out": out, 29 | "out_tensor": torch.tensor((),) 30 | } 31 | self.set_module_name('diag') 32 | 33 | def forward(self, input, diagonal: int, out: bool, out_tensor): 34 | if out: 35 | return torch.diag(input, diagonal=diagonal, out=out_tensor) 36 | else: 37 | return torch.diag(input, diagonal=diagonal) 38 | 39 | 40 | op_bench.generate_pt_test(diag_configs_short, DiagBenchmark) 41 | 42 | 43 | if __name__ == "__main__": 44 | op_bench.benchmark_runner.main() 45 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/fill_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | from torch.testing._internal.common_device_type import get_all_device_types 5 | 6 | """Microbenchmark for Fill_ operator.""" 7 | 8 | fill_short_configs = op_bench.config_list( 9 | attr_names=["N"], 10 | attrs=[ 11 | [1], 12 | [1024], 13 | [2048], 14 | ], 15 | cross_product_configs={ 16 | 'device': ['cpu', 'cuda'], 17 | 'dtype': [torch.int32], 18 | }, 19 | tags=["short"], 20 | ) 21 | 22 | fill_long_configs = op_bench.cross_product_configs( 23 | N=[10, 1000], 24 | device=get_all_device_types(), 25 | dtype=[torch.bool, torch.int8, torch.uint8, torch.int16, torch.int32, 26 | torch.int64, torch.half, torch.float, torch.double], 27 | tags=["long"] 28 | ) 29 | 30 | 31 | class Fill_Benchmark(op_bench.TorchBenchmarkBase): 32 | def init(self, N, device, dtype): 33 | self.inputs = { 34 | "input_one": torch.zeros(N, device=device).type(dtype) 35 | } 36 | self.set_module_name("fill_") 37 | 38 | def forward(self, input_one): 39 | return input_one.fill_(10) 40 | 41 | 42 | op_bench.generate_pt_test(fill_short_configs + fill_long_configs, 43 | Fill_Benchmark) 44 | 45 | 46 | if __name__ == "__main__": 47 | op_bench.benchmark_runner.main() 48 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/gelu_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | 5 | 6 | """ 7 | Microbenchmarks for the gelu operators. 
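The file can also be run on its own to execute only these configs, e.g.
`python -m pt.gelu_test` from benchmarks/operator_benchmark (shown as a
typical invocation; the accepted flags come from benchmark_runner).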
8 | """ 9 | 10 | gelu_configs_long = op_bench.cross_product_configs( 11 | N=[1, 4], 12 | C=[3], 13 | H=[16, 256], 14 | W=[16, 256], 15 | device=['cpu'], 16 | tags=['long'] 17 | ) 18 | 19 | 20 | class GeluBenchmark(op_bench.TorchBenchmarkBase): 21 | def init(self, N, C, H, W, device): 22 | self.inputs = { 23 | "input": torch.rand(N, C, H, W, device=device) 24 | } 25 | 26 | def forward(self, input): 27 | return torch.nn.functional.gelu(input) 28 | 29 | 30 | op_bench.generate_pt_test(gelu_configs_long, GeluBenchmark) 31 | 32 | 33 | if __name__ == "__main__": 34 | op_bench.benchmark_runner.main() 35 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/groupnorm_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | """Microbenchmarks for groupnorm operator.""" 8 | 9 | groupnorm_configs_short = op_bench.cross_product_configs( 10 | dims=( 11 | (32, 8, 16), 12 | (32, 8, 56, 56), 13 | ), 14 | num_groups=(2, 4), 15 | tags=["short"], 16 | ) 17 | 18 | 19 | class GroupNormBenchmark(op_bench.TorchBenchmarkBase): 20 | def init(self, dims, num_groups): 21 | num_channels = dims[1] 22 | self.inputs = { 23 | "input": (torch.rand(*dims) - 0.5) * 256, 24 | "num_groups": num_groups, 25 | "weight": torch.rand(num_channels, dtype=torch.float), 26 | "bias": torch.rand(num_channels, dtype=torch.float), 27 | "eps": 1e-5 28 | } 29 | 30 | def forward(self, input, num_groups: int, weight, bias, eps: float): 31 | return F.group_norm( 32 | input, num_groups, weight=weight, bias=bias, eps=eps) 33 | 34 | 35 | op_bench.generate_pt_test(groupnorm_configs_short, GroupNormBenchmark) 36 | 37 | 38 | if __name__ == "__main__": 39 | op_bench.benchmark_runner.main() 40 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/instancenorm_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | """Microbenchmarks for instancenorm operator.""" 8 | 9 | instancenorm_configs_short = op_bench.cross_product_configs( 10 | dims=( 11 | (32, 8, 16), 12 | (32, 8, 56, 56), 13 | ), 14 | tags=["short"], 15 | ) 16 | 17 | 18 | class InstanceNormBenchmark(op_bench.TorchBenchmarkBase): 19 | def init(self, dims): 20 | num_channels = dims[1] 21 | self.inputs = { 22 | "input": (torch.rand(*dims) - 0.5) * 256, 23 | "weight": torch.rand(num_channels, dtype=torch.float), 24 | "bias": torch.rand(num_channels, dtype=torch.float), 25 | "eps": 1e-5 26 | } 27 | 28 | def forward(self, input, weight, bias, eps: float): 29 | return F.instance_norm( 30 | input, weight=weight, bias=bias, eps=eps) 31 | 32 | 33 | op_bench.generate_pt_test(instancenorm_configs_short, InstanceNormBenchmark) 34 | 35 | 36 | if __name__ == "__main__": 37 | op_bench.benchmark_runner.main() 38 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/layernorm_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | """Microbenchmarks for layernorm operator.""" 8 | 9 | layernorm_configs_short = op_bench.cross_product_configs( 10 | dims=( 11 | (1, 8, 16), 12 | (8, 8, 16), 13 | (32, 8, 
16), 14 | (64, 128, 56, 56), 15 | ), 16 | tags=["short"], 17 | ) 18 | 19 | 20 | class LayerNormBenchmark(op_bench.TorchBenchmarkBase): 21 | def init(self, dims): 22 | input = (torch.rand(*dims) - 0.5) * 256 23 | self.inputs = { 24 | "input": input, 25 | "weight": torch.rand(*input.size()[1:], dtype=torch.float), 26 | "bias": torch.rand(*input.size()[1:], dtype=torch.float), 27 | "eps": 1e-5 28 | } 29 | 30 | def forward(self, input, weight, bias, eps: float): 31 | return F.layer_norm( 32 | input, input.size()[1:], weight=weight, bias=bias, eps=eps) 33 | 34 | 35 | op_bench.generate_pt_test(layernorm_configs_short, LayerNormBenchmark) 36 | 37 | 38 | if __name__ == "__main__": 39 | op_bench.benchmark_runner.main() 40 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/linear_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | import torch.nn as nn 5 | 6 | from pt import configs 7 | 8 | 9 | """Microbenchmarks for Linear operator.""" 10 | 11 | 12 | class LinearBenchmark(op_bench.TorchBenchmarkBase): 13 | def init(self, N, IN, OUT, device): 14 | self.inputs = { 15 | "input_one": torch.rand(N, IN, device=device) 16 | } 17 | self.linear = nn.Linear(IN, OUT).to(device=device) 18 | self.set_module_name("linear") 19 | 20 | def forward(self, input_one): 21 | return self.linear(input_one) 22 | 23 | 24 | op_bench.generate_pt_test(configs.linear_configs_short + configs.linear_configs_long, 25 | LinearBenchmark) 26 | 27 | 28 | if __name__ == "__main__": 29 | op_bench.benchmark_runner.main() 30 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/split_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | """Microbenchmarks for Split operator""" 6 | 7 | 8 | # Configs for PT Split operator 9 | split_configs_short = op_bench.config_list( 10 | attr_names=["M", "N", "parts"], 11 | attrs=[ 12 | [8, 8, 2], 13 | [256, 512, 2], 14 | [512, 512, 2], 15 | ], 16 | cross_product_configs={ 17 | 'device': ['cpu', 'cuda'], 18 | }, 19 | tags=["short"], 20 | ) 21 | 22 | split_configs_long = op_bench.cross_product_configs( 23 | M=[128, 1024], 24 | N=[128, 1024], 25 | parts=[2, 4], 26 | device=['cpu', 'cuda'], 27 | tags=['long'] 28 | ) 29 | 30 | 31 | class SplitBenchmark(op_bench.TorchBenchmarkBase): 32 | def init(self, M, N, parts, device): 33 | self.inputs = { 34 | "input": torch.rand(M, N, device=device), 35 | "split_size": int(M * N / parts) 36 | } 37 | self.set_module_name('split') 38 | 39 | def forward(self, input, split_size: int): 40 | return torch.split(input, split_size) 41 | 42 | 43 | op_bench.generate_pt_test(split_configs_short + split_configs_long, 44 | SplitBenchmark) 45 | 46 | 47 | if __name__ == "__main__": 48 | op_bench.benchmark_runner.main() 49 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt_extension/extension.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <torch/script.h> 3 | 4 | using torch::List; 5 | using torch::Tensor; 6 | 7 | Tensor consume(Tensor a) { 8 | return a; 9 | } 10 | 11 | List<Tensor> consume_list(List<Tensor> a) { 12 | return a; 13 | } 14 | 15 | // When JIT tracing is used on a function with a constant for loop, 16 | // the for loop is optimized away because
of dead code elimination. 17 | // That caused an issue for our op benchmark which needs to run an op 18 | // in a loop and report the execution time. This diff resolves that issue by 19 | // registering this consume op with correct alias information which is DEFAULT. 20 | TORCH_LIBRARY_FRAGMENT(operator_benchmark, m) { 21 | m.def("_consume", &consume); 22 | m.def("_consume.list", &consume_list); 23 | } 24 | 25 | PYBIND11_MODULE(benchmark_cpp_extension, m) { 26 | m.def("_consume", &consume, "consume"); 27 | m.def("_consume_list", &consume_list, "consume_list"); 28 | } 29 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt_extension/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import CppExtension, BuildExtension 3 | 4 | setup(name='benchmark_cpp_extension', 5 | ext_modules=[CppExtension('benchmark_cpp_extension', ['extension.cpp'])], 6 | cmdclass={'build_ext': BuildExtension}) 7 | -------------------------------------------------------------------------------- /benchmarks/overrides_benchmark/common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | NUM_REPEATS = 1000 4 | NUM_REPEAT_OF_REPEATS = 1000 5 | 6 | 7 | class SubTensor(torch.Tensor): 8 | pass 9 | 10 | 11 | class WithTorchFunction: 12 | def __init__(self, data, requires_grad=False): 13 | if isinstance(data, torch.Tensor): 14 | self._tensor = data 15 | return 16 | 17 | self._tensor = torch.tensor(data, requires_grad=requires_grad) 18 | 19 | @classmethod 20 | def __torch_function__(cls, func, types, args=(), kwargs=None): 21 | if kwargs is None: 22 | kwargs = {} 23 | 24 | return WithTorchFunction(args[0]._tensor + args[1]._tensor) 25 | 26 | 27 | class SubWithTorchFunction(torch.Tensor): 28 | @classmethod 29 | def __torch_function__(cls, func, types, args=(), kwargs=None): 30 | if kwargs is None: 31 | kwargs = {} 32 | 33 | return super().__torch_function__(func, types, args, kwargs) 34 | -------------------------------------------------------------------------------- /benchmarks/overrides_benchmark/pyspybench.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | from common import SubTensor, WithTorchFunction, SubWithTorchFunction # noqa: F401 4 | 5 | Tensor = torch.tensor 6 | 7 | NUM_REPEATS = 1000000 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser( 11 | description="Run the torch.add for a given class a given number of times." 12 | ) 13 | parser.add_argument( 14 | "tensor_class", metavar="TensorClass", type=str, help="The class to benchmark." 15 | ) 16 | parser.add_argument( 17 | "--nreps", "-n", type=int, default=NUM_REPEATS, help="The number of repeats." 
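# The large default (NUM_REPEATS = 1,000,000 iterations) keeps the loop
# running long enough for a sampling profiler such as py-spy (which this
# script appears to be named for) to collect a stable profile.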
18 | ) 19 | args = parser.parse_args() 20 | 21 | TensorClass = globals()[args.tensor_class] 22 | NUM_REPEATS = args.nreps 23 | 24 | t1 = TensorClass([1.]) 25 | t2 = TensorClass([2.]) 26 | 27 | for _ in range(NUM_REPEATS): 28 | torch.add(t1, t2) 29 | -------------------------------------------------------------------------------- /benchmarks/profiler_benchmark/resnet_memory_profiler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.models as models 3 | 4 | import torch.autograd.profiler as profiler 5 | 6 | for with_cuda in [False, True]: 7 | model = models.resnet18() 8 | inputs = torch.randn(5, 3, 224, 224) 9 | sort_key = "self_cpu_memory_usage" 10 | if with_cuda and torch.cuda.is_available(): 11 | model = model.cuda() 12 | inputs = inputs.cuda() 13 | sort_key = "self_cuda_memory_usage" 14 | print("Profiling CUDA Resnet model") 15 | else: 16 | print("Profiling CPU Resnet model") 17 | 18 | with profiler.profile(profile_memory=True, record_shapes=True) as prof: 19 | with profiler.record_function("root"): 20 | model(inputs) 21 | 22 | print(prof.key_averages(group_by_input_shape=True).table(sort_by=sort_key, row_limit=-1)) 23 | -------------------------------------------------------------------------------- /benchmarks/serialization/nested_annotation_str.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.benchmark as benchmark 3 | 4 | MEMO = {} 5 | def create_nested_dict_type(layers): 6 | if layers == 0: 7 | return torch._C.StringType.get() 8 | if layers not in MEMO: 9 | less_nested = create_nested_dict_type(layers - 1) 10 | result = torch._C.DictType(torch._C.StringType.get(), torch._C.TupleType([less_nested, less_nested])) 11 | MEMO[layers] = result 12 | return MEMO[layers] 13 | 14 | 15 | nesting_levels = (1, 3, 5, 10) 16 | types = (reasonable, medium, big, huge) = [create_nested_dict_type(x) for x in nesting_levels] 17 | 18 | timers = [benchmark.Timer(stmt='x.annotation_str', globals={'x': nested_type}) for nested_type in types] 19 | 20 | for nesting_level, typ, timer in zip(nesting_levels, types, timers): 21 | print("Nesting level:", nesting_level) 22 | print("output:", typ.annotation_str[:70]) 23 | print(timer.blocked_autorange()) 24 | -------------------------------------------------------------------------------- /benchmarks/serialization/simple_measurement.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pyarkbench import Benchmark, Timer, default_args 3 | 4 | use_new = True 5 | 6 | class Basic(Benchmark): 7 | def benchmark(self): 8 | x = [torch.ones(200, 200) for i in range(30)] 9 | with Timer() as big1: 10 | torch.save(x, "big_tensor.zip", _use_new_zipfile_serialization=use_new) 11 | 12 | with Timer() as big2: 13 | v = torch.load("big_tensor.zip") 14 | 15 | x = [torch.ones(10, 10) for i in range(200)] 16 | with Timer() as small1: 17 | torch.save(x, "small_tensor.zip", _use_new_zipfile_serialization=use_new) 18 | 19 | with Timer() as small2: 20 | v = torch.load("small_tensor.zip") 21 | 22 | return { 23 | "Big Tensors Save": big1.ms_duration, 24 | "Big Tensors Load": big2.ms_duration, 25 | "Small Tensors Save": small1.ms_duration, 26 | "Small Tensors Load": small2.ms_duration, 27 | } 28 | 29 | if __name__ == '__main__': 30 | bench = Basic(*default_args.bench()) 31 | print("Use zipfile serialization:", use_new) 32 | results = bench.run() 33 | bench.print_stats(results, 
stats=['mean', 'median']) 34 | -------------------------------------------------------------------------------- /benchmarks/sparse/README.md: -------------------------------------------------------------------------------- 1 | # Sparse benchmarks 2 | 3 | These benchmarks cover the sparse matrix functionality. They exist for 4 | comparing the performance of sparse matrix routines such as SpMV across various 5 | sparse matrix formats and against other frameworks such as TensorFlow. 6 | -------------------------------------------------------------------------------- /benchmarks/sparse/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == "__main__": 3 | pass 4 | -------------------------------------------------------------------------------- /benchmarks/sparse/dlmc/README.md: -------------------------------------------------------------------------------- 1 | # Sparse benchmarks 2 | 3 | These benchmarks exercise the sparse matrix functionality using a popular real dataset collection called the Deep Learning Matrix Collection (DLMC), which was used in recent studies [1, 2]. 4 | 5 | Performance benchmark scripts for matrix-matrix and matrix-vector ops (dense-sparse, sparse-sparse, and dense-dense for comparison) are implemented here. 6 | 7 | - `matmul_bench.py` with `--operation sparse@sparse|sparse@dense` benchmarks sparse matrix-matrix multiplication (SPMM). It can run in forward and backward mode with `--backward-test`, on CPU or CUDA with `--with-cuda`, using different datasets from the DLMC collection. For more details see the `test.sh` file. 8 | 9 | - `matmul_bench.py` with `--operation sparse@vector` benchmarks sparse matrix-vector multiplication (SPMV). 10 | 11 | References: 12 | 13 | 1. Trevor Gale, Matei Zaharia, Cliff Young, Erich Elsen. Sparse GPU Kernels for Deep Learning. Proceedings of the International Conference for High Performance Computing, 2020. https://github.com/google-research/google-research/tree/master/sgk 14 | 15 | 2. Trevor Gale, Erich Elsen, Sara Hooker. The State of Sparsity in Deep Neural Networks. https://github.com/google-research/google-research/tree/master/state_of_sparsity 16 | -------------------------------------------------------------------------------- /benchmarks/sparse/dlmc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == "__main__": 3 | pass 4 | -------------------------------------------------------------------------------- /benchmarks/sparse/test_csr.sh: -------------------------------------------------------------------------------- 1 | OUTFILE=spmm-no-mkl-test.txt 2 | PYTORCH_HOME=$1 3 | 4 | cd $PYTORCH_HOME 5 | 6 | echo "" >> $OUTFILE 7 | echo "----- USE_MKL=1 -----" >> $OUTFILE 8 | rm -rf build 9 | 10 | export USE_MKL=1 11 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} 12 | python setup.py build --cmake-only 13 | ccmake build # or cmake-gui build 14 | 15 | python setup.py install 16 | 17 | cd benchmarks 18 | echo "!! SPARSE SPMM TIME BENCHMARK!!
" >> $OUTFILE 19 | for dim0 in 1000 5000 10000; do 20 | for nnzr in 0.01 0.05 0.1 0.3; do 21 | python -m sparse.spmm --format csr --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE 22 | # python -m sparse.spmm --format coo --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE 23 | done 24 | done 25 | echo "----------------------" >> $OUTFILE 26 | 27 | cd $PYTORCH_HOME 28 | echo "----- USE_MKL=0 ------" >> $OUTFILE 29 | rm -rf build 30 | 31 | export USE_MKL=0 32 | python setup.py install 33 | 34 | cd benchmarks 35 | for dim0 in 1000 5000 10000; do 36 | for nnzr in 0.01 0.05 0.1 0.3; do 37 | python -m sparse.spmv --format csr --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE 38 | python -m sparse.spmv --format coo --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE 39 | done 40 | done 41 | echo "----------------------" >> $OUTFILE 42 | -------------------------------------------------------------------------------- /benchmarks/static_runtime/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc) 2 | list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt_bench.cc) 3 | set(STATIC_RUNTIME_BENCHMARK_SRCS ${STATIC_RUNTIME_BENCHMARK_SRCS} PARENT_SCOPE) 4 | 5 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc) 6 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_utils.cc) 7 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_runtime.cc) 8 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_module.cc) 9 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_generated_ops.cc) 10 | set(STATIC_RUNTIME_TEST_SRCS ${STATIC_RUNTIME_TEST_SRCS} PARENT_SCOPE) 11 | -------------------------------------------------------------------------------- /benchmarks/tensorexpr/HowToRun.md: -------------------------------------------------------------------------------- 1 | From the root of pytorch repo, run: 2 | ``` 3 | python -m benchmarks.tensorexpr --help 4 | ``` 5 | to show documentation. 6 | 7 | An example of an actual command line that one might use as a starting point: 8 | ``` 9 | python -m benchmarks.tensorexpr --device gpu --mode fwd --jit-mode trace --cuda-fuser=te 10 | ``` 11 | -------------------------------------------------------------------------------- /benchmarks/tensorexpr/nnc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/tensorexpr/nnc.png -------------------------------------------------------------------------------- /benchmarks/tensorexpr/tensor_engine.py: -------------------------------------------------------------------------------- 1 | tensor_engine = None 2 | 3 | 4 | def unsupported(func): 5 | def wrapper(self): 6 | return func(self) 7 | 8 | wrapper.is_supported = False 9 | return wrapper 10 | 11 | 12 | def is_supported(method): 13 | if hasattr(method, "is_supported"): 14 | return method.is_supported 15 | return True 16 | 17 | 18 | def set_engine_mode(mode): 19 | global tensor_engine 20 | if mode == "tf": 21 | from . import tf_engine 22 | 23 | tensor_engine = tf_engine.TensorFlowEngine() 24 | elif mode == "pt": 25 | from . import pt_engine 26 | 27 | tensor_engine = pt_engine.TorchTensorEngine() 28 | elif mode == "topi": 29 | from . 
import topi_engine 30 | 31 | tensor_engine = topi_engine.TopiEngine() 32 | elif mode == "relay": 33 | from . import relay_engine 34 | 35 | tensor_engine = relay_engine.RelayEngine() 36 | elif mode == "nnc": 37 | from . import nnc_engine 38 | 39 | tensor_engine = nnc_engine.NncEngine() 40 | else: 41 | raise ValueError("invalid tensor engine mode: %s" % (mode)) 42 | tensor_engine.mode = mode 43 | 44 | 45 | def get_engine(): 46 | if tensor_engine is None: 47 | raise ValueError("use of get_engine, before calling set_engine_mode is illegal") 48 | return tensor_engine 49 | -------------------------------------------------------------------------------- /binaries/caffe2_benchmark.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "binaries/benchmark_args.h" 6 | #include "binaries/benchmark_helper.h" 7 | 8 | 9 | int main(int argc, char** argv) { 10 | caffe2::GlobalInit(&argc, &argv); 11 | benchmark( 12 | argc, 13 | argv, 14 | FLAGS_backend, 15 | FLAGS_init_net, 16 | FLAGS_input, 17 | FLAGS_input_dims, 18 | FLAGS_input_file, 19 | FLAGS_input_type, 20 | FLAGS_iter, 21 | FLAGS_measure_memory, 22 | FLAGS_net, 23 | FLAGS_output, 24 | FLAGS_output_folder, 25 | FLAGS_run_individual, 26 | FLAGS_sleep_before_run, 27 | FLAGS_sleep_between_iteration, 28 | FLAGS_sleep_between_net_and_operator, 29 | FLAGS_text_output, 30 | FLAGS_warmup, 31 | FLAGS_wipe_cache); 32 | } 33 | -------------------------------------------------------------------------------- /binaries/lite_interpreter_model_load.cc: -------------------------------------------------------------------------------- 1 | #include "ATen/ATen.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "torch/script.h" 8 | 9 | C10_DEFINE_string(model, "", "The given bytecode model to check if it is supported by lite_interpreter."); 10 | 11 | int main(int argc, char** argv) { 12 | c10::SetUsageMessage( 13 | "Check if exported bytecode model is runnable by lite_interpreter.\n" 14 | "Example usage:\n" 15 | "./lite_interpreter_model_load" 16 | " --model="); 17 | 18 | if (!c10::ParseCommandLineFlags(&argc, &argv)) { 19 | std::cerr << "Failed to parse command line flags!" << std::endl; 20 | return 1; 21 | } 22 | 23 | if (FLAGS_model.empty()) { 24 | std::cerr << FLAGS_model << ":Model file is not provided\n"; 25 | return -1; 26 | } 27 | 28 | // TODO: avoid having to set this guard for custom mobile build with mobile 29 | // interpreter. 30 | c10::InferenceMode mode; 31 | torch::jit::mobile::Module bc = torch::jit::_load_for_mobile(FLAGS_model); 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /binaries/parallel_info.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "ATen/Parallel.h" 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __linux__ 23 | #include 24 | #include 25 | #endif 26 | 27 | int main(int argc, char** argv) { 28 | at::init_num_threads(); 29 | 30 | std::cout << at::get_parallel_info() << std::endl; 31 | 32 | # ifdef __linux__ 33 | std::ostringstream cmd; 34 | cmd << "lsof -p " << getpid() << " | grep .so"; 35 | std::cout << "Loaded .so:" << std::endl; 36 | std::cout << cmd.str() << std::endl; 37 | std::system(cmd.str().c_str()); 38 | # endif 39 | 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /third_party/BUILD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/third_party/BUILD -------------------------------------------------------------------------------- /third_party/METADATA.bzl: -------------------------------------------------------------------------------- 1 | METADATA = { 2 | "maintainers": [ 3 | "pytorch_dev_infra", 4 | ], 5 | "name": "third_party", 6 | "owner": "pytorch_dev_infra", 7 | } 8 | -------------------------------------------------------------------------------- /third_party/README.md: -------------------------------------------------------------------------------- 1 | This folder contains vendored copies of third-party libraries that we 2 | use. 3 | -------------------------------------------------------------------------------- /third_party/cudnn.BUILD: -------------------------------------------------------------------------------- 1 | # Adopted from: https://github.com/NVIDIA/TRTorch/blob/master/third_party/cudnn/local/BUILD 2 | 3 | cc_library( 4 | name = "cudnn_headers", 5 | hdrs = ["include/cudnn.h"] + glob([ 6 | "include/cudnn+.h", 7 | "include/cudnn_*.h", 8 | ]), 9 | includes = ["include/"], 10 | visibility = ["//visibility:private"], 11 | ) 12 | 13 | cc_import( 14 | name = "cudnn_lib", 15 | shared_library = "lib/x86_64-linux-gnu/libcudnn.so", 16 | visibility = ["//visibility:private"], 17 | ) 18 | 19 | cc_library( 20 | name = "cudnn", 21 | visibility = ["//visibility:public"], 22 | deps = [ 23 | "cudnn_headers", 24 | "cudnn_lib", 25 | ], 26 | ) 27 | -------------------------------------------------------------------------------- /third_party/cutlass.BUILD: -------------------------------------------------------------------------------- 1 | # Description: 2 | # CUDA Templates for Linear Algebra Subroutines 3 | 4 | load("@rules_cc//cc:defs.bzl", "cc_library") 5 | 6 | cc_library( 7 | name = "cutlass", 8 | hdrs = glob(["include/**/*.h"]), 9 | includes = ["include/"], 10 | visibility = ["//visibility:public"], 11 | ) 12 | -------------------------------------------------------------------------------- /third_party/fmt.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "fmt", 5 | hdrs = glob(["include/fmt/*.h",]), 6 | defines = ["FMT_HEADER_ONLY=1"], 7 | includes = ["include"], 8 | visibility = ["//visibility:public"], 9 | ) 10 | -------------------------------------------------------------------------------- /third_party/foxi.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "foxi", 5 | srcs = [ 6 | "foxi/onnxifi_loader.c", 7 | ], 8 | hdrs = glob([ 9 | "foxi/*.h", 10 | ]), 
11 | includes = [ 12 | ".", 13 | ], 14 | linkstatic = 1, 15 | visibility = ["//visibility:public"], 16 | ) 17 | -------------------------------------------------------------------------------- /third_party/ideep.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "ideep", 5 | hdrs = glob([ 6 | "include/**/*.hpp", 7 | "include/**/*.h", 8 | ]), 9 | defines = [ 10 | "IDEEP_USE_MKL", 11 | ], 12 | includes = [ 13 | "include/", 14 | ], 15 | visibility = ["//visibility:public"], 16 | deps = ["@mkl_dnn//:mkl-dnn"], 17 | ) 18 | -------------------------------------------------------------------------------- /third_party/kineto.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "kineto", 5 | hdrs = glob(["libkineto/include/*.h",]), 6 | includes = [ 7 | "libkineto/include/", 8 | ], 9 | visibility = ["//visibility:public"], 10 | ) 11 | -------------------------------------------------------------------------------- /third_party/miniz-2.1.0/BUILD.bazel: -------------------------------------------------------------------------------- 1 | cc_library( 2 | name = "miniz", 3 | srcs = [ 4 | "miniz.c", 5 | ], 6 | hdrs = [ 7 | "miniz.h", 8 | ], 9 | strip_include_prefix = ".", 10 | visibility = ["//visibility:public"], 11 | ) 12 | -------------------------------------------------------------------------------- /third_party/miniz-2.1.0/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2013-2014 RAD Game Tools and Valve Software 2 | Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC 3 | 4 | All Rights Reserved. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /third_party/mkl.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "mkl", 5 | srcs = [ 6 | "libmkl_avx2.so", 7 | "libmkl_core.so", 8 | "libmkl_def.so", 9 | "libmkl_intel_lp64.so", 10 | "libmkl_rt.so", 11 | "libmkl_sequential.so", 12 | "libmkl_vml_avx2.so", 13 | "libmkl_vml_avx512.so", 14 | "libmkl_vml_def.so", 15 | ] + select({ 16 | "@pytorch//tools/config:thread_sanitizer": [], 17 | "//conditions:default": ["libmkl_tbb_thread.so"], 18 | }), 19 | visibility = ["//visibility:public"], 20 | deps = ["@mkl_headers"], 21 | ) 22 | -------------------------------------------------------------------------------- /third_party/mkl_headers.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "mkl_headers", 5 | hdrs = glob(["include/*.h"]), 6 | includes = ["include/"], 7 | visibility = ["//visibility:public"], 8 | ) 9 | -------------------------------------------------------------------------------- /third_party/nvfuser/benchmark/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(USE_CUDA) 2 | add_executable(nvfuser_bench 3 | batch_norm_channels_first.cpp 4 | batch_norm_channels_first_backward.cpp 5 | batch_norm_channels_last.cpp 6 | batch_norm_channels_last_backward.cpp 7 | bert.cpp 8 | broadcast.cpp 9 | gelu_backward.cpp 10 | heuristic_lookup.cpp 11 | shape_inference.cpp 12 | instance_norm.cpp 13 | layer_norm.cpp 14 | layer_norm_backward.cpp 15 | rms_norm.cpp 16 | rms_norm_backward.cpp 17 | lstm_cell.cpp 18 | reduction.cpp 19 | softmax.cpp 20 | softmax_backward.cpp 21 | scale_bias_relu.cpp 22 | transpose.cpp 23 | matmul.cpp 24 | timm.cpp 25 | utils.cpp 26 | main.cpp) 27 | 28 | target_link_libraries(nvfuser_bench PRIVATE torch_library benchmark) 29 | if(NOT MSVC) 30 | target_compile_options_if_supported(nvfuser_bench -Werror) 31 | target_compile_options_if_supported(nvfuser_bench -Wno-unused-variable) 32 | target_compile_options_if_supported(nvfuser_bench -Wno-deprecated-copy) 33 | endif() 34 | 35 | endif() 36 | -------------------------------------------------------------------------------- /third_party/nvfuser/benchmark/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | BENCHMARK_MAIN(); 4 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/codegen.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace torch { 9 | namespace jit { 10 | namespace fuser { 11 | namespace cuda { 12 | namespace codegen { 13 | 14 | //! 
Generates a CUDA kernel definition for the given kernel 15 | TORCH_CUDA_CU_API std::string generateCudaKernel( 16 | const kir::Kernel* kernel, 17 | const std::string& kernel_name = "CUDAGeneratedKernel"); 18 | 19 | } // namespace codegen 20 | } // namespace cuda 21 | } // namespace fuser 22 | } // namespace jit 23 | } // namespace torch 24 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/compute_at.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace torch { 16 | namespace jit { 17 | namespace fuser { 18 | namespace cuda { 19 | 20 | class TensorDomain; 21 | class TensorView; 22 | 23 | struct ComputeAt { 24 | public: 25 | // Runs the compute at pass making producer look like consumer, computing 26 | // producer relative to consumer 27 | static void runAt( 28 | TensorView* producer, 29 | TensorView* consumer, 30 | int64_t consumer_position, 31 | ComputeAtMode mode = ComputeAtMode::Standard); 32 | 33 | // Runs the compute with pass making consumer look like producer, computing 34 | // producer relative to consumer 35 | static void runWith( 36 | TensorView* producer, 37 | TensorView* consumer, 38 | int64_t producer_position, 39 | ComputeAtMode mode = ComputeAtMode::Standard); 40 | }; 41 | 42 | } // namespace cuda 43 | } // namespace fuser 44 | } // namespace jit 45 | } // namespace torch 46 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/docs/.gitignore: -------------------------------------------------------------------------------- 1 | html 2 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/docs/documentation.h: -------------------------------------------------------------------------------- 1 | 2 | #error This is used exclusively for generating the documentation (not a real header) 3 | 4 | //! \namespace torch::jit::fuser 5 | //! \brief Main PyTorch JIT Fuser namespace 6 | 7 | //! \namespace torch::jit::fuser::cuda 8 | //! \brief CUDA specific components 9 | 10 | //! \namespace torch::jit::fuser::cuda::executor_utils 11 | //! \brief Fuser executor related utilities 12 | 13 | //! \namespace torch::jit::fuser::kir 14 | //! \brief Kernel IR 15 | 16 | //! \namespace torch::jit::fuser::ir_utils 17 | //! \brief IR manipulation utilities 18 | 19 | //! \namespace torch::jit::fuser::loop_utils 20 | //! \brief Loop utilities 21 | 22 | //! \namespace torch::jit::fuser::scope_utils 23 | //! 
\brief Scope utilities 24 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/docs/images/ir_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/third_party/nvfuser/csrc/docs/images/ir_architecture.png -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/docs/main_page.md: -------------------------------------------------------------------------------- 1 | 2 | This is the implementation reference for the CUDA PyTorch JIT Fuser 3 | 4 | - [PyTorch GitHub Page](https://github.com/pytorch/pytorch) 5 | - [Fuser Source Tree](https://github.com/pytorch/pytorch/tree/master/torch/csrc/jit/codegen/cuda) 6 | - Main documentation indexes: [Namespaces](namespaces.html) and [Classes](annotated.html) 7 | 8 | ![Fuser Architecture Overview](images/ir_architecture.png) 9 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/ir_all_nodes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | // TODO: remove this once the Kernel IR split is complete 8 | #include 9 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_alias_memory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace torch { 11 | namespace jit { 12 | namespace fuser { 13 | namespace cuda { 14 | 15 | //! Reuse Allocation nodes via pointer aliasing 16 | //! 17 | //! First pass finds candidate TensorViews 18 | //! A candidate TensorView is anything in shared memory OR 19 | //! in local memory with a static size larger than register_size_threshold 20 | //! 21 | //! Second pass finds appropriate input Allocate Node 22 | //! among candidate TensorViews 23 | //! 24 | //! Alias Criteria: 25 | //! If input is a candidate TensorView, 26 | //! input allocation has the same size as output allocation, 27 | //! thread bindings match, 28 | //! is not used after this op: 29 | //! then alias output Allocate to input Allocate. 30 | //! 31 | std::vector reuseMemoryAllocations(const std::vector& exprs); 32 | 33 | } // namespace cuda 34 | } // namespace fuser 35 | } // namespace jit 36 | } // namespace torch 37 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_allocation.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace torch { 11 | namespace jit { 12 | namespace fuser { 13 | namespace cuda { 14 | 15 | //! Buffer allocation information to store in GPU lower to avoid 16 | //! logic duplication 17 | struct LocalAllocationInfo { 18 | kir::Allocate* alloc_expr = nullptr; 19 | std::vector alloc_domains; 20 | bool has_halo = false; 21 | }; 22 | 23 | using LocalAllocationInfoMap = 24 | std::unordered_map>; 25 | 26 | //! 
Insert buffer allocations 27 | std::vector<Expr*> insertAllocations(const std::vector<Expr*>& exprs); 28 | 29 | } // namespace cuda 30 | } // namespace fuser 31 | } // namespace jit 32 | } // namespace torch 33 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_divisible_split.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace torch { 10 | namespace jit { 11 | namespace fuser { 12 | namespace cuda { 13 | 14 | // Looks through all transformations associated with view, or enforced divisible 15 | // vectorization splits, and gathers all splits that provably don't have a 16 | // remainder, so the extents of the associated IterDomains do not require 17 | // ceilDiv expressions. 18 | TORCH_CUDA_CU_API std::unordered_set<Split*> getAllDivisibleSplits( 19 | Fusion* fusion); 20 | 21 | // Same as above but will use the provided ComputeAtMap instead of building its own. 22 | TORCH_CUDA_CU_API std::unordered_set<Split*> getAllDivisibleSplits( 23 | Fusion* fusion, 24 | const ComputeAtMap* ca_map); 25 | 26 | } // namespace cuda 27 | } // namespace fuser 28 | } // namespace jit 29 | } // namespace torch 30 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_expr_sort.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace torch { 6 | namespace jit { 7 | namespace fuser { 8 | namespace cuda { 9 | 10 | std::vector<Expr*> reorderExprsForComputeAt(); 11 | 12 | } // namespace cuda 13 | } // namespace fuser 14 | } // namespace jit 15 | } // namespace torch 16 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_fused_reduction.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace torch { 6 | namespace jit { 7 | namespace fuser { 8 | namespace cuda { 9 | 10 | //! Keep track of certain patterns of reductions. 11 | //! 12 | //! - Allreduce IterDomain: reduced and broadcast domain. 13 | class FusedReductionInfo { 14 | public: 15 | void markAsAllreduce(IterDomain* id); 16 | 17 | bool isAllreduce(IterDomain* id) const; 18 | 19 | private: 20 | // Reduction IterDomains that are also broadcast 21 | std::unordered_set<IterDomain*> allreduce_ids_; 22 | }; 23 | 24 | //! Detect reductions and broadcasts that are eligible for the fused 25 | //! reduction kernel. When found, the predicate flags of the broadcast 26 | //! are unset, which effectively makes the broadcast just a unary set 27 | //! op. 28 | //! TODO: Consider moving the warp-based fused reduction here.
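//!
//! Sketch of the pattern this pass targets (illustrative Fusion IR
//! pseudocode, not code from an actual test):
//!
//!   tv1 = sum(tv0, {1});                  // reduction over dim 1
//!   tv2 = broadcast(tv1, {false, true});  // immediately re-broadcast
//!   tv3 = add(tv0, tv2);                  // consumed at tv0's full shape
//!
//! The reduced-then-rebroadcast domain shared by tv1 and tv2 is an
//! allreduce IterDomain, and the broadcast can then be lowered as a plain
//! set op without its own predicate.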
29 | void fuseReductionsAndBroadcasts(Fusion*); 30 | 31 | } // namespace cuda 32 | } // namespace fuser 33 | } // namespace jit 34 | } // namespace torch 35 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_fusion_simplifier.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | namespace torch { 13 | namespace jit { 14 | namespace fuser { 15 | namespace cuda { 16 | 17 | // Replaces trivial reductions with Unary Set Ops 18 | void trivialReductionReplacement(Fusion*, const TrivialReductionInfo&); 19 | 20 | // Replaces Transpose, Shift, Gather, and View Ops with Unary Set Ops 21 | std::vector<Expr*> unarySetOpInserter(const std::vector<Expr*>& exprs); 22 | 23 | } // namespace cuda 24 | } // namespace fuser 25 | } // namespace jit 26 | } // namespace torch 27 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_insert_syncs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace torch { 11 | namespace jit { 12 | namespace fuser { 13 | namespace cuda { 14 | 15 | //! Insert syncs at the end of for-loops to prevent write-after-read race conditions. 16 | //! 17 | //! A WAR race condition occurs when the next iteration of the loop overwrites 18 | //! a shared memory value before a previous operation has finished reading it. 19 | std::vector<Expr*> insertWarThreadSynchronization( 20 | const std::vector<Expr*>& exprs); 21 | 22 | //! Insert syncs between writing to shared memory and then reading it. 23 | //! The RAW pass is run before indexing, unrolling (loop duplication), memory 24 | //! aliasing, and index (grid/block bcast/reduction) 25 | std::vector<Expr*> insertRawThreadSynchronization( 26 | const std::vector<Expr*>& exprs); 27 | 28 | } // namespace cuda 29 | } // namespace fuser 30 | } // namespace jit 31 | } // namespace torch 32 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_instrument.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace torch { 6 | namespace jit { 7 | namespace fuser { 8 | namespace cuda { 9 | 10 | //! Set up KernelPerformanceProfile of GpuLower when enabled, which 11 | //! keeps track of expressions to profile. A new TensorView is added 12 | //! for storing profiling results. The expression list is prepended 13 | //! with a kir::Allocate node to allocate the TensorView profile 14 | //! buffer. Note that any expression added after this pass will not be 15 | //! profiled, so this pass should be called after all expressions are 16 | //! lowered. KernelPerformanceProfile is copied to Kernel after 17 | //! lowering. 18 | std::vector<Expr*> instrumentKernel(const std::vector<Expr*>& exprs); 19 | 20 | } // namespace cuda 21 | } // namespace fuser 22 | } // namespace jit 23 | } // namespace torch 24 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_predicate.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace torch { 10 | namespace jit { 11 | namespace fuser { 12 | namespace cuda { 13 | 14 | //!
Update predicates with valid bool conditionals 15 | //! 16 | std::vector<Expr*> generateConditionalFromPredicate( 17 | const std::vector<Expr*>& exprs); 18 | 19 | } // namespace cuda 20 | } // namespace fuser 21 | } // namespace jit 22 | } // namespace torch 23 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_replace_size.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace torch { 10 | namespace jit { 11 | namespace fuser { 12 | namespace cuda { 13 | 14 | // TensorViews are all based on symbolic sizes. When we first initialize them 15 | // we don't know if they're inputs or outputs, which would mean that they have 16 | // runtime shapes. Intermediate tensors (those not going to global memory) do 17 | // not have this information. Since we need to have the correct information in 18 | // the kernel being fetched for shapes, we want to replace input and output 19 | // tensors to reference the runtime structure containing sizes. 20 | void replaceSymbolicSizes(Fusion*); 21 | 22 | } // namespace cuda 23 | } // namespace fuser 24 | } // namespace jit 25 | } // namespace torch 26 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_warp_reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace torch { 6 | namespace jit { 7 | namespace fuser { 8 | namespace cuda { 9 | 10 | struct WarpPaddedParallelInfo { 11 | bool is_tidx_padded = false; 12 | bool is_tidx_single_warp = false; 13 | bool has_warp_reduction = false; 14 | }; 15 | 16 | std::vector<Expr*> fuseWarpReduce(const std::vector<Expr*> exprs); 17 | 18 | } // namespace cuda 19 | } // namespace fuser 20 | } // namespace jit 21 | } // namespace torch 22 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/mutator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace torch { 11 | namespace jit { 12 | namespace fuser { 13 | namespace cuda { 14 | 15 | /* 16 | * Mutators are the mechanism used to modify IR nodes. Since most nodes are 17 | * immutable, or at least partially immutable, changing them can require creating 18 | * a new node. The base mutator at the moment is a dumb sample mutator that takes 19 | * any float of value 1.0 and converts it to 0.0; it is currently used as a 20 | * dummy example. We should make it a simple instantiation of all the 21 | * mutate functions on all node types so that people can inherit it, and only 22 | * specialize those nodes which they want to have a particular transformation.
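 *
 * A specialization would inherit the base mutator and override only the
 * handlers for the node types it cares about, roughly like this sketch
 * (`BaseMutator` stands in for the real base class; see the dispatch
 * machinery for the actual interface):
 *
 *   struct OneToZeroMutator : public BaseMutator {
 *     // In the handler for floating-point scalars: if the node is a
 *     // constant 1.0, build and return a new constant 0.0; otherwise
 *     // defer to the base implementation.
 *   };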
23 | */ 24 | 25 | } // namespace cuda 26 | } // namespace fuser 27 | } // namespace jit 28 | } // namespace torch 29 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/ops/all_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/parallel_type_bitmap.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace torch { 4 | namespace jit { 5 | namespace fuser { 6 | namespace cuda { 7 | 8 | constexpr std::bitset 9 | ParallelTypeBitmap::kTIDBits; 10 | constexpr std::bitset 11 | ParallelTypeBitmap::kBIDBits; 12 | 13 | std::string ParallelTypeBitmap::toString() const { 14 | std::stringstream ss; 15 | ss << "("; 16 | bool is_first = true; 17 | for (ParallelType pt : *this) { 18 | if (!is_first) { 19 | ss << " "; 20 | } 21 | ss << pt; 22 | is_first = false; 23 | } 24 | ss << ")"; 25 | return ss.str(); 26 | } 27 | 28 | } // namespace cuda 29 | } // namespace fuser 30 | } // namespace jit 31 | } // namespace torch 32 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/partial_split_map.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | namespace torch { 12 | namespace jit { 13 | namespace fuser { 14 | namespace cuda { 15 | 16 | //! Collects start and stop offsets of all split root domains. Offsets 17 | //! are zero unless partially split. 18 | class TORCH_CUDA_CU_API PartialSplitMap { 19 | public: 20 | void build(Fusion* fusion); 21 | 22 | Val* getStartOffset(IterDomain* root_domain) const; 23 | Val* getStopOffset(IterDomain* root_domain) const; 24 | 25 | private: 26 | std::unordered_map<IterDomain*, Val*> start_offset_map_; 27 | std::unordered_map<IterDomain*, Val*> stop_offset_map_; 28 | }; 29 | 30 | } // namespace cuda 31 | } // namespace fuser 32 | } // namespace jit 33 | } // namespace torch 34 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/partition.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | /* 7 | * API for querying node compatibility in CudaCodeGen 8 | * 9 | * It is used in the optimization passes, where the graph is traversed and parts 10 | * that can be handled by CudaCodegen are partitioned and stuffed into 11 | * `attr::Subgraph` of `prim::CudaFusionGroup`. 12 | * 13 | * The logic right now is very simple. On top of device placement, we consider a 14 | * `Node` compatible when we have a parsing rule for it in our parser.
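 *
 * Typical use in a fusion pass looks roughly like the following (the loop
 * body is an illustrative sketch, not code lifted from the pass):
 *
 *   for (Node* node : block->nodes()) {
 *     if (isFusibleCudaFusionGroup(fusion_group, node)) {
 *       // merge `node` into the existing prim::CudaFusionGroup
 *     }
 *   }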
15 | */ 16 | 17 | namespace torch { 18 | namespace jit { 19 | namespace fuser { 20 | namespace cuda { 21 | 22 | TORCH_CUDA_CU_API bool isFusibleCudaFusionGroup(const Node* node); 23 | 24 | // consider if `node` could be fused into `fusion` 25 | TORCH_CUDA_CU_API bool isFusibleCudaFusionGroup( 26 | const Node* fusion, 27 | const Node* node); 28 | 29 | } // namespace cuda 30 | } // namespace fuser 31 | } // namespace jit 32 | } // namespace torch 33 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/python_frontend/python_bindings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace torch { 7 | namespace jit { 8 | void initNvFuserPythonBindings(PyObject* module); 9 | } // namespace jit 10 | } // namespace torch 11 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/python_frontend/python_bindings_extension.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | PYBIND11_MODULE(EXTENSION_NAME, m) { 5 | m.doc() = "nvfuser C API python binding"; // optional module docstring 6 | torch::jit::initNvFuserPythonBindings(m.ptr()); 7 | } 8 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/all_schedulers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace torch { 8 | namespace jit { 9 | namespace fuser { 10 | namespace cuda { 11 | 12 | enum class TORCH_CUDA_CU_API ScheduleHeuristic { 13 | None, 14 | NoOp, 15 | PointWise, 16 | Reduction, 17 | Persistent, 18 | Transpose 19 | }; 20 | } // namespace cuda 21 | } // namespace fuser 22 | } // namespace jit 23 | } // namespace torch 24 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/debug_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace torch { 4 | namespace jit { 5 | namespace fuser { 6 | namespace cuda { 7 | 8 | namespace scheduler_debug_utils { 9 | 10 | // Basic logging utility for any messages in the scheduler or segmenter 11 | template <typename... Args> 12 | void canScheduleMessage(const Args&... args) { 13 | // Using builtin expect to reduce the overhead slightly; 14 | // alternatively we may want to allow this message in debug 15 | // builds only, but that'd be inconvenient for user support. 16 | if (C10_UNLIKELY(isDebugDumpEnabled(DebugDumpOption::FusionSegmenterLog))) { 17 | std::cout << c10::str(args...) << "\n"; 18 | } 19 | } 20 | 21 | // Short-cut message for flagging why schedulers cannot schedule fusions, 22 | // assuming the first argument is the heuristic type (not actively checked). 23 | template <typename HeuristicType, typename... Args> 24 | void canScheduleRejectReason(HeuristicType heuristic, const Args&...
args) { 25 | canScheduleMessage( 26 | "Scheduler _", heuristic, "_ ***rejected*** because : ", args...); 27 | } 28 | 29 | } // namespace scheduler_debug_utils 30 | 31 | } // namespace cuda 32 | } // namespace fuser 33 | } // namespace jit 34 | } // namespace torch 35 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/heuristic.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace torch { 9 | namespace jit { 10 | namespace fuser { 11 | namespace cuda { 12 | 13 | class HeuristicParams : public PolymorphicBase { 14 | public: 15 | std::string tag = ""; 16 | 17 | LaunchParams lparams; 18 | 19 | virtual std::string toString() const { 20 | return "Undefined Heuristic Params"; 21 | } 22 | 23 | virtual size_t hash() const = 0; 24 | 25 | virtual ~HeuristicParams() = default; 26 | 27 | virtual bool sameAs(const std::shared_ptr<HeuristicParams>& other) const = 0; 28 | 29 | virtual std::shared_ptr<HeuristicParams> clone() const = 0; 30 | 31 | HeuristicParams() = default; 32 | HeuristicParams(const std::string& tag) : tag(tag) {} 33 | }; 34 | 35 | } // namespace cuda 36 | } // namespace fuser 37 | } // namespace jit 38 | } // namespace torch 39 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/normalization.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | // TODO: If caching inputs would require persistence we are sending it to the 9 | // persistent kernel scheduler. This isn't necessary if the only persistent 10 | // buffers are inputs, as we could re-read them from global memory. Need to 11 | // consider if this is worth implementing.
12 | 13 | namespace torch { 14 | namespace jit { 15 | namespace fuser { 16 | namespace cuda { 17 | 18 | class SchedulerRuntimeInfo; 19 | class HeuristicSummary; 20 | 21 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics( 22 | Fusion* fusion, 23 | const at::ArrayRef<c10::IValue>& runtime_inputs, 24 | HeuristicSummary* data_cache = nullptr); 25 | 26 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics( 27 | Fusion* fusion, 28 | SchedulerRuntimeInfo& runtime_info, 29 | HeuristicSummary* data_cache = nullptr); 30 | 31 | TORCH_CUDA_CU_API void schedulePersistentKernel( 32 | Fusion* fusion, 33 | const ReductionParams& rparams); 34 | 35 | } // namespace cuda 36 | } // namespace fuser 37 | } // namespace jit 38 | } // namespace torch 39 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/reduction.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | namespace torch { 9 | namespace jit { 10 | namespace fuser { 11 | namespace cuda { 12 | 13 | class SchedulerRuntimeInfo; 14 | class HeuristicSummary; 15 | 16 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics( 17 | Fusion* fusion, 18 | const at::ArrayRef<c10::IValue>& runtime_inputs, 19 | HeuristicSummary* data_cache = nullptr); 20 | 21 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics( 22 | Fusion* fusion, 23 | SchedulerRuntimeInfo& runtime_info, 24 | HeuristicSummary* data_cache = nullptr); 25 | 26 | TORCH_CUDA_CU_API void scheduleReduction( 27 | Fusion* fusion, 28 | const ReductionParams& rparams); 29 | } // namespace cuda 30 | } // namespace fuser 31 | } // namespace jit 32 | } // namespace torch 33 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/transform_rfactor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace torch { 12 | namespace jit { 13 | namespace fuser { 14 | namespace cuda { 15 | 16 | // TODO: Only replay dispatch is really borrowed from TransformIter; we should 17 | // reevaluate the reuse of dispatch for classes that inherit TransformIter. 18 | class TORCH_CUDA_CU_API TransformRFactor { 19 | public: 20 | // Transform the provided tensor domain into two domains, a producer and a 21 | // consumer domain. These domains are created by taking axes and reducing them 22 | // in the producer domain, and taking the remaining reduction axes and 23 | // reducing them in the consumer domain.
24 | static std::pair<TensorDomain*, TensorDomain*> runReplay( 25 | TensorDomain*, 26 | std::vector<int> axes); 27 | }; 28 | 29 | } // namespace cuda 30 | } // namespace fuser 31 | } // namespace jit 32 | } // namespace torch 33 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/type_inference.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace torch { 7 | namespace jit { 8 | namespace fuser { 9 | namespace cuda { 10 | 11 | TORCH_CUDA_CU_API void TypePropagate(std::shared_ptr<Graph>& graph); 12 | 13 | } // namespace cuda 14 | } // namespace fuser 15 | } // namespace jit 16 | } // namespace torch 17 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/vectorization_info.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace torch { 8 | namespace jit { 9 | namespace fuser { 10 | namespace cuda { 11 | 12 | struct VectorizedSetInfo { 13 | //! Producer of a vectorized set 14 | TensorView* producer_tv = nullptr; 15 | //! Consumer of a vectorized set 16 | TensorView* consumer_tv = nullptr; 17 | //! Number of elements to vectorize 18 | int word_size = -1; 19 | //! Vectorized domain 20 | IterDomain* vectorized_leaf_id = nullptr; 21 | //! Right-most root dependent domain of the leaf domain 22 | IterDomain* vectorized_root_id = nullptr; 23 | //! All of the dependent root domains that are contiguously merged 24 | std::unordered_set<IterDomain*> contig_root_ids; 25 | }; 26 | 27 | } // namespace cuda 28 | } // namespace fuser 29 | } // namespace jit 30 | } // namespace torch 31 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_extension/README.md: -------------------------------------------------------------------------------- 1 | # Build 2 | 3 | ``` 4 | python setup.py install 5 | ``` 6 | 7 | # Test 8 | 9 | ``` 10 | python test.py 11 | ``` 12 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_extension/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | using namespace torch::jit::fuser::cuda; 9 | 10 | at::Tensor sinh_nvfuser(const at::Tensor& input) { 11 | Fusion fusion; 12 | FusionGuard fg(&fusion); 13 | 14 | int dim = input.dim(); 15 | auto dtype = input.scalar_type(); 16 | auto x = 17 | TensorViewBuilder().ndims(dim).dtype(aten_to_data_type(dtype)).build(); 18 | fusion.addInput(x); 19 | 20 | // Using the identity sinh(x) = [ exp(x) - exp(-x) ] / 2 21 | auto output = div(sub(exp(x), exp(neg(x))), IrBuilder::create<Double>(2.0)); 22 | fusion.addOutput(output); 23 | 24 | std::cout << "Create fusion:" << std::endl; 25 | fusion.print(); 26 | 27 | auto lparams = schedulePointwise(&fusion, {input}); 28 | 29 | FusionExecutor fe; 30 | fe.compileFusion(&fusion, {input}, lparams); 31 | auto outputs = fe.runFusion({input}, lparams); 32 | 33 | return outputs[0]; 34 | } 35 | 36 | TORCH_LIBRARY(myop, m) { 37 | m.def("sinh_nvfuser", sinh_nvfuser); 38 | } 39 | 40 | TORCH_LIBRARY_IMPL(myop, CUDA, m) { 41 | m.impl("sinh_nvfuser", sinh_nvfuser); 42 | } 43 | 44 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {} 45 | --------------------------------------------------------------------------------
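The comment in sinh_extension/main.cpp above relies on the identity sinh(x) = (exp(x) - exp(-x)) / 2. As a quick, nvfuser-independent sanity check of that identity, here is a minimal sketch in plain Python (the sample points are arbitrary):

```
import math

# Check sinh(x) == (exp(x) - exp(-x)) / 2 at a few sample points.
for x in (-2.0, -0.5, 0.0, 1.0, 3.0):
    assert math.isclose(math.sinh(x), (math.exp(x) - math.exp(-x)) / 2, abs_tol=1e-12)
print("sinh identity holds at all sample points")
```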
/third_party/nvfuser/examples/sinh_extension/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='nvfuser_extension', 6 | ext_modules=[ 7 | CUDAExtension( 8 | name='nvfuser_extension', 9 | pkg='nvfuser_extension', 10 | sources=['main.cpp']) 11 | ], 12 | cmdclass={ 13 | 'build_ext': BuildExtension 14 | }) 15 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_extension/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import nvfuser_extension # noqa: F401 3 | 4 | t = torch.randn((5, 5), device='cuda') 5 | expected = torch.sinh(t) 6 | output = torch.ops.myop.sinh_nvfuser(t) 7 | 8 | print("Expected:", expected) 9 | print("Output:", output) 10 | 11 | assert torch.allclose(output, expected) 12 | print("They match!") 13 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_libtorch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10 FATAL_ERROR) 2 | project(sinh_example LANGUAGES CXX) 3 | set(CMAKE_CXX_STANDARD 14) 4 | 5 | find_package(Torch REQUIRED) 6 | 7 | add_executable(sinh_example main.cpp) 8 | target_link_libraries(sinh_example ${TORCH_LIBRARIES}) 9 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_libtorch/README.md: -------------------------------------------------------------------------------- 1 | # Build 2 | 3 | ``` 4 | mkdir build 5 | cd build 6 | cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. 7 | make -j 8 | ``` 9 | 10 | # Test 11 | 12 | ``` 13 | ./sinh_example 14 | ``` 15 | -------------------------------------------------------------------------------- /third_party/nvfuser/python/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import _C 2 | -------------------------------------------------------------------------------- /third_party/nvfuser/python_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/third_party/nvfuser/python_tests/__init__.py -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/bf16_support.cu: -------------------------------------------------------------------------------- 1 | 2 | #define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var))) 3 | #define __NVFUSER_BFLOAT_TO_CUS(var) \ 4 | *(reinterpret_cast<const unsigned short*>(&(var))) 5 | 6 | struct __bfloat; 7 | __device__ __bfloat __float2bfloat(const float); 8 | 9 | struct __align__(2) __bfloat { 10 | __bfloat() = default; 11 | 12 | __device__ __bfloat(const float f) { 13 | __x = __float2bfloat(f).__x; 14 | } 15 | 16 | protected: 17 | unsigned short __x; 18 | }; 19 | 20 | __device__ __bfloat __float2bfloat(const float f) { 21 | __bfloat val; 22 | asm("{ cvt.rn.bf16.f32 %0, %1;}\n" 23 | : "=h"(__NVFUSER_BFLOAT_TO_US(val)) 24 | : "f"(f)); 25 | return val; 26 | } 27 | 28 | __device__ float __bfloat2float(const __bfloat h) { 29 | float val; 30 | asm("{ mov.b32 %0, {0,%1};}\n" 31 | : "=f"(val) 32 | : "h"(__NVFUSER_BFLOAT_TO_CUS(h))); 33 | return val; 34 | } 35 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/bf16_support_rocm.cu: -------------------------------------------------------------------------------- 1 | 2 | struct __align__(2) __bfloat { 3 | __bfloat() = default; 4 | 5 | inline __device__ __bfloat(const float f) { 6 | if (f != f) { 7 | __x = uint16_t(0x7FC0); 8 | } else { 9 | union { 10 | uint32_t U32; 11 | float F32; 12 | }; 13 | 14 | F32 = f; 15 | uint32_t rounding_bias = ((U32 >> 16) & 1) + uint32_t(0x7FFF); 16 | __x = static_cast<uint16_t>((U32 + rounding_bias) >> 16); 17 | } 18 | } 19 | 20 | inline __device__ operator float() const { 21 | float res = 0; 22 | uint32_t tmp = __x; 23 | tmp <<= 16; 24 | float* tempRes = reinterpret_cast<float*>(&tmp); 25 | res = *tempRes; 26 | return res; 27 | } 28 | 29 | protected: 30 | unsigned short __x; 31 | }; 32 | 33 | __device__ __bfloat __float2bfloat(const float f) { 34 | return __bfloat(f); 35 | } 36 | 37 | __device__ float __bfloat2float(const __bfloat h) { 38 | return float(h); 39 | } 40 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/block_sync_default.cu: -------------------------------------------------------------------------------- 1 | 2 | // Default block synchronization. Just use __barrier_sync 3 | namespace block_sync { 4 | 5 | __forceinline__ __device__ void init() {} 6 | 7 | // Thread-block synchronization 8 | __forceinline__ __device__ void sync() { 9 | __barrier_sync(0); 10 | } 11 | 12 | } // namespace block_sync 13 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/block_sync_default_rocm.cu: -------------------------------------------------------------------------------- 1 | 2 | // Default block synchronization.
Just use __syncthreads 3 | namespace block_sync { 4 | 5 | __forceinline__ __device__ void init() {} 6 | 7 | // Thread-block synchronization 8 | __forceinline__ __device__ void sync() { 9 | __syncthreads(); 10 | } 11 | 12 | } // namespace block_sync 13 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/broadcast.cu: -------------------------------------------------------------------------------- 1 | 2 | namespace broadcast { 3 | // Broadcasts within partitioned groups of threads. 4 | // 5 | // X_THREAD: Broadcast from threadIdx.x == 0 if true 6 | // Y_THREAD: Broadcast from threadIdx.y == 0 if true 7 | // Z_THREAD: Broadcast from threadIdx.z == 0 if true 8 | // inp_val: Per-thread source value. Only valid when the thread is a source. 9 | // out: Per-thread output location 10 | // 11 | template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T> 12 | __device__ void blockBroadcast( 13 | T& out, 14 | const T& inp_val, 15 | T* shared_mem, 16 | bool read_write_pred) { 17 | const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) && 18 | (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0); 19 | 20 | const auto shared_offset = 21 | index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( 22 | threadIdx, blockDim); 23 | 24 | if (has_valid_data && read_write_pred) { 25 | shared_mem[shared_offset] = inp_val; 26 | } 27 | 28 | block_sync::sync(); 29 | 30 | if (read_write_pred) { 31 | out = shared_mem[shared_offset]; 32 | } 33 | 34 | block_sync::sync(); 35 | } 36 | 37 | } // namespace broadcast 38 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/tensor.cu: -------------------------------------------------------------------------------- 1 | template <typename T, int N> 2 | struct Tensor { 3 | __device__ T& operator[](nvfuser_index_t ind) { 4 | return data[ind]; 5 | }; 6 | 7 | T* data; 8 | nvfuser_index_t size[N]; 9 | nvfuser_index_t stride[N]; 10 | }; 11 | 12 | // Specialization for the 0-dim case, as it does not need size and stride 13 | // arrays. They would be an error anyway, since zero-length arrays are not allowed. 14 | template <typename T> 15 | struct Tensor<T, 0> { 16 | __device__ T& operator[](nvfuser_index_t) { 17 | return *data; 18 | }; 19 | 20 | T* data; 21 | }; 22 | 23 | // 0-dim variant that makes it easy to pass in a CPU-based scalar tensor.
24 | template <typename T> 25 | struct CpuScalarTensor { 26 | __device__ T& operator[](int) { 27 | return data; 28 | }; 29 | 30 | T data; 31 | }; 32 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/type_traits.cu: -------------------------------------------------------------------------------- 1 | // Type trait utils 2 | template <typename Type, bool is_volatile> 3 | struct MaybeVolatile; 4 | 5 | template <typename Type> 6 | struct MaybeVolatile<Type, true> { 7 | using type = volatile Type; 8 | }; 9 | 10 | template <typename Type> 11 | struct MaybeVolatile<Type, false> { 12 | using type = Type; 13 | }; 14 | 15 | template <typename... Types> 16 | struct TypeList {}; 17 | 18 | template <int idx, typename T, typename... Types> 19 | struct TypeSelector { 20 | using type = typename TypeSelector<idx - 1, Types...>::type; 21 | }; 22 | 23 | template <typename T, typename... Types> 24 | struct TypeSelector<0, T, Types...> { 25 | using type = T; 26 | }; 27 | 28 | template <typename T0, typename T1> 29 | struct IsSameType { 30 | static constexpr bool value = false; 31 | }; 32 | 33 | template <typename T0> 34 | struct IsSameType<T0, T0> { 35 | static constexpr bool value = true; 36 | }; 37 | 38 | template <typename T> 39 | struct IsPointerType { 40 | static constexpr bool value = false; 41 | }; 42 | 43 | template <typename T> 44 | struct IsPointerType<T*> { 45 | static constexpr bool value = true; 46 | }; 47 | -------------------------------------------------------------------------------- /third_party/sleef.bzl: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | # This macro provides for generating both "sleef" and 4 | # "sleefdet" libraries for a given set of code. The difference is 5 | # that the "det" libraries get compiled with "-DDETERMINISTIC=1". 6 | 7 | def sleef_cc_library(name, copts, **kwargs): 8 | cc_library( 9 | name = name, 10 | copts = copts, 11 | **kwargs 12 | ) 13 | 14 | prefix = "sleef" 15 | if not name.startswith(prefix): 16 | fail("name {} does not start with {}".format(repr(name), repr(prefix))) 17 | 18 | cc_library( 19 | name = name.replace(prefix, prefix + "det", 1), 20 | copts = copts + ["-DDETERMINISTIC=1"], 21 | **kwargs 22 | ) 23 | -------------------------------------------------------------------------------- /third_party/tensorflow_cuda_bazel_build/cuda/build_defs.bzl: -------------------------------------------------------------------------------- 1 | # Macros for building CUDA code. 2 | def if_cuda(if_true, if_false = []): 3 | """Shorthand for select()'ing on whether we're building with CUDA. 4 | 5 | Returns a select statement which evaluates to if_true if we're building 6 | with CUDA enabled. Otherwise, the select statement evaluates to if_false. 7 | 8 | """ 9 | return select({ 10 | "@local_config_cuda//cuda:using_clang": if_true, 11 | "@local_config_cuda//cuda:using_nvcc": if_true, 12 | "//conditions:default": if_false, 13 | }) 14 | 15 | def cuda_default_copts(): 16 | """Default options for all CUDA compilations.""" 17 | return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + []) 18 | 19 | def cuda_is_configured(): 20 | """Returns true if CUDA was enabled during the configure process.""" 21 | return True 22 | 23 | def if_cuda_is_configured(x): 24 | """Tests if CUDA was enabled during the configure process. 25 | 26 | Unlike if_cuda(), this does not require that we are building with 27 | --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries.
28 | """ 29 | if cuda_is_configured(): 30 | return x 31 | return [] 32 | -------------------------------------------------------------------------------- /third_party/valgrind-headers/README.md: -------------------------------------------------------------------------------- 1 | This folder contains 2 Valgrind headers, downloaded from 2 | https://sourceware.org/git/?p=valgrind.git;a=blob;f=callgrind/callgrind.h;hb=HEAD 3 | https://sourceware.org/git/?p=valgrind.git;a=blob;f=include/valgrind.h;hb=HEAD 4 | 5 | 6 | -------------------------------------------------------------------------------- /torchgen/BUCK.oss: -------------------------------------------------------------------------------- 1 | python_library( 2 | name = "torchgen", 3 | srcs = glob( 4 | ["**/*.py"], 5 | ), 6 | base_module = "torchgen", 7 | visibility = ["PUBLIC"], 8 | deps = [ 9 | "//third_party:pyyaml", 10 | "//third_party:typing-extensions", 11 | ], 12 | ) 13 | 14 | python_binary( 15 | name = "gen", 16 | main_module = "torchgen.gen", 17 | visibility = [ 18 | "PUBLIC", 19 | ], 20 | deps = [ 21 | ":torchgen", 22 | ], 23 | ) 24 | -------------------------------------------------------------------------------- /torchgen/BUILD.bazel: -------------------------------------------------------------------------------- 1 | load("//:tools/bazel.bzl", "rules") 2 | load(":build.bzl", "define_targets") 3 | 4 | define_targets(rules = rules) 5 | -------------------------------------------------------------------------------- /torchgen/__init__.py: -------------------------------------------------------------------------------- 1 | """torchgen 2 | 3 | This module contains codegeneration utilities for PyTorch. It is used to 4 | build PyTorch from source, but may also be used for out-of-tree projects 5 | that extend PyTorch. 6 | 7 | Note well that we provide no BC guarantees for torchgen. If you're interested 8 | in using torchgen and want the PyTorch team to be aware, please reach out 9 | on GitHub. 10 | """ 11 | -------------------------------------------------------------------------------- /torchgen/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/api/__init__.py -------------------------------------------------------------------------------- /torchgen/api/meta.py: -------------------------------------------------------------------------------- 1 | from torchgen.model import NativeFunctionsGroup 2 | 3 | # Follows dispatcher calling convention, but: 4 | # - Mutable arguments not allowed. Meta functions are always 5 | # written in functional form. 
Look at FunctionSchema.signature() 6 | # - No tensor returns; instead we return a TensorMeta describing 7 | # the tensor in question 8 | 9 | 10 | def name(g: NativeFunctionsGroup) -> str: 11 | # use the overload name from the functional version 12 | return str(g.functional.func.name).replace(".", "_") 13 | -------------------------------------------------------------------------------- /torchgen/api/types/__init__.py: -------------------------------------------------------------------------------- 1 | from .types import * 2 | from .types_base import * 3 | from .signatures import * # isort:skip 4 | -------------------------------------------------------------------------------- /torchgen/build.bzl: -------------------------------------------------------------------------------- 1 | def define_targets(rules): 2 | rules.py_library( 3 | name = "torchgen", 4 | srcs = rules.glob(["**/*.py"]), 5 | visibility = ["//visibility:public"], 6 | deps = [ 7 | rules.requirement("PyYAML"), 8 | rules.requirement("typing-extensions"), 9 | ], 10 | ) 11 | 12 | rules.py_binary( 13 | name = "gen", 14 | srcs = [":torchgen"], 15 | visibility = ["//visibility:public"], 16 | ) 17 | 18 | rules.py_binary( 19 | name = "gen_executorch", 20 | srcs = [":torchgen"], 21 | visibility = ["//visibility:public"], 22 | ) 23 | -------------------------------------------------------------------------------- /torchgen/dest/__init__.py: -------------------------------------------------------------------------------- 1 | from .lazy_ir import ( 2 | generate_non_native_lazy_ir_nodes as generate_non_native_lazy_ir_nodes, 3 | GenLazyIR as GenLazyIR, 4 | GenLazyNativeFuncDefinition as GenLazyNativeFuncDefinition, 5 | GenLazyShapeInferenceDefinition as GenLazyShapeInferenceDefinition, 6 | ) 7 | from .native_functions import ( 8 | compute_native_function_declaration as compute_native_function_declaration, 9 | ) 10 | from .register_dispatch_key import ( 11 | gen_registration_headers as gen_registration_headers, 12 | gen_registration_helpers as gen_registration_helpers, 13 | RegisterDispatchKey as RegisterDispatchKey, 14 | ) 15 | from .ufunc import ( 16 | compute_ufunc_cpu as compute_ufunc_cpu, 17 | compute_ufunc_cpu_kernel as compute_ufunc_cpu_kernel, 18 | compute_ufunc_cuda as compute_ufunc_cuda, 19 | ) 20 | -------------------------------------------------------------------------------- /torchgen/executorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/executorch/__init__.py -------------------------------------------------------------------------------- /torchgen/executorch/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/executorch/api/__init__.py -------------------------------------------------------------------------------- /torchgen/executorch/api/types/__init__.py: -------------------------------------------------------------------------------- 1 | from .types import * 2 | from .signatures import * # isort:skip 3 | -------------------------------------------------------------------------------- /torchgen/operator_versions/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/operator_versions/__init__.py -------------------------------------------------------------------------------- /torchgen/operator_versions/gen_mobile_upgraders_constant.py: -------------------------------------------------------------------------------- 1 | MOBILE_UPGRADERS_HEADER_DESCRIPTION = """/** 2 | * @generated 3 | * This is an auto-generated file. Please do not modify it by hand. 4 | * To re-generate, please run: 5 | * cd ~/pytorch && python torchgen/operator_versions/gen_mobile_upgraders.py 6 | */ 7 | """ 8 | -------------------------------------------------------------------------------- /torchgen/selective_build/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/selective_build/__init__.py -------------------------------------------------------------------------------- /torchgen/static_runtime/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/static_runtime/__init__.py --------------------------------------------------------------------------------
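To make the overload-naming rule in torchgen/api/meta.py above concrete, here is a minimal sketch of the same transformation; the operator names below are illustrative examples, not taken from this repository:

```
# meta.name() takes the functional overload's schema name and replaces
# "." with "_", producing an identifier that is safe to use in C++.
def meta_style_name(schema_name: str) -> str:
    return schema_name.replace(".", "_")

assert meta_style_name("add.Tensor") == "add_Tensor"  # overloaded name
assert meta_style_name("relu") == "relu"              # no overload suffix
```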