├── .bazelrc ├── .bazelversion ├── .buckconfig.oss ├── .ci ├── caffe2 │ ├── README.md │ ├── common.sh │ └── test.sh ├── docker │ ├── README.md │ ├── android │ │ ├── AndroidManifest.xml │ │ └── build.gradle │ ├── build.sh │ ├── build_docker.sh │ ├── centos-rocm │ │ └── Dockerfile │ ├── ci_commit_pins │ │ ├── triton-rocm.txt │ │ └── triton.txt │ ├── common │ │ ├── common_utils.sh │ │ ├── install_android.sh │ │ ├── install_base.sh │ │ ├── install_cache.sh │ │ ├── install_clang.sh │ │ ├── install_cmake.sh │ │ ├── install_conda.sh │ │ ├── install_cudnn.sh │ │ ├── install_db.sh │ │ ├── install_devtoolset.sh │ │ ├── install_docs_reqs.sh │ │ ├── install_gcc.sh │ │ ├── install_glibc.sh │ │ ├── install_jni.sh │ │ ├── install_lcov.sh │ │ ├── install_linter.sh │ │ ├── install_ninja.sh │ │ ├── install_onnx.sh │ │ ├── install_openmpi.sh │ │ ├── install_openssl.sh │ │ ├── install_protobuf.sh │ │ ├── install_rocm.sh │ │ ├── install_rocm_magma.sh │ │ ├── install_swiftshader.sh │ │ ├── install_thrift.sh │ │ ├── install_triton.sh │ │ ├── install_ucc.sh │ │ ├── install_user.sh │ │ ├── install_vision.sh │ │ └── install_vulkan_sdk.sh │ ├── java │ │ └── jni.h │ ├── linter │ │ └── Dockerfile │ ├── requirements-ci.txt │ ├── triton_version.txt │ ├── ubuntu-cuda │ │ └── Dockerfile │ ├── ubuntu-rocm │ │ ├── .gitignore │ │ └── Dockerfile │ └── ubuntu │ │ └── Dockerfile ├── onnx │ ├── README.md │ ├── common.sh │ └── test.sh └── pytorch │ ├── .shellcheckrc │ ├── README.md │ ├── build-asan.sh │ ├── build-mobile.sh │ ├── build.sh │ ├── codegen-test.sh │ ├── common-build.sh │ ├── common.sh │ ├── common_utils.sh │ ├── create_test_cert.py │ ├── docker-build-test.sh │ ├── docs-test.sh │ ├── fake_numpy │ └── numpy.py │ ├── macos-build-test.sh │ ├── macos-build.sh │ ├── macos-common.sh │ ├── macos-test.sh │ ├── multigpu-test.sh │ ├── perf_test │ ├── common.sh │ ├── compare_with_baseline.py │ ├── get_stats.py │ ├── test_cpu_speed_mini_sequence_labeler.sh │ ├── test_cpu_speed_mnist.sh │ ├── test_cpu_speed_torch.sh │ ├── test_cpu_speed_torch_tensor.sh │ ├── test_gpu_speed_cudnn_lstm.sh │ ├── test_gpu_speed_lstm.sh │ ├── test_gpu_speed_mlstm.sh │ ├── test_gpu_speed_mnist.sh │ ├── test_gpu_speed_word_language_model.sh │ └── update_commit_hash.py │ ├── print_sccache_log.py │ ├── run_glootls_test.sh │ ├── short-perf-test-cpu.sh │ ├── short-perf-test-gpu.sh │ ├── test.sh │ ├── win-build.sh │ ├── win-test-helpers │ ├── build_pytorch.bat │ ├── choose_runtime_cuda_version.bat │ ├── installation-helpers │ │ ├── activate_miniconda3.bat │ │ ├── install_magma.bat │ │ ├── install_mkl.bat │ │ └── install_sccache.bat │ ├── run_python_nn_smoketests.py │ ├── setup_pytorch_env.bat │ ├── test_custom_backend.bat │ ├── test_custom_script_ops.bat │ ├── test_distributed.bat │ ├── test_libtorch.bat │ ├── test_python_jit_legacy.bat │ └── test_python_shard.bat │ └── win-test.sh ├── .clang-format ├── .clang-tidy ├── .cmakelintrc ├── .coveragerc ├── .ctags.d └── pytorch.ctags ├── .dockerignore ├── .flake8 ├── .git-blame-ignore-revs ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE.md ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── ci-sev.md │ ├── config.yml │ ├── disable-ci-jobs.md │ ├── documentation.yml │ ├── feature-request.yml │ └── pt2-bug-report.yml ├── PULL_REQUEST_TEMPLATE.md ├── actionlint.yaml ├── actions │ ├── build-android │ │ └── action.yml │ ├── calculate-docker-image │ │ └── action.yml │ ├── checkout-pytorch │ │ └── action.yml │ ├── chown-workspace │ │ └── action.yml │ ├── diskspace-cleanup │ │ └── action.yml │ ├── download-build-artifacts │ │ 
└── action.yml │ ├── filter-test-configs │ │ └── action.yml │ ├── get-workflow-job-id │ │ └── action.yml │ ├── setup-linux │ │ └── action.yml │ ├── setup-rocm │ │ └── action.yml │ ├── setup-win │ │ └── action.yml │ ├── teardown-rocm │ │ └── action.yml │ ├── teardown-win │ │ └── action.yml │ ├── test-pytorch-binary │ │ └── action.yml │ └── upload-test-artifacts │ │ └── action.yml ├── auto_request_review.yml ├── ci_commit_pins │ ├── audio.txt │ ├── huggingface.txt │ ├── multipy.txt │ ├── text.txt │ ├── timm.txt │ ├── torchbench.txt │ ├── triton.txt │ ├── vision.txt │ └── xla.txt ├── labeler.yml ├── merge_rules.yaml ├── pytorch-circleci-labels.yml ├── pytorch-probot.yml ├── regenerate.sh ├── requirements-gha-cache.txt ├── requirements │ ├── README.md │ ├── conda-env-Linux-X64 │ ├── conda-env-iOS │ ├── conda-env-macOS-ARM64 │ ├── conda-env-macOS-X64 │ ├── pip-requirements-iOS.txt │ ├── pip-requirements-macOS.txt │ └── regenerate-requirements.txt ├── scripts │ ├── README.md │ ├── build_triton_wheel.py │ ├── check_labels.py │ ├── collect_ciflow_labels.py │ ├── comment_on_pr.py │ ├── convert_lintrunner_annotations_to_github.py │ ├── ensure_actions_will_cancel.py │ ├── export_pytorch_labels.py │ ├── fetch_latest_green_commit.py │ ├── filter_test_configs.py │ ├── generate_binary_build_matrix.py │ ├── generate_ci_workflows.py │ ├── generate_pytorch_version.py │ ├── get_workflow_job_id.py │ ├── github_utils.py │ ├── gitutils.py │ ├── gql_mocks.json │ ├── kill_active_ssh_sessions.ps1 │ ├── label_utils.py │ ├── lint_native_functions.py │ ├── on_cancel_merge.py │ ├── parse_ref.py │ ├── pr-sanity-check.sh │ ├── report_git_status.sh │ ├── rockset_mocks.json │ ├── run_torchbench.py │ ├── stop_runner_service.sh │ ├── test_check_labels.py │ ├── test_fetch_latest_green_commit.py │ ├── test_filter_test_configs.py │ ├── test_gitutils.py │ ├── test_label_utils.py │ ├── test_trymerge.py │ ├── test_tryrebase.py │ ├── trymerge.py │ ├── trymerge_explainer.py │ ├── tryrebase.py │ ├── update_commit_hashes.py │ └── wait_for_ssh_to_drain.ps1 └── templates │ ├── common.yml.j2 │ ├── linux_binary_build_workflow.yml.j2 │ ├── macos_binary_build_workflow.yml.j2 │ ├── upload.yml.j2 │ └── windows_binary_build_workflow.yml.j2 ├── .gitignore ├── .gitmodules ├── .isort.cfg ├── .lintrunner.toml ├── .lldbinit ├── BUILD.bazel ├── LICENSE ├── README.md ├── SECURITY.md ├── benchmarks ├── README.md ├── compare-fastrnn-results.py ├── compare.sh ├── cpp │ ├── CMakeLists.txt │ ├── convolution.cpp │ └── tensorexpr │ │ ├── CMakeLists.txt │ │ ├── bench_approx.cpp │ │ ├── bench_batchnorm.cpp │ │ ├── bench_compile.cpp │ │ ├── bench_concat.cpp │ │ ├── bench_fuser_overhead.cpp │ │ ├── bench_gemm.cpp │ │ ├── bench_kernels.cpp │ │ ├── bench_ops.py │ │ ├── bench_parallel.cpp │ │ ├── bench_prefix_sum.cpp │ │ ├── bench_reduce.cpp │ │ ├── bench_signed_log1p.cpp │ │ └── main.cpp ├── distributed │ ├── ddp │ │ ├── README.md │ │ ├── benchmark.py │ │ └── diff.py │ ├── pipeline │ │ ├── benchmark_dataset.py │ │ └── pipe.py │ └── rpc │ │ ├── parameter_server │ │ ├── README.md │ │ ├── configurations │ │ │ ├── data_configurations.json │ │ │ └── model_configurations.json │ │ ├── data │ │ │ ├── DummyData.py │ │ │ └── __init__.py │ │ ├── launcher.py │ │ ├── metrics │ │ │ ├── CPUMetric.py │ │ │ ├── CUDAMetric.py │ │ │ ├── MetricBase.py │ │ │ ├── MetricsLogger.py │ │ │ └── ProcessedMetricsPrinter.py │ │ ├── models │ │ │ ├── DummyModel.py │ │ │ └── __init__.py │ │ ├── server │ │ │ ├── __init__.py │ │ │ └── server.py │ │ ├── trainer │ │ │ ├── __init__.py │ │ │ ├── 
criterions.py │ │ │ ├── ddp_models.py │ │ │ ├── hook_states.py │ │ │ ├── hooks.py │ │ │ ├── iteration_steps.py │ │ │ ├── preprocess_data.py │ │ │ └── trainer.py │ │ └── utils.py │ │ └── rl │ │ ├── README.md │ │ ├── agent.py │ │ ├── coordinator.py │ │ ├── launcher.py │ │ └── observer.py ├── dynamo │ ├── Makefile │ ├── README.md │ ├── __init__.py │ ├── all_torchbench_models_list.txt │ ├── benchmarks.py │ ├── check_accuracy.py │ ├── check_csv.py │ ├── check_graph_breaks.py │ ├── check_hf_bert_perf_csv.py │ ├── check_memory_compression_ratio.py │ ├── ci_expected_accuracy │ │ ├── inductor_huggingface_dynamic_inference.csv │ │ ├── inductor_huggingface_dynamic_training.csv │ │ ├── inductor_huggingface_inference.csv │ │ ├── inductor_huggingface_training.csv │ │ ├── inductor_timm_dynamic_inference.csv │ │ ├── inductor_timm_dynamic_training.csv │ │ ├── inductor_timm_inference.csv │ │ ├── inductor_timm_training.csv │ │ ├── inductor_torchbench_dynamic_inference.csv │ │ ├── inductor_torchbench_dynamic_training.csv │ │ ├── inductor_torchbench_inference.csv │ │ ├── inductor_torchbench_training.csv │ │ └── update_expected.py │ ├── combine_csv.py │ ├── common.py │ ├── dist_util.py │ ├── distributed.py │ ├── expected_ci_perf_inductor_torchbench.csv │ ├── huggingface.py │ ├── huggingface_models_list.txt │ ├── huggingface_models_list_cpu.txt │ ├── microbenchmarks │ │ ├── __init__.py │ │ ├── bench_mm_fusion.py │ │ ├── benchmark_helper.py │ │ ├── inductor_bmm.py │ │ ├── inductor_mm.py │ │ ├── matmul_relu.py │ │ ├── microbench.py │ │ ├── model.py │ │ ├── operator_inp_logs │ │ │ ├── hf_train │ │ │ │ ├── AlbertForMaskedLM_training.txt │ │ │ │ ├── AlbertForQuestionAnswering_training.txt │ │ │ │ ├── AllenaiLongformerBase_training.txt │ │ │ │ ├── BartForCausalLM_training.txt │ │ │ │ ├── BartForConditionalGeneration_training.txt │ │ │ │ ├── BertForMaskedLM_training.txt │ │ │ │ ├── BertForQuestionAnswering_training.txt │ │ │ │ ├── BigBird_training.txt │ │ │ │ ├── BlenderbotSmallForCausalLM_training.txt │ │ │ │ ├── BlenderbotSmallForConditionalGeneration_training.txt │ │ │ │ ├── CamemBert_training.txt │ │ │ │ ├── DebertaForMaskedLM_training.txt │ │ │ │ ├── DebertaForQuestionAnswering_training.txt │ │ │ │ ├── DebertaV2ForMaskedLM_training.txt │ │ │ │ ├── DebertaV2ForQuestionAnswering_training.txt │ │ │ │ ├── DistilBertForMaskedLM_training.txt │ │ │ │ ├── DistilBertForQuestionAnswering_training.txt │ │ │ │ ├── DistillGPT2_training.txt │ │ │ │ ├── ElectraForCausalLM_training.txt │ │ │ │ ├── ElectraForQuestionAnswering_training.txt │ │ │ │ ├── GPT2ForSequenceClassification_training.txt │ │ │ │ ├── GPTNeoForCausalLM_training.txt │ │ │ │ ├── GPTNeoForSequenceClassification_training.txt │ │ │ │ ├── GoogleFnet_training.txt │ │ │ │ ├── LayoutLMForMaskedLM_training.txt │ │ │ │ ├── LayoutLMForSequenceClassification_training.txt │ │ │ │ ├── M2M100ForConditionalGeneration_training.txt │ │ │ │ ├── MBartForCausalLM_training.txt │ │ │ │ ├── MBartForConditionalGeneration_training.txt │ │ │ │ ├── MegatronBertForCausalLM_training.txt │ │ │ │ ├── MegatronBertForQuestionAnswering_training.txt │ │ │ │ ├── MobileBertForMaskedLM_training.txt │ │ │ │ ├── MobileBertForQuestionAnswering_training.txt │ │ │ │ ├── OPTForCausalLM_training.txt │ │ │ │ ├── PLBartForCausalLM_training.txt │ │ │ │ ├── PLBartForConditionalGeneration_training.txt │ │ │ │ ├── PegasusForCausalLM_training.txt │ │ │ │ ├── PegasusForConditionalGeneration_training.txt │ │ │ │ ├── RobertaForCausalLM_training.txt │ │ │ │ ├── RobertaForQuestionAnswering_training.txt │ │ │ │ ├── 
Speech2Text2ForCausalLM_training.txt │ │ │ │ ├── TrOCRForCausalLM_training.txt │ │ │ │ ├── XGLMForCausalLM_training.txt │ │ │ │ ├── XLNetLMHeadModel_training.txt │ │ │ │ └── YituTechConvBert_training.txt │ │ │ ├── timm_train │ │ │ │ ├── adv_inception_v3_training.txt │ │ │ │ ├── beit_base_patch16_224_training.txt │ │ │ │ ├── botnet26t_256_training.txt │ │ │ │ ├── cait_m36_384_training.txt │ │ │ │ ├── coat_lite_mini_training.txt │ │ │ │ ├── convmixer_768_32_training.txt │ │ │ │ ├── convnext_base_training.txt │ │ │ │ ├── crossvit_9_240_training.txt │ │ │ │ ├── cspdarknet53_training.txt │ │ │ │ ├── deit_base_distilled_patch16_224_training.txt │ │ │ │ ├── densenet121_training.txt │ │ │ │ ├── dla102_training.txt │ │ │ │ ├── dm_nfnet_f0_training.txt │ │ │ │ ├── dpn107_training.txt │ │ │ │ ├── eca_botnext26ts_256_training.txt │ │ │ │ ├── eca_halonext26ts_training.txt │ │ │ │ ├── ecaresnet101d_training.txt │ │ │ │ ├── ese_vovnet19b_dw_training.txt │ │ │ │ ├── fbnetc_100_training.txt │ │ │ │ ├── fbnetv3_b_training.txt │ │ │ │ ├── gernet_l_training.txt │ │ │ │ ├── ghostnet_100_training.txt │ │ │ │ ├── gluon_inception_v3_training.txt │ │ │ │ ├── gluon_senet154_training.txt │ │ │ │ ├── gluon_xception65_training.txt │ │ │ │ ├── gmixer_24_224_training.txt │ │ │ │ ├── gmlp_s16_224_training.txt │ │ │ │ ├── hardcorenas_a_training.txt │ │ │ │ ├── hrnet_w18_training.txt │ │ │ │ ├── inception_v3_training.txt │ │ │ │ ├── jx_nest_base_training.txt │ │ │ │ ├── lcnet_050_training.txt │ │ │ │ ├── legacy_senet154_training.txt │ │ │ │ ├── levit_128_training.txt │ │ │ │ ├── mixer_b16_224_training.txt │ │ │ │ ├── mixnet_l_training.txt │ │ │ │ ├── mnasnet_100_training.txt │ │ │ │ ├── mobilenetv2_100_training.txt │ │ │ │ ├── mobilenetv3_large_100_training.txt │ │ │ │ ├── mobilevit_s_training.txt │ │ │ │ ├── nasnetalarge_training.txt │ │ │ │ ├── nfnet_l0_training.txt │ │ │ │ ├── pit_b_224_training.txt │ │ │ │ ├── pnasnet5large_training.txt │ │ │ │ ├── poolformer_m36_training.txt │ │ │ │ ├── regnety_002_training.txt │ │ │ │ ├── repvgg_a2_training.txt │ │ │ │ ├── res2net101_26w_4s_training.txt │ │ │ │ ├── res2net50_14w_8s_training.txt │ │ │ │ ├── res2next50_training.txt │ │ │ │ ├── resmlp_12_224_training.txt │ │ │ │ ├── resnest101e_training.txt │ │ │ │ ├── resnet18_training.txt │ │ │ │ ├── rexnet_100_training.txt │ │ │ │ ├── sebotnet33ts_256_training.txt │ │ │ │ ├── selecsls42b_training.txt │ │ │ │ ├── spnasnet_100_training.txt │ │ │ │ ├── swin_base_patch4_window7_224_training.txt │ │ │ │ ├── swsl_resnext101_32x16d_training.txt │ │ │ │ ├── tf_efficientnet_b0_training.txt │ │ │ │ ├── tf_mixnet_l_training.txt │ │ │ │ ├── tinynet_a_training.txt │ │ │ │ ├── tnt_s_patch16_224_training.txt │ │ │ │ ├── twins_pcpvt_base_training.txt │ │ │ │ ├── visformer_small_training.txt │ │ │ │ ├── vit_base_patch16_224_training.txt │ │ │ │ └── volo_d1_224_training.txt │ │ │ └── torchbench_train │ │ │ │ ├── BERT_pytorch_training.txt │ │ │ │ ├── Background_Matting_training.txt │ │ │ │ ├── LearningToPaint_training.txt │ │ │ │ ├── Super_SloMo_training.txt │ │ │ │ ├── alexnet_training.txt │ │ │ │ ├── attention_is_all_you_need_pytorch_training.txt │ │ │ │ ├── dcgan_training.txt │ │ │ │ ├── densenet121_training.txt │ │ │ │ ├── fambench_dlrm_training.txt │ │ │ │ ├── fastNLP_Bert_training.txt │ │ │ │ ├── hf_Albert_training.txt │ │ │ │ ├── hf_Bart_training.txt │ │ │ │ ├── hf_Bert_training.txt │ │ │ │ ├── hf_BigBird_training.txt │ │ │ │ ├── hf_DistilBert_training.txt │ │ │ │ ├── hf_GPT2_training.txt │ │ │ │ ├── hf_Longformer_training.txt │ │ │ │ ├── 
maml_omniglot_training.txt │ │ │ │ ├── mnasnet1_0_training.txt │ │ │ │ ├── mobilenet_v2_training.txt │ │ │ │ ├── mobilenet_v3_large_training.txt │ │ │ │ ├── nvidia_deeprecommender_training.txt │ │ │ │ ├── pytorch_CycleGAN_and_pix2pix_training.txt │ │ │ │ ├── pytorch_stargan_training.txt │ │ │ │ ├── pytorch_struct_training.txt │ │ │ │ ├── pytorch_unet_training.txt │ │ │ │ ├── resnet18_training.txt │ │ │ │ ├── resnet50_training.txt │ │ │ │ ├── resnext50_32x4d_training.txt │ │ │ │ ├── shufflenet_v2_x1_0_training.txt │ │ │ │ ├── speech_transformer_training.txt │ │ │ │ ├── squeezenet1_1_training.txt │ │ │ │ ├── timm_efficientdet_training.txt │ │ │ │ ├── timm_efficientnet_training.txt │ │ │ │ ├── timm_nfnet_training.txt │ │ │ │ ├── timm_regnet_training.txt │ │ │ │ ├── timm_resnest_training.txt │ │ │ │ ├── timm_vision_transformer_training.txt │ │ │ │ ├── timm_vovnet_training.txt │ │ │ │ ├── tts_angular_training.txt │ │ │ │ ├── vgg16_training.txt │ │ │ │ ├── vision_maskrcnn_training.txt │ │ │ │ └── yolov3_training.txt │ │ ├── operator_inp_utils.py │ │ ├── operatorbench.py │ │ └── utils.py │ ├── parse_logs.py │ ├── run_all.sh │ ├── run_delta.sh │ ├── runner.py │ ├── summarize_perf.py │ ├── test.py │ ├── timm_models.py │ ├── timm_models_list.txt │ ├── timm_models_list_cpu.txt │ ├── torchbench.py │ ├── torchbench_models_list.txt │ ├── torchbench_models_list_cpu.txt │ └── training_loss.py ├── fastrnns │ ├── README.md │ ├── __init__.py │ ├── bench.py │ ├── cells.py │ ├── conftest.py │ ├── custom_lstms.py │ ├── factory.py │ ├── fuser.py │ ├── profile.py │ ├── runner.py │ ├── scratch.py │ ├── test.py │ └── test_bench.py ├── framework_overhead_benchmark │ ├── C2Module.py │ ├── SimpleAddModule.py │ ├── framework_overhead_benchmark.py │ ├── pt_wrapper_module.py │ └── utils.py ├── functional_autograd_benchmark │ ├── README.md │ ├── audio_text_models.py │ ├── compare.py │ ├── functional_autograd_benchmark.py │ ├── ppl_models.py │ ├── torchaudio_models.py │ ├── torchvision_models.py │ ├── utils.py │ └── vision_models.py ├── fuser │ ├── plot_speedups.py │ └── run_benchmarks.py ├── instruction_counts │ ├── README.md │ ├── applications │ │ ├── __init__.py │ │ └── ci.py │ ├── core │ │ ├── __init__.py │ │ ├── api.py │ │ ├── expand.py │ │ ├── types.py │ │ └── utils.py │ ├── definitions │ │ ├── __init__.py │ │ ├── setup.py │ │ └── standard.py │ ├── execution │ │ ├── __init__.py │ │ ├── runner.py │ │ └── work.py │ ├── main.py │ └── worker │ │ ├── __init__.py │ │ └── main.py ├── nested │ └── nested_bmm_bench.py ├── operator_benchmark │ ├── README.md │ ├── __init__.py │ ├── benchmark_all_other_test.py │ ├── benchmark_all_quantized_test.py │ ├── benchmark_all_test.py │ ├── benchmark_caffe2.py │ ├── benchmark_core.py │ ├── benchmark_pytorch.py │ ├── benchmark_runner.py │ ├── benchmark_test_generator.py │ ├── benchmark_utils.py │ ├── c2 │ │ ├── __init__.py │ │ ├── add_test.py │ │ ├── batch_box_cox_test.py │ │ ├── batch_gather_test.py │ │ ├── clip_ranges_test.py │ │ ├── concat_test.py │ │ ├── matmul_test.py │ │ ├── quantile_op_test.py │ │ └── replace_nan_test.py │ ├── common │ │ ├── __init__.py │ │ ├── repeat_benchmark.py │ │ └── tests │ │ │ ├── add_ops_list_test.py │ │ │ ├── c2_cpu_gpu_forward_backward_test.py │ │ │ ├── jit_forward_test.py │ │ │ ├── pt_backward_test.py │ │ │ ├── pt_configs_list_test.py │ │ │ ├── pt_cpu_gpu_forward_backward_test.py │ │ │ └── random_sample_test.py │ ├── operator_benchmark.py │ ├── pt │ │ ├── __init__.py │ │ ├── add_test.py │ │ ├── ao_sparsifier_test.py │ │ ├── as_strided_test.py │ │ ├── 
batchnorm_test.py │ │ ├── binary_test.py │ │ ├── bmm_test.py │ │ ├── cat_test.py │ │ ├── channel_shuffle_test.py │ │ ├── chunk_test.py │ │ ├── clip_ranges_test.py │ │ ├── configs.py │ │ ├── conv_test.py │ │ ├── diag_test.py │ │ ├── embeddingbag_test.py │ │ ├── fill_test.py │ │ ├── gather_test.py │ │ ├── gelu_test.py │ │ ├── groupnorm_test.py │ │ ├── hardsigmoid_test.py │ │ ├── hardswish_test.py │ │ ├── index_select_test.py │ │ ├── instancenorm_test.py │ │ ├── interpolate_test.py │ │ ├── layernorm_test.py │ │ ├── linear_prepack_fp16_test.py │ │ ├── linear_test.py │ │ ├── linear_unpack_fp16_test.py │ │ ├── matmul_test.py │ │ ├── matrix_mult_test.py │ │ ├── nan_to_num_test.py │ │ ├── pool_test.py │ │ ├── qactivation_test.py │ │ ├── qarithmetic_test.py │ │ ├── qatembedding_ops_test.py │ │ ├── qbatchnorm_test.py │ │ ├── qcat_test.py │ │ ├── qcomparators_test.py │ │ ├── qconv_test.py │ │ ├── qembedding_bag_lookups_test.py │ │ ├── qembedding_pack_test.py │ │ ├── qembeddingbag_test.py │ │ ├── qgroupnorm_test.py │ │ ├── qinstancenorm_test.py │ │ ├── qinterpolate_test.py │ │ ├── qlayernorm_test.py │ │ ├── qlinear_test.py │ │ ├── qobserver_test.py │ │ ├── qpool_test.py │ │ ├── qrnn_test.py │ │ ├── qtensor_method_test.py │ │ ├── quantization_test.py │ │ ├── qunary_test.py │ │ ├── remainder_test.py │ │ ├── softmax_test.py │ │ ├── split_test.py │ │ ├── stack_test.py │ │ ├── sum_test.py │ │ ├── tensor_to_test.py │ │ └── unary_test.py │ └── pt_extension │ │ ├── cpp_extension_test.py │ │ ├── extension.cpp │ │ └── setup.py ├── overrides_benchmark │ ├── README.md │ ├── bench.py │ ├── common.py │ └── pyspybench.py ├── profiler_benchmark │ ├── profiler_bench.py │ └── resnet_memory_profiler.py ├── record_function_benchmark │ └── record_function_bench.py ├── serialization │ ├── nested_annotation_str.py │ └── simple_measurement.py ├── sparse │ ├── README.md │ ├── __init__.py │ ├── dlmc │ │ ├── README.md │ │ ├── __init__.py │ │ ├── matmul_bench.py │ │ ├── test.sh │ │ └── utils.py │ ├── spmm.py │ ├── spmv.py │ ├── test_csr.sh │ └── utils.py ├── static_runtime │ ├── CMakeLists.txt │ ├── deep_wide_pt.cc │ ├── deep_wide_pt.h │ ├── deep_wide_pt_bench.cc │ ├── test_cpu_fusion.cc │ ├── test_generated_ops.cc │ ├── test_static_module.cc │ ├── test_static_runtime.cc │ ├── test_utils.cc │ └── test_utils.h ├── tensorexpr │ ├── HowToRun.md │ ├── __main__.py │ ├── attention.py │ ├── benchmark.py │ ├── broadcast.py │ ├── concat.py │ ├── conv.py │ ├── elementwise.py │ ├── matmul.py │ ├── microbenchmarks.py │ ├── nnc.png │ ├── normalization.py │ ├── pooling.py │ ├── pt_engine.py │ ├── reduction.py │ ├── rnn_eltwise.py │ ├── softmax.py │ ├── swish.py │ └── tensor_engine.py ├── transformer │ ├── better_transformer_vs_mha_functional.py │ ├── sdp.py │ └── sdp_backwards.py └── upload_scribe.py ├── binaries ├── CMakeLists.txt ├── aot_model_compiler.cc ├── at_launch_benchmark.cc ├── bench_gen │ └── bench_gen.py ├── benchmark_args.h ├── benchmark_helper.cc ├── benchmark_helper.h ├── caffe2_benchmark.cc ├── compare_models_torch.cc ├── convert_and_benchmark.cc ├── convert_caffe_image_db.cc ├── convert_db.cc ├── convert_encoded_to_raw_leveldb.cc ├── convert_image_to_tensor.cc ├── core_overhead_benchmark.cc ├── core_overhead_benchmark_gpu.cc ├── db_throughput.cc ├── dump_operator_names.cc ├── inspect_gpu.cc ├── intra_inter_benchmark.cc ├── lite_interpreter_model_load.cc ├── load_benchmark_torch.cc ├── make_cifar_db.cc ├── make_image_db.cc ├── make_mnist_db.cc ├── optimize_for_mobile.cc ├── parallel_info.cc ├── predictor_verifier.cc ├── 
print_core_object_sizes_gpu.cc ├── print_registered_core_operators.cc ├── record_function_benchmark.cc ├── run_plan.cc ├── run_plan_mpi.cc ├── speed_benchmark.cc ├── speed_benchmark_torch.cc ├── split_db.cc ├── tsv_2_proto.cc ├── tutorial_blob.cc └── zmq_feeder.cc ├── third_party ├── BUCK.oss ├── BUILD ├── LICENSES_BUNDLED.txt ├── METADATA.bzl ├── README.md ├── build_bundled.py ├── cuda.BUILD ├── cudnn.BUILD ├── cutlass.BUILD ├── eigen.BUILD ├── fmt.BUILD ├── foxi.BUILD ├── generate-cpuinfo-wrappers.py ├── generate-xnnpack-wrappers.py ├── glog.buck.bzl ├── gloo.BUILD ├── ideep.BUILD ├── kineto.BUILD ├── kineto.buck.bzl ├── miniz-2.1.0 │ ├── BUILD.bazel │ ├── ChangeLog.md │ ├── LICENSE │ ├── examples │ │ ├── example1.c │ │ ├── example2.c │ │ ├── example3.c │ │ ├── example4.c │ │ ├── example5.c │ │ └── example6.c │ ├── miniz.c │ ├── miniz.h │ └── readme.md ├── mkl-dnn.BUILD ├── mkl.BUILD ├── mkl_headers.BUILD ├── nvfuser │ ├── CMakeLists.txt │ ├── benchmark │ │ ├── CMakeLists.txt │ │ ├── batch_norm_channels_first.cpp │ │ ├── batch_norm_channels_first_backward.cpp │ │ ├── batch_norm_channels_last.cpp │ │ ├── batch_norm_channels_last_backward.cpp │ │ ├── bert.cpp │ │ ├── broadcast.cpp │ │ ├── gelu_backward.cpp │ │ ├── heuristic_cache.cpp │ │ ├── heuristic_lookup.cpp │ │ ├── instance_norm.cpp │ │ ├── layer_norm.cpp │ │ ├── layer_norm_backward.cpp │ │ ├── lstm_cell.cpp │ │ ├── main.cpp │ │ ├── matmul.cpp │ │ ├── reduction.cpp │ │ ├── rms_norm.cpp │ │ ├── rms_norm_backward.cpp │ │ ├── scale_bias_relu.cpp │ │ ├── shape_inference.cpp │ │ ├── softmax.cpp │ │ ├── softmax_backward.cpp │ │ ├── softmax_dropout.cpp │ │ ├── timm.cpp │ │ ├── transpose.cpp │ │ ├── utils.cpp │ │ └── utils.h │ ├── csrc │ │ ├── arith.cpp │ │ ├── arith.h │ │ ├── codegen.cpp │ │ ├── codegen.h │ │ ├── compute_at.cpp │ │ ├── compute_at.h │ │ ├── compute_at_map.cpp │ │ ├── compute_at_map.h │ │ ├── contiguity.cpp │ │ ├── contiguity.h │ │ ├── disjoint_set.h │ │ ├── dispatch.cpp │ │ ├── dispatch.h │ │ ├── docs │ │ │ ├── .gitignore │ │ │ ├── documentation.h │ │ │ ├── fuser.doxygen │ │ │ ├── images │ │ │ │ └── ir_architecture.png │ │ │ └── main_page.md │ │ ├── dynamic_type.h │ │ ├── evaluator_common.cpp │ │ ├── evaluator_common.h │ │ ├── executor.cpp │ │ ├── executor.h │ │ ├── executor_kernel_arg.cpp │ │ ├── executor_kernel_arg.h │ │ ├── executor_launch_params.cpp │ │ ├── executor_launch_params.h │ │ ├── executor_utils.cpp │ │ ├── executor_utils.h │ │ ├── expr_evaluator.cpp │ │ ├── expr_evaluator.h │ │ ├── fusion.cpp │ │ ├── fusion.h │ │ ├── fusion_segmenter.cpp │ │ ├── fusion_segmenter.h │ │ ├── graph_fuser.cpp │ │ ├── grouped_reduction.cpp │ │ ├── grouped_reduction.h │ │ ├── index_compute.cpp │ │ ├── index_compute.h │ │ ├── inlining.cpp │ │ ├── inlining.h │ │ ├── instrumentation.cpp │ │ ├── instrumentation.h │ │ ├── ir_all_nodes.h │ │ ├── ir_base_nodes.cpp │ │ ├── ir_base_nodes.h │ │ ├── ir_builder.cpp │ │ ├── ir_builder.h │ │ ├── ir_cloner.cpp │ │ ├── ir_cloner.h │ │ ├── ir_container.cpp │ │ ├── ir_container.h │ │ ├── ir_graphviz.cpp │ │ ├── ir_graphviz.h │ │ ├── ir_interface_nodes.h │ │ ├── ir_internal_nodes.h │ │ ├── ir_iostream.cpp │ │ ├── ir_iostream.h │ │ ├── ir_nodes.cpp │ │ ├── ir_printer.h │ │ ├── ir_utils.cpp │ │ ├── ir_utils.h │ │ ├── iter_visitor.cpp │ │ ├── iter_visitor.h │ │ ├── kernel.cpp │ │ ├── kernel.h │ │ ├── kernel_cache.cpp │ │ ├── kernel_cache.h │ │ ├── kernel_expr_evaluator.cpp │ │ ├── kernel_expr_evaluator.h │ │ ├── kernel_ir.cpp │ │ ├── kernel_ir.h │ │ ├── kernel_ir_dispatch.cpp │ │ ├── kernel_ir_dispatch.h │ 
│ ├── lower2device.cpp │ │ ├── lower2device.h │ │ ├── lower_alias_memory.cpp │ │ ├── lower_alias_memory.h │ │ ├── lower_allocation.cpp │ │ ├── lower_allocation.h │ │ ├── lower_bank_conflict.cpp │ │ ├── lower_bank_conflict.h │ │ ├── lower_divisible_split.cpp │ │ ├── lower_divisible_split.h │ │ ├── lower_double_buffer.cpp │ │ ├── lower_double_buffer.h │ │ ├── lower_expr_sort.cpp │ │ ├── lower_expr_sort.h │ │ ├── lower_fused_reduction.cpp │ │ ├── lower_fused_reduction.h │ │ ├── lower_fusion_simplifier.cpp │ │ ├── lower_fusion_simplifier.h │ │ ├── lower_index.cpp │ │ ├── lower_index.h │ │ ├── lower_index_compute.cpp │ │ ├── lower_index_compute.h │ │ ├── lower_index_hoist.cpp │ │ ├── lower_index_hoist.h │ │ ├── lower_insert_syncs.cpp │ │ ├── lower_insert_syncs.h │ │ ├── lower_instrument.cpp │ │ ├── lower_instrument.h │ │ ├── lower_loops.cpp │ │ ├── lower_loops.h │ │ ├── lower_magic_zero.cpp │ │ ├── lower_magic_zero.h │ │ ├── lower_misaligned_vectorization.cpp │ │ ├── lower_misaligned_vectorization.h │ │ ├── lower_predicate.cpp │ │ ├── lower_predicate.h │ │ ├── lower_predicate_elimination.cpp │ │ ├── lower_predicate_elimination.h │ │ ├── lower_replace_size.cpp │ │ ├── lower_replace_size.h │ │ ├── lower_shift.cpp │ │ ├── lower_shift.h │ │ ├── lower_sync_information.cpp │ │ ├── lower_sync_information.h │ │ ├── lower_thread_predicate.cpp │ │ ├── lower_thread_predicate.h │ │ ├── lower_trivial_broadcast.cpp │ │ ├── lower_trivial_broadcast.h │ │ ├── lower_trivial_reductions.cpp │ │ ├── lower_trivial_reductions.h │ │ ├── lower_unroll.cpp │ │ ├── lower_unroll.h │ │ ├── lower_utils.cpp │ │ ├── lower_utils.h │ │ ├── lower_validation.cpp │ │ ├── lower_validation.h │ │ ├── lower_warp_reduce.cpp │ │ ├── lower_warp_reduce.h │ │ ├── manager.cpp │ │ ├── manager.h │ │ ├── maxinfo_propagator.cpp │ │ ├── maxinfo_propagator.h │ │ ├── mma_type.cpp │ │ ├── mma_type.h │ │ ├── mutator.cpp │ │ ├── mutator.h │ │ ├── non_divisible_split.cpp │ │ ├── non_divisible_split.h │ │ ├── ops │ │ │ ├── alias.cpp │ │ │ ├── alias.h │ │ │ ├── all_ops.h │ │ │ ├── composite.cpp │ │ │ ├── composite.h │ │ │ ├── normalization.cpp │ │ │ └── normalization.h │ │ ├── parallel_dimension_map.cpp │ │ ├── parallel_dimension_map.h │ │ ├── parallel_type_bitmap.cpp │ │ ├── parallel_type_bitmap.h │ │ ├── parser.cpp │ │ ├── parser.h │ │ ├── partial_split_map.cpp │ │ ├── partial_split_map.h │ │ ├── partition.cpp │ │ ├── partition.h │ │ ├── predicate_compute.cpp │ │ ├── predicate_compute.h │ │ ├── python_frontend │ │ │ ├── README.md │ │ │ ├── fusion_cache.cpp │ │ │ ├── fusion_cache.h │ │ │ ├── fusion_definition.cpp │ │ │ ├── fusion_definition.h │ │ │ ├── fusion_interface.cpp │ │ │ ├── fusion_interface.h │ │ │ ├── fusion_record.h │ │ │ ├── python_bindings.cpp │ │ │ ├── python_bindings.h │ │ │ ├── python_bindings_extension.cpp │ │ │ └── test │ │ │ │ ├── test_nvfuser_fusion_cache.cpp │ │ │ │ ├── test_nvfuser_fusion_definition.cpp │ │ │ │ └── test_nvfuser_fusion_record.cpp │ │ ├── register_interface.cpp │ │ ├── register_interface.h │ │ ├── root_domain_map.cpp │ │ ├── root_domain_map.h │ │ ├── scheduler │ │ │ ├── all_schedulers.h │ │ │ ├── compile_time_info.h │ │ │ ├── debug_utils.h │ │ │ ├── heuristic.h │ │ │ ├── matmul.cpp │ │ │ ├── matmul.h │ │ │ ├── mma_utils.cpp │ │ │ ├── mma_utils.h │ │ │ ├── normalization.cpp │ │ │ ├── normalization.h │ │ │ ├── pointwise.cpp │ │ │ ├── pointwise.h │ │ │ ├── pointwise_heuristic.h │ │ │ ├── pointwise_utils.cpp │ │ │ ├── pointwise_utils.h │ │ │ ├── reduction.cpp │ │ │ ├── reduction.h │ │ │ ├── reduction_heuristic.h │ │ │ 
├── reduction_utils.cpp │ │ │ ├── reduction_utils.h │ │ │ ├── registry.cpp │ │ │ ├── registry.h │ │ │ ├── transpose.cpp │ │ │ ├── transpose.h │ │ │ ├── transpose_heuristic.h │ │ │ ├── utils.cpp │ │ │ ├── utils.h │ │ │ ├── vectorize_helper.cpp │ │ │ └── vectorize_helper.h │ │ ├── tensor_view.cpp │ │ ├── transform_iter.cpp │ │ ├── transform_iter.h │ │ ├── transform_replay.cpp │ │ ├── transform_replay.h │ │ ├── transform_rfactor.cpp │ │ ├── transform_rfactor.h │ │ ├── transform_view.cpp │ │ ├── transform_view.h │ │ ├── type.cpp │ │ ├── type.h │ │ ├── type_inference.cpp │ │ ├── type_inference.h │ │ ├── type_promotion.cpp │ │ ├── type_promotion.h │ │ ├── utils.cpp │ │ ├── utils.h │ │ └── vectorization_info.h │ ├── examples │ │ ├── sinh_extension │ │ │ ├── README.md │ │ │ ├── main.cpp │ │ │ ├── setup.py │ │ │ └── test.py │ │ └── sinh_libtorch │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ └── main.cpp │ ├── python │ │ └── __init__.py │ ├── python_tests │ │ ├── __init__.py │ │ ├── test_dynamo.py │ │ ├── test_python_frontend.py │ │ └── test_torchscript.py │ ├── runtime │ │ ├── array.cu │ │ ├── array_rocm.cu │ │ ├── bf16_support.cu │ │ ├── bf16_support_rocm.cu │ │ ├── block_reduction.cu │ │ ├── block_sync_atomic.cu │ │ ├── block_sync_default.cu │ │ ├── block_sync_default_rocm.cu │ │ ├── broadcast.cu │ │ ├── fp16_support.cu │ │ ├── fused_reduction.cu │ │ ├── fused_welford_helper.cu │ │ ├── fused_welford_impl.cu │ │ ├── grid_broadcast.cu │ │ ├── grid_reduction.cu │ │ ├── grid_sync.cu │ │ ├── helpers.cu │ │ ├── index_utils.cu │ │ ├── memory.cu │ │ ├── random_numbers.cu │ │ ├── swizzle.cu │ │ ├── tensor.cu │ │ ├── tensorcore.cu │ │ ├── tuple.cu │ │ ├── type_traits.cu │ │ ├── warp.cu │ │ ├── warp_rocm.cu │ │ └── welford.cu │ ├── test │ │ ├── test_gpu1.cpp │ │ ├── test_gpu2.cpp │ │ ├── test_gpu3.cpp │ │ ├── test_gpu_fused_reduction.cpp │ │ ├── test_gpu_rng.cu │ │ ├── test_gpu_shift.cpp │ │ ├── test_gpu_tensor_factories.cpp │ │ ├── test_gpu_tensorcore.cpp │ │ ├── test_gpu_transpose.cpp │ │ ├── test_gpu_utils.cpp │ │ ├── test_gpu_validator.h │ │ ├── test_gpu_view.cpp │ │ └── test_utils.h │ └── tools │ │ └── stringify_file.py ├── onnx.BUILD ├── sleef.BUILD ├── sleef.bzl ├── substitution.bzl ├── tbb.BUILD ├── tbb.patch ├── tensorflow_cuda_bazel_build │ └── cuda │ │ └── build_defs.bzl ├── tensorpipe.BUILD ├── valgrind-headers │ ├── README.md │ ├── callgrind.h │ └── valgrind.h ├── xnnpack.buck.bzl ├── xnnpack_src_defs.bzl └── xnnpack_wrapper_defs.bzl └── torchgen ├── BUCK.oss ├── BUILD.bazel ├── __init__.py ├── api ├── __init__.py ├── autograd.py ├── cpp.py ├── dispatcher.py ├── functionalization.py ├── lazy.py ├── meta.py ├── native.py ├── python.py ├── structured.py ├── translate.py ├── types │ ├── __init__.py │ ├── signatures.py │ ├── types.py │ └── types_base.py ├── ufunc.py └── unboxing.py ├── build.bzl ├── code_template.py ├── context.py ├── decompositions └── gen_jit_decompositions.py ├── dest ├── __init__.py ├── lazy_ir.py ├── lazy_ts_lowering.py ├── native_functions.py ├── register_dispatch_key.py └── ufunc.py ├── executorch ├── __init__.py └── api │ ├── __init__.py │ ├── custom_ops.py │ ├── et_cpp.py │ ├── types │ ├── __init__.py │ ├── signatures.py │ └── types.py │ └── unboxing.py ├── gen.py ├── gen_backend_stubs.py ├── gen_executorch.py ├── gen_functionalization_type.py ├── gen_lazy_tensor.py ├── gen_vmap_plumbing.py ├── local.py ├── model.py ├── native_function_generation.py ├── operator_versions ├── __init__.py ├── gen_mobile_upgraders.py └── gen_mobile_upgraders_constant.py ├── 
selective_build ├── __init__.py ├── operator.py └── selector.py ├── shape_functions └── gen_jit_shape_functions.py ├── static_runtime ├── __init__.py ├── config.py ├── gen_static_runtime_ops.py └── generator.py └── utils.py /.bazelversion: -------------------------------------------------------------------------------- 1 | 6.1.1 2 |
-------------------------------------------------------------------------------- /.buckconfig.oss: -------------------------------------------------------------------------------- 1 | [pt] 2 | is_oss=1 3 | 4 | [buildfile] 5 | name = BUCK.oss 6 | includes = //tools/build_defs/select.bzl 7 | 8 | [repositories] 9 | bazel_skylib = third_party/bazel-skylib/ 10 | ovr_config = . 11 | 12 | [download] 13 | in_build = true 14 | 15 | [cxx] 16 | cxxflags = -std=c++17 17 | ldflags = -Wl,--no-undefined 18 | should_remap_host_platform = true 19 | cpp = /usr/bin/clang 20 | cc = /usr/bin/clang 21 | cxx = /usr/bin/clang++ 22 | cxxpp = /usr/bin/clang++ 23 | ld = /usr/bin/clang++ 24 | 25 | [project] 26 | default_flavors_mode=all 27 |
-------------------------------------------------------------------------------- /.ci/caffe2/README.md: -------------------------------------------------------------------------------- 1 | # Jenkins 2 | 3 | The scripts in this directory are the entrypoint for testing Caffe2. 4 | 5 | The environment variable `BUILD_ENVIRONMENT` is expected to be set to 6 | the build environment you intend to test. It is a hint for the build 7 | and test scripts to configure Caffe2 a certain way and include/exclude 8 | tests. For Docker images, the build environment equals the name of the 9 | image itself. For example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker 10 | images that are built on Jenkins and are used in triggered builds 11 | already have this environment variable set in their manifest. Also see 12 | `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. 13 | 14 | Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. 15 |
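16 | A minimal sketch of a local invocation (run from the repository root; the image name is just the example above, since CI images already have `BUILD_ENVIRONMENT` set in their manifest): 17 | 18 | ```bash 19 | # Hypothetical local run; in CI the image manifest provides BUILD_ENVIRONMENT. 20 | export BUILD_ENVIRONMENT=py2-cuda9.0-cudnn7-ubuntu16.04 21 | .ci/caffe2/test.sh 22 | ```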
-------------------------------------------------------------------------------- /.ci/caffe2/common.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 4 | ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) 5 | TEST_DIR="$ROOT_DIR/test" 6 | gtest_reports_dir="${TEST_DIR}/test-reports/cpp" 7 | pytest_reports_dir="${TEST_DIR}/test-reports/python" 8 | 9 | # Figure out which Python to use 10 | PYTHON="$(which python)" 11 | if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then 12 | PYTHON=$(which "python${BASH_REMATCH[1]}") 13 | fi 14 | 15 | if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then 16 | # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors 17 | unset HIP_PLATFORM 18 | if which sccache > /dev/null; then 19 | # Save sccache logs to file 20 | sccache --stop-server || true 21 | rm -f ~/sccache_error.log || true 22 | SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server 23 | 24 | # Report sccache stats for easier debugging 25 | sccache --zero-stats 26 | fi 27 | fi 28 | 29 | # /usr/local/caffe2 is where the cpp bits are installed to in cmake-only 30 | # builds. In +python builds the cpp tests are copied to /usr/local/caffe2 so 31 | # that the test code in .ci/test.sh is the same 32 | INSTALL_PREFIX="/usr/local/caffe2" 33 | 34 | mkdir -p "$gtest_reports_dir" || true 35 | mkdir -p "$pytest_reports_dir" || true 36 | mkdir -p "$INSTALL_PREFIX" || true 37 |
-------------------------------------------------------------------------------- /.ci/docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker images for Jenkins 2 | 3 | This directory contains everything needed to build the Docker images 4 | that are used in our CI. 5 | 6 | The Dockerfiles located in subdirectories are parameterized to 7 | conditionally run build stages depending on build arguments passed to 8 | `docker build`. This lets us use only a few Dockerfiles for many 9 | images. The different configurations are identified by a freeform 10 | string that we call a _build environment_. This string is persisted in 11 | each image as the `BUILD_ENVIRONMENT` environment variable. 12 | 13 | See `build.sh` for valid build environments (it's the giant switch). 14 | 15 | Docker builds are now defined with `.circleci/cimodel/data/simple/docker_definitions.py`. 16 | 17 | ## Contents 18 | 19 | * `build.sh` -- dispatch script to launch all builds 20 | * `common` -- scripts used to execute individual Docker build stages 21 | * `ubuntu-cuda` -- Dockerfile for Ubuntu image with CUDA support for nvidia-docker 22 | 23 | ## Usage 24 | 25 | ```bash 26 | # Build a specific image 27 | ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest 28 | 29 | # Set flags (see build.sh) and build image 30 | sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest' 31 | ``` 32 |
-------------------------------------------------------------------------------- /.ci/docker/android/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 2 |
-------------------------------------------------------------------------------- /.ci/docker/ci_commit_pins/triton-rocm.txt: -------------------------------------------------------------------------------- 1 | de3f5436247e391b062a7dd7fd42d2a55c2cd524 2 |
-------------------------------------------------------------------------------- /.ci/docker/ci_commit_pins/triton.txt: -------------------------------------------------------------------------------- 1 | 46672772b46b103db7341c9e10fbad7f643557d4 2 |
-------------------------------------------------------------------------------- /.ci/docker/common/common_utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Work around bug where devtoolset replaces sudo and breaks it. 4 | if [ -n "$DEVTOOLSET_VERSION" ]; then 5 | export SUDO=/bin/sudo 6 | else 7 | export SUDO=sudo 8 | fi 9 | 10 | as_jenkins() { 11 | # NB: unsetting the environment variables works around a conda bug 12 | # https://github.com/conda/conda/issues/6576 13 | # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation 14 | # NB: This must be run from a directory that jenkins has access to, 15 | # works around https://github.com/conda/conda-package-handling/pull/34 16 | $SUDO -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $* 17 | } 18 | 19 | conda_install() { 20 | # Ensure that the install command doesn't upgrade/downgrade Python 21 | # This should be called as 22 | # conda_install pkg1 pkg2 ...
[-c channel] 23 | as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $* 24 | } 25 | 26 | conda_run() { 27 | as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $* 28 | } 29 | 30 | pip_install() { 31 | as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $* 32 | } 33 | 34 | get_pinned_commit() { 35 | cat "${1}".txt 36 | } 37 | -------------------------------------------------------------------------------- /.ci/docker/common/install_cmake.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$CMAKE_VERSION" ] 6 | 7 | # Remove system cmake install so it won't get used instead 8 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 9 | case "$ID" in 10 | ubuntu) 11 | apt-get remove cmake -y 12 | ;; 13 | centos) 14 | yum remove cmake -y 15 | ;; 16 | *) 17 | echo "Unable to determine OS..." 18 | exit 1 19 | ;; 20 | esac 21 | 22 | # Turn 3.6.3 into v3.6 23 | path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/') 24 | file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz" 25 | 26 | # Download and install specific CMake version in /usr/local 27 | pushd /tmp 28 | curl -Os --retry 3 "https://cmake.org/files/${path}/${file}" 29 | tar -C /usr/local --strip-components 1 --no-same-owner -zxf cmake-*.tar.gz 30 | rm -f cmake-*.tar.gz 31 | popd 32 | -------------------------------------------------------------------------------- /.ci/docker/common/install_cudnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ ${CUDNN_VERSION} == 8 ]]; then 4 | # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement 5 | mkdir tmp_cudnn && cd tmp_cudnn 6 | CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive" 7 | if [[ ${CUDA_VERSION:0:4} == "11.7" ]]; then 8 | CUDNN_NAME="cudnn-linux-x86_64-8.5.0.96_cuda11-archive" 9 | curl --retry 3 -OLs https://ossci-linux.s3.amazonaws.com/${CUDNN_NAME}.tar.xz 10 | elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then 11 | CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive" 12 | curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz 13 | else 14 | curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz 15 | fi 16 | 17 | tar xf ${CUDNN_NAME}.tar.xz 18 | cp -a ${CUDNN_NAME}/include/* /usr/include/ 19 | cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/ 20 | cp -a ${CUDNN_NAME}/include/* /usr/include/x86_64-linux-gnu/ 21 | 22 | cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/ 23 | cp -a ${CUDNN_NAME}/lib/* /usr/lib/x86_64-linux-gnu/ 24 | cd .. 25 | rm -rf tmp_cudnn 26 | ldconfig 27 | fi 28 | -------------------------------------------------------------------------------- /.ci/docker/common/install_db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | install_ubuntu() { 6 | apt-get update 7 | apt-get install -y --no-install-recommends \ 8 | libhiredis-dev \ 9 | libleveldb-dev \ 10 | liblmdb-dev \ 11 | libsnappy-dev 12 | 13 | # Cleanup 14 | apt-get autoclean && apt-get clean 15 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 16 | } 17 | 18 | install_centos() { 19 | # Need EPEL for many packages we depend on. 
20 | # See http://fedoraproject.org/wiki/EPEL 21 | yum --enablerepo=extras install -y epel-release 22 | 23 | yum install -y \ 24 | hiredis-devel \ 25 | leveldb-devel \ 26 | lmdb-devel \ 27 | snappy-devel 28 | 29 | # Cleanup 30 | yum clean all 31 | rm -rf /var/cache/yum 32 | rm -rf /var/lib/yum/yumdb 33 | rm -rf /var/lib/yum/history 34 | } 35 | 36 | # Install base packages depending on the base OS 37 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 38 | case "$ID" in 39 | ubuntu) 40 | install_ubuntu 41 | ;; 42 | centos) 43 | install_centos 44 | ;; 45 | *) 46 | echo "Unable to determine OS..." 47 | exit 1 48 | ;; 49 | esac 50 | -------------------------------------------------------------------------------- /.ci/docker/common/install_devtoolset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$DEVTOOLSET_VERSION" ] 6 | 7 | yum install -y centos-release-scl 8 | yum install -y devtoolset-$DEVTOOLSET_VERSION 9 | 10 | echo "source scl_source enable devtoolset-$DEVTOOLSET_VERSION" > "/etc/profile.d/devtoolset-$DEVTOOLSET_VERSION.sh" 11 | -------------------------------------------------------------------------------- /.ci/docker/common/install_docs_reqs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | if [ -n "$KATEX" ]; then 6 | apt-get update 7 | # Ignore error if gpg-agent doesn't exist (for Ubuntu 16.04) 8 | apt-get install -y gpg-agent || : 9 | 10 | curl --retry 3 -sL https://deb.nodesource.com/setup_12.x | sudo -E bash - 11 | sudo apt-get install -y nodejs 12 | 13 | curl --retry 3 -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add - 14 | echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list 15 | 16 | apt-get update 17 | apt-get install -y --no-install-recommends yarn 18 | yarn global add katex --prefix /usr/local 19 | 20 | sudo apt-get -y install doxygen 21 | 22 | apt-get autoclean && apt-get clean 23 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 24 | 25 | fi 26 | -------------------------------------------------------------------------------- /.ci/docker/common/install_gcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | if [ -n "$GCC_VERSION" ]; then 6 | 7 | # Need the official toolchain repo to get alternate packages 8 | add-apt-repository ppa:ubuntu-toolchain-r/test 9 | apt-get update 10 | if [[ "$UBUNTU_VERSION" == "16.04" && "${GCC_VERSION:0:1}" == "5" ]]; then 11 | apt-get install -y g++-5=5.4.0-6ubuntu1~16.04.12 12 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 13 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50 14 | update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-5 50 15 | else 16 | apt-get install -y g++-$GCC_VERSION 17 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50 18 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50 19 | update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50 20 | fi 21 | 22 | 23 | # Cleanup package manager 24 | apt-get autoclean && apt-get clean 25 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 26 | 27 | fi 28 | -------------------------------------------------------------------------------- /.ci/docker/common/install_glibc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 
2 | 3 | set -ex 4 | 5 | [ -n "$GLIBC_VERSION" ] 6 | if [[ -n "$CENTOS_VERSION" ]]; then 7 | [ -n "$DEVTOOLSET_VERSION" ] 8 | fi 9 | 10 | yum install -y wget sed 11 | 12 | mkdir -p /packages && cd /packages 13 | wget -q http://ftp.gnu.org/gnu/glibc/glibc-$GLIBC_VERSION.tar.gz 14 | tar xzf glibc-$GLIBC_VERSION.tar.gz 15 | if [[ "$GLIBC_VERSION" == "2.26" ]]; then 16 | cd glibc-$GLIBC_VERSION 17 | sed -i 's/$name ne "nss_test1"/$name ne "nss_test1" \&\& $name ne "nss_test2"/' scripts/test-installation.pl 18 | cd .. 19 | fi 20 | mkdir -p glibc-$GLIBC_VERSION-build && cd glibc-$GLIBC_VERSION-build 21 | 22 | if [[ -n "$CENTOS_VERSION" ]]; then 23 | export PATH=/opt/rh/devtoolset-$DEVTOOLSET_VERSION/root/usr/bin:$PATH 24 | fi 25 | 26 | ../glibc-$GLIBC_VERSION/configure --prefix=/usr CFLAGS='-Wno-stringop-truncation -Wno-format-overflow -Wno-restrict -Wno-format-truncation -g -O2' 27 | make -j$(nproc) 28 | make install 29 | 30 | # Cleanup 31 | rm -rf /packages 32 | rm -rf /var/cache/yum/* 33 | rm -rf /var/lib/rpm/__db.* 34 | yum clean all 35 | -------------------------------------------------------------------------------- /.ci/docker/common/install_jni.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | mkdir -p /usr/local/include 6 | cp jni.h /usr/local/include 7 | -------------------------------------------------------------------------------- /.ci/docker/common/install_lcov.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | git clone --branch v1.15 https://github.com/linux-test-project/lcov.git 6 | pushd lcov 7 | sudo make install # will be installed in /usr/local/bin/lcov 8 | popd 9 | -------------------------------------------------------------------------------- /.ci/docker/common/install_linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" 6 | 7 | if [ -n "${UBUNTU_VERSION}" ]; then 8 | apt update 9 | apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5 10 | fi 11 | 12 | # Do shallow clone of PyTorch so that we can init lintrunner in Docker build context 13 | git clone https://github.com/pytorch/pytorch.git --depth 1 14 | chown -R jenkins pytorch 15 | 16 | pushd pytorch 17 | # Install all linter dependencies 18 | pip_install -r requirements.txt 19 | conda_run lintrunner init 20 | 21 | # Cache .lintbin directory as part of the Docker image 22 | cp -r .lintbin /tmp 23 | popd 24 | 25 | # Node dependencies required by toc linter job 26 | npm install -g markdown-toc 27 | 28 | # Cleaning up 29 | rm -rf pytorch 30 | -------------------------------------------------------------------------------- /.ci/docker/common/install_ninja.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "$NINJA_VERSION" ] 6 | 7 | url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip" 8 | 9 | pushd /tmp 10 | wget --no-verbose --output-document=ninja-linux.zip "$url" 11 | unzip ninja-linux.zip -d /usr/local/bin 12 | rm -f ninja-linux.zip 13 | popd 14 | -------------------------------------------------------------------------------- /.ci/docker/common/install_openmpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get update 4 | # also install ssh 
to avoid error of: 5 | # -------------------------------------------------------------------------- 6 | # The value of the MCA parameter "plm_rsh_agent" was set to a path 7 | # that could not be found: 8 | # plm_rsh_agent: ssh : rsh 9 | sudo apt-get install -y ssh 10 | sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev 11 | -------------------------------------------------------------------------------- /.ci/docker/common/install_openssl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | OPENSSL=openssl-1.1.1k 6 | 7 | wget -q -O "${OPENSSL}.tar.gz" "https://ossci-linux.s3.amazonaws.com/${OPENSSL}.tar.gz" 8 | tar xf "${OPENSSL}.tar.gz" 9 | cd "${OPENSSL}" 10 | ./config --prefix=/opt/openssl -d '-Wl,--enable-new-dtags,-rpath,$(LIBRPATH)' 11 | # NOTE: openssl install errors out when built with the -j option 12 | make -j6; make install_sw 13 | # Link the ssl libraries to the /usr/lib folder. 14 | sudo ln -s /opt/openssl/lib/lib* /usr/lib 15 | cd .. 16 | rm -rf "${OPENSSL}" 17 | -------------------------------------------------------------------------------- /.ci/docker/common/install_swiftshader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "${SWIFTSHADER}" ] 6 | 7 | retry () { 8 | $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) 9 | } 10 | 11 | _https_amazon_aws=https://ossci-android.s3.amazonaws.com 12 | 13 | # SwiftShader 14 | _swiftshader_dir=/var/lib/jenkins/swiftshader 15 | _swiftshader_file_targz=swiftshader-abe07b943-prebuilt.tar.gz 16 | mkdir -p $_swiftshader_dir 17 | _tmp_swiftshader_targz="/tmp/${_swiftshader_file_targz}" 18 | 19 | curl --silent --show-error --location --fail --retry 3 \ 20 | --output "${_tmp_swiftshader_targz}" "$_https_amazon_aws/${_swiftshader_file_targz}" 21 | 22 | tar -C "${_swiftshader_dir}" -xzf "${_tmp_swiftshader_targz}" 23 | 24 | export VK_ICD_FILENAMES="${_swiftshader_dir}/build/Linux/vk_swiftshader_icd.json" 25 | -------------------------------------------------------------------------------- /.ci/docker/common/install_thrift.sh: -------------------------------------------------------------------------------- 1 | apt-get update 2 | apt-get install -y sudo wget libboost-dev libboost-test-dev libboost-program-options-dev libboost-filesystem-dev libboost-thread-dev libevent-dev automake libtool flex bison pkg-config g++ libssl-dev 3 | wget https://www-us.apache.org/dist/thrift/0.12.0/thrift-0.12.0.tar.gz 4 | tar -xvf thrift-0.12.0.tar.gz 5 | cd thrift-0.12.0 6 | for file in ./compiler/cpp/Makefile*; do 7 | sed -i 's/\-Werror//' $file 8 | done 9 | ./bootstrap.sh 10 | ./configure --without-php --without-java --without-python --without-nodejs --without-go --without-ruby 11 | sudo make 12 | sudo make install 13 | cd .. 
14 | rm thrift-0.12.0.tar.gz 15 | -------------------------------------------------------------------------------- /.ci/docker/common/install_ucc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | if [[ -d "/usr/local/cuda/" ]]; then 6 | with_cuda=/usr/local/cuda/ 7 | else 8 | with_cuda=no 9 | fi 10 | 11 | function install_ucx() { 12 | set -ex 13 | git clone --recursive https://github.com/openucx/ucx.git 14 | pushd ucx 15 | git checkout ${UCX_COMMIT} 16 | git submodule update --init --recursive 17 | 18 | ./autogen.sh 19 | ./configure --prefix=$UCX_HOME \ 20 | --enable-mt \ 21 | --with-cuda=$with_cuda \ 22 | --enable-profiling \ 23 | --enable-stats 24 | time make -j 25 | sudo make install 26 | 27 | popd 28 | rm -rf ucx 29 | } 30 | 31 | function install_ucc() { 32 | set -ex 33 | git clone --recursive https://github.com/openucx/ucc.git 34 | pushd ucc 35 | git checkout ${UCC_COMMIT} 36 | git submodule update --init --recursive 37 | 38 | ./autogen.sh 39 | ./configure --prefix=$UCC_HOME --with-ucx=$UCX_HOME --with-cuda=$with_cuda 40 | time make -j 41 | sudo make install 42 | 43 | popd 44 | rm -rf ucc 45 | } 46 | 47 | install_ucx 48 | install_ucc 49 | -------------------------------------------------------------------------------- /.ci/docker/common/install_user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # Mirror jenkins user in container 6 | # jenkins user as ec2-user should have the same user-id 7 | echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd 8 | echo "jenkins:x:1000:" >> /etc/group 9 | # Needed on focal or newer 10 | echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow 11 | 12 | # Create $HOME 13 | mkdir -p /var/lib/jenkins 14 | chown jenkins:jenkins /var/lib/jenkins 15 | mkdir -p /var/lib/jenkins/.ccache 16 | chown jenkins:jenkins /var/lib/jenkins/.ccache 17 | 18 | # Allow writing to /usr/local (for make install) 19 | chown jenkins:jenkins /usr/local 20 | 21 | # Allow sudo 22 | # TODO: Maybe we shouldn't 23 | echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins 24 | 25 | # Work around bug where devtoolset replaces sudo and breaks it. 26 | if [ -n "$DEVTOOLSET_VERSION" ]; then 27 | SUDO=/bin/sudo 28 | else 29 | SUDO=sudo 30 | fi 31 | 32 | # Test that sudo works 33 | $SUDO -u jenkins $SUDO -v 34 | -------------------------------------------------------------------------------- /.ci/docker/common/install_vision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | install_ubuntu() { 6 | apt-get update 7 | apt-get install -y --no-install-recommends \ 8 | libopencv-dev \ 9 | libavcodec-dev 10 | 11 | # Cleanup 12 | apt-get autoclean && apt-get clean 13 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 14 | } 15 | 16 | install_centos() { 17 | # Need EPEL for many packages we depend on. 18 | # See http://fedoraproject.org/wiki/EPEL 19 | yum --enablerepo=extras install -y epel-release 20 | 21 | yum install -y \ 22 | opencv-devel \ 23 | ffmpeg-devel 24 | 25 | # Cleanup 26 | yum clean all 27 | rm -rf /var/cache/yum 28 | rm -rf /var/lib/yum/yumdb 29 | rm -rf /var/lib/yum/history 30 | } 31 | 32 | # Install base packages depending on the base OS 33 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') 34 | case "$ID" in 35 | ubuntu) 36 | install_ubuntu 37 | ;; 38 | centos) 39 | install_centos 40 | ;; 41 | *) 42 | echo "Unable to determine OS..." 
43 | exit 1 44 | ;; 45 | esac 46 |
-------------------------------------------------------------------------------- /.ci/docker/common/install_vulkan_sdk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | [ -n "${VULKAN_SDK_VERSION}" ] 6 | 7 | retry () { 8 | $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) 9 | } 10 | 11 | _vulkansdk_dir=/var/lib/jenkins/vulkansdk 12 | _tmp_vulkansdk_targz=/tmp/vulkansdk.tar.gz 13 | 14 | curl \ 15 | --silent \ 16 | --show-error \ 17 | --location \ 18 | --fail \ 19 | --retry 3 \ 20 | --output "${_tmp_vulkansdk_targz}" "https://ossci-android.s3.amazonaws.com/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz" 21 | 22 | mkdir -p "${_vulkansdk_dir}" 23 | tar -C "${_vulkansdk_dir}" -xzf "${_tmp_vulkansdk_targz}" --strip-components 1 24 | rm -rf "${_tmp_vulkansdk_targz}" 25 |
-------------------------------------------------------------------------------- /.ci/docker/linter/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION 2 | 3 | FROM ubuntu:${UBUNTU_VERSION} 4 | 5 | ARG UBUNTU_VERSION 6 | 7 | ENV DEBIAN_FRONTEND noninteractive 8 | 9 | # Install common dependencies (so that this step can be cached separately) 10 | COPY ./common/install_base.sh install_base.sh 11 | RUN bash ./install_base.sh && rm install_base.sh 12 | 13 | # Install user 14 | COPY ./common/install_user.sh install_user.sh 15 | RUN bash ./install_user.sh && rm install_user.sh 16 | 17 | # Install conda and other packages (e.g., numpy, pytest) 18 | ARG ANACONDA_PYTHON_VERSION 19 | ARG CONDA_CMAKE 20 | ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION 21 | ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH 22 | COPY requirements-ci.txt /opt/conda/requirements-ci.txt 23 | COPY ./common/install_conda.sh install_conda.sh 24 | COPY ./common/common_utils.sh common_utils.sh 25 | RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt 26 | 27 | # Note that Docker build forbids copying files from outside the build context 28 | COPY ./common/install_linter.sh install_linter.sh 29 | COPY ./common/common_utils.sh common_utils.sh 30 | RUN bash ./install_linter.sh 31 | RUN rm install_linter.sh common_utils.sh 32 | 33 | USER jenkins 34 | CMD ["bash"] 35 |
-------------------------------------------------------------------------------- /.ci/docker/triton_version.txt: -------------------------------------------------------------------------------- 1 | 2.1.0 2 |
-------------------------------------------------------------------------------- /.ci/docker/ubuntu-rocm/.gitignore: -------------------------------------------------------------------------------- 1 | *.sh 2 |
-------------------------------------------------------------------------------- /.ci/onnx/README.md: -------------------------------------------------------------------------------- 1 | # Jenkins 2 | 3 | The scripts in this directory are the entrypoint for testing the ONNX exporter. 4 | 5 | The environment variable `BUILD_ENVIRONMENT` is expected to be set to 6 | the build environment you intend to test. It is a hint for the build 7 | and test scripts to configure Caffe2 a certain way and include/exclude 8 | tests. For Docker images, the build environment equals the name of the 9 | image itself. For example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker 10 | images that are built on Jenkins and are used in triggered builds 11 | already have this environment variable set in their manifest. Also see 12 | `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. 13 | 14 | Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. 15 |
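16 | A minimal sketch of a local invocation (the image name below is hypothetical; `test.sh` only runs the ONNX suite when `BUILD_ENVIRONMENT` contains `onnx`): 17 | 18 | ```bash 19 | # Hypothetical local run; in CI the image manifest provides BUILD_ENVIRONMENT. 20 | export BUILD_ENVIRONMENT=py3-clang9-onnx-ubuntu20.04 21 | .ci/onnx/test.sh 22 | ```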
The Docker images that are 10 | built on Jenkins and are used in triggered builds already have this 11 | environment variable set in their manifest. Also see 12 | `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. 13 | 14 | Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. 15 | -------------------------------------------------------------------------------- /.ci/onnx/common.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 4 | ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) 5 | TEST_DIR="$ROOT_DIR/test" 6 | pytest_reports_dir="${TEST_DIR}/test-reports/python" 7 | 8 | # Figure out which Python to use 9 | PYTHON="$(which python)" 10 | if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then 11 | PYTHON=$(which "python${BASH_REMATCH[1]}") 12 | fi 13 | 14 | if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then 15 | # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors 16 | unset HIP_PLATFORM 17 | fi 18 | 19 | mkdir -p "$pytest_reports_dir" || true 20 | -------------------------------------------------------------------------------- /.ci/onnx/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # shellcheck source=./common.sh 4 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 5 | 6 | # Used to retry the ONNX test; a failing command is attempted at most twice 7 | retry () { 8 | "$@" || (sleep 60 && "$@") 9 | } 10 | 11 | if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then 12 | pip -q install --user "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx" 13 | # TODO: This can be removed later once vision is also part of the Docker image 14 | pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" 15 | # JIT C++ extensions require ninja, so put it into PATH. 16 | export PATH="/var/lib/jenkins/.local/bin:$PATH" 17 | # NB: The ONNX test is fast (~15m), so it's ok to retry it a few more times to avoid flaky failures. We 18 | # need to bring this into the standard PyTorch run_test eventually. The issue is tracked in 19 | # https://github.com/pytorch/pytorch/issues/98626 20 | retry "$ROOT_DIR/scripts/onnx/test.sh" 21 | fi 22 | -------------------------------------------------------------------------------- /.ci/pytorch/.shellcheckrc: -------------------------------------------------------------------------------- 1 | source-path=SCRIPTDIR 2 | 3 | # we'd like to enable --external-sources here but can't 4 | # https://github.com/koalaman/shellcheck/issues/1818 5 | -------------------------------------------------------------------------------- /.ci/pytorch/build-asan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Required environment variable: $BUILD_ENVIRONMENT 4 | # (This is set by default in the Docker images we build, so you don't 5 | # need to set it yourself.)
6 | 7 | # shellcheck source=./common.sh 8 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 9 | # shellcheck source=./common-build.sh 10 | source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" 11 | 12 | echo "Clang version:" 13 | clang --version 14 | 15 | python tools/stats/export_test_times.py 16 | 17 | if [ -n "$(which conda)" ]; then 18 | export CMAKE_PREFIX_PATH=/opt/conda 19 | fi 20 | 21 | CC="clang" CXX="clang++" LDSHARED="clang --shared" \ 22 | USE_ASAN=1 USE_CUDA=0 USE_MKLDNN=0 \ 23 | UBSAN_FLAGS="-fno-sanitize-recover=all" \ 24 | python setup.py bdist_wheel 25 | pip_install_whl "$(echo dist/*.whl)" 26 | 27 | # Test building via the sdist source tarball 28 | python setup.py sdist 29 | mkdir -p /tmp/tmp 30 | pushd /tmp/tmp 31 | tar zxf "$(dirname "${BASH_SOURCE[0]}")/../../dist/"*.tar.gz 32 | cd torch-* 33 | python setup.py build --cmake-only 34 | popd 35 | 36 | print_sccache_stats 37 | 38 | assert_git_not_dirty 39 | -------------------------------------------------------------------------------- /.ci/pytorch/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Common setup for all Jenkins scripts 4 | # shellcheck source=./common_utils.sh 5 | source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" 6 | set -ex 7 | 8 | # Required environment variables: 9 | # $BUILD_ENVIRONMENT (should be set by your Docker image) 10 | 11 | # Figure out which Python to use for ROCm 12 | if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then 13 | # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors 14 | unset HIP_PLATFORM 15 | export PYTORCH_TEST_WITH_ROCM=1 16 | # temporary, to help locate kernel issues on the CI nodes 17 | export HSAKMT_DEBUG_LEVEL=4 18 | # improve rccl performance for distributed tests 19 | export HSA_FORCE_FINE_GRAIN_PCIE=1 20 | fi 21 | 22 | # TODO: Re-enable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598 23 | # shellcheck disable=SC2034 24 | BUILD_TEST_LIBTORCH=0 25 | -------------------------------------------------------------------------------- /.ci/pytorch/docker-build-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # shellcheck source=./common.sh 4 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 5 | 6 | docker build -t pytorch .
7 | -------------------------------------------------------------------------------- /.ci/pytorch/docs-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # shellcheck source=./common.sh 4 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 5 | 6 | echo "Testing pytorch docs" 7 | 8 | cd docs 9 | pip_install -r requirements.txt 10 | make doctest 11 | -------------------------------------------------------------------------------- /.ci/pytorch/fake_numpy/numpy.py: -------------------------------------------------------------------------------- 1 | raise ModuleNotFoundError("Sorry PyTorch, but our NumPy is in the other folder") 2 | -------------------------------------------------------------------------------- /.ci/pytorch/macos-build-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "${BUILD_ENVIRONMENT}" ] || [[ "${BUILD_ENVIRONMENT}" == *-build* ]]; then 4 | # shellcheck source=./macos-build.sh 5 | source "$(dirname "${BASH_SOURCE[0]}")/macos-build.sh" 6 | fi 7 | 8 | if [ -z "${BUILD_ENVIRONMENT}" ] || [[ "${BUILD_ENVIRONMENT}" == *-test* ]]; then 9 | # shellcheck source=./macos-test.sh 10 | source "$(dirname "${BASH_SOURCE[0]}")/macos-test.sh" 11 | fi 12 | -------------------------------------------------------------------------------- /.ci/pytorch/macos-common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Common prelude for macos-build.sh and macos-test.sh 4 | 5 | # shellcheck source=./common.sh 6 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh" 7 | 8 | sysctl -a | grep machdep.cpu 9 | 10 | # These are required for both the build job and the test job. 11 | # In the latter, they are needed to test cpp extensions. 12 | export MACOSX_DEPLOYMENT_TARGET=10.9 13 | export CXX=clang++ 14 | export CC=clang 15 | 16 | print_cmake_info() { 17 | CMAKE_EXEC=$(which cmake) 18 | echo "$CMAKE_EXEC" 19 | 20 | CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC") 21 | # Print all libraries under cmake rpath for debugging 22 | ls -la "$CONDA_INSTALLATION_DIR/../lib" 23 | 24 | export CMAKE_EXEC 25 | # Explicitly add conda env lib folder to cmake rpath to address the flaky issue 26 | # where cmake dependencies couldn't be found. This seems to point to how conda 27 | # links $CMAKE_EXEC to its package cache when cloning a new environment 28 | install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true 29 | # Adding the rpath will invalidate the cmake signature, so sign it again here 30 | # to trust the executable; otherwise it fails with EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid)) 31 | # and an exit code of 137 32 | codesign -f -s - "${CMAKE_EXEC}" || true 33 | } 34 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | run_test () { 5 | rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/ 6 | "$@" 7 | cd ..
&& rm -rf test_tmp/ 8 | } 9 | 10 | get_runtime_of_command () { 11 | TIMEFORMAT=%R 12 | 13 | # runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null) 14 | runtime=$( { time "$@"; } 2>&1 1>/dev/null) 15 | if [[ $runtime == *"Error"* ]]; then 16 | exit 1 17 | fi 18 | runtime=${runtime#+++ $@} 19 | runtime=$(python -c "print($runtime)") 20 | 21 | echo "$runtime" 22 | } 23 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/get_stats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import numpy 4 | 5 | sample_data_list = sys.argv[1:] 6 | sample_data_list = [float(v.strip()) for v in sample_data_list] 7 | 8 | sample_mean = numpy.mean(sample_data_list) 9 | sample_sigma = numpy.std(sample_data_list) 10 | 11 | data = { 12 | 'mean': sample_mean, 13 | 'sigma': sample_sigma, 14 | } 15 | 16 | print(json.dumps(data)) 17 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_cpu_speed_mini_sequence_labeler () { 7 | echo "Testing: mini sequence labeler, CPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/benchmark.git 13 | 14 | cd benchmark/ 15 | 16 | git checkout 726567a455edbfda6199445922a8cfee82535664 17 | 18 | cd scripts/mini_sequence_labeler 19 | 20 | SAMPLE_ARRAY=() 21 | NUM_RUNS=$1 22 | 23 | for (( i=1; i<=NUM_RUNS; i++ )) do 24 | runtime=$(get_runtime_of_command python main.py) 25 | SAMPLE_ARRAY+=("${runtime}") 26 | done 27 | 28 | cd ../../.. 29 | 30 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 31 | echo "Runtime stats in seconds:" 32 | echo "$stats" 33 | 34 | if [ "$2" == "compare_with_baseline" ]; then 35 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 36 | elif [ "$2" == "compare_and_update" ]; then 37 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 38 | fi 39 | } 40 | 41 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 42 | run_test test_cpu_speed_mini_sequence_labeler "$@" 43 | fi 44 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_cpu_speed_mnist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_cpu_speed_mnist () { 7 | echo "Testing: MNIST, CPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/examples.git -b perftests 13 | 14 | cd examples/mnist 15 | 16 | conda install -c pytorch torchvision-cpu 17 | 18 | # Download data 19 | python main.py --epochs 0 20 | 21 | SAMPLE_ARRAY=() 22 | NUM_RUNS=$1 23 | 24 | for (( i=1; i<=NUM_RUNS; i++ )) do 25 | runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log) 26 | echo "$runtime" 27 | SAMPLE_ARRAY+=("${runtime}") 28 | done 29 | 30 | cd ../.. 
31 | 32 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 33 | echo "Runtime stats in seconds:" 34 | echo "$stats" 35 | 36 | if [ "$2" == "compare_with_baseline" ]; then 37 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 38 | elif [ "$2" == "compare_and_update" ]; then 39 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 40 | fi 41 | } 42 | 43 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 44 | run_test test_cpu_speed_mnist "$@" 45 | fi 46 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_cpu_speed_torch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./common.sh 4 | 5 | test_cpu_speed_torch () { 6 | echo "Testing: torch.*, CPU" 7 | 8 | export OMP_NUM_THREADS=4 9 | export MKL_NUM_THREADS=4 10 | 11 | git clone https://github.com/yf225/perf-tests.git 12 | 13 | if [ "$1" == "compare_with_baseline" ]; then 14 | export ARGS=(--compare ../cpu_runtime.json) 15 | elif [ "$1" == "compare_and_update" ]; then 16 | export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json) 17 | elif [ "$1" == "update_only" ]; then 18 | export ARGS=(--update ../new_cpu_runtime.json) 19 | fi 20 | 21 | if ! python perf-tests/modules/test_cpu_torch.py "${ARGS[@]}"; then 22 | echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change." 23 | exit 1 24 | fi 25 | } 26 | 27 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 28 | run_test test_cpu_speed_torch "$@" 29 | fi 30 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./common.sh 4 | 5 | test_cpu_speed_torch_tensor () { 6 | echo "Testing: torch.Tensor.*, CPU" 7 | 8 | export OMP_NUM_THREADS=4 9 | export MKL_NUM_THREADS=4 10 | 11 | git clone https://github.com/yf225/perf-tests.git 12 | 13 | if [ "$1" == "compare_with_baseline" ]; then 14 | export ARGS=(--compare ../cpu_runtime.json) 15 | elif [ "$1" == "compare_and_update" ]; then 16 | export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json) 17 | elif [ "$1" == "update_only" ]; then 18 | export ARGS=(--update ../new_cpu_runtime.json) 19 | fi 20 | 21 | if ! python perf-tests/modules/test_cpu_torch_tensor.py "${ARGS[@]}"; then 22 | echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change." 23 | exit 1 24 | fi 25 | } 26 | 27 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 28 | run_test test_cpu_speed_torch_tensor "$@" 29 | fi 30 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . 
./common.sh 5 | 6 | test_gpu_speed_cudnn_lstm () { 7 | echo "Testing: CuDNN LSTM, GPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/benchmark.git 13 | 14 | cd benchmark/ 15 | 16 | git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 17 | 18 | cd scripts/ 19 | 20 | SAMPLE_ARRAY=() 21 | NUM_RUNS=$1 22 | 23 | for (( i=1; i<=NUM_RUNS; i++ )) do 24 | runtime=$(get_runtime_of_command python cudnn_lstm.py --skip-cpu-governor-check) 25 | echo "$runtime" 26 | SAMPLE_ARRAY+=("${runtime}") 27 | done 28 | 29 | cd ../.. 30 | 31 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 32 | echo "Runtime stats in seconds:" 33 | echo "$stats" 34 | 35 | if [ "$2" == "compare_with_baseline" ]; then 36 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 37 | elif [ "$2" == "compare_and_update" ]; then 38 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 39 | fi 40 | } 41 | 42 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 43 | run_test test_gpu_speed_cudnn_lstm "$@" 44 | fi 45 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_gpu_speed_lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_gpu_speed_lstm () { 7 | echo "Testing: LSTM, GPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/benchmark.git 13 | 14 | cd benchmark/ 15 | 16 | git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 17 | 18 | cd scripts/ 19 | 20 | SAMPLE_ARRAY=() 21 | NUM_RUNS=$1 22 | 23 | for (( i=1; i<=NUM_RUNS; i++ )) do 24 | runtime=$(get_runtime_of_command python lstm.py --skip-cpu-governor-check) 25 | echo "$runtime" 26 | SAMPLE_ARRAY+=("${runtime}") 27 | done 28 | 29 | cd ../.. 30 | 31 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 32 | echo "Runtime stats in seconds:" 33 | echo "$stats" 34 | 35 | if [ "$2" == "compare_with_baseline" ]; then 36 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 37 | elif [ "$2" == "compare_and_update" ]; then 38 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 39 | fi 40 | } 41 | 42 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 43 | run_test test_gpu_speed_lstm "$@" 44 | fi 45 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_gpu_speed_mlstm () { 7 | echo "Testing: MLSTM, GPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/benchmark.git 13 | 14 | cd benchmark/ 15 | 16 | git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0 17 | 18 | cd scripts/ 19 | 20 | SAMPLE_ARRAY=() 21 | NUM_RUNS=$1 22 | 23 | for (( i=1; i<=NUM_RUNS; i++ )) do 24 | runtime=$(get_runtime_of_command python mlstm.py --skip-cpu-governor-check) 25 | echo "$runtime" 26 | SAMPLE_ARRAY+=("${runtime}") 27 | done 28 | 29 | cd ../.. 
30 | 31 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 32 | echo "Runtime stats in seconds:" 33 | echo "$stats" 34 | 35 | if [ "$2" == "compare_with_baseline" ]; then 36 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 37 | elif [ "$2" == "compare_and_update" ]; then 38 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 39 | fi 40 | } 41 | 42 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 43 | run_test test_gpu_speed_mlstm "$@" 44 | fi 45 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/test_gpu_speed_mnist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ./common.sh 5 | 6 | test_gpu_speed_mnist () { 7 | echo "Testing: MNIST, GPU" 8 | 9 | export OMP_NUM_THREADS=4 10 | export MKL_NUM_THREADS=4 11 | 12 | git clone https://github.com/pytorch/examples.git -b perftests 13 | 14 | cd examples/mnist 15 | 16 | conda install -c pytorch torchvision 17 | 18 | # Download data 19 | python main.py --epochs 0 20 | 21 | SAMPLE_ARRAY=() 22 | NUM_RUNS=$1 23 | 24 | # Needs a warm-up run to get accurate numbers 25 | python main.py --epochs 1 --no-log 26 | 27 | for (( i=1; i<=NUM_RUNS; i++ )) do 28 | runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log) 29 | echo "$runtime" 30 | SAMPLE_ARRAY+=("${runtime}") 31 | done 32 | 33 | cd ../.. 34 | 35 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}") 36 | echo "Runtime stats in seconds:" 37 | echo "$stats" 38 | 39 | if [ "$2" == "compare_with_baseline" ]; then 40 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" 41 | elif [ "$2" == "compare_and_update" ]; then 42 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update 43 | fi 44 | } 45 | 46 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 47 | run_test test_gpu_speed_mnist "$@" 48 | fi 49 | -------------------------------------------------------------------------------- /.ci/pytorch/perf_test/update_commit_hash.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | data_file_path = sys.argv[1] 5 | commit_hash = sys.argv[2] 6 | 7 | with open(data_file_path) as data_file: 8 | data = json.load(data_file) 9 | 10 | data['commit'] = commit_hash 11 | 12 | with open(data_file_path, 'w') as data_file: 13 | json.dump(data, data_file) 14 | -------------------------------------------------------------------------------- /.ci/pytorch/print_sccache_log.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | log_file_path = sys.argv[1] 4 | 5 | with open(log_file_path) as f: 6 | lines = f.readlines() 7 | 8 | for line in lines: 9 | # Ignore errors from CPU instruction-set checks, symbol-existence tests, 10 | # or compilation-error formatting 11 | ignored_keywords = [ 12 | 'src.c', 13 | 'CheckSymbolExists.c', 14 | 'test_compilation_error_formatting', 15 | ] 16 | if all([keyword not in line for keyword in ignored_keywords]): 17 | print(line) 18 | -------------------------------------------------------------------------------- /.ci/pytorch/run_glootls_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py" 4 | TMP_CERT_DIR=$(python "$CREATE_TEST_CERT") 5 | 6 | openssl verify
-CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem" 7 | 8 | export GLOO_DEVICE_TRANSPORT=TCP_TLS 9 | export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key 10 | export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem 11 | export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem 12 | 13 | time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest 14 | 15 | unset GLOO_DEVICE_TRANSPORT 16 | unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY 17 | unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT 18 | unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE 19 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/choose_runtime_cuda_version.bat: -------------------------------------------------------------------------------- 1 | REM The first argument should be the CUDA version 2 | echo %PATH% 3 | echo %CUDA_PATH% 4 | set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%1\bin;%PATH% 5 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat: -------------------------------------------------------------------------------- 1 | if "%BUILD_ENVIRONMENT%"=="" ( 2 | set CONDA_PARENT_DIR=%CD% 3 | ) else ( 4 | set CONDA_PARENT_DIR=C:\Jenkins 5 | ) 6 | 7 | 8 | :: Be conservative here when rolling out the new AMI with conda. This will try 9 | :: to install conda as before if it couldn't find the conda installation. This 10 | :: can be removed eventually after we gain enough confidence in the AMI 11 | if not exist %CONDA_PARENT_DIR%\Miniconda3 ( 12 | set INSTALL_FRESH_CONDA=1 13 | ) 14 | 15 | if "%INSTALL_FRESH_CONDA%"=="1" ( 16 | curl --retry 3 --retry-all-errors -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe 17 | if errorlevel 1 exit /b 18 | if not errorlevel 0 exit /b 19 | 20 | %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 21 | if errorlevel 1 exit /b 22 | if not errorlevel 0 exit /b 23 | ) 24 | 25 | :: Activate conda so that we can use its commands, i.e. conda, python, pip 26 | call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 27 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat: -------------------------------------------------------------------------------- 1 | if "%CUDA_VERSION%" == "cpu" ( 2 | echo skip magma installation for cpu builds 3 | exit /b 0 4 | ) 5 | 6 | rem remove the dot in cuda_version, for example 11.1 to 111 7 | 8 | if not "%USE_CUDA%"=="1" ( 9 | exit /b 0 10 | ) 11 | 12 | if x%CUDA_VERSION:.=%==x%CUDA_VERSION% ( 13 | echo CUDA version %CUDA_VERSION% format is incorrect: it does not contain '.'
14 | exit /b 1 15 | ) 16 | 17 | set VERSION_SUFFIX=%CUDA_VERSION:.=% 18 | set CUDA_SUFFIX=cuda%VERSION_SUFFIX% 19 | 20 | if "%CUDA_SUFFIX%" == "" ( 21 | echo unknown CUDA version, please set `CUDA_VERSION` higher than 10.2 22 | exit /b 1 23 | ) 24 | 25 | if "%REBUILD%"=="" ( 26 | if "%BUILD_ENVIRONMENT%"=="" ( 27 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z 28 | ) else ( 29 | aws s3 cp s3://ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet 30 | ) 31 | if errorlevel 1 exit /b 32 | if not errorlevel 0 exit /b 33 | 7z x -aoa %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma 34 | if errorlevel 1 exit /b 35 | if not errorlevel 0 exit /b 36 | ) 37 | set MAGMA_HOME=%TMP_DIR_WIN%\magma 38 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat: -------------------------------------------------------------------------------- 1 | if "%REBUILD%"=="" ( 2 | if "%BUILD_ENVIRONMENT%"=="" ( 3 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z --output %TMP_DIR_WIN%\mkl.7z 4 | ) else ( 5 | aws s3 cp s3://ossci-windows/mkl_2020.2.254.7z %TMP_DIR_WIN%\mkl.7z --quiet 6 | ) 7 | if errorlevel 1 exit /b 8 | if not errorlevel 0 exit /b 9 | 7z x -aoa %TMP_DIR_WIN%\mkl.7z -o%TMP_DIR_WIN%\mkl 10 | if errorlevel 1 exit /b 11 | if not errorlevel 0 exit /b 12 | ) 13 | set CMAKE_INCLUDE_PATH=%TMP_DIR_WIN%\mkl\include 14 | set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB% 15 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat: -------------------------------------------------------------------------------- 1 | mkdir %TMP_DIR_WIN%\bin 2 | 3 | if "%REBUILD%"=="" ( 4 | :check_sccache 5 | %TMP_DIR_WIN%\bin\sccache.exe --show-stats || ( 6 | taskkill /im sccache.exe /f /t || ver > nul 7 | del %TMP_DIR_WIN%\bin\sccache.exe || ver > nul 8 | del %TMP_DIR_WIN%\bin\sccache-cl.exe || ver > nul 9 | if "%BUILD_ENVIRONMENT%"=="" ( 10 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %TMP_DIR_WIN%\bin\sccache.exe 11 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output %TMP_DIR_WIN%\bin\sccache-cl.exe 12 | ) else ( 13 | aws s3 cp s3://ossci-windows/sccache.exe %TMP_DIR_WIN%\bin\sccache.exe 14 | aws s3 cp s3://ossci-windows/sccache-cl.exe %TMP_DIR_WIN%\bin\sccache-cl.exe 15 | ) 16 | goto :check_sccache 17 | ) 18 | ) 19 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/test_custom_backend.bat: -------------------------------------------------------------------------------- 1 | call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat 2 | 3 | git submodule update --init --recursive third_party/pybind11 4 | cd test\custom_backend 5 | 6 | :: Build the custom backend library. 7 | mkdir build 8 | pushd build 9 | 10 | echo "Executing CMake for custom_backend test..." 11 | 12 | :: Note: Caffe2 does not support MSVC + CUDA + Debug mode (has to be Release mode) 13 | cmake -DCMAKE_PREFIX_PATH=%TMP_DIR_WIN%\build\torch -DCMAKE_BUILD_TYPE=Release -GNinja .. 14 | if ERRORLEVEL 1 exit /b 1 15 | 16 | echo "Executing Ninja for custom_backend test..." 
17 | 18 | ninja -v 19 | if ERRORLEVEL 1 exit /b 1 20 | 21 | echo "Ninja succeeded for custom_backend test." 22 | 23 | popd 24 | 25 | :: Run tests Python-side and export a script module. 26 | python test_custom_backend.py -v 27 | if ERRORLEVEL 1 exit /b 1 28 | 29 | python backend.py --export-module-to="build/model.pt" 30 | if ERRORLEVEL 1 exit /b 1 31 | 32 | :: Run tests C++-side and load the exported script module. 33 | cd build 34 | set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH% 35 | test_custom_backend.exe model.pt 36 | if ERRORLEVEL 1 exit /b 1 37 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/test_distributed.bat: -------------------------------------------------------------------------------- 1 | REM The first argument should be the path to the Python interpreter 2 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_common 3 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 4 | 5 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_gloo 6 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 7 | 8 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_nccl 9 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 10 | 11 | %1\python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo 12 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 13 | 14 | %1\python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl 15 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 16 | 17 | %1\python.exe test/run_test.py --verbose -i distributed/test_data_parallel 18 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 19 | 20 | %1\python.exe test/run_test.py --verbose -i distributed/test_store 21 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 22 | 23 | %1\python.exe test/run_test.py --verbose -i distributed/test_pg_wrapper 24 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 25 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/test_python_jit_legacy.bat: -------------------------------------------------------------------------------- 1 | call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat 2 | 3 | echo Copying over test times file 4 | copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%" 5 | 6 | pushd test 7 | 8 | echo Run jit legacy tests 9 | python run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose 10 | if ERRORLEVEL 1 exit /b 1 11 | 12 | popd 13 | -------------------------------------------------------------------------------- /.ci/pytorch/win-test-helpers/test_python_shard.bat: -------------------------------------------------------------------------------- 1 | call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat 2 | :: exit the batch once there's an error 3 | if not errorlevel 0 ( 4 | echo "setup pytorch env failed" 5 | echo %errorlevel% 6 | exit /b 7 | ) 8 | 9 | pushd test 10 | 11 | set GFLAGS_EXE="C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\gflags.exe" 12 | if "%SHARD_NUMBER%" == "1" ( 13 | if exist %GFLAGS_EXE% ( 14 | echo Some smoke tests 15 | %GFLAGS_EXE% /i python.exe +sls 16 | python %SCRIPT_HELPERS_DIR%\run_python_nn_smoketests.py 17 | if ERRORLEVEL 1 goto fail 18 | 19 | %GFLAGS_EXE% /i python.exe -sls 20 | if ERRORLEVEL 1 goto fail 21 | ) 22 | ) 23 | 24 | echo Copying over test times file 25 | copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%" 26 | 27 | echo Run python tests 28 | python
run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose 29 | if ERRORLEVEL 1 goto fail 30 | 31 | popd 32 | 33 | :eof 34 | exit /b 0 35 | 36 | :fail 37 | exit /b 1 38 | -------------------------------------------------------------------------------- /.cmakelintrc: -------------------------------------------------------------------------------- 1 | filter=-convention/filename,-linelength,-package/consistency,-readability/logic,-readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | plugins = 3 | coverage_plugins.jit_plugin 4 | omit = 5 | */tmp* 6 | */Temp/* 7 | */usr/local/lib* 8 | *test/* 9 | 10 | [report] 11 | omit = 12 | */tmp* 13 | */Temp/* 14 | */usr/local/lib* 15 | *test/* 16 | -------------------------------------------------------------------------------- /.ctags.d/pytorch.ctags: -------------------------------------------------------------------------------- 1 | --exclude=build/* 2 | --exclude=include/* 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .gitignore -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bat text eol=crlf 2 | .circleci/config.yml linguist-generated=true 3 | .github/workflows/generated-*.yml linguist-generated=true 4 | .github/generated-* linguist-generated=true 5 | .github/scripts/gql_mocks.json linguist-generated=true 6 | third_party/LICENSES_BUNDLED.txt linguist-generated=true 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ci-sev.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "⚠️ CI SEV" 3 | about: Tracking incidents for PyTorch's CI infra. 4 | labels: "ci: sev" 5 | --- 6 | 7 | > NOTE: Remember to label this issue with "`ci: sev`" 8 | 9 | **MERGE BLOCKING** 10 | 11 | ## Current Status 12 | *Status could be: preemptive, ongoing, mitigated, closed. Also tell people if they need to take action to fix it (i.e. rebase)*. 13 | 14 | ## Error looks like 15 | *Provide some way users can tell that this SEV is causing their issue.* 16 | 17 | ## Incident timeline (all times pacific) 18 | *Include when the incident began, when it was detected, mitigated, root caused, and finally closed.* 19 | 20 |
<details> 21 | <summary>Click for example</summary> 22 | 23 | e.g. 24 | - 10/30 7:27a incident began 25 | - 10/30 8:30a detected by <method> 26 | - 10/30 9:00 pm root caused as… 27 | - 10/30 9:10 pm mitigated by… 28 | - 10/31 10:00 am closed by… 29 | 30 | </details>
31 | 32 | ## User impact 33 | *How does this affect users of PyTorch CI?* 34 | 35 | ## Root cause 36 | *What was the root cause of this issue?* 37 | 38 | ## Mitigation 39 | *How did we mitigate the issue?* 40 | 41 | ## Prevention/followups 42 | *How do we prevent issues like this in the future?* 43 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Questions 4 | url: https://discuss.pytorch.org/ 5 | about: Ask questions and discuss with other PyTorch community members 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/disable-ci-jobs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Disable CI jobs (PyTorch Dev Infra only) 3 | about: Use this template to disable CI jobs 4 | title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]" 5 | labels: "module: ci" 6 | --- 7 | 8 | > For example, DISABLED pull / win-vs2019-cpu-py3 / test (default). Once 9 | > created, the job will be disabled within 15 minutes. You can check the 10 | > list of disabled jobs at https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json 11 | 12 | > If you need to get this out ASAP instead of waiting for 15 minutes, 13 | > you can manually trigger the workflow at https://github.com/pytorch/test-infra/actions/workflows/update_disabled_tests.yml 14 | > once the issue is created to update the above JSON list right away. 15 | 16 | > Note: you need to have write access to the PyTorch repo to disable CI 17 | > jobs. The issue will be rejected otherwise. 18 | 19 | ## Reason 20 | *Provide a reason why this is needed and when this can be resolved*. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://pytorch.org/docs/stable/index.html 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 📚 The doc issue 8 | description: > 9 | A clear and concise description of what content in https://pytorch.org/docs/stable/index.html is an issue. If this has to do with the general https://pytorch.org website, please file an issue at https://github.com/pytorch/pytorch.github.io/issues/new/choose instead. If this has to do with https://pytorch.org/tutorials, please file an issue at https://github.com/pytorch/tutorials/issues/new. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Suggest a potential alternative/fix 15 | description: > 16 | Tell us how we could improve the documentation in this regard. 17 | - type: markdown 18 | attributes: 19 | value: > 20 | Thanks for contributing 🎉! 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new PyTorch feature 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 🚀 The feature, motivation and pitch 8 | description: > 9 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal.
Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Alternatives 15 | description: > 16 | A description of any alternative solutions or features you've considered, if any. 17 | - type: textarea 18 | attributes: 19 | label: Additional context 20 | description: > 21 | Add any other context or screenshots about the feature request. 22 | - type: markdown 23 | attributes: 24 | value: > 25 | Thanks for contributing 🎉! 26 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes #ISSUE_NUMBER 2 | -------------------------------------------------------------------------------- /.github/actionlint.yaml: -------------------------------------------------------------------------------- 1 | self-hosted-runner: 2 | labels: 3 | - linux.20_04.4x 4 | - linux.20_04.16x 5 | - linux.large 6 | - linux.2xlarge 7 | - linux.4xlarge 8 | - linux.12xlarge 9 | - linux.24xlarge 10 | - linux.4xlarge.nvidia.gpu 11 | - linux.8xlarge.nvidia.gpu 12 | - linux.16xlarge.nvidia.gpu 13 | - linux.g5.4xlarge.nvidia.gpu 14 | - windows.4xlarge 15 | - windows.8xlarge.nvidia.gpu 16 | - windows.g5.4xlarge.nvidia.gpu 17 | - bm-runner 18 | - linux.rocm.gpu 19 | - macos-m1-12 20 | - macos-m1-13 21 | - macos-12-xl 22 | - macos-12 23 | - macos12.3-m1 24 | -------------------------------------------------------------------------------- /.github/actions/chown-workspace/action.yml: -------------------------------------------------------------------------------- 1 | name: Chown workspace 2 | 3 | description: Ensure that the working directory gets chowned back to the current user 4 | 5 | runs: 6 | using: composite 7 | steps: 8 | - run: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 9 | shell: bash 10 | env: 11 | ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" 12 | -------------------------------------------------------------------------------- /.github/actions/diskspace-cleanup/action.yml: -------------------------------------------------------------------------------- 1 | name: Cleans up diskspace 2 | 3 | description: Cleans up diskspace if the root directory has used more than the cutoff percentage of your diskspace (seventy percent by default). 4 | 5 | inputs: 6 | diskspace-cutoff: 7 | description: The percent amount after which docker prune is run. 8 | required: true 9 | default: 70 10 | 11 | runs: 12 | using: composite 13 | steps: 14 | - name: Cleans up diskspace 15 | shell: bash 16 | run: | 17 | diskspace_cutoff=${{ inputs.diskspace-cutoff }} 18 | diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //') 19 | msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified" 20 | if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then 21 | docker system prune -af 22 | diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //') 23 | if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then 24 | echo "Error: Used diskspace is still above $diskspace_cutoff percent after pruning. Not enough diskspace."
25 | echo "$msg" 26 | exit 1 27 | else 28 | difference=$((diskspace - diskspace_new)) 29 | echo "Diskspace saved: $difference percent" 30 | fi 31 | fi 32 | -------------------------------------------------------------------------------- /.github/actions/download-build-artifacts/action.yml: -------------------------------------------------------------------------------- 1 | name: Download PyTorch Build Artifacts 2 | 3 | description: Download and unzip artifacts from a previous PyTorch build. 4 | 5 | inputs: 6 | name: 7 | description: Name of what artifact to download 8 | required: true 9 | use-gha: 10 | description: If set to any value, use GHA to download the artifact. Otherwise use s3. 11 | required: false 12 | 13 | runs: 14 | using: composite 15 | steps: 16 | - name: Download PyTorch Build Artifacts from S3 17 | if: ${{ !inputs.use-gha }} 18 | uses: seemethere/download-artifact-s3@v4 19 | with: 20 | name: ${{ inputs.name }} 21 | 22 | - name: Download PyTorch Build Artifacts from GHA 23 | if: inputs.use-gha 24 | uses: actions/download-artifact@v3 25 | with: 26 | name: ${{ inputs.name }} 27 | 28 | - name: Unzip artifacts 29 | shell: bash 30 | run: unzip -o artifacts.zip 31 | 32 | - name: Output disk space left 33 | shell: bash 34 | run: df -H 35 | -------------------------------------------------------------------------------- /.github/actions/get-workflow-job-id/action.yml: -------------------------------------------------------------------------------- 1 | name: Get workflow job id 2 | 3 | description: Get the ID of the workflow job that is currently running. 4 | 5 | inputs: 6 | github-token: 7 | description: GITHUB_TOKEN 8 | required: true 9 | 10 | outputs: 11 | job-id: 12 | description: The retrieved workflow job id 13 | value: ${{ steps.get-job-id.outputs.job-id }} 14 | 15 | runs: 16 | using: composite 17 | steps: 18 | - name: Get jobid or fail 19 | # timeout-minutes is unsupported for composite workflows, see https://github.com/actions/runner/issues/1979 20 | # timeout-minutes: 10 21 | shell: bash 22 | id: get-job-id 23 | run: | 24 | set -eux 25 | GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}") 26 | echo "job-id=${GHA_WORKFLOW_JOB_ID}" >> "${GITHUB_OUTPUT}" 27 | env: 28 | GITHUB_TOKEN: ${{ inputs.github-token }} 29 | -------------------------------------------------------------------------------- /.github/actions/teardown-rocm/action.yml: -------------------------------------------------------------------------------- 1 | name: Teardown ROCm host 2 | 3 | description: Tear down ROCm host for CI 4 | 5 | runs: 6 | using: composite 7 | steps: 8 | - name: Teardown ROCm 9 | if: always() 10 | shell: bash 11 | run: | 12 | # ignore expansion of "docker ps -q" since it could be empty 13 | # shellcheck disable=SC2046 14 | docker stop $(docker ps -q) || true 15 | # Prune all stopped containers. 
16 | docker container prune -f 17 | - name: Runner diskspace health check 18 | uses: ./.github/actions/diskspace-cleanup 19 | if: always() 20 | -------------------------------------------------------------------------------- /.github/actions/teardown-win/action.yml: -------------------------------------------------------------------------------- 1 | name: Teardown Windows 2 | 3 | description: Tear down Windows workspace for CI 4 | 5 | inputs: 6 | extra-delete-dir: 7 | description: If set, cleaning up the workspace will delete this too 8 | required: false 9 | default: "" 10 | 11 | runs: 12 | using: composite 13 | steps: 14 | - name: Wait until all sessions have drained 15 | shell: powershell 16 | if: always() 17 | run: | 18 | .github\scripts\wait_for_ssh_to_drain.ps1 19 | 20 | - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) 21 | shell: powershell 22 | if: always() 23 | run: | 24 | .github\scripts\kill_active_ssh_sessions.ps1 25 | 26 | - name: Cleanup workspace 27 | if: always() 28 | shell: bash 29 | env: 30 | EXTRA_DELETE_DIR: ${{ inputs.extra-delete-dir }} 31 | run: | 32 | [ -z "${EXTRA_DELETE_DIR}" ] || rm -rf "${EXTRA_DELETE_DIR}" 33 | rm -rf ./* 34 | -------------------------------------------------------------------------------- /.github/auto_request_review.yml: -------------------------------------------------------------------------------- 1 | # Documented at https://github.com/necojackarc/auto-request-review 2 | reviewers: 3 | groups: 4 | symbolic-shapes: 5 | - ezyang 6 | - albanD 7 | - miladm 8 | - bdhirsh 9 | - voznesenskym 10 | - jbschlosser 11 | 12 | per_author: 13 | symbolic-shapes: 14 | - symbolic-shapes 15 | - antoniojkim 16 | - wconstab 17 | - SherlockNoMad 18 | Chillee: 19 | - ezyang 20 | 21 | files: 22 | # none yet, TODO: migrate CODEOWNERS here 23 | 24 | options: 25 | ignore_draft: true 26 | ignored_keywords: 27 | - DO NOT REVIEW 28 | # Just manually setup a self-referential per_author rule if you 29 | # want group assignment 30 | enable_group_assignment: false 31 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/audio.txt: -------------------------------------------------------------------------------- 1 | a8f4e97bd5356a7a77510cdf6a3a62e25a5dc602 -------------------------------------------------------------------------------- /.github/ci_commit_pins/huggingface.txt: -------------------------------------------------------------------------------- 1 | ebee0a27940adfbb30444d83387b9ea0f1173f40 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/multipy.txt: -------------------------------------------------------------------------------- 1 | 7dd29931fa8e9bb7c970f05f8c0dc13b69e17494 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/text.txt: -------------------------------------------------------------------------------- 1 | 5b78d074bd303eb230d30567646fcf0358ee2dd4 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/timm.txt: -------------------------------------------------------------------------------- 1 | 6635bc3f7d06c6a0d0481803b24d6ad0004b61ac 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/torchbench.txt: -------------------------------------------------------------------------------- 1 | 159e58f0b36ee22e2b89d74bd7dc8a79376de01d 2 |
-------------------------------------------------------------------------------- /.github/ci_commit_pins/triton.txt: -------------------------------------------------------------------------------- 1 | ../../.ci/docker/ci_commit_pins/triton.txt-------------------------------------------------------------------------------- /.github/ci_commit_pins/vision.txt: -------------------------------------------------------------------------------- 1 | b78d98bb152ffb9c0c0f5365f59f475c70b1784e 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/xla.txt: -------------------------------------------------------------------------------- 1 | f235d4da06905b35d75879a0a9bc3034ab7385ac 2 | -------------------------------------------------------------------------------- /.github/pytorch-circleci-labels.yml: -------------------------------------------------------------------------------- 1 | # For documentation concerning this configuration please refer to, 2 | # https://github.com/pytorch/pytorch-probot#trigger-circleci-workflows 3 | labels_to_circle_params: 4 | ci/binaries: 5 | parameter: run_binary_tests 6 | default_true_on: 7 | branches: 8 | - nightly 9 | - release/.* 10 | tags: 11 | - v[0-9]+(\.[0-9]+)*-rc[0-9]+ 12 | set_to_false: 13 | - run_build 14 | ci/master: 15 | parameter: run_master_build 16 | set_to_false: 17 | - run_build 18 | ci/slow-gradcheck: 19 | parameter: run_slow_gradcheck_build 20 | set_to_false: 21 | - run_build 22 | -------------------------------------------------------------------------------- /.github/pytorch-probot.yml: -------------------------------------------------------------------------------- 1 | tracking_issue: 24422 2 | ciflow_tracking_issue: 64124 3 | ciflow_push_tags: 4 | - ciflow/binaries 5 | - ciflow/binaries_conda 6 | - ciflow/binaries_libtorch 7 | - ciflow/binaries_wheel 8 | - ciflow/inductor 9 | - ciflow/inductor-perf-compare 10 | - ciflow/mps 11 | - ciflow/nightly 12 | - ciflow/periodic 13 | - ciflow/slow 14 | - ciflow/trunk 15 | - ciflow/unstable 16 | retryable_workflows: 17 | - lint 18 | - pull 19 | - trunk 20 | - linux-binary 21 | - windows-binary 22 | -------------------------------------------------------------------------------- /.github/regenerate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Allows this script to be invoked from any directory: 4 | cd "$(dirname "$0")" 5 | 6 | python3 scripts/generate_ci_workflows.py 7 | -------------------------------------------------------------------------------- /.github/requirements-gha-cache.txt: -------------------------------------------------------------------------------- 1 | # This file is used to cache other dependencies not specified elsewhere in: 2 | # requirements.txt 3 | # requirements-flake8.txt 4 | # docs/requirements.txt 5 | # docs/cpp/requirements.txt 6 | # functorch/docs/requirements.txt 7 | # .ci/docker/requirements-ci.txt 8 | boto3==1.19.12 9 | jinja2==3.0.1 10 | lintrunner==0.10.7 11 | ninja==1.10.0.post1 12 | nvidia-ml-py==11.525.84 13 | pyyaml==6.0 14 | requests==2.26 15 | rich==10.9.0 16 | rockset==1.0.3 17 | -------------------------------------------------------------------------------- /.github/requirements/conda-env-Linux-X64: -------------------------------------------------------------------------------- 1 | cmake=3.22.* 2 | mkl=2022.1.0 3 | mkl-include=2022.1.0 4 | ninja=1.10.2 5 | numpy=1.23.3 6 | pyyaml=6.0 7 | requests=2.28.1 8 | setuptools=65.5.0 9 |
typing-extensions=4.3.0 10 | -------------------------------------------------------------------------------- /.github/requirements/conda-env-iOS: -------------------------------------------------------------------------------- 1 | blas=1.0 2 | cmake=3.22.1 3 | mkl=2022.1.0 4 | mkl-include=2022.1.0 5 | ninja=1.10.2 6 | numpy=1.23.3 7 | pyyaml=6.0 8 | requests=2.28.1 9 | setuptools=63.4.1 10 | typing-extensions=4.3.0 11 | -------------------------------------------------------------------------------- /.github/requirements/conda-env-macOS-ARM64: -------------------------------------------------------------------------------- 1 | numpy=1.22.3 2 | pyyaml=6.0 3 | setuptools=61.2.0 4 | cmake=3.22.* 5 | typing-extensions=4.3.0 6 | dataclasses=0.8 7 | pip=22.2.2 8 | pillow=9.2.0 9 | pkg-config=0.29.2 10 | wheel=0.37.1 11 | expecttest=0.1.3 12 | 13 | # Not pinning certifi so that we can always get the latest certificates 14 | certifi 15 | 16 | # Cross-compiling arm64 from x86-64 picks up 1.40.0 while testing on arm64 17 | # itself only has up to 1.39.0 from upstream conda. Both work though 18 | libuv>=1.39.0,<=1.40.0 19 | -------------------------------------------------------------------------------- /.github/requirements/conda-env-macOS-X64: -------------------------------------------------------------------------------- 1 | mkl=2021.2.0 2 | mkl-include=2021.2.0 3 | numpy=1.21.2 4 | pyyaml=5.3 5 | setuptools=46.0.0 6 | cmake=3.22.* 7 | typing-extensions=4.3.0 8 | dataclasses=0.8 9 | pip=22.2.2 10 | pillow=9.2.0 11 | libuv=1.40.0 12 | pkg-config=0.29.2 13 | wheel=0.37.1 14 | 15 | # Not pinning certifi so that we can always get the latest certificates 16 | certifi 17 | -------------------------------------------------------------------------------- /.github/requirements/pip-requirements-iOS.txt: -------------------------------------------------------------------------------- 1 | # iOS simulator requirements 2 | coremltools==5.0b5 3 | protobuf==3.20.2 4 | -------------------------------------------------------------------------------- /.github/requirements/pip-requirements-macOS.txt: -------------------------------------------------------------------------------- 1 | boto3==1.19.12 2 | hypothesis==6.56.4 3 | expecttest==0.1.3 4 | librosa>=0.6.2 5 | mpmath==1.2.1 6 | networkx==2.8.7 7 | # Use numba-0.49.1 or older on Intel Macs, but 0.56.0 on M1 machines, as older numba is not available 8 | numba==0.56.0; platform_machine == "arm64" 9 | numba<=0.49.1; platform_machine != "arm64" 10 | opt-einsum>=3.3 11 | psutil==5.9.1 12 | nvidia-ml-py==11.525.84 13 | pygments==2.12.0 14 | pytest==7.2.0 15 | pytest-xdist==3.0.2 16 | pytest-rerunfailures==10.3 17 | pytest-flakefinder==1.1.0 18 | pytest-shard==0.1.2 19 | scipy==1.9.0 20 | sympy==1.11.1 21 | unittest-xml-reporting<=3.2.0,>=2.0.0 22 | xdoctest==1.1.0 23 | filelock==3.6.0 24 | -------------------------------------------------------------------------------- /.github/requirements/regenerate-requirements.txt: -------------------------------------------------------------------------------- 1 | typing-extensions 2 | jinja2 3 | -------------------------------------------------------------------------------- /.github/scripts/comment_on_pr.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any 3 | 4 | from github_utils import gh_post_pr_comment 5 | from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo 6 | from trymerge_explainer import BOT_COMMANDS_WIKI 7 | 8
| 9 | def parse_args() -> Any: 10 | from argparse import ArgumentParser 11 | 12 | parser = ArgumentParser("Comment on a PR") 13 | parser.add_argument("pr_num", type=int) 14 | parser.add_argument("action", type=str) 15 | return parser.parse_args() 16 | 17 | 18 | def main() -> None: 19 | args = parse_args() 20 | repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=True) 21 | org, project = repo.gh_owner_and_name() 22 | run_url = os.environ.get("GH_RUN_URL") 23 | 24 | job_link = f"[job]({run_url})" if run_url is not None else "job" 25 | msg = ( 26 | f"The {args.action} {job_link} was canceled. If you believe this is a mistake, " 27 | + f"then you can re-trigger it through [pytorch-bot]({BOT_COMMANDS_WIKI})." 28 | ) 29 | 30 | gh_post_pr_comment(org, project, args.pr_num, msg) 31 | print(org, project, args.pr_num, msg) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /.github/scripts/export_pytorch_labels.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test ownership was introduced in https://github.com/pytorch/pytorch/issues/66232. 4 | 5 | As a part of enforcing test ownership, we want to maintain a list of existing PyTorch labels 6 | to verify the owners' existence. This script outputs a file containing a list of existing 7 | pytorch/pytorch labels so that the file could be uploaded to S3. 8 | 9 | This script assumes the correct env vars are set for AWS permissions. 10 | 11 | """ 12 | 13 | import json 14 | from typing import Any 15 | 16 | import boto3 # type: ignore[import] 17 | 18 | from label_utils import gh_get_labels 19 | 20 | 21 | def parse_args() -> Any: 22 | from argparse import ArgumentParser 23 | 24 | parser = ArgumentParser("Export PR labels") 25 | parser.add_argument("org", type=str) 26 | parser.add_argument("repo", type=str) 27 | 28 | return parser.parse_args() 29 | 30 | 31 | def main() -> None: 32 | args = parse_args() 33 | print(f"Exporting labels for {args.org}/{args.repo}") 34 | labels_file_name = "pytorch_labels.json" 35 | obj = boto3.resource("s3").Object("ossci-metrics", labels_file_name) 36 | obj.put(Body=json.dumps(gh_get_labels(args.org, args.repo)).encode()) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /.github/scripts/kill_active_ssh_sessions.ps1: -------------------------------------------------------------------------------- 1 | function Get-SSH-Sessions { 2 | Get-Process sshd -IncludeUserName | 3 | Where-Object UserName -notLike "*SYSTEM*" | 4 | Select-Object Id 5 | } 6 | 7 | $runningSessions = Get-SSH-Sessions 8 | 9 | foreach ($session in $runningSessions) { 10 | Stop-Process -id $session.Id 11 | } 12 | -------------------------------------------------------------------------------- /.github/scripts/on_cancel_merge.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Any 3 | 4 | from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo 5 | from trymerge import GitHubPR, MERGE_IN_PROGRESS_LABEL 6 | 7 | 8 | def parse_args() -> Any: 9 | parser = argparse.ArgumentParser( 10 | description="Perform actions when a merge workflow is cancelled" 11 | ) 12 | parser.add_argument( 13 | "--pr-num", 14 | type=int, 15 | required=True, 16 | help="The PR number to cancel the merge for", 17 | ) 18 | return parser.parse_args() 19 | 20
| 21 | def main() -> None: 22 | args = parse_args() 23 | repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=True) 24 | org, project = repo.gh_owner_and_name() 25 | pr_num = args.pr_num 26 | 27 | GitHubPR(org, project, pr_num).remove_label(MERGE_IN_PROGRESS_LABEL) 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /.github/scripts/parse_ref.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | 6 | 7 | def set_output(name: str, val: str) -> None: 8 | if os.getenv("GITHUB_OUTPUT"): 9 | with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: 10 | print(f"{name}={val}", file=env) 11 | else: 12 | print(f"::set-output name={name}::{val}") 13 | 14 | 15 | def main() -> None: 16 | ref = os.environ["GITHUB_REF"] 17 | m = re.match(r"^refs/(\w+)/(.*)$", ref) 18 | if m: 19 | category, stripped = m.groups() 20 | if category == "heads": 21 | set_output("branch", stripped) 22 | elif category == "pull": 23 | set_output("branch", "pull/" + stripped.split("/")[0]) 24 | elif category == "tags": 25 | set_output("tag", stripped) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /.github/scripts/report_git_status.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | CHANGES=$(git status --porcelain "$1") 3 | echo "$CHANGES" 4 | git diff "$1" 5 | [ -z "$CHANGES" ] 6 | -------------------------------------------------------------------------------- /.github/scripts/stop_runner_service.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set +e 4 | set -x 5 | 6 | # Get the service name 7 | RUNNER_SERVICE=$(cat "${RUNNER_WORKSPACE}/../../.service") 8 | echo "GitHub self-hosted runner service: ${RUNNER_SERVICE}" 9 | 10 | if [[ -n "${RUNNER_SERVICE}" ]]; then 11 | echo "The self-hosted runner has encountered an unrecoverable error and will be shut down" 12 | 13 | pushd "${RUNNER_WORKSPACE}/../../" 14 | # Stop it to prevent the runner from receiving new jobs 15 | sudo ./svc.sh stop 16 | # Then uninstall the service 17 | sudo ./svc.sh uninstall 18 | # Finally, shut down the runner machine completely 19 | sudo shutdown -P now 20 | # NB: In my testing, cleaning up and shutting down the runner this way already 21 | # removes the runner from the list of registered runners, so calling config.sh remove 22 | # seems redundant; it would also require an org token, which I don't want to 23 | # add as yet another CI secret if there is no need 24 | fi 25 | -------------------------------------------------------------------------------- /.github/scripts/wait_for_ssh_to_drain.ps1: -------------------------------------------------------------------------------- 1 | function Get-SSH-Users { 2 | # Gets ssh sessions for all users not named SYSTEM 3 | Get-CimInstance -ClassName Win32_Process -Filter "Name = 'sshd.exe'" | 4 | Get-CimAssociatedInstance -Association Win32_SessionProcess | 5 | Get-CimAssociatedInstance -Association Win32_LoggedOnUser | 6 | Where-Object {$_.Name -ne 'SYSTEM'} | 7 | Measure-Object 8 | } 9 | 10 | $usersLoggedOn = Get-SSH-Users 11 | 12 | Write-Output "Holding runner until all ssh sessions have logged out" 13 | while ($usersLoggedOn.Count -gt 0) { 14 | $usersLoggedOn = Get-SSH-Users 15 | Write-Output "."
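    # Poll again after 5 seconds; the runner stays held until no non-SYSTEM sshd sessions remain.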
16 | Start-Sleep -s 5 17 | } 18 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | include_trailing_comma=True 3 | multi_line_output=3 4 | skip=third_party 5 | skip_gitignore=True 6 | use_parentheses=True 7 | -------------------------------------------------------------------------------- /.lldbinit: -------------------------------------------------------------------------------- 1 | # Automatically load the pytorch_lldb extension. 2 | # 3 | # lldb automatically tries to load this file whenever it is executed from the 4 | # root of the pytorch repo, but by default it is not allowed to do so for 5 | # security reasons. If you want to use pytorch_lldb, please add the following 6 | # line to your ~/.lldbinit (i.e., the .lldbinit file which is in your home 7 | # directory, NOT this file): 8 | # settings set target.load-cwd-lldbinit true 9 | # setting set escape-non-printables false 10 | # 11 | # Alternatively, you can manually load the pytorch_lldb commands into your 12 | # existing lldb session by doing the following: 13 | # (lldb) command script import tools/lldb/pytorch_lldb.py 14 | 15 | command script import tools/lldb/pytorch_lldb.py 16 | setting set escape-non-printables false 17 | type category enable torch 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is not a real branch. 2 | Please check out `main` 3 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Reporting Security Issues 2 | 3 | If you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. 4 | 5 | Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new 6 | 7 | Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: 8 | 9 | https://www.facebook.com/whitehat 10 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Benchmarks 2 | 3 | This folder contains scripts that produce reproducible timings of various PyTorch features. 4 | 5 | It also provides mechanisms to compare PyTorch with other frameworks. 6 | 7 | ## Setup environment 8 | Make sure you're on a machine with CUDA available, then install pytorch and torchvision in the following order: 9 | ``` 10 | # Install torchvision. It comes with the pytorch stable release binary 11 | conda install pytorch torchvision -c pytorch 12 | 13 | # Install the latest pytorch master from source. 14 | # It should supersede the installation from the release binary.
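# $PYTORCH_HOME is assumed to point at your pytorch checkout, e.g.:
# export PYTORCH_HOME="$HOME/pytorch"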
15 | cd $PYTORCH_HOME 16 | python setup.py build develop 17 | 18 | # Check the pytorch installation version 19 | python -c "import torch; print(torch.__version__)" 20 | ``` 21 | 22 | ## Benchmark List 23 | 24 | Please refer to each subfolder to discover each benchmark suite. 25 | 26 | * [Fast RNNs benchmarks](fastrnns/README.md) 27 | -------------------------------------------------------------------------------- /benchmarks/compare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m fastrnns.bench --fuser=old --group=rnns --print-json oss > old.json 3 | python -m fastrnns.bench --fuser=te --group=rnns --print-json oss > te.json 4 | python compare-fastrnn-results.py old.json te.json --format md 5 | -------------------------------------------------------------------------------- /benchmarks/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(convolution_bench convolution.cpp) 2 | target_link_libraries(convolution_bench PRIVATE torch_library benchmark) 3 | -------------------------------------------------------------------------------- /benchmarks/cpp/tensorexpr/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(AVX) 2 | 3 | add_executable( 4 | tensorexpr_bench 5 | bench_approx.cpp 6 | bench_batchnorm.cpp 7 | bench_concat.cpp 8 | bench_compile.cpp 9 | bench_signed_log1p.cpp 10 | bench_fuser_overhead.cpp 11 | bench_gemm.cpp 12 | bench_kernels.cpp 13 | bench_parallel.cpp 14 | bench_prefix_sum.cpp 15 | bench_reduce.cpp 16 | main.cpp) 17 | 18 | if(C_AVX2_FOUND) 19 | message(STATUS "AVX2 compiler support found") 20 | target_compile_options(tensorexpr_bench PUBLIC -mavx2) 21 | target_compile_definitions(tensorexpr_bench PUBLIC USE_AVX2) 22 | endif() 23 | 24 | target_link_libraries(tensorexpr_bench PRIVATE torch_library benchmark) 25 | -------------------------------------------------------------------------------- /benchmarks/cpp/tensorexpr/main.cpp: -------------------------------------------------------------------------------- 1 | #include <benchmark/benchmark.h> 2 | 3 | BENCHMARK_MAIN(); 4 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/configurations/data_configurations.json: -------------------------------------------------------------------------------- 1 | { 2 | "DummyData": { 3 | "data_class": "DummyData", 4 | "configurations": { 5 | "max_val": 1024, 6 | "sample_count": 1024, 7 | "sample_length": 1024, 8 | "sparsity_percentage": 20 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/configurations/model_configurations.json: -------------------------------------------------------------------------------- 1 | { 2 | "DummyModel": { 3 | "model_class": "DummyModel", 4 | "configurations": { 5 | "num_embeddings": 1024, 6 | "embedding_dim": 1024, 7 | "dense_input_size": 1024, 8 | "dense_output_size": 1024, 9 | "dense_layers_count": 8, 10 | "sparse": false 11 | } 12 | }, 13 | "DummyModelSparse": { 14 | "model_class": "DummyModel", 15 | "configurations": { 16 | "num_embeddings": 1024, 17 | "embedding_dim": 1024, 18 | "dense_input_size": 1024, 19 | "dense_output_size": 1024, 20 | "dense_layers_count": 8, 21 | "sparse": true 22 | } 23 | } 24 | } 25 | --------------------------------------------------------------------------------
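The `configurations` blocks above are plain keyword arguments for the classes named by `model_class`/`data_class`. The benchmark's launcher (launcher.py, not reproduced here) presumably resolves those names through the `data_map`/`model_map` registries defined in the `__init__.py` files below. A minimal sketch of that lookup (the `build` helper and the hard-coded paths are illustrative, not part of the benchmark) could look like:

```python
import json

from data import data_map      # {"DummyData": DummyData}, defined below
from models import model_map   # {"DummyModel": DummyModel}, defined below


def build(model_name: str, data_name: str):
    # Load the JSON blocks shown above.
    with open("configurations/model_configurations.json") as f:
        model_cfg = json.load(f)[model_name]
    with open("configurations/data_configurations.json") as f:
        data_cfg = json.load(f)[data_name]
    # "configurations" is splatted straight into the class constructors.
    model = model_map[model_cfg["model_class"]](**model_cfg["configurations"])
    data = data_map[data_cfg["data_class"]](**data_cfg["configurations"])
    return model, data


# e.g. build("DummyModelSparse", "DummyData") yields a DummyModel with
# sparse=True and a DummyData instance with the settings above.
```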
/benchmarks/distributed/rpc/parameter_server/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .DummyData import DummyData 2 | 3 | data_map = { 4 | "DummyData": DummyData 5 | } 6 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/metrics/CPUMetric.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from .MetricBase import MetricBase 4 | 5 | 6 | class CPUMetric(MetricBase): 7 | def __init__(self, name: str): 8 | self.name = name 9 | self.start = None 10 | self.end = None 11 | 12 | def record_start(self): 13 | self.start = time.time() 14 | 15 | def record_end(self): 16 | self.end = time.time() 17 | 18 | def elapsed_time(self): 19 | if self.start is None: 20 | raise RuntimeError("start is None") 21 | if self.end is None: 22 | raise RuntimeError("end is None") 23 | return self.end - self.start 24 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/metrics/CUDAMetric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .MetricBase import MetricBase 4 | 5 | 6 | class CUDAMetric(MetricBase): 7 | def __init__(self, rank: int, name: str): 8 | self.rank = rank 9 | self.name = name 10 | self.start = None 11 | self.end = None 12 | 13 | def record_start(self): 14 | self.start = torch.cuda.Event(enable_timing=True) 15 | with torch.cuda.device(self.rank): 16 | self.start.record() 17 | 18 | def record_end(self): 19 | self.end = torch.cuda.Event(enable_timing=True) 20 | with torch.cuda.device(self.rank): 21 | self.end.record() 22 | 23 | def elapsed_time(self): 24 | if not self.start.query(): 25 | raise RuntimeError("start event did not complete") 26 | if not self.end.query(): 27 | raise RuntimeError("end event did not complete") 28 | return self.start.elapsed_time(self.end) 29 | 30 | def synchronize(self): 31 | self.start.synchronize() 32 | self.end.synchronize() 33 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/metrics/MetricBase.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class MetricBase(ABC): 5 | def __init__(self, name): 6 | self.name = name 7 | self.start = None 8 | self.end = None 9 | 10 | @abstractmethod 11 | def record_start(self): 12 | return 13 | 14 | @abstractmethod 15 | def record_end(self): 16 | return 17 | 18 | @abstractmethod 19 | def elapsed_time(self): 20 | return 21 | 22 | def get_name(self): 23 | return self.name 24 | 25 | def get_end(self): 26 | return self.end 27 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/models/DummyModel.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class DummyModel(nn.Module): 6 | def __init__( 7 | self, 8 | num_embeddings: int, 9 | embedding_dim: int, 10 | dense_input_size: int, 11 | dense_output_size: int, 12 | dense_layers_count: int, 13 | sparse: bool 14 | ): 15 | r""" 16 | A dummy model with an EmbeddingBag Layer and Dense Layer. 
17 | Args: 18 | num_embeddings (int): size of the dictionary of embeddings 19 | embedding_dim (int): the size of each embedding vector 20 | dense_input_size (int): size of each input sample 21 | dense_output_size (int): size of each output sample 22 | dense_layers_count: (int): number of dense layers in dense Sequential module 23 | sparse (bool): if True, gradient w.r.t. weight matrix will be a sparse tensor 24 | """ 25 | super().__init__() 26 | self.embedding = nn.EmbeddingBag( 27 | num_embeddings, embedding_dim, sparse=sparse 28 | ) 29 | self.dense = nn.Sequential(*[nn.Linear(dense_input_size, dense_output_size) for _ in range(dense_layers_count)]) 30 | 31 | def forward(self, x): 32 | x = self.embedding(x) 33 | return F.softmax(self.dense(x), dim=1) 34 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .DummyModel import DummyModel 2 | 3 | model_map = { 4 | "DummyModel": DummyModel 5 | } 6 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/server/__init__.py: -------------------------------------------------------------------------------- 1 | from .server import AverageBatchParameterServer, AverageParameterServer 2 | 3 | server_map = { 4 | "AverageParameterServer": AverageParameterServer, 5 | "AverageBatchParameterServer": AverageBatchParameterServer 6 | } 7 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .criterions import cel 2 | from .ddp_models import basic_ddp_model 3 | from .hook_states import BasicHookState 4 | from .hooks import allreduce_hook, hybrid_hook, rpc_hook, sparse_rpc_hook 5 | from .iteration_steps import basic_iteration_step 6 | from .preprocess_data import preprocess_dummy_data 7 | from .trainer import DdpTrainer 8 | 9 | criterion_map = { 10 | "cel": cel 11 | } 12 | 13 | ddp_hook_map = { 14 | "allreduce_hook": allreduce_hook, 15 | "hybrid_hook": hybrid_hook, 16 | "rpc_hook": rpc_hook, 17 | "sparse_rpc_hook": sparse_rpc_hook 18 | } 19 | 20 | ddp_model_map = { 21 | "basic_ddp_model": basic_ddp_model 22 | } 23 | 24 | iteration_step_map = { 25 | "basic_iteration_step": basic_iteration_step 26 | } 27 | 28 | preprocess_data_map = { 29 | "preprocess_dummy_data": preprocess_dummy_data 30 | } 31 | 32 | hook_state_map = { 33 | "BasicHookState": BasicHookState 34 | } 35 | 36 | trainer_map = { 37 | "DdpTrainer": DdpTrainer 38 | } 39 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/criterions.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def cel(rank): 5 | r"""A function that creates a CrossEntropyLoss 6 | criterion for training. 
7 | Args: 8 | rank (int): worker rank 9 | """ 10 | return nn.CrossEntropyLoss().cuda(rank) 11 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py: -------------------------------------------------------------------------------- 1 | from torch.nn.parallel import DistributedDataParallel as DDP 2 | 3 | 4 | def basic_ddp_model(self, rank, model, process_group, hook_state, hook): 5 | r""" 6 | A function that creates a ddp_model and hook_state objects. 7 | The ddp model is initialized with a single device id and 8 | the process group. The ddp_model also registers the communication 9 | hook. 10 | Args: 11 | rank (int): worker rank 12 | model (nn.Module): neural network model 13 | process_group (ProcessGroup): distributed process group 14 | hook_state (class): class that will be used to keep track of state 15 | during training. 16 | hook (function): ddp communication hook 17 | """ 18 | ddp_model = DDP( 19 | model, device_ids=[rank], process_group=process_group 20 | ) 21 | hook_state = hook_state(self, process_group) 22 | ddp_model.register_comm_hook(hook_state, hook) 23 | return ddp_model, hook_state 24 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/hook_states.py: -------------------------------------------------------------------------------- 1 | class BasicHookState: 2 | 3 | def __init__(self, cref, process_group): 4 | r""" 5 | A class that holds state information that is needed by the communication hook 6 | during the training algorithm. 7 | Args: 8 | cref (DdpTrainer): reference to the self keyword of the trainer instance 9 | process_group (ProcessGroup): distributed process group 10 | """ 11 | self.cref = cref 12 | self.process_group = process_group 13 | self.batch_number = -1 14 | 15 | def get_key(self, bucket_index): 16 | r""" 17 | A method that returns an encoded key that represents the current batch and 18 | bucket index. 19 | Args: 20 | bucket_index (int): index of the bucket being processed in backward 21 | """ 22 | return f"{self.batch_number},{bucket_index}" 23 | 24 | def next_batch(self): 25 | r""" 26 | A method that increments batch_number by 1. 27 | """ 28 | self.batch_number += 1 29 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/iteration_steps.py: -------------------------------------------------------------------------------- 1 | def basic_iteration_step(self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch): 2 | r""" 3 | A function that performs an iteration of training. 
4 | Args: 5 | ddp_model (nn.Module): distributed data parallel model 6 | criterion (nn.Module): loss function to measure model 7 | optimizer (optim.Optimizer): updates model parameters 8 | hook_state (object): ddp communication hook state object 9 | epoch (int): index of pass through the data 10 | index (int): iteration number - 1 in current batch 11 | batch (list): training examples 12 | """ 13 | hook_state.next_batch() 14 | self.record_batch_start(self.epoch_key(epoch, index)) 15 | optimizer.zero_grad() 16 | self.record_forward_start(self.epoch_key(epoch, index)) 17 | loss = criterion(ddp_model(batch[0]), batch[1]) 18 | self.record_forward_end(self.epoch_key(epoch, index)) 19 | self.record_backward_start(self.epoch_key(epoch, index)) 20 | loss.backward() 21 | self.record_backward_end(self.epoch_key(epoch, index)) 22 | optimizer.step() 23 | self.record_batch_end(self.epoch_key(epoch, index)) 24 | -------------------------------------------------------------------------------- /benchmarks/distributed/rpc/parameter_server/trainer/preprocess_data.py: -------------------------------------------------------------------------------- 1 | def preprocess_dummy_data(rank, data): 2 | r""" 3 | A function that moves the data from CPU to GPU 4 | for DummyData class. 5 | Args: 6 | rank (int): worker rank 7 | data (list): training examples 8 | """ 9 | for i in range(len(data)): 10 | data[i][0] = data[i][0].cuda(rank) 11 | data[i][1] = data[i][1].cuda(rank) 12 | return data 13 | -------------------------------------------------------------------------------- /benchmarks/dynamo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/dynamo/__init__.py -------------------------------------------------------------------------------- /benchmarks/dynamo/check_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import textwrap 4 | 5 | import pandas as pd 6 | 7 | 8 | def check_csv(filename): 9 | """ 10 | Basic accuracy checking. 11 | """ 12 | 13 | df = pd.read_csv(filename) 14 | 15 | failed = [] 16 | for _, row in df.iterrows(): 17 | model_name = row["name"] 18 | status = row["accuracy"] 19 | if "pass" not in status: 20 | failed.append(model_name) 21 | 22 | print(f"{model_name:34} {status}") 23 | 24 | if failed: 25 | print( 26 | textwrap.dedent( 27 | f""" 28 | Error {len(failed)} models failed 29 | {' '.join(failed)} 30 | """ 31 | ) 32 | ) 33 | sys.exit(1) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--file", "-f", type=str, help="csv file name") 39 | args = parser.parse_args() 40 | check_csv(args.file) 41 | -------------------------------------------------------------------------------- /benchmarks/dynamo/check_hf_bert_perf_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import textwrap 4 | 5 | import pandas as pd 6 | 7 | 8 | def check_hf_bert_perf_csv(filename): 9 | """ 10 | Basic performance checking. 
11 | """ 12 | 13 | df = pd.read_csv(filename) 14 | 15 | failed = [] 16 | for _, row in df.iterrows(): 17 | model_name = row["name"] 18 | speedup = row["speedup"] 19 | # Reduce from 1.165 to 1.160, see https://github.com/pytorch/pytorch/issues/96530 20 | # Reduce from 1.160 to 1.140 after a transformer version upgrade, see https://github.com/pytorch/benchmark/pull/1406 21 | # The speedup is not backed to 1.16 after the extra graph break issue is fixed in transformer upstream 22 | if speedup < 1.150: 23 | failed.append(model_name) 24 | 25 | print(f"{model_name:34} {speedup}") 26 | 27 | if failed: 28 | print( 29 | textwrap.dedent( 30 | f""" 31 | Error {len(failed)} models performance regressed 32 | {' '.join(failed)} 33 | """ 34 | ) 35 | ) 36 | sys.exit(1) 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument("--file", "-f", type=str, help="csv file name") 42 | args = parser.parse_args() 43 | check_hf_bert_perf_csv(args.file) 44 | -------------------------------------------------------------------------------- /benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_dynamic_training.csv: -------------------------------------------------------------------------------- 1 | name,accuracy,graph_breaks 2 | AlbertForMaskedLM,pass,7 3 | AlbertForQuestionAnswering,pass,7 4 | BartForCausalLM,pass,7 5 | BertForMaskedLM,pass,7 6 | BertForQuestionAnswering,pass,7 7 | BlenderbotSmallForCausalLM,pass,7 8 | BlenderbotSmallForConditionalGeneration,pass,7 9 | CamemBert,pass,7 10 | DebertaForMaskedLM,pass,52 11 | DebertaForQuestionAnswering,pass,52 12 | DebertaV2ForMaskedLM,pass_due_to_skip,0 13 | DistilBertForMaskedLM,pass,7 14 | DistilBertForQuestionAnswering,pass,7 15 | DistillGPT2,pass,7 16 | ElectraForCausalLM,pass,7 17 | ElectraForQuestionAnswering,pass,7 18 | GPT2ForSequenceClassification,pass,9 19 | LayoutLMForMaskedLM,pass,7 20 | LayoutLMForSequenceClassification,pass,9 21 | MBartForCausalLM,pass,7 22 | MegatronBertForCausalLM,pass,7 23 | MegatronBertForQuestionAnswering,pass,7 24 | MobileBertForMaskedLM,pass,4 25 | MobileBertForQuestionAnswering,pass,4 26 | PLBartForCausalLM,pass,7 27 | PLBartForConditionalGeneration,pass,7 28 | PegasusForCausalLM,pass,7 29 | PegasusForConditionalGeneration,pass,4 30 | RobertaForCausalLM,pass,7 31 | RobertaForQuestionAnswering,pass,7 32 | Speech2Text2ForCausalLM,pass,7 33 | T5ForConditionalGeneration,pass,7 34 | T5Small,pass,7 35 | TrOCRForCausalLM,pass,7 36 | XLNetLMHeadModel,pass,7 37 | YituTechConvBert,pass,7 38 | -------------------------------------------------------------------------------- /benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv: -------------------------------------------------------------------------------- 1 | name,accuracy,graph_breaks 2 | AlbertForMaskedLM,pass,7 3 | AlbertForQuestionAnswering,pass,7 4 | BartForCausalLM,pass,7 5 | BertForMaskedLM,pass,7 6 | BertForQuestionAnswering,pass,7 7 | BlenderbotSmallForCausalLM,pass,7 8 | BlenderbotSmallForConditionalGeneration,pass,7 9 | CamemBert,pass,7 10 | DebertaForMaskedLM,pass,52 11 | DebertaForQuestionAnswering,pass,52 12 | DebertaV2ForMaskedLM,pass_due_to_skip,0 13 | DistilBertForMaskedLM,pass,7 14 | DistilBertForQuestionAnswering,pass,7 15 | DistillGPT2,pass,7 16 | ElectraForCausalLM,pass,7 17 | ElectraForQuestionAnswering,pass,7 18 | GPT2ForSequenceClassification,pass,9 19 | LayoutLMForMaskedLM,pass,7 20 | LayoutLMForSequenceClassification,pass,9 21 | MBartForCausalLM,pass,7 22 | MegatronBertForCausalLM,pass,7 23 
| MegatronBertForQuestionAnswering,pass,7 24 | MobileBertForMaskedLM,pass,4 25 | MobileBertForQuestionAnswering,pass,4 26 | PLBartForCausalLM,pass,7 27 | PLBartForConditionalGeneration,pass,7 28 | PegasusForCausalLM,pass,7 29 | PegasusForConditionalGeneration,pass,4 30 | RobertaForCausalLM,pass,7 31 | RobertaForQuestionAnswering,pass,7 32 | Speech2Text2ForCausalLM,pass,7 33 | T5ForConditionalGeneration,pass,7 34 | T5Small,pass,7 35 | TrOCRForCausalLM,pass,7 36 | XLNetLMHeadModel,pass,7 37 | YituTechConvBert,pass,7 38 | -------------------------------------------------------------------------------- /benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_dynamic_training.csv: -------------------------------------------------------------------------------- 1 | name,accuracy,graph_breaks 2 | BERT_pytorch,pass,11 3 | LearningToPaint,pass,9 4 | Super_SloMo,pass,9 5 | alexnet,pass,9 6 | attention_is_all_you_need_pytorch,pass,9 7 | dcgan,pass,9 8 | densenet121,pass,9 9 | drq,pass,8 10 | fastNLP_Bert,pass,14 11 | functorch_dp_cifar10,pass,9 12 | functorch_maml_omniglot,pass,9 13 | hf_Albert,pass,8 14 | hf_Bart,pass,8 15 | hf_Bert,pass,8 16 | hf_Bert_large,pass,8 17 | hf_DistilBert,pass,8 18 | hf_GPT2,pass,8 19 | hf_Reformer,pass,45 20 | hf_T5_large,pass_due_to_skip,0 21 | lennard_jones,pass,9 22 | maml_omniglot,pass,9 23 | mnasnet1_0,pass,9 24 | mobilenet_v2,pass,9 25 | nvidia_deeprecommender,pass,9 26 | phlippe_densenet,pass,9 27 | phlippe_resnet,pass,9 28 | pytorch_CycleGAN_and_pix2pix,pass,9 29 | pytorch_stargan,pass,9 30 | pytorch_unet,pass,9 31 | resnet152,pass,9 32 | resnet18,pass,9 33 | resnet50,pass,9 34 | resnext50_32x4d,pass,9 35 | shufflenet_v2_x1_0,pass,9 36 | soft_actor_critic,pass,8 37 | speech_transformer,pass,19 38 | squeezenet1_1,pass,9 39 | timm_efficientnet,pass,9 40 | timm_regnet,pass,9 41 | timm_resnest,pass,9 42 | timm_vision_transformer,pass,9 43 | timm_vision_transformer_large,pass_due_to_skip,0 44 | timm_vovnet,pass,9 45 | tts_angular,pass,11 46 | vgg16,pass,9 47 | yolov3,pass,13 48 | -------------------------------------------------------------------------------- /benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv: -------------------------------------------------------------------------------- 1 | name,accuracy,graph_breaks 2 | BERT_pytorch,pass,11 3 | LearningToPaint,pass,9 4 | Super_SloMo,pass,9 5 | alexnet,pass,9 6 | attention_is_all_you_need_pytorch,pass,9 7 | dcgan,pass,9 8 | densenet121,pass,9 9 | drq,pass,8 10 | fastNLP_Bert,pass,14 11 | functorch_dp_cifar10,pass,9 12 | functorch_maml_omniglot,pass,9 13 | hf_Albert,pass,8 14 | hf_Bart,pass,8 15 | hf_Bert,pass,8 16 | hf_Bert_large,pass,8 17 | hf_DistilBert,pass,8 18 | hf_GPT2,pass,8 19 | hf_Reformer,pass,67 20 | hf_T5_large,pass_due_to_skip,0 21 | lennard_jones,pass,9 22 | maml_omniglot,pass,9 23 | mnasnet1_0,pass,9 24 | mobilenet_v2,pass,9 25 | nvidia_deeprecommender,pass,9 26 | phlippe_densenet,pass,9 27 | phlippe_resnet,pass,9 28 | pytorch_CycleGAN_and_pix2pix,pass,9 29 | pytorch_stargan,pass,9 30 | pytorch_unet,pass,9 31 | resnet152,pass,9 32 | resnet18,pass,9 33 | resnet50,pass,9 34 | resnext50_32x4d,pass,9 35 | shufflenet_v2_x1_0,pass,9 36 | soft_actor_critic,pass,8 37 | speech_transformer,pass,19 38 | squeezenet1_1,pass,9 39 | timm_efficientnet,pass,9 40 | timm_regnet,pass,9 41 | timm_resnest,pass,9 42 | timm_vision_transformer,pass,9 43 | timm_vision_transformer_large,pass_due_to_skip,0 44 | timm_vovnet,pass,9 45 | tts_angular,pass,11 46 | vgg16,pass,9 47 | yolov3,pass,13 48 | 
-------------------------------------------------------------------------------- /benchmarks/dynamo/microbenchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/dynamo/microbenchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/dynamo/microbenchmarks/benchmark_helper.py: -------------------------------------------------------------------------------- 1 | from torch.utils.benchmark import Timer 2 | 3 | 4 | def time_with_torch_timer(fn, args, kwargs=None, iters=100): 5 | kwargs = kwargs or {} 6 | env = {"args": args, "kwargs": kwargs, "fn": fn} 7 | fn_call = "fn(*args, **kwargs)" 8 | 9 | # Measure end-to-end time 10 | timer = Timer(stmt=f"{fn_call}", globals=env) 11 | tt = timer.timeit(iters) 12 | 13 | return tt 14 | -------------------------------------------------------------------------------- /benchmarks/dynamo/microbenchmarks/model.py: -------------------------------------------------------------------------------- 1 | # resnet50 layer shape 2 | resnet50_layers = ( 3 | # IN_H, IN_W, IN_C, KERNEL_H, KERNEL_W, KERNEL_N, stride, padding 4 | (224, 224, 3, 7, 7, 64, (2, 2), (0, 0)), 5 | # conv2_x 6 | (56, 56, 64, 1, 1, 64, (1, 1), (0, 0)), 7 | (56, 56, 64, 3, 3, 64, (1, 1), (0, 0)), 8 | (56, 56, 64, 1, 1, 256, (1, 1), (0, 0)), 9 | # conv3_x 10 | (56, 56, 256, 1, 1, 128, (2, 2), (0, 0)), 11 | (28, 28, 128, 3, 3, 128, (1, 1), (0, 0)), 12 | (28, 28, 128, 1, 1, 512, (1, 1), (0, 0)), 13 | # conv4_x 14 | (28, 28, 512, 1, 1, 256, (2, 2), (0, 0)), 15 | (14, 14, 256, 3, 3, 256, (1, 1), (0, 0)), 16 | (14, 14, 256, 1, 1, 1024, (1, 1), (0, 0)), 17 | # conv5_x 18 | (14, 14, 1024, 1, 1, 512, (2, 2), (0, 0)), 19 | (7, 7, 512, 3, 3, 512, (1, 1), (0, 0)), 20 | (7, 7, 512, 1, 1, 2048, (1, 1), (0, 0)), 21 | ) 22 | 23 | alexnet_layers = ( 24 | # IN_H, IN_W, IN_C, KERNEL_H, KERNEL_W, KERNEL_N, stride, padding 25 | (224, 224, 3, 11, 11, 64, (4, 4), (2, 2)), 26 | ) 27 | -------------------------------------------------------------------------------- /benchmarks/dynamo/microbenchmarks/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | 6 | def rounded_linspace(low, high, steps, div): 7 | ret = torch.linspace(low, high, steps) 8 | ret = (ret.int() + div - 1) // div * div 9 | ret = torch.unique(ret) 10 | return list(map(int, ret)) 11 | 12 | 13 | def powspace(start, stop, pow, step): 14 | start = math.log(start, pow) 15 | stop = math.log(stop, pow) 16 | steps = int((stop - start + 1) // step) 17 | ret = torch.pow(pow, torch.linspace(start, stop, steps)) 18 | ret = torch.unique(ret) 19 | return list(map(int, ret)) 20 | -------------------------------------------------------------------------------- /benchmarks/dynamo/run_delta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | # Some QoL for people running this script on Meta servers 6 | if getent hosts fwdproxy; then 7 | export https_proxy=http://fwdproxy:8080 http_proxy=http://fwdproxy:8080 no_proxy=.fbcdn.net,.facebook.com,.thefacebook.com,.tfbnw.net,.fb.com,.fburl.com,.facebook.net,.sb.fbsbx.com,localhost 8 | fi 9 | 10 | WORK="$PWD" 11 | 12 | cd "$(dirname "$BASH_SOURCE")"/../.. 
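# This script lives in benchmarks/dynamo, so ../.. is the repo root (captured as ROOT below).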
13 | 14 | ROOT="$PWD" 15 | 16 | mkdir -p "$WORK/sweep/static" 17 | mkdir -p "$WORK/sweep/dynamic" 18 | 19 | (cd "$WORK/sweep/static" && "$ROOT/benchmarks/dynamo/run_all.sh" "$@") 20 | (cd "$WORK/sweep/dynamic" && "$ROOT/benchmarks/dynamo/run_all.sh" "$@" --dynamic-shapes) 21 | python benchmarks/dynamo/combine_csv.py "$WORK/sweep/static/final.csv" "$WORK/sweep/dynamic/final.csv" > "$WORK/delta.csv" 22 | gh gist create "$WORK/delta.csv" 23 | -------------------------------------------------------------------------------- /benchmarks/dynamo/timm_models_list_cpu.txt: -------------------------------------------------------------------------------- 1 | adv_inception_v3,128 2 | beit_base_patch16_224,64 3 | botnet26t_256,128 4 | cait_m36_384,4 5 | coat_lite_mini,32 6 | convit_base,64 7 | convmixer_768_32,2 8 | convnext_base,64 9 | crossvit_9_240,32 10 | cspdarknet53,64 11 | deit_base_distilled_patch16_224,64 12 | dm_nfnet_f0,128 13 | dpn107,32 14 | eca_botnext26ts_256,128 15 | eca_halonext26ts,128 16 | ese_vovnet19b_dw,128 17 | fbnetc_100,32 18 | fbnetv3_b,32 19 | gernet_l,128 20 | ghostnet_100,128 21 | gluon_inception_v3,128 22 | gluon_xception65,32 23 | gmixer_24_224,16 24 | gmlp_s16_224,128 25 | hrnet_w18,128 26 | inception_v3,128 27 | jx_nest_base,32 28 | lcnet_050,64 29 | mixer_b16_224,128 30 | mixnet_l,128 31 | mnasnet_100,32 32 | mobilenetv2_100,32 33 | mobilenetv3_large_100,32 34 | mobilevit_s,256 35 | nfnet_l0,128 36 | pit_b_224,64 37 | pnasnet5large,16 38 | poolformer_m36,64 39 | regnety_002,128 40 | repvgg_a2,128 41 | res2net101_26w_4s,64 42 | res2net50_14w_8s,128 43 | res2next50,128 44 | resmlp_12_224,128 45 | resnest101e,64 46 | rexnet_100,128 47 | sebotnet33ts_256,64 48 | selecsls42b,128 49 | spnasnet_100,32 50 | swin_base_patch4_window7_224,64 51 | swsl_resnext101_32x16d,32 52 | tf_efficientnet_b0,128 53 | tf_mixnet_l,32 54 | tinynet_a,128 55 | tnt_s_patch16_224,32 56 | twins_pcpvt_base,64 57 | visformer_small,128 58 | vit_base_patch16_224,64 59 | volo_d1_224,64 60 | xcit_large_24_p8_224,5 61 | -------------------------------------------------------------------------------- /benchmarks/dynamo/torchbench_models_list.txt: -------------------------------------------------------------------------------- 1 | BERT_pytorch,128 2 | Background_Matting, 16 3 | LearningToPaint,1024 4 | alexnet,1024 5 | dcgan,1024 6 | densenet121,64 7 | hf_Albert,32 8 | hf_Bart,16 9 | hf_Bert,16 10 | hf_GPT2,16 11 | hf_T5,4 12 | mnasnet1_0,256 13 | mobilenet_v2,128 14 | mobilenet_v3_large,256 15 | nvidia_deeprecommender,1024 16 | pytorch_unet,8 17 | resnet18,512 18 | resnet50,128 19 | resnext50_32x4d,128 20 | shufflenet_v2_x1_0,512 21 | squeezenet1_1,512 22 | timm_nfnet,256 23 | timm_efficientnet,128 24 | timm_regnet,128 25 | timm_resnest,256 26 | timm_vision_transformer,256 27 | timm_vovnet,128 28 | vgg16,128 29 | -------------------------------------------------------------------------------- /benchmarks/dynamo/torchbench_models_list_cpu.txt: -------------------------------------------------------------------------------- 1 | alexnet,128 2 | attention_is_all_you_need_pytorch,64 3 | BERT_pytorch,32 4 | dcgan,256 5 | densenet121,512 6 | dlrm,2048 7 | fastNLP_Bert,8 8 | functorch_dp_cifar10,1024 9 | hf_Albert,8 10 | hf_Bart,8 11 | hf_Bert,8 12 | hf_Bert_large,8 13 | hf_DistilBert,8 14 | hf_GPT2,8 15 | hf_GPT2_large,1 16 | hf_Longformer,4 17 | hf_Reformer,8 18 | hf_T5,4 19 | hf_T5_base,1 20 | hf_T5_large,1 21 | LearningToPaint,96 22 | lennard_jones,1024 23 | mnasnet1_0,32 24 | mobilenet_v2,16 25 | 
mobilenet_v3_large,32 26 | nvidia_deeprecommender,256 27 | phlippe_densenet,128 28 | phlippe_resnet,512 29 | pytorch_unet,4 30 | resnet152,32 31 | resnet18,256 32 | resnet50,256 33 | resnext50_32x4d,256 34 | shufflenet_v2_x1_0,64 35 | speech_transformer,1024 36 | squeezenet1_1,16 37 | Super_SloMo,1024 38 | timm_efficientnet,64 39 | timm_nfnet,128 40 | timm_regnet,32 41 | timm_resnest,32 42 | timm_vision_transformer,16 43 | timm_vision_transformer_large,8 44 | timm_vovnet,32 45 | tts_angular,1024 46 | vgg16,64 47 | vision_maskrcnn,1 48 | yolov3,32 49 | -------------------------------------------------------------------------------- /benchmarks/fastrnns/README.md: -------------------------------------------------------------------------------- 1 | # Fast RNN benchmarks 2 | 3 | Benchmarks for TorchScript models 4 | 5 | For most stable results, do the following: 6 | - Set CPU Governor to performance mode (as opposed to energy save) 7 | - Turn off turbo for all CPUs (assuming Intel CPUs) 8 | - Shield cpus via `cset shield` when running benchmarks. 9 | 10 | Some of these scripts accept command line args but most of them do not because 11 | I was lazy. They will probably be added sometime in the future, but the default 12 | sizes are pretty reasonable. 13 | 14 | ## Test fastrnns (fwd + bwd) correctness 15 | 16 | Test the fastrnns benchmarking scripts with the following: 17 | `python -m fastrnns.test` 18 | or run the test independently: 19 | `python -m fastrnns.test --rnns jit` 20 | 21 | ## Run benchmarks 22 | 23 | `python -m fastrnns.bench` 24 | 25 | should give a good comparison, or you can specify the type of model to run 26 | 27 | `python -m fastrnns.bench --rnns cudnn aten jit --group rnns` 28 | 29 | ## Run model profiling, calls nvprof 30 | 31 | `python -m fastrnns.profile` 32 | 33 | should generate nvprof file for all models somewhere. 34 | you can also specify the models to generate nvprof files separately: 35 | 36 | `python -m fastrnns.profile --rnns aten jit` 37 | 38 | ### Caveats 39 | 40 | Use Linux for the most accurate timing. A lot of these tests only run 41 | on CUDA. 
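As a concrete example of the stability checklist at the top of this README, the three steps might look like this on an Intel Linux box (core ids are machine-specific, and `cset` flag syntax varies slightly between versions, so treat this as a sketch):

```
# Pin all cores to the performance governor (requires cpupower).
sudo cpupower frequency-set -g performance

# Disable turbo when the intel_pstate driver is in use.
echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo

# Reserve cores 2-5, move kernel threads off them, then benchmark inside the shield.
sudo cset shield --cpu 2-5 --kthread=on
sudo cset shield --exec -- python -m fastrnns.bench
```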
42 | -------------------------------------------------------------------------------- /benchmarks/fastrnns/__init__.py: -------------------------------------------------------------------------------- 1 | from .cells import * # noqa: F403 2 | from .factory import * # noqa: F403 3 | 4 | # (output, next_state) = cell(input, state) 5 | seqLength = 100 6 | numLayers = 2 7 | inputSize = 512 8 | hiddenSize = 512 9 | miniBatch = 64 10 | -------------------------------------------------------------------------------- /benchmarks/fastrnns/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest # noqa: F401 2 | 3 | default_rnns = ['cudnn', 'aten', 'jit', 'jit_premul', 'jit_premul_bias', 'jit_simple', 4 | 'jit_multilayer', 'py'] 5 | default_cnns = ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit'] 6 | all_nets = default_rnns + default_cnns 7 | 8 | def pytest_generate_tests(metafunc): 9 | # This creates lists of tests to generate, can be customized 10 | if metafunc.cls.__name__ == "TestBenchNetwork": 11 | metafunc.parametrize('net_name', all_nets, scope="class") 12 | metafunc.parametrize("executor", [metafunc.config.getoption("executor")], scope="class") 13 | metafunc.parametrize("fuser", [metafunc.config.getoption("fuser")], scope="class") 14 | 15 | def pytest_addoption(parser): 16 | parser.addoption("--fuser", default="old", help="fuser to use for benchmarks") 17 | parser.addoption("--executor", default="legacy", help="executor to use for benchmarks") 18 | -------------------------------------------------------------------------------- /benchmarks/fastrnns/scratch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | @torch.jit.script 5 | def fn(x, scale, shift): 6 | return scale * x / shift 7 | 8 | 9 | @torch.jit.script 10 | def recurrent(x, scale, shift): 11 | y = x 12 | for i in range(100): 13 | y = fn(y, scale, shift) 14 | return y 15 | 16 | 17 | x = torch.randn(2, 2, device='cuda') 18 | scale = torch.randn(2, 2, device='cuda', requires_grad=True) 19 | shift = torch.randn(2, 2, device='cuda', requires_grad=True) 20 | inputs = [x, scale, shift] 21 | 22 | 23 | out = recurrent(x, scale, shift) 24 | recurrent.graph_for(x, scale, shift) 25 | 26 | 27 | import torch 28 | 29 | 30 | @torch.jit.script 31 | def recurrent_scaleshift(x, scale, shift): 32 | y = x 33 | for i in range(64): 34 | y = scale * y + shift 35 | return y 36 | 37 | 38 | x = torch.randn(2, 2, device='cuda') 39 | scale = torch.randn(2, 2, device='cuda', requires_grad=True) 40 | shift = torch.randn(2, 2, device='cuda', requires_grad=True) 41 | inputs = [x, scale, shift] 42 | out = recurrent_scaleshift(x, scale, shift) 43 | recurrent_scaleshift.graph_for(x, scale, shift) 44 | 45 | 46 | import torch 47 | x = torch.tensor([]) 48 | x.requires_grad = True 49 | x.mean().backward() # no error triggered 50 | x = x.cuda() 51 | x.mean().backward() 52 | -------------------------------------------------------------------------------- /benchmarks/framework_overhead_benchmark/SimpleAddModule.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils import NUM_LOOP_ITERS 3 | 4 | def add_tensors_loop(x, y): 5 | z = torch.add(x, y) 6 | for i in range(NUM_LOOP_ITERS): 7 | z = torch.add(z, x) 8 | return z 9 | 10 | class SimpleAddModule(torch.nn.Module): 11 | def __init__(self, add_op): 12 | super().__init__() 13 | self.add_op = add_op 14 | 15 | def forward(self, x, y): 16 | return 
self.add_op(x, y) 17 | -------------------------------------------------------------------------------- /benchmarks/framework_overhead_benchmark/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import namedtuple 3 | from torch.utils import ThroughputBenchmark 4 | 5 | NUM_LOOP_ITERS = 1000 6 | BenchmarkConfig = namedtuple('BenchmarkConfig', 'num_warmup_iters num_iters') 7 | ModuleConfig = namedtuple('ModuleConfig', 'pt_fn c2_op num_params graph_mode') 8 | 9 | def ms_to_us(time_ms): 10 | return (time_ms * 1e3) 11 | 12 | def secs_to_us(time_s): 13 | return (time_s * 1e6) 14 | 15 | def secs_to_ms(time_s): 16 | return (time_s * 1e3) 17 | 18 | def benchmark_using_throughput_benchmark(config, module): 19 | print("Benchmarking via ThroughputBenchmark") 20 | bench = ThroughputBenchmark(module.module) 21 | bench.add_input(*module.tensor_inputs) 22 | stats = bench.benchmark(1, config.num_warmup_iters, config.num_iters) 23 | return stats.latency_avg_ms / NUM_LOOP_ITERS 24 | 25 | def benchmark_module(config, module, use_throughput_benchmark=False): 26 | if use_throughput_benchmark: 27 | return benchmark_using_throughput_benchmark(config, module) 28 | module.forward(config.num_warmup_iters) 29 | print("Running module for {} iterations".format(config.num_iters)) 30 | start = time.time() 31 | module.forward(config.num_iters) 32 | end = time.time() 33 | time_elapsed_s = (end - start) 34 | return (secs_to_ms(time_elapsed_s) / config.num_iters / NUM_LOOP_ITERS) 35 | -------------------------------------------------------------------------------- /benchmarks/fuser/plot_speedups.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | 3 | df = pandas.read_csv("perf.csv") 4 | 5 | ops = pandas.unique(df["operator"]) 6 | nops = len(ops) 7 | pivot_op_shape = df.pivot_table( 8 | values="time", index=["operator", "shape"], columns=["fuser"] 9 | ) 10 | pivot_speedups = (pivot_op_shape.T / pivot_op_shape["eager"]).T 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | plt.rcParams["figure.figsize"] = (20, 100) 15 | fig, axs = plt.subplots(nops) 16 | plt.subplots_adjust(hspace=0.5) 17 | for idx, op in enumerate(ops): 18 | op_speedups = pivot_speedups.T[op].T 19 | op_speedups.plot(ax=axs[idx], kind="bar", ylim=(0, 2), rot=45) 20 | axs[idx].set_title(op) 21 | axs[idx].set_xlabel("") 22 | plt.savefig("perf.png") 23 | -------------------------------------------------------------------------------- /benchmarks/instruction_counts/applications/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/applications/__init__.py -------------------------------------------------------------------------------- /benchmarks/instruction_counts/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/core/__init__.py -------------------------------------------------------------------------------- /benchmarks/instruction_counts/definitions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/definitions/__init__.py 
-------------------------------------------------------------------------------- /benchmarks/instruction_counts/execution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/execution/__init__.py -------------------------------------------------------------------------------- /benchmarks/instruction_counts/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/worker/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/benchmark_all_other_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | from pt import ( # noqa: F401 3 | add_test, as_strided_test, batchnorm_test, binary_test, cat_test, 4 | channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test, 5 | fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test, 6 | softmax_test, hardsigmoid_test, hardswish_test, layernorm_test, 7 | groupnorm_test, interpolate_test, instancenorm_test, remainder_test, 8 | split_test, sum_test, tensor_to_test 9 | ) 10 | from pt import ( # noqa: F401 11 | ao_sparsifier_test 12 | ) 13 | 14 | if __name__ == "__main__": 15 | op_bench.benchmark_runner.main() 16 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/benchmark_all_quantized_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | from pt import ( # noqa: F401 3 | qactivation_test, 4 | qarithmetic_test, 5 | qbatchnorm_test, 6 | qcat_test, 7 | qcomparators_test, 8 | qconv_test, 9 | qgroupnorm_test, 10 | qinstancenorm_test, 11 | qinterpolate_test, 12 | qlayernorm_test, 13 | qlinear_test, 14 | qobserver_test, 15 | qpool_test, 16 | qrnn_test, 17 | qtensor_method_test, 18 | quantization_test, 19 | qunary_test, 20 | qembedding_pack_test, 21 | qembeddingbag_test, 22 | qatembedding_ops_test, 23 | ) 24 | 25 | 26 | if __name__ == "__main__": 27 | op_bench.benchmark_runner.main() 28 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/benchmark_all_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | from pt import ( # noqa: F401 3 | unary_test, 4 | ) 5 | import benchmark_all_other_test # noqa: F401 6 | import benchmark_all_quantized_test # noqa: F401 7 | 8 | if __name__ == "__main__": 9 | op_bench.benchmark_runner.main() 10 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/c2/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/c2/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/c2/replace_nan_test.py: -------------------------------------------------------------------------------- 1 | import benchmark_caffe2 as op_bench_c2 2 | import operator_benchmark as op_bench 3 | from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401 4 | from caffe2.python import core 5 | 6 | 7 | """Microbenchmarks for element-wise ReplaceNaN operator.""" 8 | 9 | # Configs for C2 ReplaceNaN operator 10 | replace_nan_long_configs = op_bench.cross_product_configs( 11 | M=[32, 64, 128], N=range(32, 128, 32), dtype=["float", "double"], tags=["long"] 12 | ) 13 | 14 | 15 | replace_nan_short_configs = op_bench.config_list( 16 | attrs=[ 17 | [16, 16, "float"], 18 | [16, 16, "double"], 19 | [64, 64, "float"], 20 | [64, 64, "double"], 21 | ], 22 | attr_names=["M", "N", "dtype"], 23 | tags=["short"], 24 | ) 25 | 26 | 27 | class ReplaceNaNBenchmark(op_bench_c2.Caffe2BenchmarkBase): 28 | def init(self, M, N, dtype): 29 | self.input = self.tensor([M, N], dtype) 30 | self.set_module_name("replace_nan") 31 | 32 | def forward(self): 33 | op = core.CreateOperator("ReplaceNaN", self.input, self.input, value=1.0) 34 | return op 35 | 36 | 37 | op_bench_c2.generate_c2_test( 38 | replace_nan_long_configs + replace_nan_short_configs, ReplaceNaNBenchmark 39 | ) 40 | 41 | 42 | if __name__ == "__main__": 43 | op_bench.benchmark_runner.main() 44 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/common/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/add_ops_list_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | # Configs for pointwise unary ops 6 | unary_ops_configs = op_bench.config_list( 7 | attrs=[ 8 | [128, 128], 9 | ], 10 | attr_names=["M", "N"], 11 | tags=["short"] 12 | ) 13 | 14 | 15 | unary_ops_list = op_bench.op_list( 16 | attr_names=["op_name", "op_func"], 17 | attrs=[ 18 | ["abs", torch.abs], 19 | ["acos", torch.acos], 20 | ], 21 | ) 22 | 23 | 24 | class UnaryOpBenchmark(op_bench.TorchBenchmarkBase): 25 | def init(self, M, N, op_func): 26 | self.input_one = torch.rand(M, N) 27 | self.op_func = op_func 28 | 29 | def forward(self): 30 | return self.op_func(self.input_one) 31 | 32 | 33 | op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark) 34 | 35 | 36 | if __name__ == "__main__": 37 | op_bench.benchmark_runner.main() 38 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/jit_forward_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | intraop_bench_configs = op_bench.config_list( 5 | attrs=[ 6 | [8, 16], 7 | ], 8 | attr_names=["M", "N"], 9 | tags=["short"], 10 | ) 11 | 12 | @torch.jit.script 13 | def torch_sumall(a, iterations): 14 | # type: (Tensor, int) 15 | 
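    # Each pass sums all of a, then nudges a[0][0] so no two iterations see identical data.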
result = 0.0 16 | for _ in range(iterations): 17 | result += float(torch.sum(a)) 18 | a[0][0] += 0.01 19 | return result 20 | 21 | 22 | class TorchSumBenchmark(op_bench.TorchBenchmarkBase): 23 | def init(self, M, N): 24 | self.input_one = torch.rand(M, N) 25 | self.set_module_name("sum") 26 | 27 | # This is a very temporary method and will be removed soon, so 28 | # don't use this method in your benchmark 29 | # TODO(mingzhe): use one forward method for both JIT and Eager 30 | def jit_forward(self, iters): 31 | return torch_sumall(self.input_one, iters) 32 | 33 | op_bench.generate_pt_test(intraop_bench_configs, TorchSumBenchmark) 34 | 35 | 36 | if __name__ == "__main__": 37 | op_bench.benchmark_runner.main() 38 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/pt_backward_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | add_configs = op_bench.cross_product_configs( 6 | M=[8, 1], 7 | N=[8, 2], 8 | K=[8, 4], 9 | tags=["short"] 10 | ) 11 | 12 | # This benchmark uses the auto_set to automatically set requires_grad 13 | # for both inputs. The test name can also be used for filtering. 14 | class AddBenchmark(op_bench.TorchBenchmarkBase): 15 | def init(self, M, N, K): 16 | self.input_one = torch.rand(M, N, K, requires_grad=self.auto_set()) 17 | self.input_two = torch.rand(M, N, K, requires_grad=self.auto_set()) 18 | self.set_module_name("add") 19 | 20 | def forward(self): 21 | return torch.add(self.input_one, self.input_two) 22 | 23 | 24 | op_bench.generate_pt_test(add_configs, AddBenchmark) 25 | op_bench.generate_pt_gradient_test(add_configs, AddBenchmark) 26 | 27 | 28 | if __name__ == "__main__": 29 | op_bench.benchmark_runner.main() 30 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/pt_configs_list_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | """Microbenchmarks for element-wise Add operator. 
Supports both Caffe2/PyTorch.""" 5 | 6 | add_short_configs = op_bench.config_list( 7 | attr_names=['M', 'N', 'K'], 8 | attrs=[ 9 | [8, 16, 32], 10 | [16, 16, 64], 11 | [64, 64, 128], 12 | ], 13 | cross_product_configs={ 14 | 'device': ['cpu', 'cuda'], 15 | 'dtype': [torch.float, torch.float64], 16 | }, 17 | tags=['short'], 18 | ) 19 | 20 | 21 | class AddBenchmark(op_bench.TorchBenchmarkBase): 22 | def init(self, M, N, K, device, dtype): 23 | self.input_one = torch.rand(M, N, K, device=device, dtype=dtype, requires_grad=True) 24 | self.input_two = torch.rand(M, N, K, device=device, dtype=dtype) 25 | self.set_module_name('add') 26 | 27 | def forward(self): 28 | return torch.add(self.input_one, self.input_two) 29 | 30 | 31 | op_bench.generate_pt_test(add_short_configs, AddBenchmark) 32 | 33 | 34 | if __name__ == "__main__": 35 | op_bench.benchmark_runner.main() 36 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/pt_cpu_gpu_forward_backward_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | add_configs = op_bench.cross_product_configs( 6 | M=[8], 7 | N=[8], 8 | K=[8], 9 | device=["cuda", "cpu"], 10 | tags=["short"] 11 | ) 12 | 13 | 14 | class AddBenchmark(op_bench.TorchBenchmarkBase): 15 | def init(self, M, N, K, device): 16 | self.input_one = torch.rand(M, N, K, device=device, requires_grad=True) 17 | self.input_two = torch.rand(M, N, K, device=device, requires_grad=True) 18 | self.set_module_name("add") 19 | 20 | def forward(self): 21 | return torch.add(self.input_one, self.input_two) 22 | 23 | 24 | op_bench.generate_pt_test(add_configs, AddBenchmark) 25 | op_bench.generate_pt_gradient_test(add_configs, AddBenchmark) 26 | 27 | 28 | if __name__ == "__main__": 29 | op_bench.benchmark_runner.main() 30 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/common/tests/random_sample_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | configs = op_bench.random_sample_configs( 6 | M=[1, 2, 3, 4, 5, 6], 7 | N=[7, 8, 9, 10, 11, 12], 8 | K=[13, 14, 15, 16, 17, 18], 9 | # probs saves the weights of each value 10 | probs=op_bench.attr_probs( 11 | M=[0.5, 0.2, 0.1, 0.05, 0.03, 0.1], 12 | N=[0.1, 0.3, 0.4, 0.02, 0.03, 0.04], 13 | K=[0.03, 0.6, 0.04, 0.02, 0.03, 0.01], 14 | ), 15 | # this is the number of returned inputs 16 | total_samples=10, 17 | tags=["short"], 18 | ) 19 | 20 | 21 | class AddBenchmark(op_bench.TorchBenchmarkBase): 22 | def init(self, M, N, K): 23 | self.input_one = torch.rand(M, N, K) 24 | self.input_two = torch.rand(M, N, K) 25 | self.set_module_name("add") 26 | 27 | def forward(self): 28 | return torch.add(self.input_one, self.input_two) 29 | 30 | 31 | op_bench.generate_pt_test(configs, AddBenchmark) 32 | 33 | 34 | if __name__ == "__main__": 35 | op_bench.benchmark_runner.main() 36 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/operator_benchmark.py: -------------------------------------------------------------------------------- 1 | # TODO (mingzhe09088): get rid of noqa 2 | import benchmark_runner # noqa: F401 3 | from benchmark_pytorch import TorchBenchmarkBase # noqa: F401 4 | from benchmark_test_generator import * # noqa: F401,F403 5 | from benchmark_utils import 
* # noqa: F401,F403 6 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/pt/__init__.py -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/bmm_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | """Microbenchmarks for the bmm and matmul operators.""" 5 | 6 | class BmmBenchmark(op_bench.TorchBenchmarkBase): 7 | def init(self, B, M, N, K, device, op): 8 | self.inputs = { 9 | "batch1": torch.rand((B, M, K), device=device, requires_grad=self.auto_set()), 10 | "batch2": torch.rand((B, K, N), device=device, requires_grad=self.auto_set()) 11 | } 12 | self.set_module_name(f"bmm (actual op={op})") 13 | self.op = torch.bmm if op == "bmm" else torch.matmul 14 | 15 | def forward(self, batch1, batch2): 16 | return self.op(batch1, batch2) 17 | 18 | bmm_configs = op_bench.cross_product_configs( 19 | B=[2, 100], 20 | M=[8, 256], 21 | N=[256, 16], 22 | K=[16, 32], 23 | device=['cpu'], 24 | tags=["short"], 25 | op=["bmm", "matmul"], 26 | ) 27 | 28 | op_bench.generate_pt_test(bmm_configs, BmmBenchmark) 29 | 30 | if __name__ == "__main__": 31 | op_bench.benchmark_runner.main() 32 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/chunk_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | """Microbenchmarks for Chunk operator""" 6 | 7 | 8 | # Configs for PT Chunk operator 9 | chunk_short_configs = op_bench.config_list( 10 | attr_names=["M", "N", "chunks"], 11 | attrs=[ 12 | [8, 8, 2], 13 | [256, 512, 2], 14 | [512, 512, 2], 15 | ], 16 | cross_product_configs={ 17 | 'device': ['cpu', 'cuda'], 18 | }, 19 | tags=["short"], 20 | ) 21 | 22 | chunks_long_configs = op_bench.cross_product_configs( 23 | M=[128, 1024], 24 | N=[128, 1024], 25 | chunks=[2, 4], 26 | device=['cpu', 'cuda'], 27 | tags=['long'] 28 | ) 29 | 30 | 31 | class ChunkBenchmark(op_bench.TorchBenchmarkBase): 32 | def init(self, M, N, chunks, device): 33 | self.inputs = { 34 | "input_one": torch.rand(M, N, device=device), 35 | "chunks": chunks 36 | } 37 | self.set_module_name("chunk") 38 | 39 | def forward(self, input_one, chunks: int): 40 | return torch.chunk(input_one, chunks) 41 | 42 | 43 | op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs, 44 | ChunkBenchmark) 45 | 46 | 47 | if __name__ == "__main__": 48 | op_bench.benchmark_runner.main() 49 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/diag_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | """Microbenchmarks for diag operator""" 6 | 7 | 8 | # Configs for PT diag operator 9 | diag_configs_short = op_bench.config_list( 10 | attr_names=['dim', 'M', 'N', 'diagonal', 'out'], 11 | attrs=[ 12 | [1, 64, 64, 0, True], 13 | [2, 128, 128, -10, False], 14 | [1, 256, 256, 20, True], 15 | ], 16 | cross_product_configs={ 17 | 'device': ['cpu', 'cuda'], 18 | }, 19 | tags=['short'], 20 |
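# Note: each row of `attrs` above is combined with every entry of
# `cross_product_configs`, so these three (dim, M, N, diagonal, out) rows
# expand to six benchmark cases (three rows x two devices).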
) 21 | 22 | 23 | class DiagBenchmark(op_bench.TorchBenchmarkBase): 24 | def init(self, dim, M, N, diagonal, out, device): 25 | self.inputs = { 26 | "input": torch.rand(M, N, device=device) if dim == 2 else torch.rand(M, device=device), 27 | "diagonal": diagonal, 28 | "out": out, 29 | "out_tensor": torch.tensor((),) 30 | } 31 | self.set_module_name('diag') 32 | 33 | def forward(self, input, diagonal: int, out: bool, out_tensor): 34 | if out: 35 | return torch.diag(input, diagonal=diagonal, out=out_tensor) 36 | else: 37 | return torch.diag(input, diagonal=diagonal) 38 | 39 | 40 | op_bench.generate_pt_test(diag_configs_short, DiagBenchmark) 41 | 42 | 43 | if __name__ == "__main__": 44 | op_bench.benchmark_runner.main() 45 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/fill_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | from torch.testing._internal.common_device_type import get_all_device_types 5 | 6 | """Microbenchmark for Fill_ operator.""" 7 | 8 | fill_short_configs = op_bench.config_list( 9 | attr_names=["N"], 10 | attrs=[ 11 | [1], 12 | [1024], 13 | [2048], 14 | ], 15 | cross_product_configs={ 16 | 'device': ['cpu', 'cuda'], 17 | 'dtype': [torch.int32], 18 | }, 19 | tags=["short"], 20 | ) 21 | 22 | fill_long_configs = op_bench.cross_product_configs( 23 | N=[10, 1000], 24 | device=get_all_device_types(), 25 | dtype=[torch.bool, torch.int8, torch.uint8, torch.int16, torch.int32, 26 | torch.int64, torch.half, torch.float, torch.double], 27 | tags=["long"] 28 | ) 29 | 30 | 31 | class Fill_Benchmark(op_bench.TorchBenchmarkBase): 32 | def init(self, N, device, dtype): 33 | self.inputs = { 34 | "input_one": torch.zeros(N, device=device).type(dtype) 35 | } 36 | self.set_module_name("fill_") 37 | 38 | def forward(self, input_one): 39 | return input_one.fill_(10) 40 | 41 | 42 | op_bench.generate_pt_test(fill_short_configs + fill_long_configs, 43 | Fill_Benchmark) 44 | 45 | 46 | if __name__ == "__main__": 47 | op_bench.benchmark_runner.main() 48 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/gelu_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | 5 | 6 | """ 7 | Microbenchmarks for the gelu operators. 
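The file can also be run on its own to execute only these configs, e.g.
`python -m pt.gelu_test` from benchmarks/operator_benchmark (shown as a
typical invocation; the accepted flags come from benchmark_runner).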
8 | """ 9 | 10 | gelu_configs_long = op_bench.cross_product_configs( 11 | N=[1, 4], 12 | C=[3], 13 | H=[16, 256], 14 | W=[16, 256], 15 | device=['cpu'], 16 | tags=['long'] 17 | ) 18 | 19 | 20 | class GeluBenchmark(op_bench.TorchBenchmarkBase): 21 | def init(self, N, C, H, W, device): 22 | self.inputs = { 23 | "input": torch.rand(N, C, H, W, device=device) 24 | } 25 | 26 | def forward(self, input): 27 | return torch.nn.functional.gelu(input) 28 | 29 | 30 | op_bench.generate_pt_test(gelu_configs_long, GeluBenchmark) 31 | 32 | 33 | if __name__ == "__main__": 34 | op_bench.benchmark_runner.main() 35 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/groupnorm_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | """Microbenchmarks for groupnorm operator.""" 8 | 9 | groupnorm_configs_short = op_bench.cross_product_configs( 10 | dims=( 11 | (32, 8, 16), 12 | (32, 8, 56, 56), 13 | ), 14 | num_groups=(2, 4), 15 | tags=["short"], 16 | ) 17 | 18 | 19 | class GroupNormBenchmark(op_bench.TorchBenchmarkBase): 20 | def init(self, dims, num_groups): 21 | num_channels = dims[1] 22 | self.inputs = { 23 | "input": (torch.rand(*dims) - 0.5) * 256, 24 | "num_groups": num_groups, 25 | "weight": torch.rand(num_channels, dtype=torch.float), 26 | "bias": torch.rand(num_channels, dtype=torch.float), 27 | "eps": 1e-5 28 | } 29 | 30 | def forward(self, input, num_groups: int, weight, bias, eps: float): 31 | return F.group_norm( 32 | input, num_groups, weight=weight, bias=bias, eps=eps) 33 | 34 | 35 | op_bench.generate_pt_test(groupnorm_configs_short, GroupNormBenchmark) 36 | 37 | 38 | if __name__ == "__main__": 39 | op_bench.benchmark_runner.main() 40 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/instancenorm_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | """Microbenchmarks for instancenorm operator.""" 8 | 9 | instancenorm_configs_short = op_bench.cross_product_configs( 10 | dims=( 11 | (32, 8, 16), 12 | (32, 8, 56, 56), 13 | ), 14 | tags=["short"], 15 | ) 16 | 17 | 18 | class InstanceNormBenchmark(op_bench.TorchBenchmarkBase): 19 | def init(self, dims): 20 | num_channels = dims[1] 21 | self.inputs = { 22 | "input": (torch.rand(*dims) - 0.5) * 256, 23 | "weight": torch.rand(num_channels, dtype=torch.float), 24 | "bias": torch.rand(num_channels, dtype=torch.float), 25 | "eps": 1e-5 26 | } 27 | 28 | def forward(self, input, weight, bias, eps: float): 29 | return F.instance_norm( 30 | input, weight=weight, bias=bias, eps=eps) 31 | 32 | 33 | op_bench.generate_pt_test(instancenorm_configs_short, InstanceNormBenchmark) 34 | 35 | 36 | if __name__ == "__main__": 37 | op_bench.benchmark_runner.main() 38 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/layernorm_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | """Microbenchmarks for layernorm operator.""" 8 | 9 | layernorm_configs_short = op_bench.cross_product_configs( 10 | dims=( 11 | (1, 8, 16), 12 | (8, 8, 16), 13 | (32, 8, 
16), 14 | (64, 128, 56, 56), 15 | ), 16 | tags=["short"], 17 | ) 18 | 19 | 20 | class LayerNormBenchmark(op_bench.TorchBenchmarkBase): 21 | def init(self, dims): 22 | input = (torch.rand(*dims) - 0.5) * 256 23 | self.inputs = { 24 | "input": input, 25 | "weight": torch.rand(*input.size()[1:], dtype=torch.float), 26 | "bias": torch.rand(*input.size()[1:], dtype=torch.float), 27 | "eps": 1e-5 28 | } 29 | 30 | def forward(self, input, weight, bias, eps: float): 31 | return F.layer_norm( 32 | input, input.size()[1:], weight=weight, bias=bias, eps=eps) 33 | 34 | 35 | op_bench.generate_pt_test(layernorm_configs_short, LayerNormBenchmark) 36 | 37 | 38 | if __name__ == "__main__": 39 | op_bench.benchmark_runner.main() 40 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/linear_test.py: -------------------------------------------------------------------------------- 1 | 2 | import operator_benchmark as op_bench 3 | import torch 4 | import torch.nn as nn 5 | 6 | from pt import configs 7 | 8 | 9 | """Microbenchmarks for Linear operator.""" 10 | 11 | 12 | class LinearBenchmark(op_bench.TorchBenchmarkBase): 13 | def init(self, N, IN, OUT, device): 14 | self.inputs = { 15 | "input_one": torch.rand(N, IN, device=device) 16 | } 17 | self.linear = nn.Linear(IN, OUT).to(device=device) 18 | self.set_module_name("linear") 19 | 20 | def forward(self, input_one): 21 | return self.linear(input_one) 22 | 23 | 24 | op_bench.generate_pt_test(configs.linear_configs_short + configs.linear_configs_long, 25 | LinearBenchmark) 26 | 27 | 28 | if __name__ == "__main__": 29 | op_bench.benchmark_runner.main() 30 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt/split_test.py: -------------------------------------------------------------------------------- 1 | import operator_benchmark as op_bench 2 | import torch 3 | 4 | 5 | """Microbenchmarks for Split operator""" 6 | 7 | 8 | # Configs for PT Split operator 9 | split_configs_short = op_bench.config_list( 10 | attr_names=["M", "N", "parts"], 11 | attrs=[ 12 | [8, 8, 2], 13 | [256, 512, 2], 14 | [512, 512, 2], 15 | ], 16 | cross_product_configs={ 17 | 'device': ['cpu', 'cuda'], 18 | }, 19 | tags=["short"], 20 | ) 21 | 22 | split_configs_long = op_bench.cross_product_configs( 23 | M=[128, 1024], 24 | N=[128, 1024], 25 | parts=[2, 4], 26 | device=['cpu', 'cuda'], 27 | tags=['long'] 28 | ) 29 | 30 | 31 | class SplitBenchmark(op_bench.TorchBenchmarkBase): 32 | def init(self, M, N, parts, device): 33 | self.inputs = { 34 | "input": torch.rand(M, N, device=device), 35 | "split_size": int(M * N / parts) 36 | } 37 | self.set_module_name('split') 38 | 39 | def forward(self, input, split_size: int): 40 | return torch.split(input, split_size) 41 | 42 | 43 | op_bench.generate_pt_test(split_configs_short + split_configs_long, 44 | SplitBenchmark) 45 | 46 | 47 | if __name__ == "__main__": 48 | op_bench.benchmark_runner.main() 49 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt_extension/extension.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <torch/script.h> 3 | 4 | using torch::List; 5 | using torch::Tensor; 6 | 7 | Tensor consume(Tensor a) { 8 | return a; 9 | } 10 | 11 | List<Tensor> consume_list(List<Tensor> a) { 12 | return a; 13 | } 14 | 15 | // When JIT tracing is used on a function with a constant for loop, 16 | // the for loop is optimized away because
of dead code elimination. 17 | // That caused an issue for our op benchmark which needs to run an op 18 | // in a loop and report the execution time. This diff resolves that issue by 19 | // registering this consume op with correct alias information which is DEFAULT. 20 | TORCH_LIBRARY_FRAGMENT(operator_benchmark, m) { 21 | m.def("_consume", &consume); 22 | m.def("_consume.list", &consume_list); 23 | } 24 | 25 | PYBIND11_MODULE(benchmark_cpp_extension, m) { 26 | m.def("_consume", &consume, "consume"); 27 | m.def("_consume_list", &consume_list, "consume_list"); 28 | } 29 | -------------------------------------------------------------------------------- /benchmarks/operator_benchmark/pt_extension/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import CppExtension, BuildExtension 3 | 4 | setup(name='benchmark_cpp_extension', 5 | ext_modules=[CppExtension('benchmark_cpp_extension', ['extension.cpp'])], 6 | cmdclass={'build_ext': BuildExtension}) 7 | -------------------------------------------------------------------------------- /benchmarks/overrides_benchmark/common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | NUM_REPEATS = 1000 4 | NUM_REPEAT_OF_REPEATS = 1000 5 | 6 | 7 | class SubTensor(torch.Tensor): 8 | pass 9 | 10 | 11 | class WithTorchFunction: 12 | def __init__(self, data, requires_grad=False): 13 | if isinstance(data, torch.Tensor): 14 | self._tensor = data 15 | return 16 | 17 | self._tensor = torch.tensor(data, requires_grad=requires_grad) 18 | 19 | @classmethod 20 | def __torch_function__(cls, func, types, args=(), kwargs=None): 21 | if kwargs is None: 22 | kwargs = {} 23 | 24 | return WithTorchFunction(args[0]._tensor + args[1]._tensor) 25 | 26 | 27 | class SubWithTorchFunction(torch.Tensor): 28 | @classmethod 29 | def __torch_function__(cls, func, types, args=(), kwargs=None): 30 | if kwargs is None: 31 | kwargs = {} 32 | 33 | return super().__torch_function__(func, types, args, kwargs) 34 | -------------------------------------------------------------------------------- /benchmarks/overrides_benchmark/pyspybench.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | from common import SubTensor, WithTorchFunction, SubWithTorchFunction # noqa: F401 4 | 5 | Tensor = torch.tensor 6 | 7 | NUM_REPEATS = 1000000 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser( 11 | description="Run the torch.add for a given class a given number of times." 12 | ) 13 | parser.add_argument( 14 | "tensor_class", metavar="TensorClass", type=str, help="The class to benchmark." 15 | ) 16 | parser.add_argument( 17 | "--nreps", "-n", type=int, default=NUM_REPEATS, help="The number of repeats." 
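# The large default (NUM_REPEATS = 1,000,000 iterations) keeps the loop
# running long enough for a sampling profiler such as py-spy (which this
# script appears to be named for) to collect a stable profile.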
18 | ) 19 | args = parser.parse_args() 20 | 21 | TensorClass = globals()[args.tensor_class] 22 | NUM_REPEATS = args.nreps 23 | 24 | t1 = TensorClass([1.]) 25 | t2 = TensorClass([2.]) 26 | 27 | for _ in range(NUM_REPEATS): 28 | torch.add(t1, t2) 29 | -------------------------------------------------------------------------------- /benchmarks/profiler_benchmark/resnet_memory_profiler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.models as models 3 | 4 | import torch.autograd.profiler as profiler 5 | 6 | for with_cuda in [False, True]: 7 | model = models.resnet18() 8 | inputs = torch.randn(5, 3, 224, 224) 9 | sort_key = "self_cpu_memory_usage" 10 | if with_cuda and torch.cuda.is_available(): 11 | model = model.cuda() 12 | inputs = inputs.cuda() 13 | sort_key = "self_cuda_memory_usage" 14 | print("Profiling CUDA Resnet model") 15 | else: 16 | print("Profiling CPU Resnet model") 17 | 18 | with profiler.profile(profile_memory=True, record_shapes=True) as prof: 19 | with profiler.record_function("root"): 20 | model(inputs) 21 | 22 | print(prof.key_averages(group_by_input_shape=True).table(sort_by=sort_key, row_limit=-1)) 23 | -------------------------------------------------------------------------------- /benchmarks/serialization/nested_annotation_str.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.benchmark as benchmark 3 | 4 | MEMO = {} 5 | def create_nested_dict_type(layers): 6 | if layers == 0: 7 | return torch._C.StringType.get() 8 | if layers not in MEMO: 9 | less_nested = create_nested_dict_type(layers - 1) 10 | result = torch._C.DictType(torch._C.StringType.get(), torch._C.TupleType([less_nested, less_nested])) 11 | MEMO[layers] = result 12 | return MEMO[layers] 13 | 14 | 15 | nesting_levels = (1, 3, 5, 10) 16 | types = (reasonable, medium, big, huge) = [create_nested_dict_type(x) for x in nesting_levels] 17 | 18 | timers = [benchmark.Timer(stmt='x.annotation_str', globals={'x': nested_type}) for nested_type in types] 19 | 20 | for nesting_level, typ, timer in zip(nesting_levels, types, timers): 21 | print("Nesting level:", nesting_level) 22 | print("output:", typ.annotation_str[:70]) 23 | print(timer.blocked_autorange()) 24 | -------------------------------------------------------------------------------- /benchmarks/serialization/simple_measurement.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pyarkbench import Benchmark, Timer, default_args 3 | 4 | use_new = True 5 | 6 | class Basic(Benchmark): 7 | def benchmark(self): 8 | x = [torch.ones(200, 200) for i in range(30)] 9 | with Timer() as big1: 10 | torch.save(x, "big_tensor.zip", _use_new_zipfile_serialization=use_new) 11 | 12 | with Timer() as big2: 13 | v = torch.load("big_tensor.zip") 14 | 15 | x = [torch.ones(10, 10) for i in range(200)] 16 | with Timer() as small1: 17 | torch.save(x, "small_tensor.zip", _use_new_zipfile_serialization=use_new) 18 | 19 | with Timer() as small2: 20 | v = torch.load("small_tensor.zip") 21 | 22 | return { 23 | "Big Tensors Save": big1.ms_duration, 24 | "Big Tensors Load": big2.ms_duration, 25 | "Small Tensors Save": small1.ms_duration, 26 | "Small Tensors Load": small2.ms_duration, 27 | } 28 | 29 | if __name__ == '__main__': 30 | bench = Basic(*default_args.bench()) 31 | print("Use zipfile serialization:", use_new) 32 | results = bench.run() 33 | bench.print_stats(results, 
stats=['mean', 'median']) 34 | -------------------------------------------------------------------------------- /benchmarks/sparse/README.md: -------------------------------------------------------------------------------- 1 | # Sparse benchmarks 2 | 3 | These benchmarks cover the sparse matrix functionality. They exist for 4 | comparing the performance of sparse matrix routines such as SpMV across various 5 | sparse matrix formats and against other frameworks such as TensorFlow. 6 | -------------------------------------------------------------------------------- /benchmarks/sparse/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == "__main__": 3 | pass 4 | -------------------------------------------------------------------------------- /benchmarks/sparse/dlmc/README.md: -------------------------------------------------------------------------------- 1 | # Sparse benchmarks 2 | 3 | These benchmarks exercise the sparse matrix functionality using a popular real dataset collection called the Deep Learning Matrix Collection (DLMC), which was used in recent studies [1, 2]. 4 | 5 | Performance benchmark scripts for matrix-matrix and matrix-vector ops (dense-sparse, sparse-sparse, and dense-dense for comparison) are implemented here. 6 | 7 | - `matmul_bench.py` with `--operation sparse@sparse|sparse@dense` benchmarks sparse matrix-matrix multiplication (SPMM). It can run in forward and backward mode with `--backward-test`, on CPU or CUDA with `--with-cuda`, using different datasets from the DLMC collection. For more details see the `test.sh` file. 8 | 9 | - `matmul_bench.py` with `--operation sparse@vector` benchmarks sparse matrix-vector multiplication (SPMV). 10 | 11 | References: 12 | 13 | 1. Trevor Gale, Matei Zaharia, Cliff Young, Erich Elsen. Sparse GPU Kernels for Deep Learning. Proceedings of the International Conference for High Performance Computing, 2020. https://github.com/google-research/google-research/tree/master/sgk 14 | 15 | 2. Trevor Gale, Erich Elsen, Sara Hooker. The State of Sparsity in Deep Neural Networks. https://github.com/google-research/google-research/tree/master/state_of_sparsity 16 | -------------------------------------------------------------------------------- /benchmarks/sparse/dlmc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == "__main__": 3 | pass 4 | -------------------------------------------------------------------------------- /benchmarks/sparse/test_csr.sh: -------------------------------------------------------------------------------- 1 | OUTFILE=spmm-no-mkl-test.txt 2 | PYTORCH_HOME=$1 3 | 4 | cd $PYTORCH_HOME 5 | 6 | echo "" >> $OUTFILE 7 | echo "----- USE_MKL=1 -----" >> $OUTFILE 8 | rm -rf build 9 | 10 | export USE_MKL=1 11 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} 12 | python setup.py build --cmake-only 13 | ccmake build # or cmake-gui build 14 | 15 | python setup.py install 16 | 17 | cd benchmarks 18 | echo "!! SPARSE SPMM TIME BENCHMARK!!
" >> $OUTFILE 19 | for dim0 in 1000 5000 10000; do 20 | for nnzr in 0.01 0.05 0.1 0.3; do 21 | python -m sparse.spmm --format csr --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE 22 | # python -m sparse.spmm --format coo --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE 23 | done 24 | done 25 | echo "----------------------" >> $OUTFILE 26 | 27 | cd $PYTORCH_HOME 28 | echo "----- USE_MKL=0 ------" >> $OUTFILE 29 | rm -rf build 30 | 31 | export USE_MKL=0 32 | python setup.py install 33 | 34 | cd benchmarks 35 | for dim0 in 1000 5000 10000; do 36 | for nnzr in 0.01 0.05 0.1 0.3; do 37 | python -m sparse.spmv --format csr --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE 38 | python -m sparse.spmv --format coo --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE 39 | done 40 | done 41 | echo "----------------------" >> $OUTFILE 42 | -------------------------------------------------------------------------------- /benchmarks/static_runtime/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc) 2 | list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt_bench.cc) 3 | set(STATIC_RUNTIME_BENCHMARK_SRCS ${STATIC_RUNTIME_BENCHMARK_SRCS} PARENT_SCOPE) 4 | 5 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc) 6 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_utils.cc) 7 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_runtime.cc) 8 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_module.cc) 9 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_generated_ops.cc) 10 | set(STATIC_RUNTIME_TEST_SRCS ${STATIC_RUNTIME_TEST_SRCS} PARENT_SCOPE) 11 | -------------------------------------------------------------------------------- /benchmarks/tensorexpr/HowToRun.md: -------------------------------------------------------------------------------- 1 | From the root of pytorch repo, run: 2 | ``` 3 | python -m benchmarks.tensorexpr --help 4 | ``` 5 | to show documentation. 6 | 7 | An example of an actual command line that one might use as a starting point: 8 | ``` 9 | python -m benchmarks.tensorexpr --device gpu --mode fwd --jit-mode trace --cuda-fuser=te 10 | ``` 11 | -------------------------------------------------------------------------------- /benchmarks/tensorexpr/nnc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/tensorexpr/nnc.png -------------------------------------------------------------------------------- /benchmarks/tensorexpr/tensor_engine.py: -------------------------------------------------------------------------------- 1 | tensor_engine = None 2 | 3 | 4 | def unsupported(func): 5 | def wrapper(self): 6 | return func(self) 7 | 8 | wrapper.is_supported = False 9 | return wrapper 10 | 11 | 12 | def is_supported(method): 13 | if hasattr(method, "is_supported"): 14 | return method.is_supported 15 | return True 16 | 17 | 18 | def set_engine_mode(mode): 19 | global tensor_engine 20 | if mode == "tf": 21 | from . import tf_engine 22 | 23 | tensor_engine = tf_engine.TensorFlowEngine() 24 | elif mode == "pt": 25 | from . import pt_engine 26 | 27 | tensor_engine = pt_engine.TorchTensorEngine() 28 | elif mode == "topi": 29 | from . 
import topi_engine 30 | 31 | tensor_engine = topi_engine.TopiEngine() 32 | elif mode == "relay": 33 | from . import relay_engine 34 | 35 | tensor_engine = relay_engine.RelayEngine() 36 | elif mode == "nnc": 37 | from . import nnc_engine 38 | 39 | tensor_engine = nnc_engine.NncEngine() 40 | else: 41 | raise ValueError("invalid tensor engine mode: %s" % (mode)) 42 | tensor_engine.mode = mode 43 | 44 | 45 | def get_engine(): 46 | if tensor_engine is None: 47 | raise ValueError("use of get_engine, before calling set_engine_mode is illegal") 48 | return tensor_engine 49 | -------------------------------------------------------------------------------- /binaries/caffe2_benchmark.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "binaries/benchmark_args.h" 6 | #include "binaries/benchmark_helper.h" 7 | 8 | 9 | int main(int argc, char** argv) { 10 | caffe2::GlobalInit(&argc, &argv); 11 | benchmark( 12 | argc, 13 | argv, 14 | FLAGS_backend, 15 | FLAGS_init_net, 16 | FLAGS_input, 17 | FLAGS_input_dims, 18 | FLAGS_input_file, 19 | FLAGS_input_type, 20 | FLAGS_iter, 21 | FLAGS_measure_memory, 22 | FLAGS_net, 23 | FLAGS_output, 24 | FLAGS_output_folder, 25 | FLAGS_run_individual, 26 | FLAGS_sleep_before_run, 27 | FLAGS_sleep_between_iteration, 28 | FLAGS_sleep_between_net_and_operator, 29 | FLAGS_text_output, 30 | FLAGS_warmup, 31 | FLAGS_wipe_cache); 32 | } 33 | -------------------------------------------------------------------------------- /binaries/lite_interpreter_model_load.cc: -------------------------------------------------------------------------------- 1 | #include "ATen/ATen.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "torch/script.h" 8 | 9 | C10_DEFINE_string(model, "", "The given bytecode model to check if it is supported by lite_interpreter."); 10 | 11 | int main(int argc, char** argv) { 12 | c10::SetUsageMessage( 13 | "Check if exported bytecode model is runnable by lite_interpreter.\n" 14 | "Example usage:\n" 15 | "./lite_interpreter_model_load" 16 | " --model="); 17 | 18 | if (!c10::ParseCommandLineFlags(&argc, &argv)) { 19 | std::cerr << "Failed to parse command line flags!" << std::endl; 20 | return 1; 21 | } 22 | 23 | if (FLAGS_model.empty()) { 24 | std::cerr << FLAGS_model << ":Model file is not provided\n"; 25 | return -1; 26 | } 27 | 28 | // TODO: avoid having to set this guard for custom mobile build with mobile 29 | // interpreter. 30 | c10::InferenceMode mode; 31 | torch::jit::mobile::Module bc = torch::jit::_load_for_mobile(FLAGS_model); 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /binaries/parallel_info.cc: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-present, Facebook, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "ATen/Parallel.h" 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __linux__ 23 | #include 24 | #include 25 | #endif 26 | 27 | int main(int argc, char** argv) { 28 | at::init_num_threads(); 29 | 30 | std::cout << at::get_parallel_info() << std::endl; 31 | 32 | # ifdef __linux__ 33 | std::ostringstream cmd; 34 | cmd << "lsof -p " << getpid() << " | grep .so"; 35 | std::cout << "Loaded .so:" << std::endl; 36 | std::cout << cmd.str() << std::endl; 37 | std::system(cmd.str().c_str()); 38 | # endif 39 | 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /third_party/BUILD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/third_party/BUILD -------------------------------------------------------------------------------- /third_party/METADATA.bzl: -------------------------------------------------------------------------------- 1 | METADATA = { 2 | "maintainers": [ 3 | "pytorch_dev_infra", 4 | ], 5 | "name": "third_party", 6 | "owner": "pytorch_dev_infra", 7 | } 8 | -------------------------------------------------------------------------------- /third_party/README.md: -------------------------------------------------------------------------------- 1 | This folder contains vendored copies of third-party libraries that we 2 | use. 3 | -------------------------------------------------------------------------------- /third_party/cudnn.BUILD: -------------------------------------------------------------------------------- 1 | # Adopted from: https://github.com/NVIDIA/TRTorch/blob/master/third_party/cudnn/local/BUILD 2 | 3 | cc_library( 4 | name = "cudnn_headers", 5 | hdrs = ["include/cudnn.h"] + glob([ 6 | "include/cudnn+.h", 7 | "include/cudnn_*.h", 8 | ]), 9 | includes = ["include/"], 10 | visibility = ["//visibility:private"], 11 | ) 12 | 13 | cc_import( 14 | name = "cudnn_lib", 15 | shared_library = "lib/x86_64-linux-gnu/libcudnn.so", 16 | visibility = ["//visibility:private"], 17 | ) 18 | 19 | cc_library( 20 | name = "cudnn", 21 | visibility = ["//visibility:public"], 22 | deps = [ 23 | "cudnn_headers", 24 | "cudnn_lib", 25 | ], 26 | ) 27 | -------------------------------------------------------------------------------- /third_party/cutlass.BUILD: -------------------------------------------------------------------------------- 1 | # Description: 2 | # CUDA Templates for Linear Algebra Subroutines 3 | 4 | load("@rules_cc//cc:defs.bzl", "cc_library") 5 | 6 | cc_library( 7 | name = "cutlass", 8 | hdrs = glob(["include/**/*.h"]), 9 | includes = ["include/"], 10 | visibility = ["//visibility:public"], 11 | ) 12 | -------------------------------------------------------------------------------- /third_party/fmt.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "fmt", 5 | hdrs = glob(["include/fmt/*.h",]), 6 | defines = ["FMT_HEADER_ONLY=1"], 7 | includes = ["include"], 8 | visibility = ["//visibility:public"], 9 | ) 10 | -------------------------------------------------------------------------------- /third_party/foxi.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "foxi", 5 | srcs = [ 6 | "foxi/onnxifi_loader.c", 7 | ], 8 | hdrs = glob([ 9 | "foxi/*.h", 10 | ]), 
11 | includes = [ 12 | ".", 13 | ], 14 | linkstatic = 1, 15 | visibility = ["//visibility:public"], 16 | ) 17 | -------------------------------------------------------------------------------- /third_party/ideep.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "ideep", 5 | hdrs = glob([ 6 | "include/**/*.hpp", 7 | "include/**/*.h", 8 | ]), 9 | defines = [ 10 | "IDEEP_USE_MKL", 11 | ], 12 | includes = [ 13 | "include/", 14 | ], 15 | visibility = ["//visibility:public"], 16 | deps = ["@mkl_dnn//:mkl-dnn"], 17 | ) 18 | -------------------------------------------------------------------------------- /third_party/kineto.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "kineto", 5 | hdrs = glob(["libkineto/include/*.h",]), 6 | includes = [ 7 | "libkineto/include/", 8 | ], 9 | visibility = ["//visibility:public"], 10 | ) 11 | -------------------------------------------------------------------------------- /third_party/miniz-2.1.0/BUILD.bazel: -------------------------------------------------------------------------------- 1 | cc_library( 2 | name = "miniz", 3 | srcs = [ 4 | "miniz.c", 5 | ], 6 | hdrs = [ 7 | "miniz.h", 8 | ], 9 | strip_include_prefix = ".", 10 | visibility = ["//visibility:public"], 11 | ) 12 | -------------------------------------------------------------------------------- /third_party/miniz-2.1.0/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2013-2014 RAD Game Tools and Valve Software 2 | Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC 3 | 4 | All Rights Reserved. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /third_party/mkl.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "mkl", 5 | srcs = [ 6 | "libmkl_avx2.so", 7 | "libmkl_core.so", 8 | "libmkl_def.so", 9 | "libmkl_intel_lp64.so", 10 | "libmkl_rt.so", 11 | "libmkl_sequential.so", 12 | "libmkl_vml_avx2.so", 13 | "libmkl_vml_avx512.so", 14 | "libmkl_vml_def.so", 15 | ] + select({ 16 | "@pytorch//tools/config:thread_sanitizer": [], 17 | "//conditions:default": ["libmkl_tbb_thread.so"], 18 | }), 19 | visibility = ["//visibility:public"], 20 | deps = ["@mkl_headers"], 21 | ) 22 | -------------------------------------------------------------------------------- /third_party/mkl_headers.BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | cc_library( 4 | name = "mkl_headers", 5 | hdrs = glob(["include/*.h"]), 6 | includes = ["include/"], 7 | visibility = ["//visibility:public"], 8 | ) 9 | -------------------------------------------------------------------------------- /third_party/nvfuser/benchmark/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(USE_CUDA) 2 | add_executable(nvfuser_bench 3 | batch_norm_channels_first.cpp 4 | batch_norm_channels_first_backward.cpp 5 | batch_norm_channels_last.cpp 6 | batch_norm_channels_last_backward.cpp 7 | bert.cpp 8 | broadcast.cpp 9 | gelu_backward.cpp 10 | heuristic_lookup.cpp 11 | shape_inference.cpp 12 | instance_norm.cpp 13 | layer_norm.cpp 14 | layer_norm_backward.cpp 15 | rms_norm.cpp 16 | rms_norm_backward.cpp 17 | lstm_cell.cpp 18 | reduction.cpp 19 | softmax.cpp 20 | softmax_backward.cpp 21 | scale_bias_relu.cpp 22 | transpose.cpp 23 | matmul.cpp 24 | timm.cpp 25 | utils.cpp 26 | main.cpp) 27 | 28 | target_link_libraries(nvfuser_bench PRIVATE torch_library benchmark) 29 | if(NOT MSVC) 30 | target_compile_options_if_supported(nvfuser_bench -Werror) 31 | target_compile_options_if_supported(nvfuser_bench -Wno-unused-variable) 32 | target_compile_options_if_supported(nvfuser_bench -Wno-deprecated-copy) 33 | endif() 34 | 35 | endif() 36 | -------------------------------------------------------------------------------- /third_party/nvfuser/benchmark/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | BENCHMARK_MAIN(); 4 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/codegen.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace torch { 9 | namespace jit { 10 | namespace fuser { 11 | namespace cuda { 12 | namespace codegen { 13 | 14 | //! 
Generates a CUDA kernel definition for the given kernel 15 | TORCH_CUDA_CU_API std::string generateCudaKernel( 16 | const kir::Kernel* kernel, 17 | const std::string& kernel_name = "CUDAGeneratedKernel"); 18 | 19 | } // namespace codegen 20 | } // namespace cuda 21 | } // namespace fuser 22 | } // namespace jit 23 | } // namespace torch 24 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/compute_at.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace torch { 16 | namespace jit { 17 | namespace fuser { 18 | namespace cuda { 19 | 20 | class TensorDomain; 21 | class TensorView; 22 | 23 | struct ComputeAt { 24 | public: 25 | // Runs the compute at pass making producer look like consumer, computing 26 | // producer relative to consumer 27 | static void runAt( 28 | TensorView* producer, 29 | TensorView* consumer, 30 | int64_t consumer_position, 31 | ComputeAtMode mode = ComputeAtMode::Standard); 32 | 33 | // Runs the compute with pass making consumer look like producer, computing 34 | // producer relative to consumer 35 | static void runWith( 36 | TensorView* producer, 37 | TensorView* consumer, 38 | int64_t producer_position, 39 | ComputeAtMode mode = ComputeAtMode::Standard); 40 | }; 41 | 42 | } // namespace cuda 43 | } // namespace fuser 44 | } // namespace jit 45 | } // namespace torch 46 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/docs/.gitignore: -------------------------------------------------------------------------------- 1 | html 2 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/docs/documentation.h: -------------------------------------------------------------------------------- 1 | 2 | #error This is used exclusively for generating the documentation (not a real header) 3 | 4 | //! \namespace torch::jit::fuser 5 | //! \brief Main PyTorch JIT Fuser namespace 6 | 7 | //! \namespace torch::jit::fuser::cuda 8 | //! \brief CUDA specific components 9 | 10 | //! \namespace torch::jit::fuser::cuda::executor_utils 11 | //! \brief Fuser executor related utilities 12 | 13 | //! \namespace torch::jit::fuser::kir 14 | //! \brief Kernel IR 15 | 16 | //! \namespace torch::jit::fuser::ir_utils 17 | //! \brief IR manipulation utilities 18 | 19 | //! \namespace torch::jit::fuser::loop_utils 20 | //! \brief Loop utilities 21 | 22 | //! \namespace torch::jit::fuser::scope_utils 23 | //! 
\brief Scope utilities 24 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/docs/images/ir_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/third_party/nvfuser/csrc/docs/images/ir_architecture.png -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/docs/main_page.md: -------------------------------------------------------------------------------- 1 | 2 | This is the implementation reference for the CUDA PyTorch JIT Fuser 3 | 4 | - [PyTorch GitHub Page](https://github.com/pytorch/pytorch) 5 | - [Fuser Source Tree](https://github.com/pytorch/pytorch/tree/master/torch/csrc/jit/codegen/cuda) 6 | - Main documentation indexes: [Namespaces](namespaces.html) and [Classes](annotated.html) 7 | 8 | ![Fuser Architecture Overview](images/ir_architecture.png) 9 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/ir_all_nodes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | // TODO: remove this once the Kernel IR split is complete 8 | #include 9 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_alias_memory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace torch { 11 | namespace jit { 12 | namespace fuser { 13 | namespace cuda { 14 | 15 | //! Reuse Allocation nodes via pointer aliasing 16 | //! 17 | //! First pass finds candidate TensorViews 18 | //! A candidate TensorView is anything in shared memory OR 19 | //! in local memory with a static size larger than register_size_threshold 20 | //! 21 | //! Second pass finds appropriate input Allocate Node 22 | //! among candidate TensorViews 23 | //! 24 | //! Alias Criteria: 25 | //! If input is a candidate TensorView, 26 | //! input allocation has the same size as output allocation, 27 | //! thread bindings match, 28 | //! is not used after this op: 29 | //! then alias output Allocate to input Allocate. 30 | //! 31 | std::vector reuseMemoryAllocations(const std::vector& exprs); 32 | 33 | } // namespace cuda 34 | } // namespace fuser 35 | } // namespace jit 36 | } // namespace torch 37 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_allocation.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace torch { 11 | namespace jit { 12 | namespace fuser { 13 | namespace cuda { 14 | 15 | //! Buffer allocation information to store in GPU lower to avoid 16 | //! logic duplication 17 | struct LocalAllocationInfo { 18 | kir::Allocate* alloc_expr = nullptr; 19 | std::vector alloc_domains; 20 | bool has_halo = false; 21 | }; 22 | 23 | using LocalAllocationInfoMap = 24 | std::unordered_map>; 25 | 26 | //! 
Insert buffer allocations 27 | std::vector<Expr*> insertAllocations(const std::vector<Expr*>& exprs); 28 | 29 | } // namespace cuda 30 | } // namespace fuser 31 | } // namespace jit 32 | } // namespace torch 33 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_divisible_split.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace torch { 10 | namespace jit { 11 | namespace fuser { 12 | namespace cuda { 13 | 14 | // Looks through all transformations associated with view, or enforced divisible 15 | // vectorization splits, and gathers all splits that provably don't have a 16 | // remainder, so the extents of the associated IterDomains do not require 17 | // ceilDiv expressions. 18 | TORCH_CUDA_CU_API std::unordered_set<Split*> getAllDivisibleSplits( 19 | Fusion* fusion); 20 | 21 | // Same as above but will use the provided ComputeAtMap instead of building its own. 22 | TORCH_CUDA_CU_API std::unordered_set<Split*> getAllDivisibleSplits( 23 | Fusion* fusion, 24 | const ComputeAtMap* ca_map); 25 | 26 | } // namespace cuda 27 | } // namespace fuser 28 | } // namespace jit 29 | } // namespace torch 30 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_expr_sort.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace torch { 6 | namespace jit { 7 | namespace fuser { 8 | namespace cuda { 9 | 10 | std::vector<Expr*> reorderExprsForComputeAt(); 11 | 12 | } // namespace cuda 13 | } // namespace fuser 14 | } // namespace jit 15 | } // namespace torch 16 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_fused_reduction.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace torch { 6 | namespace jit { 7 | namespace fuser { 8 | namespace cuda { 9 | 10 | //! Keep track of certain patterns of reductions. 11 | //! 12 | //! - Allreduce IterDomain: reduced and broadcast domain. 13 | class FusedReductionInfo { 14 | public: 15 | void markAsAllreduce(IterDomain* id); 16 | 17 | bool isAllreduce(IterDomain* id) const; 18 | 19 | private: 20 | // Reduction IterDomains that are also broadcast 21 | std::unordered_set<IterDomain*> allreduce_ids_; 22 | }; 23 | 24 | //! Detect reductions and broadcasts that are eligible for the fused 25 | //! reduction kernel. When found, the predicate flags of the broadcast 26 | //! are unset, which effectively makes the broadcast just a unary set 27 | //! op. 28 | //! TODO: Consider moving the warp-based fused reduction here.
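//!
//! Sketch of the pattern this pass targets (illustrative Fusion IR
//! pseudocode, not code from an actual test):
//!
//!   tv1 = sum(tv0, {1});                  // reduction over dim 1
//!   tv2 = broadcast(tv1, {false, true});  // immediately re-broadcast
//!   tv3 = add(tv0, tv2);                  // consumed at tv0's full shape
//!
//! The reduced-then-rebroadcast domain shared by tv1 and tv2 is an
//! allreduce IterDomain, and the broadcast can then be lowered as a plain
//! set op without its own predicate.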
29 | void fuseReductionsAndBroadcasts(Fusion*); 30 | 31 | } // namespace cuda 32 | } // namespace fuser 33 | } // namespace jit 34 | } // namespace torch 35 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_fusion_simplifier.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | namespace torch { 13 | namespace jit { 14 | namespace fuser { 15 | namespace cuda { 16 | 17 | // Replaces trivial reductions with Unary Set Ops 18 | void trivialReductionReplacement(Fusion*, const TrivialReductionInfo&); 19 | 20 | // Replaces Transpose, Shift, Gather, and View Ops with Unary Set Ops 21 | std::vector<Expr*> unarySetOpInserter(const std::vector<Expr*>& exprs); 22 | 23 | } // namespace cuda 24 | } // namespace fuser 25 | } // namespace jit 26 | } // namespace torch 27 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_insert_syncs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace torch { 11 | namespace jit { 12 | namespace fuser { 13 | namespace cuda { 14 | 15 | //! Insert syncs at the end of for-loops to prevent write-after-read race conditions. 16 | //! 17 | //! A WAR race condition occurs when the next iteration of the loop overwrites 18 | //! a shared memory value before a previous operation has finished reading it. 19 | std::vector<Expr*> insertWarThreadSynchronization( 20 | const std::vector<Expr*>& exprs); 21 | 22 | //! Insert syncs between writing to shared memory and then reading it. 23 | //! The RAW pass is run before indexing, unrolling (loop duplication), memory 24 | //! aliasing, and index (grid/block bcast/reduction) 25 | std::vector<Expr*> insertRawThreadSynchronization( 26 | const std::vector<Expr*>& exprs); 27 | 28 | } // namespace cuda 29 | } // namespace fuser 30 | } // namespace jit 31 | } // namespace torch 32 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_instrument.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace torch { 6 | namespace jit { 7 | namespace fuser { 8 | namespace cuda { 9 | 10 | //! Set up KernelPerformanceProfile of GpuLower when enabled, which 11 | //! keeps track of expressions to profile. A new TensorView is added 12 | //! for storing profiling results. The expression list is prepended 13 | //! with a kir::Allocate node to allocate the TensorView profile 14 | //! buffer. Note that any expression added after this pass will not be 15 | //! profiled, so this pass should be called after all expressions are 16 | //! lowered. KernelPerformanceProfile is copied to Kernel after 17 | //! lowering. 18 | std::vector<Expr*> instrumentKernel(const std::vector<Expr*>& exprs); 19 | 20 | } // namespace cuda 21 | } // namespace fuser 22 | } // namespace jit 23 | } // namespace torch 24 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_predicate.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace torch { 10 | namespace jit { 11 | namespace fuser { 12 | namespace cuda { 13 | 14 | //!
Update predicates with valid bool conditionals 15 | //! 16 | std::vector<Expr*> generateConditionalFromPredicate( 17 | const std::vector<Expr*>& exprs); 18 | 19 | } // namespace cuda 20 | } // namespace fuser 21 | } // namespace jit 22 | } // namespace torch 23 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_replace_size.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace torch { 10 | namespace jit { 11 | namespace fuser { 12 | namespace cuda { 13 | 14 | // TensorViews are all based on symbolic sizes. When we first initialize them 15 | // we don't know if they're inputs or outputs, which would mean that they have 16 | // runtime shapes. Intermediate tensors (those not going to global memory) do 17 | // not have this information. Since we need to have the correct information in 18 | // the kernel being fetched for shapes, we want to replace input and output 19 | // tensors to reference the runtime structure containing sizes. 20 | void replaceSymbolicSizes(Fusion*); 21 | 22 | } // namespace cuda 23 | } // namespace fuser 24 | } // namespace jit 25 | } // namespace torch 26 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/lower_warp_reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace torch { 6 | namespace jit { 7 | namespace fuser { 8 | namespace cuda { 9 | 10 | struct WarpPaddedParallelInfo { 11 | bool is_tidx_padded = false; 12 | bool is_tidx_single_warp = false; 13 | bool has_warp_reduction = false; 14 | }; 15 | 16 | std::vector<Expr*> fuseWarpReduce(const std::vector<Expr*> exprs); 17 | 18 | } // namespace cuda 19 | } // namespace fuser 20 | } // namespace jit 21 | } // namespace torch 22 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/mutator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace torch { 11 | namespace jit { 12 | namespace fuser { 13 | namespace cuda { 14 | 15 | /* 16 | * Mutators are the mechanism used to modify IR nodes. Since most nodes are 17 | * immutable, or at least partially immutable, changing them can require creating 18 | * a new node. The base mutator at the moment is a dumb sample mutator that takes 19 | * any float of value 1.0 and converts it to 0.0; it is currently used as a 20 | * dummy example. We should make it a simple instantiation of all the 21 | * mutate functions on all node types so that people can inherit it, and only 22 | * specialize those nodes which they want to have a particular transformation.
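 *
 * A specialization would inherit the base mutator and override only the
 * handlers for the node types it cares about, roughly like this sketch
 * (`BaseMutator` stands in for the real base class; see the dispatch
 * machinery for the actual interface):
 *
 *   struct OneToZeroMutator : public BaseMutator {
 *     // In the handler for floating-point scalars: if the node is a
 *     // constant 1.0, build and return a new constant 0.0; otherwise
 *     // defer to the base implementation.
 *   };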
23 | */ 24 | 25 | } // namespace cuda 26 | } // namespace fuser 27 | } // namespace jit 28 | } // namespace torch 29 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/ops/all_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/parallel_type_bitmap.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace torch { 4 | namespace jit { 5 | namespace fuser { 6 | namespace cuda { 7 | 8 | constexpr std::bitset 9 | ParallelTypeBitmap::kTIDBits; 10 | constexpr std::bitset 11 | ParallelTypeBitmap::kBIDBits; 12 | 13 | std::string ParallelTypeBitmap::toString() const { 14 | std::stringstream ss; 15 | ss << "("; 16 | bool is_first = true; 17 | for (ParallelType pt : *this) { 18 | if (!is_first) { 19 | ss << " "; 20 | } 21 | ss << pt; 22 | is_first = false; 23 | } 24 | ss << ")"; 25 | return ss.str(); 26 | } 27 | 28 | } // namespace cuda 29 | } // namespace fuser 30 | } // namespace jit 31 | } // namespace torch 32 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/partial_split_map.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | namespace torch { 12 | namespace jit { 13 | namespace fuser { 14 | namespace cuda { 15 | 16 | //! Collects start and stop offsets of all split root domains. Offsets 17 | //! are zero unless partially split. 18 | class TORCH_CUDA_CU_API PartialSplitMap { 19 | public: 20 | void build(Fusion* fusion); 21 | 22 | Val* getStartOffset(IterDomain* root_domain) const; 23 | Val* getStopOffset(IterDomain* root_domain) const; 24 | 25 | private: 26 | std::unordered_map<IterDomain*, Val*> start_offset_map_; 27 | std::unordered_map<IterDomain*, Val*> stop_offset_map_; 28 | }; 29 | 30 | } // namespace cuda 31 | } // namespace fuser 32 | } // namespace jit 33 | } // namespace torch 34 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/partition.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | /* 7 | * API for querying node compatibility in CudaCodeGen 8 | * 9 | * It is used in the optimization passes, where the graph is traversed and parts 10 | * that can be handled by CudaCodegen are partitioned and stuffed into 11 | * `attr::Subgraph` of `prim::CudaFusionGroup`. 12 | * 13 | * The logic right now is very simple. On top of device placement, we consider a 14 | * `Node` compatible when we have a parsing rule for it in our parser.
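 *
 * Typical use in a fusion pass looks roughly like the following (the loop
 * body is an illustrative sketch, not code lifted from the pass):
 *
 *   for (Node* node : block->nodes()) {
 *     if (isFusibleCudaFusionGroup(fusion_group, node)) {
 *       // merge `node` into the existing prim::CudaFusionGroup
 *     }
 *   }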
15 | */ 16 | 17 | namespace torch { 18 | namespace jit { 19 | namespace fuser { 20 | namespace cuda { 21 | 22 | TORCH_CUDA_CU_API bool isFusibleCudaFusionGroup(const Node* node); 23 | 24 | // consider if `node` could be fused into `fusion` 25 | TORCH_CUDA_CU_API bool isFusibleCudaFusionGroup( 26 | const Node* fusion, 27 | const Node* node); 28 | 29 | } // namespace cuda 30 | } // namespace fuser 31 | } // namespace jit 32 | } // namespace torch 33 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/python_frontend/python_bindings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace torch { 7 | namespace jit { 8 | void initNvFuserPythonBindings(PyObject* module); 9 | } // namespace jit 10 | } // namespace torch 11 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/python_frontend/python_bindings_extension.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | PYBIND11_MODULE(EXTENSION_NAME, m) { 5 | m.doc() = "nvfuser C API python binding"; // optional module docstring 6 | torch::jit::initNvFuserPythonBindings(m.ptr()); 7 | } 8 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/all_schedulers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace torch { 8 | namespace jit { 9 | namespace fuser { 10 | namespace cuda { 11 | 12 | enum class TORCH_CUDA_CU_API ScheduleHeuristic { 13 | None, 14 | NoOp, 15 | PointWise, 16 | Reduction, 17 | Persistent, 18 | Transpose 19 | }; 20 | } // namespace cuda 21 | } // namespace fuser 22 | } // namespace jit 23 | } // namespace torch 24 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/debug_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace torch { 4 | namespace jit { 5 | namespace fuser { 6 | namespace cuda { 7 | 8 | namespace scheduler_debug_utils { 9 | 10 | // Basic logging utility for any messages in the scheduler or segmenter 11 | template <typename... Args> 12 | void canScheduleMessage(const Args&... args) { 13 | // Using builtin expect to reduce the overhead slightly; 14 | // alternatively we may want to allow this message in debug 15 | // builds only, but that'd be inconvenient for user support. 16 | if (C10_UNLIKELY(isDebugDumpEnabled(DebugDumpOption::FusionSegmenterLog))) { 17 | std::cout << c10::str(args...) << "\n"; 18 | } 19 | } 20 | 21 | // Short-cut message for flagging why schedulers cannot schedule fusions, 22 | // assuming the first argument is the heuristic type (not actively checked). 23 | template <typename HeuristicType, typename... Args> 24 | void canScheduleRejectReason(HeuristicType heuristic, const Args&...
args) { 25 | canScheduleMessage( 26 | "Scheduler _", heuristic, "_ ***rejected*** because : ", args...); 27 | } 28 | 29 | } // namespace scheduler_debug_utils 30 | 31 | } // namespace cuda 32 | } // namespace fuser 33 | } // namespace jit 34 | } // namespace torch 35 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/heuristic.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace torch { 9 | namespace jit { 10 | namespace fuser { 11 | namespace cuda { 12 | 13 | class HeuristicParams : public PolymorphicBase { 14 | public: 15 | std::string tag = ""; 16 | 17 | LaunchParams lparams; 18 | 19 | virtual std::string toString() const { 20 | return "Undefined Heuristic Params"; 21 | } 22 | 23 | virtual size_t hash() const = 0; 24 | 25 | virtual ~HeuristicParams() = default; 26 | 27 | virtual bool sameAs(const std::shared_ptr<HeuristicParams>& other) const = 0; 28 | 29 | virtual std::shared_ptr<HeuristicParams> clone() const = 0; 30 | 31 | HeuristicParams() = default; 32 | HeuristicParams(const std::string& tag) : tag(tag) {} 33 | }; 34 | 35 | } // namespace cuda 36 | } // namespace fuser 37 | } // namespace jit 38 | } // namespace torch 39 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/normalization.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | // TODO: If caching inputs would require persistence we are sending it to the 9 | // persistent kernel scheduler. This isn't necessary if the only persistent 10 | // buffers are inputs, as we could re-read them from global memory. Need to 11 | // consider if this is worth implementing.
12 | 13 | namespace torch { 14 | namespace jit { 15 | namespace fuser { 16 | namespace cuda { 17 | 18 | class SchedulerRuntimeInfo; 19 | class HeuristicSummary; 20 | 21 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics( 22 | Fusion* fusion, 23 | const at::ArrayRef<c10::IValue>& runtime_inputs, 24 | HeuristicSummary* data_cache = nullptr); 25 | 26 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics( 27 | Fusion* fusion, 28 | SchedulerRuntimeInfo& runtime_info, 29 | HeuristicSummary* data_cache = nullptr); 30 | 31 | TORCH_CUDA_CU_API void schedulePersistentKernel( 32 | Fusion* fusion, 33 | const ReductionParams& rparams); 34 | 35 | } // namespace cuda 36 | } // namespace fuser 37 | } // namespace jit 38 | } // namespace torch 39 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/scheduler/reduction.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | namespace torch { 9 | namespace jit { 10 | namespace fuser { 11 | namespace cuda { 12 | 13 | class SchedulerRuntimeInfo; 14 | class HeuristicSummary; 15 | 16 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics( 17 | Fusion* fusion, 18 | const at::ArrayRef<c10::IValue>& runtime_inputs, 19 | HeuristicSummary* data_cache = nullptr); 20 | 21 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics( 22 | Fusion* fusion, 23 | SchedulerRuntimeInfo& runtime_info, 24 | HeuristicSummary* data_cache = nullptr); 25 | 26 | TORCH_CUDA_CU_API void scheduleReduction( 27 | Fusion* fusion, 28 | const ReductionParams& rparams); 29 | } // namespace cuda 30 | } // namespace fuser 31 | } // namespace jit 32 | } // namespace torch 33 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/transform_rfactor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | namespace torch { 12 | namespace jit { 13 | namespace fuser { 14 | namespace cuda { 15 | 16 | // TODO: Only replay dispatch is really borrowed from TransformIter; we should 17 | // reevaluate the reuse of dispatch for classes that inherit TransformIter. 18 | class TORCH_CUDA_CU_API TransformRFactor { 19 | public: 20 | // Transform the provided tensor domain into two domains, a producer and a 21 | // consumer domain. These domains are created by taking axes and reducing them 22 | // in the producer domain, and taking the remaining reduction axes and 23 | // reducing them in the consumer domain.
24 | static std::pair<TensorDomain*, TensorDomain*> runReplay( 25 | TensorDomain*, 26 | std::vector<int> axes); 27 | }; 28 | 29 | } // namespace cuda 30 | } // namespace fuser 31 | } // namespace jit 32 | } // namespace torch 33 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/type_inference.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace torch { 7 | namespace jit { 8 | namespace fuser { 9 | namespace cuda { 10 | 11 | TORCH_CUDA_CU_API void TypePropagate(std::shared_ptr<Graph>& graph); 12 | 13 | } // namespace cuda 14 | } // namespace fuser 15 | } // namespace jit 16 | } // namespace torch 17 | -------------------------------------------------------------------------------- /third_party/nvfuser/csrc/vectorization_info.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace torch { 8 | namespace jit { 9 | namespace fuser { 10 | namespace cuda { 11 | 12 | struct VectorizedSetInfo { 13 | //! Producer of a vectorized set 14 | TensorView* producer_tv = nullptr; 15 | //! Consumer of a vectorized set 16 | TensorView* consumer_tv = nullptr; 17 | //! Number of elements to vectorize 18 | int word_size = -1; 19 | //! Vectorized domain 20 | IterDomain* vectorized_leaf_id = nullptr; 21 | //! Right-most root dependent domain of the leaf domain 22 | IterDomain* vectorized_root_id = nullptr; 23 | //! All of the dependent root domains that are contiguously merged 24 | std::unordered_set<IterDomain*> contig_root_ids; 25 | }; 26 | 27 | } // namespace cuda 28 | } // namespace fuser 29 | } // namespace jit 30 | } // namespace torch 31 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_extension/README.md: -------------------------------------------------------------------------------- 1 | # Build 2 | 3 | ``` 4 | python setup.py install 5 | ``` 6 | 7 | # Test 8 | 9 | ``` 10 | python test.py 11 | ``` 12 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_extension/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | using namespace torch::jit::fuser::cuda; 9 | 10 | at::Tensor sinh_nvfuser(const at::Tensor& input) { 11 | Fusion fusion; 12 | FusionGuard fg(&fusion); 13 | 14 | int dim = input.dim(); 15 | auto dtype = input.scalar_type(); 16 | auto x = 17 | TensorViewBuilder().ndims(dim).dtype(aten_to_data_type(dtype)).build(); 18 | fusion.addInput(x); 19 | 20 | // Using the identity sinh(x) = [ exp(x) - exp(-x) ] / 2 21 | auto output = div(sub(exp(x), exp(neg(x))), IrBuilder::create<Double>(2.0)); 22 | fusion.addOutput(output); 23 | 24 | std::cout << "Create fusion:" << std::endl; 25 | fusion.print(); 26 | 27 | auto lparams = schedulePointwise(&fusion, {input}); 28 | 29 | FusionExecutor fe; 30 | fe.compileFusion(&fusion, {input}, lparams); 31 | auto outputs = fe.runFusion({input}, lparams); 32 | 33 | return outputs[0]; 34 | } 35 | 36 | TORCH_LIBRARY(myop, m) { 37 | m.def("sinh_nvfuser", sinh_nvfuser); 38 | } 39 | 40 | TORCH_LIBRARY_IMPL(myop, CUDA, m) { 41 | m.impl("sinh_nvfuser", sinh_nvfuser); 42 | } 43 | 44 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {} 45 | --------------------------------------------------------------------------------
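The comment in sinh_extension/main.cpp above relies on the identity sinh(x) = (exp(x) - exp(-x)) / 2. As a quick, nvfuser-independent sanity check of that identity, here is a minimal sketch in plain Python (the sample points are arbitrary):

```
import math

# Check sinh(x) == (exp(x) - exp(-x)) / 2 at a few sample points.
for x in (-2.0, -0.5, 0.0, 1.0, 3.0):
    assert math.isclose(math.sinh(x), (math.exp(x) - math.exp(-x)) / 2, abs_tol=1e-12)
print("sinh identity holds at all sample points")
```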
/third_party/nvfuser/examples/sinh_extension/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='nvfuser_extension', 6 | ext_modules=[ 7 | CUDAExtension( 8 | name='nvfuser_extension', 9 | pkg='nvfuser_extension', 10 | sources=['main.cpp']) 11 | ], 12 | cmdclass={ 13 | 'build_ext': BuildExtension 14 | }) 15 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_extension/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import nvfuser_extension # noqa: F401 3 | 4 | t = torch.randn((5, 5), device='cuda') 5 | expected = torch.sinh(t) 6 | output = torch.ops.myop.sinh_nvfuser(t) 7 | 8 | print("Expected:", expected) 9 | print("Output:", output) 10 | 11 | assert torch.allclose(output, expected) 12 | print("They match!") 13 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_libtorch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10 FATAL_ERROR) 2 | project(sinh_example LANGUAGES CXX) 3 | set(CMAKE_CXX_STANDARD 14) 4 | 5 | find_package(Torch REQUIRED) 6 | 7 | add_executable(sinh_example main.cpp) 8 | target_link_libraries(sinh_example ${TORCH_LIBRARIES}) 9 | -------------------------------------------------------------------------------- /third_party/nvfuser/examples/sinh_libtorch/README.md: -------------------------------------------------------------------------------- 1 | # Build 2 | 3 | ``` 4 | mkdir build 5 | cd build 6 | cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. 7 | make -j 8 | ``` 9 | 10 | # Test 11 | 12 | ``` 13 | ./sinh_example 14 | ``` 15 | -------------------------------------------------------------------------------- /third_party/nvfuser/python/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import _C 2 | -------------------------------------------------------------------------------- /third_party/nvfuser/python_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/third_party/nvfuser/python_tests/__init__.py -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/bf16_support.cu: -------------------------------------------------------------------------------- 1 | 2 | #define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var))) 3 | #define __NVFUSER_BFLOAT_TO_CUS(var) \ 4 | *(reinterpret_cast<const unsigned short*>(&(var))) 5 | 6 | struct __bfloat; 7 | __device__ __bfloat __float2bfloat(const float); 8 | 9 | struct __align__(2) __bfloat { 10 | __bfloat() = default; 11 | 12 | __device__ __bfloat(const float f) { 13 | __x = __float2bfloat(f).__x; 14 | } 15 | 16 | protected: 17 | unsigned short __x; 18 | }; 19 | 20 | __device__ __bfloat __float2bfloat(const float f) { 21 | __bfloat val; 22 | asm("{ cvt.rn.bf16.f32 %0, %1;}\n" 23 | : "=h"(__NVFUSER_BFLOAT_TO_US(val)) 24 | : "f"(f)); 25 | return val; 26 | } 27 | 28 | __device__ float __bfloat2float(const __bfloat h) { 29 | float val; 30 | asm("{ mov.b32 %0, {0,%1};}\n" 31 | : "=f"(val) 32 | : "h"(__NVFUSER_BFLOAT_TO_CUS(h))); 33 | return val; 34 | } 35 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/bf16_support_rocm.cu: -------------------------------------------------------------------------------- 1 | 2 | struct __align__(2) __bfloat { 3 | __bfloat() = default; 4 | 5 | inline __device__ __bfloat(const float f) { 6 | if (f != f) { 7 | __x = uint16_t(0x7FC0); 8 | } else { 9 | union { 10 | uint32_t U32; 11 | float F32; 12 | }; 13 | 14 | F32 = f; 15 | uint32_t rounding_bias = ((U32 >> 16) & 1) + uint32_t(0x7FFF); 16 | __x = static_cast<uint16_t>((U32 + rounding_bias) >> 16); 17 | } 18 | } 19 | 20 | inline __device__ operator float() const { 21 | float res = 0; 22 | uint32_t tmp = __x; 23 | tmp <<= 16; 24 | float* tempRes = reinterpret_cast<float*>(&tmp); 25 | res = *tempRes; 26 | return res; 27 | } 28 | 29 | protected: 30 | unsigned short __x; 31 | }; 32 | 33 | __device__ __bfloat __float2bfloat(const float f) { 34 | return __bfloat(f); 35 | } 36 | 37 | __device__ float __bfloat2float(const __bfloat h) { 38 | return float(h); 39 | } 40 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/block_sync_default.cu: -------------------------------------------------------------------------------- 1 | 2 | // Default block synchronization. Just use __barrier_sync 3 | namespace block_sync { 4 | 5 | __forceinline__ __device__ void init() {} 6 | 7 | // Thread-block synchronization 8 | __forceinline__ __device__ void sync() { 9 | __barrier_sync(0); 10 | } 11 | 12 | } // namespace block_sync 13 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/block_sync_default_rocm.cu: -------------------------------------------------------------------------------- 1 | 2 | // Default block synchronization.
Just use __syncthreads 3 | namespace block_sync { 4 | 5 | __forceinline__ __device__ void init() {} 6 | 7 | // Thread-block synchronization 8 | __forceinline__ __device__ void sync() { 9 | __syncthreads(); 10 | } 11 | 12 | } // namespace block_sync 13 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/broadcast.cu: -------------------------------------------------------------------------------- 1 | 2 | namespace broadcast { 3 | // Broadcasts within partitioned groups of threads. 4 | // 5 | // X_THREAD: Broadcast from threadIdx.x == 0 if true 6 | // Y_THREAD: Broadcast from threadIdx.y == 0 if true 7 | // Z_THREAD: Broadcast from threadIdx.z == 0 if true 8 | // inp_val: Per-thread source value. Only valid when the thread is a source. 9 | // out: Per-thread output location 10 | // 11 | template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T> 12 | __device__ void blockBroadcast( 13 | T& out, 14 | const T& inp_val, 15 | T* shared_mem, 16 | bool read_write_pred) { 17 | const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) && 18 | (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0); 19 | 20 | const auto shared_offset = 21 | index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>( 22 | threadIdx, blockDim); 23 | 24 | if (has_valid_data && read_write_pred) { 25 | shared_mem[shared_offset] = inp_val; 26 | } 27 | 28 | block_sync::sync(); 29 | 30 | if (read_write_pred) { 31 | out = shared_mem[shared_offset]; 32 | } 33 | 34 | block_sync::sync(); 35 | } 36 | 37 | } // namespace broadcast 38 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/tensor.cu: -------------------------------------------------------------------------------- 1 | template <typename T, int N> 2 | struct Tensor { 3 | __device__ T& operator[](nvfuser_index_t ind) { 4 | return data[ind]; 5 | }; 6 | 7 | T* data; 8 | nvfuser_index_t size[N]; 9 | nvfuser_index_t stride[N]; 10 | }; 11 | 12 | // Specialization for the 0-dim case, as it does not need size and stride 13 | // arrays. They would be an error anyway, since zero-length arrays are not allowed. 14 | template <typename T> 15 | struct Tensor<T, 0> { 16 | __device__ T& operator[](nvfuser_index_t) { 17 | return *data; 18 | }; 19 | 20 | T* data; 21 | }; 22 | 23 | // 0-dim variant that makes it easy to pass in a CPU-based scalar tensor.
24 | template <typename T> 25 | struct CpuScalarTensor { 26 | __device__ T& operator[](int) { 27 | return data; 28 | }; 29 | 30 | T data; 31 | }; 32 | -------------------------------------------------------------------------------- /third_party/nvfuser/runtime/type_traits.cu: -------------------------------------------------------------------------------- 1 | // Type trait utils 2 | template <typename Type, bool is_volatile> 3 | struct MaybeVolatile; 4 | 5 | template <typename Type> 6 | struct MaybeVolatile<Type, true> { 7 | using type = volatile Type; 8 | }; 9 | 10 | template <typename Type> 11 | struct MaybeVolatile<Type, false> { 12 | using type = Type; 13 | }; 14 | 15 | template <typename... Types> 16 | struct TypeList {}; 17 | 18 | template <int idx, typename T, typename... Types> 19 | struct TypeSelector { 20 | using type = typename TypeSelector<idx - 1, Types...>::type; 21 | }; 22 | 23 | template <typename T, typename... Types> 24 | struct TypeSelector<0, T, Types...> { 25 | using type = T; 26 | }; 27 | 28 | template <typename T0, typename T1> 29 | struct IsSameType { 30 | static constexpr bool value = false; 31 | }; 32 | 33 | template <typename T0> 34 | struct IsSameType<T0, T0> { 35 | static constexpr bool value = true; 36 | }; 37 | 38 | template <typename T> 39 | struct IsPointerType { 40 | static constexpr bool value = false; 41 | }; 42 | 43 | template <typename T> 44 | struct IsPointerType<T*> { 45 | static constexpr bool value = true; 46 | }; 47 | -------------------------------------------------------------------------------- /third_party/sleef.bzl: -------------------------------------------------------------------------------- 1 | load("@rules_cc//cc:defs.bzl", "cc_library") 2 | 3 | # This macro provides for generating both "sleef" and 4 | # "sleefdet" libraries for a given set of code. The difference is 5 | # that the "det" libraries get compiled with "-DDETERMINISTIC=1". 6 | 7 | def sleef_cc_library(name, copts, **kwargs): 8 | cc_library( 9 | name = name, 10 | copts = copts, 11 | **kwargs 12 | ) 13 | 14 | prefix = "sleef" 15 | if not name.startswith(prefix): 16 | fail("name {} does not start with {}".format(repr(name), repr(prefix))) 17 | 18 | cc_library( 19 | name = name.replace(prefix, prefix + "det", 1), 20 | copts = copts + ["-DDETERMINISTIC=1"], 21 | **kwargs 22 | ) 23 | -------------------------------------------------------------------------------- /third_party/tensorflow_cuda_bazel_build/cuda/build_defs.bzl: -------------------------------------------------------------------------------- 1 | # Macros for building CUDA code. 2 | def if_cuda(if_true, if_false = []): 3 | """Shorthand for select()'ing on whether we're building with CUDA. 4 | 5 | Returns a select statement which evaluates to if_true if we're building 6 | with CUDA enabled. Otherwise, the select statement evaluates to if_false. 7 | 8 | """ 9 | return select({ 10 | "@local_config_cuda//cuda:using_clang": if_true, 11 | "@local_config_cuda//cuda:using_nvcc": if_true, 12 | "//conditions:default": if_false, 13 | }) 14 | 15 | def cuda_default_copts(): 16 | """Default options for all CUDA compilations.""" 17 | return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + []) 18 | 19 | def cuda_is_configured(): 20 | """Returns true if CUDA was enabled during the configure process.""" 21 | return True 22 | 23 | def if_cuda_is_configured(x): 24 | """Tests if CUDA was enabled during the configure process. 25 | 26 | Unlike if_cuda(), this does not require that we are building with 27 | --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries.
28 | """ 29 | if cuda_is_configured(): 30 | return x 31 | return [] 32 | -------------------------------------------------------------------------------- /third_party/valgrind-headers/README.md: -------------------------------------------------------------------------------- 1 | This folder contains 2 Valgrind headers, downloaded from 2 | https://sourceware.org/git/?p=valgrind.git;a=blob;f=callgrind/callgrind.h;hb=HEAD 3 | https://sourceware.org/git/?p=valgrind.git;a=blob;f=include/valgrind.h;hb=HEAD 4 | 5 | 6 | -------------------------------------------------------------------------------- /torchgen/BUCK.oss: -------------------------------------------------------------------------------- 1 | python_library( 2 | name = "torchgen", 3 | srcs = glob( 4 | ["**/*.py"], 5 | ), 6 | base_module = "torchgen", 7 | visibility = ["PUBLIC"], 8 | deps = [ 9 | "//third_party:pyyaml", 10 | "//third_party:typing-extensions", 11 | ], 12 | ) 13 | 14 | python_binary( 15 | name = "gen", 16 | main_module = "torchgen.gen", 17 | visibility = [ 18 | "PUBLIC", 19 | ], 20 | deps = [ 21 | ":torchgen", 22 | ], 23 | ) 24 | -------------------------------------------------------------------------------- /torchgen/BUILD.bazel: -------------------------------------------------------------------------------- 1 | load("//:tools/bazel.bzl", "rules") 2 | load(":build.bzl", "define_targets") 3 | 4 | define_targets(rules = rules) 5 | -------------------------------------------------------------------------------- /torchgen/__init__.py: -------------------------------------------------------------------------------- 1 | """torchgen 2 | 3 | This module contains codegeneration utilities for PyTorch. It is used to 4 | build PyTorch from source, but may also be used for out-of-tree projects 5 | that extend PyTorch. 6 | 7 | Note well that we provide no BC guarantees for torchgen. If you're interested 8 | in using torchgen and want the PyTorch team to be aware, please reach out 9 | on GitHub. 10 | """ 11 | -------------------------------------------------------------------------------- /torchgen/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/api/__init__.py -------------------------------------------------------------------------------- /torchgen/api/meta.py: -------------------------------------------------------------------------------- 1 | from torchgen.model import NativeFunctionsGroup 2 | 3 | # Follows dispatcher calling convention, but: 4 | # - Mutable arguments not allowed. Meta functions are always 5 | # written in functional form. 
Look at FunctionSchema.signature() 6 | # - No tensor returns; instead we return a TensorMeta describing 7 | # the tensor in question 8 | 9 | 10 | def name(g: NativeFunctionsGroup) -> str: 11 | # use the overload name from the functional version 12 | return str(g.functional.func.name).replace(".", "_") 13 | -------------------------------------------------------------------------------- /torchgen/api/types/__init__.py: -------------------------------------------------------------------------------- 1 | from .types import * 2 | from .types_base import * 3 | from .signatures import * # isort:skip 4 | -------------------------------------------------------------------------------- /torchgen/build.bzl: -------------------------------------------------------------------------------- 1 | def define_targets(rules): 2 | rules.py_library( 3 | name = "torchgen", 4 | srcs = rules.glob(["**/*.py"]), 5 | visibility = ["//visibility:public"], 6 | deps = [ 7 | rules.requirement("PyYAML"), 8 | rules.requirement("typing-extensions"), 9 | ], 10 | ) 11 | 12 | rules.py_binary( 13 | name = "gen", 14 | srcs = [":torchgen"], 15 | visibility = ["//visibility:public"], 16 | ) 17 | 18 | rules.py_binary( 19 | name = "gen_executorch", 20 | srcs = [":torchgen"], 21 | visibility = ["//visibility:public"], 22 | ) 23 | -------------------------------------------------------------------------------- /torchgen/dest/__init__.py: -------------------------------------------------------------------------------- 1 | from .lazy_ir import ( 2 | generate_non_native_lazy_ir_nodes as generate_non_native_lazy_ir_nodes, 3 | GenLazyIR as GenLazyIR, 4 | GenLazyNativeFuncDefinition as GenLazyNativeFuncDefinition, 5 | GenLazyShapeInferenceDefinition as GenLazyShapeInferenceDefinition, 6 | ) 7 | from .native_functions import ( 8 | compute_native_function_declaration as compute_native_function_declaration, 9 | ) 10 | from .register_dispatch_key import ( 11 | gen_registration_headers as gen_registration_headers, 12 | gen_registration_helpers as gen_registration_helpers, 13 | RegisterDispatchKey as RegisterDispatchKey, 14 | ) 15 | from .ufunc import ( 16 | compute_ufunc_cpu as compute_ufunc_cpu, 17 | compute_ufunc_cpu_kernel as compute_ufunc_cpu_kernel, 18 | compute_ufunc_cuda as compute_ufunc_cuda, 19 | ) 20 | -------------------------------------------------------------------------------- /torchgen/executorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/executorch/__init__.py -------------------------------------------------------------------------------- /torchgen/executorch/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/executorch/api/__init__.py -------------------------------------------------------------------------------- /torchgen/executorch/api/types/__init__.py: -------------------------------------------------------------------------------- 1 | from .types import * 2 | from .signatures import * # isort:skip 3 | -------------------------------------------------------------------------------- /torchgen/operator_versions/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/operator_versions/__init__.py -------------------------------------------------------------------------------- /torchgen/operator_versions/gen_mobile_upgraders_constant.py: -------------------------------------------------------------------------------- 1 | MOBILE_UPGRADERS_HEADER_DESCRIPTION = """/** 2 | * @generated 3 | * This is an auto-generated file. Please do not modify it by hand. 4 | * To re-generate, please run: 5 | * cd ~/pytorch && python torchgen/operator_versions/gen_mobile_upgraders.py 6 | */ 7 | """ 8 | -------------------------------------------------------------------------------- /torchgen/selective_build/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/selective_build/__init__.py -------------------------------------------------------------------------------- /torchgen/static_runtime/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/static_runtime/__init__.py --------------------------------------------------------------------------------
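To make the overload-naming rule in torchgen/api/meta.py above concrete, here is a minimal sketch of the same transformation; the operator names below are illustrative examples, not taken from this repository:

```
# meta.name() takes the functional overload's schema name and replaces
# "." with "_", producing an identifier that is safe to use in C++.
def meta_style_name(schema_name: str) -> str:
    return schema_name.replace(".", "_")

assert meta_style_name("add.Tensor") == "add_Tensor"  # overloaded name
assert meta_style_name("relu") == "relu"              # no overload suffix
```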