├── .bazelrc
├── .bazelversion
├── .buckconfig.oss
├── .ci
├── caffe2
│ ├── README.md
│ ├── common.sh
│ └── test.sh
├── docker
│ ├── README.md
│ ├── android
│ │ ├── AndroidManifest.xml
│ │ └── build.gradle
│ ├── build.sh
│ ├── build_docker.sh
│ ├── centos-rocm
│ │ └── Dockerfile
│ ├── ci_commit_pins
│ │ ├── triton-rocm.txt
│ │ └── triton.txt
│ ├── common
│ │ ├── common_utils.sh
│ │ ├── install_android.sh
│ │ ├── install_base.sh
│ │ ├── install_cache.sh
│ │ ├── install_clang.sh
│ │ ├── install_cmake.sh
│ │ ├── install_conda.sh
│ │ ├── install_cudnn.sh
│ │ ├── install_db.sh
│ │ ├── install_devtoolset.sh
│ │ ├── install_docs_reqs.sh
│ │ ├── install_gcc.sh
│ │ ├── install_glibc.sh
│ │ ├── install_jni.sh
│ │ ├── install_lcov.sh
│ │ ├── install_linter.sh
│ │ ├── install_ninja.sh
│ │ ├── install_onnx.sh
│ │ ├── install_openmpi.sh
│ │ ├── install_openssl.sh
│ │ ├── install_protobuf.sh
│ │ ├── install_rocm.sh
│ │ ├── install_rocm_magma.sh
│ │ ├── install_swiftshader.sh
│ │ ├── install_thrift.sh
│ │ ├── install_triton.sh
│ │ ├── install_ucc.sh
│ │ ├── install_user.sh
│ │ ├── install_vision.sh
│ │ └── install_vulkan_sdk.sh
│ ├── java
│ │ └── jni.h
│ ├── linter
│ │ └── Dockerfile
│ ├── requirements-ci.txt
│ ├── triton_version.txt
│ ├── ubuntu-cuda
│ │ └── Dockerfile
│ ├── ubuntu-rocm
│ │ ├── .gitignore
│ │ └── Dockerfile
│ └── ubuntu
│ │ └── Dockerfile
├── onnx
│ ├── README.md
│ ├── common.sh
│ └── test.sh
└── pytorch
│ ├── .shellcheckrc
│ ├── README.md
│ ├── build-asan.sh
│ ├── build-mobile.sh
│ ├── build.sh
│ ├── codegen-test.sh
│ ├── common-build.sh
│ ├── common.sh
│ ├── common_utils.sh
│ ├── create_test_cert.py
│ ├── docker-build-test.sh
│ ├── docs-test.sh
│ ├── fake_numpy
│ └── numpy.py
│ ├── macos-build-test.sh
│ ├── macos-build.sh
│ ├── macos-common.sh
│ ├── macos-test.sh
│ ├── multigpu-test.sh
│ ├── perf_test
│ ├── common.sh
│ ├── compare_with_baseline.py
│ ├── get_stats.py
│ ├── test_cpu_speed_mini_sequence_labeler.sh
│ ├── test_cpu_speed_mnist.sh
│ ├── test_cpu_speed_torch.sh
│ ├── test_cpu_speed_torch_tensor.sh
│ ├── test_gpu_speed_cudnn_lstm.sh
│ ├── test_gpu_speed_lstm.sh
│ ├── test_gpu_speed_mlstm.sh
│ ├── test_gpu_speed_mnist.sh
│ ├── test_gpu_speed_word_language_model.sh
│ └── update_commit_hash.py
│ ├── print_sccache_log.py
│ ├── run_glootls_test.sh
│ ├── short-perf-test-cpu.sh
│ ├── short-perf-test-gpu.sh
│ ├── test.sh
│ ├── win-build.sh
│ ├── win-test-helpers
│ ├── build_pytorch.bat
│ ├── choose_runtime_cuda_version.bat
│ ├── installation-helpers
│ │ ├── activate_miniconda3.bat
│ │ ├── install_magma.bat
│ │ ├── install_mkl.bat
│ │ └── install_sccache.bat
│ ├── run_python_nn_smoketests.py
│ ├── setup_pytorch_env.bat
│ ├── test_custom_backend.bat
│ ├── test_custom_script_ops.bat
│ ├── test_distributed.bat
│ ├── test_libtorch.bat
│ ├── test_python_jit_legacy.bat
│ └── test_python_shard.bat
│ └── win-test.sh
├── .clang-format
├── .clang-tidy
├── .cmakelintrc
├── .coveragerc
├── .ctags.d
└── pytorch.ctags
├── .dockerignore
├── .flake8
├── .git-blame-ignore-revs
├── .gitattributes
├── .github
├── ISSUE_TEMPLATE.md
├── ISSUE_TEMPLATE
│ ├── bug-report.yml
│ ├── ci-sev.md
│ ├── config.yml
│ ├── disable-ci-jobs.md
│ ├── documentation.yml
│ ├── feature-request.yml
│ └── pt2-bug-report.yml
├── PULL_REQUEST_TEMPLATE.md
├── actionlint.yaml
├── actions
│ ├── build-android
│ │ └── action.yml
│ ├── calculate-docker-image
│ │ └── action.yml
│ ├── checkout-pytorch
│ │ └── action.yml
│ ├── chown-workspace
│ │ └── action.yml
│ ├── diskspace-cleanup
│ │ └── action.yml
│ ├── download-build-artifacts
│ │ └── action.yml
│ ├── filter-test-configs
│ │ └── action.yml
│ ├── get-workflow-job-id
│ │ └── action.yml
│ ├── setup-linux
│ │ └── action.yml
│ ├── setup-rocm
│ │ └── action.yml
│ ├── setup-win
│ │ └── action.yml
│ ├── teardown-rocm
│ │ └── action.yml
│ ├── teardown-win
│ │ └── action.yml
│ ├── test-pytorch-binary
│ │ └── action.yml
│ └── upload-test-artifacts
│ │ └── action.yml
├── auto_request_review.yml
├── ci_commit_pins
│ ├── audio.txt
│ ├── huggingface.txt
│ ├── multipy.txt
│ ├── text.txt
│ ├── timm.txt
│ ├── torchbench.txt
│ ├── triton.txt
│ ├── vision.txt
│ └── xla.txt
├── labeler.yml
├── merge_rules.yaml
├── pytorch-circleci-labels.yml
├── pytorch-probot.yml
├── regenerate.sh
├── requirements-gha-cache.txt
├── requirements
│ ├── README.md
│ ├── conda-env-Linux-X64
│ ├── conda-env-iOS
│ ├── conda-env-macOS-ARM64
│ ├── conda-env-macOS-X64
│ ├── pip-requirements-iOS.txt
│ ├── pip-requirements-macOS.txt
│ └── regenerate-requirements.txt
├── scripts
│ ├── README.md
│ ├── build_triton_wheel.py
│ ├── check_labels.py
│ ├── collect_ciflow_labels.py
│ ├── comment_on_pr.py
│ ├── convert_lintrunner_annotations_to_github.py
│ ├── ensure_actions_will_cancel.py
│ ├── export_pytorch_labels.py
│ ├── fetch_latest_green_commit.py
│ ├── filter_test_configs.py
│ ├── generate_binary_build_matrix.py
│ ├── generate_ci_workflows.py
│ ├── generate_pytorch_version.py
│ ├── get_workflow_job_id.py
│ ├── github_utils.py
│ ├── gitutils.py
│ ├── gql_mocks.json
│ ├── kill_active_ssh_sessions.ps1
│ ├── label_utils.py
│ ├── lint_native_functions.py
│ ├── on_cancel_merge.py
│ ├── parse_ref.py
│ ├── pr-sanity-check.sh
│ ├── report_git_status.sh
│ ├── rockset_mocks.json
│ ├── run_torchbench.py
│ ├── stop_runner_service.sh
│ ├── test_check_labels.py
│ ├── test_fetch_latest_green_commit.py
│ ├── test_filter_test_configs.py
│ ├── test_gitutils.py
│ ├── test_label_utils.py
│ ├── test_trymerge.py
│ ├── test_tryrebase.py
│ ├── trymerge.py
│ ├── trymerge_explainer.py
│ ├── tryrebase.py
│ ├── update_commit_hashes.py
│ └── wait_for_ssh_to_drain.ps1
└── templates
│ ├── common.yml.j2
│ ├── linux_binary_build_workflow.yml.j2
│ ├── macos_binary_build_workflow.yml.j2
│ ├── upload.yml.j2
│ └── windows_binary_build_workflow.yml.j2
├── .gitignore
├── .gitmodules
├── .isort.cfg
├── .lintrunner.toml
├── .lldbinit
├── BUILD.bazel
├── LICENSE
├── README.md
├── SECURITY.md
├── benchmarks
├── README.md
├── compare-fastrnn-results.py
├── compare.sh
├── cpp
│ ├── CMakeLists.txt
│ ├── convolution.cpp
│ └── tensorexpr
│ │ ├── CMakeLists.txt
│ │ ├── bench_approx.cpp
│ │ ├── bench_batchnorm.cpp
│ │ ├── bench_compile.cpp
│ │ ├── bench_concat.cpp
│ │ ├── bench_fuser_overhead.cpp
│ │ ├── bench_gemm.cpp
│ │ ├── bench_kernels.cpp
│ │ ├── bench_ops.py
│ │ ├── bench_parallel.cpp
│ │ ├── bench_prefix_sum.cpp
│ │ ├── bench_reduce.cpp
│ │ ├── bench_signed_log1p.cpp
│ │ └── main.cpp
├── distributed
│ ├── ddp
│ │ ├── README.md
│ │ ├── benchmark.py
│ │ └── diff.py
│ ├── pipeline
│ │ ├── benchmark_dataset.py
│ │ └── pipe.py
│ └── rpc
│ │ ├── parameter_server
│ │ ├── README.md
│ │ ├── configurations
│ │ │ ├── data_configurations.json
│ │ │ └── model_configurations.json
│ │ ├── data
│ │ │ ├── DummyData.py
│ │ │ └── __init__.py
│ │ ├── launcher.py
│ │ ├── metrics
│ │ │ ├── CPUMetric.py
│ │ │ ├── CUDAMetric.py
│ │ │ ├── MetricBase.py
│ │ │ ├── MetricsLogger.py
│ │ │ └── ProcessedMetricsPrinter.py
│ │ ├── models
│ │ │ ├── DummyModel.py
│ │ │ └── __init__.py
│ │ ├── server
│ │ │ ├── __init__.py
│ │ │ └── server.py
│ │ ├── trainer
│ │ │ ├── __init__.py
│ │ │ ├── criterions.py
│ │ │ ├── ddp_models.py
│ │ │ ├── hook_states.py
│ │ │ ├── hooks.py
│ │ │ ├── iteration_steps.py
│ │ │ ├── preprocess_data.py
│ │ │ └── trainer.py
│ │ └── utils.py
│ │ └── rl
│ │ ├── README.md
│ │ ├── agent.py
│ │ ├── coordinator.py
│ │ ├── launcher.py
│ │ └── observer.py
├── dynamo
│ ├── Makefile
│ ├── README.md
│ ├── __init__.py
│ ├── all_torchbench_models_list.txt
│ ├── benchmarks.py
│ ├── check_accuracy.py
│ ├── check_csv.py
│ ├── check_graph_breaks.py
│ ├── check_hf_bert_perf_csv.py
│ ├── check_memory_compression_ratio.py
│ ├── ci_expected_accuracy
│ │ ├── inductor_huggingface_dynamic_inference.csv
│ │ ├── inductor_huggingface_dynamic_training.csv
│ │ ├── inductor_huggingface_inference.csv
│ │ ├── inductor_huggingface_training.csv
│ │ ├── inductor_timm_dynamic_inference.csv
│ │ ├── inductor_timm_dynamic_training.csv
│ │ ├── inductor_timm_inference.csv
│ │ ├── inductor_timm_training.csv
│ │ ├── inductor_torchbench_dynamic_inference.csv
│ │ ├── inductor_torchbench_dynamic_training.csv
│ │ ├── inductor_torchbench_inference.csv
│ │ ├── inductor_torchbench_training.csv
│ │ └── update_expected.py
│ ├── combine_csv.py
│ ├── common.py
│ ├── dist_util.py
│ ├── distributed.py
│ ├── expected_ci_perf_inductor_torchbench.csv
│ ├── huggingface.py
│ ├── huggingface_models_list.txt
│ ├── huggingface_models_list_cpu.txt
│ ├── microbenchmarks
│ │ ├── __init__.py
│ │ ├── bench_mm_fusion.py
│ │ ├── benchmark_helper.py
│ │ ├── inductor_bmm.py
│ │ ├── inductor_mm.py
│ │ ├── matmul_relu.py
│ │ ├── microbench.py
│ │ ├── model.py
│ │ ├── operator_inp_logs
│ │ │ ├── hf_train
│ │ │ │ ├── AlbertForMaskedLM_training.txt
│ │ │ │ ├── AlbertForQuestionAnswering_training.txt
│ │ │ │ ├── AllenaiLongformerBase_training.txt
│ │ │ │ ├── BartForCausalLM_training.txt
│ │ │ │ ├── BartForConditionalGeneration_training.txt
│ │ │ │ ├── BertForMaskedLM_training.txt
│ │ │ │ ├── BertForQuestionAnswering_training.txt
│ │ │ │ ├── BigBird_training.txt
│ │ │ │ ├── BlenderbotSmallForCausalLM_training.txt
│ │ │ │ ├── BlenderbotSmallForConditionalGeneration_training.txt
│ │ │ │ ├── CamemBert_training.txt
│ │ │ │ ├── DebertaForMaskedLM_training.txt
│ │ │ │ ├── DebertaForQuestionAnswering_training.txt
│ │ │ │ ├── DebertaV2ForMaskedLM_training.txt
│ │ │ │ ├── DebertaV2ForQuestionAnswering_training.txt
│ │ │ │ ├── DistilBertForMaskedLM_training.txt
│ │ │ │ ├── DistilBertForQuestionAnswering_training.txt
│ │ │ │ ├── DistillGPT2_training.txt
│ │ │ │ ├── ElectraForCausalLM_training.txt
│ │ │ │ ├── ElectraForQuestionAnswering_training.txt
│ │ │ │ ├── GPT2ForSequenceClassification_training.txt
│ │ │ │ ├── GPTNeoForCausalLM_training.txt
│ │ │ │ ├── GPTNeoForSequenceClassification_training.txt
│ │ │ │ ├── GoogleFnet_training.txt
│ │ │ │ ├── LayoutLMForMaskedLM_training.txt
│ │ │ │ ├── LayoutLMForSequenceClassification_training.txt
│ │ │ │ ├── M2M100ForConditionalGeneration_training.txt
│ │ │ │ ├── MBartForCausalLM_training.txt
│ │ │ │ ├── MBartForConditionalGeneration_training.txt
│ │ │ │ ├── MegatronBertForCausalLM_training.txt
│ │ │ │ ├── MegatronBertForQuestionAnswering_training.txt
│ │ │ │ ├── MobileBertForMaskedLM_training.txt
│ │ │ │ ├── MobileBertForQuestionAnswering_training.txt
│ │ │ │ ├── OPTForCausalLM_training.txt
│ │ │ │ ├── PLBartForCausalLM_training.txt
│ │ │ │ ├── PLBartForConditionalGeneration_training.txt
│ │ │ │ ├── PegasusForCausalLM_training.txt
│ │ │ │ ├── PegasusForConditionalGeneration_training.txt
│ │ │ │ ├── RobertaForCausalLM_training.txt
│ │ │ │ ├── RobertaForQuestionAnswering_training.txt
│ │ │ │ ├── Speech2Text2ForCausalLM_training.txt
│ │ │ │ ├── TrOCRForCausalLM_training.txt
│ │ │ │ ├── XGLMForCausalLM_training.txt
│ │ │ │ ├── XLNetLMHeadModel_training.txt
│ │ │ │ └── YituTechConvBert_training.txt
│ │ │ ├── timm_train
│ │ │ │ ├── adv_inception_v3_training.txt
│ │ │ │ ├── beit_base_patch16_224_training.txt
│ │ │ │ ├── botnet26t_256_training.txt
│ │ │ │ ├── cait_m36_384_training.txt
│ │ │ │ ├── coat_lite_mini_training.txt
│ │ │ │ ├── convmixer_768_32_training.txt
│ │ │ │ ├── convnext_base_training.txt
│ │ │ │ ├── crossvit_9_240_training.txt
│ │ │ │ ├── cspdarknet53_training.txt
│ │ │ │ ├── deit_base_distilled_patch16_224_training.txt
│ │ │ │ ├── densenet121_training.txt
│ │ │ │ ├── dla102_training.txt
│ │ │ │ ├── dm_nfnet_f0_training.txt
│ │ │ │ ├── dpn107_training.txt
│ │ │ │ ├── eca_botnext26ts_256_training.txt
│ │ │ │ ├── eca_halonext26ts_training.txt
│ │ │ │ ├── ecaresnet101d_training.txt
│ │ │ │ ├── ese_vovnet19b_dw_training.txt
│ │ │ │ ├── fbnetc_100_training.txt
│ │ │ │ ├── fbnetv3_b_training.txt
│ │ │ │ ├── gernet_l_training.txt
│ │ │ │ ├── ghostnet_100_training.txt
│ │ │ │ ├── gluon_inception_v3_training.txt
│ │ │ │ ├── gluon_senet154_training.txt
│ │ │ │ ├── gluon_xception65_training.txt
│ │ │ │ ├── gmixer_24_224_training.txt
│ │ │ │ ├── gmlp_s16_224_training.txt
│ │ │ │ ├── hardcorenas_a_training.txt
│ │ │ │ ├── hrnet_w18_training.txt
│ │ │ │ ├── inception_v3_training.txt
│ │ │ │ ├── jx_nest_base_training.txt
│ │ │ │ ├── lcnet_050_training.txt
│ │ │ │ ├── legacy_senet154_training.txt
│ │ │ │ ├── levit_128_training.txt
│ │ │ │ ├── mixer_b16_224_training.txt
│ │ │ │ ├── mixnet_l_training.txt
│ │ │ │ ├── mnasnet_100_training.txt
│ │ │ │ ├── mobilenetv2_100_training.txt
│ │ │ │ ├── mobilenetv3_large_100_training.txt
│ │ │ │ ├── mobilevit_s_training.txt
│ │ │ │ ├── nasnetalarge_training.txt
│ │ │ │ ├── nfnet_l0_training.txt
│ │ │ │ ├── pit_b_224_training.txt
│ │ │ │ ├── pnasnet5large_training.txt
│ │ │ │ ├── poolformer_m36_training.txt
│ │ │ │ ├── regnety_002_training.txt
│ │ │ │ ├── repvgg_a2_training.txt
│ │ │ │ ├── res2net101_26w_4s_training.txt
│ │ │ │ ├── res2net50_14w_8s_training.txt
│ │ │ │ ├── res2next50_training.txt
│ │ │ │ ├── resmlp_12_224_training.txt
│ │ │ │ ├── resnest101e_training.txt
│ │ │ │ ├── resnet18_training.txt
│ │ │ │ ├── rexnet_100_training.txt
│ │ │ │ ├── sebotnet33ts_256_training.txt
│ │ │ │ ├── selecsls42b_training.txt
│ │ │ │ ├── spnasnet_100_training.txt
│ │ │ │ ├── swin_base_patch4_window7_224_training.txt
│ │ │ │ ├── swsl_resnext101_32x16d_training.txt
│ │ │ │ ├── tf_efficientnet_b0_training.txt
│ │ │ │ ├── tf_mixnet_l_training.txt
│ │ │ │ ├── tinynet_a_training.txt
│ │ │ │ ├── tnt_s_patch16_224_training.txt
│ │ │ │ ├── twins_pcpvt_base_training.txt
│ │ │ │ ├── visformer_small_training.txt
│ │ │ │ ├── vit_base_patch16_224_training.txt
│ │ │ │ └── volo_d1_224_training.txt
│ │ │ └── torchbench_train
│ │ │ │ ├── BERT_pytorch_training.txt
│ │ │ │ ├── Background_Matting_training.txt
│ │ │ │ ├── LearningToPaint_training.txt
│ │ │ │ ├── Super_SloMo_training.txt
│ │ │ │ ├── alexnet_training.txt
│ │ │ │ ├── attention_is_all_you_need_pytorch_training.txt
│ │ │ │ ├── dcgan_training.txt
│ │ │ │ ├── densenet121_training.txt
│ │ │ │ ├── fambench_dlrm_training.txt
│ │ │ │ ├── fastNLP_Bert_training.txt
│ │ │ │ ├── hf_Albert_training.txt
│ │ │ │ ├── hf_Bart_training.txt
│ │ │ │ ├── hf_Bert_training.txt
│ │ │ │ ├── hf_BigBird_training.txt
│ │ │ │ ├── hf_DistilBert_training.txt
│ │ │ │ ├── hf_GPT2_training.txt
│ │ │ │ ├── hf_Longformer_training.txt
│ │ │ │ ├── maml_omniglot_training.txt
│ │ │ │ ├── mnasnet1_0_training.txt
│ │ │ │ ├── mobilenet_v2_training.txt
│ │ │ │ ├── mobilenet_v3_large_training.txt
│ │ │ │ ├── nvidia_deeprecommender_training.txt
│ │ │ │ ├── pytorch_CycleGAN_and_pix2pix_training.txt
│ │ │ │ ├── pytorch_stargan_training.txt
│ │ │ │ ├── pytorch_struct_training.txt
│ │ │ │ ├── pytorch_unet_training.txt
│ │ │ │ ├── resnet18_training.txt
│ │ │ │ ├── resnet50_training.txt
│ │ │ │ ├── resnext50_32x4d_training.txt
│ │ │ │ ├── shufflenet_v2_x1_0_training.txt
│ │ │ │ ├── speech_transformer_training.txt
│ │ │ │ ├── squeezenet1_1_training.txt
│ │ │ │ ├── timm_efficientdet_training.txt
│ │ │ │ ├── timm_efficientnet_training.txt
│ │ │ │ ├── timm_nfnet_training.txt
│ │ │ │ ├── timm_regnet_training.txt
│ │ │ │ ├── timm_resnest_training.txt
│ │ │ │ ├── timm_vision_transformer_training.txt
│ │ │ │ ├── timm_vovnet_training.txt
│ │ │ │ ├── tts_angular_training.txt
│ │ │ │ ├── vgg16_training.txt
│ │ │ │ ├── vision_maskrcnn_training.txt
│ │ │ │ └── yolov3_training.txt
│ │ ├── operator_inp_utils.py
│ │ ├── operatorbench.py
│ │ └── utils.py
│ ├── parse_logs.py
│ ├── run_all.sh
│ ├── run_delta.sh
│ ├── runner.py
│ ├── summarize_perf.py
│ ├── test.py
│ ├── timm_models.py
│ ├── timm_models_list.txt
│ ├── timm_models_list_cpu.txt
│ ├── torchbench.py
│ ├── torchbench_models_list.txt
│ ├── torchbench_models_list_cpu.txt
│ └── training_loss.py
├── fastrnns
│ ├── README.md
│ ├── __init__.py
│ ├── bench.py
│ ├── cells.py
│ ├── conftest.py
│ ├── custom_lstms.py
│ ├── factory.py
│ ├── fuser.py
│ ├── profile.py
│ ├── runner.py
│ ├── scratch.py
│ ├── test.py
│ └── test_bench.py
├── framework_overhead_benchmark
│ ├── C2Module.py
│ ├── SimpleAddModule.py
│ ├── framework_overhead_benchmark.py
│ ├── pt_wrapper_module.py
│ └── utils.py
├── functional_autograd_benchmark
│ ├── README.md
│ ├── audio_text_models.py
│ ├── compare.py
│ ├── functional_autograd_benchmark.py
│ ├── ppl_models.py
│ ├── torchaudio_models.py
│ ├── torchvision_models.py
│ ├── utils.py
│ └── vision_models.py
├── fuser
│ ├── plot_speedups.py
│ └── run_benchmarks.py
├── instruction_counts
│ ├── README.md
│ ├── applications
│ │ ├── __init__.py
│ │ └── ci.py
│ ├── core
│ │ ├── __init__.py
│ │ ├── api.py
│ │ ├── expand.py
│ │ ├── types.py
│ │ └── utils.py
│ ├── definitions
│ │ ├── __init__.py
│ │ ├── setup.py
│ │ └── standard.py
│ ├── execution
│ │ ├── __init__.py
│ │ ├── runner.py
│ │ └── work.py
│ ├── main.py
│ └── worker
│ │ ├── __init__.py
│ │ └── main.py
├── nested
│ └── nested_bmm_bench.py
├── operator_benchmark
│ ├── README.md
│ ├── __init__.py
│ ├── benchmark_all_other_test.py
│ ├── benchmark_all_quantized_test.py
│ ├── benchmark_all_test.py
│ ├── benchmark_caffe2.py
│ ├── benchmark_core.py
│ ├── benchmark_pytorch.py
│ ├── benchmark_runner.py
│ ├── benchmark_test_generator.py
│ ├── benchmark_utils.py
│ ├── c2
│ │ ├── __init__.py
│ │ ├── add_test.py
│ │ ├── batch_box_cox_test.py
│ │ ├── batch_gather_test.py
│ │ ├── clip_ranges_test.py
│ │ ├── concat_test.py
│ │ ├── matmul_test.py
│ │ ├── quantile_op_test.py
│ │ └── replace_nan_test.py
│ ├── common
│ │ ├── __init__.py
│ │ ├── repeat_benchmark.py
│ │ └── tests
│ │ │ ├── add_ops_list_test.py
│ │ │ ├── c2_cpu_gpu_forward_backward_test.py
│ │ │ ├── jit_forward_test.py
│ │ │ ├── pt_backward_test.py
│ │ │ ├── pt_configs_list_test.py
│ │ │ ├── pt_cpu_gpu_forward_backward_test.py
│ │ │ └── random_sample_test.py
│ ├── operator_benchmark.py
│ ├── pt
│ │ ├── __init__.py
│ │ ├── add_test.py
│ │ ├── ao_sparsifier_test.py
│ │ ├── as_strided_test.py
│ │ ├── batchnorm_test.py
│ │ ├── binary_test.py
│ │ ├── bmm_test.py
│ │ ├── cat_test.py
│ │ ├── channel_shuffle_test.py
│ │ ├── chunk_test.py
│ │ ├── clip_ranges_test.py
│ │ ├── configs.py
│ │ ├── conv_test.py
│ │ ├── diag_test.py
│ │ ├── embeddingbag_test.py
│ │ ├── fill_test.py
│ │ ├── gather_test.py
│ │ ├── gelu_test.py
│ │ ├── groupnorm_test.py
│ │ ├── hardsigmoid_test.py
│ │ ├── hardswish_test.py
│ │ ├── index_select_test.py
│ │ ├── instancenorm_test.py
│ │ ├── interpolate_test.py
│ │ ├── layernorm_test.py
│ │ ├── linear_prepack_fp16_test.py
│ │ ├── linear_test.py
│ │ ├── linear_unpack_fp16_test.py
│ │ ├── matmul_test.py
│ │ ├── matrix_mult_test.py
│ │ ├── nan_to_num_test.py
│ │ ├── pool_test.py
│ │ ├── qactivation_test.py
│ │ ├── qarithmetic_test.py
│ │ ├── qatembedding_ops_test.py
│ │ ├── qbatchnorm_test.py
│ │ ├── qcat_test.py
│ │ ├── qcomparators_test.py
│ │ ├── qconv_test.py
│ │ ├── qembedding_bag_lookups_test.py
│ │ ├── qembedding_pack_test.py
│ │ ├── qembeddingbag_test.py
│ │ ├── qgroupnorm_test.py
│ │ ├── qinstancenorm_test.py
│ │ ├── qinterpolate_test.py
│ │ ├── qlayernorm_test.py
│ │ ├── qlinear_test.py
│ │ ├── qobserver_test.py
│ │ ├── qpool_test.py
│ │ ├── qrnn_test.py
│ │ ├── qtensor_method_test.py
│ │ ├── quantization_test.py
│ │ ├── qunary_test.py
│ │ ├── remainder_test.py
│ │ ├── softmax_test.py
│ │ ├── split_test.py
│ │ ├── stack_test.py
│ │ ├── sum_test.py
│ │ ├── tensor_to_test.py
│ │ └── unary_test.py
│ └── pt_extension
│ │ ├── cpp_extension_test.py
│ │ ├── extension.cpp
│ │ └── setup.py
├── overrides_benchmark
│ ├── README.md
│ ├── bench.py
│ ├── common.py
│ └── pyspybench.py
├── profiler_benchmark
│ ├── profiler_bench.py
│ └── resnet_memory_profiler.py
├── record_function_benchmark
│ └── record_function_bench.py
├── serialization
│ ├── nested_annotation_str.py
│ └── simple_measurement.py
├── sparse
│ ├── README.md
│ ├── __init__.py
│ ├── dlmc
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── matmul_bench.py
│ │ ├── test.sh
│ │ └── utils.py
│ ├── spmm.py
│ ├── spmv.py
│ ├── test_csr.sh
│ └── utils.py
├── static_runtime
│ ├── CMakeLists.txt
│ ├── deep_wide_pt.cc
│ ├── deep_wide_pt.h
│ ├── deep_wide_pt_bench.cc
│ ├── test_cpu_fusion.cc
│ ├── test_generated_ops.cc
│ ├── test_static_module.cc
│ ├── test_static_runtime.cc
│ ├── test_utils.cc
│ └── test_utils.h
├── tensorexpr
│ ├── HowToRun.md
│ ├── __main__.py
│ ├── attention.py
│ ├── benchmark.py
│ ├── broadcast.py
│ ├── concat.py
│ ├── conv.py
│ ├── elementwise.py
│ ├── matmul.py
│ ├── microbenchmarks.py
│ ├── nnc.png
│ ├── normalization.py
│ ├── pooling.py
│ ├── pt_engine.py
│ ├── reduction.py
│ ├── rnn_eltwise.py
│ ├── softmax.py
│ ├── swish.py
│ └── tensor_engine.py
├── transformer
│ ├── better_transformer_vs_mha_functional.py
│ ├── sdp.py
│ └── sdp_backwards.py
└── upload_scribe.py
├── binaries
├── CMakeLists.txt
├── aot_model_compiler.cc
├── at_launch_benchmark.cc
├── bench_gen
│ └── bench_gen.py
├── benchmark_args.h
├── benchmark_helper.cc
├── benchmark_helper.h
├── caffe2_benchmark.cc
├── compare_models_torch.cc
├── convert_and_benchmark.cc
├── convert_caffe_image_db.cc
├── convert_db.cc
├── convert_encoded_to_raw_leveldb.cc
├── convert_image_to_tensor.cc
├── core_overhead_benchmark.cc
├── core_overhead_benchmark_gpu.cc
├── db_throughput.cc
├── dump_operator_names.cc
├── inspect_gpu.cc
├── intra_inter_benchmark.cc
├── lite_interpreter_model_load.cc
├── load_benchmark_torch.cc
├── make_cifar_db.cc
├── make_image_db.cc
├── make_mnist_db.cc
├── optimize_for_mobile.cc
├── parallel_info.cc
├── predictor_verifier.cc
├── print_core_object_sizes_gpu.cc
├── print_registered_core_operators.cc
├── record_function_benchmark.cc
├── run_plan.cc
├── run_plan_mpi.cc
├── speed_benchmark.cc
├── speed_benchmark_torch.cc
├── split_db.cc
├── tsv_2_proto.cc
├── tutorial_blob.cc
└── zmq_feeder.cc
├── third_party
├── BUCK.oss
├── BUILD
├── LICENSES_BUNDLED.txt
├── METADATA.bzl
├── README.md
├── build_bundled.py
├── cuda.BUILD
├── cudnn.BUILD
├── cutlass.BUILD
├── eigen.BUILD
├── fmt.BUILD
├── foxi.BUILD
├── generate-cpuinfo-wrappers.py
├── generate-xnnpack-wrappers.py
├── glog.buck.bzl
├── gloo.BUILD
├── ideep.BUILD
├── kineto.BUILD
├── kineto.buck.bzl
├── miniz-2.1.0
│ ├── BUILD.bazel
│ ├── ChangeLog.md
│ ├── LICENSE
│ ├── examples
│ │ ├── example1.c
│ │ ├── example2.c
│ │ ├── example3.c
│ │ ├── example4.c
│ │ ├── example5.c
│ │ └── example6.c
│ ├── miniz.c
│ ├── miniz.h
│ └── readme.md
├── mkl-dnn.BUILD
├── mkl.BUILD
├── mkl_headers.BUILD
├── nvfuser
│ ├── CMakeLists.txt
│ ├── benchmark
│ │ ├── CMakeLists.txt
│ │ ├── batch_norm_channels_first.cpp
│ │ ├── batch_norm_channels_first_backward.cpp
│ │ ├── batch_norm_channels_last.cpp
│ │ ├── batch_norm_channels_last_backward.cpp
│ │ ├── bert.cpp
│ │ ├── broadcast.cpp
│ │ ├── gelu_backward.cpp
│ │ ├── heuristic_cache.cpp
│ │ ├── heuristic_lookup.cpp
│ │ ├── instance_norm.cpp
│ │ ├── layer_norm.cpp
│ │ ├── layer_norm_backward.cpp
│ │ ├── lstm_cell.cpp
│ │ ├── main.cpp
│ │ ├── matmul.cpp
│ │ ├── reduction.cpp
│ │ ├── rms_norm.cpp
│ │ ├── rms_norm_backward.cpp
│ │ ├── scale_bias_relu.cpp
│ │ ├── shape_inference.cpp
│ │ ├── softmax.cpp
│ │ ├── softmax_backward.cpp
│ │ ├── softmax_dropout.cpp
│ │ ├── timm.cpp
│ │ ├── transpose.cpp
│ │ ├── utils.cpp
│ │ └── utils.h
│ ├── csrc
│ │ ├── arith.cpp
│ │ ├── arith.h
│ │ ├── codegen.cpp
│ │ ├── codegen.h
│ │ ├── compute_at.cpp
│ │ ├── compute_at.h
│ │ ├── compute_at_map.cpp
│ │ ├── compute_at_map.h
│ │ ├── contiguity.cpp
│ │ ├── contiguity.h
│ │ ├── disjoint_set.h
│ │ ├── dispatch.cpp
│ │ ├── dispatch.h
│ │ ├── docs
│ │ │ ├── .gitignore
│ │ │ ├── documentation.h
│ │ │ ├── fuser.doxygen
│ │ │ ├── images
│ │ │ │ └── ir_architecture.png
│ │ │ └── main_page.md
│ │ ├── dynamic_type.h
│ │ ├── evaluator_common.cpp
│ │ ├── evaluator_common.h
│ │ ├── executor.cpp
│ │ ├── executor.h
│ │ ├── executor_kernel_arg.cpp
│ │ ├── executor_kernel_arg.h
│ │ ├── executor_launch_params.cpp
│ │ ├── executor_launch_params.h
│ │ ├── executor_utils.cpp
│ │ ├── executor_utils.h
│ │ ├── expr_evaluator.cpp
│ │ ├── expr_evaluator.h
│ │ ├── fusion.cpp
│ │ ├── fusion.h
│ │ ├── fusion_segmenter.cpp
│ │ ├── fusion_segmenter.h
│ │ ├── graph_fuser.cpp
│ │ ├── grouped_reduction.cpp
│ │ ├── grouped_reduction.h
│ │ ├── index_compute.cpp
│ │ ├── index_compute.h
│ │ ├── inlining.cpp
│ │ ├── inlining.h
│ │ ├── instrumentation.cpp
│ │ ├── instrumentation.h
│ │ ├── ir_all_nodes.h
│ │ ├── ir_base_nodes.cpp
│ │ ├── ir_base_nodes.h
│ │ ├── ir_builder.cpp
│ │ ├── ir_builder.h
│ │ ├── ir_cloner.cpp
│ │ ├── ir_cloner.h
│ │ ├── ir_container.cpp
│ │ ├── ir_container.h
│ │ ├── ir_graphviz.cpp
│ │ ├── ir_graphviz.h
│ │ ├── ir_interface_nodes.h
│ │ ├── ir_internal_nodes.h
│ │ ├── ir_iostream.cpp
│ │ ├── ir_iostream.h
│ │ ├── ir_nodes.cpp
│ │ ├── ir_printer.h
│ │ ├── ir_utils.cpp
│ │ ├── ir_utils.h
│ │ ├── iter_visitor.cpp
│ │ ├── iter_visitor.h
│ │ ├── kernel.cpp
│ │ ├── kernel.h
│ │ ├── kernel_cache.cpp
│ │ ├── kernel_cache.h
│ │ ├── kernel_expr_evaluator.cpp
│ │ ├── kernel_expr_evaluator.h
│ │ ├── kernel_ir.cpp
│ │ ├── kernel_ir.h
│ │ ├── kernel_ir_dispatch.cpp
│ │ ├── kernel_ir_dispatch.h
│ │ ├── lower2device.cpp
│ │ ├── lower2device.h
│ │ ├── lower_alias_memory.cpp
│ │ ├── lower_alias_memory.h
│ │ ├── lower_allocation.cpp
│ │ ├── lower_allocation.h
│ │ ├── lower_bank_conflict.cpp
│ │ ├── lower_bank_conflict.h
│ │ ├── lower_divisible_split.cpp
│ │ ├── lower_divisible_split.h
│ │ ├── lower_double_buffer.cpp
│ │ ├── lower_double_buffer.h
│ │ ├── lower_expr_sort.cpp
│ │ ├── lower_expr_sort.h
│ │ ├── lower_fused_reduction.cpp
│ │ ├── lower_fused_reduction.h
│ │ ├── lower_fusion_simplifier.cpp
│ │ ├── lower_fusion_simplifier.h
│ │ ├── lower_index.cpp
│ │ ├── lower_index.h
│ │ ├── lower_index_compute.cpp
│ │ ├── lower_index_compute.h
│ │ ├── lower_index_hoist.cpp
│ │ ├── lower_index_hoist.h
│ │ ├── lower_insert_syncs.cpp
│ │ ├── lower_insert_syncs.h
│ │ ├── lower_instrument.cpp
│ │ ├── lower_instrument.h
│ │ ├── lower_loops.cpp
│ │ ├── lower_loops.h
│ │ ├── lower_magic_zero.cpp
│ │ ├── lower_magic_zero.h
│ │ ├── lower_misaligned_vectorization.cpp
│ │ ├── lower_misaligned_vectorization.h
│ │ ├── lower_predicate.cpp
│ │ ├── lower_predicate.h
│ │ ├── lower_predicate_elimination.cpp
│ │ ├── lower_predicate_elimination.h
│ │ ├── lower_replace_size.cpp
│ │ ├── lower_replace_size.h
│ │ ├── lower_shift.cpp
│ │ ├── lower_shift.h
│ │ ├── lower_sync_information.cpp
│ │ ├── lower_sync_information.h
│ │ ├── lower_thread_predicate.cpp
│ │ ├── lower_thread_predicate.h
│ │ ├── lower_trivial_broadcast.cpp
│ │ ├── lower_trivial_broadcast.h
│ │ ├── lower_trivial_reductions.cpp
│ │ ├── lower_trivial_reductions.h
│ │ ├── lower_unroll.cpp
│ │ ├── lower_unroll.h
│ │ ├── lower_utils.cpp
│ │ ├── lower_utils.h
│ │ ├── lower_validation.cpp
│ │ ├── lower_validation.h
│ │ ├── lower_warp_reduce.cpp
│ │ ├── lower_warp_reduce.h
│ │ ├── manager.cpp
│ │ ├── manager.h
│ │ ├── maxinfo_propagator.cpp
│ │ ├── maxinfo_propagator.h
│ │ ├── mma_type.cpp
│ │ ├── mma_type.h
│ │ ├── mutator.cpp
│ │ ├── mutator.h
│ │ ├── non_divisible_split.cpp
│ │ ├── non_divisible_split.h
│ │ ├── ops
│ │ │ ├── alias.cpp
│ │ │ ├── alias.h
│ │ │ ├── all_ops.h
│ │ │ ├── composite.cpp
│ │ │ ├── composite.h
│ │ │ ├── normalization.cpp
│ │ │ └── normalization.h
│ │ ├── parallel_dimension_map.cpp
│ │ ├── parallel_dimension_map.h
│ │ ├── parallel_type_bitmap.cpp
│ │ ├── parallel_type_bitmap.h
│ │ ├── parser.cpp
│ │ ├── parser.h
│ │ ├── partial_split_map.cpp
│ │ ├── partial_split_map.h
│ │ ├── partition.cpp
│ │ ├── partition.h
│ │ ├── predicate_compute.cpp
│ │ ├── predicate_compute.h
│ │ ├── python_frontend
│ │ │ ├── README.md
│ │ │ ├── fusion_cache.cpp
│ │ │ ├── fusion_cache.h
│ │ │ ├── fusion_definition.cpp
│ │ │ ├── fusion_definition.h
│ │ │ ├── fusion_interface.cpp
│ │ │ ├── fusion_interface.h
│ │ │ ├── fusion_record.h
│ │ │ ├── python_bindings.cpp
│ │ │ ├── python_bindings.h
│ │ │ ├── python_bindings_extension.cpp
│ │ │ └── test
│ │ │ │ ├── test_nvfuser_fusion_cache.cpp
│ │ │ │ ├── test_nvfuser_fusion_definition.cpp
│ │ │ │ └── test_nvfuser_fusion_record.cpp
│ │ ├── register_interface.cpp
│ │ ├── register_interface.h
│ │ ├── root_domain_map.cpp
│ │ ├── root_domain_map.h
│ │ ├── scheduler
│ │ │ ├── all_schedulers.h
│ │ │ ├── compile_time_info.h
│ │ │ ├── debug_utils.h
│ │ │ ├── heuristic.h
│ │ │ ├── matmul.cpp
│ │ │ ├── matmul.h
│ │ │ ├── mma_utils.cpp
│ │ │ ├── mma_utils.h
│ │ │ ├── normalization.cpp
│ │ │ ├── normalization.h
│ │ │ ├── pointwise.cpp
│ │ │ ├── pointwise.h
│ │ │ ├── pointwise_heuristic.h
│ │ │ ├── pointwise_utils.cpp
│ │ │ ├── pointwise_utils.h
│ │ │ ├── reduction.cpp
│ │ │ ├── reduction.h
│ │ │ ├── reduction_heuristic.h
│ │ │ ├── reduction_utils.cpp
│ │ │ ├── reduction_utils.h
│ │ │ ├── registry.cpp
│ │ │ ├── registry.h
│ │ │ ├── transpose.cpp
│ │ │ ├── transpose.h
│ │ │ ├── transpose_heuristic.h
│ │ │ ├── utils.cpp
│ │ │ ├── utils.h
│ │ │ ├── vectorize_helper.cpp
│ │ │ └── vectorize_helper.h
│ │ ├── tensor_view.cpp
│ │ ├── transform_iter.cpp
│ │ ├── transform_iter.h
│ │ ├── transform_replay.cpp
│ │ ├── transform_replay.h
│ │ ├── transform_rfactor.cpp
│ │ ├── transform_rfactor.h
│ │ ├── transform_view.cpp
│ │ ├── transform_view.h
│ │ ├── type.cpp
│ │ ├── type.h
│ │ ├── type_inference.cpp
│ │ ├── type_inference.h
│ │ ├── type_promotion.cpp
│ │ ├── type_promotion.h
│ │ ├── utils.cpp
│ │ ├── utils.h
│ │ └── vectorization_info.h
│ ├── examples
│ │ ├── sinh_extension
│ │ │ ├── README.md
│ │ │ ├── main.cpp
│ │ │ ├── setup.py
│ │ │ └── test.py
│ │ └── sinh_libtorch
│ │ │ ├── CMakeLists.txt
│ │ │ ├── README.md
│ │ │ └── main.cpp
│ ├── python
│ │ └── __init__.py
│ ├── python_tests
│ │ ├── __init__.py
│ │ ├── test_dynamo.py
│ │ ├── test_python_frontend.py
│ │ └── test_torchscript.py
│ ├── runtime
│ │ ├── array.cu
│ │ ├── array_rocm.cu
│ │ ├── bf16_support.cu
│ │ ├── bf16_support_rocm.cu
│ │ ├── block_reduction.cu
│ │ ├── block_sync_atomic.cu
│ │ ├── block_sync_default.cu
│ │ ├── block_sync_default_rocm.cu
│ │ ├── broadcast.cu
│ │ ├── fp16_support.cu
│ │ ├── fused_reduction.cu
│ │ ├── fused_welford_helper.cu
│ │ ├── fused_welford_impl.cu
│ │ ├── grid_broadcast.cu
│ │ ├── grid_reduction.cu
│ │ ├── grid_sync.cu
│ │ ├── helpers.cu
│ │ ├── index_utils.cu
│ │ ├── memory.cu
│ │ ├── random_numbers.cu
│ │ ├── swizzle.cu
│ │ ├── tensor.cu
│ │ ├── tensorcore.cu
│ │ ├── tuple.cu
│ │ ├── type_traits.cu
│ │ ├── warp.cu
│ │ ├── warp_rocm.cu
│ │ └── welford.cu
│ ├── test
│ │ ├── test_gpu1.cpp
│ │ ├── test_gpu2.cpp
│ │ ├── test_gpu3.cpp
│ │ ├── test_gpu_fused_reduction.cpp
│ │ ├── test_gpu_rng.cu
│ │ ├── test_gpu_shift.cpp
│ │ ├── test_gpu_tensor_factories.cpp
│ │ ├── test_gpu_tensorcore.cpp
│ │ ├── test_gpu_transpose.cpp
│ │ ├── test_gpu_utils.cpp
│ │ ├── test_gpu_validator.h
│ │ ├── test_gpu_view.cpp
│ │ └── test_utils.h
│ └── tools
│ │ └── stringify_file.py
├── onnx.BUILD
├── sleef.BUILD
├── sleef.bzl
├── substitution.bzl
├── tbb.BUILD
├── tbb.patch
├── tensorflow_cuda_bazel_build
│ └── cuda
│ │ └── build_defs.bzl
├── tensorpipe.BUILD
├── valgrind-headers
│ ├── README.md
│ ├── callgrind.h
│ └── valgrind.h
├── xnnpack.buck.bzl
├── xnnpack_src_defs.bzl
└── xnnpack_wrapper_defs.bzl
└── torchgen
├── BUCK.oss
├── BUILD.bazel
├── __init__.py
├── api
├── __init__.py
├── autograd.py
├── cpp.py
├── dispatcher.py
├── functionalization.py
├── lazy.py
├── meta.py
├── native.py
├── python.py
├── structured.py
├── translate.py
├── types
│ ├── __init__.py
│ ├── signatures.py
│ ├── types.py
│ └── types_base.py
├── ufunc.py
└── unboxing.py
├── build.bzl
├── code_template.py
├── context.py
├── decompositions
└── gen_jit_decompositions.py
├── dest
├── __init__.py
├── lazy_ir.py
├── lazy_ts_lowering.py
├── native_functions.py
├── register_dispatch_key.py
└── ufunc.py
├── executorch
├── __init__.py
└── api
│ ├── __init__.py
│ ├── custom_ops.py
│ ├── et_cpp.py
│ ├── types
│ ├── __init__.py
│ ├── signatures.py
│ └── types.py
│ └── unboxing.py
├── gen.py
├── gen_backend_stubs.py
├── gen_executorch.py
├── gen_functionalization_type.py
├── gen_lazy_tensor.py
├── gen_vmap_plumbing.py
├── local.py
├── model.py
├── native_function_generation.py
├── operator_versions
├── __init__.py
├── gen_mobile_upgraders.py
└── gen_mobile_upgraders_constant.py
├── selective_build
├── __init__.py
├── operator.py
└── selector.py
├── shape_functions
└── gen_jit_shape_functions.py
├── static_runtime
├── __init__.py
├── config.py
├── gen_static_runtime_ops.py
└── generator.py
└── utils.py
/.bazelversion:
--------------------------------------------------------------------------------
1 | 6.1.1
2 |
--------------------------------------------------------------------------------
/.buckconfig.oss:
--------------------------------------------------------------------------------
1 | [pt]
2 | is_oss=1
3 |
4 | [buildfile]
5 | name = BUCK.oss
6 | includes = //tools/build_defs/select.bzl
7 |
8 | [repositories]
9 | bazel_skylib = third_party/bazel-skylib/
10 | ovr_config = .
11 |
12 | [download]
13 | in_build = true
14 |
15 | [cxx]
16 | cxxflags = -std=c++17
17 | ldflags = -Wl,--no-undefined
18 | should_remap_host_platform = true
19 | cpp = /usr/bin/clang
20 | cc = /usr/bin/clang
21 | cxx = /usr/bin/clang++
22 | cxxpp = /usr/bin/clang++
23 | ld = /usr/bin/clang++
24 |
25 | [project]
26 | default_flavors_mode=all
27 |
--------------------------------------------------------------------------------
/.ci/caffe2/README.md:
--------------------------------------------------------------------------------
1 | # Jenkins
2 |
3 | The scripts in this directory are the entrypoint for testing Caffe2.
4 |
5 | The environment variable `BUILD_ENVIRONMENT` is expected to be set to
6 | the build environment you intend to test. It is a hint for the build
7 | and test scripts to configure Caffe2 a certain way and include/exclude
8 | tests. Docker images, they equal the name of the image itself. For
9 | example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
10 | built on Jenkins and are used in triggered builds already have this
11 | environment variable set in their manifest. Also see
12 | `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
13 |
14 | Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
15 |
--------------------------------------------------------------------------------
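
A minimal sketch (hypothetical, not a file from the repo) of how a build or
test script can use `BUILD_ENVIRONMENT` as the hint described in the README
above:

```bash
#!/bin/bash
# Hypothetical illustration: branch on the BUILD_ENVIRONMENT hint to
# include/exclude tests for a given configuration.
set -ex

if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
  echo "CUDA build environment: enabling GPU tests"
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
  echo "ROCm build environment: enabling HIP tests"
else
  echo "CPU-only build environment: skipping GPU tests"
fi
```
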
/.ci/caffe2/common.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 |
3 | LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
4 | ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
5 | TEST_DIR="$ROOT_DIR/test"
6 | gtest_reports_dir="${TEST_DIR}/test-reports/cpp"
7 | pytest_reports_dir="${TEST_DIR}/test-reports/python"
8 |
9 | # Figure out which Python to use
10 | PYTHON="$(which python)"
11 | if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
12 | PYTHON=$(which "python${BASH_REMATCH[1]}")
13 | fi
14 |
15 | if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
16 | # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
17 | unset HIP_PLATFORM
18 | if which sccache > /dev/null; then
19 | # Save sccache logs to file
20 | sccache --stop-server || true
21 | rm -f ~/sccache_error.log || true
22 | SCCACHE_ERROR_LOG=~/sccache_error.log SCCACHE_IDLE_TIMEOUT=0 sccache --start-server
23 |
24 | # Report sccache stats for easier debugging
25 | sccache --zero-stats
26 | fi
27 | fi
28 |
29 | # /usr/local/caffe2 is where the cpp bits are installed in cmake-only
30 | # builds. In +python builds the cpp tests are copied to /usr/local/caffe2 so
31 | # that the test code in .ci/test.sh is the same in both cases.
32 | INSTALL_PREFIX="/usr/local/caffe2"
33 |
34 | mkdir -p "$gtest_reports_dir" || true
35 | mkdir -p "$pytest_reports_dir" || true
36 | mkdir -p "$INSTALL_PREFIX" || true
37 |
--------------------------------------------------------------------------------
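
For reference, a quick illustration (not part of the script) of how the regex
above selects a Python binary from `BUILD_ENVIRONMENT`:

```bash
# Illustration only, with a made-up BUILD_ENVIRONMENT value.
BUILD_ENVIRONMENT="py3.8-gcc9-ubuntu18.04"
if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
  echo "python${BASH_REMATCH[1]}"   # prints: python3.8
fi
```
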
/.ci/docker/README.md:
--------------------------------------------------------------------------------
1 | # Docker images for Jenkins
2 |
3 | This directory contains everything needed to build the Docker images
4 | that are used in our CI.
5 |
6 | The Dockerfiles located in subdirectories are parameterized to
7 | conditionally run build stages depending on build arguments passed to
8 | `docker build`. This lets us use only a few Dockerfiles for many
9 | images. The different configurations are identified by a freeform
10 | string that we call a _build environment_. This string is persisted in
11 | each image as the `BUILD_ENVIRONMENT` environment variable.
12 |
13 | See `build.sh` for valid build environments (it's the giant switch).
14 |
15 | Docker builds are now defined with `.circleci/cimodel/data/simple/docker_definitions.py`
16 |
17 | ## Contents
18 |
19 | * `build.sh` -- dispatch script to launch all builds
20 | * `common` -- scripts used to execute individual Docker build stages
21 | * `ubuntu-cuda` -- Dockerfile for Ubuntu image with CUDA support for nvidia-docker
22 |
23 | ## Usage
24 |
25 | ```bash
26 | # Build a specific image
27 | ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
28 |
29 | # Set flags (see build.sh) and build image
30 | sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest'
31 | ```
32 |
--------------------------------------------------------------------------------
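
Because the build environment string is persisted in each image as the
`BUILD_ENVIRONMENT` environment variable, it can be checked at runtime; a
small sketch, assuming the `myimage:latest` tag from the usage example above:

```bash
# Print the build environment baked into the image.
docker run --rm myimage:latest bash -c 'echo "$BUILD_ENVIRONMENT"'
```
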
/.ci/docker/android/AndroidManifest.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.ci/docker/ci_commit_pins/triton-rocm.txt:
--------------------------------------------------------------------------------
1 | de3f5436247e391b062a7dd7fd42d2a55c2cd524
2 |
--------------------------------------------------------------------------------
/.ci/docker/ci_commit_pins/triton.txt:
--------------------------------------------------------------------------------
1 | 46672772b46b103db7341c9e10fbad7f643557d4
2 |
--------------------------------------------------------------------------------
/.ci/docker/common/common_utils.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Work around bug where devtoolset replaces sudo and breaks it.
4 | if [ -n "$DEVTOOLSET_VERSION" ]; then
5 | export SUDO=/bin/sudo
6 | else
7 | export SUDO=sudo
8 | fi
9 |
10 | as_jenkins() {
11 | # NB: unsetting the environment variables works around a conda bug
12 | # https://github.com/conda/conda/issues/6576
13 | # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation
14 | # NB: This must be run from a directory that jenkins has access to,
15 | # works around https://github.com/conda/conda-package-handling/pull/34
16 | $SUDO -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
17 | }
18 |
19 | conda_install() {
20 | # Ensure that the install command doesn't upgrade/downgrade Python
21 | # This should be called as
22 | # conda_install pkg1 pkg2 ... [-c channel]
23 | as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
24 | }
25 |
26 | conda_run() {
27 | as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
28 | }
29 |
30 | pip_install() {
31 | as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $*
32 | }
33 |
34 | get_pinned_commit() {
35 | cat "${1}".txt
36 | }
37 |
--------------------------------------------------------------------------------
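
A short usage sketch for these helpers (the package names are illustrative,
not taken from the repo; `get_pinned_commit triton` assumes a `triton.txt`
pin file in the current directory):

```bash
# Run inside a Docker build step after sourcing the helpers. Channel
# flags trail the package list, per the comment in conda_install.
source ./common_utils.sh

conda_install numpy cmake -c conda-forge    # illustrative packages
pip_install ninja                           # illustrative package
triton_commit=$(get_pinned_commit triton)   # reads ./triton.txt
```
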
/.ci/docker/common/install_cmake.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | [ -n "$CMAKE_VERSION" ]
6 |
7 | # Remove system cmake install so it won't get used instead
8 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
9 | case "$ID" in
10 | ubuntu)
11 | apt-get remove cmake -y
12 | ;;
13 | centos)
14 | yum remove cmake -y
15 | ;;
16 | *)
17 | echo "Unable to determine OS..."
18 | exit 1
19 | ;;
20 | esac
21 |
22 | # Turn 3.6.3 into v3.6
23 | path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/')
24 | file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
25 |
26 | # Download and install specific CMake version in /usr/local
27 | pushd /tmp
28 | curl -Os --retry 3 "https://cmake.org/files/${path}/${file}"
29 | tar -C /usr/local --strip-components 1 --no-same-owner -zxf cmake-*.tar.gz
30 | rm -f cmake-*.tar.gz
31 | popd
32 |
--------------------------------------------------------------------------------
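
The `sed` above derives the download directory from the full version string;
a quick check of the transformation with illustrative versions:

```bash
# "Turn 3.6.3 into v3.6": keep major.minor and prefix with "v".
echo "3.6.3"  | sed -e 's/\([0-9].[0-9]\+\).*/v\1/'    # -> v3.6
echo "3.18.4" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/'    # -> v3.18
```
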
/.ci/docker/common/install_cudnn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [[ ${CUDNN_VERSION} == 8 ]]; then
4 | # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
5 | mkdir tmp_cudnn && cd tmp_cudnn
6 | CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive"
7 | if [[ ${CUDA_VERSION:0:4} == "11.7" ]]; then
8 | CUDNN_NAME="cudnn-linux-x86_64-8.5.0.96_cuda11-archive"
9 | curl --retry 3 -OLs https://ossci-linux.s3.amazonaws.com/${CUDNN_NAME}.tar.xz
10 | elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
11 | CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
12 | curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
13 | else
14 | curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz
15 | fi
16 |
17 | tar xf ${CUDNN_NAME}.tar.xz
18 | cp -a ${CUDNN_NAME}/include/* /usr/include/
19 | cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
20 | cp -a ${CUDNN_NAME}/include/* /usr/include/x86_64-linux-gnu/
21 |
22 | cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
23 | cp -a ${CUDNN_NAME}/lib/* /usr/lib/x86_64-linux-gnu/
24 | cd ..
25 | rm -rf tmp_cudnn
26 | ldconfig
27 | fi
28 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_db.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | install_ubuntu() {
6 | apt-get update
7 | apt-get install -y --no-install-recommends \
8 | libhiredis-dev \
9 | libleveldb-dev \
10 | liblmdb-dev \
11 | libsnappy-dev
12 |
13 | # Cleanup
14 | apt-get autoclean && apt-get clean
15 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
16 | }
17 |
18 | install_centos() {
19 | # Need EPEL for many packages we depend on.
20 | # See http://fedoraproject.org/wiki/EPEL
21 | yum --enablerepo=extras install -y epel-release
22 |
23 | yum install -y \
24 | hiredis-devel \
25 | leveldb-devel \
26 | lmdb-devel \
27 | snappy-devel
28 |
29 | # Cleanup
30 | yum clean all
31 | rm -rf /var/cache/yum
32 | rm -rf /var/lib/yum/yumdb
33 | rm -rf /var/lib/yum/history
34 | }
35 |
36 | # Install base packages depending on the base OS
37 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
38 | case "$ID" in
39 | ubuntu)
40 | install_ubuntu
41 | ;;
42 | centos)
43 | install_centos
44 | ;;
45 | *)
46 | echo "Unable to determine OS..."
47 | exit 1
48 | ;;
49 | esac
50 |
--------------------------------------------------------------------------------
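
The OS-detection one-liner used here (and in several sibling scripts) reads
the `ID` field from `/etc/os-release`; for example:

```bash
# Ubuntu ships an unquoted line (ID=ubuntu) while CentOS quotes it
# (ID="centos"); tr strips the quotes so both compare cleanly.
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
echo "$ID"   # -> ubuntu (or centos)
```
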
/.ci/docker/common/install_devtoolset.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | [ -n "$DEVTOOLSET_VERSION" ]
6 |
7 | yum install -y centos-release-scl
8 | yum install -y devtoolset-$DEVTOOLSET_VERSION
9 |
10 | echo "source scl_source enable devtoolset-$DEVTOOLSET_VERSION" > "/etc/profile.d/devtoolset-$DEVTOOLSET_VERSION.sh"
11 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_docs_reqs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | if [ -n "$KATEX" ]; then
6 | apt-get update
7 | # Ignore error if gpg-agent doesn't exist (for Ubuntu 16.04)
8 | apt-get install -y gpg-agent || :
9 |
10 | curl --retry 3 -sL https://deb.nodesource.com/setup_12.x | sudo -E bash -
11 | sudo apt-get install -y nodejs
12 |
13 | curl --retry 3 -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
14 | echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
15 |
16 | apt-get update
17 | apt-get install -y --no-install-recommends yarn
18 | yarn global add katex --prefix /usr/local
19 |
20 | sudo apt-get -y install doxygen
21 |
22 | apt-get autoclean && apt-get clean
23 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
24 |
25 | fi
26 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_gcc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | if [ -n "$GCC_VERSION" ]; then
6 |
7 | # Need the official toolchain repo to get alternate packages
8 | add-apt-repository ppa:ubuntu-toolchain-r/test
9 | apt-get update
10 | if [[ "$UBUNTU_VERSION" == "16.04" && "${GCC_VERSION:0:1}" == "5" ]]; then
11 | apt-get install -y g++-5=5.4.0-6ubuntu1~16.04.12
12 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50
13 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50
14 | update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-5 50
15 | else
16 | apt-get install -y g++-$GCC_VERSION
17 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
18 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
19 | update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
20 | fi
21 |
22 |
23 | # Cleanup package manager
24 | apt-get autoclean && apt-get clean
25 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
26 |
27 | fi
28 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_glibc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | [ -n "$GLIBC_VERSION" ]
6 | if [[ -n "$CENTOS_VERSION" ]]; then
7 | [ -n "$DEVTOOLSET_VERSION" ]
8 | fi
9 |
10 | yum install -y wget sed
11 |
12 | mkdir -p /packages && cd /packages
13 | wget -q http://ftp.gnu.org/gnu/glibc/glibc-$GLIBC_VERSION.tar.gz
14 | tar xzf glibc-$GLIBC_VERSION.tar.gz
15 | if [[ "$GLIBC_VERSION" == "2.26" ]]; then
16 | cd glibc-$GLIBC_VERSION
17 | sed -i 's/$name ne "nss_test1"/$name ne "nss_test1" \&\& $name ne "nss_test2"/' scripts/test-installation.pl
18 | cd ..
19 | fi
20 | mkdir -p glibc-$GLIBC_VERSION-build && cd glibc-$GLIBC_VERSION-build
21 |
22 | if [[ -n "$CENTOS_VERSION" ]]; then
23 | export PATH=/opt/rh/devtoolset-$DEVTOOLSET_VERSION/root/usr/bin:$PATH
24 | fi
25 |
26 | ../glibc-$GLIBC_VERSION/configure --prefix=/usr CFLAGS='-Wno-stringop-truncation -Wno-format-overflow -Wno-restrict -Wno-format-truncation -g -O2'
27 | make -j$(nproc)
28 | make install
29 |
30 | # Cleanup
31 | rm -rf /packages
32 | rm -rf /var/cache/yum/*
33 | rm -rf /var/lib/rpm/__db.*
34 | yum clean all
35 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_jni.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | mkdir -p /usr/local/include
6 | cp jni.h /usr/local/include
7 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_lcov.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | git clone --branch v1.15 https://github.com/linux-test-project/lcov.git
6 | pushd lcov
7 | sudo make install # will be installed in /usr/local/bin/lcov
8 | popd
9 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_linter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
6 |
7 | if [ -n "${UBUNTU_VERSION}" ]; then
8 | apt update
9 | apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5
10 | fi
11 |
12 | # Do shallow clone of PyTorch so that we can init lintrunner in Docker build context
13 | git clone https://github.com/pytorch/pytorch.git --depth 1
14 | chown -R jenkins pytorch
15 |
16 | pushd pytorch
17 | # Install all linter dependencies
18 | pip_install -r requirements.txt
19 | conda_run lintrunner init
20 |
21 | # Cache .lintbin directory as part of the Docker image
22 | cp -r .lintbin /tmp
23 | popd
24 |
25 | # Node dependencies required by toc linter job
26 | npm install -g markdown-toc
27 |
28 | # Cleaning up
29 | rm -rf pytorch
30 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_ninja.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | [ -n "$NINJA_VERSION" ]
6 |
7 | url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
8 |
9 | pushd /tmp
10 | wget --no-verbose --output-document=ninja-linux.zip "$url"
11 | unzip ninja-linux.zip -d /usr/local/bin
12 | rm -f ninja-linux.zip
13 | popd
14 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_openmpi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | sudo apt-get update
4 | # also install ssh to avoid error of:
5 | # --------------------------------------------------------------------------
6 | # The value of the MCA parameter "plm_rsh_agent" was set to a path
7 | # that could not be found:
8 | # plm_rsh_agent: ssh : rsh
9 | sudo apt-get install -y ssh
10 | sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
11 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_openssl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | OPENSSL=openssl-1.1.1k
6 |
7 | wget -q -O "${OPENSSL}.tar.gz" "https://ossci-linux.s3.amazonaws.com/${OPENSSL}.tar.gz"
8 | tar xf "${OPENSSL}.tar.gz"
9 | cd "${OPENSSL}"
10 | ./config --prefix=/opt/openssl -d '-Wl,--enable-new-dtags,-rpath,$(LIBRPATH)'
11 | # NOTE: the openssl install step errors out when run with -j, so install serially
12 | make -j6; make install_sw
13 | # Link the ssl libraries to the /usr/lib folder.
14 | sudo ln -s /opt/openssl/lib/lib* /usr/lib
15 | cd ..
16 | rm -rf "${OPENSSL}"
17 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_swiftshader.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | [ -n "${SWIFTSHADER}" ]
6 |
7 | retry () {
8 | $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
9 | }
10 |
11 | _https_amazon_aws=https://ossci-android.s3.amazonaws.com
12 |
13 | # SwiftShader
14 | _swiftshader_dir=/var/lib/jenkins/swiftshader
15 | _swiftshader_file_targz=swiftshader-abe07b943-prebuilt.tar.gz
16 | mkdir -p $_swiftshader_dir
17 | _tmp_swiftshader_targz="/tmp/${_swiftshader_file_targz}"
18 |
19 | curl --silent --show-error --location --fail --retry 3 \
20 | --output "${_tmp_swiftshader_targz}" "$_https_amazon_aws/${_swiftshader_file_targz}"
21 |
22 | tar -C "${_swiftshader_dir}" -xzf "${_tmp_swiftshader_targz}"
23 |
24 | export VK_ICD_FILENAMES="${_swiftshader_dir}/build/Linux/vk_swiftshader_icd.json"
25 |
--------------------------------------------------------------------------------
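
The `retry` helper defined above is a simple backoff wrapper (sleeping 1, 2,
4, then 8 seconds between attempts); this script ends up using curl's own
`--retry` instead, but the helper can wrap any flaky command, e.g.:

```bash
# Illustrative use of the retry helper: up to five attempts with
# increasing sleeps in between (URL is a placeholder).
retry () {
  $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
retry wget -q "https://example.com/artifact.tar.gz"
```
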
/.ci/docker/common/install_thrift.sh:
--------------------------------------------------------------------------------
1 | apt-get update
2 | apt-get install -y sudo wget libboost-dev libboost-test-dev libboost-program-options-dev libboost-filesystem-dev libboost-thread-dev libevent-dev automake libtool flex bison pkg-config g++ libssl-dev
3 | wget https://www-us.apache.org/dist/thrift/0.12.0/thrift-0.12.0.tar.gz
4 | tar -xvf thrift-0.12.0.tar.gz
5 | cd thrift-0.12.0
6 | for file in ./compiler/cpp/Makefile*; do
7 | sed -i 's/\-Werror//' $file
8 | done
9 | ./bootstrap.sh
10 | ./configure --without-php --without-java --without-python --without-nodejs --without-go --without-ruby
11 | sudo make
12 | sudo make install
13 | cd ..
14 | rm thrift-0.12.0.tar.gz
15 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_ucc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | if [[ -d "/usr/local/cuda/" ]]; then
6 | with_cuda=/usr/local/cuda/
7 | else
8 | with_cuda=no
9 | fi
10 |
11 | function install_ucx() {
12 | set -ex
13 | git clone --recursive https://github.com/openucx/ucx.git
14 | pushd ucx
15 | git checkout ${UCX_COMMIT}
16 | git submodule update --init --recursive
17 |
18 | ./autogen.sh
19 | ./configure --prefix=$UCX_HOME \
20 | --enable-mt \
21 | --with-cuda=$with_cuda \
22 | --enable-profiling \
23 | --enable-stats
24 | time make -j
25 | sudo make install
26 |
27 | popd
28 | rm -rf ucx
29 | }
30 |
31 | function install_ucc() {
32 | set -ex
33 | git clone --recursive https://github.com/openucx/ucc.git
34 | pushd ucc
35 | git checkout ${UCC_COMMIT}
36 | git submodule update --init --recursive
37 |
38 | ./autogen.sh
39 | ./configure --prefix=$UCC_HOME --with-ucx=$UCX_HOME --with-cuda=$with_cuda
40 | time make -j
41 | sudo make install
42 |
43 | popd
44 | rm -rf ucc
45 | }
46 |
47 | install_ucx
48 | install_ucc
49 |
--------------------------------------------------------------------------------
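
This script expects the commit pins and install prefixes to come from the
environment; a hedged invocation sketch with placeholder values:

```bash
# Placeholder values -- in CI these come from the image build arguments
# and normally pin exact SHAs rather than a branch name.
export UCX_COMMIT="master"
export UCC_COMMIT="master"
export UCX_HOME=/usr
export UCC_HOME=/usr
bash ./install_ucc.sh
```
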
/.ci/docker/common/install_user.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | # Mirror jenkins user in container
6 | # jenkins user as ec2-user should have the same user-id
7 | echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd
8 | echo "jenkins:x:1000:" >> /etc/group
9 | # Needed on focal or newer
10 | echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow
11 |
12 | # Create $HOME
13 | mkdir -p /var/lib/jenkins
14 | chown jenkins:jenkins /var/lib/jenkins
15 | mkdir -p /var/lib/jenkins/.ccache
16 | chown jenkins:jenkins /var/lib/jenkins/.ccache
17 |
18 | # Allow writing to /usr/local (for make install)
19 | chown jenkins:jenkins /usr/local
20 |
21 | # Allow sudo
22 | # TODO: Maybe we shouldn't
23 | echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins
24 |
25 | # Work around bug where devtoolset replaces sudo and breaks it.
26 | if [ -n "$DEVTOOLSET_VERSION" ]; then
27 | SUDO=/bin/sudo
28 | else
29 | SUDO=sudo
30 | fi
31 |
32 | # Test that sudo works
33 | $SUDO -u jenkins $SUDO -v
34 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_vision.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | install_ubuntu() {
6 | apt-get update
7 | apt-get install -y --no-install-recommends \
8 | libopencv-dev \
9 | libavcodec-dev
10 |
11 | # Cleanup
12 | apt-get autoclean && apt-get clean
13 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
14 | }
15 |
16 | install_centos() {
17 | # Need EPEL for many packages we depend on.
18 | # See http://fedoraproject.org/wiki/EPEL
19 | yum --enablerepo=extras install -y epel-release
20 |
21 | yum install -y \
22 | opencv-devel \
23 | ffmpeg-devel
24 |
25 | # Cleanup
26 | yum clean all
27 | rm -rf /var/cache/yum
28 | rm -rf /var/lib/yum/yumdb
29 | rm -rf /var/lib/yum/history
30 | }
31 |
32 | # Install base packages depending on the base OS
33 | ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
34 | case "$ID" in
35 | ubuntu)
36 | install_ubuntu
37 | ;;
38 | centos)
39 | install_centos
40 | ;;
41 | *)
42 | echo "Unable to determine OS..."
43 | exit 1
44 | ;;
45 | esac
46 |
--------------------------------------------------------------------------------
/.ci/docker/common/install_vulkan_sdk.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | [ -n "${VULKAN_SDK_VERSION}" ]
6 |
7 | retry () {
8 | $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
9 | }
10 |
11 | _vulkansdk_dir=/var/lib/jenkins/vulkansdk
12 | _tmp_vulkansdk_targz=/tmp/vulkansdk.tar.gz
13 |
14 | curl \
15 | --silent \
16 | --show-error \
17 | --location \
18 | --fail \
19 | --retry 3 \
20 | --output "${_tmp_vulkansdk_targz}" "https://ossci-android.s3.amazonaws.com/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz"
21 |
22 | mkdir -p "${_vulkansdk_dir}"
23 | tar -C "${_vulkansdk_dir}" -xzf "${_tmp_vulkansdk_targz}" --strip-components 1
24 | rm -rf "${_tmp_vulkansdk_targz}"
25 |
--------------------------------------------------------------------------------
/.ci/docker/linter/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG UBUNTU_VERSION
2 |
3 | FROM ubuntu:${UBUNTU_VERSION}
4 |
5 | ARG UBUNTU_VERSION
6 |
7 | ENV DEBIAN_FRONTEND noninteractive
8 |
9 | # Install common dependencies (so that this step can be cached separately)
10 | COPY ./common/install_base.sh install_base.sh
11 | RUN bash ./install_base.sh && rm install_base.sh
12 |
13 | # Install user
14 | COPY ./common/install_user.sh install_user.sh
15 | RUN bash ./install_user.sh && rm install_user.sh
16 |
17 | # Install conda and other packages (e.g., numpy, pytest)
18 | ARG ANACONDA_PYTHON_VERSION
19 | ARG CONDA_CMAKE
20 | ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
21 | ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
22 | COPY requirements-ci.txt /opt/conda/requirements-ci.txt
23 | COPY ./common/install_conda.sh install_conda.sh
24 | COPY ./common/common_utils.sh common_utils.sh
25 | RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
26 |
27 | # Note that Docker build forbids copying files from outside the build context
28 | COPY ./common/install_linter.sh install_linter.sh
29 | COPY ./common/common_utils.sh common_utils.sh
30 | RUN bash ./install_linter.sh
31 | RUN rm install_linter.sh common_utils.sh
32 |
33 | USER jenkins
34 | CMD ["bash"]
35 |
--------------------------------------------------------------------------------
/.ci/docker/triton_version.txt:
--------------------------------------------------------------------------------
1 | 2.1.0
2 |
--------------------------------------------------------------------------------
/.ci/docker/ubuntu-rocm/.gitignore:
--------------------------------------------------------------------------------
1 | *.sh
2 |
--------------------------------------------------------------------------------
/.ci/onnx/README.md:
--------------------------------------------------------------------------------
1 | # Jenkins
2 |
3 | The scripts in this directory are the entrypoint for testing the ONNX exporter.
4 |
5 | The environment variable `BUILD_ENVIRONMENT` is expected to be set to
6 | the build environment you intend to test. It is a hint for the build
7 | and test scripts to configure Caffe2 a certain way and include/exclude
8 | tests. For Docker images, it equals the name of the image itself. For
9 | example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
10 | built on Jenkins and are used in triggered builds already have this
11 | environment variable set in their manifest. Also see
12 | `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
13 |
14 | Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
15 |
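16 | For a manual run you might set it yourself before invoking the test
17 | entrypoint. A minimal sketch, assuming a hypothetical image tag that
18 | contains `onnx` (test.sh only runs the ONNX suite for such values):
19 |
20 | ```bash
21 | export BUILD_ENVIRONMENT=py3-onnx-ubuntu16.04
22 | bash .ci/onnx/test.sh
23 | ```
24 |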
--------------------------------------------------------------------------------
/.ci/onnx/common.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 |
3 | LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
4 | ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
5 | TEST_DIR="$ROOT_DIR/test"
6 | pytest_reports_dir="${TEST_DIR}/test-reports/python"
7 |
8 | # Figure out which Python to use
9 | PYTHON="$(which python)"
10 | if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
11 | PYTHON=$(which "python${BASH_REMATCH[1]}")
12 | fi
13 |
14 | if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
15 | # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
16 | unset HIP_PLATFORM
17 | fi
18 |
19 | mkdir -p "$pytest_reports_dir" || true
20 |
--------------------------------------------------------------------------------
/.ci/onnx/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # shellcheck source=./common.sh
4 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
5 |
6 | # Used to retry the ONNX test; it runs the command at most twice (one retry)
7 | retry () {
8 | "$@" || (sleep 60 && "$@")
9 | }
10 |
11 | if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
12 | pip -q install --user "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx"
13 | # TODO: This can be removed later once vision is also part of the Docker image
14 | pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
15 | # JIT C++ extensions require ninja, so put it into PATH.
16 | export PATH="/var/lib/jenkins/.local/bin:$PATH"
17 | # NB: The ONNX test is fast (~15m), so it's ok to retry it a few more times to
18 | # avoid flaky failures. We need to bring this into the standard PyTorch run_test
19 | # eventually; the issue is tracked in https://github.com/pytorch/pytorch/issues/98626
20 | retry "$ROOT_DIR/scripts/onnx/test.sh"
21 | fi
22 |
--------------------------------------------------------------------------------
/.ci/pytorch/.shellcheckrc:
--------------------------------------------------------------------------------
1 | source-path=SCRIPTDIR
2 |
3 | # we'd like to enable --external-sources here but can't
4 | # https://github.com/koalaman/shellcheck/issues/1818
5 |
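6 | # With source-path=SCRIPTDIR, a directive such as
7 | #   # shellcheck source=./common.sh
8 | # is resolved relative to the directory of the script being checked.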
--------------------------------------------------------------------------------
/.ci/pytorch/build-asan.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Required environment variable: $BUILD_ENVIRONMENT
4 | # (This is set by default in the Docker images we build, so you don't
5 | # need to set it yourself.)
6 |
7 | # shellcheck source=./common.sh
8 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
9 | # shellcheck source=./common-build.sh
10 | source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
11 |
12 | echo "Clang version:"
13 | clang --version
14 |
15 | python tools/stats/export_test_times.py
16 |
17 | if [ -n "$(which conda)" ]; then
18 | export CMAKE_PREFIX_PATH=/opt/conda
19 | fi
20 |
21 | CC="clang" CXX="clang++" LDSHARED="clang --shared" \
22 | USE_ASAN=1 USE_CUDA=0 USE_MKLDNN=0 \
23 | UBSAN_FLAGS="-fno-sanitize-recover=all" \
24 | python setup.py bdist_wheel
25 | pip_install_whl "$(echo dist/*.whl)"
26 |
27 | # Test building via the sdist source tarball
28 | python setup.py sdist
29 | mkdir -p /tmp/tmp
30 | pushd /tmp/tmp
31 | tar zxf "$(dirname "${BASH_SOURCE[0]}")/../../dist/"*.tar.gz
32 | cd torch-*
33 | python setup.py build --cmake-only
34 | popd
35 |
36 | print_sccache_stats
37 |
38 | assert_git_not_dirty
39 |
--------------------------------------------------------------------------------
/.ci/pytorch/common.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Common setup for all Jenkins scripts
4 | # shellcheck source=./common_utils.sh
5 | source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
6 | set -ex
7 |
8 | # Required environment variables:
9 | # $BUILD_ENVIRONMENT (should be set by your Docker image)
10 |
11 | # Figure out which Python to use for ROCm
12 | if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
13 | # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
14 | unset HIP_PLATFORM
15 | export PYTORCH_TEST_WITH_ROCM=1
16 | # temporary to locate some kernel issues on the CI nodes
17 | export HSAKMT_DEBUG_LEVEL=4
18 | # improve rccl performance for distributed tests
19 | export HSA_FORCE_FINE_GRAIN_PCIE=1
20 | fi
21 |
22 | # TODO: Re-enable libtorch testing for macOS, see https://github.com/pytorch/pytorch/issues/62598
23 | # shellcheck disable=SC2034
24 | BUILD_TEST_LIBTORCH=0
25 |
--------------------------------------------------------------------------------
/.ci/pytorch/docker-build-test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # shellcheck source=./common.sh
4 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
5 |
6 | docker build -t pytorch .
7 |
--------------------------------------------------------------------------------
/.ci/pytorch/docs-test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # shellcheck source=./common.sh
4 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
5 |
6 | echo "Testing pytorch docs"
7 |
8 | cd docs
9 | pip_install -r requirements.txt
10 | make doctest
11 |
--------------------------------------------------------------------------------
/.ci/pytorch/fake_numpy/numpy.py:
--------------------------------------------------------------------------------
1 | raise ModuleNotFoundError("Sorry PyTorch, but our NumPy is in the other folder")
2 |
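3 | # Our reading of the intent: this stub is put on sys.path ahead of the real
4 | # NumPy so the import fails, letting CI exercise PyTorch's NumPy-free paths.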
--------------------------------------------------------------------------------
/.ci/pytorch/macos-build-test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ -z "${BUILD_ENVIRONMENT}" ] || [[ "${BUILD_ENVIRONMENT}" == *-build* ]]; then
4 | # shellcheck source=./macos-build.sh
5 | source "$(dirname "${BASH_SOURCE[0]}")/macos-build.sh"
6 | fi
7 |
8 | if [ -z "${BUILD_ENVIRONMENT}" ] || [[ "${BUILD_ENVIRONMENT}" == *-test* ]]; then
9 | # shellcheck source=./macos-test.sh
10 | source "$(dirname "${BASH_SOURCE[0]}")/macos-test.sh"
11 | fi
12 |
--------------------------------------------------------------------------------
/.ci/pytorch/macos-common.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Common prelude for macos-build.sh and macos-test.sh
4 |
5 | # shellcheck source=./common.sh
6 | source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
7 |
8 | sysctl -a | grep machdep.cpu
9 |
10 | # These are required for both the build job and the test job.
11 | # In the latter to test cpp extensions.
12 | export MACOSX_DEPLOYMENT_TARGET=10.9
13 | export CXX=clang++
14 | export CC=clang
15 |
16 | print_cmake_info() {
17 | CMAKE_EXEC=$(which cmake)
18 | echo "$CMAKE_EXEC"
19 |
20 | CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC")
21 | # Print all libraries under cmake rpath for debugging
22 | ls -la "$CONDA_INSTALLATION_DIR/../lib"
23 |
24 | export CMAKE_EXEC
25 | # Explicitly add conda env lib folder to cmake rpath to address the flaky issue
26 | # where cmake dependencies couldn't be found. This seems to point to how conda
27 | # links $CMAKE_EXEC to its package cache when cloning a new environment
28 | install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
29 | # Adding the rpath will invalidate cmake signature, so signing it again here
30 | # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid))
31 | # with an exit code 137 otherwise
32 | codesign -f -s - "${CMAKE_EXEC}" || true
33 | }
34 |
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/common.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | run_test () {
5 | rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/
6 | "$@"
7 | cd .. && rm -rf test_tmp/
8 | }
9 |
10 | get_runtime_of_command () {
11 | TIMEFORMAT=%R
12 |
13 | # runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null)
14 | runtime=$( { time "$@"; } 2>&1 1>/dev/null)
15 | if [[ $runtime == *"Error"* ]]; then
16 | exit 1
17 | fi
18 | runtime=${runtime#+++ $@}
19 | runtime=$(python -c "print($runtime)")
20 |
21 | echo "$runtime"
22 | }
23 |
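24 | # Usage sketch (the timed command here is only illustrative):
25 | #   runtime=$(get_runtime_of_command python -c "pass")
26 | #   echo "took ${runtime} seconds"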
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/get_stats.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | import numpy
4 |
5 | sample_data_list = sys.argv[1:]
6 | sample_data_list = [float(v.strip()) for v in sample_data_list]
7 |
8 | sample_mean = numpy.mean(sample_data_list)
9 | sample_sigma = numpy.std(sample_data_list)
10 |
11 | data = {
12 | 'mean': sample_mean,
13 | 'sigma': sample_sigma,
14 | }
15 |
16 | print(json.dumps(data))
17 |
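18 | # Example invocation (sample values are made up):
19 | #   python get_stats.py 1.23 1.31 1.27
20 | # prints {"mean": 1.27, "sigma": 0.0326...} (population std, per numpy.std).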
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | . ./common.sh
5 |
6 | test_cpu_speed_mini_sequence_labeler () {
7 | echo "Testing: mini sequence labeler, CPU"
8 |
9 | export OMP_NUM_THREADS=4
10 | export MKL_NUM_THREADS=4
11 |
12 | git clone https://github.com/pytorch/benchmark.git
13 |
14 | cd benchmark/
15 |
16 | git checkout 726567a455edbfda6199445922a8cfee82535664
17 |
18 | cd scripts/mini_sequence_labeler
19 |
20 | SAMPLE_ARRAY=()
21 | NUM_RUNS=$1
22 |
23 | for (( i=1; i<=NUM_RUNS; i++ )) do
24 | runtime=$(get_runtime_of_command python main.py)
25 | SAMPLE_ARRAY+=("${runtime}")
26 | done
27 |
28 | cd ../../..
29 |
30 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
31 | echo "Runtime stats in seconds:"
32 | echo "$stats"
33 |
34 | if [ "$2" == "compare_with_baseline" ]; then
35 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
36 | elif [ "$2" == "compare_and_update" ]; then
37 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
38 | fi
39 | }
40 |
41 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
42 | run_test test_cpu_speed_mini_sequence_labeler "$@"
43 | fi
44 |
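45 | # Example: 10 timed runs, then compare against the stored baseline:
46 | #   bash test_cpu_speed_mini_sequence_labeler.sh 10 compare_with_baseline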
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/test_cpu_speed_mnist.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | . ./common.sh
5 |
6 | test_cpu_speed_mnist () {
7 | echo "Testing: MNIST, CPU"
8 |
9 | export OMP_NUM_THREADS=4
10 | export MKL_NUM_THREADS=4
11 |
12 | git clone https://github.com/pytorch/examples.git -b perftests
13 |
14 | cd examples/mnist
15 |
16 | conda install -c pytorch torchvision-cpu
17 |
18 | # Download data
19 | python main.py --epochs 0
20 |
21 | SAMPLE_ARRAY=()
22 | NUM_RUNS=$1
23 |
24 | for (( i=1; i<=NUM_RUNS; i++ )) do
25 | runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)
26 | echo "$runtime"
27 | SAMPLE_ARRAY+=("${runtime}")
28 | done
29 |
30 | cd ../..
31 |
32 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
33 | echo "Runtime stats in seconds:"
34 | echo "$stats"
35 |
36 | if [ "$2" == "compare_with_baseline" ]; then
37 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
38 | elif [ "$2" == "compare_and_update" ]; then
39 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
40 | fi
41 | }
42 |
43 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
44 | run_test test_cpu_speed_mnist "$@"
45 | fi
46 |
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/test_cpu_speed_torch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | . ./common.sh
4 |
5 | test_cpu_speed_torch () {
6 | echo "Testing: torch.*, CPU"
7 |
8 | export OMP_NUM_THREADS=4
9 | export MKL_NUM_THREADS=4
10 |
11 | git clone https://github.com/yf225/perf-tests.git
12 |
13 | if [ "$1" == "compare_with_baseline" ]; then
14 | export ARGS=(--compare ../cpu_runtime.json)
15 | elif [ "$1" == "compare_and_update" ]; then
16 | export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
17 | elif [ "$1" == "update_only" ]; then
18 | export ARGS=(--update ../new_cpu_runtime.json)
19 | fi
20 |
21 | if ! python perf-tests/modules/test_cpu_torch.py "${ARGS[@]}"; then
22 | echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
23 | exit 1
24 | fi
25 | }
26 |
27 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
28 | run_test test_cpu_speed_torch "$@"
29 | fi
30 |
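31 | # Example: compare against ../cpu_runtime.json and write ../new_cpu_runtime.json:
32 | #   bash test_cpu_speed_torch.sh compare_and_update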
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | . ./common.sh
4 |
5 | test_cpu_speed_torch_tensor () {
6 | echo "Testing: torch.Tensor.*, CPU"
7 |
8 | export OMP_NUM_THREADS=4
9 | export MKL_NUM_THREADS=4
10 |
11 | git clone https://github.com/yf225/perf-tests.git
12 |
13 | if [ "$1" == "compare_with_baseline" ]; then
14 | export ARGS=(--compare ../cpu_runtime.json)
15 | elif [ "$1" == "compare_and_update" ]; then
16 | export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
17 | elif [ "$1" == "update_only" ]; then
18 | export ARGS=(--update ../new_cpu_runtime.json)
19 | fi
20 |
21 | if ! python perf-tests/modules/test_cpu_torch_tensor.py "${ARGS[@]}"; then
22 | echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
23 | exit 1
24 | fi
25 | }
26 |
27 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
28 | run_test test_cpu_speed_torch_tensor "$@"
29 | fi
30 |
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | . ./common.sh
5 |
6 | test_gpu_speed_cudnn_lstm () {
7 | echo "Testing: CuDNN LSTM, GPU"
8 |
9 | export OMP_NUM_THREADS=4
10 | export MKL_NUM_THREADS=4
11 |
12 | git clone https://github.com/pytorch/benchmark.git
13 |
14 | cd benchmark/
15 |
16 | git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
17 |
18 | cd scripts/
19 |
20 | SAMPLE_ARRAY=()
21 | NUM_RUNS=$1
22 |
23 | for (( i=1; i<=NUM_RUNS; i++ )) do
24 | runtime=$(get_runtime_of_command python cudnn_lstm.py --skip-cpu-governor-check)
25 | echo "$runtime"
26 | SAMPLE_ARRAY+=("${runtime}")
27 | done
28 |
29 | cd ../..
30 |
31 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
32 | echo "Runtime stats in seconds:"
33 | echo "$stats"
34 |
35 | if [ "$2" == "compare_with_baseline" ]; then
36 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
37 | elif [ "$2" == "compare_and_update" ]; then
38 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
39 | fi
40 | }
41 |
42 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
43 | run_test test_gpu_speed_cudnn_lstm "$@"
44 | fi
45 |
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/test_gpu_speed_lstm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | . ./common.sh
5 |
6 | test_gpu_speed_lstm () {
7 | echo "Testing: LSTM, GPU"
8 |
9 | export OMP_NUM_THREADS=4
10 | export MKL_NUM_THREADS=4
11 |
12 | git clone https://github.com/pytorch/benchmark.git
13 |
14 | cd benchmark/
15 |
16 | git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
17 |
18 | cd scripts/
19 |
20 | SAMPLE_ARRAY=()
21 | NUM_RUNS=$1
22 |
23 | for (( i=1; i<=NUM_RUNS; i++ )) do
24 | runtime=$(get_runtime_of_command python lstm.py --skip-cpu-governor-check)
25 | echo "$runtime"
26 | SAMPLE_ARRAY+=("${runtime}")
27 | done
28 |
29 | cd ../..
30 |
31 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
32 | echo "Runtime stats in seconds:"
33 | echo "$stats"
34 |
35 | if [ "$2" == "compare_with_baseline" ]; then
36 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
37 | elif [ "$2" == "compare_and_update" ]; then
38 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
39 | fi
40 | }
41 |
42 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
43 | run_test test_gpu_speed_lstm "$@"
44 | fi
45 |
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | . ./common.sh
5 |
6 | test_gpu_speed_mlstm () {
7 | echo "Testing: MLSTM, GPU"
8 |
9 | export OMP_NUM_THREADS=4
10 | export MKL_NUM_THREADS=4
11 |
12 | git clone https://github.com/pytorch/benchmark.git
13 |
14 | cd benchmark/
15 |
16 | git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
17 |
18 | cd scripts/
19 |
20 | SAMPLE_ARRAY=()
21 | NUM_RUNS=$1
22 |
23 | for (( i=1; i<=NUM_RUNS; i++ )) do
24 | runtime=$(get_runtime_of_command python mlstm.py --skip-cpu-governor-check)
25 | echo "$runtime"
26 | SAMPLE_ARRAY+=("${runtime}")
27 | done
28 |
29 | cd ../..
30 |
31 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
32 | echo "Runtime stats in seconds:"
33 | echo "$stats"
34 |
35 | if [ "$2" == "compare_with_baseline" ]; then
36 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
37 | elif [ "$2" == "compare_and_update" ]; then
38 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
39 | fi
40 | }
41 |
42 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
43 | run_test test_gpu_speed_mlstm "$@"
44 | fi
45 |
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/test_gpu_speed_mnist.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | . ./common.sh
5 |
6 | test_gpu_speed_mnist () {
7 | echo "Testing: MNIST, GPU"
8 |
9 | export OMP_NUM_THREADS=4
10 | export MKL_NUM_THREADS=4
11 |
12 | git clone https://github.com/pytorch/examples.git -b perftests
13 |
14 | cd examples/mnist
15 |
16 | conda install -c pytorch torchvision
17 |
18 | # Download data
19 | python main.py --epochs 0
20 |
21 | SAMPLE_ARRAY=()
22 | NUM_RUNS=$1
23 |
24 | # Needs a warm-up run to get accurate numbers
25 | python main.py --epochs 1 --no-log
26 |
27 | for (( i=1; i<=NUM_RUNS; i++ )) do
28 | runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)
29 | echo "$runtime"
30 | SAMPLE_ARRAY+=("${runtime}")
31 | done
32 |
33 | cd ../..
34 |
35 | stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
36 | echo "Runtime stats in seconds:"
37 | echo "$stats"
38 |
39 | if [ "$2" == "compare_with_baseline" ]; then
40 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
41 | elif [ "$2" == "compare_and_update" ]; then
42 | python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
43 | fi
44 | }
45 |
46 | if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
47 | run_test test_gpu_speed_mnist "$@"
48 | fi
49 |
--------------------------------------------------------------------------------
/.ci/pytorch/perf_test/update_commit_hash.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 |
4 | data_file_path = sys.argv[1]
5 | commit_hash = sys.argv[2]
6 |
7 | with open(data_file_path) as data_file:
8 | data = json.load(data_file)
9 |
10 | data['commit'] = commit_hash
11 |
12 | with open(data_file_path, 'w') as data_file:
13 | json.dump(data, data_file)
14 |
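15 | # Example invocation (file name and hash are illustrative):
16 | #   python update_commit_hash.py gpu_runtime.json 43dfb2c0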
--------------------------------------------------------------------------------
/.ci/pytorch/print_sccache_log.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | log_file_path = sys.argv[1]
4 |
5 | with open(log_file_path) as f:
6 | lines = f.readlines()
7 |
8 | for line in lines:
9 | # Ignore errors from CPU instruction set, symbol existing testing,
10 | # or compilation error formatting
11 | ignored_keywords = [
12 | 'src.c',
13 | 'CheckSymbolExists.c',
14 | 'test_compilation_error_formatting',
15 | ]
16 | if all([keyword not in line for keyword in ignored_keywords]):
17 | print(line)
18 |
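19 | # Example invocation (path is illustrative):
20 | #   python print_sccache_log.py /tmp/sccache.log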
--------------------------------------------------------------------------------
/.ci/pytorch/run_glootls_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CREATE_TEST_CERT="$(dirname "${BASH_SOURCE[0]}")/create_test_cert.py"
4 | TMP_CERT_DIR=$(python "$CREATE_TEST_CERT")
5 |
6 | openssl verify -CAfile "${TMP_CERT_DIR}/ca.pem" "${TMP_CERT_DIR}/cert.pem"
7 |
8 | export GLOO_DEVICE_TRANSPORT=TCP_TLS
9 | export GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY=${TMP_CERT_DIR}/pkey.key
10 | export GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT=${TMP_CERT_DIR}/cert.pem
11 | export GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE=${TMP_CERT_DIR}/ca.pem
12 |
13 | time python test/run_test.py --include distributed/test_c10d_gloo --verbose -- ProcessGroupGlooTest
14 |
15 | unset GLOO_DEVICE_TRANSPORT
16 | unset GLOO_DEVICE_TRANSPORT_TCP_TLS_PKEY
17 | unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CERT
18 | unset GLOO_DEVICE_TRANSPORT_TCP_TLS_CA_FILE
19 |
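20 | # Unsetting the variables above presumably keeps any tests run later in the
21 | # same shell on Gloo's default (non-TLS) transport.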
--------------------------------------------------------------------------------
/.ci/pytorch/win-test-helpers/choose_runtime_cuda_version.bat:
--------------------------------------------------------------------------------
1 | REM The first argument should be the CUDA version
2 | echo %PATH%
3 | echo %CUDA_PATH%
4 | set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%1\bin;%PATH%
5 |
--------------------------------------------------------------------------------
/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat:
--------------------------------------------------------------------------------
1 | if "%BUILD_ENVIRONMENT%"=="" (
2 | set CONDA_PARENT_DIR=%CD%
3 | ) else (
4 | set CONDA_PARENT_DIR=C:\Jenkins
5 | )
6 |
7 |
8 | :: Be conservative here when rolling out the new AMI with conda. This will try
9 | :: to install conda as before if it couldn't find the conda installation. This
10 | :: can be removed eventually after we gain enough confidence in the AMI
11 | if not exist %CONDA_PARENT_DIR%\Miniconda3 (
12 | set INSTALL_FRESH_CONDA=1
13 | )
14 |
15 | if "%INSTALL_FRESH_CONDA%"=="1" (
16 | curl --retry 3 --retry-all-errors -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
17 | if errorlevel 1 exit /b
18 | if not errorlevel 0 exit /b
19 |
20 | %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
21 | if errorlevel 1 exit /b
22 | if not errorlevel 0 exit /b
23 | )
24 |
25 | :: Activate conda so that we can use its commands, e.g. conda, python, pip
26 | call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
27 |
--------------------------------------------------------------------------------
/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat:
--------------------------------------------------------------------------------
1 | if "%CUDA_VERSION%" == "cpu" (
2 | echo skip magma installation for cpu builds
3 | exit /b 0
4 | )
5 |
6 | rem Remove the dot in CUDA_VERSION, for example 11.1 becomes 111
7 |
8 | if not "%USE_CUDA%"=="1" (
9 | exit /b 0
10 | )
11 |
12 | if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
13 | echo CUDA version %CUDA_VERSION% format isn't correct: it must contain '.'
14 | exit /b 1
15 | )
16 |
17 | set VERSION_SUFFIX=%CUDA_VERSION:.=%
18 | set CUDA_SUFFIX=cuda%VERSION_SUFFIX%
19 |
20 | if "%CUDA_SUFFIX%" == "" (
21 | echo unknown CUDA version, please set `CUDA_VERSION` to 10.2 or higher
22 | exit /b 1
23 | )
24 |
25 | if "%REBUILD%"=="" (
26 | if "%BUILD_ENVIRONMENT%"=="" (
27 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z
28 | ) else (
29 | aws s3 cp s3://ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet
30 | )
31 | if errorlevel 1 exit /b
32 | if not errorlevel 0 exit /b
33 | 7z x -aoa %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma
34 | if errorlevel 1 exit /b
35 | if not errorlevel 0 exit /b
36 | )
37 | set MAGMA_HOME=%TMP_DIR_WIN%\magma
38 |
--------------------------------------------------------------------------------
/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat:
--------------------------------------------------------------------------------
1 | if "%REBUILD%"=="" (
2 | if "%BUILD_ENVIRONMENT%"=="" (
3 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z --output %TMP_DIR_WIN%\mkl.7z
4 | ) else (
5 | aws s3 cp s3://ossci-windows/mkl_2020.2.254.7z %TMP_DIR_WIN%\mkl.7z --quiet
6 | )
7 | if errorlevel 1 exit /b
8 | if not errorlevel 0 exit /b
9 | 7z x -aoa %TMP_DIR_WIN%\mkl.7z -o%TMP_DIR_WIN%\mkl
10 | if errorlevel 1 exit /b
11 | if not errorlevel 0 exit /b
12 | )
13 | set CMAKE_INCLUDE_PATH=%TMP_DIR_WIN%\mkl\include
14 | set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB%
15 |
--------------------------------------------------------------------------------
/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat:
--------------------------------------------------------------------------------
1 | mkdir %TMP_DIR_WIN%\bin
2 |
3 | if "%REBUILD%"=="" (
4 | :check_sccache
5 | %TMP_DIR_WIN%\bin\sccache.exe --show-stats || (
6 | taskkill /im sccache.exe /f /t || ver > nul
7 | del %TMP_DIR_WIN%\bin\sccache.exe || ver > nul
8 | del %TMP_DIR_WIN%\bin\sccache-cl.exe || ver > nul
9 | if "%BUILD_ENVIRONMENT%"=="" (
10 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %TMP_DIR_WIN%\bin\sccache.exe
11 | curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output %TMP_DIR_WIN%\bin\sccache-cl.exe
12 | ) else (
13 | aws s3 cp s3://ossci-windows/sccache.exe %TMP_DIR_WIN%\bin\sccache.exe
14 | aws s3 cp s3://ossci-windows/sccache-cl.exe %TMP_DIR_WIN%\bin\sccache-cl.exe
15 | )
16 | goto :check_sccache
17 | )
18 | )
19 |
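20 | :: The :check_sccache label above forms a retry loop: if --show-stats fails,
21 | :: the binaries are re-downloaded and the check is attempted again.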
--------------------------------------------------------------------------------
/.ci/pytorch/win-test-helpers/test_custom_backend.bat:
--------------------------------------------------------------------------------
1 | call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
2 |
3 | git submodule update --init --recursive third_party/pybind11
4 | cd test\custom_backend
5 |
6 | :: Build the custom backend library.
7 | mkdir build
8 | pushd build
9 |
10 | echo "Executing CMake for custom_backend test..."
11 |
12 | :: Note: Caffe2 does not support MSVC + CUDA + Debug mode (has to be Release mode)
13 | cmake -DCMAKE_PREFIX_PATH=%TMP_DIR_WIN%\build\torch -DCMAKE_BUILD_TYPE=Release -GNinja ..
14 | if ERRORLEVEL 1 exit /b 1
15 |
16 | echo "Executing Ninja for custom_backend test..."
17 |
18 | ninja -v
19 | if ERRORLEVEL 1 exit /b 1
20 |
21 | echo "Ninja succeeded for custom_backend test."
22 |
23 | popd
24 |
25 | :: Run tests Python-side and export a script module.
26 | python test_custom_backend.py -v
27 | if ERRORLEVEL 1 exit /b 1
28 |
29 | python backend.py --export-module-to="build/model.pt"
30 | if ERRORLEVEL 1 exit /b 1
31 |
32 | :: Run tests C++-side and load the exported script module.
33 | cd build
34 | set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
35 | test_custom_backend.exe model.pt
36 | if ERRORLEVEL 1 exit /b 1
37 |
--------------------------------------------------------------------------------
/.ci/pytorch/win-test-helpers/test_distributed.bat:
--------------------------------------------------------------------------------
1 | REM The first argument should be the directory containing the Python interpreter
2 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_common
3 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
4 |
5 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_gloo
6 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
7 |
8 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_nccl
9 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
10 |
11 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
12 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
13 |
14 | %1\python.exe test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
15 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
16 |
17 | %1\python.exe test/run_test.py --verbose -i distributed/test_data_parallel
18 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
19 |
20 | %1\python.exe test/run_test.py --verbose -i distributed/test_store
21 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
22 |
23 | %1\python.exe test/run_test.py --verbose -i distributed/test_pg_wrapper
24 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
25 |
--------------------------------------------------------------------------------
/.ci/pytorch/win-test-helpers/test_python_jit_legacy.bat:
--------------------------------------------------------------------------------
1 | call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
2 |
3 | echo Copying over test times file
4 | copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%"
5 |
6 | pushd test
7 |
8 | echo Run jit_profiling tests
9 | python run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose
10 | if ERRORLEVEL 1 exit /b 1
11 |
12 | popd
13 |
--------------------------------------------------------------------------------
/.ci/pytorch/win-test-helpers/test_python_shard.bat:
--------------------------------------------------------------------------------
1 | call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
2 | :: exit the batch once there's an error
3 | if errorlevel 1 (
4 | echo "setup pytorch env failed"
5 | echo %errorlevel%
6 | exit /b
7 | )
8 |
9 | pushd test
10 |
11 | set GFLAGS_EXE="C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\gflags.exe"
12 | if "%SHARD_NUMBER%" == "1" (
13 | if exist %GFLAGS_EXE% (
14 | echo Some smoke tests
15 | %GFLAGS_EXE% /i python.exe +sls
16 | python %SCRIPT_HELPERS_DIR%\run_python_nn_smoketests.py
17 | if ERRORLEVEL 1 goto fail
18 |
19 | %GFLAGS_EXE% /i python.exe -sls
20 | if ERRORLEVEL 1 goto fail
21 | )
22 | )
23 |
24 | echo Copying over test times file
25 | copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%"
26 |
27 | echo Run nn tests
28 | python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
29 | if ERRORLEVEL 1 goto fail
30 |
31 | popd
32 |
33 | :eof
34 | exit /b 0
35 |
36 | :fail
37 | exit /b 1
38 |
--------------------------------------------------------------------------------
/.cmakelintrc:
--------------------------------------------------------------------------------
1 | filter=-convention/filename,-linelength,-package/consistency,-readability/logic,-readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs
2 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | plugins =
3 | coverage_plugins.jit_plugin
4 | omit =
5 | */tmp*
6 | */Temp/*
7 | */usr/local/lib*
8 | *test/*
9 |
10 | [report]
11 | omit =
12 | */tmp*
13 | */Temp/*
14 | */usr/local/lib*
15 | *test/*
16 |
--------------------------------------------------------------------------------
/.ctags.d/pytorch.ctags:
--------------------------------------------------------------------------------
1 | --exclude=build/*
2 | --exclude=include/*
3 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .gitignore
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.bat text eol=crlf
2 | .circleci/config.yml linguist-generated=true
3 | .github/workflows/generated-*.yml linguist-generated=true
4 | .github/generated-* linguist-generated=true
5 | .github/scripts/gql_mocks.json linguist-generated=true
6 | third_party/LICENSES_BUNDLED.txt linguist-generated=true
7 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/ci-sev.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "⚠️ CI SEV"
3 | about: Tracking incidents for PyTorch's CI infra.
4 | labels: "ci: sev"
5 | ---
6 |
7 | > NOTE: Remember to label this issue with "`ci: sev`"
8 |
9 | **MERGE BLOCKING**
10 |
11 | ## Current Status
12 | *Status could be: preemptive, ongoing, mitigated, closed. Also tell people if they need to take action to fix it (e.g. rebase).*
13 |
14 | ## Error looks like
15 | *Provide some way users can tell that this SEV is causing their issue.*
16 |
17 | ## Incident timeline (all times pacific)
18 | *Include when the incident began, when it was detected, mitigated, root caused, and finally closed.*
19 |
20 |
21 | <details><summary>Click for example</summary>
22 |
23 | e.g.
24 | - 10/30 7:27a incident began
25 | - 10/30 8:30a detected by
26 | - 10/30 9:00 pm root caused as…
27 | - 10/30 9:10 pm mitigated by…
28 | - 10/31 10: am closed by…
29 |
30 | </details>
31 |
32 | ## User impact
33 | *How does this affect users of PyTorch CI?*
34 |
35 | ## Root cause
36 | *What was the root cause of this issue?*
37 |
38 | ## Mitigation
39 | *How did we mitigate the issue?*
40 |
41 | ## Prevention/followups
42 | *How do we prevent issues like this in the future?*
43 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 | - name: Questions
4 | url: https://discuss.pytorch.org/
5 | about: Ask questions and discuss with other PyTorch community members
6 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/disable-ci-jobs.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Disable CI jobs (PyTorch Dev Infra only)
3 | about: Use this template to disable CI jobs
4 | title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
5 | labels: "module: ci"
6 | ---
7 |
8 | > For example, DISABLED pull / win-vs2019-cpu-py3 / test (default). Once
9 | > created, the job will be disabled within 15 minutes. You can check the
10 | > list of disabled jobs at https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json
11 |
12 | > If you need to get this out ASAP instead of waiting for 15 minutes,
13 | > you can manually trigger the workflow at https://github.com/pytorch/test-infra/actions/workflows/update_disabled_tests.yml
14 | > once the issue is created to update the above JSON list right away.
15 |
16 | > Note: you need to have write access to the PyTorch repo to disable CI
17 | > jobs. The issue will be rejected otherwise.
18 |
19 | ## Reason
20 | *Provide a reason why this is needed and when this can be resolved*.
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.yml:
--------------------------------------------------------------------------------
1 | name: 📚 Documentation
2 | description: Report an issue related to https://pytorch.org/docs/stable/index.html
3 |
4 | body:
5 | - type: textarea
6 | attributes:
7 | label: 📚 The doc issue
8 | description: >
9 | A clear and concise description of what content in https://pytorch.org/docs/stable/index.html is an issue. If this has to do with the general https://pytorch.org website, please file an issue at https://github.com/pytorch/pytorch.github.io/issues/new/choose instead. If this has to do with https://pytorch.org/tutorials, please file an issue at https://github.com/pytorch/tutorials/issues/new.
10 | validations:
11 | required: true
12 | - type: textarea
13 | attributes:
14 | label: Suggest a potential alternative/fix
15 | description: >
16 | Tell us how we could improve the documentation in this regard.
17 | - type: markdown
18 | attributes:
19 | value: >
20 | Thanks for contributing 🎉!
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.yml:
--------------------------------------------------------------------------------
1 | name: 🚀 Feature request
2 | description: Submit a proposal/request for a new PyTorch feature
3 |
4 | body:
5 | - type: textarea
6 | attributes:
7 | label: 🚀 The feature, motivation and pitch
8 | description: >
9 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
10 | validations:
11 | required: true
12 | - type: textarea
13 | attributes:
14 | label: Alternatives
15 | description: >
16 | A description of any alternative solutions or features you've considered, if any.
17 | - type: textarea
18 | attributes:
19 | label: Additional context
20 | description: >
21 | Add any other context or screenshots about the feature request.
22 | - type: markdown
23 | attributes:
24 | value: >
25 | Thanks for contributing 🎉!
26 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Fixes #ISSUE_NUMBER
2 |
--------------------------------------------------------------------------------
/.github/actionlint.yaml:
--------------------------------------------------------------------------------
1 | self-hosted-runner:
2 | labels:
3 | - linux.20_04.4x
4 | - linux.20_04.16x
5 | - linux.large
6 | - linux.2xlarge
7 | - linux.4xlarge
8 | - linux.12xlarge
9 | - linux.24xlarge
10 | - linux.4xlarge.nvidia.gpu
11 | - linux.8xlarge.nvidia.gpu
12 | - linux.16xlarge.nvidia.gpu
13 | - linux.g5.4xlarge.nvidia.gpu
14 | - windows.4xlarge
15 | - windows.8xlarge.nvidia.gpu
16 | - windows.g5.4xlarge.nvidia.gpu
17 | - bm-runner
18 | - linux.rocm.gpu
19 | - macos-m1-12
20 | - macos-m1-13
21 | - macos-12-xl
22 | - macos-12
23 | - macos12.3-m1
24 |
--------------------------------------------------------------------------------
/.github/actions/chown-workspace/action.yml:
--------------------------------------------------------------------------------
1 | name: Chown workspace
2 |
3 | description: Ensure that the working directory gets chowned back to the current user
4 |
5 | runs:
6 | using: composite
7 | steps:
8 | - run: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
9 | shell: bash
10 | env:
11 | ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
12 |
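13 | # Build steps often run in containers as root and leave root-owned files in
14 | # the checkout; this action chowns everything back to the runner user so the
15 | # workspace can be cleaned up (our reading of why it exists).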
--------------------------------------------------------------------------------
/.github/actions/diskspace-cleanup/action.yml:
--------------------------------------------------------------------------------
1 | name: Cleans up diskspace
2 |
3 | description: Cleans up diskspace if the root filesystem has used more than the given cutoff (70 percent by default).
4 |
5 | inputs:
6 | diskspace-cutoff:
7 | description: The percent amount after which docker prune is run.
8 | required: true
9 | default: 70
10 |
11 | runs:
12 | using: composite
13 | steps:
14 | - name: Cleans up diskspace
15 | shell: bash
16 | run: |
17 | diskspace_cutoff=${{ inputs.diskspace-cutoff }}
18 | diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
19 | msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
20 | if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
21 | docker system prune -af
22 | diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
23 | if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
24 | echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
25 | echo "$msg"
26 | exit 1
27 | else
28 | difference=$((diskspace - diskspace_new))
29 | echo "Diskspace saved: $difference percent"
30 | fi
31 | fi
32 |
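33 | # Usage sketch in a workflow job (the cutoff value is illustrative):
34 | #   - uses: ./.github/actions/diskspace-cleanup
35 | #     if: always()
36 | #     with:
37 | #       diskspace-cutoff: 80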
--------------------------------------------------------------------------------
/.github/actions/download-build-artifacts/action.yml:
--------------------------------------------------------------------------------
1 | name: Download PyTorch Build Artifacts
2 |
3 | description: Download and unzip artifacts from a previous PyTorch build.
4 |
5 | inputs:
6 | name:
7 | description: Name of what artifact to download
8 | required: true
9 | use-gha:
10 | description: If set to any value, use GHA to download the artifact. Otherwise use s3.
11 | required: false
12 |
13 | runs:
14 | using: composite
15 | steps:
16 | - name: Download PyTorch Build Artifacts from S3
17 | if: ${{ !inputs.use-gha }}
18 | uses: seemethere/download-artifact-s3@v4
19 | with:
20 | name: ${{ inputs.name }}
21 |
22 | - name: Download PyTorch Build Artifacts from GHA
23 | if: inputs.use-gha
24 | uses: actions/download-artifact@v3
25 | with:
26 | name: ${{ inputs.name }}
27 |
28 | - name: Unzip artifacts
29 | shell: bash
30 | run: unzip -o artifacts.zip
31 |
32 | - name: Output disk space left
33 | shell: bash
34 | run: df -H
35 |
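36 | # Usage sketch (the artifact name is illustrative):
37 | #   - uses: ./.github/actions/download-build-artifacts
38 | #     with:
39 | #       name: linux-focal-py3.8-build
40 | #       use-gha: true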
--------------------------------------------------------------------------------
/.github/actions/get-workflow-job-id/action.yml:
--------------------------------------------------------------------------------
1 | name: Get workflow job id
2 |
3 | description: Get the ID of the workflow job that is currently running.
4 |
5 | inputs:
6 | github-token:
7 | description: GITHUB_TOKEN
8 | required: true
9 |
10 | outputs:
11 | job-id:
12 | description: The retrieved workflow job id
13 | value: ${{ steps.get-job-id.outputs.job-id }}
14 |
15 | runs:
16 | using: composite
17 | steps:
18 | - name: Get jobid or fail
19 | # timeout-minutes is unsupported for composite workflows, see https://github.com/actions/runner/issues/1979
20 | # timeout-minutes: 10
21 | shell: bash
22 | id: get-job-id
23 | run: |
24 | set -eux
25 | GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}")
26 | echo "job-id=${GHA_WORKFLOW_JOB_ID}" >> "${GITHUB_OUTPUT}"
27 | env:
28 | GITHUB_TOKEN: ${{ inputs.github-token }}
29 |
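30 | # Usage sketch; the id is then available to later steps as
31 | # ${{ steps.get-job-id.outputs.job-id }} (step id is illustrative):
32 | #   - name: Get workflow job id
33 | #     id: get-job-id
34 | #     uses: ./.github/actions/get-workflow-job-id
35 | #     with:
36 | #       github-token: ${{ secrets.GITHUB_TOKEN }}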
--------------------------------------------------------------------------------
/.github/actions/teardown-rocm/action.yml:
--------------------------------------------------------------------------------
1 | name: Teardown ROCm host
2 |
3 | description: Tear down ROCm host for CI
4 |
5 | runs:
6 | using: composite
7 | steps:
8 | - name: Teardown ROCm
9 | if: always()
10 | shell: bash
11 | run: |
12 | # ignore expansion of "docker ps -q" since it could be empty
13 | # shellcheck disable=SC2046
14 | docker stop $(docker ps -q) || true
15 | # Prune all stopped containers.
16 | docker container prune -f
17 | - name: Runner diskspace health check
18 | uses: ./.github/actions/diskspace-cleanup
19 | if: always()
20 |
--------------------------------------------------------------------------------
/.github/actions/teardown-win/action.yml:
--------------------------------------------------------------------------------
1 | name: Teardown Windows
2 |
3 | description: Tear down the Windows runner workspace after CI
4 |
5 | inputs:
6 | extra-delete-dir:
7 | description: If set, cleaning up the workspace will delete this too
8 | required: false
9 | default: ""
10 |
11 | runs:
12 | using: composite
13 | steps:
14 | - name: Wait until all sessions have drained
15 | shell: powershell
16 | if: always()
17 | run: |
18 | .github\scripts\wait_for_ssh_to_drain.ps1
19 |
20 | - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
21 | shell: powershell
22 | if: always()
23 | run: |
24 | .github\scripts\kill_active_ssh_sessions.ps1
25 |
26 | - name: Cleanup workspace
27 | if: always()
28 | shell: bash
29 | env:
30 | EXTRA_DELETE_DIR: ${{ inputs.extra-delete-dir }}
31 | run: |
32 | [ ! -z "${EXTRA_DELETE_DIR}" ] || rm -rf "${EXTRA_DELETE_DIR}"
33 | rm -rf ./*
34 |
--------------------------------------------------------------------------------
/.github/auto_request_review.yml:
--------------------------------------------------------------------------------
1 | # Documented at https://github.com/necojackarc/auto-request-review
2 | reviewers:
3 | groups:
4 | symbolic-shapes:
5 | - ezyang
6 | - albanD
7 | - miladm
8 | - bdhirsh
9 | - voznesenskym
10 | - jbschlosser
11 |
12 | per_author:
13 | symbolic-shapes:
14 | - symbolic-shapes
15 | - antoniojkim
16 | - wconstab
17 | - SherlockNoMad
18 | Chillee:
19 | - ezyang
20 |
21 | files:
22 | # none yet, TODO: migrate CODEOWNERS here
23 |
24 | options:
25 | ignore_draft: true
26 | ignored_keywords:
27 | - DO NOT REVIEW
28 | # Just manually setup a self-referential per_author rule if you
29 | # want group assignment
30 | enable_group_assignment: false
31 |
--------------------------------------------------------------------------------
/.github/ci_commit_pins/audio.txt:
--------------------------------------------------------------------------------
1 | a8f4e97bd5356a7a77510cdf6a3a62e25a5dc602
--------------------------------------------------------------------------------
/.github/ci_commit_pins/huggingface.txt:
--------------------------------------------------------------------------------
1 | ebee0a27940adfbb30444d83387b9ea0f1173f40
2 |
--------------------------------------------------------------------------------
/.github/ci_commit_pins/multipy.txt:
--------------------------------------------------------------------------------
1 | 7dd29931fa8e9bb7c970f05f8c0dc13b69e17494
2 |
--------------------------------------------------------------------------------
/.github/ci_commit_pins/text.txt:
--------------------------------------------------------------------------------
1 | 5b78d074bd303eb230d30567646fcf0358ee2dd4
2 |
--------------------------------------------------------------------------------
/.github/ci_commit_pins/timm.txt:
--------------------------------------------------------------------------------
1 | 6635bc3f7d06c6a0d0481803b24d6ad0004b61ac
2 |
--------------------------------------------------------------------------------
/.github/ci_commit_pins/torchbench.txt:
--------------------------------------------------------------------------------
1 | 159e58f0b36ee22e2b89d74bd7dc8a79376de01d
2 |
--------------------------------------------------------------------------------
/.github/ci_commit_pins/triton.txt:
--------------------------------------------------------------------------------
1 | ../../.ci/docker/ci_commit_pins/triton.txt
--------------------------------------------------------------------------------
/.github/ci_commit_pins/vision.txt:
--------------------------------------------------------------------------------
1 | b78d98bb152ffb9c0c0f5365f59f475c70b1784e
2 |
--------------------------------------------------------------------------------
/.github/ci_commit_pins/xla.txt:
--------------------------------------------------------------------------------
1 | f235d4da06905b35d75879a0a9bc3034ab7385ac
2 |
--------------------------------------------------------------------------------
/.github/pytorch-circleci-labels.yml:
--------------------------------------------------------------------------------
1 | # For documentation concerning this configuration please refer to,
2 | # https://github.com/pytorch/pytorch-probot#trigger-circleci-workflows
3 | labels_to_circle_params:
4 | ci/binaries:
5 | parameter: run_binary_tests
6 | default_true_on:
7 | branches:
8 | - nightly
9 | - release/.*
10 | tags:
11 | - v[0-9]+(\.[0-9]+)*-rc[0-9]+
12 | set_to_false:
13 | - run_build
14 | ci/master:
15 | parameter: run_master_build
16 | set_to_false:
17 | - run_build
18 | ci/slow-gradcheck:
19 | parameter: run_slow_gradcheck_build
20 | set_to_false:
21 | - run_build
22 |
--------------------------------------------------------------------------------
/.github/pytorch-probot.yml:
--------------------------------------------------------------------------------
1 | tracking_issue: 24422
2 | ciflow_tracking_issue: 64124
3 | ciflow_push_tags:
4 | - ciflow/binaries
5 | - ciflow/binaries_conda
6 | - ciflow/binaries_libtorch
7 | - ciflow/binaries_wheel
8 | - ciflow/inductor
9 | - ciflow/inductor-perf-compare
10 | - ciflow/mps
11 | - ciflow/nightly
12 | - ciflow/periodic
13 | - ciflow/slow
14 | - ciflow/trunk
15 | - ciflow/unstable
16 | retryable_workflows:
17 | - lint
18 | - pull
19 | - trunk
20 | - linux-binary
21 | - windows-binary
22 |
--------------------------------------------------------------------------------
/.github/regenerate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 |
3 | # Allows this script to be invoked from any directory:
4 | cd "$(dirname "$0")"
5 |
6 | python3 scripts/generate_ci_workflows.py
7 |
--------------------------------------------------------------------------------
/.github/requirements-gha-cache.txt:
--------------------------------------------------------------------------------
1 | # This file is to cache other dependencies not specified elsewhere in:
2 | # requirement.txt
3 | # requirements-flake8.txt
4 | # docs/requirements.txt
5 | # docs/cpp/requirements.txt
6 | # functorch/docs/requirements.txt
7 | # .ci/docker/requirements-ci.txt
8 | boto3==1.19.12
9 | jinja2==3.0.1
10 | lintrunner==0.10.7
11 | ninja==1.10.0.post1
12 | nvidia-ml-py==11.525.84
13 | pyyaml==6.0
14 | requests==2.26
15 | rich==10.9.0
16 | rockset==1.0.3
17 |
--------------------------------------------------------------------------------
/.github/requirements/conda-env-Linux-X64:
--------------------------------------------------------------------------------
1 | cmake=3.22.*
2 | mkl=2022.1.0
3 | mkl-include=2022.1.0
4 | ninja=1.10.2
5 | numpy=1.23.3
6 | pyyaml=6.0
7 | requests=2.28.1
8 | setuptools=65.5.0
9 | typing-extensions=4.3.0
10 |
--------------------------------------------------------------------------------
/.github/requirements/conda-env-iOS:
--------------------------------------------------------------------------------
1 | blas=1.0
2 | cmake=3.22.1
3 | mkl=2022.1.0
4 | mkl-include=2022.1.0
5 | ninja=1.10.2
6 | numpy=1.23.3
7 | pyyaml=6.0
8 | requests=2.28.1
9 | setuptools=63.4.1
10 | typing-extensions=4.3.0
11 |
--------------------------------------------------------------------------------
/.github/requirements/conda-env-macOS-ARM64:
--------------------------------------------------------------------------------
1 | numpy=1.22.3
2 | pyyaml=6.0
3 | setuptools=61.2.0
4 | cmake=3.22.*
5 | typing-extensions=4.3.0
6 | dataclasses=0.8
7 | pip=22.2.2
8 | pillow=9.2.0
9 | pkg-config=0.29.2
10 | wheel=0.37.1
11 | expecttest=0.1.3
12 |
13 | # Not pinning certifi so that we can always get the latest certificates
14 | certifi
15 |
16 | # Cross-compiling arm64 from x86-64 picks up 1.40.0 while testing on arm64
17 | # itself only has up to 1.39.0 from upstream conda. Both work though
18 | libuv>=1.39.0,<=1.40.0
19 |
--------------------------------------------------------------------------------
/.github/requirements/conda-env-macOS-X64:
--------------------------------------------------------------------------------
1 | mkl=2021.2.0
2 | mkl-include=2021.2.0
3 | numpy=1.21.2
4 | pyyaml=5.3
5 | setuptools=46.0.0
6 | cmake=3.22.*
7 | typing-extensions=4.3.0
8 | dataclasses=0.8
9 | pip=22.2.2
10 | pillow=9.2.0
11 | libuv=1.40.0
12 | pkg-config=0.29.2
13 | wheel=0.37.1
14 |
15 | # Not pinning certifi so that we can always get the latest certificates
16 | certifi
17 |
--------------------------------------------------------------------------------
/.github/requirements/pip-requirements-iOS.txt:
--------------------------------------------------------------------------------
1 | # iOS simulator requirements
2 | coremltools==5.0b5
3 | protobuf==3.20.2
4 |
--------------------------------------------------------------------------------
/.github/requirements/pip-requirements-macOS.txt:
--------------------------------------------------------------------------------
1 | boto3==1.19.12
2 | hypothesis==6.56.4
3 | expecttest==0.1.3
4 | librosa>=0.6.2
5 | mpmath==1.2.1
6 | networkx==2.8.7
7 | # Use numba-0.49.1 or older on Intel Macs, but 0.56.0 on M1 machines, as older numba builds are not available for arm64
8 | numba==0.56.0; platform_machine == "arm64"
9 | numba<=0.49.1; platform_machine != "arm64"
10 | opt-einsum>=3.3
11 | psutil==5.9.1
12 | nvidia-ml-py==11.525.84
13 | pygments==2.12.0
14 | pytest==7.2.0
15 | pytest-xdist==3.0.2
16 | pytest-rerunfailures==10.3
17 | pytest-flakefinder==1.1.0
18 | pytest-shard==0.1.2
19 | scipy==1.9.0
20 | sympy==1.11.1
21 | unittest-xml-reporting<=3.2.0,>=2.0.0
22 | xdoctest==1.1.0
23 | filelock==3.6.0
24 |
--------------------------------------------------------------------------------
/.github/requirements/regenerate-requirements.txt:
--------------------------------------------------------------------------------
1 | typing-extensions
2 | jinja2
3 |
--------------------------------------------------------------------------------
/.github/scripts/comment_on_pr.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Any
3 |
4 | from github_utils import gh_post_pr_comment
5 | from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
6 | from trymerge_explainer import BOT_COMMANDS_WIKI
7 |
8 |
9 | def parse_args() -> Any:
10 | from argparse import ArgumentParser
11 |
12 | parser = ArgumentParser("Comment on a PR")
13 | parser.add_argument("pr_num", type=int)
14 | parser.add_argument("action", type=str)
15 | return parser.parse_args()
16 |
17 |
18 | def main() -> None:
19 | args = parse_args()
20 | repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=True)
21 | org, project = repo.gh_owner_and_name()
22 | run_url = os.environ.get("GH_RUN_URL")
23 |
24 | job_link = f"[job]({run_url})" if run_url is not None else "job"
25 | msg = (
26 | f"The {args.action} {job_link} was canceled. If you believe this is a mistake,"
27 | + f"then you can re trigger it through [pytorch-bot]({BOT_COMMANDS_WIKI})."
28 | )
29 |
30 | gh_post_pr_comment(org, project, args.pr_num, msg)
31 | print(org, project, args.pr_num, msg)
32 |
33 |
34 | if __name__ == "__main__":
35 | main()
36 |
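A usage sketch (all values below are placeholders, not taken from a real run): a workflow step passes the PR number and action as positional args, optionally setting `GH_RUN_URL`, e.g. `GH_RUN_URL=... python .github/scripts/comment_on_pr.py 12345 merge`. The assembled message then looks like this:

```
# Sketch of the message assembly above; run_url and the action name are
# placeholder values, not from a real workflow run.
run_url = "https://github.com/pytorch/pytorch/actions/runs/0"
job_link = f"[job]({run_url})" if run_url is not None else "job"
msg = (
    f"The merge {job_link} was canceled. If you believe this is a mistake, "
    + "then you can re-trigger it through [pytorch-bot](<wiki url>)."
)
print(msg)
```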
--------------------------------------------------------------------------------
/.github/scripts/export_pytorch_labels.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Test ownership was introduced in https://github.com/pytorch/pytorch/issues/66232.
4 |
5 | As a part of enforcing test ownership, we want to maintain a list of existing PyTorch labels
6 | to verify the owners' existence. This script outputs a file containing a list of existing
7 | pytorch/pytorch labels so that the file can be uploaded to S3.
8 |
9 | This script assumes the correct env vars are set for AWS permissions.
10 |
11 | """
12 |
13 | import json
14 | from typing import Any
15 |
16 | import boto3 # type: ignore[import]
17 |
18 | from label_utils import gh_get_labels
19 |
20 |
21 | def parse_args() -> Any:
22 | from argparse import ArgumentParser
23 |
24 | parser = ArgumentParser("Export PR labels")
25 | parser.add_argument("org", type=str)
26 | parser.add_argument("repo", type=str)
27 |
28 | return parser.parse_args()
29 |
30 |
31 | def main() -> None:
32 | args = parse_args()
33 | print(f"Exporting labels for {args.org}/{args.repo}")
34 | labels_file_name = "pytorch_labels.json"
35 | obj = boto3.resource("s3").Object("ossci-metrics", labels_file_name)
36 | obj.put(Body=json.dumps(gh_get_labels(args.org, args.repo)).encode())
37 |
38 |
39 | if __name__ == "__main__":
40 | main()
41 |
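A sketch of the flow with the network calls stubbed out; the label names are placeholders, and a real run (`python .github/scripts/export_pytorch_labels.py pytorch pytorch`) needs AWS credentials in the environment:

```
# The bytes printed here are exactly what obj.put(Body=...) would upload;
# the label list is a stand-in for gh_get_labels(args.org, args.repo).
import json

labels = ["module: ci", "triaged"]
print(json.dumps(labels).encode())
```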
--------------------------------------------------------------------------------
/.github/scripts/kill_active_ssh_sessions.ps1:
--------------------------------------------------------------------------------
1 | function Get-SSH-Sessions {
2 | Get-Process sshd -IncludeUserName |
3 | Where-Object UserName -notLike "*SYSTEM*" |
4 | Select-Object Id
5 | }
6 |
7 | $runningSessions = Get-SSH-Sessions
8 |
9 | foreach ($session in $runningSessions) {
10 | Stop-Process -id $session.Id
11 | }
12 |
--------------------------------------------------------------------------------
/.github/scripts/on_cancel_merge.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from typing import Any
3 |
4 | from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
5 | from trymerge import GitHubPR, MERGE_IN_PROGRESS_LABEL
6 |
7 |
8 | def parse_args() -> Any:
9 | parser = argparse.ArgumentParser(
10 | description="Perform actions when a merge workflow is cancelled"
11 | )
12 | parser.add_argument(
13 | "--pr-num",
14 | type=int,
15 | required=True,
16 | help="The PR number to cancel the merge for",
17 | )
18 | return parser.parse_args()
19 |
20 |
21 | def main() -> None:
22 | args = parse_args()
23 | repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=True)
24 | org, project = repo.gh_owner_and_name()
25 | pr_num = args.pr_num
26 |
27 | GitHubPR(org, project, pr_num).remove_label(MERGE_IN_PROGRESS_LABEL)
28 |
29 |
30 | if __name__ == "__main__":
31 | main()
32 |
--------------------------------------------------------------------------------
/.github/scripts/parse_ref.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import re
5 |
6 |
7 | def set_output(name: str, val: str) -> None:
8 | if os.getenv("GITHUB_OUTPUT"):
9 | with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
10 | print(f"{name}={val}", file=env)
11 | else:
12 | print(f"::set-output name={name}::{val}")
13 |
14 |
15 | def main() -> None:
16 | ref = os.environ["GITHUB_REF"]
17 | m = re.match(r"^refs/(\w+)/(.*)$", ref)
18 | if m:
19 | category, stripped = m.groups()
20 | if category == "heads":
21 | set_output("branch", stripped)
22 | elif category == "pull":
23 | set_output("branch", "pull/" + stripped.split("/")[0])
24 | elif category == "tags":
25 | set_output("tag", stripped)
26 |
27 |
28 | if __name__ == "__main__":
29 | main()
30 |
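A few worked examples of the mapping, reusing the regex above with illustrative refs:

```
# refs/heads/main     -> branch=main
# refs/pull/123/merge -> branch=pull/123
# refs/tags/v2.0.1    -> tag=v2.0.1
import re

for ref in ["refs/heads/main", "refs/pull/123/merge", "refs/tags/v2.0.1"]:
    category, stripped = re.match(r"^refs/(\w+)/(.*)$", ref).groups()
    print(category, stripped)
```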
--------------------------------------------------------------------------------
/.github/scripts/report_git_status.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | CHANGES=$(git status --porcelain "$1")
3 | echo "$CHANGES"
4 | git diff "$1"
5 | [ -z "$CHANGES" ]
6 |
--------------------------------------------------------------------------------
/.github/scripts/stop_runner_service.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set +e
4 | set -x
5 |
6 | # Get the service name
7 | RUNNER_SERVICE=$(cat "${RUNNER_WORKSPACE}/../../.service")
8 | echo "GitHub self-hosted runner service: ${RUNNER_SERVICE}"
9 |
10 | if [[ -n "${RUNNER_SERVICE}" ]]; then
11 | echo "The self-hosted runner has encountered an unrecoverable error and will be shutdown"
12 |
13 | pushd "${RUNNER_WORKSPACE}/../../"
14 | # Stop it to prevent the runner from receiving new jobs
15 | sudo ./svc.sh stop
16 | # then uninstall the service
17 | sudo ./svc.sh uninstall
18 |   # Finally, shut down the runner completely
19 |   sudo shutdown -P now
20 |   # NB: In my test, cleaning up and shutting down the runner this way already
21 |   # removes the runner from the list of registered runners. Calling config.sh remove
22 |   # seems redundant, as it would require an org token, which I don't want to
23 |   # add as yet another secret to the CI if there is no need.
24 | fi
25 |
--------------------------------------------------------------------------------
/.github/scripts/wait_for_ssh_to_drain.ps1:
--------------------------------------------------------------------------------
1 | function Get-SSH-Users {
2 | # Gets ssh sessions for all users not named SYSTEM
3 | Get-CimInstance -ClassName Win32_Process -Filter "Name = 'sshd.exe'" |
4 | Get-CimAssociatedInstance -Association Win32_SessionProcess |
5 | Get-CimAssociatedInstance -Association Win32_LoggedOnUser |
6 | Where-Object {$_.Name -ne 'SYSTEM'} |
7 | Measure-Object
8 | }
9 |
10 | $usersLoggedOn = Get-SSH-Users
11 |
12 | Write-Output "Holding runner until all ssh sessions have logged out"
13 | while ($usersLoggedOn.Count -gt 0) {
14 | $usersLoggedOn = Get-SSH-Users
15 | Write-Output "."
16 | Start-Sleep -s 5
17 | }
18 |
--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | include_trailing_comma=True
3 | multi_line_output=3
4 | skip=third_party
5 | skip_gitignore=True
6 | use_parentheses=True
7 |
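For reference, `multi_line_output=3` is isort's vertical-hanging-indent style; combined with `use_parentheses` and `include_trailing_comma`, a long import comes out like this sketch (the imported names are illustrative only):

```
from torch.utils.data import (
    DataLoader,
    Dataset,
    RandomSampler,
)
```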
--------------------------------------------------------------------------------
/.lldbinit:
--------------------------------------------------------------------------------
1 | # automatically load the pytorch_lldb extension.
2 | #
3 | # lldb automatically tries to load this file whenever it is executed from the
4 | # root of the pytorch repo, but by default it is not allowed to do so due to
5 | # security reasons. If you want to use pytorch_lldb, please add the following
6 | # line to your ~/.lldbinit (i.e., the .lldbinit file which is in your home
7 | # directory, NOT this file):
8 | # settings set target.load-cwd-lldbinit true
9 | # settings set escape-non-printables false
10 | #
11 | # Alternatively, you can manually load the pytorch_lldb commands into your
12 | # existing lldb session by doing the following:
13 | # (lldb) command script import tools/lldb/pytorch_lldb.py
14 |
15 | command script import tools/lldb/pytorch_lldb.py
16 | settings set escape-non-printables false
17 | type category enable torch
18 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is not a real branch.
2 | Please check out `main`.
3 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Reporting Security Issues
2 |
3 | If you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
4 |
5 | Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new
6 |
7 | Please refer to the following page for our responsible disclosure policy, reward guidelines, and what should not be reported:
8 |
9 | https://www.facebook.com/whitehat
10 |
--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch Benchmarks
2 |
3 | This folder contains scripts that produce reproducible timings of various PyTorch features.
4 |
5 | It also provides mechanisms to compare PyTorch with other frameworks.
6 |
7 | ## Setup environment
8 | Make sure you're on a machine with CUDA available, then install torchvision and pytorch in the following order:
9 | ```
10 | # Install torchvision. It comes with the pytorch stable release binary
11 | conda install pytorch torchvision -c pytorch
12 |
13 | # Install the latest pytorch master from source.
14 | # It should supersede the installation from the release binary.
15 | cd $PYTORCH_HOME
16 | python setup.py build develop
17 |
18 | # Check the pytorch installation version
19 | python -c "import torch; print(torch.__version__)"
20 | ```
21 |
22 | ## Benchmark List
23 |
24 | Please refer to each subfolder to discover each benchmark suite.
25 |
26 | * [Fast RNNs benchmarks](fastrnns/README.md)
27 |
--------------------------------------------------------------------------------
/benchmarks/compare.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m fastrnns.bench --fuser=old --group=rnns --print-json oss > old.json
3 | python -m fastrnns.bench --fuser=te --group=rnns --print-json oss > te.json
4 | python compare-fastrnn-results.py old.json te.json --format md
5 |
--------------------------------------------------------------------------------
/benchmarks/cpp/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(convolution_bench convolution.cpp)
2 | target_link_libraries(convolution_bench PRIVATE torch_library benchmark)
3 |
--------------------------------------------------------------------------------
/benchmarks/cpp/tensorexpr/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | find_package(AVX)
2 |
3 | add_executable(
4 | tensorexpr_bench
5 | bench_approx.cpp
6 | bench_batchnorm.cpp
7 | bench_concat.cpp
8 | bench_compile.cpp
9 | bench_signed_log1p.cpp
10 | bench_fuser_overhead.cpp
11 | bench_gemm.cpp
12 | bench_kernels.cpp
13 | bench_parallel.cpp
14 | bench_prefix_sum.cpp
15 | bench_reduce.cpp
16 | main.cpp)
17 |
18 | if(C_AVX2_FOUND)
19 | message(STATUS "AVX2 compiler support found")
20 | target_compile_options(tensorexpr_bench PUBLIC -mavx2)
21 | target_compile_definitions(tensorexpr_bench PUBLIC USE_AVX2)
22 | endif()
23 |
24 | target_link_libraries(tensorexpr_bench PRIVATE torch_library benchmark)
25 |
--------------------------------------------------------------------------------
/benchmarks/cpp/tensorexpr/main.cpp:
--------------------------------------------------------------------------------
1 | #include <benchmark/benchmark.h>
2 |
3 | BENCHMARK_MAIN();
4 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/configurations/data_configurations.json:
--------------------------------------------------------------------------------
1 | {
2 | "DummyData": {
3 | "data_class": "DummyData",
4 | "configurations": {
5 | "max_val": 1024,
6 | "sample_count": 1024,
7 | "sample_length": 1024,
8 | "sparsity_percentage": 20
9 | }
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/configurations/model_configurations.json:
--------------------------------------------------------------------------------
1 | {
2 | "DummyModel": {
3 | "model_class": "DummyModel",
4 | "configurations": {
5 | "num_embeddings": 1024,
6 | "embedding_dim": 1024,
7 | "dense_input_size": 1024,
8 | "dense_output_size": 1024,
9 | "dense_layers_count": 8,
10 | "sparse": false
11 | }
12 | },
13 | "DummyModelSparse": {
14 | "model_class": "DummyModel",
15 | "configurations": {
16 | "num_embeddings": 1024,
17 | "embedding_dim": 1024,
18 | "dense_input_size": 1024,
19 | "dense_output_size": 1024,
20 | "dense_layers_count": 8,
21 | "sparse": true
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .DummyData import DummyData
2 |
3 | data_map = {
4 | "DummyData": DummyData
5 | }
6 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/metrics/CPUMetric.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from .MetricBase import MetricBase
4 |
5 |
6 | class CPUMetric(MetricBase):
7 | def __init__(self, name: str):
8 | self.name = name
9 | self.start = None
10 | self.end = None
11 |
12 | def record_start(self):
13 | self.start = time.time()
14 |
15 | def record_end(self):
16 | self.end = time.time()
17 |
18 | def elapsed_time(self):
19 | if self.start is None:
20 | raise RuntimeError("start is None")
21 | if self.end is None:
22 | raise RuntimeError("end is None")
23 | return self.end - self.start
24 |
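A minimal usage sketch (assuming CPUMetric is imported from this module); `elapsed_time()` returns seconds, since it is a `time.time()` difference:

```
import time

m = CPUMetric("forward")
m.record_start()
time.sleep(0.1)  # stand-in for the measured work
m.record_end()
print(m.elapsed_time())  # ~0.1 seconds
```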
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/metrics/CUDAMetric.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from .MetricBase import MetricBase
4 |
5 |
6 | class CUDAMetric(MetricBase):
7 | def __init__(self, rank: int, name: str):
8 | self.rank = rank
9 | self.name = name
10 | self.start = None
11 | self.end = None
12 |
13 | def record_start(self):
14 | self.start = torch.cuda.Event(enable_timing=True)
15 | with torch.cuda.device(self.rank):
16 | self.start.record()
17 |
18 | def record_end(self):
19 | self.end = torch.cuda.Event(enable_timing=True)
20 | with torch.cuda.device(self.rank):
21 | self.end.record()
22 |
23 | def elapsed_time(self):
24 | if not self.start.query():
25 | raise RuntimeError("start event did not complete")
26 | if not self.end.query():
27 | raise RuntimeError("end event did not complete")
28 | return self.start.elapsed_time(self.end)
29 |
30 | def synchronize(self):
31 | self.start.synchronize()
32 | self.end.synchronize()
33 |
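A usage sketch (requires a CUDA device; assuming CUDAMetric is imported from this module). Two CUDA-specific details matter here: events record asynchronously on the stream, so `synchronize()` must run before reading the timing, and `torch.cuda.Event.elapsed_time` returns milliseconds, not seconds:

```
import torch

m = CUDAMetric(rank=0, name="matmul")
x = torch.randn(1024, 1024, device="cuda:0")
m.record_start()
x @ x
m.record_end()
m.synchronize()          # wait for both events before reading
print(m.elapsed_time())  # milliseconds
```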
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/metrics/MetricBase.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 |
4 | class MetricBase(ABC):
5 | def __init__(self, name):
6 | self.name = name
7 | self.start = None
8 | self.end = None
9 |
10 | @abstractmethod
11 | def record_start(self):
12 | return
13 |
14 | @abstractmethod
15 | def record_end(self):
16 | return
17 |
18 | @abstractmethod
19 | def elapsed_time(self):
20 | return
21 |
22 | def get_name(self):
23 | return self.name
24 |
25 | def get_end(self):
26 | return self.end
27 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/models/DummyModel.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 |
4 |
5 | class DummyModel(nn.Module):
6 | def __init__(
7 | self,
8 | num_embeddings: int,
9 | embedding_dim: int,
10 | dense_input_size: int,
11 | dense_output_size: int,
12 | dense_layers_count: int,
13 | sparse: bool
14 | ):
15 | r"""
16 | A dummy model with an EmbeddingBag Layer and Dense Layer.
17 | Args:
18 | num_embeddings (int): size of the dictionary of embeddings
19 | embedding_dim (int): the size of each embedding vector
20 | dense_input_size (int): size of each input sample
21 | dense_output_size (int): size of each output sample
22 |             dense_layers_count (int): number of dense layers in the dense Sequential module
23 | sparse (bool): if True, gradient w.r.t. weight matrix will be a sparse tensor
24 | """
25 | super().__init__()
26 | self.embedding = nn.EmbeddingBag(
27 | num_embeddings, embedding_dim, sparse=sparse
28 | )
29 | self.dense = nn.Sequential(*[nn.Linear(dense_input_size, dense_output_size) for _ in range(dense_layers_count)])
30 |
31 | def forward(self, x):
32 | x = self.embedding(x)
33 | return F.softmax(self.dense(x), dim=1)
34 |
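A shape-checking sketch with small illustrative sizes; note that `dense_input_size` must equal `embedding_dim` (and, with more than one dense layer, `dense_output_size` must equal `dense_input_size`) for the stacked Linear layers to compose:

```
import torch

model = DummyModel(
    num_embeddings=100,
    embedding_dim=8,
    dense_input_size=8,
    dense_output_size=8,
    dense_layers_count=2,
    sparse=False,
)
x = torch.randint(0, 100, (4, 10))  # 4 bags of 10 indices each
out = model(x)                      # shape (4, 8); each row sums to 1 (softmax)
```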
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .DummyModel import DummyModel
2 |
3 | model_map = {
4 | "DummyModel": DummyModel
5 | }
6 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/server/__init__.py:
--------------------------------------------------------------------------------
1 | from .server import AverageBatchParameterServer, AverageParameterServer
2 |
3 | server_map = {
4 | "AverageParameterServer": AverageParameterServer,
5 | "AverageBatchParameterServer": AverageBatchParameterServer
6 | }
7 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/trainer/__init__.py:
--------------------------------------------------------------------------------
1 | from .criterions import cel
2 | from .ddp_models import basic_ddp_model
3 | from .hook_states import BasicHookState
4 | from .hooks import allreduce_hook, hybrid_hook, rpc_hook, sparse_rpc_hook
5 | from .iteration_steps import basic_iteration_step
6 | from .preprocess_data import preprocess_dummy_data
7 | from .trainer import DdpTrainer
8 |
9 | criterion_map = {
10 | "cel": cel
11 | }
12 |
13 | ddp_hook_map = {
14 | "allreduce_hook": allreduce_hook,
15 | "hybrid_hook": hybrid_hook,
16 | "rpc_hook": rpc_hook,
17 | "sparse_rpc_hook": sparse_rpc_hook
18 | }
19 |
20 | ddp_model_map = {
21 | "basic_ddp_model": basic_ddp_model
22 | }
23 |
24 | iteration_step_map = {
25 | "basic_iteration_step": basic_iteration_step
26 | }
27 |
28 | preprocess_data_map = {
29 | "preprocess_dummy_data": preprocess_dummy_data
30 | }
31 |
32 | hook_state_map = {
33 | "BasicHookState": BasicHookState
34 | }
35 |
36 | trainer_map = {
37 | "DdpTrainer": DdpTrainer
38 | }
39 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/trainer/criterions.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 |
4 | def cel(rank):
5 | r"""A function that creates a CrossEntropyLoss
6 | criterion for training.
7 | Args:
8 | rank (int): worker rank
9 | """
10 | return nn.CrossEntropyLoss().cuda(rank)
11 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py:
--------------------------------------------------------------------------------
1 | from torch.nn.parallel import DistributedDataParallel as DDP
2 |
3 |
4 | def basic_ddp_model(self, rank, model, process_group, hook_state, hook):
5 | r"""
6 | A function that creates a ddp_model and hook_state objects.
7 | The ddp model is initialized with a single device id and
8 | the process group. The ddp_model also registers the communication
9 | hook.
10 | Args:
11 | rank (int): worker rank
12 | model (nn.Module): neural network model
13 | process_group (ProcessGroup): distributed process group
14 | hook_state (class): class that will be used to keep track of state
15 | during training.
16 | hook (function): ddp communication hook
17 | """
18 | ddp_model = DDP(
19 | model, device_ids=[rank], process_group=process_group
20 | )
21 | hook_state = hook_state(self, process_group)
22 | ddp_model.register_comm_hook(hook_state, hook)
23 | return ddp_model, hook_state
24 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/trainer/hook_states.py:
--------------------------------------------------------------------------------
1 | class BasicHookState:
2 |
3 | def __init__(self, cref, process_group):
4 | r"""
5 | A class that holds state information that is needed by the communication hook
6 | during the training algorithm.
7 | Args:
8 |             cref (DdpTrainer): reference to the trainer instance (its self)
9 | process_group (ProcessGroup): distributed process group
10 | """
11 | self.cref = cref
12 | self.process_group = process_group
13 | self.batch_number = -1
14 |
15 | def get_key(self, bucket_index):
16 | r"""
17 | A method that returns an encoded key that represents the current batch and
18 | bucket index.
19 | Args:
20 | bucket_index (int): index of the bucket being processed in backward
21 | """
22 | return f"{self.batch_number},{bucket_index}"
23 |
24 | def next_batch(self):
25 | r"""
26 | A method that increments batch_number by 1.
27 | """
28 | self.batch_number += 1
29 |
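A small sketch of the key encoding (cref and process_group are stubbed with None, which suffices for these two methods): batch_number starts at -1, so the first `next_batch()` call labels the first batch 0:

```
state = BasicHookState(cref=None, process_group=None)
state.next_batch()
print(state.get_key(bucket_index=3))  # "0,3"
state.next_batch()
print(state.get_key(bucket_index=0))  # "1,0"
```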
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/trainer/iteration_steps.py:
--------------------------------------------------------------------------------
1 | def basic_iteration_step(self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch):
2 | r"""
3 | A function that performs an iteration of training.
4 | Args:
5 | ddp_model (nn.Module): distributed data parallel model
6 | criterion (nn.Module): loss function to measure model
7 | optimizer (optim.Optimizer): updates model parameters
8 | hook_state (object): ddp communication hook state object
9 | epoch (int): index of pass through the data
10 |         index (int): zero-based index of the current batch within the epoch
11 | batch (list): training examples
12 | """
13 | hook_state.next_batch()
14 | self.record_batch_start(self.epoch_key(epoch, index))
15 | optimizer.zero_grad()
16 | self.record_forward_start(self.epoch_key(epoch, index))
17 | loss = criterion(ddp_model(batch[0]), batch[1])
18 | self.record_forward_end(self.epoch_key(epoch, index))
19 | self.record_backward_start(self.epoch_key(epoch, index))
20 | loss.backward()
21 | self.record_backward_end(self.epoch_key(epoch, index))
22 | optimizer.step()
23 | self.record_batch_end(self.epoch_key(epoch, index))
24 |
--------------------------------------------------------------------------------
/benchmarks/distributed/rpc/parameter_server/trainer/preprocess_data.py:
--------------------------------------------------------------------------------
1 | def preprocess_dummy_data(rank, data):
2 | r"""
3 | A function that moves the data from CPU to GPU
4 | for DummyData class.
5 | Args:
6 | rank (int): worker rank
7 | data (list): training examples
8 | """
9 | for i in range(len(data)):
10 | data[i][0] = data[i][0].cuda(rank)
11 | data[i][1] = data[i][1].cuda(rank)
12 | return data
13 |
--------------------------------------------------------------------------------
/benchmarks/dynamo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/dynamo/__init__.py
--------------------------------------------------------------------------------
/benchmarks/dynamo/check_csv.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | import textwrap
4 |
5 | import pandas as pd
6 |
7 |
8 | def check_csv(filename):
9 | """
10 | Basic accuracy checking.
11 | """
12 |
13 | df = pd.read_csv(filename)
14 |
15 | failed = []
16 | for _, row in df.iterrows():
17 | model_name = row["name"]
18 | status = row["accuracy"]
19 | if "pass" not in status:
20 | failed.append(model_name)
21 |
22 | print(f"{model_name:34} {status}")
23 |
24 | if failed:
25 | print(
26 | textwrap.dedent(
27 | f"""
28 | Error {len(failed)} models failed
29 | {' '.join(failed)}
30 | """
31 | )
32 | )
33 | sys.exit(1)
34 |
35 |
36 | if __name__ == "__main__":
37 | parser = argparse.ArgumentParser()
38 | parser.add_argument("--file", "-f", type=str, help="csv file name")
39 | args = parser.parse_args()
40 | check_csv(args.file)
41 |
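The check treats any accuracy cell containing the substring "pass" as passing, which is why `pass_due_to_skip` (seen in the expected-accuracy CSVs below) also passes. A self-contained sketch with made-up rows:

```
import pandas as pd

df = pd.DataFrame(
    {
        "name": ["resnet18", "hf_T5_large", "yolov3"],
        "accuracy": ["pass", "pass_due_to_skip", "fail_accuracy"],
    }
)
failed = [row["name"] for _, row in df.iterrows() if "pass" not in row["accuracy"]]
print(failed)  # ['yolov3']
```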
--------------------------------------------------------------------------------
/benchmarks/dynamo/check_hf_bert_perf_csv.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | import textwrap
4 |
5 | import pandas as pd
6 |
7 |
8 | def check_hf_bert_perf_csv(filename):
9 | """
10 | Basic performance checking.
11 | """
12 |
13 | df = pd.read_csv(filename)
14 |
15 | failed = []
16 | for _, row in df.iterrows():
17 | model_name = row["name"]
18 | speedup = row["speedup"]
19 |         # Reduced from 1.165 to 1.160, see https://github.com/pytorch/pytorch/issues/96530
20 |         # Reduced from 1.160 to 1.140 after a transformers version upgrade, see https://github.com/pytorch/benchmark/pull/1406
21 |         # The threshold has not been moved back to 1.16 even after the extra graph break issue was fixed upstream in transformers
22 | if speedup < 1.150:
23 | failed.append(model_name)
24 |
25 | print(f"{model_name:34} {speedup}")
26 |
27 | if failed:
28 | print(
29 | textwrap.dedent(
30 | f"""
31 | Error {len(failed)} models performance regressed
32 | {' '.join(failed)}
33 | """
34 | )
35 | )
36 | sys.exit(1)
37 |
38 |
39 | if __name__ == "__main__":
40 | parser = argparse.ArgumentParser()
41 | parser.add_argument("--file", "-f", type=str, help="csv file name")
42 | args = parser.parse_args()
43 | check_hf_bert_perf_csv(args.file)
44 |
--------------------------------------------------------------------------------
/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_dynamic_training.csv:
--------------------------------------------------------------------------------
1 | name,accuracy,graph_breaks
2 | AlbertForMaskedLM,pass,7
3 | AlbertForQuestionAnswering,pass,7
4 | BartForCausalLM,pass,7
5 | BertForMaskedLM,pass,7
6 | BertForQuestionAnswering,pass,7
7 | BlenderbotSmallForCausalLM,pass,7
8 | BlenderbotSmallForConditionalGeneration,pass,7
9 | CamemBert,pass,7
10 | DebertaForMaskedLM,pass,52
11 | DebertaForQuestionAnswering,pass,52
12 | DebertaV2ForMaskedLM,pass_due_to_skip,0
13 | DistilBertForMaskedLM,pass,7
14 | DistilBertForQuestionAnswering,pass,7
15 | DistillGPT2,pass,7
16 | ElectraForCausalLM,pass,7
17 | ElectraForQuestionAnswering,pass,7
18 | GPT2ForSequenceClassification,pass,9
19 | LayoutLMForMaskedLM,pass,7
20 | LayoutLMForSequenceClassification,pass,9
21 | MBartForCausalLM,pass,7
22 | MegatronBertForCausalLM,pass,7
23 | MegatronBertForQuestionAnswering,pass,7
24 | MobileBertForMaskedLM,pass,4
25 | MobileBertForQuestionAnswering,pass,4
26 | PLBartForCausalLM,pass,7
27 | PLBartForConditionalGeneration,pass,7
28 | PegasusForCausalLM,pass,7
29 | PegasusForConditionalGeneration,pass,4
30 | RobertaForCausalLM,pass,7
31 | RobertaForQuestionAnswering,pass,7
32 | Speech2Text2ForCausalLM,pass,7
33 | T5ForConditionalGeneration,pass,7
34 | T5Small,pass,7
35 | TrOCRForCausalLM,pass,7
36 | XLNetLMHeadModel,pass,7
37 | YituTechConvBert,pass,7
38 |
--------------------------------------------------------------------------------
/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv:
--------------------------------------------------------------------------------
1 | name,accuracy,graph_breaks
2 | AlbertForMaskedLM,pass,7
3 | AlbertForQuestionAnswering,pass,7
4 | BartForCausalLM,pass,7
5 | BertForMaskedLM,pass,7
6 | BertForQuestionAnswering,pass,7
7 | BlenderbotSmallForCausalLM,pass,7
8 | BlenderbotSmallForConditionalGeneration,pass,7
9 | CamemBert,pass,7
10 | DebertaForMaskedLM,pass,52
11 | DebertaForQuestionAnswering,pass,52
12 | DebertaV2ForMaskedLM,pass_due_to_skip,0
13 | DistilBertForMaskedLM,pass,7
14 | DistilBertForQuestionAnswering,pass,7
15 | DistillGPT2,pass,7
16 | ElectraForCausalLM,pass,7
17 | ElectraForQuestionAnswering,pass,7
18 | GPT2ForSequenceClassification,pass,9
19 | LayoutLMForMaskedLM,pass,7
20 | LayoutLMForSequenceClassification,pass,9
21 | MBartForCausalLM,pass,7
22 | MegatronBertForCausalLM,pass,7
23 | MegatronBertForQuestionAnswering,pass,7
24 | MobileBertForMaskedLM,pass,4
25 | MobileBertForQuestionAnswering,pass,4
26 | PLBartForCausalLM,pass,7
27 | PLBartForConditionalGeneration,pass,7
28 | PegasusForCausalLM,pass,7
29 | PegasusForConditionalGeneration,pass,4
30 | RobertaForCausalLM,pass,7
31 | RobertaForQuestionAnswering,pass,7
32 | Speech2Text2ForCausalLM,pass,7
33 | T5ForConditionalGeneration,pass,7
34 | T5Small,pass,7
35 | TrOCRForCausalLM,pass,7
36 | XLNetLMHeadModel,pass,7
37 | YituTechConvBert,pass,7
38 |
--------------------------------------------------------------------------------
/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_dynamic_training.csv:
--------------------------------------------------------------------------------
1 | name,accuracy,graph_breaks
2 | BERT_pytorch,pass,11
3 | LearningToPaint,pass,9
4 | Super_SloMo,pass,9
5 | alexnet,pass,9
6 | attention_is_all_you_need_pytorch,pass,9
7 | dcgan,pass,9
8 | densenet121,pass,9
9 | drq,pass,8
10 | fastNLP_Bert,pass,14
11 | functorch_dp_cifar10,pass,9
12 | functorch_maml_omniglot,pass,9
13 | hf_Albert,pass,8
14 | hf_Bart,pass,8
15 | hf_Bert,pass,8
16 | hf_Bert_large,pass,8
17 | hf_DistilBert,pass,8
18 | hf_GPT2,pass,8
19 | hf_Reformer,pass,45
20 | hf_T5_large,pass_due_to_skip,0
21 | lennard_jones,pass,9
22 | maml_omniglot,pass,9
23 | mnasnet1_0,pass,9
24 | mobilenet_v2,pass,9
25 | nvidia_deeprecommender,pass,9
26 | phlippe_densenet,pass,9
27 | phlippe_resnet,pass,9
28 | pytorch_CycleGAN_and_pix2pix,pass,9
29 | pytorch_stargan,pass,9
30 | pytorch_unet,pass,9
31 | resnet152,pass,9
32 | resnet18,pass,9
33 | resnet50,pass,9
34 | resnext50_32x4d,pass,9
35 | shufflenet_v2_x1_0,pass,9
36 | soft_actor_critic,pass,8
37 | speech_transformer,pass,19
38 | squeezenet1_1,pass,9
39 | timm_efficientnet,pass,9
40 | timm_regnet,pass,9
41 | timm_resnest,pass,9
42 | timm_vision_transformer,pass,9
43 | timm_vision_transformer_large,pass_due_to_skip,0
44 | timm_vovnet,pass,9
45 | tts_angular,pass,11
46 | vgg16,pass,9
47 | yolov3,pass,13
48 |
--------------------------------------------------------------------------------
/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv:
--------------------------------------------------------------------------------
1 | name,accuracy,graph_breaks
2 | BERT_pytorch,pass,11
3 | LearningToPaint,pass,9
4 | Super_SloMo,pass,9
5 | alexnet,pass,9
6 | attention_is_all_you_need_pytorch,pass,9
7 | dcgan,pass,9
8 | densenet121,pass,9
9 | drq,pass,8
10 | fastNLP_Bert,pass,14
11 | functorch_dp_cifar10,pass,9
12 | functorch_maml_omniglot,pass,9
13 | hf_Albert,pass,8
14 | hf_Bart,pass,8
15 | hf_Bert,pass,8
16 | hf_Bert_large,pass,8
17 | hf_DistilBert,pass,8
18 | hf_GPT2,pass,8
19 | hf_Reformer,pass,67
20 | hf_T5_large,pass_due_to_skip,0
21 | lennard_jones,pass,9
22 | maml_omniglot,pass,9
23 | mnasnet1_0,pass,9
24 | mobilenet_v2,pass,9
25 | nvidia_deeprecommender,pass,9
26 | phlippe_densenet,pass,9
27 | phlippe_resnet,pass,9
28 | pytorch_CycleGAN_and_pix2pix,pass,9
29 | pytorch_stargan,pass,9
30 | pytorch_unet,pass,9
31 | resnet152,pass,9
32 | resnet18,pass,9
33 | resnet50,pass,9
34 | resnext50_32x4d,pass,9
35 | shufflenet_v2_x1_0,pass,9
36 | soft_actor_critic,pass,8
37 | speech_transformer,pass,19
38 | squeezenet1_1,pass,9
39 | timm_efficientnet,pass,9
40 | timm_regnet,pass,9
41 | timm_resnest,pass,9
42 | timm_vision_transformer,pass,9
43 | timm_vision_transformer_large,pass_due_to_skip,0
44 | timm_vovnet,pass,9
45 | tts_angular,pass,11
46 | vgg16,pass,9
47 | yolov3,pass,13
48 |
--------------------------------------------------------------------------------
/benchmarks/dynamo/microbenchmarks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/dynamo/microbenchmarks/__init__.py
--------------------------------------------------------------------------------
/benchmarks/dynamo/microbenchmarks/benchmark_helper.py:
--------------------------------------------------------------------------------
1 | from torch.utils.benchmark import Timer
2 |
3 |
4 | def time_with_torch_timer(fn, args, kwargs=None, iters=100):
5 | kwargs = kwargs or {}
6 | env = {"args": args, "kwargs": kwargs, "fn": fn}
7 | fn_call = "fn(*args, **kwargs)"
8 |
9 | # Measure end-to-end time
10 | timer = Timer(stmt=f"{fn_call}", globals=env)
11 | tt = timer.timeit(iters)
12 |
13 | return tt
14 |
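A usage sketch (assuming `time_with_torch_timer` is imported from this module); `Timer.timeit` returns a `Measurement` whose `.mean` and `.median` are seconds per invocation:

```
import torch

a, b = torch.randn(64, 64), torch.randn(64, 64)
m = time_with_torch_timer(torch.mm, (a, b), iters=100)
print(m.median)  # seconds per torch.mm call
```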
--------------------------------------------------------------------------------
/benchmarks/dynamo/microbenchmarks/model.py:
--------------------------------------------------------------------------------
1 | # resnet50 layer shape
2 | resnet50_layers = (
3 | # IN_H, IN_W, IN_C, KERNEL_H, KERNEL_W, KERNEL_N, stride, padding
4 | (224, 224, 3, 7, 7, 64, (2, 2), (0, 0)),
5 | # conv2_x
6 | (56, 56, 64, 1, 1, 64, (1, 1), (0, 0)),
7 | (56, 56, 64, 3, 3, 64, (1, 1), (0, 0)),
8 | (56, 56, 64, 1, 1, 256, (1, 1), (0, 0)),
9 | # conv3_x
10 | (56, 56, 256, 1, 1, 128, (2, 2), (0, 0)),
11 | (28, 28, 128, 3, 3, 128, (1, 1), (0, 0)),
12 | (28, 28, 128, 1, 1, 512, (1, 1), (0, 0)),
13 | # conv4_x
14 | (28, 28, 512, 1, 1, 256, (2, 2), (0, 0)),
15 | (14, 14, 256, 3, 3, 256, (1, 1), (0, 0)),
16 | (14, 14, 256, 1, 1, 1024, (1, 1), (0, 0)),
17 | # conv5_x
18 | (14, 14, 1024, 1, 1, 512, (2, 2), (0, 0)),
19 | (7, 7, 512, 3, 3, 512, (1, 1), (0, 0)),
20 | (7, 7, 512, 1, 1, 2048, (1, 1), (0, 0)),
21 | )
22 |
23 | alexnet_layers = (
24 | # IN_H, IN_W, IN_C, KERNEL_H, KERNEL_W, KERNEL_N, stride, padding
25 | (224, 224, 3, 11, 11, 64, (4, 4), (2, 2)),
26 | )
27 |
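A sketch of consuming one layer spec (assuming `resnet50_layers` is imported from this module): each tuple maps directly onto `nn.Conv2d` arguments plus an input shape:

```
import torch

IN_H, IN_W, IN_C, KH, KW, KN, stride, padding = resnet50_layers[1]
conv = torch.nn.Conv2d(IN_C, KN, kernel_size=(KH, KW), stride=stride, padding=padding)
y = conv(torch.randn(1, IN_C, IN_H, IN_W))
print(y.shape)  # torch.Size([1, 64, 56, 56]) for this conv2_x 1x1 layer
```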
--------------------------------------------------------------------------------
/benchmarks/dynamo/microbenchmarks/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 |
5 |
6 | def rounded_linspace(low, high, steps, div):
7 | ret = torch.linspace(low, high, steps)
8 | ret = (ret.int() + div - 1) // div * div
9 | ret = torch.unique(ret)
10 | return list(map(int, ret))
11 |
12 |
13 | def powspace(start, stop, pow, step):
14 | start = math.log(start, pow)
15 | stop = math.log(stop, pow)
16 | steps = int((stop - start + 1) // step)
17 | ret = torch.pow(pow, torch.linspace(start, stop, steps))
18 | ret = torch.unique(ret)
19 | return list(map(int, ret))
20 |
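Worked examples of the two helpers (assuming both are in scope): `rounded_linspace` rounds each linspace point up to a multiple of `div` and dedupes; `powspace` walks powers of `pow`:

```
print(rounded_linspace(10, 100, 4, 8))  # [16, 40, 72, 104]
print(powspace(1, 64, 2, 1))            # [1, 2, 4, 8, 16, 32, 64]
```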
--------------------------------------------------------------------------------
/benchmarks/dynamo/run_delta.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -x
4 |
5 | # Some QoL for people running this script on Meta servers
6 | if getent hosts fwdproxy; then
7 | export https_proxy=http://fwdproxy:8080 http_proxy=http://fwdproxy:8080 no_proxy=.fbcdn.net,.facebook.com,.thefacebook.com,.tfbnw.net,.fb.com,.fburl.com,.facebook.net,.sb.fbsbx.com,localhost
8 | fi
9 |
10 | WORK="$PWD"
11 |
12 | cd "$(dirname "$BASH_SOURCE")"/../..
13 |
14 | ROOT="$PWD"
15 |
16 | mkdir -p "$WORK/sweep/static"
17 | mkdir -p "$WORK/sweep/dynamic"
18 |
19 | (cd "$WORK/sweep/static" && "$ROOT/benchmarks/dynamo/run_all.sh" "$@")
20 | (cd "$WORK/sweep/dynamic" && "$ROOT/benchmarks/dynamo/run_all.sh" "$@" --dynamic-shapes)
21 | python benchmarks/dynamo/combine_csv.py "$WORK/sweep/static/final.csv" "$WORK/sweep/dynamic/final.csv" > "$WORK/delta.csv"
22 | gh gist create "$WORK/delta.csv"
23 |
--------------------------------------------------------------------------------
/benchmarks/dynamo/timm_models_list_cpu.txt:
--------------------------------------------------------------------------------
1 | adv_inception_v3,128
2 | beit_base_patch16_224,64
3 | botnet26t_256,128
4 | cait_m36_384,4
5 | coat_lite_mini,32
6 | convit_base,64
7 | convmixer_768_32,2
8 | convnext_base,64
9 | crossvit_9_240,32
10 | cspdarknet53,64
11 | deit_base_distilled_patch16_224,64
12 | dm_nfnet_f0,128
13 | dpn107,32
14 | eca_botnext26ts_256,128
15 | eca_halonext26ts,128
16 | ese_vovnet19b_dw,128
17 | fbnetc_100,32
18 | fbnetv3_b,32
19 | gernet_l,128
20 | ghostnet_100,128
21 | gluon_inception_v3,128
22 | gluon_xception65,32
23 | gmixer_24_224,16
24 | gmlp_s16_224,128
25 | hrnet_w18,128
26 | inception_v3,128
27 | jx_nest_base,32
28 | lcnet_050,64
29 | mixer_b16_224,128
30 | mixnet_l,128
31 | mnasnet_100,32
32 | mobilenetv2_100,32
33 | mobilenetv3_large_100,32
34 | mobilevit_s,256
35 | nfnet_l0,128
36 | pit_b_224,64
37 | pnasnet5large,16
38 | poolformer_m36,64
39 | regnety_002,128
40 | repvgg_a2,128
41 | res2net101_26w_4s,64
42 | res2net50_14w_8s,128
43 | res2next50,128
44 | resmlp_12_224,128
45 | resnest101e,64
46 | rexnet_100,128
47 | sebotnet33ts_256,64
48 | selecsls42b,128
49 | spnasnet_100,32
50 | swin_base_patch4_window7_224,64
51 | swsl_resnext101_32x16d,32
52 | tf_efficientnet_b0,128
53 | tf_mixnet_l,32
54 | tinynet_a,128
55 | tnt_s_patch16_224,32
56 | twins_pcpvt_base,64
57 | visformer_small,128
58 | vit_base_patch16_224,64
59 | volo_d1_224,64
60 | xcit_large_24_p8_224,5
61 |
--------------------------------------------------------------------------------
/benchmarks/dynamo/torchbench_models_list.txt:
--------------------------------------------------------------------------------
1 | BERT_pytorch,128
2 | Background_Matting,16
3 | LearningToPaint,1024
4 | alexnet,1024
5 | dcgan,1024
6 | densenet121,64
7 | hf_Albert,32
8 | hf_Bart,16
9 | hf_Bert,16
10 | hf_GPT2,16
11 | hf_T5,4
12 | mnasnet1_0,256
13 | mobilenet_v2,128
14 | mobilenet_v3_large,256
15 | nvidia_deeprecommender,1024
16 | pytorch_unet,8
17 | resnet18,512
18 | resnet50,128
19 | resnext50_32x4d,128
20 | shufflenet_v2_x1_0,512
21 | squeezenet1_1,512
22 | timm_nfnet,256
23 | timm_efficientnet,128
24 | timm_regnet,128
25 | timm_resnest,256
26 | timm_vision_transformer,256
27 | timm_vovnet,128
28 | vgg16,128
29 |
--------------------------------------------------------------------------------
/benchmarks/dynamo/torchbench_models_list_cpu.txt:
--------------------------------------------------------------------------------
1 | alexnet,128
2 | attention_is_all_you_need_pytorch,64
3 | BERT_pytorch,32
4 | dcgan,256
5 | densenet121,512
6 | dlrm,2048
7 | fastNLP_Bert,8
8 | functorch_dp_cifar10,1024
9 | hf_Albert,8
10 | hf_Bart,8
11 | hf_Bert,8
12 | hf_Bert_large,8
13 | hf_DistilBert,8
14 | hf_GPT2,8
15 | hf_GPT2_large,1
16 | hf_Longformer,4
17 | hf_Reformer,8
18 | hf_T5,4
19 | hf_T5_base,1
20 | hf_T5_large,1
21 | LearningToPaint,96
22 | lennard_jones,1024
23 | mnasnet1_0,32
24 | mobilenet_v2,16
25 | mobilenet_v3_large,32
26 | nvidia_deeprecommender,256
27 | phlippe_densenet,128
28 | phlippe_resnet,512
29 | pytorch_unet,4
30 | resnet152,32
31 | resnet18,256
32 | resnet50,256
33 | resnext50_32x4d,256
34 | shufflenet_v2_x1_0,64
35 | speech_transformer,1024
36 | squeezenet1_1,16
37 | Super_SloMo,1024
38 | timm_efficientnet,64
39 | timm_nfnet,128
40 | timm_regnet,32
41 | timm_resnest,32
42 | timm_vision_transformer,16
43 | timm_vision_transformer_large,8
44 | timm_vovnet,32
45 | tts_angular,1024
46 | vgg16,64
47 | vision_maskrcnn,1
48 | yolov3,32
49 |
--------------------------------------------------------------------------------
/benchmarks/fastrnns/README.md:
--------------------------------------------------------------------------------
1 | # Fast RNN benchmarks
2 |
3 | Benchmarks for TorchScript models
4 |
5 | For most stable results, do the following:
6 | - Set CPU Governor to performance mode (as opposed to energy save)
7 | - Turn off turbo for all CPUs (assuming Intel CPUs)
8 | - Shield CPUs via `cset shield` when running benchmarks.
9 |
10 | Some of these scripts accept command-line args, but most of them do not because
11 | I was lazy. They will probably be added sometime in the future, but the default
12 | sizes are pretty reasonable.
13 |
14 | ## Test fastrnns (fwd + bwd) correctness
15 |
16 | Test the fastrnns benchmarking scripts with the following:
17 | `python -m fastrnns.test`
18 | or run the test independently:
19 | `python -m fastrnns.test --rnns jit`
20 |
21 | ## Run benchmarks
22 |
23 | `python -m fastrnns.bench`
24 |
25 | should give a good comparison, or you can specify the type of model to run
26 |
27 | `python -m fastrnns.bench --rnns cudnn aten jit --group rnns`
28 |
29 | ## Run model profiling, calls nvprof
30 |
31 | `python -m fastrnns.profile`
32 |
33 | should generate nvprof files for all models.
34 | You can also generate nvprof files for specific models:
35 |
36 | `python -m fastrnns.profile --rnns aten jit`
37 |
38 | ### Caveats
39 |
40 | Use Linux for the most accurate timing. A lot of these tests only run
41 | on CUDA.
42 |
--------------------------------------------------------------------------------
/benchmarks/fastrnns/__init__.py:
--------------------------------------------------------------------------------
1 | from .cells import * # noqa: F403
2 | from .factory import * # noqa: F403
3 |
4 | # (output, next_state) = cell(input, state)
5 | seqLength = 100
6 | numLayers = 2
7 | inputSize = 512
8 | hiddenSize = 512
9 | miniBatch = 64
10 |
--------------------------------------------------------------------------------
/benchmarks/fastrnns/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest # noqa: F401
2 |
3 | default_rnns = ['cudnn', 'aten', 'jit', 'jit_premul', 'jit_premul_bias', 'jit_simple',
4 | 'jit_multilayer', 'py']
5 | default_cnns = ['resnet18', 'resnet18_jit', 'resnet50', 'resnet50_jit']
6 | all_nets = default_rnns + default_cnns
7 |
8 | def pytest_generate_tests(metafunc):
9 |     # This creates the lists of tests to generate and can be customized
10 | if metafunc.cls.__name__ == "TestBenchNetwork":
11 | metafunc.parametrize('net_name', all_nets, scope="class")
12 | metafunc.parametrize("executor", [metafunc.config.getoption("executor")], scope="class")
13 | metafunc.parametrize("fuser", [metafunc.config.getoption("fuser")], scope="class")
14 |
15 | def pytest_addoption(parser):
16 | parser.addoption("--fuser", default="old", help="fuser to use for benchmarks")
17 | parser.addoption("--executor", default="legacy", help="executor to use for benchmarks")
18 |
--------------------------------------------------------------------------------
/benchmarks/fastrnns/scratch.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | @torch.jit.script
5 | def fn(x, scale, shift):
6 | return scale * x / shift
7 |
8 |
9 | @torch.jit.script
10 | def recurrent(x, scale, shift):
11 | y = x
12 | for i in range(100):
13 | y = fn(y, scale, shift)
14 | return y
15 |
16 |
17 | x = torch.randn(2, 2, device='cuda')
18 | scale = torch.randn(2, 2, device='cuda', requires_grad=True)
19 | shift = torch.randn(2, 2, device='cuda', requires_grad=True)
20 | inputs = [x, scale, shift]
21 |
22 |
23 | out = recurrent(x, scale, shift)
24 | recurrent.graph_for(x, scale, shift)
25 |
26 |
27 | import torch
28 |
29 |
30 | @torch.jit.script
31 | def recurrent_scaleshift(x, scale, shift):
32 | y = x
33 | for i in range(64):
34 | y = scale * y + shift
35 | return y
36 |
37 |
38 | x = torch.randn(2, 2, device='cuda')
39 | scale = torch.randn(2, 2, device='cuda', requires_grad=True)
40 | shift = torch.randn(2, 2, device='cuda', requires_grad=True)
41 | inputs = [x, scale, shift]
42 | out = recurrent_scaleshift(x, scale, shift)
43 | recurrent_scaleshift.graph_for(x, scale, shift)
44 |
45 |
46 | import torch
47 | x = torch.tensor([])
48 | x.requires_grad = True
49 | x.mean().backward() # no error triggered
50 | x = x.cuda()
51 | x.mean().backward()
52 |
--------------------------------------------------------------------------------
/benchmarks/framework_overhead_benchmark/SimpleAddModule.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from utils import NUM_LOOP_ITERS
3 |
4 | def add_tensors_loop(x, y):
5 | z = torch.add(x, y)
6 | for i in range(NUM_LOOP_ITERS):
7 | z = torch.add(z, x)
8 | return z
9 |
10 | class SimpleAddModule(torch.nn.Module):
11 | def __init__(self, add_op):
12 | super().__init__()
13 | self.add_op = add_op
14 |
15 | def forward(self, x, y):
16 | return self.add_op(x, y)
17 |
--------------------------------------------------------------------------------
/benchmarks/framework_overhead_benchmark/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | from collections import namedtuple
3 | from torch.utils import ThroughputBenchmark
4 |
5 | NUM_LOOP_ITERS = 1000
6 | BenchmarkConfig = namedtuple('BenchmarkConfig', 'num_warmup_iters num_iters')
7 | ModuleConfig = namedtuple('ModuleConfig', 'pt_fn c2_op num_params graph_mode')
8 |
9 | def ms_to_us(time_ms):
10 | return (time_ms * 1e3)
11 |
12 | def secs_to_us(time_s):
13 | return (time_s * 1e6)
14 |
15 | def secs_to_ms(time_s):
16 | return (time_s * 1e3)
17 |
18 | def benchmark_using_throughput_benchmark(config, module):
19 | print("Benchmarking via ThroughputBenchmark")
20 | bench = ThroughputBenchmark(module.module)
21 | bench.add_input(*module.tensor_inputs)
22 | stats = bench.benchmark(1, config.num_warmup_iters, config.num_iters)
23 | return stats.latency_avg_ms / NUM_LOOP_ITERS
24 |
25 | def benchmark_module(config, module, use_throughput_benchmark=False):
26 | if use_throughput_benchmark:
27 | return benchmark_using_throughput_benchmark(config, module)
28 | module.forward(config.num_warmup_iters)
29 | print("Running module for {} iterations".format(config.num_iters))
30 | start = time.time()
31 | module.forward(config.num_iters)
32 | end = time.time()
33 | time_elapsed_s = (end - start)
34 | return (secs_to_ms(time_elapsed_s) / config.num_iters / NUM_LOOP_ITERS)
35 |
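`benchmark_module` expects a wrapper exposing `.module` and `.tensor_inputs` (for ThroughputBenchmark) and a `forward(iters)` that loops (for the eager path). The real harness defines its own wrapper; this is only a hypothetical stand-in wired up to `SimpleAddModule`, assuming the names defined above are in scope:

```
import torch
from SimpleAddModule import SimpleAddModule, add_tensors_loop  # sibling module

class WrappedModule:
    def __init__(self, module, tensor_inputs):
        self.module = module                # consumed by ThroughputBenchmark
        self.tensor_inputs = tensor_inputs
    def forward(self, iters):               # consumed by the eager timing path
        for _ in range(iters):
            self.module(*self.tensor_inputs)

config = BenchmarkConfig(num_warmup_iters=10, num_iters=100)
wrapped = WrappedModule(SimpleAddModule(add_tensors_loop), (torch.rand(1), torch.rand(1)))
print(benchmark_module(config, wrapped))    # ms per inner loop iteration
```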
--------------------------------------------------------------------------------
/benchmarks/fuser/plot_speedups.py:
--------------------------------------------------------------------------------
1 | import pandas
2 |
3 | df = pandas.read_csv("perf.csv")
4 |
5 | ops = pandas.unique(df["operator"])
6 | nops = len(ops)
7 | pivot_op_shape = df.pivot_table(
8 | values="time", index=["operator", "shape"], columns=["fuser"]
9 | )
10 | pivot_speedups = (pivot_op_shape.T / pivot_op_shape["eager"]).T
11 |
12 | import matplotlib.pyplot as plt
13 |
14 | plt.rcParams["figure.figsize"] = (20, 100)
15 | fig, axs = plt.subplots(nops)
16 | plt.subplots_adjust(hspace=0.5)
17 | for idx, op in enumerate(ops):
18 | op_speedups = pivot_speedups.T[op].T
19 | op_speedups.plot(ax=axs[idx], kind="bar", ylim=(0, 2), rot=45)
20 | axs[idx].set_title(op)
21 | axs[idx].set_xlabel("")
22 | plt.savefig("perf.png")
23 |
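The script assumes a long-format `perf.csv` with columns `operator`, `shape`, `fuser`, and `time`, including an `eager` row per (operator, shape) pair to normalize against. An illustrative file (the `nvfuser` label is just an example fuser name):

```
import pandas as pd

pd.DataFrame(
    [
        ("add", "(1024,)", "eager", 1.00),
        ("add", "(1024,)", "nvfuser", 0.80),
    ],
    columns=["operator", "shape", "fuser", "time"],
).to_csv("perf.csv", index=False)
```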
--------------------------------------------------------------------------------
/benchmarks/instruction_counts/applications/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/applications/__init__.py
--------------------------------------------------------------------------------
/benchmarks/instruction_counts/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/core/__init__.py
--------------------------------------------------------------------------------
/benchmarks/instruction_counts/definitions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/definitions/__init__.py
--------------------------------------------------------------------------------
/benchmarks/instruction_counts/execution/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/execution/__init__.py
--------------------------------------------------------------------------------
/benchmarks/instruction_counts/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/instruction_counts/worker/__init__.py
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/__init__.py
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/benchmark_all_other_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | from pt import ( # noqa: F401
3 | add_test, as_strided_test, batchnorm_test, binary_test, cat_test,
4 | channel_shuffle_test, chunk_test, conv_test, diag_test, embeddingbag_test,
5 | fill_test, gather_test, linear_test, matmul_test, nan_to_num_test, pool_test,
6 | softmax_test, hardsigmoid_test, hardswish_test, layernorm_test,
7 | groupnorm_test, interpolate_test, instancenorm_test, remainder_test,
8 | split_test, sum_test, tensor_to_test
9 | )
10 | from pt import ( # noqa: F401
11 | ao_sparsifier_test
12 | )
13 |
14 | if __name__ == "__main__":
15 | op_bench.benchmark_runner.main()
16 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/benchmark_all_quantized_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | from pt import ( # noqa: F401
3 | qactivation_test,
4 | qarithmetic_test,
5 | qbatchnorm_test,
6 | qcat_test,
7 | qcomparators_test,
8 | qconv_test,
9 | qgroupnorm_test,
10 | qinstancenorm_test,
11 | qinterpolate_test,
12 | qlayernorm_test,
13 | qlinear_test,
14 | qobserver_test,
15 | qpool_test,
16 | qrnn_test,
17 | qtensor_method_test,
18 | quantization_test,
19 | qunary_test,
20 | qembedding_pack_test,
21 | qembeddingbag_test,
22 | qatembedding_ops_test,
23 | )
24 |
25 |
26 | if __name__ == "__main__":
27 | op_bench.benchmark_runner.main()
28 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/benchmark_all_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | from pt import ( # noqa: F401
3 | unary_test,
4 | )
5 | import benchmark_all_other_test # noqa: F401
6 | import benchmark_all_quantized_test # noqa: F401
7 |
8 | if __name__ == "__main__":
9 | op_bench.benchmark_runner.main()
10 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/c2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/c2/__init__.py
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/c2/replace_nan_test.py:
--------------------------------------------------------------------------------
1 | import benchmark_caffe2 as op_bench_c2
2 | import operator_benchmark as op_bench
3 | from benchmark_caffe2 import Caffe2BenchmarkBase # noqa: F401
4 | from caffe2.python import core
5 |
6 |
7 | """Microbenchmarks for element-wise ReplaceNaN operator."""
8 |
9 | # Configs for C2 ReplaceNaN operator
10 | replace_nan_long_configs = op_bench.cross_product_configs(
11 | M=[32, 64, 128], N=range(32, 128, 32), dtype=["float", "double"], tags=["long"]
12 | )
13 |
14 |
15 | replace_nan_short_configs = op_bench.config_list(
16 | attrs=[
17 | [16, 16, "float"],
18 | [16, 16, "double"],
19 | [64, 64, "float"],
20 | [64, 64, "double"],
21 | ],
22 | attr_names=["M", "N", "dtype"],
23 | tags=["short"],
24 | )
25 |
26 |
27 | class ReplaceNaNBenchmark(op_bench_c2.Caffe2BenchmarkBase):
28 | def init(self, M, N, dtype):
29 | self.input = self.tensor([M, N], dtype)
30 | self.set_module_name("replace_nan")
31 |
32 | def forward(self):
33 | op = core.CreateOperator("ReplaceNaN", self.input, self.input, value=1.0)
34 | return op
35 |
36 |
37 | op_bench_c2.generate_c2_test(
38 | replace_nan_long_configs + replace_nan_short_configs, ReplaceNaNBenchmark
39 | )
40 |
41 |
42 | if __name__ == "__main__":
43 | op_bench.benchmark_runner.main()
44 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/common/__init__.py
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/common/tests/add_ops_list_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 |
5 | # Configs for pointwise unary ops
6 | unary_ops_configs = op_bench.config_list(
7 | attrs=[
8 | [128, 128],
9 | ],
10 | attr_names=["M", "N"],
11 | tags=["short"]
12 | )
13 |
14 |
15 | unary_ops_list = op_bench.op_list(
16 | attr_names=["op_name", "op_func"],
17 | attrs=[
18 | ["abs", torch.abs],
19 | ["acos", torch.acos],
20 | ],
21 | )
22 |
23 |
24 | class UnaryOpBenchmark(op_bench.TorchBenchmarkBase):
25 | def init(self, M, N, op_func):
26 | self.input_one = torch.rand(M, N)
27 | self.op_func = op_func
28 |
29 | def forward(self):
30 | return self.op_func(self.input_one)
31 |
32 |
33 | op_bench.generate_pt_tests_from_op_list(unary_ops_list, unary_ops_configs, UnaryOpBenchmark)
34 |
35 |
36 | if __name__ == "__main__":
37 | op_bench.benchmark_runner.main()
38 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/common/tests/jit_forward_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 | intraop_bench_configs = op_bench.config_list(
5 | attrs=[
6 | [8, 16],
7 | ],
8 | attr_names=["M", "N"],
9 | tags=["short"],
10 | )
11 |
12 | @torch.jit.script
13 | def torch_sumall(a, iterations):
14 | # type: (Tensor, int) -> float
15 | result = 0.0
16 | for _ in range(iterations):
17 | result += float(torch.sum(a))
18 | a[0][0] += 0.01
19 | return result
20 |
21 |
22 | class TorchSumBenchmark(op_bench.TorchBenchmarkBase):
23 | def init(self, M, N):
24 | self.input_one = torch.rand(M, N)
25 | self.set_module_name("sum")
26 |
27 | # This is a very temporary method and will be removed soon, so
28 | # don't use this method in your benchmark
29 | # TODO(mingzhe): use one forward method for both JIT and Eager
30 | def jit_forward(self, iters):
31 | return torch_sumall(self.input_one, iters)
32 |
33 | op_bench.generate_pt_test(intraop_bench_configs, TorchSumBenchmark)
34 |
35 |
36 | if __name__ == "__main__":
37 | op_bench.benchmark_runner.main()
38 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/common/tests/pt_backward_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 |
5 | add_configs = op_bench.cross_product_configs(
6 | M=[8, 1],
7 | N=[8, 2],
8 | K=[8, 4],
9 | tags=["short"]
10 | )
11 |
12 | # This benchmark uses auto_set() to automatically set requires_grad
13 | # for both inputs. The test name can also be used for filtering.
14 | class AddBenchmark(op_bench.TorchBenchmarkBase):
15 | def init(self, M, N, K):
16 | self.input_one = torch.rand(M, N, K, requires_grad=self.auto_set())
17 | self.input_two = torch.rand(M, N, K, requires_grad=self.auto_set())
18 | self.set_module_name("add")
19 |
20 | def forward(self):
21 | return torch.add(self.input_one, self.input_two)
22 |
23 |
24 | op_bench.generate_pt_test(add_configs, AddBenchmark)
25 | op_bench.generate_pt_gradient_test(add_configs, AddBenchmark)
26 |
27 |
28 | if __name__ == "__main__":
29 | op_bench.benchmark_runner.main()
30 |
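For readers unfamiliar with `auto_set()`: the framework toggles `requires_grad` across the inputs when generating gradient tests. Below is a rough standalone sketch of that idea using only `torch` and the standard library; the exact enumeration op_bench performs is framework-defined, so treat this as an assumption-labeled approximation.

```
import itertools
import timeit

import torch

# Time forward + backward for each requires_grad combination of two inputs,
# approximating what generate_pt_gradient_test does via auto_set().
for grad_one, grad_two in itertools.product([False, True], repeat=2):
    if not (grad_one or grad_two):
        continue  # nothing to differentiate
    x = torch.rand(8, 8, 8, requires_grad=grad_one)
    y = torch.rand(8, 8, 8, requires_grad=grad_two)

    def step():
        torch.add(x, y).sum().backward()

    print(f"requires_grad=({grad_one}, {grad_two}):",
          timeit.timeit(step, number=100))
```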
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/common/tests/pt_configs_list_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 | """Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch."""
5 |
6 | add_short_configs = op_bench.config_list(
7 | attr_names=['M', 'N', 'K'],
8 | attrs=[
9 | [8, 16, 32],
10 | [16, 16, 64],
11 | [64, 64, 128],
12 | ],
13 | cross_product_configs={
14 | 'device': ['cpu', 'cuda'],
15 | 'dtype': [torch.float, torch.float64],
16 | },
17 | tags=['short'],
18 | )
19 |
20 |
21 | class AddBenchmark(op_bench.TorchBenchmarkBase):
22 | def init(self, M, N, K, device, dtype):
23 | self.input_one = torch.rand(M, N, K, device=device, dtype=dtype, requires_grad=True)
24 | self.input_two = torch.rand(M, N, K, device=device, dtype=dtype)
25 | self.set_module_name('add')
26 |
27 | def forward(self):
28 | return torch.add(self.input_one, self.input_two)
29 |
30 |
31 | op_bench.generate_pt_test(add_short_configs, AddBenchmark)
32 |
33 |
34 | if __name__ == "__main__":
35 | op_bench.benchmark_runner.main()
36 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/common/tests/pt_cpu_gpu_forward_backward_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 |
5 | add_configs = op_bench.cross_product_configs(
6 | M=[8],
7 | N=[8],
8 | K=[8],
9 | device=["cuda", "cpu"],
10 | tags=["short"]
11 | )
12 |
13 |
14 | class AddBenchmark(op_bench.TorchBenchmarkBase):
15 | def init(self, M, N, K, device):
16 | self.input_one = torch.rand(M, N, K, device=device, requires_grad=True)
17 | self.input_two = torch.rand(M, N, K, device=device, requires_grad=True)
18 | self.set_module_name("add")
19 |
20 | def forward(self):
21 | return torch.add(self.input_one, self.input_two)
22 |
23 |
24 | op_bench.generate_pt_test(add_configs, AddBenchmark)
25 | op_bench.generate_pt_gradient_test(add_configs, AddBenchmark)
26 |
27 |
28 | if __name__ == "__main__":
29 | op_bench.benchmark_runner.main()
30 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/common/tests/random_sample_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 |
5 | configs = op_bench.random_sample_configs(
6 | M=[1, 2, 3, 4, 5, 6],
7 | N=[7, 8, 9, 10, 11, 12],
8 | K=[13, 14, 15, 16, 17, 18],
10 | # probs holds the sampling weight of each value
10 | probs=op_bench.attr_probs(
11 | M=[0.5, 0.2, 0.1, 0.05, 0.03, 0.1],
12 | N=[0.1, 0.3, 0.4, 0.02, 0.03, 0.04],
13 | K=[0.03, 0.6, 0.04, 0.02, 0.03, 0.01],
14 | ),
15 | # total_samples is the number of returned inputs
16 | total_samples=10,
17 | tags=["short"],
18 | )
19 |
20 |
21 | class AddBenchmark(op_bench.TorchBenchmarkBase):
22 | def init(self, M, N, K):
23 | self.input_one = torch.rand(M, N, K)
24 | self.input_two = torch.rand(M, N, K)
25 | self.set_module_name("add")
26 |
27 | def forward(self):
28 | return torch.add(self.input_one, self.input_two)
29 |
30 |
31 | op_bench.generate_pt_test(configs, AddBenchmark)
32 |
33 |
34 | if __name__ == "__main__":
35 | op_bench.benchmark_runner.main()
36 |
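A small sketch of the weighted sampling that `random_sample_configs` performs, using only the standard library. Whether each attribute is drawn independently is an assumption here; the sketch mirrors the values and weights above.

```
import random

M, probs_M = [1, 2, 3, 4, 5, 6], [0.5, 0.2, 0.1, 0.05, 0.03, 0.1]
N, probs_N = [7, 8, 9, 10, 11, 12], [0.1, 0.3, 0.4, 0.02, 0.03, 0.04]
K, probs_K = [13, 14, 15, 16, 17, 18], [0.03, 0.6, 0.04, 0.02, 0.03, 0.01]

# Draw total_samples (M, N, K) tuples, each attribute weighted by its probs.
samples = [
    (random.choices(M, weights=probs_M)[0],
     random.choices(N, weights=probs_N)[0],
     random.choices(K, weights=probs_K)[0])
    for _ in range(10)
]
print(samples)
```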
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/operator_benchmark.py:
--------------------------------------------------------------------------------
1 | # TODO (mingzhe09088): get rid of noqa
2 | import benchmark_runner # noqa: F401
3 | from benchmark_pytorch import TorchBenchmarkBase # noqa: F401
4 | from benchmark_test_generator import * # noqa: F401,F403
5 | from benchmark_utils import * # noqa: F401,F403
6 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/operator_benchmark/pt/__init__.py
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/bmm_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 | """Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch."""
5 |
6 | class BmmBenchmark(op_bench.TorchBenchmarkBase):
7 | def init(self, B, M, N, K, device, op):
8 | self.inputs = {
9 | "batch1": torch.rand((B, M, K), device=device, requires_grad=self.auto_set()),
10 | "batch2": torch.rand((B, K, N,), device=device, requires_grad=self.auto_set())
11 | }
12 | self.set_module_name(f"bmm (actual op={op}")
13 | self.op = torch.bmm if op == "bmm" else torch.matmul
14 |
15 | def forward(self, batch1, batch2):
16 | return self.op(batch1, batch2)
17 |
18 | bmm_configs = op_bench.cross_product_configs(
19 | B=[2, 100],
20 | M=[8, 256],
21 | N=[256, 16],
22 | K=[16, 32],
23 | device=['cpu'],
24 | tags=["short"],
25 | op=["bmm", "matmul"],
26 | )
27 |
28 | op_bench.generate_pt_test(bmm_configs, BmmBenchmark)
29 |
30 | if __name__ == "__main__":
31 | op_bench.benchmark_runner.main()
32 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/chunk_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 |
5 | """Microbenchmarks for Chunk operator"""
6 |
7 |
8 | # Configs for PT Chunk operator
9 | chunk_short_configs = op_bench.config_list(
10 | attr_names=["M", "N", "chunks"],
11 | attrs=[
12 | [8, 8, 2],
13 | [256, 512, 2],
14 | [512, 512, 2],
15 | ],
16 | cross_product_configs={
17 | 'device': ['cpu', 'cuda'],
18 | },
19 | tags=["short"],
20 | )
21 |
22 | chunks_long_configs = op_bench.cross_product_configs(
23 | M=[128, 1024],
24 | N=[128, 1024],
25 | chunks=[2, 4],
26 | device=['cpu', 'cuda'],
27 | tags=['long']
28 | )
29 |
30 |
31 | class ChunkBenchmark(op_bench.TorchBenchmarkBase):
32 | def init(self, M, N, chunks, device):
33 | self.inputs = {
34 | "input_one": torch.rand(M, N, device=device),
35 | "chunks": chunks
36 | }
37 | self.set_module_name("chunk")
38 |
39 | def forward(self, input_one, chunks: int):
40 | return torch.chunk(input_one, chunks)
41 |
42 |
43 | op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs,
44 | ChunkBenchmark)
45 |
46 |
47 | if __name__ == "__main__":
48 | op_bench.benchmark_runner.main()
49 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/diag_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 |
5 | """Microbenchmarks for diag operator"""
6 |
7 |
8 | # Configs for PT diag operator
9 | diag_configs_short = op_bench.config_list(
10 | attr_names=['dim', 'M', 'N', 'diagonal', 'out'],
11 | attrs=[
12 | [1, 64, 64, 0, True],
13 | [2, 128, 128, -10, False],
14 | [1, 256, 256, 20, True],
15 | ],
16 | cross_product_configs={
17 | 'device': ['cpu', 'cuda'],
18 | },
19 | tags=['short'],
20 | )
21 |
22 |
23 | class DiagBenchmark(op_bench.TorchBenchmarkBase):
24 | def init(self, dim, M, N, diagonal, out, device):
25 | self.inputs = {
26 | "input": torch.rand(M, N, device=device) if dim == 2 else torch.rand(M, device=device),
27 | "diagonal": diagonal,
28 | "out": out,
29 | "out_tensor": torch.tensor((),)
30 | }
31 | self.set_module_name('diag')
32 |
33 | def forward(self, input, diagonal: int, out: bool, out_tensor):
34 | if out:
35 | return torch.diag(input, diagonal=diagonal, out=out_tensor)
36 | else:
37 | return torch.diag(input, diagonal=diagonal)
38 |
39 |
40 | op_bench.generate_pt_test(diag_configs_short, DiagBenchmark)
41 |
42 |
43 | if __name__ == "__main__":
44 | op_bench.benchmark_runner.main()
45 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/fill_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 | from torch.testing._internal.common_device_type import get_all_device_types
5 |
6 | """Microbenchmark for Fill_ operator."""
7 |
8 | fill_short_configs = op_bench.config_list(
9 | attr_names=["N"],
10 | attrs=[
11 | [1],
12 | [1024],
13 | [2048],
14 | ],
15 | cross_product_configs={
16 | 'device': ['cpu', 'cuda'],
17 | 'dtype': [torch.int32],
18 | },
19 | tags=["short"],
20 | )
21 |
22 | fill_long_configs = op_bench.cross_product_configs(
23 | N=[10, 1000],
24 | device=get_all_device_types(),
25 | dtype=[torch.bool, torch.int8, torch.uint8, torch.int16, torch.int32,
26 | torch.int64, torch.half, torch.float, torch.double],
27 | tags=["long"]
28 | )
29 |
30 |
31 | class Fill_Benchmark(op_bench.TorchBenchmarkBase):
32 | def init(self, N, device, dtype):
33 | self.inputs = {
34 | "input_one": torch.zeros(N, device=device).type(dtype)
35 | }
36 | self.set_module_name("fill_")
37 |
38 | def forward(self, input_one):
39 | return input_one.fill_(10)
40 |
41 |
42 | op_bench.generate_pt_test(fill_short_configs + fill_long_configs,
43 | Fill_Benchmark)
44 |
45 |
46 | if __name__ == "__main__":
47 | op_bench.benchmark_runner.main()
48 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/gelu_test.py:
--------------------------------------------------------------------------------
1 |
2 | import operator_benchmark as op_bench
3 | import torch
4 |
5 |
6 | """
7 | Microbenchmarks for the gelu operator.
8 | """
9 |
10 | gelu_configs_long = op_bench.cross_product_configs(
11 | N=[1, 4],
12 | C=[3],
13 | H=[16, 256],
14 | W=[16, 256],
15 | device=['cpu'],
16 | tags=['long']
17 | )
18 |
19 |
20 | class GeluBenchmark(op_bench.TorchBenchmarkBase):
21 | def init(self, N, C, H, W, device):
22 | self.inputs = {
23 | "input": torch.rand(N, C, H, W, device=device)
24 | }
25 |
26 | def forward(self, input):
27 | return torch.nn.functional.gelu(input)
28 |
29 |
30 | op_bench.generate_pt_test(gelu_configs_long, GeluBenchmark)
31 |
32 |
33 | if __name__ == "__main__":
34 | op_bench.benchmark_runner.main()
35 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/groupnorm_test.py:
--------------------------------------------------------------------------------
1 |
2 | import operator_benchmark as op_bench
3 | import torch
4 | import torch.nn.functional as F
5 |
6 |
7 | """Microbenchmarks for groupnorm operator."""
8 |
9 | groupnorm_configs_short = op_bench.cross_product_configs(
10 | dims=(
11 | (32, 8, 16),
12 | (32, 8, 56, 56),
13 | ),
14 | num_groups=(2, 4),
15 | tags=["short"],
16 | )
17 |
18 |
19 | class GroupNormBenchmark(op_bench.TorchBenchmarkBase):
20 | def init(self, dims, num_groups):
21 | num_channels = dims[1]
22 | self.inputs = {
23 | "input": (torch.rand(*dims) - 0.5) * 256,
24 | "num_groups": num_groups,
25 | "weight": torch.rand(num_channels, dtype=torch.float),
26 | "bias": torch.rand(num_channels, dtype=torch.float),
27 | "eps": 1e-5
28 | }
29 |
30 | def forward(self, input, num_groups: int, weight, bias, eps: float):
31 | return F.group_norm(
32 | input, num_groups, weight=weight, bias=bias, eps=eps)
33 |
34 |
35 | op_bench.generate_pt_test(groupnorm_configs_short, GroupNormBenchmark)
36 |
37 |
38 | if __name__ == "__main__":
39 | op_bench.benchmark_runner.main()
40 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/instancenorm_test.py:
--------------------------------------------------------------------------------
1 |
2 | import operator_benchmark as op_bench
3 | import torch
4 | import torch.nn.functional as F
5 |
6 |
7 | """Microbenchmarks for instancenorm operator."""
8 |
9 | instancenorm_configs_short = op_bench.cross_product_configs(
10 | dims=(
11 | (32, 8, 16),
12 | (32, 8, 56, 56),
13 | ),
14 | tags=["short"],
15 | )
16 |
17 |
18 | class InstanceNormBenchmark(op_bench.TorchBenchmarkBase):
19 | def init(self, dims):
20 | num_channels = dims[1]
21 | self.inputs = {
22 | "input": (torch.rand(*dims) - 0.5) * 256,
23 | "weight": torch.rand(num_channels, dtype=torch.float),
24 | "bias": torch.rand(num_channels, dtype=torch.float),
25 | "eps": 1e-5
26 | }
27 |
28 | def forward(self, input, weight, bias, eps: float):
29 | return F.instance_norm(
30 | input, weight=weight, bias=bias, eps=eps)
31 |
32 |
33 | op_bench.generate_pt_test(instancenorm_configs_short, InstanceNormBenchmark)
34 |
35 |
36 | if __name__ == "__main__":
37 | op_bench.benchmark_runner.main()
38 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/layernorm_test.py:
--------------------------------------------------------------------------------
1 |
2 | import operator_benchmark as op_bench
3 | import torch
4 | import torch.nn.functional as F
5 |
6 |
7 | """Microbenchmarks for layernorm operator."""
8 |
9 | layernorm_configs_short = op_bench.cross_product_configs(
10 | dims=(
11 | (1, 8, 16),
12 | (8, 8, 16),
13 | (32, 8, 16),
14 | (64, 128, 56, 56),
15 | ),
16 | tags=["short"],
17 | )
18 |
19 |
20 | class LayerNormBenchmark(op_bench.TorchBenchmarkBase):
21 | def init(self, dims):
22 | input = (torch.rand(*dims) - 0.5) * 256
23 | self.inputs = {
24 | "input": input,
25 | "weight": torch.rand(*input.size()[1:], dtype=torch.float),
26 | "bias": torch.rand(*input.size()[1:], dtype=torch.float),
27 | "eps": 1e-5
28 | }
29 |
30 | def forward(self, input, weight, bias, eps: float):
31 | return F.layer_norm(
32 | input, input.size()[1:], weight=weight, bias=bias, eps=eps)
33 |
34 |
35 | op_bench.generate_pt_test(layernorm_configs_short, LayerNormBenchmark)
36 |
37 |
38 | if __name__ == "__main__":
39 | op_bench.benchmark_runner.main()
40 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/linear_test.py:
--------------------------------------------------------------------------------
1 |
2 | import operator_benchmark as op_bench
3 | import torch
4 | import torch.nn as nn
5 |
6 | from pt import configs
7 |
8 |
9 | """Microbenchmarks for Linear operator."""
10 |
11 |
12 | class LinearBenchmark(op_bench.TorchBenchmarkBase):
13 | def init(self, N, IN, OUT, device):
14 | self.inputs = {
15 | "input_one": torch.rand(N, IN, device=device)
16 | }
17 | self.linear = nn.Linear(IN, OUT).to(device=device)
18 | self.set_module_name("linear")
19 |
20 | def forward(self, input_one):
21 | return self.linear(input_one)
22 |
23 |
24 | op_bench.generate_pt_test(configs.linear_configs_short + configs.linear_configs_long,
25 | LinearBenchmark)
26 |
27 |
28 | if __name__ == "__main__":
29 | op_bench.benchmark_runner.main()
30 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt/split_test.py:
--------------------------------------------------------------------------------
1 | import operator_benchmark as op_bench
2 | import torch
3 |
4 |
5 | """Microbenchmarks for Split operator"""
6 |
7 |
8 | # Configs for PT Split operator
9 | split_configs_short = op_bench.config_list(
10 | attr_names=["M", "N", "parts"],
11 | attrs=[
12 | [8, 8, 2],
13 | [256, 512, 2],
14 | [512, 512, 2],
15 | ],
16 | cross_product_configs={
17 | 'device': ['cpu', 'cuda'],
18 | },
19 | tags=["short"],
20 | )
21 |
22 | split_configs_long = op_bench.cross_product_configs(
23 | M=[128, 1024],
24 | N=[128, 1024],
25 | parts=[2, 4],
26 | device=['cpu', 'cuda'],
27 | tags=['long']
28 | )
29 |
30 |
31 | class SplitBenchmark(op_bench.TorchBenchmarkBase):
32 | def init(self, M, N, parts, device):
33 | self.inputs = {
34 | "input": torch.rand(M, N, device=device),
35 | "split_size": int(M * N / parts)
36 | }
37 | self.set_module_name('split')
38 |
39 | def forward(self, input, split_size: int):
40 | return torch.split(input, split_size)
41 |
42 |
43 | op_bench.generate_pt_test(split_configs_short + split_configs_long,
44 | SplitBenchmark)
45 |
46 |
47 | if __name__ == "__main__":
48 | op_bench.benchmark_runner.main()
49 |
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt_extension/extension.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <torch/library.h>
3 |
4 | using torch::List;
5 | using torch::Tensor;
6 |
7 | Tensor consume(Tensor a) {
8 | return a;
9 | }
10 |
11 | List<Tensor> consume_list(List<Tensor> a) {
12 | return a;
13 | }
14 |
15 | // When JIT tracing is used on a function with a constant for loop,
16 | // the for loop is optimized away by dead code elimination.
17 | // That caused an issue for our op benchmark, which needs to run an op
18 | // in a loop and report the execution time. This diff resolves that issue by
19 | // registering this consume op with the correct alias information, which is DEFAULT.
20 | TORCH_LIBRARY_FRAGMENT(operator_benchmark, m) {
21 | m.def("_consume", &consume);
22 | m.def("_consume.list", &consume_list);
23 | }
24 |
25 | PYBIND11_MODULE(benchmark_cpp_extension, m) {
26 | m.def("_consume", &consume, "consume");
27 | m.def("_consume_list", &consume_list, "consume_list");
28 | }
29 |
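Once built and installed (see `setup.py` below), importing the extension registers `operator_benchmark::_consume`, which the harness can call so traced or scripted loops keep the op under test alive. A minimal usage sketch; the scripted function `loop` is illustrative, not part of the benchmark suite.

```
import torch
import benchmark_cpp_extension  # noqa: F401  (importing registers the ops)

@torch.jit.script
def loop(x: torch.Tensor, iters: int):
    for _ in range(iters):
        # Feeding the result to _consume keeps the loop body alive;
        # otherwise dead code elimination could drop the repeated op.
        torch.ops.operator_benchmark._consume(torch.add(x, x))

loop(torch.rand(2, 2), 10)
```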
--------------------------------------------------------------------------------
/benchmarks/operator_benchmark/pt_extension/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from torch.utils.cpp_extension import CppExtension, BuildExtension
3 |
4 | setup(name='benchmark_cpp_extension',
5 | ext_modules=[CppExtension('benchmark_cpp_extension', ['extension.cpp'])],
6 | cmdclass={'build_ext': BuildExtension})
7 |
--------------------------------------------------------------------------------
/benchmarks/overrides_benchmark/common.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | NUM_REPEATS = 1000
4 | NUM_REPEAT_OF_REPEATS = 1000
5 |
6 |
7 | class SubTensor(torch.Tensor):
8 | pass
9 |
10 |
11 | class WithTorchFunction:
12 | def __init__(self, data, requires_grad=False):
13 | if isinstance(data, torch.Tensor):
14 | self._tensor = data
15 | return
16 |
17 | self._tensor = torch.tensor(data, requires_grad=requires_grad)
18 |
19 | @classmethod
20 | def __torch_function__(cls, func, types, args=(), kwargs=None):
21 | if kwargs is None:
22 | kwargs = {}
23 |
24 | return WithTorchFunction(args[0]._tensor + args[1]._tensor)
25 |
26 |
27 | class SubWithTorchFunction(torch.Tensor):
28 | @classmethod
29 | def __torch_function__(cls, func, types, args=(), kwargs=None):
30 | if kwargs is None:
31 | kwargs = {}
32 |
33 | return super().__torch_function__(func, types, args, kwargs)
34 |
--------------------------------------------------------------------------------
/benchmarks/overrides_benchmark/pyspybench.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import argparse
3 | from common import SubTensor, WithTorchFunction, SubWithTorchFunction # noqa: F401
4 |
5 | Tensor = torch.tensor
6 |
7 | NUM_REPEATS = 1000000
8 |
9 | if __name__ == "__main__":
10 | parser = argparse.ArgumentParser(
11 | description="Run the torch.add for a given class a given number of times."
12 | )
13 | parser.add_argument(
14 | "tensor_class", metavar="TensorClass", type=str, help="The class to benchmark."
15 | )
16 | parser.add_argument(
17 | "--nreps", "-n", type=int, default=NUM_REPEATS, help="The number of repeats."
18 | )
19 | args = parser.parse_args()
20 |
21 | TensorClass = globals()[args.tensor_class]
22 | NUM_REPEATS = args.nreps
23 |
24 | t1 = TensorClass([1.])
25 | t2 = TensorClass([2.])
26 |
27 | for _ in range(NUM_REPEATS):
28 | torch.add(t1, t2)
29 |
--------------------------------------------------------------------------------
/benchmarks/profiler_benchmark/resnet_memory_profiler.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchvision.models as models
3 |
4 | import torch.autograd.profiler as profiler
5 |
6 | for with_cuda in [False, True]:
7 | model = models.resnet18()
8 | inputs = torch.randn(5, 3, 224, 224)
9 | sort_key = "self_cpu_memory_usage"
10 | if with_cuda and torch.cuda.is_available():
11 | model = model.cuda()
12 | inputs = inputs.cuda()
13 | sort_key = "self_cuda_memory_usage"
14 | print("Profiling CUDA Resnet model")
15 | else:
16 | print("Profiling CPU Resnet model")
17 |
18 | with profiler.profile(profile_memory=True, record_shapes=True) as prof:
19 | with profiler.record_function("root"):
20 | model(inputs)
21 |
22 | print(prof.key_averages(group_by_input_shape=True).table(sort_by=sort_key, row_limit=-1))
23 |
--------------------------------------------------------------------------------
/benchmarks/serialization/nested_annotation_str.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.benchmark as benchmark
3 |
4 | MEMO = {}
5 | def create_nested_dict_type(layers):
6 | if layers == 0:
7 | return torch._C.StringType.get()
8 | if layers not in MEMO:
9 | less_nested = create_nested_dict_type(layers - 1)
10 | result = torch._C.DictType(torch._C.StringType.get(), torch._C.TupleType([less_nested, less_nested]))
11 | MEMO[layers] = result
12 | return MEMO[layers]
13 |
14 |
15 | nesting_levels = (1, 3, 5, 10)
16 | types = (reasonable, medium, big, huge) = [create_nested_dict_type(x) for x in nesting_levels]
17 |
18 | timers = [benchmark.Timer(stmt='x.annotation_str', globals={'x': nested_type}) for nested_type in types]
19 |
20 | for nesting_level, typ, timer in zip(nesting_levels, types, timers):
21 | print("Nesting level:", nesting_level)
22 | print("output:", typ.annotation_str[:70])
23 | print(timer.blocked_autorange())
24 |
--------------------------------------------------------------------------------
/benchmarks/serialization/simple_measurement.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from pyarkbench import Benchmark, Timer, default_args
3 |
4 | use_new = True
5 |
6 | class Basic(Benchmark):
7 | def benchmark(self):
8 | x = [torch.ones(200, 200) for i in range(30)]
9 | with Timer() as big1:
10 | torch.save(x, "big_tensor.zip", _use_new_zipfile_serialization=use_new)
11 |
12 | with Timer() as big2:
13 | v = torch.load("big_tensor.zip")
14 |
15 | x = [torch.ones(10, 10) for i in range(200)]
16 | with Timer() as small1:
17 | torch.save(x, "small_tensor.zip", _use_new_zipfile_serialization=use_new)
18 |
19 | with Timer() as small2:
20 | v = torch.load("small_tensor.zip")
21 |
22 | return {
23 | "Big Tensors Save": big1.ms_duration,
24 | "Big Tensors Load": big2.ms_duration,
25 | "Small Tensors Save": small1.ms_duration,
26 | "Small Tensors Load": small2.ms_duration,
27 | }
28 |
29 | if __name__ == '__main__':
30 | bench = Basic(*default_args.bench())
31 | print("Use zipfile serialization:", use_new)
32 | results = bench.run()
33 | bench.print_stats(results, stats=['mean', 'median'])
34 |
--------------------------------------------------------------------------------
/benchmarks/sparse/README.md:
--------------------------------------------------------------------------------
1 | # Sparse benchmarks
2 |
3 | These benchmarks cover the sparse matrix functionality. They exist for
4 | comparing the performance of sparse matrix routines, such as SpMV, between
5 | various sparse matrix formats and with other frameworks such as TensorFlow.
6 |
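As a rough illustration only, a standalone SpMV timing sketch with `torch.utils.benchmark`; the shape, number of non-zeros, and COO format below are illustrative choices, not taken from the benchmark scripts.

```
import torch
import torch.utils.benchmark as benchmark

m, nnz = 10_000, 1_000_000
indices = torch.stack([torch.randint(m, (nnz,)), torch.randint(m, (nnz,))])
a = torch.sparse_coo_tensor(indices, torch.rand(nnz), (m, m)).coalesce()
v = torch.rand(m)

# Sparse matrix-vector product (SpMV) timing.
timer = benchmark.Timer(stmt="torch.mv(a, v)",
                        globals={"a": a, "v": v, "torch": torch})
print(timer.blocked_autorange())
```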
--------------------------------------------------------------------------------
/benchmarks/sparse/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | if __name__ == "__main__":
3 | pass
4 |
--------------------------------------------------------------------------------
/benchmarks/sparse/dlmc/README.md:
--------------------------------------------------------------------------------
1 | # Sparse benchmarks
2 |
3 | These benchmarks exercise the sparse matrix functionality using the Deep Learning Matrix Collection (DLMC), a popular collection of real-world sparse matrices used in recent studies [1, 2].
4 |
5 | Performance benchmark scripts for matrix-matrix and matrix-vector ops (dense-sparse, sparse-sparse, and a dense-dense comparison) are implemented here; a standalone sketch appears after the references below.
6 |
7 | - `matmul_bench.py` with `--operation sparse@sparse|sparse@dense` is the sparse matrix-matrix multiplication (SpMM) performance test. It can run in forward and backward mode (`--backward-test`), on CPU or CUDA (`--with-cuda`), using different datasets from the DLMC collection. For more details see the `test.sh` file.
8 |
9 | - `matmul_bench.py` with `--operation sparse@vector` is the sparse matrix-vector multiplication (SpMV) performance test.
10 |
11 | References:
12 |
13 | 1. Trevor Gale, Matei Zaharia, Cliff Young, Erich Elsen. Sparse GPU Kernels for Deep Learning. Proceedings of the International Conference for High Performance Computing, 2020. https://github.com/google-research/google-research/tree/master/sgk
14 |
15 | 2. Trevor Gale, Erich Elsen, Sara Hooker. The State of Sparsity in Deep Neural Networks. https://github.com/google-research/google-research/tree/master/state_of_sparsity
16 |
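The standalone sketch referenced above: a `sparse@dense` (SpMM) timing in the same spirit as `matmul_bench.py`. The shapes, the sparsity level, and the use of a random matrix in place of a DLMC file are all illustrative assumptions.

```
import torch
import torch.utils.benchmark as benchmark

m, k, n, sparsity = 1024, 1024, 64, 0.9
weight = torch.rand(m, k)
weight[torch.rand(m, k) < sparsity] = 0.0  # stand-in for a DLMC matrix
s = weight.to_sparse()                     # COO sparse tensor
d = torch.rand(k, n)

timer = benchmark.Timer(stmt="torch.sparse.mm(s, d)",
                        globals={"s": s, "d": d, "torch": torch})
print(timer.blocked_autorange())
```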
--------------------------------------------------------------------------------
/benchmarks/sparse/dlmc/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | if __name__ == "__main__":
3 | pass
4 |
--------------------------------------------------------------------------------
/benchmarks/sparse/test_csr.sh:
--------------------------------------------------------------------------------
1 | OUTFILE=spmm-no-mkl-test.txt
2 | PYTORCH_HOME=$1
3 |
4 | cd $PYTORCH_HOME
5 |
6 | echo "" >> $OUTFILE
7 | echo "----- USE_MKL=1 -----" >> $OUTFILE
8 | rm -rf build
9 |
10 | export USE_MKL=1
11 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
12 | python setup.py build --cmake-only
13 | ccmake build # or cmake-gui build
14 |
15 | python setup.py install
16 |
17 | cd benchmarks
18 | echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE
19 | for dim0 in 1000 5000 10000; do
20 | for nnzr in 0.01 0.05 0.1 0.3; do
21 | python -m sparse.spmm --format csr --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
22 | # python -m sparse.spmm --format coo --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
23 | done
24 | done
25 | echo "----------------------" >> $OUTFILE
26 |
27 | cd $PYTORCH_HOME
28 | echo "----- USE_MKL=0 ------" >> $OUTFILE
29 | rm -rf build
30 |
31 | export USE_MKL=0
32 | python setup.py install
33 |
34 | cd benchmarks
35 | for dim0 in 1000 5000 10000; do
36 | for nnzr in 0.01 0.05 0.1 0.3; do
37 | python -m sparse.spmv --format csr --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
38 | python -m sparse.spmv --format coo --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
39 | done
40 | done
41 | echo "----------------------" >> $OUTFILE
42 |
--------------------------------------------------------------------------------
/benchmarks/static_runtime/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc)
2 | list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt_bench.cc)
3 | set(STATIC_RUNTIME_BENCHMARK_SRCS ${STATIC_RUNTIME_BENCHMARK_SRCS} PARENT_SCOPE)
4 |
5 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc)
6 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_utils.cc)
7 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_runtime.cc)
8 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_module.cc)
9 | list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_generated_ops.cc)
10 | set(STATIC_RUNTIME_TEST_SRCS ${STATIC_RUNTIME_TEST_SRCS} PARENT_SCOPE)
11 |
--------------------------------------------------------------------------------
/benchmarks/tensorexpr/HowToRun.md:
--------------------------------------------------------------------------------
1 | From the root of pytorch repo, run:
2 | ```
3 | python -m benchmarks.tensorexpr --help
4 | ```
5 | to show documentation.
6 |
7 | An example of an actual command line that one might use as a starting point:
8 | ```
9 | python -m benchmarks.tensorexpr --device gpu --mode fwd --jit-mode trace --cuda-fuser=te
10 | ```
11 |
--------------------------------------------------------------------------------
/benchmarks/tensorexpr/nnc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/benchmarks/tensorexpr/nnc.png
--------------------------------------------------------------------------------
/benchmarks/tensorexpr/tensor_engine.py:
--------------------------------------------------------------------------------
1 | tensor_engine = None
2 |
3 |
4 | def unsupported(func):
5 | def wrapper(self):
6 | return func(self)
7 |
8 | wrapper.is_supported = False
9 | return wrapper
10 |
11 |
12 | def is_supported(method):
13 | if hasattr(method, "is_supported"):
14 | return method.is_supported
15 | return True
16 |
17 |
18 | def set_engine_mode(mode):
19 | global tensor_engine
20 | if mode == "tf":
21 | from . import tf_engine
22 |
23 | tensor_engine = tf_engine.TensorFlowEngine()
24 | elif mode == "pt":
25 | from . import pt_engine
26 |
27 | tensor_engine = pt_engine.TorchTensorEngine()
28 | elif mode == "topi":
29 | from . import topi_engine
30 |
31 | tensor_engine = topi_engine.TopiEngine()
32 | elif mode == "relay":
33 | from . import relay_engine
34 |
35 | tensor_engine = relay_engine.RelayEngine()
36 | elif mode == "nnc":
37 | from . import nnc_engine
38 |
39 | tensor_engine = nnc_engine.NncEngine()
40 | else:
41 | raise ValueError("invalid tensor engine mode: %s" % (mode))
42 | tensor_engine.mode = mode
43 |
44 |
45 | def get_engine():
46 | if tensor_engine is None:
47 | raise ValueError("use of get_engine, before calling set_engine_mode is illegal")
48 | return tensor_engine
49 |
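A short usage sketch of the engine registry above; it assumes execution from the repository root so that `benchmarks.tensorexpr` and its engine modules are importable.

```
from benchmarks.tensorexpr import tensor_engine

tensor_engine.set_engine_mode("pt")   # selects pt_engine.TorchTensorEngine
engine = tensor_engine.get_engine()   # ValueError if no mode was set first
assert engine.mode == "pt"
```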
--------------------------------------------------------------------------------
/binaries/caffe2_benchmark.cc:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 | #include "binaries/benchmark_args.h"
6 | #include "binaries/benchmark_helper.h"
7 |
8 |
9 | int main(int argc, char** argv) {
10 | caffe2::GlobalInit(&argc, &argv);
11 | benchmark(
12 | argc,
13 | argv,
14 | FLAGS_backend,
15 | FLAGS_init_net,
16 | FLAGS_input,
17 | FLAGS_input_dims,
18 | FLAGS_input_file,
19 | FLAGS_input_type,
20 | FLAGS_iter,
21 | FLAGS_measure_memory,
22 | FLAGS_net,
23 | FLAGS_output,
24 | FLAGS_output_folder,
25 | FLAGS_run_individual,
26 | FLAGS_sleep_before_run,
27 | FLAGS_sleep_between_iteration,
28 | FLAGS_sleep_between_net_and_operator,
29 | FLAGS_text_output,
30 | FLAGS_warmup,
31 | FLAGS_wipe_cache);
32 | }
33 |
--------------------------------------------------------------------------------
/binaries/lite_interpreter_model_load.cc:
--------------------------------------------------------------------------------
1 | #include "ATen/ATen.h"
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include "torch/script.h"
8 |
9 | C10_DEFINE_string(model, "", "The given bytecode model to check if it is supported by lite_interpreter.");
10 |
11 | int main(int argc, char** argv) {
12 | c10::SetUsageMessage(
13 | "Check if exported bytecode model is runnable by lite_interpreter.\n"
14 | "Example usage:\n"
15 | "./lite_interpreter_model_load"
16 | " --model=");
17 |
18 | if (!c10::ParseCommandLineFlags(&argc, &argv)) {
19 | std::cerr << "Failed to parse command line flags!" << std::endl;
20 | return 1;
21 | }
22 |
23 | if (FLAGS_model.empty()) {
24 | std::cerr << FLAGS_model << ":Model file is not provided\n";
25 | return -1;
26 | }
27 |
28 | // TODO: avoid having to set this guard for custom mobile build with mobile
29 | // interpreter.
30 | c10::InferenceMode mode;
31 | torch::jit::mobile::Module bc = torch::jit::_load_for_mobile(FLAGS_model);
32 | return 0;
33 | }
34 |
--------------------------------------------------------------------------------
/binaries/parallel_info.cc:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2016-present, Facebook, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "ATen/Parallel.h"
18 |
19 | #include <iostream>
20 | #include <sstream>
21 |
22 | #ifdef __linux__
23 | #include <sys/types.h>
24 | #include <unistd.h>
25 | #endif
26 |
27 | int main(int argc, char** argv) {
28 | at::init_num_threads();
29 |
30 | std::cout << at::get_parallel_info() << std::endl;
31 |
32 | # ifdef __linux__
33 | std::ostringstream cmd;
34 | cmd << "lsof -p " << getpid() << " | grep .so";
35 | std::cout << "Loaded .so:" << std::endl;
36 | std::cout << cmd.str() << std::endl;
37 | std::system(cmd.str().c_str());
38 | # endif
39 |
40 | return 0;
41 | }
42 |
--------------------------------------------------------------------------------
/third_party/BUILD:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/third_party/BUILD
--------------------------------------------------------------------------------
/third_party/METADATA.bzl:
--------------------------------------------------------------------------------
1 | METADATA = {
2 | "maintainers": [
3 | "pytorch_dev_infra",
4 | ],
5 | "name": "third_party",
6 | "owner": "pytorch_dev_infra",
7 | }
8 |
--------------------------------------------------------------------------------
/third_party/README.md:
--------------------------------------------------------------------------------
1 | This folder contains vendored copies of third-party libraries that we
2 | use.
3 |
--------------------------------------------------------------------------------
/third_party/cudnn.BUILD:
--------------------------------------------------------------------------------
1 | # Adapted from: https://github.com/NVIDIA/TRTorch/blob/master/third_party/cudnn/local/BUILD
2 |
3 | cc_library(
4 | name = "cudnn_headers",
5 | hdrs = ["include/cudnn.h"] + glob([
6 | "include/cudnn+.h",
7 | "include/cudnn_*.h",
8 | ]),
9 | includes = ["include/"],
10 | visibility = ["//visibility:private"],
11 | )
12 |
13 | cc_import(
14 | name = "cudnn_lib",
15 | shared_library = "lib/x86_64-linux-gnu/libcudnn.so",
16 | visibility = ["//visibility:private"],
17 | )
18 |
19 | cc_library(
20 | name = "cudnn",
21 | visibility = ["//visibility:public"],
22 | deps = [
23 | "cudnn_headers",
24 | "cudnn_lib",
25 | ],
26 | )
27 |
--------------------------------------------------------------------------------
/third_party/cutlass.BUILD:
--------------------------------------------------------------------------------
1 | # Description:
2 | # CUDA Templates for Linear Algebra Subroutines
3 |
4 | load("@rules_cc//cc:defs.bzl", "cc_library")
5 |
6 | cc_library(
7 | name = "cutlass",
8 | hdrs = glob(["include/**/*.h"]),
9 | includes = ["include/"],
10 | visibility = ["//visibility:public"],
11 | )
12 |
--------------------------------------------------------------------------------
/third_party/fmt.BUILD:
--------------------------------------------------------------------------------
1 | load("@rules_cc//cc:defs.bzl", "cc_library")
2 |
3 | cc_library(
4 | name = "fmt",
5 | hdrs = glob(["include/fmt/*.h",]),
6 | defines = ["FMT_HEADER_ONLY=1"],
7 | includes = ["include"],
8 | visibility = ["//visibility:public"],
9 | )
10 |
--------------------------------------------------------------------------------
/third_party/foxi.BUILD:
--------------------------------------------------------------------------------
1 | load("@rules_cc//cc:defs.bzl", "cc_library")
2 |
3 | cc_library(
4 | name = "foxi",
5 | srcs = [
6 | "foxi/onnxifi_loader.c",
7 | ],
8 | hdrs = glob([
9 | "foxi/*.h",
10 | ]),
11 | includes = [
12 | ".",
13 | ],
14 | linkstatic = 1,
15 | visibility = ["//visibility:public"],
16 | )
17 |
--------------------------------------------------------------------------------
/third_party/ideep.BUILD:
--------------------------------------------------------------------------------
1 | load("@rules_cc//cc:defs.bzl", "cc_library")
2 |
3 | cc_library(
4 | name = "ideep",
5 | hdrs = glob([
6 | "include/**/*.hpp",
7 | "include/**/*.h",
8 | ]),
9 | defines = [
10 | "IDEEP_USE_MKL",
11 | ],
12 | includes = [
13 | "include/",
14 | ],
15 | visibility = ["//visibility:public"],
16 | deps = ["@mkl_dnn//:mkl-dnn"],
17 | )
18 |
--------------------------------------------------------------------------------
/third_party/kineto.BUILD:
--------------------------------------------------------------------------------
1 | load("@rules_cc//cc:defs.bzl", "cc_library")
2 |
3 | cc_library(
4 | name = "kineto",
5 | hdrs = glob(["libkineto/include/*.h",]),
6 | includes = [
7 | "libkineto/include/",
8 | ],
9 | visibility = ["//visibility:public"],
10 | )
11 |
--------------------------------------------------------------------------------
/third_party/miniz-2.1.0/BUILD.bazel:
--------------------------------------------------------------------------------
1 | cc_library(
2 | name = "miniz",
3 | srcs = [
4 | "miniz.c",
5 | ],
6 | hdrs = [
7 | "miniz.h",
8 | ],
9 | strip_include_prefix = ".",
10 | visibility = ["//visibility:public"],
11 | )
12 |
--------------------------------------------------------------------------------
/third_party/miniz-2.1.0/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2013-2014 RAD Game Tools and Valve Software
2 | Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC
3 |
4 | All Rights Reserved.
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/third_party/mkl.BUILD:
--------------------------------------------------------------------------------
1 | load("@rules_cc//cc:defs.bzl", "cc_library")
2 |
3 | cc_library(
4 | name = "mkl",
5 | srcs = [
6 | "libmkl_avx2.so",
7 | "libmkl_core.so",
8 | "libmkl_def.so",
9 | "libmkl_intel_lp64.so",
10 | "libmkl_rt.so",
11 | "libmkl_sequential.so",
12 | "libmkl_vml_avx2.so",
13 | "libmkl_vml_avx512.so",
14 | "libmkl_vml_def.so",
15 | ] + select({
16 | "@pytorch//tools/config:thread_sanitizer": [],
17 | "//conditions:default": ["libmkl_tbb_thread.so"],
18 | }),
19 | visibility = ["//visibility:public"],
20 | deps = ["@mkl_headers"],
21 | )
22 |
--------------------------------------------------------------------------------
/third_party/mkl_headers.BUILD:
--------------------------------------------------------------------------------
1 | load("@rules_cc//cc:defs.bzl", "cc_library")
2 |
3 | cc_library(
4 | name = "mkl_headers",
5 | hdrs = glob(["include/*.h"]),
6 | includes = ["include/"],
7 | visibility = ["//visibility:public"],
8 | )
9 |
--------------------------------------------------------------------------------
/third_party/nvfuser/benchmark/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | if(USE_CUDA)
2 | add_executable(nvfuser_bench
3 | batch_norm_channels_first.cpp
4 | batch_norm_channels_first_backward.cpp
5 | batch_norm_channels_last.cpp
6 | batch_norm_channels_last_backward.cpp
7 | bert.cpp
8 | broadcast.cpp
9 | gelu_backward.cpp
10 | heuristic_lookup.cpp
11 | shape_inference.cpp
12 | instance_norm.cpp
13 | layer_norm.cpp
14 | layer_norm_backward.cpp
15 | rms_norm.cpp
16 | rms_norm_backward.cpp
17 | lstm_cell.cpp
18 | reduction.cpp
19 | softmax.cpp
20 | softmax_backward.cpp
21 | scale_bias_relu.cpp
22 | transpose.cpp
23 | matmul.cpp
24 | timm.cpp
25 | utils.cpp
26 | main.cpp)
27 |
28 | target_link_libraries(nvfuser_bench PRIVATE torch_library benchmark)
29 | if(NOT MSVC)
30 | target_compile_options_if_supported(nvfuser_bench -Werror)
31 | target_compile_options_if_supported(nvfuser_bench -Wno-unused-variable)
32 | target_compile_options_if_supported(nvfuser_bench -Wno-deprecated-copy)
33 | endif()
34 |
35 | endif()
36 |
--------------------------------------------------------------------------------
/third_party/nvfuser/benchmark/main.cpp:
--------------------------------------------------------------------------------
1 | #include <benchmark/benchmark.h>
2 |
3 | BENCHMARK_MAIN();
4 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/codegen.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 |
6 | #include
7 |
8 | namespace torch {
9 | namespace jit {
10 | namespace fuser {
11 | namespace cuda {
12 | namespace codegen {
13 |
14 | //! Generates a CUDA kernel definition for the given kernel
15 | TORCH_CUDA_CU_API std::string generateCudaKernel(
16 | const kir::Kernel* kernel,
17 | const std::string& kernel_name = "CUDAGeneratedKernel");
18 |
19 | } // namespace codegen
20 | } // namespace cuda
21 | } // namespace fuser
22 | } // namespace jit
23 | } // namespace torch
24 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/compute_at.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 | #include
6 |
7 | #include
8 | #include
9 |
10 | #include
11 | #include
12 | #include
13 | #include
14 |
15 | namespace torch {
16 | namespace jit {
17 | namespace fuser {
18 | namespace cuda {
19 |
20 | class TensorDomain;
21 | class TensorView;
22 |
23 | struct ComputeAt {
24 | public:
25 | // Runs the compute at pass making producer look like consumer, computing
26 | // producer relative to consumer
27 | static void runAt(
28 | TensorView* producer,
29 | TensorView* consumer,
30 | int64_t consumer_position,
31 | ComputeAtMode mode = ComputeAtMode::Standard);
32 |
33 | // Runs the compute with pass making consumer look like producer, computing
34 | // producer relative to consumer
35 | static void runWith(
36 | TensorView* producer,
37 | TensorView* consumer,
38 | int64_t producer_position,
39 | ComputeAtMode mode = ComputeAtMode::Standard);
40 | };
41 |
42 | } // namespace cuda
43 | } // namespace fuser
44 | } // namespace jit
45 | } // namespace torch
46 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/docs/.gitignore:
--------------------------------------------------------------------------------
1 | html
2 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/docs/documentation.h:
--------------------------------------------------------------------------------
1 |
2 | #error This is used exclusively for generating the documentation (not a real header)
3 |
4 | //! \namespace torch::jit::fuser
5 | //! \brief Main PyTorch JIT Fuser namespace
6 |
7 | //! \namespace torch::jit::fuser::cuda
8 | //! \brief CUDA specific components
9 |
10 | //! \namespace torch::jit::fuser::cuda::executor_utils
11 | //! \brief Fuser executor related utilities
12 |
13 | //! \namespace torch::jit::fuser::kir
14 | //! \brief Kernel IR
15 |
16 | //! \namespace torch::jit::fuser::ir_utils
17 | //! \brief IR manipulation utilities
18 |
19 | //! \namespace torch::jit::fuser::loop_utils
20 | //! \brief Loop utilities
21 |
22 | //! \namespace torch::jit::fuser::scope_utils
23 | //! \brief Scope utilities
24 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/docs/images/ir_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/third_party/nvfuser/csrc/docs/images/ir_architecture.png
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/docs/main_page.md:
--------------------------------------------------------------------------------
1 |
2 | This is the implementation reference for the CUDA PyTorch JIT Fuser
3 |
4 | - [PyTorch GitHub Page](https://github.com/pytorch/pytorch)
5 | - [Fuser Source Tree](https://github.com/pytorch/pytorch/tree/master/torch/csrc/jit/codegen/cuda)
6 | - Main documentation indexes: [Namespaces](namespaces.html) and [Classes](annotated.html)
7 |
8 | 
9 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/ir_all_nodes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 | #include
6 |
7 | // TODO: remove this once the Kernel IR split is complete
8 | #include
9 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_alias_memory.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 |
8 | #include
9 |
10 | namespace torch {
11 | namespace jit {
12 | namespace fuser {
13 | namespace cuda {
14 |
15 | //! Reuse Allocation nodes via pointer aliasing
16 | //!
17 | //! First pass finds candidate TensorViews
18 | //! A candidate TensorView is anything in shared memory OR
19 | //! in local memory with a static size larger than register_size_threshold
20 | //!
21 | //! Second pass finds appropriate input Allocate Node
22 | //! among candidate TensorViews
23 | //!
24 | //! Alias Criteria:
25 | //! If input is a candidate TensorView,
26 | //! input allocation has the same size as output allocation,
27 | //! thread bindings match,
28 | //! is not used after this op:
29 | //! then alias output Allocate to input Allocate.
30 | //!
31 | std::vector<Expr*> reuseMemoryAllocations(const std::vector<Expr*>& exprs);
32 |
33 | } // namespace cuda
34 | } // namespace fuser
35 | } // namespace jit
36 | } // namespace torch
37 |
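The criteria above boil down to a shape-and-liveness check. Here is a toy Python rendering of the decision, with invented field names, purely for illustration; the real pass operates on kir::Allocate nodes.

```
from dataclasses import dataclass

@dataclass
class Alloc:
    size: int          # static allocation size
    shared: bool       # shared vs. local memory
    bindings: tuple    # thread bindings
    last_use: int      # index of the last expression reading the buffer

REGISTER_SIZE_THRESHOLD = 32  # illustrative value

def is_candidate(a: Alloc) -> bool:
    # First pass: shared memory, or local memory above the size threshold.
    return a.shared or a.size > REGISTER_SIZE_THRESHOLD

def can_alias(inp: Alloc, out: Alloc, op_index: int) -> bool:
    # Second pass: same size, matching bindings, input dead after this op.
    return (is_candidate(inp)
            and inp.size == out.size
            and inp.bindings == out.bindings
            and inp.last_use <= op_index)

t0 = Alloc(64, shared=False, bindings=("TIDx",), last_use=3)
t1 = Alloc(64, shared=False, bindings=("TIDx",), last_use=7)
print(can_alias(t0, t1, op_index=3))  # True: t1 may reuse t0's buffer
```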
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_allocation.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 |
8 | #include
9 |
10 | namespace torch {
11 | namespace jit {
12 | namespace fuser {
13 | namespace cuda {
14 |
15 | //! Buffer allocation information to store in GPU lower to avoid
16 | //! logic duplication
17 | struct LocalAllocationInfo {
18 | kir::Allocate* alloc_expr = nullptr;
19 | std::vector<IterDomain*> alloc_domains;
20 | bool has_halo = false;
21 | };
22 |
23 | using LocalAllocationInfoMap =
24 |     std::unordered_map<const kir::Allocate*, std::unique_ptr<LocalAllocationInfo>>;
25 |
26 | //! Insert buffer allocations
27 | std::vector<Expr*> insertAllocations(const std::vector<Expr*>& exprs);
28 |
29 | } // namespace cuda
30 | } // namespace fuser
31 | } // namespace jit
32 | } // namespace torch
33 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_divisible_split.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 | #include
8 |
9 | namespace torch {
10 | namespace jit {
11 | namespace fuser {
12 | namespace cuda {
13 |
14 | // Looks through all transformations associated with view, or enforced divisible
15 | // vectorization splits, and gathers all splits that provably don't have a
16 | // remainder; therefore the extents of the associated IterDomains do not require
17 | // ceilDiv expressions.
18 | TORCH_CUDA_CU_API std::unordered_set<Split*> getAllDivisibleSplits(
19 |     Fusion* fusion);
20 |
21 | // Same as above but will use provided ComputeAtMap instead of building its own.
22 | TORCH_CUDA_CU_API std::unordered_set<Split*> getAllDivisibleSplits(
23 | Fusion* fusion,
24 | const ComputeAtMap* ca_map);
25 |
26 | } // namespace cuda
27 | } // namespace fuser
28 | } // namespace jit
29 | } // namespace torch
30 |
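In other words, a split is divisible exactly when the factor divides the extent, in which case the ceilDiv collapses to exact integer division. A tiny worked check:

```
def ceil_div(a: int, b: int) -> int:
    return -(-a // b)

extent, factor = 128, 16
assert extent % factor == 0                          # divisible: no remainder
assert ceil_div(extent, factor) == extent // factor  # ceilDiv not needed
# A non-divisible split still requires the ceilDiv expression:
assert ceil_div(100, 16) == 7 and 100 // 16 == 6
```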
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_expr_sort.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | namespace torch {
6 | namespace jit {
7 | namespace fuser {
8 | namespace cuda {
9 |
10 | std::vector<Expr*> reorderExprsForComputeAt();
11 |
12 | } // namespace cuda
13 | } // namespace fuser
14 | } // namespace jit
15 | } // namespace torch
16 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_fused_reduction.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | namespace torch {
6 | namespace jit {
7 | namespace fuser {
8 | namespace cuda {
9 |
10 | //! Keep track of certain patterns of reductions.
11 | //!
12 | //! - Allreduce IterDomain: reduced and broadcast domain.
13 | class FusedReductionInfo {
14 | public:
15 | void markAsAllreduce(IterDomain* id);
16 |
17 | bool isAllreduce(IterDomain* id) const;
18 |
19 | private:
20 | // Reduction IterDomains that are also broadcast
21 | std::unordered_set<IterDomain*> allreduce_ids_;
22 | };
23 |
24 | //! Detect reductions and broadcasts that are eligible for the fused
25 | //! reduction kernel. When found, the predicate flags of the broadcast
26 | //! are unset, which effectively makes the broadcast just a unary set
27 | //! op.
28 | //! TODO: Consider moving the warp-based fused reduction here.
29 | void fuseReductionsAndBroadcasts(Fusion*);
30 |
31 | } // namespace cuda
32 | } // namespace fuser
33 | } // namespace jit
34 | } // namespace torch
35 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_fusion_simplifier.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | #include
11 |
12 | namespace torch {
13 | namespace jit {
14 | namespace fuser {
15 | namespace cuda {
16 |
17 | // Replaces trivial reductions with Unary Set Ops
18 | void trivialReductionReplacement(Fusion*, const TrivialReductionInfo&);
19 |
20 | // Replaces Transpose, Shift, Gather, and View Ops with Unary Set Ops
21 | std::vector<Expr*> unarySetOpInserter(const std::vector<Expr*>& exprs);
22 |
23 | } // namespace cuda
24 | } // namespace fuser
25 | } // namespace jit
26 | } // namespace torch
27 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_insert_syncs.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 |
8 | #include
9 |
10 | namespace torch {
11 | namespace jit {
12 | namespace fuser {
13 | namespace cuda {
14 |
15 | //! Insert sync at end of for-loops to prevent write-after-read race condition.
16 | //!
17 | //! WAR race condition occurs when the next iteration of the loop overwrites
18 | //! shared memory value before a previous operation has finished reading it.
19 | std::vector<Expr*> insertWarThreadSynchronization(
20 |     const std::vector<Expr*>& exprs);
21 |
22 | //! Insert syncs between writing to shared memory and then reading it.
23 | //! RAW pass is run before indexing, unrolling (loop duplication), memory
24 | //! aliasing, and index (grid/block bcast/reduction)
25 | std::vector<Expr*> insertRawThreadSynchronization(
26 |     const std::vector<Expr*>& exprs);
27 |
28 | } // namespace cuda
29 | } // namespace fuser
30 | } // namespace jit
31 | } // namespace torch
32 |
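A toy model of the WAR case the first pass guards against, in Python; purely illustrative, since the real pass operates on kir expressions.

```
# If a loop body both reads and writes shared memory, the next iteration's
# write can race with the previous iteration's read (write-after-read), so
# a sync is appended at the end of the loop body.
def insert_war_sync(body):
    reads = any(step == "read_smem" for step in body)
    writes = any(step == "write_smem" for step in body)
    if reads and writes:
        body = body + ["__syncthreads()"]
    return body

print(insert_war_sync(["write_smem", "read_smem", "compute"]))
```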
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_instrument.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | namespace torch {
6 | namespace jit {
7 | namespace fuser {
8 | namespace cuda {
9 |
10 | //! Set up KernelPerformanceProfile of GpuLower when enabled, which
11 | //! keeps track of expressions to profile. A new TensorView is added
12 | //! for storing profiling results. The expression list is prepended
13 | //! with a kir::Allocate node to allocate the TensorView profile
14 | //! buffer. Note that any expression added after this pass will not be
15 | //! profiled, so this pass should be called after all expressions are
16 | //! lowered. KernelPerformanceProfile is copied to Kernel after
17 | //! lowering.
18 | std::vector<Expr*> instrumentKernel(const std::vector<Expr*>& exprs);
19 |
20 | } // namespace cuda
21 | } // namespace fuser
22 | } // namespace jit
23 | } // namespace torch
24 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_predicate.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 |
4 | #include
5 | #include
6 |
7 | #include
8 |
9 | namespace torch {
10 | namespace jit {
11 | namespace fuser {
12 | namespace cuda {
13 |
14 | //! Update predicates with valid bool conditionals
15 | //!
16 | std::vector<Expr*> generateConditionalFromPredicate(
17 | const std::vector<Expr*>& exprs);
18 |
19 | } // namespace cuda
20 | } // namespace fuser
21 | } // namespace jit
22 | } // namespace torch
23 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_replace_size.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 | #include
8 |
9 | namespace torch {
10 | namespace jit {
11 | namespace fuser {
12 | namespace cuda {
13 |
14 | // TensorViews are all based on symbolic sizes. When we first initialize them
15 | // we don't know if they're inputs or outputs, which would mean that they have
16 | // runtime shapes. Intermediate tensors (those not going to global memory) do
17 | // not have this information. Since the kernel needs the correct shape
18 | // information at runtime, we replace the sizes of input and output tensors
19 | // with references to the runtime structure containing those sizes.
20 | void replaceSymbolicSizes(Fusion*);
21 |
22 | } // namespace cuda
23 | } // namespace fuser
24 | } // namespace jit
25 | } // namespace torch
26 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/lower_warp_reduce.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | namespace torch {
6 | namespace jit {
7 | namespace fuser {
8 | namespace cuda {
9 |
10 | struct WarpPaddedParallelInfo {
11 | bool is_tidx_padded = false;
12 | bool is_tidx_single_warp = false;
13 | bool has_warp_reduction = false;
14 | };
15 |
16 | std::vector<Expr*> fuseWarpReduce(const std::vector<Expr*> exprs);
17 |
18 | } // namespace cuda
19 | } // namespace fuser
20 | } // namespace jit
21 | } // namespace torch
22 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/mutator.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 |
8 | #include
9 |
10 | namespace torch {
11 | namespace jit {
12 | namespace fuser {
13 | namespace cuda {
14 |
15 | /*
16 |  * Mutators are the mechanism used to modify IR nodes. Since most nodes are
17 |  * immutable, or at least partially immutable, changing them can require
18 |  * creating a new node. The base mutator at the moment is a dumb sample
19 |  * mutator that takes any float of value 1.0 and converts it to 0.0. It is
20 |  * currently used as a dummy example; we should instead make it a simple
21 |  * instantiation of all the mutate functions on all node types, so that
22 |  * people can inherit it and only specialize the nodes they want to transform (see the sketch after this file).
23 | */
24 |
25 | } // namespace cuda
26 | } // namespace fuser
27 | } // namespace jit
28 | } // namespace torch
29 |
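As a rough illustration of that design, here is a minimal, self-contained C++ sketch. `Val`, `Float`, `BaseMutator`, and `OneToZeroMutator` are stand-ins invented for this sketch, not nvfuser's actual IR or mutator classes:

```
// Minimal stand-in IR nodes.
struct Val {
  virtual ~Val() = default;
};
struct Float : Val {
  double value;
  explicit Float(double v) : value(v) {}
};

// Base mutator: identity on every node type. Subclasses override only
// the node types they want to transform.
struct BaseMutator {
  virtual ~BaseMutator() = default;
  virtual Val* mutate(Val* v) {
    return v;
  }
};

// The sample transformation described in the comment: any float with
// value 1.0 is rewritten to 0.0 by building a *new* node, since nodes
// are treated as immutable.
struct OneToZeroMutator : BaseMutator {
  Val* mutate(Val* v) override {
    auto* f = dynamic_cast<Float*>(v);
    return (f != nullptr && f->value == 1.0) ? new Float(0.0) : v;
  }
};
```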
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/ops/all_ops.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <ops/alias.h>
3 | #include <ops/arith.h>
4 | #include <ops/composite.h>
5 | #include <ops/normalization.h>
6 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/parallel_type_bitmap.cpp:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | namespace torch {
4 | namespace jit {
5 | namespace fuser {
6 | namespace cuda {
7 |
8 | constexpr std::bitset<ParallelTypeBitmap::kNumParallelTypes>
9 | ParallelTypeBitmap::kTIDBits;
10 | constexpr std::bitset<ParallelTypeBitmap::kNumParallelTypes>
11 | ParallelTypeBitmap::kBIDBits;
12 |
13 | std::string ParallelTypeBitmap::toString() const {
14 | std::stringstream ss;
15 | ss << "(";
16 | bool is_first = true;
17 | for (ParallelType pt : *this) {
18 | if (!is_first) {
19 | ss << " ";
20 | }
21 | ss << pt;
22 | is_first = false;
23 | }
24 | ss << ")";
25 | return ss.str();
26 | }
27 |
28 | } // namespace cuda
29 | } // namespace fuser
30 | } // namespace jit
31 | } // namespace torch
32 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/partial_split_map.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 | #include
8 |
9 | #include
10 |
11 | namespace torch {
12 | namespace jit {
13 | namespace fuser {
14 | namespace cuda {
15 |
16 | //! Collects start and stop offsets of all split root domains. Offsets
17 | //! are zero unless partially split.
18 | class TORCH_CUDA_CU_API PartialSplitMap {
19 | public:
20 | void build(Fusion* fusion);
21 |
22 | Val* getStartOffset(IterDomain* root_domain) const;
23 | Val* getStopOffset(IterDomain* root_domain) const;
24 |
25 | private:
26 | std::unordered_map<IterDomain*, Val*> start_offset_map_;
27 | std::unordered_map<IterDomain*, Val*> stop_offset_map_;
28 | };
29 |
30 | } // namespace cuda
31 | } // namespace fuser
32 | } // namespace jit
33 | } // namespace torch
34 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/partition.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 |
6 | /*
7 |  * API for querying node compatibility in CudaCodeGen
8 | *
9 | * It is used in the optimization passes, where the graph is traversed and parts
10 |  * that could be handled by CudaCodegen are partitioned and stuffed in
11 | * `attr::Subgraph` of `prim::CudaFusionGroup`.
12 | *
13 | * Logic right now is very simple. On top of device placement, we consider a
14 | * `Node` compatible when we have a parsing rule for it in our parser.
15 | */
16 |
17 | namespace torch {
18 | namespace jit {
19 | namespace fuser {
20 | namespace cuda {
21 |
22 | TORCH_CUDA_CU_API bool isFusibleCudaFusionGroup(const Node* node);
23 |
24 | // consider if `node` could be fused into `fusion`
25 | TORCH_CUDA_CU_API bool isFusibleCudaFusionGroup(
26 | const Node* fusion,
27 | const Node* node);
28 |
29 | } // namespace cuda
30 | } // namespace fuser
31 | } // namespace jit
32 | } // namespace torch
33 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/python_frontend/python_bindings.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 |
6 | namespace torch {
7 | namespace jit {
8 | void initNvFuserPythonBindings(PyObject* module);
9 | } // namespace jit
10 | } // namespace torch
11 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/python_frontend/python_bindings_extension.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | PYBIND11_MODULE(EXTENSION_NAME, m) {
5 | m.doc() = "nvfuser C API python binding"; // optional module docstring
6 | torch::jit::initNvFuserPythonBindings(m.ptr());
7 | }
8 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/scheduler/all_schedulers.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | namespace torch {
8 | namespace jit {
9 | namespace fuser {
10 | namespace cuda {
11 |
12 | enum class TORCH_CUDA_CU_API ScheduleHeuristic {
13 | None,
14 | NoOp,
15 | PointWise,
16 | Reduction,
17 | Persistent,
18 | Transpose
19 | };
20 | } // namespace cuda
21 | } // namespace fuser
22 | } // namespace jit
23 | } // namespace torch
24 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/scheduler/debug_utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace torch {
4 | namespace jit {
5 | namespace fuser {
6 | namespace cuda {
7 |
8 | namespace scheduler_debug_utils {
9 |
10 | // Basic logging utility for any messages in scheduler or segmenter
11 | template <typename... Args>
12 | void canScheduleMessage(const Args&... args) {
13 | // Using builtin expect to reduce the overhead slightly. Alternatively, we
14 | // may want to allow this message in debug builds only, but that would be
15 | // inconvenient for user support.
16 | if (C10_UNLIKELY(isDebugDumpEnabled(DebugDumpOption::FusionSegmenterLog))) {
17 | std::cout << c10::str(args...) << "\n";
18 | }
19 | }
20 |
21 | // Short-cut message for flagging why schedulers cannot schedule fusions,
22 | // assuming the first argument is the heuristic type (not actively checked).
23 | template <typename HeuristicType, typename... Args>
24 | void canScheduleRejectReason(HeuristicType heuristic, const Args&... args) {
25 | canScheduleMessage(
26 | "Scheduler _", heuristic, "_ ***rejected*** because : ", args...);
27 | }
28 |
29 | } // namespace scheduler_debug_utils
30 |
31 | } // namespace cuda
32 | } // namespace fuser
33 | } // namespace jit
34 | } // namespace torch
35 |
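As a usage sketch, a scheduler's compile-time check might report a rejection like this (the message is made up; the call assumes the surrounding nvfuser headers are available):

```
// Hypothetical rejection inside a scheduler's canSchedule() check.
scheduler_debug_utils::canScheduleRejectReason(
    ScheduleHeuristic::Reduction, "no reduction domain found in fusion");
// When the FusionSegmenterLog dump option is enabled, this prints roughly:
//   Scheduler _reduction_ ***rejected*** because : no reduction domain ...
```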
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/scheduler/heuristic.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 |
6 | #include
7 |
8 | namespace torch {
9 | namespace jit {
10 | namespace fuser {
11 | namespace cuda {
12 |
13 | class HeuristicParams : public PolymorphicBase {
14 | public:
15 | std::string tag = "";
16 |
17 | LaunchParams lparams;
18 |
19 | virtual std::string toString() const {
20 | return "Undefined Heuristic Params";
21 | }
22 |
23 | virtual size_t hash() const = 0;
24 |
25 | virtual ~HeuristicParams() = default;
26 |
27 | virtual bool sameAs(const std::shared_ptr<HeuristicParams>& other) const = 0;
28 |
29 | virtual std::shared_ptr<HeuristicParams> clone() const = 0;
30 |
31 | HeuristicParams() = default;
32 | HeuristicParams(const std::string& tag) : tag(tag) {}
33 | };
34 |
35 | } // namespace cuda
36 | } // namespace fuser
37 | } // namespace jit
38 | } // namespace torch
39 |
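A sketch of how a concrete heuristic could satisfy this interface; `MyParams` and its single field are invented for illustration and are not part of nvfuser (assumes the header above plus `<memory>`, `<string>`, and `<functional>`):

```
class MyParams : public HeuristicParams {
 public:
  int unroll_factor = 1;

  std::string toString() const override {
    return "MyParams: unroll_factor=" + std::to_string(unroll_factor);
  }

  size_t hash() const override {
    return std::hash<int>()(unroll_factor);
  }

  bool sameAs(const std::shared_ptr<HeuristicParams>& other) const override {
    auto o = std::dynamic_pointer_cast<MyParams>(other);
    return o != nullptr && o->unroll_factor == unroll_factor;
  }

  std::shared_ptr<HeuristicParams> clone() const override {
    return std::make_shared<MyParams>(*this);
  }
};
```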
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/scheduler/normalization.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 |
8 | // TODO: If caching inputs would require persistence, we are sending it to the
9 | // persistent kernel scheduler. This isn't necessary if the only persistent
10 | // buffers are inputs, as we could re-read them from global memory. Need to
11 | // consider if this is worth implementing.
12 |
13 | namespace torch {
14 | namespace jit {
15 | namespace fuser {
16 | namespace cuda {
17 |
18 | class SchedulerRuntimeInfo;
19 | class HeuristicSummary;
20 |
21 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics(
22 | Fusion* fusion,
23 | const at::ArrayRef<c10::IValue>& runtime_inputs,
24 | HeuristicSummary* data_cache = nullptr);
25 |
26 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics(
27 | Fusion* fusion,
28 | SchedulerRuntimeInfo& runtime_info,
29 | HeuristicSummary* data_cache = nullptr);
30 |
31 | TORCH_CUDA_CU_API void schedulePersistentKernel(
32 | Fusion* fusion,
33 | const ReductionParams& rparams);
34 |
35 | } // namespace cuda
36 | } // namespace fuser
37 | } // namespace jit
38 | } // namespace torch
39 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/scheduler/reduction.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 |
8 | namespace torch {
9 | namespace jit {
10 | namespace fuser {
11 | namespace cuda {
12 |
13 | class SchedulerRuntimeInfo;
14 | class HeuristicSummary;
15 |
16 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics(
17 | Fusion* fusion,
18 | const at::ArrayRef<c10::IValue>& runtime_inputs,
19 | HeuristicSummary* data_cache = nullptr);
20 |
21 | TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics(
22 | Fusion* fusion,
23 | SchedulerRuntimeInfo& runtime_info,
24 | HeuristicSummary* data_cache = nullptr);
25 |
26 | TORCH_CUDA_CU_API void scheduleReduction(
27 | Fusion* fusion,
28 | const ReductionParams& rparams);
29 | } // namespace cuda
30 | } // namespace fuser
31 | } // namespace jit
32 | } // namespace torch
33 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/transform_rfactor.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 | #include
7 |
8 | #include
9 | #include
10 |
11 | namespace torch {
12 | namespace jit {
13 | namespace fuser {
14 | namespace cuda {
15 |
16 | // TODO: Only replay dispatch is really borrowed from TransformIter; we should
17 | // reevaluate the reuse of dispatch for classes that inherit TransformIter.
18 | class TORCH_CUDA_CU_API TransformRFactor {
19 | public:
20 | // Transform the provided tensor domain into two domains, a producer and a
21 | // consumer domain. These domains are created by reducing the given axes in
22 | // the producer domain, and reducing the remaining reduction axes in the
23 | // consumer domain.
24 | static std::pair<TensorDomain*, TensorDomain*> runReplay(
25 | TensorDomain*,
26 | std::vector<int> axes);
27 | };
28 |
29 | } // namespace cuda
30 | } // namespace fuser
31 | } // namespace jit
32 | } // namespace torch
33 |
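The producer/consumer split above is just associativity of the reduction. Schematically, for a reduction axis of extent $N$ split by a factor $B$ (a sketch, assuming $B$ divides $N$):

$$\sum_{i=0}^{N-1} x_i \;=\; \sum_{j=0}^{N/B-1}\left(\sum_{k=0}^{B-1} x_{jB+k}\right)$$

The inner partial sums are computed in the producer (rfactor) domain, and the consumer domain reduces the $N/B$ partial results.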
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/type_inference.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 |
6 | namespace torch {
7 | namespace jit {
8 | namespace fuser {
9 | namespace cuda {
10 |
11 | TORCH_CUDA_CU_API void TypePropagate(std::shared_ptr<Graph>& graph);
12 |
13 | } // namespace cuda
14 | } // namespace fuser
15 | } // namespace jit
16 | } // namespace torch
17 |
--------------------------------------------------------------------------------
/third_party/nvfuser/csrc/vectorization_info.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | #include
6 |
7 | namespace torch {
8 | namespace jit {
9 | namespace fuser {
10 | namespace cuda {
11 |
12 | struct VectorizedSetInfo {
13 | //! Producer of a vectorized set
14 | TensorView* producer_tv = nullptr;
15 | //! Consumer of a vectorized set
16 | TensorView* consumer_tv = nullptr;
17 | //! Number of elements to vectorize
18 | int word_size = -1;
19 | //! Vectorized domain
20 | IterDomain* vectorized_leaf_id = nullptr;
21 | //! Right-most root dependent domain of the leaf domain
22 | IterDomain* vectorized_root_id = nullptr;
23 | //! All of the dependent root domains that are contiguously merged
24 | std::unordered_set<IterDomain*> contig_root_ids;
25 | };
26 |
27 | } // namespace cuda
28 | } // namespace fuser
29 | } // namespace jit
30 | } // namespace torch
31 |
--------------------------------------------------------------------------------
/third_party/nvfuser/examples/sinh_extension/README.md:
--------------------------------------------------------------------------------
1 | # Build
2 |
3 | ```
4 | python setup.py install
5 | ```
6 |
7 | # Test
8 |
9 | ```
10 | python test.py
11 | ```
12 |
--------------------------------------------------------------------------------
/third_party/nvfuser/examples/sinh_extension/main.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 |
6 | #include
7 |
8 | using namespace torch::jit::fuser::cuda;
9 |
10 | at::Tensor sinh_nvfuser(const at::Tensor& input) {
11 | Fusion fusion;
12 | FusionGuard fg(&fusion);
13 |
14 | int dim = input.dim();
15 | auto dtype = input.scalar_type();
16 | auto x =
17 | TensorViewBuilder().ndims(dim).dtype(aten_to_data_type(dtype)).build();
18 | fusion.addInput(x);
19 |
20 | // Using the identity sinh(x) = [ exp(x) - exp(-x) ] / 2
21 | auto output = div(sub(exp(x), exp(neg(x))), IrBuilder::create<Double>(2.0));
22 | fusion.addOutput(output);
23 |
24 | std::cout << "Create fusion:" << std::endl;
25 | fusion.print();
26 |
27 | auto lparams = schedulePointwise(&fusion, {input});
28 |
29 | FusionExecutor fe;
30 | fe.compileFusion(&fusion, {input}, lparams);
31 | auto outputs = fe.runFusion({input}, lparams);
32 |
33 | return outputs[0];
34 | }
35 |
36 | TORCH_LIBRARY(myop, m) {
37 | m.def("sinh_nvfuser", sinh_nvfuser);
38 | }
39 |
40 | TORCH_LIBRARY_IMPL(myop, CUDA, m) {
41 | m.impl("sinh_nvfuser", sinh_nvfuser);
42 | }
43 |
44 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {}
45 |
--------------------------------------------------------------------------------
/third_party/nvfuser/examples/sinh_extension/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
3 |
4 | setup(
5 | name='nvfuser_extension',
6 | ext_modules=[
7 | CUDAExtension(
8 | name='nvfuser_extension',
9 | pkg='nvfuser_extension',
10 | sources=['main.cpp'])
11 | ],
12 | cmdclass={
13 | 'build_ext': BuildExtension
14 | })
15 |
--------------------------------------------------------------------------------
/third_party/nvfuser/examples/sinh_extension/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import nvfuser_extension # noqa: F401
3 |
4 | t = torch.randn((5, 5), device='cuda')
5 | expected = torch.sinh(t)
6 | output = torch.ops.myop.sinh_nvfuser(t)
7 |
8 | print("Expected:", expected)
9 | print("Output:", output)
10 |
11 | assert torch.allclose(output, expected)
12 | print("They match!")
13 |
--------------------------------------------------------------------------------
/third_party/nvfuser/examples/sinh_libtorch/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
2 | project(sinh_example LANGUAGES CXX)
3 | set(CMAKE_CXX_STANDARD 14)
4 |
5 | find_package(Torch REQUIRED)
6 |
7 | add_executable(sinh_example main.cpp)
8 | target_link_libraries(sinh_example ${TORCH_LIBRARIES})
9 |
--------------------------------------------------------------------------------
/third_party/nvfuser/examples/sinh_libtorch/README.md:
--------------------------------------------------------------------------------
1 | # Build
2 |
3 | ```
4 | mkdir build
5 | cd build
6 | cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" ..
7 | make -j
8 | ```
9 |
10 | # Test
11 |
12 | ```
13 | ./sinh_example
14 | ```
15 |
--------------------------------------------------------------------------------
/third_party/nvfuser/python/__init__.py:
--------------------------------------------------------------------------------
1 | from . import _C
2 |
--------------------------------------------------------------------------------
/third_party/nvfuser/python_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/third_party/nvfuser/python_tests/__init__.py
--------------------------------------------------------------------------------
/third_party/nvfuser/runtime/bf16_support.cu:
--------------------------------------------------------------------------------
1 |
2 | #define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
3 | #define __NVFUSER_BFLOAT_TO_CUS(var) \
4 | *(reinterpret_cast<const unsigned short*>(&(var)))
5 |
6 | struct __bfloat;
7 | __device__ __bfloat __float2bfloat(const float);
8 |
9 | struct __align__(2) __bfloat {
10 | __bfloat() = default;
11 |
12 | __device__ __bfloat(const float f) {
13 | __x = __float2bfloat(f).__x;
14 | }
15 |
16 | protected:
17 | unsigned short __x;
18 | };
19 |
20 | __device__ __bfloat __float2bfloat(const float f) {
21 | __bfloat val;
22 | asm("{ cvt.rn.bf16.f32 %0, %1;}\n"
23 | : "=h"(__NVFUSER_BFLOAT_TO_US(val))
24 | : "f"(f));
25 | return val;
26 | }
27 |
28 | __device__ float __bfloat2float(const __bfloat h) {
29 | float val;
30 | asm("{ mov.b32 %0, {0,%1};}\n"
31 | : "=f"(val)
32 | : "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
33 | return val;
34 | }
35 |
--------------------------------------------------------------------------------
/third_party/nvfuser/runtime/bf16_support_rocm.cu:
--------------------------------------------------------------------------------
1 |
2 | struct __align__(2) __bfloat {
3 | __bfloat() = default;
4 |
5 | inline __device__ __bfloat(const float f) {
6 | if (f != f) {
7 | __x = uint16_t(0x7FC0);
8 | } else {
9 | union {
10 | uint32_t U32;
11 | float F32;
12 | };
13 |
14 | F32 = f;
15 | uint32_t rounding_bias = ((U32 >> 16) & 1) + uint32_t(0x7FFF);
16 | __x = static_cast<uint16_t>((U32 + rounding_bias) >> 16);
17 | }
18 | }
19 |
20 | inline __device__ operator float() const {
21 | float res = 0;
22 | uint32_t tmp = __x;
23 | tmp <<= 16;
24 | float* tempRes = reinterpret_cast<float*>(&tmp);
25 | res = *tempRes;
26 | return res;
27 | }
28 |
29 | protected:
30 | unsigned short __x;
31 | };
32 |
33 | __device__ __bfloat __float2bfloat(const float f) {
34 | return __bfloat(f);
35 | }
36 |
37 | __device__ float __bfloat2float(const __bfloat h) {
38 | return float(h);
39 | }
40 |
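The `rounding_bias` trick above implements round-to-nearest-even: add `0x7FFF` plus the LSB of the upper half, then truncate. A host-side replica that can be checked without a GPU (the function name is ours; the NaN handling from the device code is omitted):

```
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint16_t float_to_bf16_bits(float f) {
  uint32_t u32;
  std::memcpy(&u32, &f, sizeof(u32)); // well-defined type pun
  uint32_t rounding_bias = ((u32 >> 16) & 1) + 0x7FFFu;
  return static_cast<uint16_t>((u32 + rounding_bias) >> 16);
}

int main() {
  // 1.0f (0x3F800000) is exactly representable: the upper half survives.
  std::printf("%04X\n", float_to_bf16_bits(1.0f)); // 3F80
  // 1.00390625f (0x3F808000) sits exactly halfway; the tie rounds to the
  // even result 0x3F80 rather than up to 0x3F81.
  std::printf("%04X\n", float_to_bf16_bits(1.00390625f)); // 3F80
  return 0;
}
```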
--------------------------------------------------------------------------------
/third_party/nvfuser/runtime/block_sync_default.cu:
--------------------------------------------------------------------------------
1 |
2 | // Default block synchronization. Just use __barrier_sync
3 | namespace block_sync {
4 |
5 | __forceinline__ __device__ void init() {}
6 |
7 | // Thread-block synchronization
8 | __forceinline__ __device__ void sync() {
9 | __barrier_sync(0);
10 | }
11 |
12 | } // namespace block_sync
13 |
--------------------------------------------------------------------------------
/third_party/nvfuser/runtime/block_sync_default_rocm.cu:
--------------------------------------------------------------------------------
1 |
2 | // Default block synchronization. Just use __syncthreads
3 | namespace block_sync {
4 |
5 | __forceinline__ __device__ void init() {}
6 |
7 | // Thread-block synchronization
8 | __forceinline__ __device__ void sync() {
9 | __syncthreads();
10 | }
11 |
12 | } // namespace block_sync
13 |
--------------------------------------------------------------------------------
/third_party/nvfuser/runtime/broadcast.cu:
--------------------------------------------------------------------------------
1 |
2 | namespace broadcast {
3 | // Broadcasts within partitioned groups of threads.
4 | //
5 | // X_THREAD: Broadcast from threadIdx.x == 0 if true
6 | // Y_THREAD: Broadcast from threadIdx.y == 0 if true
7 | // Z_THREAD: Broadcast from threadIdx.z == 0 if true
8 | // inp_val: Per-thread source value. Only valid when the thread is a source.
9 | // out: Per-thread output location
10 | //
11 | template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
12 | __device__ void blockBroadcast(
13 | T& out,
14 | const T& inp_val,
15 | T* shared_mem,
16 | bool read_write_pred) {
17 | const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) &&
18 | (!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);
19 |
20 | const auto shared_offset =
21 | index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
22 | threadIdx, blockDim);
23 |
24 | if (has_valid_data && read_write_pred) {
25 | shared_mem[shared_offset] = inp_val;
26 | }
27 |
28 | block_sync::sync();
29 |
30 | if (read_write_pred) {
31 | out = shared_mem[shared_offset];
32 | }
33 |
34 | block_sync::sync();
35 | }
36 |
37 | } // namespace broadcast
38 |
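A hand-written usage sketch of blockBroadcast (not generated nvfuser code; it assumes the block_sync and index_utils runtime files are compiled into the same unit, and a 32x32x1 thread block):

```
// Broadcast the value produced by threadIdx.x == 0 across each block row.
__global__ void broadcast_row(float* out) {
  __shared__ float smem[32]; // one slot per threadIdx.y (non-broadcast dim)
  float v = (threadIdx.x == 0) ? 42.0f : 0.0f; // per-row source value
  float result;
  // X_THREAD = true: the source is threadIdx.x == 0 in each (y, z) group.
  broadcast::blockBroadcast<true, false, false, float>(
      result, v, smem, /*read_write_pred=*/true);
  out[threadIdx.y * blockDim.x + threadIdx.x] = result;
}
```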
--------------------------------------------------------------------------------
/third_party/nvfuser/runtime/tensor.cu:
--------------------------------------------------------------------------------
1 | template <typename T, int N>
2 | struct Tensor {
3 | __device__ T& operator[](nvfuser_index_t ind) {
4 | return data[ind];
5 | };
6 |
7 | T* data;
8 | nvfuser_index_t size[N];
9 | nvfuser_index_t stride[N];
10 | };
11 |
12 | // Specialization for the 0-dim case, as it does not need size and stride
13 | // arrays. They would also be a compile error, since zero-length arrays are not allowed.
14 | template <typename T>
15 | struct Tensor<T, 0> {
16 | __device__ T& operator[](nvfuser_index_t) {
17 | return *data;
18 | };
19 |
20 | T* data;
21 | };
22 |
23 | // Specialization for the 0-dim case that is easy to pass in as a CPU-based tensor.
24 | template <typename T>
25 | struct CpuScalarTensor {
26 | __device__ T& operator[](int) {
27 | return data;
28 | };
29 |
30 | T data;
31 | };
32 |
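A usage sketch of the 1-D wrapper above (hand-written; `nvfuser_index_t` is defined elsewhere in the runtime, so the typedef here is an assumption for the sketch):

```
#include <cstdint>
using nvfuser_index_t = int64_t; // assumption: matches the runtime typedef

// operator[] takes a pre-computed linear index into the raw data pointer;
// the generated code is responsible for applying strides.
__global__ void scale(Tensor<float, 1> t, float alpha) {
  nvfuser_index_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < t.size[0]) {
    t[i] *= alpha;
  }
}
```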
--------------------------------------------------------------------------------
/third_party/nvfuser/runtime/type_traits.cu:
--------------------------------------------------------------------------------
1 | // Type trait utils
2 | template <typename Type, bool is_volatile>
3 | struct MaybeVolatile;
4 |
5 | template <typename Type>
6 | struct MaybeVolatile<Type, true> {
7 | using type = volatile Type;
8 | };
9 |
10 | template <typename Type>
11 | struct MaybeVolatile<Type, false> {
12 | using type = Type;
13 | };
14 |
15 | template <typename... Types>
16 | struct TypeList {};
17 |
18 | template <int index, typename T, typename... Types>
19 | struct TypeSelector {
20 | using type = typename TypeSelector<index - 1, Types...>::type;
21 | };
22 |
23 | template <typename T, typename... Types>
24 | struct TypeSelector<0, T, Types...> {
25 | using type = T;
26 | };
27 |
28 | template <typename T0, typename T1>
29 | struct IsSameType {
30 | static constexpr bool value = false;
31 | };
32 |
33 | template <typename T0>
34 | struct IsSameType<T0, T0> {
35 | static constexpr bool value = true;
36 | };
37 |
38 | template <typename T>
39 | struct IsPointerType {
40 | static constexpr bool value = false;
41 | };
42 |
43 | template <typename T>
44 | struct IsPointerType<T*> {
45 | static constexpr bool value = true;
46 | };
47 |
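A few compile-time checks of the traits above (illustrative only; they compile with any C++11 compiler once placed after the definitions):

```
static_assert(IsSameType<int, int>::value, "identical types match");
static_assert(!IsSameType<int, float>::value, "distinct types do not");
static_assert(IsPointerType<int*>::value, "pointer detected");
static_assert(!IsPointerType<int>::value, "non-pointer rejected");
// TypeSelector picks the index-th type from a pack: index 1 of (int, float).
static_assert(
    IsSameType<TypeSelector<1, int, float>::type, float>::value,
    "TypeSelector selects by index");
// MaybeVolatile adds volatile only when the flag is true.
static_assert(
    IsSameType<MaybeVolatile<int, true>::type, volatile int>::value,
    "MaybeVolatile<T, true> is volatile");
```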
--------------------------------------------------------------------------------
/third_party/sleef.bzl:
--------------------------------------------------------------------------------
1 | load("@rules_cc//cc:defs.bzl", "cc_library")
2 |
3 | # This macro provides for generating both "sleef" and
4 | # "sleefdet" libraries for a given set of code. The difference is
5 | # that the "det" libraries get compiled with "-DDETERMINISTIC=1".
6 |
7 | def sleef_cc_library(name, copts, **kwargs):
8 | cc_library(
9 | name = name,
10 | copts = copts,
11 | **kwargs
12 | )
13 |
14 | prefix = "sleef"
15 | if not name.startswith(prefix):
16 | fail("name {} does not start with {}".format(repr(name), repr(prefix)))
17 |
18 | cc_library(
19 | name = name.replace(prefix, prefix + "det", 1),
20 | copts = copts + ["-DDETERMINISTIC=1"],
21 | **kwargs
22 | )
23 |
--------------------------------------------------------------------------------
/third_party/tensorflow_cuda_bazel_build/cuda/build_defs.bzl:
--------------------------------------------------------------------------------
1 | # Macros for building CUDA code.
2 | def if_cuda(if_true, if_false = []):
3 | """Shorthand for select()'ing on whether we're building with CUDA.
4 |
5 | Returns a select statement which evaluates to if_true if we're building
6 | with CUDA enabled. Otherwise, the select statement evaluates to if_false.
7 |
8 | """
9 | return select({
10 | "@local_config_cuda//cuda:using_clang": if_true,
11 | "@local_config_cuda//cuda:using_nvcc": if_true,
12 | "//conditions:default": if_false,
13 | })
14 |
15 | def cuda_default_copts():
16 | """Default options for all CUDA compilations."""
17 | return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + [])
18 |
19 | def cuda_is_configured():
20 | """Returns true if CUDA was enabled during the configure process."""
21 | return True
22 |
23 | def if_cuda_is_configured(x):
24 | """Tests if the CUDA was enabled during the configure process.
25 |
26 | Unlike if_cuda(), this does not require that we are building with
27 | --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries.
28 | """
29 | if cuda_is_configured():
30 | return x
31 | return []
32 |
--------------------------------------------------------------------------------
/third_party/valgrind-headers/README.md:
--------------------------------------------------------------------------------
1 | This folder contains 2 Valgrind headers, downloaded from
2 | https://sourceware.org/git/?p=valgrind.git;a=blob;f=callgrind/callgrind.h;hb=HEAD
3 | https://sourceware.org/git/?p=valgrind.git;a=blob;f=include/valgrind.h;hb=HEAD
4 |
5 |
6 |
--------------------------------------------------------------------------------
/torchgen/BUCK.oss:
--------------------------------------------------------------------------------
1 | python_library(
2 | name = "torchgen",
3 | srcs = glob(
4 | ["**/*.py"],
5 | ),
6 | base_module = "torchgen",
7 | visibility = ["PUBLIC"],
8 | deps = [
9 | "//third_party:pyyaml",
10 | "//third_party:typing-extensions",
11 | ],
12 | )
13 |
14 | python_binary(
15 | name = "gen",
16 | main_module = "torchgen.gen",
17 | visibility = [
18 | "PUBLIC",
19 | ],
20 | deps = [
21 | ":torchgen",
22 | ],
23 | )
24 |
--------------------------------------------------------------------------------
/torchgen/BUILD.bazel:
--------------------------------------------------------------------------------
1 | load("//:tools/bazel.bzl", "rules")
2 | load(":build.bzl", "define_targets")
3 |
4 | define_targets(rules = rules)
5 |
--------------------------------------------------------------------------------
/torchgen/__init__.py:
--------------------------------------------------------------------------------
1 | """torchgen
2 |
3 | This module contains code generation utilities for PyTorch. It is used to
4 | build PyTorch from source, but may also be used for out-of-tree projects
5 | that extend PyTorch.
6 |
7 | Note well that we provide no BC guarantees for torchgen. If you're interested
8 | in using torchgen and want the PyTorch team to be aware, please reach out
9 | on GitHub.
10 | """
11 |
--------------------------------------------------------------------------------
/torchgen/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/api/__init__.py
--------------------------------------------------------------------------------
/torchgen/api/meta.py:
--------------------------------------------------------------------------------
1 | from torchgen.model import NativeFunctionsGroup
2 |
3 | # Follows dispatcher calling convention, but:
4 | # - Mutable arguments not allowed. Meta functions are always
5 | # written in functional form. Look at FunctionSchema.signature()
6 | # - No tensor returns; instead we return a TensorMeta describing
7 | # the tensor in question
8 |
9 |
10 | def name(g: NativeFunctionsGroup) -> str:
11 | # use the overload name from the functional version
12 | return str(g.functional.func.name).replace(".", "_")
13 |
--------------------------------------------------------------------------------
/torchgen/api/types/__init__.py:
--------------------------------------------------------------------------------
1 | from .types import *
2 | from .types_base import *
3 | from .signatures import * # isort:skip
4 |
--------------------------------------------------------------------------------
/torchgen/build.bzl:
--------------------------------------------------------------------------------
1 | def define_targets(rules):
2 | rules.py_library(
3 | name = "torchgen",
4 | srcs = rules.glob(["**/*.py"]),
5 | visibility = ["//visibility:public"],
6 | deps = [
7 | rules.requirement("PyYAML"),
8 | rules.requirement("typing-extensions"),
9 | ],
10 | )
11 |
12 | rules.py_binary(
13 | name = "gen",
14 | srcs = [":torchgen"],
15 | visibility = ["//visibility:public"],
16 | )
17 |
18 | rules.py_binary(
19 | name = "gen_executorch",
20 | srcs = [":torchgen"],
21 | visibility = ["//visibility:public"],
22 | )
23 |
--------------------------------------------------------------------------------
/torchgen/dest/__init__.py:
--------------------------------------------------------------------------------
1 | from .lazy_ir import (
2 | generate_non_native_lazy_ir_nodes as generate_non_native_lazy_ir_nodes,
3 | GenLazyIR as GenLazyIR,
4 | GenLazyNativeFuncDefinition as GenLazyNativeFuncDefinition,
5 | GenLazyShapeInferenceDefinition as GenLazyShapeInferenceDefinition,
6 | )
7 | from .native_functions import (
8 | compute_native_function_declaration as compute_native_function_declaration,
9 | )
10 | from .register_dispatch_key import (
11 | gen_registration_headers as gen_registration_headers,
12 | gen_registration_helpers as gen_registration_helpers,
13 | RegisterDispatchKey as RegisterDispatchKey,
14 | )
15 | from .ufunc import (
16 | compute_ufunc_cpu as compute_ufunc_cpu,
17 | compute_ufunc_cpu_kernel as compute_ufunc_cpu_kernel,
18 | compute_ufunc_cuda as compute_ufunc_cuda,
19 | )
20 |
--------------------------------------------------------------------------------
/torchgen/executorch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/executorch/__init__.py
--------------------------------------------------------------------------------
/torchgen/executorch/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/executorch/api/__init__.py
--------------------------------------------------------------------------------
/torchgen/executorch/api/types/__init__.py:
--------------------------------------------------------------------------------
1 | from .types import *
2 | from .signatures import * # isort:skip
3 |
--------------------------------------------------------------------------------
/torchgen/operator_versions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/operator_versions/__init__.py
--------------------------------------------------------------------------------
/torchgen/operator_versions/gen_mobile_upgraders_constant.py:
--------------------------------------------------------------------------------
1 | MOBILE_UPGRADERS_HEADER_DESCRIPTION = """/**
2 | * @generated
3 | * This is an auto-generated file. Please do not modify it by hand.
4 | * To re-generate, please run:
5 | * cd ~/pytorch && python torchgen/operator_versions/gen_mobile_upgraders.py
6 | */
7 | """
8 |
--------------------------------------------------------------------------------
/torchgen/selective_build/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/selective_build/__init__.py
--------------------------------------------------------------------------------
/torchgen/static_runtime/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc123/pytorch/a39ea6f21361e531ce7e703224bfbce7fc564083/torchgen/static_runtime/__init__.py
--------------------------------------------------------------------------------