├── patches
│   ├── 0005-fp16-min-cmake.patch
│   ├── 0004-psimd-min-cmake.patch
│   ├── 0007-onnx-min-cmake.patch
│   ├── 0008-foxi-min-cmake.patch
│   ├── 0002-protobuf-min-cmake.patch
│   ├── 0003-clog-min-cmake.patch
│   ├── 0001-add-blackwell-arch.patch
│   ├── 0009-cufft-guard-missing-enums.patch
│   ├── 0010-cub-equality-fallback.patch
│   ├── 0006-find-cub-from-cuda.patch
│   └── 0011-cub-modernize.patch
├── LICENSE
├── .gitignore
└── README.md
/patches/0005-fp16-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/FP16/CMakeLists.txt b/third_party/FP16/CMakeLists.txt 2 | index 937cf3b54b5..1a62278f235 100644 3 | --- a/third_party/FP16/CMakeLists.txt 4 | +++ b/third_party/FP16/CMakeLists.txt 5 | @@ -1,4 +1,4 @@ 6 | -CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 7 | +CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) 8 | 9 | INCLUDE(GNUInstallDirs) 10 | 11 | -------------------------------------------------------------------------------- /patches/0004-psimd-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/psimd/CMakeLists.txt b/third_party/psimd/CMakeLists.txt 2 | index aa2a0781ff4..9b8d97323b4 100644 3 | --- a/third_party/psimd/CMakeLists.txt 4 | +++ b/third_party/psimd/CMakeLists.txt 5 | @@ -1,4 +1,4 @@ 6 | -CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 7 | +CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) 8 | 9 | INCLUDE(GNUInstallDirs) 10 | 11 | -------------------------------------------------------------------------------- /patches/0007-onnx-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/onnx/CMakeLists.txt b/third_party/onnx/CMakeLists.txt 2 | index 6d7ca846..446e4bb0 100644 3 | --- a/third_party/onnx/CMakeLists.txt 4 | +++ b/third_party/onnx/CMakeLists.txt 5 | @@ -1,5 +1,5 @@ 6 | # Minimum CMake required 7 | -cmake_minimum_required(VERSION 3.1) 8 | +cmake_minimum_required(VERSION 3.21) 9 | include(cmake/Utils.cmake) 10 | # Set default build type 11 | if(NOT CMAKE_BUILD_TYPE) 12 | 13 | -------------------------------------------------------------------------------- /patches/0008-foxi-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/foxi/CMakeLists.txt b/third_party/foxi/CMakeLists.txt 2 | index b406d0f..4120c94 100644 3 | --- a/third_party/foxi/CMakeLists.txt 4 | +++ b/third_party/foxi/CMakeLists.txt 5 | @@ -1,5 +1,5 @@ 6 | # Minimum CMake required 7 | -cmake_minimum_required(VERSION 3.1) 8 | +cmake_minimum_required(VERSION 3.21) 9 | # Set default build type 10 | if(NOT CMAKE_BUILD_TYPE) 11 | message(STATUS "Build type not set - defaulting to Release") 12 | 13 | -------------------------------------------------------------------------------- /patches/0002-protobuf-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/protobuf/cmake/CMakeLists.txt b/third_party/protobuf/cmake/CMakeLists.txt 2 | index 9ca31ac0b..123501c66 100644 3 | --- a/third_party/protobuf/cmake/CMakeLists.txt 4 | +++ b/third_party/protobuf/cmake/CMakeLists.txt 5 | @@ -1,5 +1,5 @@ 6 | # Minimum CMake required 7 | -cmake_minimum_required(VERSION 3.1.3) 8 | +cmake_minimum_required(VERSION 3.5.0) 9 | 10 | if(protobuf_VERBOSE) 11 | message(STATUS "Protocol Buffers Configuring...") 12 | --------------------------------------------------------------------------------
/patches/0003-clog-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt 2 | index e763e4e3ba9..2b62765cf2f 100644 3 | --- a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt 4 | +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt 5 | @@ -4,7 +4,7 @@ 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 8 | 9 | -cmake_minimum_required(VERSION 3.1 FATAL_ERROR) 10 | +cmake_minimum_required(VERSION 3.5 FATAL_ERROR) 11 | 12 | include(GNUInstallDirs) 13 | 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Taco 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /patches/0001-add-blackwell-arch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake 2 | index 90de8fb0d84..0ddcd7fb78d 100644 3 | --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake 4 | +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake 5 | @@ -186,7 +186,7 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable) 6 | set(add_ptx TRUE) 7 | set(arch_name ${CMAKE_MATCH_1}) 8 | endif() 9 | - if(arch_name MATCHES "^([0-9]\\.[0-9]a?(\\([0-9]\\.[0-9]\\))?)$") 10 | + if(arch_name MATCHES "^([0-9]+\\.[0-9]a?(\\([0-9]+\\.[0-9]\\))?)$") 11 | set(arch_bin ${CMAKE_MATCH_1}) 12 | set(arch_ptx ${arch_bin}) 13 | else() 14 | @@ -223,6 +223,9 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable) 15 | elseif(${arch_name} STREQUAL "Hopper") 16 | set(arch_bin 9.0) 17 | set(arch_ptx 9.0) 18 | + elseif(${arch_name} STREQUAL "Blackwell") 19 | + set(arch_bin 11.0) 20 | + set(arch_ptx 11.0) 21 | else() 22 | message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS") 23 | endif() 24 | -------------------------------------------------------------------------------- /patches/0009-cufft-guard-missing-enums.patch: -------------------------------------------------------------------------------- 1 | diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h 2 | index 4b02f914d7e..b637e99bd4a 100644 3 | --- a/aten/src/ATen/native/cuda/CuFFTUtils.h 4 | +++ b/aten/src/ATen/native/cuda/CuFFTUtils.h 5 | @@ -38,19 +38,25 @@ static inline std::string _cudaGetErrorEnum(cufftResult error) 6 | return "CUFFT_INVALID_SIZE"; 7 | case CUFFT_UNALIGNED_DATA: 8 | return "CUFFT_UNALIGNED_DATA"; 9 | + #if defined(CUFFT_INCOMPLETE_PARAMETER_LIST) 10 | case CUFFT_INCOMPLETE_PARAMETER_LIST: 11 | return "CUFFT_INCOMPLETE_PARAMETER_LIST"; 12 | + #endif 13 | case CUFFT_INVALID_DEVICE: 14 | return "CUFFT_INVALID_DEVICE"; 15 | + #if defined(CUFFT_PARSE_ERROR) 16 | case CUFFT_PARSE_ERROR: 17 | return "CUFFT_PARSE_ERROR"; 18 | + #endif 19 | case CUFFT_NO_WORKSPACE: 20 | return "CUFFT_NO_WORKSPACE"; 21 | case CUFFT_NOT_IMPLEMENTED: 22 | return "CUFFT_NOT_IMPLEMENTED"; 23 | #if !defined(USE_ROCM) 24 | + #if defined(CUFFT_LICENSE_ERROR) 25 | case CUFFT_LICENSE_ERROR: 26 | return "CUFFT_LICENSE_ERROR"; 27 | + #endif 28 | #endif 29 | case CUFFT_NOT_SUPPORTED: 30 | return "CUFFT_NOT_SUPPORTED"; 31 | 32 | -------------------------------------------------------------------------------- /patches/0010-cub-equality-fallback.patch: -------------------------------------------------------------------------------- 1 | diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh 2 | index c5261534dd4..60597b01b8f 100644 3 | --- a/aten/src/ATen/cuda/cub.cuh 4 | +++ b/aten/src/ATen/cuda/cub.cuh 5 | @@ -7,6 +7,9 @@ 6 | #include 7 | 8 | #include 9 | +#if !defined(USE_ROCM) && CUB_VERSION >= 200000 10 | +#include 11 | +#endif 12 | 13 | #if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() 14 | 15 | @@ -51,6 +54,12 @@ 16 | #define ROCM_HIPCUB(x) x 17 | #endif 18 | 19 | +#if CUB_VERSION < 200000 20 | +#define AT_CUDA_CUB_EQUALITY() NO_ROCM(at_cuda_detail)::cub::Equality() 21 | +#else 22 | +#define AT_CUDA_CUB_EQUALITY() ::cuda::std::equal_to<>() 23 | +#endif 24 | + 25 | #if (!defined(USE_ROCM) && 
!CUB_SUPPORTS_NV_BFLOAT16()) || defined(USE_ROCM) 26 | 27 | #if !defined(USE_ROCM) 28 | @@ -364,7 +373,7 @@ inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT i 29 | TORCH_CHECK(num_items <= std::numeric_limits::max(), 30 | "cub InclusiveSumByKey does not support more than INT_MAX elements"); 31 | CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveSumByKey, 32 | - keys, input, output, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream()); 33 | + keys, input, output, num_items, AT_CUDA_CUB_EQUALITY(), at::cuda::getCurrentCUDAStream()); 34 | } 35 | 36 | template 37 | @@ -372,7 +381,7 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT 38 | TORCH_CHECK(num_items <= std::numeric_limits::max(), 39 | "cub InclusiveSumByKey does not support more than INT_MAX elements"); 40 | CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveScanByKey, 41 | - keys, input, output, scan_op, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream()); 42 | + keys, input, output, scan_op, num_items, AT_CUDA_CUB_EQUALITY(), at::cuda::getCurrentCUDAStream()); 43 | } 44 | 45 | #endif 46 | 47 | -------------------------------------------------------------------------------- /patches/0006-find-cub-from-cuda.patch: -------------------------------------------------------------------------------- 1 | diff --git a/cmake/Modules/FindCUB.cmake b/cmake/Modules/FindCUB.cmake 2 | index e053964e6e4..eadb86b7431 100644 3 | --- a/cmake/Modules/FindCUB.cmake 4 | +++ b/cmake/Modules/FindCUB.cmake 5 | @@ -2,8 +2,56 @@ 6 | # CUB_FOUND - system has CUB 7 | # CUB_INCLUDE_DIRS - the CUB include directory 8 | 9 | +set(_CUB_HINTS) 10 | +set(_CUB_TARGET_SUFFIXES 11 | + "/targets/sbsa-linux/include" 12 | + "/targets/aarch64-linux/include" 13 | + "/targets/x86_64-linux/include" 14 | +) 15 | + 16 | +if(DEFINED CUDA_TOOLKIT_INCLUDE) 17 | + list(APPEND _CUB_HINTS "${CUDA_TOOLKIT_INCLUDE}") 18 | +endif() 19 | +if(DEFINED CUDA_TOOLKIT_ROOT_DIR) 20 | + list(APPEND _CUB_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/include") 21 | + foreach(_suffix ${_CUB_TARGET_SUFFIXES}) 22 | + list(APPEND _CUB_HINTS "${CUDA_TOOLKIT_ROOT_DIR}${_suffix}") 23 | + endforeach() 24 | +endif() 25 | +if(DEFINED CUDA_INCLUDE_DIRS) 26 | + list(APPEND _CUB_HINTS ${CUDA_INCLUDE_DIRS}) 27 | +endif() 28 | +if(DEFINED CUDAToolkit_INCLUDE_DIRS) 29 | + list(APPEND _CUB_HINTS ${CUDAToolkit_INCLUDE_DIRS}) 30 | +endif() 31 | +if(DEFINED ENV{CUDA_HOME}) 32 | + list(APPEND _CUB_HINTS "$ENV{CUDA_HOME}/include") 33 | + foreach(_suffix ${_CUB_TARGET_SUFFIXES}) 34 | + list(APPEND _CUB_HINTS "$ENV{CUDA_HOME}${_suffix}") 35 | + endforeach() 36 | +endif() 37 | +if(DEFINED ENV{CUDA_PATH}) 38 | + list(APPEND _CUB_HINTS "$ENV{CUDA_PATH}/include") 39 | + foreach(_suffix ${_CUB_TARGET_SUFFIXES}) 40 | + list(APPEND _CUB_HINTS "$ENV{CUDA_PATH}${_suffix}") 41 | + endforeach() 42 | +endif() 43 | +list(APPEND _CUB_HINTS "/usr/local/cuda/include" "/opt/cuda/include") 44 | + 45 | +set(_CUB_EXPANDED_HINTS) 46 | +foreach(_hint ${_CUB_HINTS}) 47 | + if(_hint) 48 | + list(APPEND _CUB_EXPANDED_HINTS "${_hint}") 49 | + if(EXISTS "${_hint}/cccl") 50 | + list(APPEND _CUB_EXPANDED_HINTS "${_hint}/cccl") 51 | + endif() 52 | + endif() 53 | +endforeach() 54 | + 55 | +list(REMOVE_DUPLICATES _CUB_EXPANDED_HINTS) 56 | + 57 | find_path(CUB_INCLUDE_DIR 58 | - HINTS "${CUDA_TOOLKIT_INCLUDE}" 59 | + HINTS ${_CUB_EXPANDED_HINTS} 60 | NAMES cub/cub.cuh 61 | DOC "The directory where CUB includes reside" 62 | ) 63 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[codz] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | #poetry.toml 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 114 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control 115 | #pdm.lock 116 | #pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # pixi 121 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 122 | #pixi.lock 123 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 124 | # in the .venv directory. It is recommended not to include this directory in version control. 
125 | .pixi 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .envrc 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 176 | #.idea/ 177 | 178 | # Abstra 179 | # Abstra is an AI-powered process automation framework. 180 | # Ignore directories containing user credentials, local state, and settings. 181 | # Learn more at https://abstra.io/docs 182 | .abstra/ 183 | 184 | # Visual Studio Code 185 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 186 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 187 | # and can be added to the global gitignore or merged into this file. However, if you prefer, 188 | # you could uncomment the following to ignore the entire vscode folder 189 | # .vscode/ 190 | 191 | # Ruff stuff: 192 | .ruff_cache/ 193 | 194 | # PyPI configuration file 195 | .pypirc 196 | 197 | # Cursor 198 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to 199 | # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data 200 | # refer to https://docs.cursor.com/context/ignore-files 201 | .cursorignore 202 | .cursorindexingignore 203 | 204 | # Marimo 205 | marimo/_static/ 206 | marimo/_lsp/ 207 | __marimo__/ 208 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch from Source on Jetson Orin & Thor 2 | 3 | This workspace automates building stock [PyTorch](https://github.com/pytorch/pytorch#from-source) with CUDA enabled for Python 3.8–3.12 on both Jetson Orin (Ampere, JetPack 6.x) and Jetson AGX Thor (Blackwell, JetPack 7.x). It codifies the upstream PyTorch instructions and the NVIDIA Developer Forum guidance for [Orin builds](https://forums.developer.nvidia.com/t/native-build-of-pytorch-for-jetson/71842) and [Thor/JetPack 7 builds](https://forums.developer.nvidia.com/t/pytorch-2-4-build-jetson-orin/291219). NVIDIA's official pip repository only ships a handful of prebuilt versions; building your own CUDA-enabled PyTorch lets you match exactly what you need. 4 | 5 | ## Prerequisites 6 | 7 | - JetPack 6.x (Orin) or JetPack 7.x (Thor) with CUDA `/usr/local/cuda` and cuDNN already installed. 8 | - At least 32 GB of free disk space (more if you plan to keep all 3 wheels at once) and large swap (builds routinely spill >16 GB RAM).
9 | - System packages: 10 | 11 | ```bash 12 | sudo apt update 13 | sudo apt install -y build-essential git cmake ninja-build \ 14 | libopenblas-dev libopenmpi-dev openmpi-bin libatlas-base-dev libprotobuf-dev \ 15 | protobuf-compiler libssl-dev zlib1g-dev libffi-dev 16 | ``` 17 | - `~/miniconda3` (already present on this machine) or any conda distribution. The scripts will create isolated envs per Python version. 18 | 19 | Thor-specific sanity checks (taken from this devkit, JetPack 7.0 / Ubuntu 24.04): 20 | 21 | ```bash 22 | uname -a 23 | # Linux thor-taco 6.8.12-tegra ... aarch64 GNU/Linux 24 | cat /etc/nv_tegra_release 25 | # R38.2.2 ... BOARD: generic (AGX Thor) 26 | nvidia-smi --query-gpu=name,compute_cap,driver_version,cuda_version --format=csv 27 | # NVIDIA Thor, 11.0, 580.00, 13.0 28 | ``` 29 | 30 | The `build.sh` script auto-detects the compute capability (`TORCH_CUDA_ARCH_LIST`) via `nvidia-smi` when present, falling back to `/proc/device-tree/model`. On this Thor devkit it resolves to `11.0`; on Orin it defaults to `8.7`. 31 | 32 | > ℹ️ Jetson builds cannot currently use NVIDIA's binary NCCL. Following the forum advice above, the scripts default to `USE_NCCL=0`, `USE_DISTRIBUTED=0`, `USE_MKLDNN=0`, and `USE_NNPACK=0`. Override them if you have working alternatives. 33 | 34 | ## Layout 35 | 36 | - `build.sh` — clones PyTorch (once), prepares the requested Python env, and runs `python setup.py bdist_wheel` with Jetson-friendly defaults (auto-detected CUDA arch, NCCL disabled unless you opt in, etc.). 37 | - `build-all.sh` — convenience wrapper that invokes `build.sh` for 3.10, 3.11, and 3.12 (or any list of versions you pass). 38 | - `src/` — source tree managed by the scripts (`src/pytorch` is the git checkout). 39 | - `logs/` — timestamped build logs per Python version. 40 | - `wheels/` — collected `.whl` artefacts per Python version (`wheels/py310`, `wheels/py311`, ...). 41 | 42 | ## Quick start 43 | 44 | ```bash 45 | cd ~/jetson-pytorch-builder 46 | chmod +x build.sh build-all.sh 47 | # Build all supported versions (3.8–3.12) 48 | ./build-all.sh 49 | # OR build one at a time 50 | # ^ Python version ^ optional PyTorch git ref/tag 51 | ./build.sh 3.11 v2.4.1 52 | ``` 53 | 54 | Each run: 55 | 56 | 1. Creates/updates `src/pytorch` (defaults to upstream tag `v2.4.0`, override with `PYTORCH_BRANCH=` or pass a second argument such as `./build.sh 3.12 main`). 57 | 2. Creates a matching conda env (`torch-py310`, `torch-py311`, `torch-py312`) if it does not exist yet. 58 | 3. Installs PyTorch's Python build requirements into the env. 59 | 4. Cleans the repo tree (`git clean -fdx`) to avoid cross-version contamination. 60 | 5. Compiles PyTorch with CUDA enabled, targeting the detected GPU (`TORCH_CUDA_ARCH_LIST` auto-detects to 8.7 for Orin, 11.0 for Thor; override via env var to cross-compile). 61 | 6. Copies the newest `torch-*.whl` into `wheels/pyNNN`. 62 | 63 | Successful builds print the wheel path at the end and log everything to `logs/pytorch-py-.log`. 64 | 65 | ## Current Support 66 | Note: Checked boxes mark tested versions; unchecked ones may still work but have not been tested or updated yet.
67 | 68 | * [X] PyTorch 2.4.0 69 | * [ ] PyTorch 2.9.1 (partial) 70 | 71 | * ### Jetson Orin 72 | 73 | * [ ] Python 3.8 74 | * [ ] Python 3.9 75 | * [ ] Python 3.10 76 | * [X] Python 3.11 77 | * [ ] Python 3.12 78 | * ### Jetson Thor 79 | 80 | * [X] Python 3.8 81 | * [X] Python 3.9 82 | * [X] Python 3.10 83 | * [X] Python 3.11 84 | * [X] Python 3.12 85 | 86 | 87 | ## Customisation 88 | 89 | All relevant knobs can be changed through environment variables: 90 | 91 | | Variable | Default | Meaning | 92 | | -------- | ------- | ------- | 93 | | `PYTORCH_BRANCH` | `v2.4.0` | Upstream tag/branch to checkout (can also pass as the second argument to `build.sh`). | 94 | | `PYTORCH_REPO` | `https://github.com/pytorch/pytorch.git` | Clone source. | 95 | | `TORCH_CUDA_ARCH_LIST` | auto (`11.0` on Thor, `8.7` on Orin) | Target GPU architectures. Override to cross-compile. | 96 | | `MAX_JOBS` | `$(nproc)` | Parallel compilation jobs. Tune to control RAM usage. | 97 | | `CUDA_HOME` | `/usr/local/cuda` | CUDA root. | 98 | | `USE_NCCL`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_NNPACK`, `USE_QNNPACK` | Jetson-friendly defaults set in `build.sh` | Feature toggles; override them if you have working alternatives. | 99 | | `TORCH_VERSION_OVERRIDE` | auto from tag (e.g., `v2.4.0` → `2.4.0`) | Forces `TORCH_BUILD_VERSION` so the wheel filename/metadata advertises your custom build. Leave it empty to keep the upstream git-style version, or set your own (e.g., `2.4.0-jetson.1`). | 100 | | `TORCH_BUILD_NUMBER_OVERRIDE` | `1` | Optional build number passed along when `TORCH_VERSION_OVERRIDE` is set. | 101 | 102 | Example: 103 | 104 | ```bash 105 | TORCH_CUDA_ARCH_LIST="8.7;8.9" USE_NCCL=1 MAX_JOBS=8 ./build.sh 3.12 106 | ``` 107 | 108 | ## Installing the wheels 109 | 110 | Once a build finishes, install it inside any target environment (conda, system Python, etc.): 111 | 112 | ```bash 113 | pip install ~/jetson-pytorch-builder/wheels/py312/torch-*.whl 114 | ``` 115 | 116 | Copy the wheel to other Jetson nodes as needed. Keep the logs handy for support/bug reports. 117 | 118 | ### Versioning and torchvision / torchaudio compatibility 119 | 120 | By default PyTorch's build system emits versions like `2.4.0a0+git`. This repo now **auto-sets `TORCH_BUILD_VERSION` to the numeric part of your tag** (e.g., `v2.4.0` → `2.4.0`), so the wheel name/metadata matches what torchvision/torchaudio expect. For non-tag refs (e.g., `main`), no override is applied unless you set it explicitly. 121 | 122 | Two ways to stay sane: 123 | 124 | 1. **Set an explicit version for your wheel.** 125 | 126 | ```bash 127 | TORCH_VERSION_OVERRIDE="2.4.0-jetson.1" ./build.sh 3.11 v2.4.0 128 | ``` 129 | 130 | The resulting wheel becomes `torch-2.4.0-jetson.1-...whl`, making it easy to match dependencies. 131 | 2. **Install torchvision without re-resolving torch.** 132 | 133 | If you keep the default `2.4.0a0+git...` version, install the matching source release and skip dependency checks: 134 | 135 | ```bash 136 | pip install torchvision==0.19.0 --no-deps 137 | pip install torchaudio==2.4.0 --no-deps # adjust to the PyTorch series you built 138 | ``` 139 | 140 | This mirrors the PyTorch instructions for source builds where `torch` is already present.
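After installing the wheel, a quick sanity check confirms the build is CUDA-enabled and reports the version you expect. The snippet below uses only standard PyTorch calls; nothing here is specific to this repo's scripts:

```bash
python - <<'EOF'
import torch
print("torch:", torch.__version__)            # should match your wheel's version
print("CUDA runtime:", torch.version.cuda)    # None would indicate a CPU-only build
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
    print("compute capability:", torch.cuda.get_device_capability(0))
EOF
```

On Thor this should report a compute capability of `(11, 0)` and on Orin `(8, 7)`, matching the `TORCH_CUDA_ARCH_LIST` the wheel was built for.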
141 | 142 | ## Thor (Blackwell / JetPack 7) notes 143 | 144 | - JetPack 7 ships CUDA 13.0 and driver 580; make sure host packages and `CUDA_HOME` point to `/usr/local/cuda-13.0` (symlinked by default). The script logs the resolved path for traceability. 145 | - Blackwell support in PyTorch is still evolving; stick to PyTorch v2.4+ (default `v2.4.0`) or nightly master for proper `sm_110` kernels. You can change `PYTORCH_BRANCH` to `main` when you need bleeding-edge fixes. 146 | - NCCL is still unavailable on Jetson, so distributed training remains disabled. 147 | - If you parallelize with `MAX_JOBS > 8`, ensure Thor's LPDDR memory controller has enough headroom or the build may thrash swap. 148 | 149 | ## Troubleshooting notes 150 | 151 | - Add swap with `sudo fallocate -l 32G /swapfile && sudo mkswap /swapfile ...` if the compiler OOMs. 152 | - Ensure `nvcc --version` matches your JetPack CUDA (it should report 13.0 on Thor, 12.x on Orin). If not, export `CUDA_HOME` explicitly. 153 | - `python setup.py clean` is implicitly handled by `git clean -fdx`; remove `build/` manually if you pause/resume by hand. 154 | - Refer to the PyTorch source build doc and NVIDIA forum threads listed at the top for more edge-case fixes (e.g., building with TensorRT, CUTLASS tuning, FlashAttention patches, etc.). 155 | 156 | Happy compiling! 157 | 158 | Special notes: This repo contains code generated by Codex GPT-5.1 medium. I do not guarantee this will work on your machine. Merge requests are welcome. 159 | Tested on: 160 | Jetson Thor: Linux thor-taco 6.8.12-tegra #1 SMP PREEMPT Thu Sep 25 15:19:42 PDT 2025 aarch64 aarch64 aarch64 GNU/Linux 161 | SoC: tegra264 162 | CUDA Arch BIN: 13.0 163 | L4T: 38.2.2 164 | JetPack: 7.0 165 | CUDA: 13.0.48 166 | cuDNN: 9.12.0 167 | TensorRT: 10.13.3.9 168 | -------------------------------------------------------------------------------- /patches/0011-cub-modernize.patch: -------------------------------------------------------------------------------- 1 | diff --git a/aten/src/ATen/cuda/cub.cu b/aten/src/ATen/cuda/cub.cu 2 | index 839652f581a..25d7285323c 100644 3 | --- a/aten/src/ATen/cuda/cub.cu 4 | +++ b/aten/src/ATen/cuda/cub.cu 5 | @@ -15,8 +15,8 @@ struct SumOp { 6 | 7 | template 8 | void inclusive_sum_truncating(const input_t *input, output_t *output, int64_t num_items) { 9 | - using NO_ROCM(at_cuda_detail)::cub::Sum; 10 | - inclusive_scan(input, output, Sum{}, num_items); 11 | + using scalar_t = std::common_type_t; 12 | + inclusive_scan(input, output, SumOp{}, num_items); 13 | } 14 | 15 | template void inclusive_sum_truncating(const int32_t *input, int32_t *output, int64_t num_items); 16 | @@ -33,18 +33,26 @@ template void exclusive_sum_in_common_type(const int32_t *input, int32_t *output 17 | template void exclusive_sum_in_common_type(const int64_t *input, int64_t *output, int64_t num_items); 18 | 19 | namespace { 20 | -struct CountMaskOp { 21 | - __device__ int64_t operator() (const uint8_t &x) const { 22 | - return x != 0; 23 | +__global__ void mask_to_int64(const uint8_t* mask, int64_t* out, int64_t n) { 24 | + auto idx = blockIdx.x * blockDim.x + threadIdx.x; 25 | + if (idx < n) { 26 | + out[idx] = mask[idx] != 0; 27 | } 28 | -}; 29 | +} 30 | } 31 | 32 | void mask_exclusive_sum(const uint8_t *mask, int64_t *output_idx, int64_t n) { 33 | - CountMaskOp op{}; 34 | - auto iter = NO_ROCM(at_cuda_detail)::cub::TransformInputIterator< 35 | - bool, decltype(op), decltype(mask)>(mask, op); 36 | - exclusive_scan(iter, output_idx, SumOp{}, int64_t{0}, n); 37 |
+ if (n <= 0) { 38 | + return; 39 | + } 40 | + auto allocator = c10::cuda::CUDACachingAllocator::get(); 41 | + auto temp = allocator->allocate(static_cast(n) * sizeof(int64_t)); 42 | + auto temp_ptr = static_cast(temp.get()); 43 | + constexpr int threads = 256; 44 | + const int blocks = static_cast((n + threads - 1) / threads); 45 | + mask_to_int64<<>>(mask, temp_ptr, n); 46 | + C10_CUDA_KERNEL_LAUNCH_CHECK(); 47 | + exclusive_scan(temp_ptr, output_idx, SumOp{}, int64_t{0}, n); 48 | } 49 | 50 | } // namespace at::cuda::cub 51 | diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu 52 | index b8fb51304e4..82d386ffadc 100644 53 | --- a/aten/src/ATen/native/cuda/Embedding.cu 54 | +++ b/aten/src/ATen/native/cuda/Embedding.cu 55 | @@ -16,6 +16,8 @@ 56 | #include 57 | 58 | #if CUB_SUPPORTS_SCAN_BY_KEY() 59 | +#include 60 | +#include 61 | #include 62 | #endif 63 | 64 | @@ -317,7 +319,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice 65 | auto count_data = count.mutable_data_ptr(); 66 | cuda::cub::inclusive_sum_by_key( 67 | sorted_data, 68 | - at_cuda_detail::cub::ConstantInputIterator(1), 69 | + thrust::make_constant_iterator(1), 70 | count_data, 71 | num_indices 72 | ); 73 | @@ -329,7 +331,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice 74 | thrust::make_reverse_iterator(sorted_data + num_indices), 75 | thrust::make_reverse_iterator(static_cast(count_data) + num_indices), 76 | thrust::make_reverse_iterator(count_data + num_indices), 77 | - at_cuda_detail::cub::Max(), 78 | + thrust::maximum{}, 79 | num_indices 80 | ); 81 | }); 82 | diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu 83 | index 7c9f845b7ee..6795282319f 100644 84 | --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu 85 | +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu 86 | @@ -31,6 +31,8 @@ 87 | #include 88 | 89 | #if CUB_SUPPORTS_SCAN_BY_KEY() 90 | +#include 91 | +#include 92 | #include 93 | #endif 94 | 95 | @@ -212,7 +214,7 @@ Tensor embedding_bag_backward_cuda_sum_avg( 96 | auto count_data = count.mutable_data_ptr(); 97 | cuda::cub::inclusive_sum_by_key( 98 | sorted_data, 99 | - at_cuda_detail::cub::ConstantInputIterator(1), 100 | + thrust::make_constant_iterator(1), 101 | count_data, 102 | num_indices 103 | ); 104 | @@ -222,9 +224,9 @@ Tensor embedding_bag_backward_cuda_sum_avg( 105 | // count: 1 3 3 3 2 2 1 2 2 106 | cuda::cub::inclusive_scan_by_key( 107 | thrust::make_reverse_iterator(sorted_data + num_indices), 108 | + thrust::make_reverse_iterator(static_cast(count_data) + num_indices), 109 | thrust::make_reverse_iterator(count_data + num_indices), 110 | - thrust::make_reverse_iterator(count_data + num_indices), 111 | - at_cuda_detail::cub::Max(), 112 | + thrust::maximum{}, 113 | num_indices 114 | ); 115 | }); 116 | diff --git a/aten/src/ATen/native/cuda/Nonzero.cu b/aten/src/ATen/native/cuda/Nonzero.cu 117 | index e87f46cd844..c284b1b368e 100644 118 | --- a/aten/src/ATen/native/cuda/Nonzero.cu 119 | +++ b/aten/src/ATen/native/cuda/Nonzero.cu 120 | @@ -7,6 +7,8 @@ 121 | #include 122 | #include //for MAX_DIMS 123 | #include 124 | +#include 125 | +#include 126 | 127 | #ifndef AT_PER_OPERATOR_HEADERS 128 | #include 129 | @@ -65,7 +67,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){ 130 | size_t temp_storage_bytes=0; 131 | auto& allocator = *c10::cuda::CUDACachingAllocator::get(); 132 | auto num_nonzeros = allocator.allocate(sizeof(int)); 133 | - 
cub::TransformInputIterator, const scalar_t*> itr(self_.const_data_ptr(), NonZeroOp()); 134 | + auto itr = thrust::make_transform_iterator(self_.const_data_ptr(), NonZeroOp()); 135 | cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream); 136 | auto temp_storage = allocator.allocate(temp_storage_bytes); 137 | cub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream); 138 | @@ -82,7 +84,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){ 139 | out.resize_({self.dim(), num_nonzeros_h}); 140 | //Scalars are expected to produce output of size (1,0), so we can't write to it 141 | if (self.dim() > 0) { 142 | - cub::CountingInputIterator counting_itr(0); 143 | + auto counting_itr = thrust::make_counting_iterator(0); 144 | temp_storage_bytes = 0; 145 | cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr, 146 | out_temp.mutable_data_ptr(), (int*)num_nonzeros.get(), N, stream); 147 | diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu 148 | index d06efa66351..b3e52a63313 100644 149 | --- a/aten/src/ATen/native/cuda/TensorTopK.cu 150 | +++ b/aten/src/ATen/native/cuda/TensorTopK.cu 151 | @@ -14,6 +14,8 @@ 152 | #include 153 | #include 154 | #include 155 | +#include 156 | +#include 157 | 158 | #include 159 | 160 | @@ -733,9 +735,8 @@ void launch( 161 | desired, counts, num_blocks, blocks_per_slice, kthCounts); 162 | C10_CUDA_KERNEL_LAUNCH_CHECK(); 163 | // Do a prefix scan of withinKCounts and kthCounts using slice_idx as keys to get the starting index of each block 164 | - using counting_iter_t = cub::CountingInputIterator; 165 | - using slice_idx_iter_t = cub::TransformInputIterator; 166 | - slice_idx_iter_t slice_idx_iter(counting_iter_t(0), BlockIdxToKey(blocks_per_slice)); 167 | + auto counting_iter = thrust::make_counting_iterator(0); 168 | + auto slice_idx_iter = thrust::make_transform_iterator(counting_iter, BlockIdxToKey(blocks_per_slice)); 169 | at::cuda::cub::inclusive_sum_by_key(slice_idx_iter, withinKCounts, withinKCounts, num_blocks); 170 | at::cuda::cub::inclusive_sum_by_key(slice_idx_iter, kthCounts, kthCounts, num_blocks); 171 | // copy topk values to output tensor 172 | diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu 173 | index bbd8673bcf5..f61dca74252 100644 174 | --- a/aten/src/ATen/native/cuda/UniqueCub.cu 175 | +++ b/aten/src/ATen/native/cuda/UniqueCub.cu 176 | @@ -5,6 +5,8 @@ 177 | #include 178 | #include 179 | #include 180 | +#include 181 | +#include 182 | 183 | #include 184 | #include 185 | @@ -53,9 +55,8 @@ struct LoadBoolOp { 186 | 187 | auto wrap_input_iterator(const bool *data) { 188 | // See NOTE [Loading boolean values] 189 | - LoadBoolOp op; 190 | - return NO_ROCM(at_cuda_detail)::cub::TransformInputIterator( 191 | - reinterpret_cast(data), op); 192 | + return thrust::make_transform_iterator( 193 | + reinterpret_cast(data), LoadBoolOp{}); 194 | } 195 | 196 | // A variation of compute_unique (defined in Unique.cu) that doesn't allow 197 | @@ -258,11 +259,10 @@ struct UniqueCub { 198 | c10::DeviceArray tmp_num_true(*allocator, 1); 199 | 200 | const bool* self_data = self.const_data_ptr(); 201 | - MapNumberOfTrueValues op; 202 | - NO_ROCM(at_cuda_detail)::cub::TransformInputIterator 203 | - data_iter(reinterpret_cast(self_data), op); 204 | - at::cuda::cub::reduce(data_iter, tmp_num_true.get(), num_inp, 205 | - NO_ROCM(at_cuda_detail)::cub::Sum{}, 0); 206 | + auto 
data_iter = thrust::make_transform_iterator( 207 | + reinterpret_cast(self_data), MapNumberOfTrueValues{}); 208 | + at::cuda::cub::reduce( 209 | + data_iter, tmp_num_true.get(), num_inp, thrust::plus{}, 0); 210 | 211 | auto options = self.options(); 212 | output = at::empty({2}, self.options()); 213 | diff --git a/torch/csrc/cuda/shared/nvtx.cpp b/torch/csrc/cuda/shared/nvtx.cpp 214 | index 4fb72c5f79b..bb875f4210e 100644 215 | --- a/torch/csrc/cuda/shared/nvtx.cpp 216 | +++ b/torch/csrc/cuda/shared/nvtx.cpp 217 | @@ -1,7 +1,15 @@ 218 | #ifdef _WIN32 219 | #include // _wgetenv for nvtx 220 | #endif 221 | +#if defined(__has_include) 222 | +#if __has_include() 223 | +#include 224 | +#else 225 | #include 226 | +#endif 227 | +#else 228 | +#include 229 | +#endif 230 | #include 231 | 232 | namespace torch::cuda::shared { 233 | diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp 234 | index d0cb3746a21..fb90aeb1d0b 100644 235 | --- a/torch/csrc/profiler/stubs/cuda.cpp 236 | +++ b/torch/csrc/profiler/stubs/cuda.cpp 237 | @@ -1,6 +1,14 @@ 238 | #include 239 | 240 | +#if defined(__has_include) 241 | +#if __has_include() 242 | +#include 243 | +#else 244 | #include 245 | +#endif 246 | +#else 247 | +#include 248 | +#endif 249 | 250 | #include 251 | #include 252 | --------------------------------------------------------------------------------