├── patches
│   ├── 0005-fp16-min-cmake.patch
│   ├── 0004-psimd-min-cmake.patch
│   ├── 0007-onnx-min-cmake.patch
│   ├── 0008-foxi-min-cmake.patch
│   ├── 0002-protobuf-min-cmake.patch
│   ├── 0003-clog-min-cmake.patch
│   ├── 0001-add-blackwell-arch.patch
│   ├── 0009-cufft-guard-missing-enums.patch
│   ├── 0010-cub-equality-fallback.patch
│   ├── 0006-find-cub-from-cuda.patch
│   └── 0011-cub-modernize.patch
├── LICENSE
├── .gitignore
└── README.md
/patches/0005-fp16-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/FP16/CMakeLists.txt b/third_party/FP16/CMakeLists.txt 2 | index 937cf3b54b5..1a62278f235 100644 3 | --- a/third_party/FP16/CMakeLists.txt 4 | +++ b/third_party/FP16/CMakeLists.txt 5 | @@ -1,4 +1,4 @@ 6 | -CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 7 | +CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) 8 | 9 | INCLUDE(GNUInstallDirs) 10 | 11 | -------------------------------------------------------------------------------- /patches/0004-psimd-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/psimd/CMakeLists.txt b/third_party/psimd/CMakeLists.txt 2 | index aa2a0781ff4..9b8d97323b4 100644 3 | --- a/third_party/psimd/CMakeLists.txt 4 | +++ b/third_party/psimd/CMakeLists.txt 5 | @@ -1,4 +1,4 @@ 6 | -CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 7 | +CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) 8 | 9 | INCLUDE(GNUInstallDirs) 10 | 11 | -------------------------------------------------------------------------------- /patches/0007-onnx-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/onnx/CMakeLists.txt b/third_party/onnx/CMakeLists.txt 2 | index 6d7ca846..446e4bb0 100644 3 | --- a/third_party/onnx/CMakeLists.txt 4 | +++ b/third_party/onnx/CMakeLists.txt 5 | @@ -1,5 +1,5 @@ 6 | # Minimum CMake required 7 | -cmake_minimum_required(VERSION 3.1) 8 | +cmake_minimum_required(VERSION 3.21) 9 | include(cmake/Utils.cmake) 10 | # Set default build type 11 | if(NOT CMAKE_BUILD_TYPE) 12 | 13 | -------------------------------------------------------------------------------- /patches/0008-foxi-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/foxi/CMakeLists.txt b/third_party/foxi/CMakeLists.txt 2 | index b406d0f..4120c94 100644 3 | --- a/third_party/foxi/CMakeLists.txt 4 | +++ b/third_party/foxi/CMakeLists.txt 5 | @@ -1,5 +1,5 @@ 6 | # Minimum CMake required 7 | -cmake_minimum_required(VERSION 3.1) 8 | +cmake_minimum_required(VERSION 3.21) 9 | # Set default build type 10 | if(NOT CMAKE_BUILD_TYPE) 11 | message(STATUS "Build type not set - defaulting to Release") 12 | 13 | -------------------------------------------------------------------------------- /patches/0002-protobuf-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/third_party/protobuf/cmake/CMakeLists.txt b/third_party/protobuf/cmake/CMakeLists.txt 2 | index 9ca31ac0b..123501c66 100644 3 | --- a/third_party/protobuf/cmake/CMakeLists.txt 4 | +++ b/third_party/protobuf/cmake/CMakeLists.txt 5 | @@ -1,5 +1,5 @@ 6 | # Minimum CMake required 7 | -cmake_minimum_required(VERSION 3.1.3) 8 | +cmake_minimum_required(VERSION 3.5.0) 9 | 10 | if(protobuf_VERBOSE) 11 | message(STATUS "Protocol Buffers Configuring...") 12 | --------------------------------------------------------------------------------
/patches/0003-clog-min-cmake.patch: -------------------------------------------------------------------------------- 1 | diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt 2 | index e763e4e3ba9..2b62765cf2f 100644 3 | --- a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt 4 | +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/CMakeLists.txt 5 | @@ -4,7 +4,7 @@ 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 8 | 9 | -cmake_minimum_required(VERSION 3.1 FATAL_ERROR) 10 | +cmake_minimum_required(VERSION 3.5 FATAL_ERROR) 11 | 12 | include(GNUInstallDirs) 13 | 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Taco 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /patches/0001-add-blackwell-arch.patch: -------------------------------------------------------------------------------- 1 | diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake 2 | index 90de8fb0d84..0ddcd7fb78d 100644 3 | --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake 4 | +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake 5 | @@ -186,7 +186,7 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable) 6 | set(add_ptx TRUE) 7 | set(arch_name ${CMAKE_MATCH_1}) 8 | endif() 9 | - if(arch_name MATCHES "^([0-9]\\.[0-9]a?(\\([0-9]\\.[0-9]\\))?)$") 10 | + if(arch_name MATCHES "^([0-9]+\\.[0-9]a?(\\([0-9]+\\.[0-9]\\))?)$") 11 | set(arch_bin ${CMAKE_MATCH_1}) 12 | set(arch_ptx ${arch_bin}) 13 | else() 14 | @@ -223,6 +223,9 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable) 15 | elseif(${arch_name} STREQUAL "Hopper") 16 | set(arch_bin 9.0) 17 | set(arch_ptx 9.0) 18 | + elseif(${arch_name} STREQUAL "Blackwell") 19 | + set(arch_bin 11.0) 20 | + set(arch_ptx 11.0) 21 | else() 22 | message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS") 23 | endif() 24 | -------------------------------------------------------------------------------- /patches/0009-cufft-guard-missing-enums.patch: -------------------------------------------------------------------------------- 1 | diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h 2 | index 4b02f914d7e..b637e99bd4a 100644 3 | --- a/aten/src/ATen/native/cuda/CuFFTUtils.h 4 | +++ b/aten/src/ATen/native/cuda/CuFFTUtils.h 5 | @@ -38,19 +38,25 @@ static inline std::string _cudaGetErrorEnum(cufftResult error) 6 | return "CUFFT_INVALID_SIZE"; 7 | case CUFFT_UNALIGNED_DATA: 8 | return "CUFFT_UNALIGNED_DATA"; 9 | + #if defined(CUFFT_INCOMPLETE_PARAMETER_LIST) 10 | case CUFFT_INCOMPLETE_PARAMETER_LIST: 11 | return "CUFFT_INCOMPLETE_PARAMETER_LIST"; 12 | + #endif 13 | case CUFFT_INVALID_DEVICE: 14 | return "CUFFT_INVALID_DEVICE"; 15 | + #if defined(CUFFT_PARSE_ERROR) 16 | case CUFFT_PARSE_ERROR: 17 | return "CUFFT_PARSE_ERROR"; 18 | + #endif 19 | case CUFFT_NO_WORKSPACE: 20 | return "CUFFT_NO_WORKSPACE"; 21 | case CUFFT_NOT_IMPLEMENTED: 22 | return "CUFFT_NOT_IMPLEMENTED"; 23 | #if !defined(USE_ROCM) 24 | + #if defined(CUFFT_LICENSE_ERROR) 25 | case CUFFT_LICENSE_ERROR: 26 | return "CUFFT_LICENSE_ERROR"; 27 | + #endif 28 | #endif 29 | case CUFFT_NOT_SUPPORTED: 30 | return "CUFFT_NOT_SUPPORTED"; 31 | 32 | -------------------------------------------------------------------------------- /patches/0010-cub-equality-fallback.patch: -------------------------------------------------------------------------------- 1 | diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh 2 | index c5261534dd4..60597b01b8f 100644 3 | --- a/aten/src/ATen/cuda/cub.cuh 4 | +++ b/aten/src/ATen/cuda/cub.cuh 5 | @@ -7,6 +7,9 @@ 6 | #include 7 | 8 | #include 9 | +#if !defined(USE_ROCM) && CUB_VERSION >= 200000 10 | +#include 11 | +#endif 12 | 13 | #if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() 14 | 15 | @@ -51,6 +54,12 @@ 16 | #define ROCM_HIPCUB(x) x 17 | #endif 18 | 19 | +#if CUB_VERSION < 200000 20 | +#define AT_CUDA_CUB_EQUALITY() NO_ROCM(at_cuda_detail)::cub::Equality() 21 | +#else 22 | +#define AT_CUDA_CUB_EQUALITY() ::cuda::std::equal_to<>() 23 | +#endif 24 | + 25 | #if (!defined(USE_ROCM) && 
!CUB_SUPPORTS_NV_BFLOAT16()) || defined(USE_ROCM) 26 | 27 | #if !defined(USE_ROCM) 28 | @@ -364,7 +373,7 @@ inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT i 29 | TORCH_CHECK(num_items <= std::numeric_limits::max(), 30 | "cub InclusiveSumByKey does not support more than INT_MAX elements"); 31 | CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveSumByKey, 32 | - keys, input, output, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream()); 33 | + keys, input, output, num_items, AT_CUDA_CUB_EQUALITY(), at::cuda::getCurrentCUDAStream()); 34 | } 35 | 36 | template 37 | @@ -372,7 +381,7 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT 38 | TORCH_CHECK(num_items <= std::numeric_limits::max(), 39 | "cub InclusiveSumByKey does not support more than INT_MAX elements"); 40 | CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveScanByKey, 41 | - keys, input, output, scan_op, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream()); 42 | + keys, input, output, scan_op, num_items, AT_CUDA_CUB_EQUALITY(), at::cuda::getCurrentCUDAStream()); 43 | } 44 | 45 | #endif 46 | 47 | -------------------------------------------------------------------------------- /patches/0006-find-cub-from-cuda.patch: -------------------------------------------------------------------------------- 1 | diff --git a/cmake/Modules/FindCUB.cmake b/cmake/Modules/FindCUB.cmake 2 | index e053964e6e4..eadb86b7431 100644 3 | --- a/cmake/Modules/FindCUB.cmake 4 | +++ b/cmake/Modules/FindCUB.cmake 5 | @@ -2,8 +2,56 @@ 6 | # CUB_FOUND - system has CUB 7 | # CUB_INCLUDE_DIRS - the CUB include directory 8 | 9 | +set(_CUB_HINTS) 10 | +set(_CUB_TARGET_SUFFIXES 11 | + "/targets/sbsa-linux/include" 12 | + "/targets/aarch64-linux/include" 13 | + "/targets/x86_64-linux/include" 14 | +) 15 | + 16 | +if(DEFINED CUDA_TOOLKIT_INCLUDE) 17 | + list(APPEND _CUB_HINTS "${CUDA_TOOLKIT_INCLUDE}") 18 | +endif() 19 | +if(DEFINED CUDA_TOOLKIT_ROOT_DIR) 20 | + list(APPEND _CUB_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/include") 21 | + foreach(_suffix ${_CUB_TARGET_SUFFIXES}) 22 | + list(APPEND _CUB_HINTS "${CUDA_TOOLKIT_ROOT_DIR}${_suffix}") 23 | + endforeach() 24 | +endif() 25 | +if(DEFINED CUDA_INCLUDE_DIRS) 26 | + list(APPEND _CUB_HINTS ${CUDA_INCLUDE_DIRS}) 27 | +endif() 28 | +if(DEFINED CUDAToolkit_INCLUDE_DIRS) 29 | + list(APPEND _CUB_HINTS ${CUDAToolkit_INCLUDE_DIRS}) 30 | +endif() 31 | +if(DEFINED ENV{CUDA_HOME}) 32 | + list(APPEND _CUB_HINTS "$ENV{CUDA_HOME}/include") 33 | + foreach(_suffix ${_CUB_TARGET_SUFFIXES}) 34 | + list(APPEND _CUB_HINTS "$ENV{CUDA_HOME}${_suffix}") 35 | + endforeach() 36 | +endif() 37 | +if(DEFINED ENV{CUDA_PATH}) 38 | + list(APPEND _CUB_HINTS "$ENV{CUDA_PATH}/include") 39 | + foreach(_suffix ${_CUB_TARGET_SUFFIXES}) 40 | + list(APPEND _CUB_HINTS "$ENV{CUDA_PATH}${_suffix}") 41 | + endforeach() 42 | +endif() 43 | +list(APPEND _CUB_HINTS "/usr/local/cuda/include" "/opt/cuda/include") 44 | + 45 | +set(_CUB_EXPANDED_HINTS) 46 | +foreach(_hint ${_CUB_HINTS}) 47 | + if(_hint) 48 | + list(APPEND _CUB_EXPANDED_HINTS "${_hint}") 49 | + if(EXISTS "${_hint}/cccl") 50 | + list(APPEND _CUB_EXPANDED_HINTS "${_hint}/cccl") 51 | + endif() 52 | + endif() 53 | +endforeach() 54 | + 55 | +list(REMOVE_DUPLICATES _CUB_EXPANDED_HINTS) 56 | + 57 | find_path(CUB_INCLUDE_DIR 58 | - HINTS "${CUDA_TOOLKIT_INCLUDE}" 59 | + HINTS ${_CUB_EXPANDED_HINTS} 60 | NAMES cub/cub.cuh 61 | DOC "The directory where CUB includes reside" 62 | ) 63 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[codz] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | #poetry.toml 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 114 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control 115 | #pdm.lock 116 | #pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # pixi 121 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 122 | #pixi.lock 123 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 124 | # in the .venv directory. It is recommended not to include this directory in version control. 
125 | .pixi 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .envrc 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 176 | #.idea/ 177 | 178 | # Abstra 179 | # Abstra is an AI-powered process automation framework. 180 | # Ignore directories containing user credentials, local state, and settings. 181 | # Learn more at https://abstra.io/docs 182 | .abstra/ 183 | 184 | # Visual Studio Code 185 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 186 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 187 | # and can be added to the global gitignore or merged into this file. However, if you prefer, 188 | # you could uncomment the following to ignore the entire vscode folder 189 | # .vscode/ 190 | 191 | # Ruff stuff: 192 | .ruff_cache/ 193 | 194 | # PyPI configuration file 195 | .pypirc 196 | 197 | # Cursor 198 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to 199 | # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data 200 | # refer to https://docs.cursor.com/context/ignore-files 201 | .cursorignore 202 | .cursorindexingignore 203 | 204 | # Marimo 205 | marimo/_static/ 206 | marimo/_lsp/ 207 | __marimo__/ 208 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch from Source on Jetson Orin & Thor 2 | 3 | This workspace automates building stock [PyTorch](https://github.com/pytorch/pytorch#from-source) with CUDA enabled for Python 3.8–3.12 on both Jetson Orin (Ampere, JetPack 6.x) and Jetson AGX Thor (Blackwell, JetPack 7.x). It codifies the upstream PyTorch instructions and the NVIDIA Developer Forum guidance for [Orin builds](https://forums.developer.nvidia.com/t/native-build-of-pytorch-for-jetson/71842) and [Thor/JetPack 7 builds](https://forums.developer.nvidia.com/t/pytorch-2-4-build-jetson-orin/291219). NVIDIA's official pip repository only ships a handful of prebuilt versions; building your own CUDA-enabled PyTorch lets you match exactly what you need. 4 | 5 | ## Prerequisites 6 | 7 | - JetPack 6.x (Orin) or JetPack 7.x (Thor) with CUDA `/usr/local/cuda` and cuDNN already installed. 8 | - At least 32 GB of free disk space (more if you plan to keep all 3 wheels at once) and large swap (builds routinely spill >16 GB RAM).
9 | - System packages: 10 | 11 | ```bash 12 | sudo apt update 13 | sudo apt install -y build-essential git cmake ninja-build \ 14 | libopenblas-dev libopenmpi-dev openmpi-bin libatlas-base-dev libprotobuf-dev \ 15 | protobuf-compiler libssl-dev zlib1g-dev libffi-dev 16 | ``` 17 | - `~/miniconda3` (already present on this machine) or any conda distribution. The scripts will create isolated envs per Python version. 18 | 19 | Thor-specific sanity checks (taken from this devkit, JetPack 7.0 / Ubuntu 24.04): 20 | 21 | ```bash 22 | uname -a 23 | # Linux thor-taco 6.8.12-tegra ... aarch64 GNU/Linux 24 | cat /etc/nv_tegra_release 25 | # R38.2.2 ... BOARD: generic (AGX Thor) 26 | nvidia-smi --query-gpu=name,compute_cap,driver_version,cuda_version --format=csv 27 | # NVIDIA Thor, 11.0, 580.00, 13.0 28 | ``` 29 | 30 | The `build.sh` script auto-detects the compute capability (`TORCH_CUDA_ARCH_LIST`) via `nvidia-smi` when present, falling back to `/proc/device-tree/model`. On this Thor devkit it resolves to `11.0`; on Orin it defaults to `8.7`. 31 | 32 | > ℹ️ Jetson builds cannot currently use NVIDIA's binary NCCL. Following the forum advice above, the scripts default to `USE_NCCL=0`, `USE_DISTRIBUTED=0`, `USE_MKLDNN=0`, and `USE_NNPACK=0`. Override them if you have working alternatives. 33 | 34 | ## Layout 35 | 36 | - `build.sh` — clones PyTorch (once), prepares the requested Python env, and runs `python setup.py bdist_wheel` with Jetson-friendly defaults (auto-detected CUDA arch, NCCL disabled unless you opt in, etc.). 37 | - `build-all.sh` — convenience wrapper that invokes `build.sh` for 3.10, 3.11, and 3.12 (or any list of versions you pass). 38 | - `src/` — source tree managed by the scripts (`src/pytorch` is the git checkout). 39 | - `logs/` — timestamped build logs per Python version. 40 | - `wheels/` — collected `.whl` artefacts per Python version (`wheels/py310`, `wheels/py311`, ...). 41 | 42 | ## Quick start 43 | 44 | ```bash 45 | cd ~/jetson-pytorch-builder 46 | chmod +x build.sh build-all.sh 47 | # Build all supported versions (3.8–3.12) 48 | ./build-all.sh 49 | # OR build one at a time 50 | # ^ Python version ^ optional PyTorch git ref/tag 51 | ./build.sh 3.11 v2.4.1 52 | ``` 53 | 54 | Each run: 55 | 56 | 1. Creates/updates `src/pytorch` (defaults to upstream tag `v2.4.0`, override with `PYTORCH_BRANCH=` or pass a second argument such as `./build.sh 3.12 main`). 57 | 2. Creates a matching conda env (`torch-py310`, `torch-py311`, `torch-py312`) if it does not exist yet. 58 | 3. Installs PyTorch's Python build requirements into the env. 59 | 4. Cleans the repo tree (`git clean -fdx`) to avoid cross-version contamination. 60 | 5. Compiles PyTorch with CUDA enabled, targeting the detected GPU (`TORCH_CUDA_ARCH_LIST` auto-detects to 8.7 for Orin, 11.0 for Thor; override via env var to cross-compile). 61 | 6. Copies the newest `torch-*.whl` into `wheels/pyNNN`. 62 | 63 | Successful builds print the wheel path at the end and log everything to `logs/pytorch-py-.log`. 64 | 65 | ## Current Support 66 | Note: Checked boxes mark tested versions; unchecked ones may still work but have not been tested or updated yet.
67 | 68 | * [X] PyTorch 2.4.0 69 | * [ ] PyTorch 2.9.1 (partial) 70 | 71 | * ### Jetson Orin 72 | 73 | * [ ] Python 3.8 74 | * [ ] Python 3.9 75 | * [ ] Python 3.10 76 | * [X] Python 3.11 77 | * [ ] Python 3.12 78 | * ### Jetson Thor 79 | 80 | * [X] Python 3.8 81 | * [X] Python 3.9 82 | * [X] Python 3.10 83 | * [X] Python 3.11 84 | * [X] Python 3.12 85 | 86 | 87 | ## Customisation 88 | 89 | All relevant knobs can be changed through environment variables: 90 | 91 | | Variable | Default | Meaning | 92 | | -------- | ------- | ------- | 93 | | `PYTORCH_BRANCH` | `v2.4.0` | Upstream tag/branch to checkout (can also pass as the second argument to `build.sh`). | 94 | | `PYTORCH_REPO` | `https://github.com/pytorch/pytorch.git` | Clone source. | 95 | | `TORCH_CUDA_ARCH_LIST` | auto (`11.0` on Thor, `8.7` on Orin) | Target GPU architectures. Override to cross-compile. | 96 | | `MAX_JOBS` | `$(nproc)` | Parallel compilation jobs. Tune to control RAM usage. | 97 | | `CUDA_HOME` | `/usr/local/cuda` | CUDA root. | 98 | | `USE_NCCL`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_NNPACK`, `USE_QNNPACK` | Jetson-friendly defaults set in `build.sh` | Feature toggles; override them if you have working alternatives. | 99 | | `TORCH_VERSION_OVERRIDE` | auto from tag (e.g., `v2.4.0` → `2.4.0`) | Forces `TORCH_BUILD_VERSION` so the wheel filename/metadata advertises your custom build. Leave it empty to keep the upstream git-style version, or set your own (e.g., `2.4.0-jetson.1`). | 100 | | `TORCH_BUILD_NUMBER_OVERRIDE` | `1` | Optional build number passed along when `TORCH_VERSION_OVERRIDE` is set. | 101 | 102 | Example: 103 | 104 | ```bash 105 | TORCH_CUDA_ARCH_LIST="8.7;8.9" USE_NCCL=1 MAX_JOBS=8 ./build.sh 3.12 106 | ``` 107 | 108 | ## Installing the wheels 109 | 110 | Once a build finishes, install it inside any target environment (conda, system Python, etc.): 111 | 112 | ```bash 113 | pip install ~/jetson-pytorch-builder/wheels/py312/torch-*.whl 114 | ``` 115 | 116 | Copy the wheel to other Jetson nodes as needed. Keep the logs handy for support/bug reports. 117 | 118 | ### Versioning and torchvision / torchaudio compatibility 119 | 120 | By default PyTorch's build system emits versions like `2.4.0a0+git`. This repo now **auto-sets `TORCH_BUILD_VERSION` to the numeric part of your tag** (e.g., `v2.4.0` → `2.4.0`), so the wheel name/metadata matches what torchvision/torchaudio expect. For non-tag refs (e.g., `main`), no override is applied unless you set it explicitly. 121 | 122 | Two ways to stay sane: 123 | 124 | 1. **Set an explicit version for your wheel.** 125 | 126 | ```bash 127 | TORCH_VERSION_OVERRIDE="2.4.0-jetson.1" ./build.sh 3.11 v2.4.0 128 | ``` 129 | 130 | The resulting wheel becomes `torch-2.4.0-jetson.1-...whl`, making it easy to match dependencies. 131 | 2. **Install torchvision without re-resolving torch.** 132 | 133 | If you keep the default `2.4.0a0+git...` version, install the matching source release and skip dependency checks: 134 | 135 | ```bash 136 | pip install torchvision==0.19.0 --no-deps 137 | pip install torchaudio==2.4.0 --no-deps # adjust to the PyTorch series you built 138 | ``` 139 | 140 | This mirrors the PyTorch instructions for source builds where `torch` is already present.
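After installing the wheel, a quick sanity check confirms the build is CUDA-enabled and reports the version you expect. The snippet below uses only standard PyTorch calls; nothing here is specific to this repo's scripts:

```bash
python - <<'EOF'
import torch
print("torch:", torch.__version__)            # should match your wheel's version
print("CUDA runtime:", torch.version.cuda)    # None would indicate a CPU-only build
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
    print("compute capability:", torch.cuda.get_device_capability(0))
EOF
```

On Thor this should report a compute capability of `(11, 0)` and on Orin `(8, 7)`, matching the `TORCH_CUDA_ARCH_LIST` the wheel was built for.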
141 | 142 | ## Thor (Blackwell / JetPack 7) notes 143 | 144 | - JetPack 7 ships CUDA 13.0 and driver 580; make sure host packages and `CUDA_HOME` point to `/usr/local/cuda-13.0` (symlinked by default). The script logs the resolved path for traceability. 145 | - Blackwell support in PyTorch is still evolving; stick to PyTorch v2.4+ (default `v2.4.0`) or nightly master for proper `sm_110` kernels. You can change `PYTORCH_BRANCH` to `main` when you need bleeding-edge fixes. 146 | - NCCL is still unavailable on Jetson, so distributed training remains disabled. 147 | - If you parallelize with `MAX_JOBS > 8`, ensure Thor's LPDDR memory controller has enough headroom or the build may thrash swap. 148 | 149 | ## Troubleshooting notes 150 | 151 | - Add swap with `sudo fallocate -l 32G /swapfile && sudo mkswap /swapfile ...` if the compiler OOMs. 152 | - Ensure `nvcc --version` matches your JetPack CUDA (it should report 13.0 on Thor, 12.x on Orin). If not, export `CUDA_HOME` explicitly. 153 | - `python setup.py clean` is implicitly handled by `git clean -fdx`; remove `build/` manually if you pause/resume by hand. 154 | - Refer to the PyTorch source build doc and NVIDIA forum threads listed at the top for more edge-case fixes (e.g., building with TensorRT, CUTLASS tuning, FlashAttention patches, etc.). 155 | 156 | Happy compiling! 157 | 158 | Special notes: This repo contains code generated by Codex GPT-5.1 medium. I do not guarantee this will work on your machine. Merge requests are welcome. 159 | Tested on: 160 | Jetson Thor: Linux thor-taco 6.8.12-tegra #1 SMP PREEMPT Thu Sep 25 15:19:42 PDT 2025 aarch64 aarch64 aarch64 GNU/Linux 161 | SoC: tegra264 162 | CUDA Arch BIN: 13.0 163 | L4T: 38.2.2 164 | JetPack: 7.0 165 | CUDA: 13.0.48 166 | cuDNN: 9.12.0 167 | TensorRT: 10.13.3.9 168 | -------------------------------------------------------------------------------- /patches/0011-cub-modernize.patch: -------------------------------------------------------------------------------- 1 | diff --git a/aten/src/ATen/cuda/cub.cu b/aten/src/ATen/cuda/cub.cu 2 | index 839652f581a..25d7285323c 100644 3 | --- a/aten/src/ATen/cuda/cub.cu 4 | +++ b/aten/src/ATen/cuda/cub.cu 5 | @@ -15,8 +15,8 @@ struct SumOp { 6 | 7 | template 8 | void inclusive_sum_truncating(const input_t *input, output_t *output, int64_t num_items) { 9 | - using NO_ROCM(at_cuda_detail)::cub::Sum; 10 | - inclusive_scan(input, output, Sum{}, num_items); 11 | + using scalar_t = std::common_type_t; 12 | + inclusive_scan(input, output, SumOp{}, num_items); 13 | } 14 | 15 | template void inclusive_sum_truncating(const int32_t *input, int32_t *output, int64_t num_items); 16 | @@ -33,18 +33,26 @@ template void exclusive_sum_in_common_type(const int32_t *input, int32_t *output 17 | template void exclusive_sum_in_common_type(const int64_t *input, int64_t *output, int64_t num_items); 18 | 19 | namespace { 20 | -struct CountMaskOp { 21 | - __device__ int64_t operator() (const uint8_t &x) const { 22 | - return x != 0; 23 | +__global__ void mask_to_int64(const uint8_t* mask, int64_t* out, int64_t n) { 24 | + auto idx = blockIdx.x * blockDim.x + threadIdx.x; 25 | + if (idx < n) { 26 | + out[idx] = mask[idx] != 0; 27 | } 28 | -}; 29 | +} 30 | } 31 | 32 | void mask_exclusive_sum(const uint8_t *mask, int64_t *output_idx, int64_t n) { 33 | - CountMaskOp op{}; 34 | - auto iter = NO_ROCM(at_cuda_detail)::cub::TransformInputIterator< 35 | - bool, decltype(op), decltype(mask)>(mask, op); 36 | - exclusive_scan(iter, output_idx, SumOp{}, int64_t{0}, n); 37 |
+ if (n <= 0) { 38 | + return; 39 | + } 40 | + auto allocator = c10::cuda::CUDACachingAllocator::get(); 41 | + auto temp = allocator->allocate(static_cast(n) * sizeof(int64_t)); 42 | + auto temp_ptr = static_cast(temp.get()); 43 | + constexpr int threads = 256; 44 | + const int blocks = static_cast((n + threads - 1) / threads); 45 | + mask_to_int64<<>>(mask, temp_ptr, n); 46 | + C10_CUDA_KERNEL_LAUNCH_CHECK(); 47 | + exclusive_scan(temp_ptr, output_idx, SumOp{}, int64_t{0}, n); 48 | } 49 | 50 | } // namespace at::cuda::cub 51 | diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu 52 | index b8fb51304e4..82d386ffadc 100644 53 | --- a/aten/src/ATen/native/cuda/Embedding.cu 54 | +++ b/aten/src/ATen/native/cuda/Embedding.cu 55 | @@ -16,6 +16,8 @@ 56 | #include 57 | 58 | #if CUB_SUPPORTS_SCAN_BY_KEY() 59 | +#include 60 | +#include 61 | #include 62 | #endif 63 | 64 | @@ -317,7 +319,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice 65 | auto count_data = count.mutable_data_ptr(); 66 | cuda::cub::inclusive_sum_by_key( 67 | sorted_data, 68 | - at_cuda_detail::cub::ConstantInputIterator(1), 69 | + thrust::make_constant_iterator(1), 70 | count_data, 71 | num_indices 72 | ); 73 | @@ -329,7 +331,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice 74 | thrust::make_reverse_iterator(sorted_data + num_indices), 75 | thrust::make_reverse_iterator(static_cast(count_data) + num_indices), 76 | thrust::make_reverse_iterator(count_data + num_indices), 77 | - at_cuda_detail::cub::Max(), 78 | + thrust::maximum{}, 79 | num_indices 80 | ); 81 | }); 82 | diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu 83 | index 7c9f845b7ee..6795282319f 100644 84 | --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu 85 | +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu 86 | @@ -31,6 +31,8 @@ 87 | #include 88 | 89 | #if CUB_SUPPORTS_SCAN_BY_KEY() 90 | +#include 91 | +#include 92 | #include 93 | #endif 94 | 95 | @@ -212,7 +214,7 @@ Tensor embedding_bag_backward_cuda_sum_avg( 96 | auto count_data = count.mutable_data_ptr(); 97 | cuda::cub::inclusive_sum_by_key( 98 | sorted_data, 99 | - at_cuda_detail::cub::ConstantInputIterator(1), 100 | + thrust::make_constant_iterator(1), 101 | count_data, 102 | num_indices 103 | ); 104 | @@ -222,9 +224,9 @@ Tensor embedding_bag_backward_cuda_sum_avg( 105 | // count: 1 3 3 3 2 2 1 2 2 106 | cuda::cub::inclusive_scan_by_key( 107 | thrust::make_reverse_iterator(sorted_data + num_indices), 108 | + thrust::make_reverse_iterator(static_cast(count_data) + num_indices), 109 | thrust::make_reverse_iterator(count_data + num_indices), 110 | - thrust::make_reverse_iterator(count_data + num_indices), 111 | - at_cuda_detail::cub::Max(), 112 | + thrust::maximum{}, 113 | num_indices 114 | ); 115 | }); 116 | diff --git a/aten/src/ATen/native/cuda/Nonzero.cu b/aten/src/ATen/native/cuda/Nonzero.cu 117 | index e87f46cd844..c284b1b368e 100644 118 | --- a/aten/src/ATen/native/cuda/Nonzero.cu 119 | +++ b/aten/src/ATen/native/cuda/Nonzero.cu 120 | @@ -7,6 +7,8 @@ 121 | #include 122 | #include //for MAX_DIMS 123 | #include 124 | +#include 125 | +#include 126 | 127 | #ifndef AT_PER_OPERATOR_HEADERS 128 | #include 129 | @@ -65,7 +67,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){ 130 | size_t temp_storage_bytes=0; 131 | auto& allocator = *c10::cuda::CUDACachingAllocator::get(); 132 | auto num_nonzeros = allocator.allocate(sizeof(int)); 133 | - 
cub::TransformInputIterator, const scalar_t*> itr(self_.const_data_ptr(), NonZeroOp()); 134 | + auto itr = thrust::make_transform_iterator(self_.const_data_ptr(), NonZeroOp()); 135 | cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream); 136 | auto temp_storage = allocator.allocate(temp_storage_bytes); 137 | cub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream); 138 | @@ -82,7 +84,7 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){ 139 | out.resize_({self.dim(), num_nonzeros_h}); 140 | //Scalars are expected to produce output of size (1,0), so we can't write to it 141 | if (self.dim() > 0) { 142 | - cub::CountingInputIterator counting_itr(0); 143 | + auto counting_itr = thrust::make_counting_iterator(0); 144 | temp_storage_bytes = 0; 145 | cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr, 146 | out_temp.mutable_data_ptr(), (int*)num_nonzeros.get(), N, stream); 147 | diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu 148 | index d06efa66351..b3e52a63313 100644 149 | --- a/aten/src/ATen/native/cuda/TensorTopK.cu 150 | +++ b/aten/src/ATen/native/cuda/TensorTopK.cu 151 | @@ -14,6 +14,8 @@ 152 | #include 153 | #include 154 | #include 155 | +#include 156 | +#include 157 | 158 | #include 159 | 160 | @@ -733,9 +735,8 @@ void launch( 161 | desired, counts, num_blocks, blocks_per_slice, kthCounts); 162 | C10_CUDA_KERNEL_LAUNCH_CHECK(); 163 | // Do a prefix scan of withinKCounts and kthCounts using slice_idx as keys to get the starting index of each block 164 | - using counting_iter_t = cub::CountingInputIterator; 165 | - using slice_idx_iter_t = cub::TransformInputIterator; 166 | - slice_idx_iter_t slice_idx_iter(counting_iter_t(0), BlockIdxToKey(blocks_per_slice)); 167 | + auto counting_iter = thrust::make_counting_iterator(0); 168 | + auto slice_idx_iter = thrust::make_transform_iterator(counting_iter, BlockIdxToKey(blocks_per_slice)); 169 | at::cuda::cub::inclusive_sum_by_key(slice_idx_iter, withinKCounts, withinKCounts, num_blocks); 170 | at::cuda::cub::inclusive_sum_by_key(slice_idx_iter, kthCounts, kthCounts, num_blocks); 171 | // copy topk values to output tensor 172 | diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu 173 | index bbd8673bcf5..f61dca74252 100644 174 | --- a/aten/src/ATen/native/cuda/UniqueCub.cu 175 | +++ b/aten/src/ATen/native/cuda/UniqueCub.cu 176 | @@ -5,6 +5,8 @@ 177 | #include 178 | #include 179 | #include 180 | +#include 181 | +#include 182 | 183 | #include 184 | #include 185 | @@ -53,9 +55,8 @@ struct LoadBoolOp { 186 | 187 | auto wrap_input_iterator(const bool *data) { 188 | // See NOTE [Loading boolean values] 189 | - LoadBoolOp op; 190 | - return NO_ROCM(at_cuda_detail)::cub::TransformInputIterator( 191 | - reinterpret_cast(data), op); 192 | + return thrust::make_transform_iterator( 193 | + reinterpret_cast(data), LoadBoolOp{}); 194 | } 195 | 196 | // A variation of compute_unique (defined in Unique.cu) that doesn't allow 197 | @@ -258,11 +259,10 @@ struct UniqueCub { 198 | c10::DeviceArray tmp_num_true(*allocator, 1); 199 | 200 | const bool* self_data = self.const_data_ptr(); 201 | - MapNumberOfTrueValues op; 202 | - NO_ROCM(at_cuda_detail)::cub::TransformInputIterator 203 | - data_iter(reinterpret_cast(self_data), op); 204 | - at::cuda::cub::reduce(data_iter, tmp_num_true.get(), num_inp, 205 | - NO_ROCM(at_cuda_detail)::cub::Sum{}, 0); 206 | + auto 
data_iter = thrust::make_transform_iterator( 207 | + reinterpret_cast(self_data), MapNumberOfTrueValues{}); 208 | + at::cuda::cub::reduce( 209 | + data_iter, tmp_num_true.get(), num_inp, thrust::plus{}, 0); 210 | 211 | auto options = self.options(); 212 | output = at::empty({2}, self.options()); 213 | diff --git a/torch/csrc/cuda/shared/nvtx.cpp b/torch/csrc/cuda/shared/nvtx.cpp 214 | index 4fb72c5f79b..bb875f4210e 100644 215 | --- a/torch/csrc/cuda/shared/nvtx.cpp 216 | +++ b/torch/csrc/cuda/shared/nvtx.cpp 217 | @@ -1,7 +1,15 @@ 218 | #ifdef _WIN32 219 | #include // _wgetenv for nvtx 220 | #endif 221 | +#if defined(__has_include) 222 | +#if __has_include() 223 | +#include 224 | +#else 225 | #include 226 | +#endif 227 | +#else 228 | +#include 229 | +#endif 230 | #include 231 | 232 | namespace torch::cuda::shared { 233 | diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp 234 | index d0cb3746a21..fb90aeb1d0b 100644 235 | --- a/torch/csrc/profiler/stubs/cuda.cpp 236 | +++ b/torch/csrc/profiler/stubs/cuda.cpp 237 | @@ -1,6 +1,14 @@ 238 | #include 239 | 240 | +#if defined(__has_include) 241 | +#if __has_include() 242 | +#include 243 | +#else 244 | #include 245 | +#endif 246 | +#else 247 | +#include 248 | +#endif 249 | 250 | #include 251 | #include 252 | --------------------------------------------------------------------------------