├── NVIDIA_CLA_v1.0.1.docx ├── docker ├── scripts │ └── validate_args.sh ├── entrypoint.d │ ├── 10-banner.sh │ └── 50-gpu-driver-check.sh ├── Dockerfile └── patches │ └── torch.patch ├── LICENSE ├── README.rst └── test └── simple.py /NVIDIA_CLA_v1.0.1.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/free-threaded-python/HEAD/NVIDIA_CLA_v1.0.1.docx -------------------------------------------------------------------------------- /docker/scripts/validate_args.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | 5 | # PYTHON_VERSION variable 6 | [[ "$PYTHON_VERSION" =~ ^3\.([0-9]+)$ ]] || { echo "PYTHON_VERSION must be in format '3.minor'"; exit 1; } 7 | minor=${BASH_REMATCH[1]} 8 | if (( minor > 13 )); then 9 | echo "Maximum supported Python version is 3.13 (got $PYTHON_VERSION)" 10 | exit 1 11 | fi 12 | 13 | # ENABLE_GIL variable check 14 | [[ "$ENABLE_GIL" =~ ^[01]$ ]] || { echo "ENABLE_GIL must be 0 or 1 (got '$ENABLE_GIL')"; exit 1; } 15 | if (( ENABLE_GIL == 0 )) && (( minor != 13 )); then 16 | echo "ENABLE_GIL=0 is only supported with Python 3.13 (got $PYTHON_VERSION)" 17 | exit 1 18 | fi 19 | -------------------------------------------------------------------------------- /docker/entrypoint.d/10-banner.sh: -------------------------------------------------------------------------------- 1 | echo "" 2 | echo "==========================" 3 | echo "== Free-threaded Python ==" 4 | echo "==========================" 5 | echo "" 6 | echo "Welcome to the Experimental Free-Threaded Python." 7 | echo "" 8 | echo "This environment is experimental and designed for testing and exploratory purposes." 9 | echo "Here you can investigate the behavior and performance of Python in a free-threaded context." 10 | echo "" 11 | echo "It is NOT production-ready." 12 | echo "" 13 | echo "Your experiences and insights are invaluable in helping us improve this environment." 14 | echo -e "Please report any bugs, issues, or suggestions via our \e]8;;https://github.com/NVIDIA/free-threaded-python/issues\aGitHub Issues\e]8;;\a." 15 | echo "" 16 | -------------------------------------------------------------------------------- /docker/entrypoint.d/50-gpu-driver-check.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | 3 | # Check if libcuda.so.1 -- the CUDA driver -- is present in the ld.so cache or in LD_LIBRARY_PATH 4 | _LIBCUDA_FROM_LD_CACHE=$(ldconfig -p | grep libcuda.so.1) 5 | _LIBCUDA_FROM_LD_LIBRARY_PATH=$( ( IFS=: ; for i in ${LD_LIBRARY_PATH}; do ls $i/libcuda.so.1 2>/dev/null | grep -v compat; done) ) 6 | _LIBCUDA_FOUND="${_LIBCUDA_FROM_LD_CACHE}${_LIBCUDA_FROM_LD_LIBRARY_PATH}" 7 | 8 | # Check if /dev/nvidiactl (like on Linux) or /dev/dxg (like on WSL2) or /dev/nvgpu (like on Tegra) is present 9 | _DRIVER_FOUND=$(ls /dev/nvidiactl /dev/dxg /dev/nvgpu 2>/dev/null) 10 | 11 | # If either is not true, then GPU functionality won't be usable. 12 | if [[ -z "${_LIBCUDA_FOUND}" || -z "${_DRIVER_FOUND}" ]]; then 13 | echo 14 | echo "WARNING: The NVIDIA Driver was not detected. GPU functionality will not be available." 15 | echo " Use the NVIDIA Container Toolkit to start this container with GPU support; see" 16 | echo " https://docs.nvidia.com/datacenter/cloud-native/ ." 
17 | export NVIDIA_CPU_ONLY=1 18 | fi 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Free-threaded Python 2 | ==================== 3 | 4 | 5 | What is this? 6 | ------------- 7 | 8 | Using this repository, you can build and test a free-threaded Python environment containing NVIDIA Python libraries. 9 | 10 | The Python Steering Council approved `PEP 703 <https://peps.python.org/pep-0703/>`_, which makes the `Global Interpreter Lock <https://docs.python.org/3/glossary.html#term-global-interpreter-lock>`_ optional in CPython. The first Python release that allows truly parallel thread execution is 3.13 (released in October 2024). However, for some time yet you won't be able to simply ``pip install`` your favourite extensions and libraries. 11 | 12 | The provided Docker image allows you to try the free-threaded environment yourself. We've included build routines for some popular NVIDIA extensions (`"My library is missing"`_). Should you encounter any bugs or problems, please let us know in the Issues. 13 | 14 | Please note: `This is experimental software!`_ 15 | 16 | How to try it? 17 | -------------- 18 | 19 | #. Clone the repository:: 20 | 21 | $ git clone https://github.com/NVIDIA/free-threaded-python.git $ cd free-threaded-python 22 | 23 | #. Build the free-threaded Python environment (the Dockerfile and its build context live in the ``docker`` directory):: 24 | 25 | 26 | $ docker build -t free-threaded-python docker 27 | 28 | #. Try it:: 29 | 30 | $ docker run -it --gpus all -v $(pwd)/test:/test free-threaded-python python3 /test/simple.py 31 |
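To confirm that the interpreter inside the container is a free-threaded build, and to see whether the GIL is actually disabled at runtime, a quick check along these lines should work on CPython 3.13 (``sysconfig.get_config_var('Py_GIL_DISABLED')`` reports the build flag, while ``sys._is_gil_enabled()`` reports the runtime state)::

    $ docker run -it free-threaded-python python3 -c "import sys, sysconfig; print('free-threaded build:', bool(sysconfig.get_config_var('Py_GIL_DISABLED'))); print('GIL enabled at runtime:', sys._is_gil_enabled())"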
32 | This is experimental software! 33 | ------------------------------ 34 | 35 | Removing the GIL is a breaking change for many Python extensions, and it will take years for the ecosystem to fully adapt to parallel execution. Moreover, this will not be possible without the community. **Please keep in mind that this is by no means production-ready software.** Should you run into any bugs, please let us know in the Issues. Contributions and pull requests are also welcome. 36 | 37 | "My library is missing" 38 | ----------------------- 39 | 40 | It's not our intention to recreate the whole Python ecosystem. However, if a Python library you use is missing here, please let us know in the Issues. Although we focus mostly on the NVIDIA Python ecosystem, we'll do our best to include the most popular extensions in this environment configuration. 41 | -------------------------------------------------------------------------------- /test/simple.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: MIT 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | 22 | import math 23 | import time 24 | import threading 25 | 26 | doc = """ 27 | This is a simple test of the free-threaded Python environment. It runs a computation-heavy 28 | task with an alternating number of threads. In a free-threaded build (given enough CPU cores), the 29 | execution times of the runs should be similar; with the GIL enabled, runs with more threads take proportionally longer. 30 | """ 31 | 32 | 33 | def computational_heavy(iterations): 34 | val = 0 35 | sin = math.sin 36 | cos = math.cos 37 | for i in range(1, iterations): 38 | val += sin(i) * cos(i) 39 | return val 40 | 41 | 42 | def test(thread_id, iterations=1000000): 43 | computational_heavy(iterations) 44 | 45 | 46 | print(doc) 47 | 48 | num_threads = [2, 18, 2, 18, 2, 18] 49 | 50 | for nt in num_threads: 51 | threads = [ 52 | threading.Thread(target=test, name=f"Thread{i}", args=(i,)) for i in range(nt) 53 | ] 54 | start = time.perf_counter_ns() 55 | for t in threads: 56 | t.start() 57 | for t in threads: 58 | t.join() 59 | stop = time.perf_counter_ns() 60 | print(f"{nt=}.\tElapsed time {stop-start} ns") 61 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: MIT 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | 22 | ARG CUDA_VERSION=12.4.0 23 | 24 | FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 25 | 26 | ARG PYTHON_VERSION=3.13 27 | ARG ENABLE_GIL=0 28 | ENV PYTHON_GIL=$ENABLE_GIL 29 | 30 | COPY --chmod=744 scripts/validate_args.sh /tmp/validate_args.sh 31 | RUN PYTHON_VERSION=$PYTHON_VERSION ENABLE_GIL=$ENABLE_GIL /tmp/validate_args.sh 32 | 33 | # Install build tools 34 | ARG DEBIAN_FRONTEND=noninteractive 35 | RUN apt update && apt install -y \ 36 | build-essential \ 37 | clang \ 38 | lld \ 39 | llvm \ 40 | zlib1g-dev \ 41 | binutils \ 42 | zlib1g-dev \ 43 | xz-utils \ 44 | tk-dev \ 45 | libssl-dev \ 46 | libbz2-dev \ 47 | libreadline-dev \ 48 | libncursesw5-dev \ 49 | libsqlite3-dev \ 50 | libxml2-dev \ 51 | libxmlsec1-dev \ 52 | libffi-dev \ 53 | liblzma-dev \ 54 | curl \ 55 | git \ 56 | cmake \ 57 | wget 58 | 59 | WORKDIR /opt 60 | 61 | # Remove system Python 3.10 62 | RUN apt remove --purge -y python3.10 && apt autoremove -y 63 | 64 | # Build and install the CPython from source 65 | RUN GIL_FLAG=$([ "$ENABLE_GIL" = "0" ] && echo "--disable-gil" || echo "") && \ 66 | git clone -b "$PYTHON_VERSION" --recursive -j"$(grep ^processor /proc/cpuinfo | wc -l)" https://github.com/python/cpython.git && cd cpython && \ 67 | mkdir build && cd build && \ 68 | CC=clang CXX=clang++ ../configure --prefix=/usr/ --enable-optimizations --with-lto --enable-shared $GIL_FLAG && \ 69 | LDFLAGS="-fuse-ld=lld" make -j"$(grep ^processor /proc/cpuinfo | wc -l)" && \ 70 | make install 71 | 72 | RUN update-alternatives --install /usr/bin/python python $(which python$PYTHON_VERSION) 0 && \ 73 | update-alternatives --install /usr/local/bin/python3 python3 $(which python$PYTHON_VERSION) 0 && \ 74 | update-alternatives --force --install /usr/bin/pip pip $(which pip$PYTHON_VERSION) 0 75 | 76 | # General build dependencies 77 | RUN pip install setuptools wheel clang==14 libclang==14.0.1 'cython>=3.1.0b1' 78 | 79 | RUN pip install numpy Pillow warp-lang && \ 80 | # Disable build isolation to use system-installed Cython 81 | pip install --no-build-isolation nvtx 82 | 83 | # PyTorch nightly build 84 | RUN if [ "$(echo "$CUDA_VERSION" | tr -d . 
| head -c 3)" != 124 ]; then \ 85 | echo "No available free-threaded PyTorch wheels for CUDA version $CUDA_VERSION"; \ 86 | else \ 87 | python3 -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124; \ 88 | fi 89 | 90 | # nvImageCodec provides 3.13t wheels 91 | RUN suffix="$(echo "$CUDA_VERSION" | tr -d . | head -c 2)" && \ 92 | if ! echo "$suffix" | grep -wq -e 11 -e 12; then \ 93 | echo "No available nvImageCodec wheels for CUDA version $CUDA_VERSION"; \ 94 | else \ 95 | python3 -m pip install nvidia-nvimgcodec-cu$suffix; \ 96 | fi 97 | 98 | ARG CUDA_ARCHS='60;70;80;90' 99 | 100 | # Install CV-CUDA from source 101 | # Clone and patch 102 | RUN apt install -y git-lfs patchelf 103 | RUN git clone -b v0.8.0-beta https://github.com/CVCUDA/CV-CUDA.git && \ 104 | cd CV-CUDA && sed -i 's/skip_precommit=0/skip_precommit=1/g' init_repo.sh && \ 105 | ./init_repo.sh && cd 3rdparty/pybind11 && git submodule update --init . && git checkout v2.13.6 106 | # Build and install 107 | RUN cd CV-CUDA && \ 108 | CUDA_MAJOR=12 ci/build.sh release build -DCMAKE_CUDA_ARCHITECTURES="$CUDA_ARCHS" && \ 109 | python3 -m pip install build/python$PYTHON_VERSION/wheel 110 | 111 | # Install cuDNN FE 112 | RUN apt install -y cudnn && \ 113 | CUDNN_INCLUDE_DIR=/usr/include CMAKE_POLICY_VERSION_MINIMUM=3.5 \ 114 | python3 -m pip install git+https://github.com/NVIDIA/cudnn-frontend.git 115 | 116 | # Install CUDA-Python 117 | RUN git clone -b "v$CUDA_VERSION" https://github.com/NVIDIA/cuda-python && cd cuda-python && \ 118 | python3 -m pip install -r requirements.txt && export CUDA_HOME=/usr/local/cuda && \ 119 | PARALLEL_LEVEL="$(grep ^processor /proc/cpuinfo | wc -l)" CC=gcc CXX=g++ python3 setup.py bdist_wheel install && \ 120 | python3 -m pip install dist/*.whl 121 | # Install cuda.core 122 | RUN cd cuda-python && git switch main && \ 123 | cd cuda_core && python3 -m pip install --no-build-isolation . 124 | 125 | # Install Nsight Systems 126 | RUN wget -O nsight.deb https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2023_4_1_97/nsightsystems-linux-cli-public-2023.4.1.97-3355750.deb/ && \ 127 | dpkg -i nsight.deb 128 | 129 | RUN rm -r /opt/nvidia/entrypoint.d/* 130 | COPY entrypoint.d /opt/nvidia/entrypoint.d 131 | -------------------------------------------------------------------------------- /docker/patches/torch.patch: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: MIT 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | 22 | diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp 23 | index a3f82633037..0c3a8b03435 100644 24 | --- a/torch/csrc/Storage.cpp 25 | +++ b/torch/csrc/Storage.cpp 26 | @@ -214,7 +214,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { 27 | PyObject_GC_UnTrack(self); 28 | } 29 | 30 | - bool has_finalizer = type->tp_finalize || type->tp_del; 31 | + // bool has_finalizer = type->tp_finalize || type->tp_del; 32 | 33 | if (type->tp_finalize) { 34 | PyObject_GC_Track(self); 35 | @@ -236,13 +236,14 @@ static void THPStorage_subclass_dealloc(PyObject* self) { 36 | if (type->tp_del) { 37 | PyObject_GC_Track(self); 38 | type->tp_del(self); 39 | - if (self->ob_refcnt > 0) { 40 | + if (Py_REFCNT(self) > 0) { 41 | // Resurrected (see above comment about resurrection from `__del__`) 42 | return; 43 | } 44 | PyObject_GC_UnTrack(self); 45 | } 46 | 47 | +#if 0 // there's a risk of missing weak refs but _PyWeakref_ClearRef was moved to internal API so this doesn't compile anymore 48 | if (has_finalizer) { 49 | /* New weakrefs could be created during the finalizer call. 50 | If this occurs, clear them out without calling their 51 | @@ -256,6 +257,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { 52 | _PyWeakref_ClearRef(*list); 53 | } 54 | } 55 | +#endif 56 | 57 | // Clear slots 58 | { 59 | diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp 60 | index 8f5297e87fe..1186bcc90f3 100644 61 | --- a/torch/csrc/autograd/python_function.cpp 62 | +++ b/torch/csrc/autograd/python_function.cpp 63 | @@ -1,3 +1,4 @@ 64 | +#include 65 | #include 66 | 67 | #include 68 | @@ -1017,12 +1018,14 @@ static void _trace_post_record( 69 | } 70 | 71 | node->i_(jit::attr::inplace, is_inplace); 72 | - if (PyObject* module_name = PyDict_GetItemString( 73 | - ((PyTypeObject*)op_obj)->tp_dict, "__module__")) { 74 | + PyObject *module_name = nullptr; 75 | + PyDict_GetItemStringRef(((PyTypeObject*)op_obj)->tp_dict, "__module__", &module_name); 76 | + if (module_name != nullptr) { 77 | if (auto ptr = PyUnicode_AsUTF8(module_name)) { 78 | node->s_(jit::attr::module, std::string(ptr)); 79 | } 80 | } 81 | + Py_XDECREF(module_name); 82 | 83 | // Isolate C variable ptrs in a vector 84 | int num_outputs = PyTuple_GET_SIZE(output_objects); 85 | diff --git a/torch/csrc/autograd/python_hook.cpp b/torch/csrc/autograd/python_hook.cpp 86 | index c29e003a0b7..ba4099728c3 100644 87 | --- a/torch/csrc/autograd/python_hook.cpp 88 | +++ b/torch/csrc/autograd/python_hook.cpp 89 | @@ -67,7 +67,7 @@ bool _call_hooks(PyObject* dict, PyObject* args) { 90 | bool is_modified = false; 91 | const auto len = PyList_Size(hooks); 92 | for (Py_ssize_t idx = 0; idx < len; ++idx) { 93 | - const auto hook = PyList_GetItem(hooks, idx); 94 | + const auto hook = PyList_GetItem(hooks, idx); // borrowed ref 95 | 96 | THPObjectPtr res(PyObject_CallObject(hook, args)); 97 | if (!res) 98 | diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp 99 | index 3705ac5e423..ae47fa5a689 100644 100 | --- a/torch/csrc/autograd/python_variable.cpp 101 | +++ b/torch/csrc/autograd/python_variable.cpp 102 | @@ -1891,7 +1891,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { 103 | 
PyObject_GC_UnTrack(self); 104 | // TODO: consider using trash can 105 | 106 | - bool has_finalizer = type->tp_finalize || type->tp_del; 107 | + // bool has_finalizer = type->tp_finalize || type->tp_del; 108 | 109 | if (type->tp_finalize) { 110 | PyObject_GC_Track(self); 111 | @@ -1910,13 +1910,14 @@ void THPVariable_subclass_dealloc(PyObject* self) { 112 | if (type->tp_del) { 113 | PyObject_GC_Track(self); 114 | type->tp_del(self); 115 | - if (self->ob_refcnt > 0) { 116 | + if (Py_REFCNT(self) > 0) { 117 | /* Resurrected */ 118 | return; 119 | } 120 | PyObject_GC_UnTrack(self); 121 | } 122 | 123 | +#if 0 // there's a risk of missing weak refs but _PyWeakref_ClearRef was moved to internal API so this doesn't compile anymore 124 | if (has_finalizer) { 125 | /* New weakrefs could be created during the finalizer call. 126 | If this occurs, clear them out without calling their 127 | @@ -1930,6 +1931,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { 128 | _PyWeakref_ClearRef(*list); 129 | } 130 | } 131 | +#endif 132 | 133 | // Clear all slots until we get to base class THPVariableType 134 | { 135 | diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c 136 | index bf710b9ff7e..d740af81c86 100644 137 | --- a/torch/csrc/dynamo/cpython_defs.c 138 | +++ b/torch/csrc/dynamo/cpython_defs.c 139 | @@ -25,17 +25,16 @@ 140 | #endif 141 | 142 | #define Py_BUILD_CORE 143 | -#include 144 | -#define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt, _PyOpcode_Caches 145 | -#include 146 | +#define NEED_OPCODE_METADATA // To get _PyOpcode_Deopt, _PyOpcode_Caches 147 | +#include 148 | #undef NEED_OPCODE_TABLES 149 | #undef Py_BUILD_CORE 150 | #include 151 | 152 | // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces 153 | // us to manually re-check that the function didn't change on the next major version 154 | -#if PY_VERSION_HEX >= 0x030D0000 // 3.13 155 | -#error "Please ensure that the functions below still match the CPython implementation for 3.13" 156 | +#if PY_VERSION_HEX >= 0x030E0000 // 3.14 157 | +#error "Please ensure that the functions below still match the CPython implementation for 3.14" 158 | #endif 159 | 160 | // https://github.com/python/cpython/blob/a7715ccfba5b86ab09f86ec56ac3755c93b46b48/Objects/frameobject.c#L1079 161 | @@ -45,8 +44,8 @@ THP_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame, int opcode, int oparg) 162 | // This only works when opcode is a non-quickened form: 163 | CHECK(_PyOpcode_Deopt[opcode] == opcode); 164 | int check_oparg = 0; 165 | - for (_Py_CODEUNIT *instruction = _PyCode_CODE(frame->f_code); 166 | - instruction < frame->prev_instr; instruction++) 167 | + for (_Py_CODEUNIT *instruction = _PyCode_CODE(_PyFrame_GetCode(frame)); 168 | + instruction < frame->instr_ptr; instruction++) 169 | { 170 | int check_opcode = _PyOpcode_Deopt[_Py_OPCODE(*instruction)]; 171 | check_oparg |= _Py_OPARG(*instruction); 172 | @@ -75,7 +74,7 @@ frame_init_get_vars(_PyInterpreterFrame *frame, int *free_vars_copied) 173 | { 174 | // COPY_FREE_VARS has no quickened forms, so no need to use _PyOpcode_Deopt 175 | // here: 176 | - PyCodeObject *co = frame->f_code; 177 | + PyCodeObject *co = _PyFrame_GetCode(frame); 178 | int lasti = _PyInterpreterFrame_LASTI(frame); 179 | if (!(lasti < 0 && _PyCode_CODE(co)->op.code == COPY_FREE_VARS 180 | && PyFunction_Check(frame->f_funcobj))) 181 | @@ -86,13 +85,13 @@ frame_init_get_vars(_PyInterpreterFrame *frame, int *free_vars_copied) 182 | 183 | /* Free vars have not been initialized -- 
Do that */ 184 | PyObject *closure = ((PyFunctionObject *)frame->f_funcobj)->func_closure; 185 | - int offset = PyCode_GetFirstFree(co); 186 | + int offset = PyUnstable_Code_GetFirstFree(co); 187 | for (int i = 0; i < co->co_nfreevars; ++i) { 188 | PyObject *o = PyTuple_GET_ITEM(closure, i); 189 | frame->localsplus[offset + i] = Py_NewRef(o); 190 | } 191 | // COPY_FREE_VARS doesn't have inline CACHEs, either: 192 | - frame->prev_instr = _PyCode_CODE(frame->f_code); 193 | + frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)); 194 | 195 | *free_vars_copied = 1; 196 | } 197 | @@ -175,7 +174,7 @@ THP_PyFrame_GetLocals(_PyInterpreterFrame *frame, int include_hidden, int *free_ 198 | 199 | frame_init_get_vars(frame, free_vars_copied); 200 | 201 | - PyCodeObject *co = frame->f_code; 202 | + PyCodeObject *co = _PyFrame_GetCode(frame); 203 | for (int i = 0; i < co->co_nlocalsplus; i++) { 204 | PyObject *value; // borrowed reference 205 | if (!frame_get_var(frame, co, i, &value)) { 206 | @@ -411,7 +410,7 @@ THP_PyFrame_New_NoTrack(const PyCodeObject *code) 207 | f->f_trace = NULL; 208 | f->f_trace_lines = 1; 209 | f->f_trace_opcodes = 0; 210 | - f->f_fast_as_locals = 0; 211 | + f->f_extra_locals = NULL; 212 | f->f_lineno = 0; 213 | return f; 214 | } 215 | @@ -424,7 +423,7 @@ THP_PyFrame_MakeAndSetFrameObject(_PyInterpreterFrame *frame) 216 | PyObject *error_type = NULL, *error_value = NULL, *error_traceback = NULL; 217 | PyErr_Fetch(&error_type, &error_value, &error_traceback); 218 | 219 | - PyFrameObject *f = THP_PyFrame_New_NoTrack(frame->f_code); 220 | + PyFrameObject *f = THP_PyFrame_New_NoTrack(_PyFrame_GetCode(frame)); 221 | if (f == NULL) { 222 | Py_XDECREF(error_type); 223 | Py_XDECREF(error_value); 224 | @@ -484,8 +483,8 @@ THP_take_ownership(PyFrameObject *f, _PyInterpreterFrame *frame) 225 | if (_PyFrame_IsIncomplete(frame)) { 226 | // This may be a newly-created generator or coroutine frame. Since it's 227 | // dead anyways, just pretend that the first RESUME ran: 228 | - PyCodeObject *code = frame->f_code; 229 | - frame->prev_instr = _PyCode_CODE(code) + code->_co_firsttraceable; 230 | + PyCodeObject *code = _PyFrame_GetCode(frame); 231 | + frame->instr_ptr = _PyCode_CODE(code) + code->_co_firsttraceable; 232 | } 233 | CHECK(!_PyFrame_IsIncomplete(frame)); 234 | CHECK(f->f_back == NULL); 235 | @@ -523,7 +522,7 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) 236 | _PyFrame_GetGenerator(frame)->gi_frame_state == FRAME_CLEARED); 237 | // GH-99729: Clearing this frame can expose the stack (via finalizers). 
It's 238 | // crucial that this frame has been unlinked, and is no longer visible: 239 | - CHECK(_PyThreadState_GET()->cframe->current_frame != frame); 240 | + CHECK(PyThreadState_GET()->current_frame != frame); 241 | if (frame->frame_obj) { 242 | PyFrameObject *f = frame->frame_obj; 243 | frame->frame_obj = NULL; 244 | @@ -546,7 +545,7 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) 245 | #else 246 | Py_DECREF(frame->f_func); 247 | #endif 248 | - Py_DECREF(frame->f_code); 249 | + Py_DECREF(_PyFrame_GetCode(frame)); 250 | } 251 | 252 | // https://github.com/python/cpython/blob/fad48ea1816be3125ea51edcdfe2f999d6ade796/Objects/obmalloc.c#L635 253 | diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h 254 | index b762f87d69d..d4432b8bb43 100644 255 | --- a/torch/csrc/dynamo/cpython_defs.h 256 | +++ b/torch/csrc/dynamo/cpython_defs.h 257 | @@ -8,7 +8,9 @@ 258 | 259 | #if IS_PYTHON_3_11_PLUS 260 | 261 | +#define Py_BUILD_CORE 262 | #include 263 | +#undef Py_BUILD_CORE 264 | 265 | int THP_PyFrame_FastToLocalsWithError( 266 | _PyInterpreterFrame* frame, 267 | diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c 268 | index b6a26f635ec..34eafba173b 100644 269 | --- a/torch/csrc/dynamo/eval_frame.c 270 | +++ b/torch/csrc/dynamo/eval_frame.c 271 | @@ -18,7 +18,6 @@ 272 | // see https://bugs.python.org/issue35886 273 | #if PY_VERSION_HEX >= 0x03080000 274 | #define Py_BUILD_CORE 275 | -#include 276 | 277 | // These headers were added in 3.11 278 | #if IS_PYTHON_3_11_PLUS 279 | @@ -58,7 +57,7 @@ DECLARE_PYOBJ_ATTR(f_func) 280 | DECLARE_PYOBJ_ATTR(f_globals) 281 | DECLARE_PYOBJ_ATTR(f_builtins) 282 | DECLARE_PYOBJ_ATTR(f_locals) 283 | -DECLARE_PYOBJ_ATTR(f_code) 284 | +DECLARE_PYOBJ_ATTR(f_executable) 285 | DECLARE_PYOBJ_ATTR(frame_obj) 286 | 287 | #undef DECLARE_PYOBJ_ATTR 288 | @@ -76,7 +75,7 @@ static PyObject* THPPyInterpreterFrame_f_lasti(THPPyInterpreterFrame* self, PyOb 289 | 290 | static PyObject* THPPyInterpreterFrame_f_lineno(THPPyInterpreterFrame* self, PyObject* _noargs) { 291 | if (!self->frame->frame_obj) { 292 | - return PyLong_FromLong(self->frame->f_code->co_firstlineno); 293 | + return PyLong_FromLong((_PyFrame_GetCode(self->frame))->co_firstlineno); 294 | } 295 | int lineno = PyFrame_GetLineNumber(self->frame->frame_obj); 296 | if (lineno < 0) { 297 | @@ -102,7 +101,7 @@ static struct PyGetSetDef THPPyInterpreterFrame_properties[] = { 298 | {"f_globals", (getter)THPPyInterpreterFrame_f_globals, NULL, NULL, NULL}, 299 | {"f_builtins", (getter)THPPyInterpreterFrame_f_builtins, NULL, NULL, NULL}, 300 | {"f_locals", (getter)THPPyInterpreterFrame_f_locals, NULL, NULL, NULL}, 301 | - {"f_code", (getter)THPPyInterpreterFrame_f_code, NULL, NULL, NULL}, 302 | + {"f_executable", (getter)THPPyInterpreterFrame_f_executable, NULL, NULL, NULL}, 303 | {"frame_obj", (getter)THPPyInterpreterFrame_frame_obj, NULL, NULL, NULL}, 304 | {"previous", (getter)THPPyInterpreterFrame_previous, NULL, NULL, NULL}, 305 | {"f_lasti", (getter)THPPyInterpreterFrame_f_lasti, NULL, NULL, NULL}, 306 | @@ -239,8 +238,8 @@ inline static void enable_eval_frame_default(PyThreadState* tstate) { 307 | 308 | inline static const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) { 309 | // Returns the C string name of the current frame. 
310 | - DEBUG_CHECK(PyUnicode_Check(frame->f_code->co_name)); 311 | - return PyUnicode_AsUTF8(frame->f_code->co_name); 312 | + DEBUG_CHECK(PyUnicode_Check(_PyFrame_GetCode(frame)->co_name)); 313 | + return PyUnicode_AsUTF8(_PyFrame_GetCode(frame)->co_name); 314 | } 315 | 316 | static inline PyObject* call_callback( 317 | @@ -327,7 +326,7 @@ inline static PyObject* eval_custom_code_impl( 318 | 319 | PyObject** fastlocals_old = frame->localsplus; 320 | PyObject** fastlocals_new = shadow->localsplus; 321 | - Py_ssize_t n_old = frame->f_code->co_nlocalsplus; 322 | + Py_ssize_t n_old = _PyFrame_GetCode(frame)->co_nlocalsplus; 323 | Py_ssize_t n_new = code->co_nlocalsplus; 324 | 325 | // localsplus are XINCREF'd by default eval frame, so all values must be valid. 326 | @@ -341,8 +340,8 @@ inline static PyObject* eval_custom_code_impl( 327 | // for 3.11+, if free_vars_copied is true, we do not need to 328 | // run the first COPY_FREE_VARS since THP_PyFrame_FastToLocalsWithError 329 | // already did the equivalent action. 330 | - if (free_vars_copied && _Py_OPCODE(_PyCode_CODE(shadow->f_code)[0]) == COPY_FREE_VARS) { 331 | - shadow->prev_instr = _PyCode_CODE(shadow->f_code); 332 | + if (free_vars_copied && _Py_OPCODE(_PyCode_CODE(_PyFrame_GetCode(shadow))[0]) == COPY_FREE_VARS) { 333 | + shadow->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(shadow)); 334 | } 335 | 336 | #else 337 | @@ -399,11 +398,12 @@ inline static PyObject* eval_custom_code_impl( 338 | 339 | // copy args 340 | // according to https://docs.python.org/3/library/inspect.html , `co_argcount` is the number of arguments (not including keyword only arguments, * or ** args). so we need to add `co_kwonlyargcount` and `co_flags` to get the total number of arguments. 341 | - // !!(frame->f_code->co_flags & CO_VARARGS) is 1 if the function has *args, 0 otherwise 342 | - // !!(frame->f_code->co_flags & CO_VARKEYWORDS) is 1 if the function has **kwargs, 0 otherwise 343 | + // !!(_PyFrame_GetCode(frame)->co_flags & CO_VARARGS) is 1 if the function has *args, 0 otherwise 344 | + // !!(_PyFrame_GetCode(frame)->co_flags & CO_VARKEYWORDS) is 1 if the function has **kwargs, 0 otherwise 345 | // they convert bit flags to 0 or 1, and avoid branching. 346 | // This is performance critical code, so we really care about performance. 347 | - Py_ssize_t total_argcount_old = frame->f_code->co_argcount + frame->f_code->co_kwonlyargcount + !!(frame->f_code->co_flags & CO_VARARGS) + !!(frame->f_code->co_flags & CO_VARKEYWORDS); 348 | + PyCodeObject* f_code = _PyFrame_GetCode(frame); 349 | + Py_ssize_t total_argcount_old = f_code->co_argcount + f_code->co_kwonlyargcount + !!(f_code->co_flags & CO_VARARGS) + !!(f_code->co_flags & CO_VARKEYWORDS); 350 | 351 | for (Py_ssize_t i = 0; i < total_argcount_old; i++) { 352 | Py_XINCREF(fastlocals_old[i]); 353 | @@ -411,7 +411,7 @@ inline static PyObject* eval_custom_code_impl( 354 | } 355 | 356 | // copy free vars 357 | - Py_ssize_t nfrees_old = PyCode_GetNFreevars(frame->f_code); 358 | + Py_ssize_t nfrees_old = PyCode_GetNFreevars(f_code); 359 | 360 | for (Py_ssize_t i = 0; i < nfrees_old; i++) { 361 | Py_XINCREF(fastlocals_old[n_old - 1 - i]); 362 | @@ -425,7 +425,7 @@ inline static PyObject* eval_custom_code_impl( 363 | // this is straightforward in Python 3.11 and higher, as there are bit flags in `co_localspluskinds` to tell if a variable is a cell variable. 
364 | // in Python 3.10 and lower, essentially we are checking if a variable is a new local variable (because of the layout mentioned above, the first variable that is not cell variable is the first new local variable). the corresponding slot in `flocalsplus` is NULL for new local variables. 365 | #if IS_PYTHON_3_11_PLUS 366 | - if(!(_PyLocals_GetKind(frame->f_code->co_localspluskinds, i) & CO_FAST_CELL)) 367 | + if(!(_PyLocals_GetKind(f_code->co_localspluskinds, i) & CO_FAST_CELL)) 368 | { 369 | break; 370 | } 371 | @@ -526,14 +526,14 @@ static PyObject* _custom_eval_frame( 372 | DEBUG_TRACE( 373 | "begin %s %s %i %i", 374 | get_frame_name(frame), 375 | - PyUnicode_AsUTF8(frame->f_code->co_filename), 376 | - frame->f_code->co_firstlineno, 377 | + PyUnicode_AsUTF8(_PyFrame_GetCode(frame)->co_filename), 378 | + _PyFrame_GetCode(frame)->co_firstlineno, 379 | _PyInterpreterFrame_LASTI(frame)); 380 | #else 381 | DEBUG_TRACE( 382 | "begin %s %s %i %i %i", 383 | get_frame_name(frame), 384 | - PyUnicode_AsUTF8(frame->f_code->co_filename), 385 | + PyUnicode_AsUTF8(_PyFrame_GetCode(frame)->co_filename), 386 | frame->f_lineno, 387 | frame->f_lasti, 388 | frame->f_iblock); 389 | @@ -564,14 +564,14 @@ static PyObject* _custom_eval_frame( 390 | return eval_frame_default(tstate, frame, throw_flag); 391 | } 392 | 393 | - ExtraState* extra = get_extra_state(frame->f_code); 394 | + ExtraState* extra = get_extra_state(_PyFrame_GetCode(frame)); 395 | if (extra == SKIP_CODE || (callback == Py_False && extra == NULL)) { 396 | DEBUG_TRACE("skip %s", get_frame_name(frame)); 397 | return eval_frame_default(tstate, frame, throw_flag); 398 | } 399 | 400 | if (extra == NULL) { 401 | - extra = init_and_set_extra_state(frame->f_code); 402 | + extra = init_and_set_extra_state(_PyFrame_GetCode(frame)); 403 | } 404 | 405 | // TODO(jansel): investigate directly using the "fast" representation 406 | @@ -667,7 +667,7 @@ static PyObject* _custom_eval_frame( 407 | } else { 408 | DEBUG_TRACE("create skip %s", get_frame_name(frame)); 409 | Py_DECREF(result); 410 | - set_extra_state(frame->f_code, SKIP_CODE); 411 | + set_extra_state(_PyFrame_GetCode(frame), SKIP_CODE); 412 | // Re-enable custom behavior 413 | eval_frame_callback_set(callback); 414 | return eval_frame_default(tstate, frame, throw_flag); 415 | diff --git a/torch/csrc/dynamo/guards.cpp b/torch/csrc/dynamo/guards.cpp 416 | index d61ac4219a8..9b83ab200b7 100644 417 | --- a/torch/csrc/dynamo/guards.cpp 418 | +++ b/torch/csrc/dynamo/guards.cpp 419 | @@ -233,12 +233,12 @@ static std::vector> pyListToVecOptInt( 420 | std::vector> vec; 421 | Py_ssize_t size = PyList_Size(pyList); 422 | for (Py_ssize_t i = 0; i < size; i++) { 423 | - PyObject* item = PyList_GetItem(pyList, i); 424 | - auto handle = py::handle(item); 425 | + PyObject* item = PyList_GetItemRef(pyList, i); 426 | + auto obj = py::reinterpret_steal(item); 427 | if (item == Py_None) { 428 | vec.emplace_back(std::nullopt); 429 | - } else if (torch::is_symint(handle)) { 430 | - vec.emplace_back(py::cast(handle)); 431 | + } else if (torch::is_symint(obj)) { 432 | + vec.emplace_back(py::cast(obj)); 433 | } else { 434 | int64_t value = PyLong_AsLongLong(item); 435 | if (value == -1 && PyErr_Occurred()) { 436 | @@ -259,8 +259,9 @@ static std::vector>> get_dynamic_dims( 437 | if (dynamic_dims_py != Py_None) { 438 | Py_ssize_t size = PyList_Size(dynamic_dims_py); 439 | for (Py_ssize_t i = 0; i < size; i++) { 440 | - PyObject* py_list = PyList_GetItem(dynamic_dims_py, i); 441 | + PyObject* py_list = 
PyList_GetItemRef(dynamic_dims_py, i); 442 | std::vector> vec = pyListToVecOptInt(py_list); 443 | + Py_DECREF(py_list); 444 | per_tensor_dynamic_dims.push_back(std::move(vec)); 445 | } 446 | } 447 | @@ -411,13 +412,15 @@ PyObject* TensorGuards_check_verbose( 448 | std::vector tensor_check_names; 449 | tensor_check_names.reserve(names_size); 450 | for (auto i : c10::irange(names_size)) { 451 | - PyObject* value = PyList_GetItem(tensor_check_names_py, i); 452 | + PyObject* value = PyList_GetItemRef(tensor_check_names_py, i); 453 | if (!PyUnicode_Check(value)) { 454 | + Py_DECREF(value); 455 | PyErr_SetString( 456 | PyExc_TypeError, "tensor_check_names must only contain strings"); 457 | return nullptr; 458 | } 459 | tensor_check_names.emplace_back(PyUnicode_AsUTF8(value)); 460 | + Py_DECREF(value); 461 | } 462 | 463 | LocalState state; 464 | @@ -1121,10 +1124,11 @@ class DEFAULT_DEVICE : public LeafGuard { 465 | // leaked by design. 466 | static PyObject* current_device_str = 467 | PyUnicode_InternFromString("CURRENT_DEVICE"); 468 | - PyObject* device = PyDict_GetItem( 469 | - _utils_device_dict.ptr(), current_device_str); // borrowed ref 470 | + PyObject* device; 471 | + PyDict_GetItemRef(_utils_device_dict.ptr(), current_device_str, &device); 472 | if (device != _device.ptr()) { 473 | int result = PyObject_RichCompareBool(device, _device.ptr(), Py_EQ); 474 | + Py_DECREF(device); 475 | if (result == -1) { 476 | PyErr_Clear(); 477 | return false; 478 | @@ -2236,14 +2240,17 @@ class DictSubclassGuardManager : public DictGuardManager { 479 | return false; 480 | } 481 | 482 | - PyObject* value = PyDict_GetItem(obj, key); // borrowed ref 483 | + PyObject* value; 484 | + PyDict_GetItemRef(obj, key, &value); // new reference 485 | std::unique_ptr& value_manager = key_value_manager.second; 486 | if (value_manager && !value_manager->check_nopybind(value)) { 487 | Py_DECREF(key); 488 | + Py_XDECREF(value); 489 | Py_DECREF(iterator); 490 | return false; 491 | } 492 | 493 | + Py_XDECREF(value); 494 | index_pointer++; 495 | } 496 | dict_pointer++; 497 | @@ -2305,7 +2312,8 @@ class DictSubclassGuardManager : public DictGuardManager { 498 | } 499 | } 500 | 501 | - PyObject* value = PyDict_GetItem(obj, key); // borrowed ref 502 | + PyObject* value; 503 | + PyDict_GetItemRef(obj, key, &value); // new reference 504 | std::unique_ptr& value_manager = key_value_manager.second; 505 | if (value_manager) { 506 | GuardDebugInfo debug_info = 507 | @@ -2313,11 +2321,14 @@ class DictSubclassGuardManager : public DictGuardManager { 508 | num_guards_executed += debug_info.num_guards_executed; 509 | if (!debug_info.result) { 510 | Py_DECREF(key); 511 | + Py_XDECREF(value); 512 | Py_DECREF(iterator); 513 | return GuardDebugInfo( 514 | false, debug_info.verbose_code_parts, num_guards_executed); 515 | } 516 | } 517 | + 518 | + Py_XDECREF(value); 519 | index_pointer++; 520 | } 521 | Py_DECREF(key); 522 | @@ -2602,24 +2613,30 @@ class DictGetItemGuardAccessor : public GuardAccessor { 523 | // NB: Intentional duplication between check_nopybind and 524 | // check_verbose_nopybind. 
525 | bool check_nopybind(PyObject* obj) override { // borrowed ref 526 | - PyObject* x = PyDict_GetItem(obj, _key); // borrowed ref 527 | - if (x == nullptr) { 528 | + PyObject* x = nullptr; 529 | + int res = PyDict_GetItemRef(obj, _key, &x); // new reference 530 | + if (x == nullptr || res < 0) { 531 | + Py_XDECREF(x); 532 | PyErr_Clear(); 533 | return false; 534 | } 535 | bool result = _guard_manager->check_nopybind(x); 536 | + Py_DECREF(x); 537 | return result; 538 | } 539 | 540 | GuardDebugInfo check_verbose_nopybind( 541 | PyObject* obj) override { // borrowed ref 542 | - PyObject* x = PyDict_GetItem(obj, _key); // borrowed ref 543 | - if (x == nullptr) { 544 | + PyObject* x = nullptr; 545 | + int res = PyDict_GetItemRef(obj, _key, &x); // new reference 546 | + if (x == nullptr || res < 0) { 547 | + Py_XDECREF(x); 548 | PyErr_Clear(); 549 | return GuardDebugInfo( 550 | false, std::string("KeyError on ") + get_source(), 0); 551 | } 552 | GuardDebugInfo result = _guard_manager->check_verbose_nopybind(x); 553 | + Py_DECREF(x); 554 | return result; 555 | } 556 | 557 | @@ -3077,40 +3094,54 @@ class GlobalWeakRefGuardAccessor : public GuardAccessor { 558 | bool check_nopybind(PyObject* obj) override { // borrowed ref 559 | // obj is globals dict because GlobalWeakRefGuardAccessor has to be a 560 | // child of GlobalsGuardAccessor. 561 | - PyObject* weakref = PyDict_GetItem(obj, _global_name); // borrowed ref 562 | - if (weakref == nullptr) { 563 | + PyObject* weakref = nullptr; 564 | + int res = PyDict_GetItemRef(obj, _global_name, &weakref); // new reference 565 | + if (weakref == nullptr || res < 0) { 566 | // The weakref is not in the globals dict. 567 | + Py_XDECREF(weakref); 568 | PyErr_Clear(); 569 | return false; 570 | } 571 | 572 | if (!PyWeakref_Check(weakref)) { 573 | + Py_DECREF(weakref); 574 | return false; 575 | } 576 | 577 | - PyObject* x = PyWeakref_GetObject(weakref); // borrowed ref 578 | - return _guard_manager->check_nopybind(x); 579 | + PyObject* x; 580 | + PyWeakref_GetRef(weakref, &x); 581 | + bool result = _guard_manager->check_nopybind(x); 582 | + Py_DECREF(x); 583 | + Py_DECREF(weakref); 584 | + return result; 585 | } 586 | 587 | GuardDebugInfo check_verbose_nopybind( 588 | PyObject* obj) override { // borrowed ref 589 | // obj is globals dict because GlobalWeakRefGuardAccessor has to be a 590 | // child of GlobalsGuardAccessor. 591 | - PyObject* weakref = PyDict_GetItem(obj, _global_name); // borrowed ref 592 | - if (weakref == nullptr) { 593 | + PyObject* weakref = nullptr; 594 | + int res = PyDict_GetItemRef(obj, _global_name, &weakref); // new reference 595 | + if (weakref == nullptr || res < 0) { 596 | // The weakref is not in the globals dict. 
597 | + Py_XDECREF(weakref); 598 | PyErr_Clear(); 599 | return GuardDebugInfo( 600 | false, std::string("KeyError on ") + get_source(), 0); 601 | } 602 | 603 | if (!PyWeakref_Check(weakref)) { 604 | + Py_DECREF(weakref); 605 | return GuardDebugInfo( 606 | false, std::string("Not a weakref ") + get_source(), 0); 607 | } 608 | 609 | - PyObject* x = PyWeakref_GetObject(weakref); // borrowed ref 610 | - return _guard_manager->check_verbose_nopybind(x); 611 | + PyObject* x; 612 | + PyWeakref_GetRef(weakref, &x); 613 | + GuardDebugInfo result = _guard_manager->check_verbose_nopybind(x); 614 | + Py_DECREF(x); 615 | + Py_DECREF(weakref); 616 | + return result; 617 | } 618 | 619 | std::string repr() const override { 620 | diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp 621 | index 23107d91d99..fcb55e641bc 100644 622 | --- a/torch/csrc/jit/python/pybind_utils.cpp 623 | +++ b/torch/csrc/jit/python/pybind_utils.cpp 624 | @@ -1,3 +1,4 @@ 625 | +#include 626 | #include 627 | #include 628 | #include 629 | @@ -31,14 +32,14 @@ ToIValueAllowNumbersAsTensors::~ToIValueAllowNumbersAsTensors() { 630 | // C++->Python. We need this because otherwise we may get the old Python object 631 | // if C++ creates a new object at the memory location of the deleted object. 632 | void clear_registered_instances(void* ptr) { 633 | - auto& registered_instances = 634 | - pybind11::detail::get_internals().registered_instances; 635 | - auto range = registered_instances.equal_range(ptr); 636 | - for (auto it = range.first; it != range.second; ++it) { 637 | - auto vh = it->second->get_value_and_holder(); 638 | - vh.set_instance_registered(false); 639 | - } 640 | - registered_instances.erase(ptr); 641 | + pybind11::detail::with_instance_map(ptr, [&](pybind11::detail::instance_map &instances) { 642 | + auto range = instances.equal_range(ptr); 643 | + for (auto it = range.first; it != range.second; ++it) { 644 | + auto vh = it->second->get_value_and_holder(); 645 | + vh.set_instance_registered(false); 646 | + } 647 | + instances.erase(ptr); 648 | + }); 649 | } 650 | 651 | // WARNING: Precondition for this function is that, e.g., you have tested if a 652 | diff --git a/torch/csrc/utils/nested.cpp b/torch/csrc/utils/nested.cpp 653 | index 29ccf312851..be66215d34b 100644 654 | --- a/torch/csrc/utils/nested.cpp 655 | +++ b/torch/csrc/utils/nested.cpp 656 | @@ -49,9 +49,10 @@ at::Tensor nested_tensor_ctor( 657 | // Check whether we are dealing with lists of tensors or not 658 | std::vector new_list(PyList_Size(data)); 659 | for (const auto i : c10::irange(PyList_Size(data))) { 660 | - PyObject* elem = PyList_GetItem(data, i); 661 | + PyObject* elem = PyList_GetItemRef(data, i); 662 | if (THPVariable_Check(elem)) { 663 | new_list[i] = THPVariable_Unpack(PyList_GetItem(data, i)).detach(); 664 | + Py_DECREF(elem); 665 | TORCH_CHECK( 666 | !new_list[i].is_nested(), 667 | "We do not accept nested tensors as input to nested tensors"); 668 | @@ -70,6 +71,7 @@ at::Tensor nested_tensor_ctor( 669 | }; 670 | elem_r.args = elem_args.data(); 671 | new_list[i] = tensor_ctor(dispatch_key, scalar_type, elem_r); 672 | + Py_DECREF(elem); 673 | } 674 | } 675 | 676 | --------------------------------------------------------------------------------