├── NVIDIA_CLA_v1.0.1.docx ├── docker ├── scripts │ └── validate_args.sh ├── entrypoint.d │ ├── 10-banner.sh │ └── 50-gpu-driver-check.sh ├── Dockerfile └── patches │ └── torch.patch ├── LICENSE ├── README.rst └── test └── simple.py /NVIDIA_CLA_v1.0.1.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/free-threaded-python/HEAD/NVIDIA_CLA_v1.0.1.docx -------------------------------------------------------------------------------- /docker/scripts/validate_args.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | 5 | # PYTHON_VERSION variable 6 | [[ "$PYTHON_VERSION" =~ ^3\.([0-9]+)$ ]] || { echo "PYTHON_VERSION must be in format '3.minor'"; exit 1; } 7 | minor=${BASH_REMATCH[1]} 8 | if (( minor > 13 )); then 9 | echo "Maximum supported Python version is 3.13 (got $PYTHON_VERSION)" 10 | exit 1 11 | fi 12 | 13 | # ENABLE_GIL variable check 14 | [[ "$ENABLE_GIL" =~ ^[01]$ ]] || { echo "ENABLE_GIL must be 0 or 1 (got '$ENABLE_GIL')"; exit 1; } 15 | if (( ENABLE_GIL == 0 )) && (( minor != 13 )); then 16 | echo "ENABLE_GIL=0 is only supported with Python 3.13 (got $PYTHON_VERSION)" 17 | exit 1 18 | fi 19 | -------------------------------------------------------------------------------- /docker/entrypoint.d/10-banner.sh: -------------------------------------------------------------------------------- 1 | echo "" 2 | echo "==========================" 3 | echo "== Free-threaded Python ==" 4 | echo "==========================" 5 | echo "" 6 | echo "Welcome to the Experimental Free-Threaded Python." 7 | echo "" 8 | echo "This environment is experimental and designed for testing and exploratory purposes." 9 | echo "Here you can investigate the behavior and performance of Python in a free-threaded context." 10 | echo "" 11 | echo "It is NOT production-ready." 12 | echo "" 13 | echo "Your experiences and insights are invaluable in helping us improve this environment." 14 | echo -e "Please report any bugs, issues, or suggestions via our \e]8;;https://github.com/NVIDIA/free-threaded-python/issues\aGitHub Issues\e]8;;\a." 15 | echo "" 16 | -------------------------------------------------------------------------------- /docker/entrypoint.d/50-gpu-driver-check.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | 3 | # Check if libcuda.so.1 -- the CUDA driver -- is present in the ld.so cache or in LD_LIBRARY_PATH 4 | _LIBCUDA_FROM_LD_CACHE=$(ldconfig -p | grep libcuda.so.1) 5 | _LIBCUDA_FROM_LD_LIBRARY_PATH=$( ( IFS=: ; for i in ${LD_LIBRARY_PATH}; do ls $i/libcuda.so.1 2>/dev/null | grep -v compat; done) ) 6 | _LIBCUDA_FOUND="${_LIBCUDA_FROM_LD_CACHE}${_LIBCUDA_FROM_LD_LIBRARY_PATH}" 7 | 8 | # Check if /dev/nvidiactl (like on Linux) or /dev/dxg (like on WSL2) or /dev/nvgpu (like on Tegra) is present 9 | _DRIVER_FOUND=$(ls /dev/nvidiactl /dev/dxg /dev/nvgpu 2>/dev/null) 10 | 11 | # If either is not true, then GPU functionality won't be usable. 12 | if [[ -z "${_LIBCUDA_FOUND}" || -z "${_DRIVER_FOUND}" ]]; then 13 | echo 14 | echo "WARNING: The NVIDIA Driver was not detected. GPU functionality will not be available." 15 | echo " Use the NVIDIA Container Toolkit to start this container with GPU support; see" 16 | echo " https://docs.nvidia.com/datacenter/cloud-native/ ." 
17 | export NVIDIA_CPU_ONLY=1 18 | fi 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Free-threaded Python 2 | ==================== 3 | 4 | 5 | What is this? 6 | ------------- 7 | 8 | Using this repository, you can build and test a free-threaded Python environment containing NVIDIA Python libraries. 9 | 10 | The Python Steering Council approved `PEP 703 <https://peps.python.org/pep-0703/>`_, which makes the `Global Interpreter Lock <https://docs.python.org/3/glossary.html#term-global-interpreter-lock>`_ optional in CPython. The first Python release that allows truly parallel thread execution is 3.13 (released in October 2024). However, for some time yet you won't be able to simply ``pip install`` your favourite extensions and libraries. 11 | 12 | The provided Docker image allows you to try the free-threaded environment yourself. We've included build routines for some popular NVIDIA extensions (`"My library is missing"`_). Should you encounter any bugs or problems, please let us know in the Issues. 13 | 14 | Please note: `This is experimental software!`_ 15 | 16 | How to try it? 17 | -------------- 18 | 19 | #. Clone the repository:: 20 | 21 | $ git clone https://github.com/NVIDIA/free-threaded-python.git $ cd free-threaded-python 22 | 23 | #. Build the free-threaded Python environment (the Dockerfile and its build context live in the ``docker`` directory):: 24 | 25 | 26 | $ docker build -t free-threaded-python docker 27 | 28 | #. Try it:: 29 | 30 | $ docker run -it --gpus all -v $(pwd)/test:/test free-threaded-python python3 /test/simple.py 31 |
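To confirm that the interpreter inside the container is a free-threaded build, and to see whether the GIL is actually disabled at runtime, a quick check along these lines should work on CPython 3.13 (``sysconfig.get_config_var('Py_GIL_DISABLED')`` reports the build flag, while ``sys._is_gil_enabled()`` reports the runtime state)::

    $ docker run -it free-threaded-python python3 -c "import sys, sysconfig; print('free-threaded build:', bool(sysconfig.get_config_var('Py_GIL_DISABLED'))); print('GIL enabled at runtime:', sys._is_gil_enabled())"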
32 | This is experimental software! 33 | ------------------------------ 34 | 35 | Removing the GIL is a breaking change for many Python extensions, and it will take years for the ecosystem to fully adapt to parallel execution. Moreover, this will not be possible without the community. **Please keep in mind that this is by no means production-ready software.** Should you run into any bugs, please let us know in the Issues. Contributions and pull requests are also welcome. 36 | 37 | "My library is missing" 38 | ----------------------- 39 | 40 | It's not our intention to recreate the whole Python ecosystem. However, if a Python library you use is missing here, please let us know in the Issues. Although we focus mostly on the NVIDIA Python ecosystem, we'll do our best to include the most popular extensions in this environment configuration. 41 | -------------------------------------------------------------------------------- /test/simple.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: MIT 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | 22 | import math 23 | import time 24 | import threading 25 | 26 | doc = """ 27 | This is a simple test of the free-threaded Python environment. It runs a computation-heavy 28 | task with an alternating number of threads. In a free-threaded build (given enough CPU cores), the 29 | execution times of the runs should be similar; with the GIL enabled, runs with more threads take proportionally longer. 30 | """ 31 | 32 | 33 | def computational_heavy(iterations): 34 | val = 0 35 | sin = math.sin 36 | cos = math.cos 37 | for i in range(1, iterations): 38 | val += sin(i) * cos(i) 39 | return val 40 | 41 | 42 | def test(thread_id, iterations=1000000): 43 | computational_heavy(iterations) 44 | 45 | 46 | print(doc) 47 | 48 | num_threads = [2, 18, 2, 18, 2, 18] 49 | 50 | for nt in num_threads: 51 | threads = [ 52 | threading.Thread(target=test, name=f"Thread{i}", args=(i,)) for i in range(nt) 53 | ] 54 | start = time.perf_counter_ns() 55 | for t in threads: 56 | t.start() 57 | for t in threads: 58 | t.join() 59 | stop = time.perf_counter_ns() 60 | print(f"{nt=}.\tElapsed time {stop-start} ns") 61 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | # SPDX-License-Identifier: MIT 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | 22 | ARG CUDA_VERSION=12.4.0 23 | 24 | FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 25 | 26 | ARG PYTHON_VERSION=3.13 27 | ARG ENABLE_GIL=0 28 | ENV PYTHON_GIL=$ENABLE_GIL 29 | 30 | COPY --chmod=744 scripts/validate_args.sh /tmp/validate_args.sh 31 | RUN PYTHON_VERSION=$PYTHON_VERSION ENABLE_GIL=$ENABLE_GIL /tmp/validate_args.sh 32 | 33 | # Install build tools 34 | ARG DEBIAN_FRONTEND=noninteractive 35 | RUN apt update && apt install -y \ 36 | build-essential \ 37 | clang \ 38 | lld \ 39 | llvm \ 40 | zlib1g-dev \ 41 | binutils \ 42 | zlib1g-dev \ 43 | xz-utils \ 44 | tk-dev \ 45 | libssl-dev \ 46 | libbz2-dev \ 47 | libreadline-dev \ 48 | libncursesw5-dev \ 49 | libsqlite3-dev \ 50 | libxml2-dev \ 51 | libxmlsec1-dev \ 52 | libffi-dev \ 53 | liblzma-dev \ 54 | curl \ 55 | git \ 56 | cmake \ 57 | wget 58 | 59 | WORKDIR /opt 60 | 61 | # Remove system Python 3.10 62 | RUN apt remove --purge -y python3.10 && apt autoremove -y 63 | 64 | # Build and install the CPython from source 65 | RUN GIL_FLAG=$([ "$ENABLE_GIL" = "0" ] && echo "--disable-gil" || echo "") && \ 66 | git clone -b "$PYTHON_VERSION" --recursive -j"$(grep ^processor /proc/cpuinfo | wc -l)" https://github.com/python/cpython.git && cd cpython && \ 67 | mkdir build && cd build && \ 68 | CC=clang CXX=clang++ ../configure --prefix=/usr/ --enable-optimizations --with-lto --enable-shared $GIL_FLAG && \ 69 | LDFLAGS="-fuse-ld=lld" make -j"$(grep ^processor /proc/cpuinfo | wc -l)" && \ 70 | make install 71 | 72 | RUN update-alternatives --install /usr/bin/python python $(which python$PYTHON_VERSION) 0 && \ 73 | update-alternatives --install /usr/local/bin/python3 python3 $(which python$PYTHON_VERSION) 0 && \ 74 | update-alternatives --force --install /usr/bin/pip pip $(which pip$PYTHON_VERSION) 0 75 | 76 | # General build dependencies 77 | RUN pip install setuptools wheel clang==14 libclang==14.0.1 'cython>=3.1.0b1' 78 | 79 | RUN pip install numpy Pillow warp-lang && \ 80 | # Disable build isolation to use system-installed Cython 81 | pip install --no-build-isolation nvtx 82 | 83 | # PyTorch nightly build 84 | RUN if [ "$(echo "$CUDA_VERSION" | tr -d . 
| head -c 3)" != 124 ]; then \ 85 | echo "No available free-threaded PyTorch wheels for CUDA version $CUDA_VERSION"; \ 86 | else \ 87 | python3 -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124; \ 88 | fi 89 | 90 | # nvImageCodec provides 3.13t wheels 91 | RUN suffix="$(echo "$CUDA_VERSION" | tr -d . | head -c 2)" && \ 92 | if ! echo "$suffix" | grep -wq -e 11 -e 12; then \ 93 | echo "No available nvImageCodec wheels for CUDA version $CUDA_VERSION"; \ 94 | else \ 95 | python3 -m pip install nvidia-nvimgcodec-cu$suffix; \ 96 | fi 97 | 98 | ARG CUDA_ARCHS='60;70;80;90' 99 | 100 | # Install CV-CUDA from source 101 | # Clone and patch 102 | RUN apt install -y git-lfs patchelf 103 | RUN git clone -b v0.8.0-beta https://github.com/CVCUDA/CV-CUDA.git && \ 104 | cd CV-CUDA && sed -i 's/skip_precommit=0/skip_precommit=1/g' init_repo.sh && \ 105 | ./init_repo.sh && cd 3rdparty/pybind11 && git submodule update --init . && git checkout v2.13.6 106 | # Build and install 107 | RUN cd CV-CUDA && \ 108 | CUDA_MAJOR=12 ci/build.sh release build -DCMAKE_CUDA_ARCHITECTURES="$CUDA_ARCHS" && \ 109 | python3 -m pip install build/python$PYTHON_VERSION/wheel 110 | 111 | # Install cuDNN FE 112 | RUN apt install -y cudnn && \ 113 | CUDNN_INCLUDE_DIR=/usr/include CMAKE_POLICY_VERSION_MINIMUM=3.5 \ 114 | python3 -m pip install git+https://github.com/NVIDIA/cudnn-frontend.git 115 | 116 | # Install CUDA-Python 117 | RUN git clone -b "v$CUDA_VERSION" https://github.com/NVIDIA/cuda-python && cd cuda-python && \ 118 | python3 -m pip install -r requirements.txt && export CUDA_HOME=/usr/local/cuda && \ 119 | PARALLEL_LEVEL="$(grep ^processor /proc/cpuinfo | wc -l)" CC=gcc CXX=g++ python3 setup.py bdist_wheel install && \ 120 | python3 -m pip install dist/*.whl 121 | # Install cuda.core 122 | RUN cd cuda-python && git switch main && \ 123 | cd cuda_core && python3 -m pip install --no-build-isolation . 124 | 125 | # Install Nsight Systems 126 | RUN wget -O nsight.deb https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2023_4_1_97/nsightsystems-linux-cli-public-2023.4.1.97-3355750.deb/ && \ 127 | dpkg -i nsight.deb 128 | 129 | RUN rm -r /opt/nvidia/entrypoint.d/* 130 | COPY entrypoint.d /opt/nvidia/entrypoint.d 131 | -------------------------------------------------------------------------------- /docker/patches/torch.patch: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: MIT 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | 22 | diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp 23 | index a3f82633037..0c3a8b03435 100644 24 | --- a/torch/csrc/Storage.cpp 25 | +++ b/torch/csrc/Storage.cpp 26 | @@ -214,7 +214,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { 27 | PyObject_GC_UnTrack(self); 28 | } 29 | 30 | - bool has_finalizer = type->tp_finalize || type->tp_del; 31 | + // bool has_finalizer = type->tp_finalize || type->tp_del; 32 | 33 | if (type->tp_finalize) { 34 | PyObject_GC_Track(self); 35 | @@ -236,13 +236,14 @@ static void THPStorage_subclass_dealloc(PyObject* self) { 36 | if (type->tp_del) { 37 | PyObject_GC_Track(self); 38 | type->tp_del(self); 39 | - if (self->ob_refcnt > 0) { 40 | + if (Py_REFCNT(self) > 0) { 41 | // Resurrected (see above comment about resurrection from `__del__`) 42 | return; 43 | } 44 | PyObject_GC_UnTrack(self); 45 | } 46 | 47 | +#if 0 // there's a risk of missing weak refs but _PyWeakref_ClearRef was moved to internal API so this doesn't compile anymore 48 | if (has_finalizer) { 49 | /* New weakrefs could be created during the finalizer call. 50 | If this occurs, clear them out without calling their 51 | @@ -256,6 +257,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { 52 | _PyWeakref_ClearRef(*list); 53 | } 54 | } 55 | +#endif 56 | 57 | // Clear slots 58 | { 59 | diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp 60 | index 8f5297e87fe..1186bcc90f3 100644 61 | --- a/torch/csrc/autograd/python_function.cpp 62 | +++ b/torch/csrc/autograd/python_function.cpp 63 | @@ -1,3 +1,4 @@ 64 | +#include 65 | #include 66 | 67 | #include 68 | @@ -1017,12 +1018,14 @@ static void _trace_post_record( 69 | } 70 | 71 | node->i_(jit::attr::inplace, is_inplace); 72 | - if (PyObject* module_name = PyDict_GetItemString( 73 | - ((PyTypeObject*)op_obj)->tp_dict, "__module__")) { 74 | + PyObject *module_name = nullptr; 75 | + PyDict_GetItemStringRef(((PyTypeObject*)op_obj)->tp_dict, "__module__", &module_name); 76 | + if (module_name != nullptr) { 77 | if (auto ptr = PyUnicode_AsUTF8(module_name)) { 78 | node->s_(jit::attr::module, std::string(ptr)); 79 | } 80 | } 81 | + Py_XDECREF(module_name); 82 | 83 | // Isolate C variable ptrs in a vector 84 | int num_outputs = PyTuple_GET_SIZE(output_objects); 85 | diff --git a/torch/csrc/autograd/python_hook.cpp b/torch/csrc/autograd/python_hook.cpp 86 | index c29e003a0b7..ba4099728c3 100644 87 | --- a/torch/csrc/autograd/python_hook.cpp 88 | +++ b/torch/csrc/autograd/python_hook.cpp 89 | @@ -67,7 +67,7 @@ bool _call_hooks(PyObject* dict, PyObject* args) { 90 | bool is_modified = false; 91 | const auto len = PyList_Size(hooks); 92 | for (Py_ssize_t idx = 0; idx < len; ++idx) { 93 | - const auto hook = PyList_GetItem(hooks, idx); 94 | + const auto hook = PyList_GetItem(hooks, idx); // borrowed ref 95 | 96 | THPObjectPtr res(PyObject_CallObject(hook, args)); 97 | if (!res) 98 | diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp 99 | index 3705ac5e423..ae47fa5a689 100644 100 | --- a/torch/csrc/autograd/python_variable.cpp 101 | +++ b/torch/csrc/autograd/python_variable.cpp 102 | @@ -1891,7 +1891,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { 103 | 
PyObject_GC_UnTrack(self); 104 | // TODO: consider using trash can 105 | 106 | - bool has_finalizer = type->tp_finalize || type->tp_del; 107 | + // bool has_finalizer = type->tp_finalize || type->tp_del; 108 | 109 | if (type->tp_finalize) { 110 | PyObject_GC_Track(self); 111 | @@ -1910,13 +1910,14 @@ void THPVariable_subclass_dealloc(PyObject* self) { 112 | if (type->tp_del) { 113 | PyObject_GC_Track(self); 114 | type->tp_del(self); 115 | - if (self->ob_refcnt > 0) { 116 | + if (Py_REFCNT(self) > 0) { 117 | /* Resurrected */ 118 | return; 119 | } 120 | PyObject_GC_UnTrack(self); 121 | } 122 | 123 | +#if 0 // there's a risk of missing weak refs but _PyWeakref_ClearRef was moved to internal API so this doesn't compile anymore 124 | if (has_finalizer) { 125 | /* New weakrefs could be created during the finalizer call. 126 | If this occurs, clear them out without calling their 127 | @@ -1930,6 +1931,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { 128 | _PyWeakref_ClearRef(*list); 129 | } 130 | } 131 | +#endif 132 | 133 | // Clear all slots until we get to base class THPVariableType 134 | { 135 | diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c 136 | index bf710b9ff7e..d740af81c86 100644 137 | --- a/torch/csrc/dynamo/cpython_defs.c 138 | +++ b/torch/csrc/dynamo/cpython_defs.c 139 | @@ -25,17 +25,16 @@ 140 | #endif 141 | 142 | #define Py_BUILD_CORE 143 | -#include 144 | -#define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt, _PyOpcode_Caches 145 | -#include 146 | +#define NEED_OPCODE_METADATA // To get _PyOpcode_Deopt, _PyOpcode_Caches 147 | +#include 148 | #undef NEED_OPCODE_TABLES 149 | #undef Py_BUILD_CORE 150 | #include 151 | 152 | // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces 153 | // us to manually re-check that the function didn't change on the next major version 154 | -#if PY_VERSION_HEX >= 0x030D0000 // 3.13 155 | -#error "Please ensure that the functions below still match the CPython implementation for 3.13" 156 | +#if PY_VERSION_HEX >= 0x030E0000 // 3.14 157 | +#error "Please ensure that the functions below still match the CPython implementation for 3.14" 158 | #endif 159 | 160 | // https://github.com/python/cpython/blob/a7715ccfba5b86ab09f86ec56ac3755c93b46b48/Objects/frameobject.c#L1079 161 | @@ -45,8 +44,8 @@ THP_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame, int opcode, int oparg) 162 | // This only works when opcode is a non-quickened form: 163 | CHECK(_PyOpcode_Deopt[opcode] == opcode); 164 | int check_oparg = 0; 165 | - for (_Py_CODEUNIT *instruction = _PyCode_CODE(frame->f_code); 166 | - instruction < frame->prev_instr; instruction++) 167 | + for (_Py_CODEUNIT *instruction = _PyCode_CODE(_PyFrame_GetCode(frame)); 168 | + instruction < frame->instr_ptr; instruction++) 169 | { 170 | int check_opcode = _PyOpcode_Deopt[_Py_OPCODE(*instruction)]; 171 | check_oparg |= _Py_OPARG(*instruction); 172 | @@ -75,7 +74,7 @@ frame_init_get_vars(_PyInterpreterFrame *frame, int *free_vars_copied) 173 | { 174 | // COPY_FREE_VARS has no quickened forms, so no need to use _PyOpcode_Deopt 175 | // here: 176 | - PyCodeObject *co = frame->f_code; 177 | + PyCodeObject *co = _PyFrame_GetCode(frame); 178 | int lasti = _PyInterpreterFrame_LASTI(frame); 179 | if (!(lasti < 0 && _PyCode_CODE(co)->op.code == COPY_FREE_VARS 180 | && PyFunction_Check(frame->f_funcobj))) 181 | @@ -86,13 +85,13 @@ frame_init_get_vars(_PyInterpreterFrame *frame, int *free_vars_copied) 182 | 183 | /* Free vars have not been initialized -- 
Do that */ 184 | PyObject *closure = ((PyFunctionObject *)frame->f_funcobj)->func_closure; 185 | - int offset = PyCode_GetFirstFree(co); 186 | + int offset = PyUnstable_Code_GetFirstFree(co); 187 | for (int i = 0; i < co->co_nfreevars; ++i) { 188 | PyObject *o = PyTuple_GET_ITEM(closure, i); 189 | frame->localsplus[offset + i] = Py_NewRef(o); 190 | } 191 | // COPY_FREE_VARS doesn't have inline CACHEs, either: 192 | - frame->prev_instr = _PyCode_CODE(frame->f_code); 193 | + frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)); 194 | 195 | *free_vars_copied = 1; 196 | } 197 | @@ -175,7 +174,7 @@ THP_PyFrame_GetLocals(_PyInterpreterFrame *frame, int include_hidden, int *free_ 198 | 199 | frame_init_get_vars(frame, free_vars_copied); 200 | 201 | - PyCodeObject *co = frame->f_code; 202 | + PyCodeObject *co = _PyFrame_GetCode(frame); 203 | for (int i = 0; i < co->co_nlocalsplus; i++) { 204 | PyObject *value; // borrowed reference 205 | if (!frame_get_var(frame, co, i, &value)) { 206 | @@ -411,7 +410,7 @@ THP_PyFrame_New_NoTrack(const PyCodeObject *code) 207 | f->f_trace = NULL; 208 | f->f_trace_lines = 1; 209 | f->f_trace_opcodes = 0; 210 | - f->f_fast_as_locals = 0; 211 | + f->f_extra_locals = NULL; 212 | f->f_lineno = 0; 213 | return f; 214 | } 215 | @@ -424,7 +423,7 @@ THP_PyFrame_MakeAndSetFrameObject(_PyInterpreterFrame *frame) 216 | PyObject *error_type = NULL, *error_value = NULL, *error_traceback = NULL; 217 | PyErr_Fetch(&error_type, &error_value, &error_traceback); 218 | 219 | - PyFrameObject *f = THP_PyFrame_New_NoTrack(frame->f_code); 220 | + PyFrameObject *f = THP_PyFrame_New_NoTrack(_PyFrame_GetCode(frame)); 221 | if (f == NULL) { 222 | Py_XDECREF(error_type); 223 | Py_XDECREF(error_value); 224 | @@ -484,8 +483,8 @@ THP_take_ownership(PyFrameObject *f, _PyInterpreterFrame *frame) 225 | if (_PyFrame_IsIncomplete(frame)) { 226 | // This may be a newly-created generator or coroutine frame. Since it's 227 | // dead anyways, just pretend that the first RESUME ran: 228 | - PyCodeObject *code = frame->f_code; 229 | - frame->prev_instr = _PyCode_CODE(code) + code->_co_firsttraceable; 230 | + PyCodeObject *code = _PyFrame_GetCode(frame); 231 | + frame->instr_ptr = _PyCode_CODE(code) + code->_co_firsttraceable; 232 | } 233 | CHECK(!_PyFrame_IsIncomplete(frame)); 234 | CHECK(f->f_back == NULL); 235 | @@ -523,7 +522,7 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) 236 | _PyFrame_GetGenerator(frame)->gi_frame_state == FRAME_CLEARED); 237 | // GH-99729: Clearing this frame can expose the stack (via finalizers). 
It's 238 | // crucial that this frame has been unlinked, and is no longer visible: 239 | - CHECK(_PyThreadState_GET()->cframe->current_frame != frame); 240 | + CHECK(PyThreadState_GET()->current_frame != frame); 241 | if (frame->frame_obj) { 242 | PyFrameObject *f = frame->frame_obj; 243 | frame->frame_obj = NULL; 244 | @@ -546,7 +545,7 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) 245 | #else 246 | Py_DECREF(frame->f_func); 247 | #endif 248 | - Py_DECREF(frame->f_code); 249 | + Py_DECREF(_PyFrame_GetCode(frame)); 250 | } 251 | 252 | // https://github.com/python/cpython/blob/fad48ea1816be3125ea51edcdfe2f999d6ade796/Objects/obmalloc.c#L635 253 | diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h 254 | index b762f87d69d..d4432b8bb43 100644 255 | --- a/torch/csrc/dynamo/cpython_defs.h 256 | +++ b/torch/csrc/dynamo/cpython_defs.h 257 | @@ -8,7 +8,9 @@ 258 | 259 | #if IS_PYTHON_3_11_PLUS 260 | 261 | +#define Py_BUILD_CORE 262 | #include 263 | +#undef Py_BUILD_CORE 264 | 265 | int THP_PyFrame_FastToLocalsWithError( 266 | _PyInterpreterFrame* frame, 267 | diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c 268 | index b6a26f635ec..34eafba173b 100644 269 | --- a/torch/csrc/dynamo/eval_frame.c 270 | +++ b/torch/csrc/dynamo/eval_frame.c 271 | @@ -18,7 +18,6 @@ 272 | // see https://bugs.python.org/issue35886 273 | #if PY_VERSION_HEX >= 0x03080000 274 | #define Py_BUILD_CORE 275 | -#include 276 | 277 | // These headers were added in 3.11 278 | #if IS_PYTHON_3_11_PLUS 279 | @@ -58,7 +57,7 @@ DECLARE_PYOBJ_ATTR(f_func) 280 | DECLARE_PYOBJ_ATTR(f_globals) 281 | DECLARE_PYOBJ_ATTR(f_builtins) 282 | DECLARE_PYOBJ_ATTR(f_locals) 283 | -DECLARE_PYOBJ_ATTR(f_code) 284 | +DECLARE_PYOBJ_ATTR(f_executable) 285 | DECLARE_PYOBJ_ATTR(frame_obj) 286 | 287 | #undef DECLARE_PYOBJ_ATTR 288 | @@ -76,7 +75,7 @@ static PyObject* THPPyInterpreterFrame_f_lasti(THPPyInterpreterFrame* self, PyOb 289 | 290 | static PyObject* THPPyInterpreterFrame_f_lineno(THPPyInterpreterFrame* self, PyObject* _noargs) { 291 | if (!self->frame->frame_obj) { 292 | - return PyLong_FromLong(self->frame->f_code->co_firstlineno); 293 | + return PyLong_FromLong((_PyFrame_GetCode(self->frame))->co_firstlineno); 294 | } 295 | int lineno = PyFrame_GetLineNumber(self->frame->frame_obj); 296 | if (lineno < 0) { 297 | @@ -102,7 +101,7 @@ static struct PyGetSetDef THPPyInterpreterFrame_properties[] = { 298 | {"f_globals", (getter)THPPyInterpreterFrame_f_globals, NULL, NULL, NULL}, 299 | {"f_builtins", (getter)THPPyInterpreterFrame_f_builtins, NULL, NULL, NULL}, 300 | {"f_locals", (getter)THPPyInterpreterFrame_f_locals, NULL, NULL, NULL}, 301 | - {"f_code", (getter)THPPyInterpreterFrame_f_code, NULL, NULL, NULL}, 302 | + {"f_executable", (getter)THPPyInterpreterFrame_f_executable, NULL, NULL, NULL}, 303 | {"frame_obj", (getter)THPPyInterpreterFrame_frame_obj, NULL, NULL, NULL}, 304 | {"previous", (getter)THPPyInterpreterFrame_previous, NULL, NULL, NULL}, 305 | {"f_lasti", (getter)THPPyInterpreterFrame_f_lasti, NULL, NULL, NULL}, 306 | @@ -239,8 +238,8 @@ inline static void enable_eval_frame_default(PyThreadState* tstate) { 307 | 308 | inline static const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) { 309 | // Returns the C string name of the current frame. 
310 | - DEBUG_CHECK(PyUnicode_Check(frame->f_code->co_name)); 311 | - return PyUnicode_AsUTF8(frame->f_code->co_name); 312 | + DEBUG_CHECK(PyUnicode_Check(_PyFrame_GetCode(frame)->co_name)); 313 | + return PyUnicode_AsUTF8(_PyFrame_GetCode(frame)->co_name); 314 | } 315 | 316 | static inline PyObject* call_callback( 317 | @@ -327,7 +326,7 @@ inline static PyObject* eval_custom_code_impl( 318 | 319 | PyObject** fastlocals_old = frame->localsplus; 320 | PyObject** fastlocals_new = shadow->localsplus; 321 | - Py_ssize_t n_old = frame->f_code->co_nlocalsplus; 322 | + Py_ssize_t n_old = _PyFrame_GetCode(frame)->co_nlocalsplus; 323 | Py_ssize_t n_new = code->co_nlocalsplus; 324 | 325 | // localsplus are XINCREF'd by default eval frame, so all values must be valid. 326 | @@ -341,8 +340,8 @@ inline static PyObject* eval_custom_code_impl( 327 | // for 3.11+, if free_vars_copied is true, we do not need to 328 | // run the first COPY_FREE_VARS since THP_PyFrame_FastToLocalsWithError 329 | // already did the equivalent action. 330 | - if (free_vars_copied && _Py_OPCODE(_PyCode_CODE(shadow->f_code)[0]) == COPY_FREE_VARS) { 331 | - shadow->prev_instr = _PyCode_CODE(shadow->f_code); 332 | + if (free_vars_copied && _Py_OPCODE(_PyCode_CODE(_PyFrame_GetCode(shadow))[0]) == COPY_FREE_VARS) { 333 | + shadow->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(shadow)); 334 | } 335 | 336 | #else 337 | @@ -399,11 +398,12 @@ inline static PyObject* eval_custom_code_impl( 338 | 339 | // copy args 340 | // according to https://docs.python.org/3/library/inspect.html , `co_argcount` is the number of arguments (not including keyword only arguments, * or ** args). so we need to add `co_kwonlyargcount` and `co_flags` to get the total number of arguments. 341 | - // !!(frame->f_code->co_flags & CO_VARARGS) is 1 if the function has *args, 0 otherwise 342 | - // !!(frame->f_code->co_flags & CO_VARKEYWORDS) is 1 if the function has **kwargs, 0 otherwise 343 | + // !!(_PyFrame_GetCode(frame)->co_flags & CO_VARARGS) is 1 if the function has *args, 0 otherwise 344 | + // !!(_PyFrame_GetCode(frame)->co_flags & CO_VARKEYWORDS) is 1 if the function has **kwargs, 0 otherwise 345 | // they convert bit flags to 0 or 1, and avoid branching. 346 | // This is performance critical code, so we really care about performance. 347 | - Py_ssize_t total_argcount_old = frame->f_code->co_argcount + frame->f_code->co_kwonlyargcount + !!(frame->f_code->co_flags & CO_VARARGS) + !!(frame->f_code->co_flags & CO_VARKEYWORDS); 348 | + PyCodeObject* f_code = _PyFrame_GetCode(frame); 349 | + Py_ssize_t total_argcount_old = f_code->co_argcount + f_code->co_kwonlyargcount + !!(f_code->co_flags & CO_VARARGS) + !!(f_code->co_flags & CO_VARKEYWORDS); 350 | 351 | for (Py_ssize_t i = 0; i < total_argcount_old; i++) { 352 | Py_XINCREF(fastlocals_old[i]); 353 | @@ -411,7 +411,7 @@ inline static PyObject* eval_custom_code_impl( 354 | } 355 | 356 | // copy free vars 357 | - Py_ssize_t nfrees_old = PyCode_GetNFreevars(frame->f_code); 358 | + Py_ssize_t nfrees_old = PyCode_GetNFreevars(f_code); 359 | 360 | for (Py_ssize_t i = 0; i < nfrees_old; i++) { 361 | Py_XINCREF(fastlocals_old[n_old - 1 - i]); 362 | @@ -425,7 +425,7 @@ inline static PyObject* eval_custom_code_impl( 363 | // this is straightforward in Python 3.11 and higher, as there are bit flags in `co_localspluskinds` to tell if a variable is a cell variable. 
364 | // in Python 3.10 and lower, essentially we are checking if a variable is a new local variable (because of the layout mentioned above, the first variable that is not cell variable is the first new local variable). the corresponding slot in `flocalsplus` is NULL for new local variables. 365 | #if IS_PYTHON_3_11_PLUS 366 | - if(!(_PyLocals_GetKind(frame->f_code->co_localspluskinds, i) & CO_FAST_CELL)) 367 | + if(!(_PyLocals_GetKind(f_code->co_localspluskinds, i) & CO_FAST_CELL)) 368 | { 369 | break; 370 | } 371 | @@ -526,14 +526,14 @@ static PyObject* _custom_eval_frame( 372 | DEBUG_TRACE( 373 | "begin %s %s %i %i", 374 | get_frame_name(frame), 375 | - PyUnicode_AsUTF8(frame->f_code->co_filename), 376 | - frame->f_code->co_firstlineno, 377 | + PyUnicode_AsUTF8(_PyFrame_GetCode(frame)->co_filename), 378 | + _PyFrame_GetCode(frame)->co_firstlineno, 379 | _PyInterpreterFrame_LASTI(frame)); 380 | #else 381 | DEBUG_TRACE( 382 | "begin %s %s %i %i %i", 383 | get_frame_name(frame), 384 | - PyUnicode_AsUTF8(frame->f_code->co_filename), 385 | + PyUnicode_AsUTF8(_PyFrame_GetCode(frame)->co_filename), 386 | frame->f_lineno, 387 | frame->f_lasti, 388 | frame->f_iblock); 389 | @@ -564,14 +564,14 @@ static PyObject* _custom_eval_frame( 390 | return eval_frame_default(tstate, frame, throw_flag); 391 | } 392 | 393 | - ExtraState* extra = get_extra_state(frame->f_code); 394 | + ExtraState* extra = get_extra_state(_PyFrame_GetCode(frame)); 395 | if (extra == SKIP_CODE || (callback == Py_False && extra == NULL)) { 396 | DEBUG_TRACE("skip %s", get_frame_name(frame)); 397 | return eval_frame_default(tstate, frame, throw_flag); 398 | } 399 | 400 | if (extra == NULL) { 401 | - extra = init_and_set_extra_state(frame->f_code); 402 | + extra = init_and_set_extra_state(_PyFrame_GetCode(frame)); 403 | } 404 | 405 | // TODO(jansel): investigate directly using the "fast" representation 406 | @@ -667,7 +667,7 @@ static PyObject* _custom_eval_frame( 407 | } else { 408 | DEBUG_TRACE("create skip %s", get_frame_name(frame)); 409 | Py_DECREF(result); 410 | - set_extra_state(frame->f_code, SKIP_CODE); 411 | + set_extra_state(_PyFrame_GetCode(frame), SKIP_CODE); 412 | // Re-enable custom behavior 413 | eval_frame_callback_set(callback); 414 | return eval_frame_default(tstate, frame, throw_flag); 415 | diff --git a/torch/csrc/dynamo/guards.cpp b/torch/csrc/dynamo/guards.cpp 416 | index d61ac4219a8..9b83ab200b7 100644 417 | --- a/torch/csrc/dynamo/guards.cpp 418 | +++ b/torch/csrc/dynamo/guards.cpp 419 | @@ -233,12 +233,12 @@ static std::vector> pyListToVecOptInt( 420 | std::vector> vec; 421 | Py_ssize_t size = PyList_Size(pyList); 422 | for (Py_ssize_t i = 0; i < size; i++) { 423 | - PyObject* item = PyList_GetItem(pyList, i); 424 | - auto handle = py::handle(item); 425 | + PyObject* item = PyList_GetItemRef(pyList, i); 426 | + auto obj = py::reinterpret_steal(item); 427 | if (item == Py_None) { 428 | vec.emplace_back(std::nullopt); 429 | - } else if (torch::is_symint(handle)) { 430 | - vec.emplace_back(py::cast(handle)); 431 | + } else if (torch::is_symint(obj)) { 432 | + vec.emplace_back(py::cast(obj)); 433 | } else { 434 | int64_t value = PyLong_AsLongLong(item); 435 | if (value == -1 && PyErr_Occurred()) { 436 | @@ -259,8 +259,9 @@ static std::vector>> get_dynamic_dims( 437 | if (dynamic_dims_py != Py_None) { 438 | Py_ssize_t size = PyList_Size(dynamic_dims_py); 439 | for (Py_ssize_t i = 0; i < size; i++) { 440 | - PyObject* py_list = PyList_GetItem(dynamic_dims_py, i); 441 | + PyObject* py_list = 
PyList_GetItemRef(dynamic_dims_py, i); 442 | std::vector> vec = pyListToVecOptInt(py_list); 443 | + Py_DECREF(py_list); 444 | per_tensor_dynamic_dims.push_back(std::move(vec)); 445 | } 446 | } 447 | @@ -411,13 +412,15 @@ PyObject* TensorGuards_check_verbose( 448 | std::vector tensor_check_names; 449 | tensor_check_names.reserve(names_size); 450 | for (auto i : c10::irange(names_size)) { 451 | - PyObject* value = PyList_GetItem(tensor_check_names_py, i); 452 | + PyObject* value = PyList_GetItemRef(tensor_check_names_py, i); 453 | if (!PyUnicode_Check(value)) { 454 | + Py_DECREF(value); 455 | PyErr_SetString( 456 | PyExc_TypeError, "tensor_check_names must only contain strings"); 457 | return nullptr; 458 | } 459 | tensor_check_names.emplace_back(PyUnicode_AsUTF8(value)); 460 | + Py_DECREF(value); 461 | } 462 | 463 | LocalState state; 464 | @@ -1121,10 +1124,11 @@ class DEFAULT_DEVICE : public LeafGuard { 465 | // leaked by design. 466 | static PyObject* current_device_str = 467 | PyUnicode_InternFromString("CURRENT_DEVICE"); 468 | - PyObject* device = PyDict_GetItem( 469 | - _utils_device_dict.ptr(), current_device_str); // borrowed ref 470 | + PyObject* device; 471 | + PyDict_GetItemRef(_utils_device_dict.ptr(), current_device_str, &device); 472 | if (device != _device.ptr()) { 473 | int result = PyObject_RichCompareBool(device, _device.ptr(), Py_EQ); 474 | + Py_DECREF(device); 475 | if (result == -1) { 476 | PyErr_Clear(); 477 | return false; 478 | @@ -2236,14 +2240,17 @@ class DictSubclassGuardManager : public DictGuardManager { 479 | return false; 480 | } 481 | 482 | - PyObject* value = PyDict_GetItem(obj, key); // borrowed ref 483 | + PyObject* value; 484 | + PyDict_GetItemRef(obj, key, &value); // new reference 485 | std::unique_ptr& value_manager = key_value_manager.second; 486 | if (value_manager && !value_manager->check_nopybind(value)) { 487 | Py_DECREF(key); 488 | + Py_XDECREF(value); 489 | Py_DECREF(iterator); 490 | return false; 491 | } 492 | 493 | + Py_XDECREF(value); 494 | index_pointer++; 495 | } 496 | dict_pointer++; 497 | @@ -2305,7 +2312,8 @@ class DictSubclassGuardManager : public DictGuardManager { 498 | } 499 | } 500 | 501 | - PyObject* value = PyDict_GetItem(obj, key); // borrowed ref 502 | + PyObject* value; 503 | + PyDict_GetItemRef(obj, key, &value); // new reference 504 | std::unique_ptr& value_manager = key_value_manager.second; 505 | if (value_manager) { 506 | GuardDebugInfo debug_info = 507 | @@ -2313,11 +2321,14 @@ class DictSubclassGuardManager : public DictGuardManager { 508 | num_guards_executed += debug_info.num_guards_executed; 509 | if (!debug_info.result) { 510 | Py_DECREF(key); 511 | + Py_XDECREF(value); 512 | Py_DECREF(iterator); 513 | return GuardDebugInfo( 514 | false, debug_info.verbose_code_parts, num_guards_executed); 515 | } 516 | } 517 | + 518 | + Py_XDECREF(value); 519 | index_pointer++; 520 | } 521 | Py_DECREF(key); 522 | @@ -2602,24 +2613,30 @@ class DictGetItemGuardAccessor : public GuardAccessor { 523 | // NB: Intentional duplication between check_nopybind and 524 | // check_verbose_nopybind. 
525 | bool check_nopybind(PyObject* obj) override { // borrowed ref 526 | - PyObject* x = PyDict_GetItem(obj, _key); // borrowed ref 527 | - if (x == nullptr) { 528 | + PyObject* x = nullptr; 529 | + int res = PyDict_GetItemRef(obj, _key, &x); // new reference 530 | + if (x == nullptr || res < 0) { 531 | + Py_XDECREF(x); 532 | PyErr_Clear(); 533 | return false; 534 | } 535 | bool result = _guard_manager->check_nopybind(x); 536 | + Py_DECREF(x); 537 | return result; 538 | } 539 | 540 | GuardDebugInfo check_verbose_nopybind( 541 | PyObject* obj) override { // borrowed ref 542 | - PyObject* x = PyDict_GetItem(obj, _key); // borrowed ref 543 | - if (x == nullptr) { 544 | + PyObject* x = nullptr; 545 | + int res = PyDict_GetItemRef(obj, _key, &x); // new reference 546 | + if (x == nullptr || res < 0) { 547 | + Py_XDECREF(x); 548 | PyErr_Clear(); 549 | return GuardDebugInfo( 550 | false, std::string("KeyError on ") + get_source(), 0); 551 | } 552 | GuardDebugInfo result = _guard_manager->check_verbose_nopybind(x); 553 | + Py_DECREF(x); 554 | return result; 555 | } 556 | 557 | @@ -3077,40 +3094,54 @@ class GlobalWeakRefGuardAccessor : public GuardAccessor { 558 | bool check_nopybind(PyObject* obj) override { // borrowed ref 559 | // obj is globals dict because GlobalWeakRefGuardAccessor has to be a 560 | // child of GlobalsGuardAccessor. 561 | - PyObject* weakref = PyDict_GetItem(obj, _global_name); // borrowed ref 562 | - if (weakref == nullptr) { 563 | + PyObject* weakref = nullptr; 564 | + int res = PyDict_GetItemRef(obj, _global_name, &weakref); // new reference 565 | + if (weakref == nullptr || res < 0) { 566 | // The weakref is not in the globals dict. 567 | + Py_XDECREF(weakref); 568 | PyErr_Clear(); 569 | return false; 570 | } 571 | 572 | if (!PyWeakref_Check(weakref)) { 573 | + Py_DECREF(weakref); 574 | return false; 575 | } 576 | 577 | - PyObject* x = PyWeakref_GetObject(weakref); // borrowed ref 578 | - return _guard_manager->check_nopybind(x); 579 | + PyObject* x; 580 | + PyWeakref_GetRef(weakref, &x); 581 | + bool result = _guard_manager->check_nopybind(x); 582 | + Py_DECREF(x); 583 | + Py_DECREF(weakref); 584 | + return result; 585 | } 586 | 587 | GuardDebugInfo check_verbose_nopybind( 588 | PyObject* obj) override { // borrowed ref 589 | // obj is globals dict because GlobalWeakRefGuardAccessor has to be a 590 | // child of GlobalsGuardAccessor. 591 | - PyObject* weakref = PyDict_GetItem(obj, _global_name); // borrowed ref 592 | - if (weakref == nullptr) { 593 | + PyObject* weakref = nullptr; 594 | + int res = PyDict_GetItemRef(obj, _global_name, &weakref); // new reference 595 | + if (weakref == nullptr || res < 0) { 596 | // The weakref is not in the globals dict. 
597 | + Py_XDECREF(weakref); 598 | PyErr_Clear(); 599 | return GuardDebugInfo( 600 | false, std::string("KeyError on ") + get_source(), 0); 601 | } 602 | 603 | if (!PyWeakref_Check(weakref)) { 604 | + Py_DECREF(weakref); 605 | return GuardDebugInfo( 606 | false, std::string("Not a weakref ") + get_source(), 0); 607 | } 608 | 609 | - PyObject* x = PyWeakref_GetObject(weakref); // borrowed ref 610 | - return _guard_manager->check_verbose_nopybind(x); 611 | + PyObject* x; 612 | + PyWeakref_GetRef(weakref, &x); 613 | + GuardDebugInfo result = _guard_manager->check_verbose_nopybind(x); 614 | + Py_DECREF(x); 615 | + Py_DECREF(weakref); 616 | + return result; 617 | } 618 | 619 | std::string repr() const override { 620 | diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp 621 | index 23107d91d99..fcb55e641bc 100644 622 | --- a/torch/csrc/jit/python/pybind_utils.cpp 623 | +++ b/torch/csrc/jit/python/pybind_utils.cpp 624 | @@ -1,3 +1,4 @@ 625 | +#include 626 | #include 627 | #include 628 | #include 629 | @@ -31,14 +32,14 @@ ToIValueAllowNumbersAsTensors::~ToIValueAllowNumbersAsTensors() { 630 | // C++->Python. We need this because otherwise we may get the old Python object 631 | // if C++ creates a new object at the memory location of the deleted object. 632 | void clear_registered_instances(void* ptr) { 633 | - auto& registered_instances = 634 | - pybind11::detail::get_internals().registered_instances; 635 | - auto range = registered_instances.equal_range(ptr); 636 | - for (auto it = range.first; it != range.second; ++it) { 637 | - auto vh = it->second->get_value_and_holder(); 638 | - vh.set_instance_registered(false); 639 | - } 640 | - registered_instances.erase(ptr); 641 | + pybind11::detail::with_instance_map(ptr, [&](pybind11::detail::instance_map &instances) { 642 | + auto range = instances.equal_range(ptr); 643 | + for (auto it = range.first; it != range.second; ++it) { 644 | + auto vh = it->second->get_value_and_holder(); 645 | + vh.set_instance_registered(false); 646 | + } 647 | + instances.erase(ptr); 648 | + }); 649 | } 650 | 651 | // WARNING: Precondition for this function is that, e.g., you have tested if a 652 | diff --git a/torch/csrc/utils/nested.cpp b/torch/csrc/utils/nested.cpp 653 | index 29ccf312851..be66215d34b 100644 654 | --- a/torch/csrc/utils/nested.cpp 655 | +++ b/torch/csrc/utils/nested.cpp 656 | @@ -49,9 +49,10 @@ at::Tensor nested_tensor_ctor( 657 | // Check whether we are dealing with lists of tensors or not 658 | std::vector new_list(PyList_Size(data)); 659 | for (const auto i : c10::irange(PyList_Size(data))) { 660 | - PyObject* elem = PyList_GetItem(data, i); 661 | + PyObject* elem = PyList_GetItemRef(data, i); 662 | if (THPVariable_Check(elem)) { 663 | new_list[i] = THPVariable_Unpack(PyList_GetItem(data, i)).detach(); 664 | + Py_DECREF(elem); 665 | TORCH_CHECK( 666 | !new_list[i].is_nested(), 667 | "We do not accept nested tensors as input to nested tensors"); 668 | @@ -70,6 +71,7 @@ at::Tensor nested_tensor_ctor( 669 | }; 670 | elem_r.args = elem_args.data(); 671 | new_list[i] = tensor_ctor(dispatch_key, scalar_type, elem_r); 672 | + Py_DECREF(elem); 673 | } 674 | } 675 | 676 | --------------------------------------------------------------------------------