├── .github └── workflows │ ├── pypi-publish.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── LICENSE_kernprof.txt ├── README.md ├── demo.ipynb ├── pytorch_memlab ├── __init__.py ├── courtesy.py ├── line_profiler │ ├── __init__.py │ ├── extension.py │ ├── line_profiler.py │ ├── line_records.py │ └── profile.py ├── mem_reporter.py └── utils.py ├── readme-output.png ├── setup.py └── test ├── __init__.py ├── test_courtesy.py ├── test_line_profiler.py └── test_mem_reporter.py /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | 16 | jobs: 17 | deploy: 18 | 19 | runs-on: ubuntu-latest 20 | 21 | steps: 22 | - uses: actions/checkout@v4.1.7 23 | - name: Set up Python 24 | uses: actions/setup-python@v5.1.1 25 | with: 26 | python-version: '3.11' 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install ipython pandas 31 | pip install .[test] 32 | - name: Build package 33 | run: python setup.py bdist 34 | - name: Publish package 35 | uses: pypa/gh-action-pypi-publish@v1.9.0 36 | with: 37 | user: __token__ 38 | password: ${{ secrets.PYPI_API_TOKEN }} 39 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies and run the test suite on every push 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Test 10 | 11 | on: push 12 | 13 | jobs: 14 | test: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ['3.8', '3.9', '3.10', '3.11'] 19 | 20 | steps: 21 | - uses: actions/checkout@v4.1.7 22 | - name: Set up Python 23 | uses: actions/setup-python@v5.1.1 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install ipython pandas 30 | pip install .[test] 31 | - name: Build package 32 | run: python setup.py bdist 33 | - name: Test 34 | run: | 35 | python -c 'import pytorch_memlab' 36 | pytest test/test_mem_reporter.py 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #### joe made this: http://goel.io/joe 2 | 3 | #####=== IPythonNotebook ===##### 4 | # Temporary data 5 | .ipynb_checkpoints/ 6 | 7 | #####=== Python ===##### 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | #####=== JetBrains ===##### 69 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 70 | 71 | *.iml 72 | 73 | ## Directory-based project format: 74 | .idea/ 75 | # if you remove the above rule, at least ignore the following: 76 | 77 | # User-specific stuff: 78 | # .idea/workspace.xml 79 | # .idea/tasks.xml 80 | # .idea/dictionaries 81 | 82 | # Sensitive or high-churn files: 83 | # .idea/dataSources.ids 84 | # .idea/dataSources.xml 85 | # .idea/sqlDataSources.xml 86 | # .idea/dynamic.xml 87 | # .idea/uiDesigner.xml 88 | 89 | # Gradle: 90 | # .idea/gradle.xml 91 | # .idea/libraries 92 | 93 | # Mongo Explorer plugin: 94 | # .idea/mongoSettings.xml 95 | 96 | ## File-based project format: 97 | *.ipr 98 | *.iws 99 | 100 | ## Plugin-specific files: 101 | 102 | # IntelliJ 103 | /out/ 104 | 105 | # mpeltonen/sbt-idea plugin 106 | .idea_modules/ 107 | 108 | # JIRA plugin 109 | atlassian-ide-plugin.xml 110 | 111 | # Crashlytics plugin (for Android Studio and IntelliJ) 112 | com_crashlytics_export_strings.xml 113 | crashlytics.properties 114 | crashlytics-build.properties 115 | 116 | .ropeproject 117 | 118 | #####=== VSCode ===##### 119 | 120 | .vscode 121 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kaiyu Shi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software 
and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE_kernprof.txt: -------------------------------------------------------------------------------- 1 | This software is OSI Certified Open Source Software. 2 | OSI Certified is a certification mark of the Open Source Initiative. 3 | 4 | Copyright (c) 2008, Enthought, Inc. 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | * Neither the name of Enthought, Inc. nor the names of its contributors may 16 | be used to endorse or promote products derived from this software without 17 | specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 23 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pytorch_memlab 2 | ====== 3 | [![Test](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/test.yml/badge.svg)](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/test.yml) 4 | [![Upload Python Package](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/pypi-publish.yml/badge.svg)](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/pypi-publish.yml) 5 | ![PyPI](https://img.shields.io/pypi/v/pytorch_memlab.svg) 6 | [![CodeQL: Python](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/github-code-scanning/codeql/badge.svg)](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/github-code-scanning/codeql) 7 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/pytorch_memlab.svg) 8 | 9 | A simple and accurate **CUDA** memory management laboratory for pytorch; 10 | it consists of several parts covering different aspects of CUDA memory: 11 | 12 | - Features: 13 | 14 | - Memory Profiler: A `line_profiler`-style CUDA memory profiler with a simple API. 15 | - Memory Reporter: A reporter to inspect tensors occupying the CUDA memory. 16 | - Courtesy: An interesting feature to temporarily move all the CUDA tensors into 17 | CPU memory for courtesy, and of course transfer them back afterwards. 18 | - IPython support through `%mlrun`/`%%mlrun` line/cell magic 19 | commands. 20 | 21 | 22 | - Table of Contents 23 | * [Installation](#installation) 24 | * [User-Doc](#user-doc) 25 | + [Memory Profiler](#memory-profiler) 26 | + [IPython support](#ipython-support) 27 | + [Memory Reporter](#memory-reporter) 28 | + [Courtesy](#courtesy) 29 | + [ACK](#ack) 30 | * [CHANGES](#changes) 31 | 32 | Installation 33 | ----- 34 | 35 | - Released version: 36 | ```bash 37 | pip install pytorch_memlab 38 | ``` 39 | 40 | - Newest version: 41 | ```bash 42 | pip install git+https://github.com/stonesjtu/pytorch_memlab 43 | ``` 44 | 45 | What it's for 46 | ----- 47 | 48 | Out-Of-Memory errors in pytorch happen frequently, for newcomers and 49 | experienced programmers alike. A common reason is that most people don't really 50 | learn the underlying memory management philosophy of pytorch and GPUs. 51 | They write memory-inefficient code and complain about pytorch eating too 52 | much CUDA memory. 53 | 54 | In this repo, I'm going to share some useful tools to help debug OOM errors, or 55 | to inspect the underlying mechanism for anyone who is interested. 56 | 57 | 58 | User-Doc 59 | ----- 60 | 61 | ### Memory Profiler 62 | 63 | The memory profiler is a modification of python's `line_profiler`; it gives 64 | the memory usage info for each line of code in the specified function/method. 65 | 66 | #### Sample: 67 | 68 | ```python 69 | import torch 70 | from pytorch_memlab import LineProfiler 71 | 72 | def inner(): 73 | torch.nn.Linear(100, 100).cuda() 74 | 75 | def outer(): 76 | linear = torch.nn.Linear(100, 100).cuda() 77 | linear2 = torch.nn.Linear(100, 100).cuda() 78 | inner() 79 | 80 | with LineProfiler(outer, inner) as prof: 81 | outer() 82 | prof.display() 83 | ``` 84 | 85 | After the script finishes or is interrupted by the keyboard, it gives the following 86 | profiling info if you're in a Jupyter notebook: 87 | 88 | ![Line profiler output in a Jupyter notebook](readme-output.png)

87 | 88 | or the following info if you're in a text-only terminal: 89 | 90 | ``` 91 | ## outer 92 | 93 | active_bytes reserved_bytes line code 94 | all all 95 | peak peak 96 | 0.00B 0.00B 7 def outer(): 97 | 40.00K 2.00M 8 linear = torch.nn.Linear(100, 100).cuda() 98 | 80.00K 2.00M 9 linear2 = torch.nn.Linear(100, 100).cuda() 99 | 120.00K 2.00M 10 inner() 100 | 101 | 102 | ## inner 103 | 104 | active_bytes reserved_bytes line code 105 | all all 106 | peak peak 107 | 80.00K 2.00M 4 def inner(): 108 | 120.00K 2.00M 5 torch.nn.Linear(100, 100).cuda() 109 | ``` 110 | 111 | An explanation of what each column means can be found in the [Torch documentation](https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats). The name of any field from `memory_stats()` 112 | can be passed to `display()` to view the corresponding statistic. 113 | 114 | If you use the `profile` decorator, the memory statistics are collected during 115 | multiple runs and only the maximum one is displayed at the end. 116 | We also provide a more flexible API called `profile_every` which prints the 117 | memory info every *N* executions of the function. You can simply replace 118 | `@profile` with `@profile_every(1)` to print the memory usage for each 119 | execution. 120 | 121 | The `@profile` and `@profile_every` decorators can also be mixed to gain finer control 122 | over the debugging granularity. 123 | 124 | - You can also add the decorator to a method of a module class: 125 | 126 | ```python 127 | class Net(torch.nn.Module): 128 | def __init__(self): 129 | super().__init__() 130 | @profile 131 | def forward(self, inp): 132 | return inp # do_something 133 | ``` 134 | 135 | - The *Line Profiler* profiles the memory usage of CUDA device 0 by default. 136 | You may want to switch the device to profile with `set_target_gpu`. The GPU 137 | selection is global, which means you have to remember which GPU you are 138 | profiling on during the whole process: 139 | 140 | ```python 141 | import torch 142 | from pytorch_memlab import profile, set_target_gpu 143 | @profile 144 | def func(): 145 | net1 = torch.nn.Linear(1024, 1024).cuda(0) 146 | set_target_gpu(1) 147 | net2 = torch.nn.Linear(1024, 1024).cuda(1) 148 | set_target_gpu(0) 149 | net3 = torch.nn.Linear(1024, 1024).cuda(0) 150 | 151 | func() 152 | ``` 153 | 154 | 155 | More samples can be found in `test/test_line_profiler.py` 156 | 157 | ### IPython support 158 | 159 | Make sure you have `IPython` installed, or have installed `pytorch-memlab` with 160 | `pip install pytorch-memlab[ipython]`. 161 | 162 | First, load the extension: 163 | 164 | ```python 165 | %load_ext pytorch_memlab 166 | ``` 167 | 168 | This makes the `%mlrun` and `%%mlrun` line/cell magics available for use. For 169 | example, in a new cell run the following to profile an entire cell: 170 | 171 | ```python 172 | %%mlrun -f func 173 | import torch 174 | from pytorch_memlab import profile, set_target_gpu 175 | def func(): 176 | net1 = torch.nn.Linear(1024, 1024).cuda(0) 177 | set_target_gpu(1) 178 | net2 = torch.nn.Linear(1024, 1024).cuda(1) 179 | set_target_gpu(0) 180 | net3 = torch.nn.Linear(1024, 1024).cuda(0) 181 | ``` 182 | 183 | Or you can invoke the profiler for a single statement via the `%mlrun` line 184 | magic: 185 | 186 | ```python 187 | import torch 188 | from pytorch_memlab import profile, set_target_gpu 189 | def func(input_size): 190 | net1 = torch.nn.Linear(input_size, 1024).cuda(0) 191 | %mlrun -f func func(2048) 192 | ``` 193 | 194 | See `%mlrun?` for help on what arguments are supported. 
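For instance, here is a small sketch that combines several of these options in a single line-magic call (the flags are the ones defined in the extension and demonstrated in the demo notebook; `func` stands for any function you have defined and registered with `-f`):

```python
# Quietly profile func() on GPU 0, dump the text report to profile.log,
# and keep the returned LineProfiler object for later inspection.
profiler = %mlrun -q -r -g 0 -T profile.log -f func func()
profiler.display()
```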
You can set the GPU 195 | device to profile, dump profiling results to a file, and return the 196 | `LineProfiler` object for post-profile inspection. 197 | 198 | Find out more by checking out the [demo Jupyter notebook](./demo.ipynb) 199 | 200 | 201 | ### Memory Reporter 202 | 203 | As the *Memory Profiler* only gives the overall memory usage information by line, 204 | lower-level memory usage information can be obtained with the *Memory Reporter*. 205 | 206 | The *Memory Reporter* iterates over all the `Tensor` objects and inspects the underlying 207 | `UntypedStorage` (previously `Storage`) objects to report the actual memory usage instead of the apparent 208 | `Tensor.size`. 209 | 210 | > See [UntypedStorage](https://pytorch.org/docs/stable/storage.html#torch.UntypedStorage) for detailed 211 | > information. 212 | 213 | #### Sample 214 | 215 | - A minimal one: 216 | 217 | ```python 218 | import torch 219 | from pytorch_memlab import MemReporter 220 | linear = torch.nn.Linear(1024, 1024).cuda() 221 | reporter = MemReporter() 222 | reporter.report() 223 | ``` 224 | outputs: 225 | ``` 226 | Element type Size Used MEM 227 | ------------------------------------------------------------------------------- 228 | Storage on cuda:0 229 | Parameter0 (1024, 1024) 4.00M 230 | Parameter1 (1024,) 4.00K 231 | ------------------------------------------------------------------------------- 232 | Total Tensors: 1049600 Used Memory: 4.00M 233 | The allocated memory on cuda:0: 4.00M 234 | ------------------------------------------------------------------------------- 235 | ``` 236 | 237 | - You can also pass in a model object for automatic name inference. 238 | 239 | ```python 240 | import torch 241 | from pytorch_memlab import MemReporter 242 | 243 | linear = torch.nn.Linear(1024, 1024).cuda() 244 | inp = torch.Tensor(512, 1024).cuda() 245 | # pass in a model to automatically infer the tensor names 246 | reporter = MemReporter(linear) 247 | out = linear(inp).mean() 248 | print('========= before backward =========') 249 | reporter.report() 250 | out.backward() 251 | print('========= after backward =========') 252 | reporter.report() 253 | ``` 254 | 255 | outputs: 256 | ``` 257 | ========= before backward ========= 258 | Element type Size Used MEM 259 | ------------------------------------------------------------------------------- 260 | Storage on cuda:0 261 | weight (1024, 1024) 4.00M 262 | bias (1024,) 4.00K 263 | Tensor0 (512, 1024) 2.00M 264 | Tensor1 (1,) 512.00B 265 | ------------------------------------------------------------------------------- 266 | Total Tensors: 1573889 Used Memory: 6.00M 267 | The allocated memory on cuda:0: 6.00M 268 | ------------------------------------------------------------------------------- 269 | ========= after backward ========= 270 | Element type Size Used MEM 271 | ------------------------------------------------------------------------------- 272 | Storage on cuda:0 273 | weight (1024, 1024) 4.00M 274 | weight.grad (1024, 1024) 4.00M 275 | bias (1024,) 4.00K 276 | bias.grad (1024,) 4.00K 277 | Tensor0 (512, 1024) 2.00M 278 | Tensor1 (1,) 512.00B 279 | ------------------------------------------------------------------------------- 280 | Total Tensors: 2623489 Used Memory: 10.01M 281 | The allocated memory on cuda:0: 10.01M 282 | ------------------------------------------------------------------------------- 283 | ``` 284 | 285 | 286 | - The reporter automatically deals with shared weight parameters: 287 | 288 | ```python 289 | import torch 290 | from pytorch_memlab import 
MemReporter 291 | 292 | linear = torch.nn.Linear(1024, 1024).cuda() 293 | linear2 = torch.nn.Linear(1024, 1024).cuda() 294 | linear2.weight = linear.weight 295 | container = torch.nn.Sequential( 296 | linear, linear2 297 | ) 298 | inp = torch.Tensor(512, 1024).cuda() 299 | # pass in a model to automatically infer the tensor names 300 | 301 | out = container(inp).mean() 302 | out.backward() 303 | 304 | # verbose shows how storage is shared across multiple Tensors 305 | reporter = MemReporter(container) 306 | reporter.report(verbose=True) 307 | ``` 308 | 309 | outputs: 310 | ``` 311 | Element type Size Used MEM 312 | ------------------------------------------------------------------------------- 313 | Storage on cuda:0 314 | 0.weight (1024, 1024) 4.00M 315 | 0.weight.grad (1024, 1024) 4.00M 316 | 0.bias (1024,) 4.00K 317 | 0.bias.grad (1024,) 4.00K 318 | 1.bias (1024,) 4.00K 319 | 1.bias.grad (1024,) 4.00K 320 | Tensor0 (512, 1024) 2.00M 321 | Tensor1 (1,) 512.00B 322 | ------------------------------------------------------------------------------- 323 | Total Tensors: 2625537 Used Memory: 10.02M 324 | The allocated memory on cuda:0: 10.02M 325 | ------------------------------------------------------------------------------- 326 | ``` 327 | 328 | - You can better understand the memory layout for more complicated module: 329 | 330 | ```python 331 | import torch 332 | from pytorch_memlab import MemReporter 333 | 334 | lstm = torch.nn.LSTM(1024, 1024).cuda() 335 | reporter = MemReporter(lstm) 336 | reporter.report(verbose=True) 337 | inp = torch.Tensor(10, 10, 1024).cuda() 338 | out, _ = lstm(inp) 339 | out.mean().backward() 340 | reporter.report(verbose=True) 341 | ``` 342 | 343 | As shown below, the `(->)` indicates the re-use of the same storage back-end 344 | outputs: 345 | ``` 346 | Element type Size Used MEM 347 | ------------------------------------------------------------------------------- 348 | Storage on cuda:0 349 | weight_ih_l0 (4096, 1024) 32.03M 350 | weight_hh_l0(->weight_ih_l0) (4096, 1024) 0.00B 351 | bias_ih_l0(->weight_ih_l0) (4096,) 0.00B 352 | bias_hh_l0(->weight_ih_l0) (4096,) 0.00B 353 | Tensor0 (10, 10, 1024) 400.00K 354 | ------------------------------------------------------------------------------- 355 | Total Tensors: 8499200 Used Memory: 32.42M 356 | The allocated memory on cuda:0: 32.52M 357 | Memory differs due to the matrix alignment 358 | ------------------------------------------------------------------------------- 359 | Element type Size Used MEM 360 | ------------------------------------------------------------------------------- 361 | Storage on cuda:0 362 | weight_ih_l0 (4096, 1024) 32.03M 363 | weight_ih_l0.grad (4096, 1024) 32.03M 364 | weight_hh_l0(->weight_ih_l0) (4096, 1024) 0.00B 365 | weight_hh_l0.grad(->weight_ih_l0.grad) (4096, 1024) 0.00B 366 | bias_ih_l0(->weight_ih_l0) (4096,) 0.00B 367 | bias_ih_l0.grad(->weight_ih_l0.grad) (4096,) 0.00B 368 | bias_hh_l0(->weight_ih_l0) (4096,) 0.00B 369 | bias_hh_l0.grad(->weight_ih_l0.grad) (4096,) 0.00B 370 | Tensor0 (10, 10, 1024) 400.00K 371 | Tensor1 (10, 10, 1024) 400.00K 372 | Tensor2 (1, 10, 1024) 40.00K 373 | Tensor3 (1, 10, 1024) 40.00K 374 | ------------------------------------------------------------------------------- 375 | Total Tensors: 17018880 Used Memory: 64.92M 376 | The allocated memory on cuda:0: 65.11M 377 | Memory differs due to the matrix alignment 378 | ------------------------------------------------------------------------------- 379 | ``` 380 | 381 | NOTICE: 382 | > When 
forwarding with `grad_mode=True`, pytorch maintains tensor buffers for 383 | > future back-propagation at the C++ level, so these buffers are not 384 | > visible to python and are not collected by the reporter. But if you store these intermediate results 385 | > as python variables, then they will be reported. 386 | 387 | - You can also filter the device to report on by passing extra arguments: 388 | `report(device=torch.device(0))` 389 | 390 | - A failed example due to pytorch's C-side tensor buffers: 391 | 392 | In the following example, a temp buffer is created at `inp * (inp + 2)` to 393 | store both `inp` and `inp + 2`. Unfortunately python only knows about the existence 394 | of `inp`, so *2M* of memory is lost, which is the same size as the Tensor `inp`. 395 | 396 | ```python 397 | import torch 398 | from pytorch_memlab import MemReporter 399 | 400 | linear = torch.nn.Linear(1024, 1024).cuda() 401 | inp = torch.Tensor(512, 1024).cuda() 402 | # pass in a model to automatically infer the tensor names 403 | reporter = MemReporter(linear) 404 | out = linear(inp * (inp + 2)).mean() 405 | reporter.report() 406 | ``` 407 | 408 | outputs: 409 | ``` 410 | Element type Size Used MEM 411 | ------------------------------------------------------------------------------- 412 | Storage on cuda:0 413 | weight (1024, 1024) 4.00M 414 | bias (1024,) 4.00K 415 | Tensor0 (512, 1024) 2.00M 416 | Tensor1 (1,) 512.00B 417 | ------------------------------------------------------------------------------- 418 | Total Tensors: 1573889 Used Memory: 6.00M 419 | The allocated memory on cuda:0: 8.00M 420 | Memory differs due to the matrix alignment or invisible gradient buffer tensors 421 | ------------------------------------------------------------------------------- 422 | ``` 423 | 424 | 425 | ### Courtesy 426 | 427 | Sometimes people would like to preempt your running task, but you don't want 428 | to save a checkpoint and then load it back. Actually, all they need is GPU resources 429 | (CPU resources and CPU memory are usually spare in GPU clusters), so 430 | you can move all your workspaces from GPU to CPU and then halt your task until 431 | a restart signal is triggered, instead of saving & loading checkpoints and 432 | bootstrapping from scratch. 433 | 434 | Still under development, but you can already have fun with: 435 | ```python 436 | from pytorch_memlab import Courtesy 437 | 438 | iamcourtesy = Courtesy() 439 | for i in range(num_iteration): 440 | if something_happens: 441 | iamcourtesy.yield_memory() 442 | wait_for_restart_signal() 443 | iamcourtesy.restore() 444 | ``` 445 | 446 | #### Known Issues 447 | 448 | - As stated above in *Memory Reporter*, intermediate tensors are not covered 449 | properly, so you may want to insert such courtesy logic after `backward` or 450 | before `forward`. 451 | - Currently the CUDA context of pytorch requires about 1 GB of CUDA memory, which 452 | means that even if all Tensors are on CPU, 1GB of CUDA memory is wasted, :-(. However 453 | it's still under investigation whether I can fully destroy the context and then 454 | re-init it. 455 | 456 | 457 | ### ACK 458 | 459 | I suffered a lot debugging weird memory usage during my 3 years of developing 460 | efficient Deep Learning models, and of course learned a lot from the great 461 | open source community. 
462 | 463 | ## CHANGES 464 | 465 | 466 | ##### 0.3.0 (2023-7-29) 467 | - Fix `DataFrame.drop` for pandas 1.5+ 468 | ##### 0.2.4 (2021-10-28) 469 | - Fix colab error (#35) 470 | - Support python3.8 (#38) 471 | - Support sparse tensor (#30) 472 | ##### 0.2.3 (2020-12-01) 473 | - Fix name mapping in `MemReporter` (#24) 474 | - Fix reporter without model input (#22 #25) 475 | ##### 0.2.2 (2020-10-23) 476 | - Fix memory leak in `MemReporter` 477 | ##### 0.2.1 (2020-06-18) 478 | - Fix `line_profiler` not found 479 | ##### 0.2.0 (2020-06-15) 480 | - Add jupyter notebook figure and ipython support 481 | ##### 0.1.0 (2020-04-17) 482 | - Add ipython magic support (#8) 483 | ##### 0.0.4 (2019-10-08) 484 | - Add gpu switch for line-profiler(#2) 485 | - Add device filter for reporter 486 | ##### 0.0.3 (2019-06-15) 487 | - Install dependency for pip installation 488 | ##### 0.0.2 (2019-06-04) 489 | - Fix statistics shift in loop 490 | ##### 0.0.1 (2019-05-28) 491 | - initial release 492 | 493 | ## Star History 494 | 495 | [![Star History Chart](https://api.star-history.com/svg?repos=stonesjtu/pytorch_memlab&type=Date)](https://star-history.com/#stonesjtu/pytorch_memlab&Date) 496 | -------------------------------------------------------------------------------- /demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Once installed, you need to load the `pytorch_memlab` IPython extensions:" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%load_ext pytorch_memlab" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "One magic is provided, `mlrun` which can act either as a line magic `%mlrun`, or as a cell magic `%%mlrun`" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "%%mlrun?" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "First we need some torch code to profile:" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import torch\n", 49 | "\n", 50 | "def x():\n", 51 | " torch.nn.Linear(100, 100).cuda()\n", 52 | " \n", 53 | "def y(gpu=0):\n", 54 | " torch.nn.Linear(1000, 100).cuda(device=gpu)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "We can profile multiple functions at the same type by repeatedly specifying `-f`" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/html": [ 72 | "

[HTML table rendering of the profiler output for "x" and "y"; the same stats appear in the "text/plain" output below.]
" 155 | ], 156 | "text/plain": [ 157 | "## x\n", 158 | "\n", 159 | "active_bytes reserved_bytes line code \n", 160 | " all all \n", 161 | " peak peak \n", 162 | " 0.00B 0.00B 3 def x(): \n", 163 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() \n", 164 | "\n", 165 | "\n", 166 | "## y\n", 167 | "\n", 168 | "active_bytes reserved_bytes line code \n", 169 | " all all \n", 170 | " peak peak \n", 171 | " 0.00B 2.00M 6 def y(gpu=0): \n", 172 | " 391.50K 2.00M 7 torch.nn.Linear(1000, 100).cuda(device=gpu) " 173 | ] 174 | }, 175 | "execution_count": 4, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "%%mlrun -f x -f y\n", 182 | "\n", 183 | "x()\n", 184 | "y()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "You can alos profile with the `%mlrun` line magic" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 5, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/html": [ 202 | "

[HTML table rendering of the profiler output for "z"; the same stats appear in the "text/plain" output below.]
" 243 | ], 244 | "text/plain": [ 245 | "## z\n", 246 | "\n", 247 | "active_bytes reserved_bytes line code \n", 248 | " all all \n", 249 | " peak peak \n", 250 | " 0.00B 0.00B 1 def z(): \n", 251 | " 40.00K 2.00M 2 torch.nn.Linear(100, 100).cuda() " 252 | ] 253 | }, 254 | "execution_count": 5, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "def z():\n", 261 | " torch.nn.Linear(100, 100).cuda()\n", 262 | "%mlrun -f z z()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "You can specify which GPU you wish to profile using `-g`:" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 6, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/html": [ 280 | "

[HTML table rendering of the profiler output for "x" and "y"; the same stats appear in the "text/plain" output below.]
" 363 | ], 364 | "text/plain": [ 365 | "## x\n", 366 | "\n", 367 | "active_bytes reserved_bytes line code \n", 368 | " all all \n", 369 | " peak peak \n", 370 | " 0.00B 0.00B 3 def x(): \n", 371 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() \n", 372 | "\n", 373 | "\n", 374 | "## y\n", 375 | "\n", 376 | "active_bytes reserved_bytes line code \n", 377 | " all all \n", 378 | " peak peak \n", 379 | " 0.00B 2.00M 6 def y(gpu=0): \n", 380 | " 391.50K 2.00M 7 torch.nn.Linear(1000, 100).cuda(device=gpu) " 381 | ] 382 | }, 383 | "execution_count": 6, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "%%mlrun -f x -f y -g 0 y\n", 390 | "\n", 391 | "x()\n", 392 | "y(gpu=0)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "You can get a handle on the `LineProfiler` object using `-r`" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 7, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "text/html": [ 410 | "

[HTML table rendering of the profiler output for "x"; the same stats appear in the "text/plain" output below.]
" 451 | ], 452 | "text/plain": [ 453 | "## x\n", 454 | "\n", 455 | "active_bytes reserved_bytes line code \n", 456 | " all all \n", 457 | " peak peak \n", 458 | " 0.00B 0.00B 3 def x(): \n", 459 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() " 460 | ] 461 | }, 462 | "execution_count": 7, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "profiler = %mlrun -q -r -f x x()\n", 469 | "profiler.display()" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "You can dump stats out to a file using `-T`:" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 8, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/html": [ 487 | "

[HTML table rendering of the profiler output for "x"; the same stats appear in the "text/plain" output below.]
" 528 | ], 529 | "text/plain": [ 530 | "## x\n", 531 | "\n", 532 | "active_bytes reserved_bytes line code \n", 533 | " all all \n", 534 | " peak peak \n", 535 | " 0.00B 0.00B 3 def x(): \n", 536 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() " 537 | ] 538 | }, 539 | "execution_count": 8, 540 | "metadata": {}, 541 | "output_type": "execute_result" 542 | } 543 | ], 544 | "source": [ 545 | "%mlrun -q -T profile.log -f x x()" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 9, 551 | "metadata": {}, 552 | "outputs": [ 553 | { 554 | "name": "stdout", 555 | "output_type": "stream", 556 | "text": [ 557 | "## x\r\n", 558 | "\r\n", 559 | "active_bytes reserved_bytes line code \r\n", 560 | " all all \r\n", 561 | " peak peak \r\n", 562 | " 0.00B 0.00B 3 def x(): \r\n", 563 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() \r\n" 564 | ] 565 | } 566 | ], 567 | "source": [ 568 | "!head profile.log" 569 | ] 570 | } 571 | ], 572 | "metadata": { 573 | "kernelspec": { 574 | "display_name": "Python 3", 575 | "language": "python", 576 | "name": "python3" 577 | }, 578 | "language_info": { 579 | "codemirror_mode": { 580 | "name": "ipython", 581 | "version": 3 582 | }, 583 | "file_extension": ".py", 584 | "mimetype": "text/x-python", 585 | "name": "python", 586 | "nbconvert_exporter": "python", 587 | "pygments_lexer": "ipython3", 588 | "version": "3.7.3" 589 | } 590 | }, 591 | "nbformat": 4, 592 | "nbformat_minor": 4 593 | } 594 | -------------------------------------------------------------------------------- /pytorch_memlab/__init__.py: -------------------------------------------------------------------------------- 1 | from .courtesy import Courtesy 2 | from .mem_reporter import MemReporter 3 | from .line_profiler import LineProfiler, profile, profile_every, set_target_gpu, clear_global_line_profiler 4 | try: 5 | from .line_profiler.extension import load_ipython_extension 6 | except ImportError: 7 | pass 8 | -------------------------------------------------------------------------------- /pytorch_memlab/courtesy.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import torch 3 | 4 | 5 | class Courtesy(): 6 | """A class to yield CUDA memory at any time in the training 7 | 8 | The whole save/load is a bit tricky because all data transfer should 9 | be inplace operation and gradient agnostic 10 | """ 11 | def __init__(self): 12 | self.loc_map = {} 13 | 14 | def yield_memory(self): 15 | """Transfer all the CUDA tensors into CPU memory""" 16 | tensors = [obj for obj in gc.get_objects() if isinstance(obj, torch.Tensor)] 17 | for t in tensors: 18 | # in case tensors appear more than once 19 | if t not in self.loc_map: 20 | self.loc_map[t] = t.device 21 | 22 | t.data = t.data.cpu() 23 | # parameters have one more wrapper for .data 24 | if isinstance(t, torch.nn.Parameter): 25 | # sometimes Parameter does not have grad 26 | try: 27 | t.grad.data = t.grad.cpu() 28 | finally: 29 | pass 30 | torch.cuda.empty_cache() 31 | 32 | def restore(self): 33 | """Restore the tensors into original CUDA devices""" 34 | for t, device in self.loc_map.items(): 35 | t.data = t.data.to(device) 36 | if isinstance(t, torch.nn.Parameter): 37 | # sometimes Parameter does not have grad 38 | try: 39 | t.grad = t.grad.to(device) 40 | finally: 41 | pass 42 | self.loc_map.clear() 43 | 44 | def __enter__(self): 45 | self.yield_memory() 46 | return self 47 | 48 | def __exit__(self, *args): 49 | self.restore() 50 | 
-------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/__init__.py: -------------------------------------------------------------------------------- 1 | from .line_profiler import LineProfiler 2 | from .profile import profile, profile_every, set_target_gpu, clear_global_line_profiler 3 | -------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/extension.py: -------------------------------------------------------------------------------- 1 | """IPython & notebook extension interface""" 2 | from IPython.core.magic import ( 3 | Magics, 4 | magics_class, 5 | line_cell_magic, 6 | needs_local_scope, 7 | ) 8 | from IPython.core.magic_arguments import magic_arguments, argument, parse_argstring 9 | 10 | from .line_profiler import LineProfiler, DEFAULT_COLUMNS 11 | 12 | 13 | class UsageError(Exception): 14 | pass 15 | 16 | 17 | @magics_class 18 | class MemlabMagics(Magics): 19 | @magic_arguments() 20 | @argument('--function', 21 | '-f', 22 | metavar='FUNC', 23 | action='append', 24 | default=[], 25 | help="""Function to profile. Can be specified multiple times to profile multiple 26 | functions""") 27 | @argument('--column', 28 | '-c', 29 | metavar='COLS', 30 | action='append', 31 | default=[], 32 | help="""Columns to display. Can be specified multiple times to profile multiple 33 | functions. See the Torch CUDA spec at 34 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats for details.""") 35 | @argument('-D', 36 | '--no_default_columns', 37 | action='store_true', 38 | help='Hide the default columns of ' + ", ".join(DEFAULT_COLUMNS)) 39 | @argument('-r', 40 | '--return-profiler', 41 | action='store_true', 42 | help='Return LineProfiler object for introspection') 43 | @argument('-g', 44 | '--gpu', 45 | metavar='GPU_ID', 46 | default=0, 47 | type=int, 48 | help='Profile memory usage of this GPU') 49 | @argument('-q', 50 | '--quiet', 51 | action='store_true', 52 | help='Don\'t print out profile results') 53 | @argument('statement', 54 | nargs='*', 55 | default=None, 56 | help='Code to run under profiler. You can omit this in cell magic mode.') 57 | @argument('-T', 58 | '--dump-profile', 59 | metavar='OUTPUT', 60 | help='Dump text profile output to file') 61 | @line_cell_magic 62 | @needs_local_scope 63 | def mlrun(self, line=None, cell=None, local_ns=None): 64 | """Execute a statement/cell under the PyTorch Memlab profiler to collect CUDA memory 65 | allocation information on a per-line basis. 
66 | """ 67 | args = parse_argstring(self.mlrun, line) 68 | global_ns = self.shell.user_global_ns 69 | 70 | funcs = [] 71 | for name in args.function: 72 | try: 73 | fn = eval(name, global_ns, local_ns) 74 | funcs.append(fn) 75 | except NameError as e: 76 | raise UsageError('Could not find function {!r}.\n{}: {}'.format( 77 | name, e.__class__.__name__, e) 78 | ) 79 | profiler = LineProfiler(*funcs, target_gpu=args.gpu) 80 | if cell is not None: 81 | code = cell 82 | else: 83 | assert args.statement is not None 84 | code = '\n'.join(args.statement) 85 | with profiler: 86 | exec(compile(code, filename='', mode='exec'), local_ns) 87 | 88 | if args.dump_profile is not None: 89 | with open(args.dump_profile, 'w') as f: 90 | profiler.print_stats(stream=f) 91 | 92 | if args.return_profiler: 93 | return profiler 94 | else: 95 | defaults = [] if args.no_default_columns else list(DEFAULT_COLUMNS) 96 | return profiler.display(columns=defaults + args.column) 97 | 98 | 99 | def load_ipython_extension(ipython): 100 | ipython.register_magics(MemlabMagics) 101 | -------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/line_profiler.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import sys 3 | from types import FrameType 4 | import warnings 5 | from typing import Any, Callable, Optional, Tuple 6 | 7 | import torch 8 | 9 | from .line_records import LineRecords 10 | 11 | # Seaborn's `muted` color cycle 12 | DEFAULT_COLUMNS = ('active_bytes.all.peak', 'reserved_bytes.all.peak') 13 | 14 | 15 | class LineProfiler: 16 | """Profile the CUDA memory usage info for each line in pytorch 17 | 18 | This class registers callbacks for added functions to profiling them line 19 | by line, and collects all the statistics in CUDA memory. Usually you may 20 | want to use simpler wrapper below `profile` or `profile_every`. 21 | 22 | The CUDA memory is collected only on the **current** cuda device. 23 | 24 | Usage: 25 | ```python 26 | with LineProfiler(func) as lp: 27 | func 28 | lp.display() 29 | 30 | ```python 31 | lp = LineProfiler(func) 32 | lp.enable() 33 | func() 34 | lp.disable() 35 | lp.display() 36 | ``` 37 | """ 38 | 39 | def __init__(self, *functions: Callable, target_gpu: int = 0): 40 | self.target_gpu = target_gpu 41 | self._code_infos = {} 42 | self._raw_line_records = [] 43 | self.enabled = False 44 | for func in functions: 45 | self.add_function(func) 46 | 47 | def add_function(self, func: Callable) -> None: 48 | """ Record line profiling information for the given Python function. 
49 | """ 50 | try: 51 | # We need to use the hash here because pandas will later expect something 52 | # orderable for its index 53 | code_hash = hash(func.__code__) 54 | except AttributeError: 55 | warnings.warn( 56 | "Could not extract a code object for the object %r" % (func,)) 57 | return 58 | if code_hash not in self._code_infos: 59 | first_line = inspect.getsourcelines(func)[1] 60 | self._code_infos[code_hash] = { 61 | 'func': func, 62 | 'first_line': first_line, 63 | 'prev_line': first_line, 64 | 'prev_record': -1, 65 | } 66 | 67 | # re-register the newer trace_callback 68 | if self.enabled: 69 | self.register_callback() 70 | 71 | def __enter__(self): 72 | self.enable() 73 | return self 74 | 75 | def __exit__(self, exc_type, exc_val, exc_tb): 76 | self.disable() 77 | 78 | def register_callback(self): 79 | """Register the trace_callback only on demand""" 80 | if self._code_infos: 81 | sys.settrace(self._trace_callback) 82 | 83 | def _reset_cuda_stats(self): 84 | torch.cuda.reset_peak_memory_stats() 85 | torch.cuda.reset_accumulated_memory_stats() 86 | 87 | def enable(self): 88 | """Enable the profiler and register trace callback""" 89 | if not torch.cuda.is_available(): 90 | print('Could not find CUDA deivces and reset CUDA stats and cache') 91 | return 92 | torch.cuda.empty_cache() 93 | self._reset_cuda_stats() 94 | self.enabled = True 95 | self.register_callback() 96 | 97 | def disable(self): 98 | """Disable the profiler and clear trace callback""" 99 | self.enabled = False 100 | sys.settrace(None) 101 | 102 | def clear(self): 103 | """Clear the state of the line profiler""" 104 | self._code_infos = {} 105 | self._raw_line_records = [] 106 | 107 | def _trace_callback(self, frame: FrameType, event: str, _unused_arg: Tuple[Any, ...]): 108 | """Trace the execution of python line-by-line""" 109 | 110 | if event == 'call': 111 | return self._trace_callback 112 | 113 | code_hash = hash(frame.f_code) 114 | if event in ['line', 'return'] and code_hash in self._code_infos: 115 | code_info = self._code_infos[code_hash] 116 | with torch.cuda.device(self.target_gpu): 117 | self._raw_line_records.append({ 118 | 'code_hash': code_hash, 119 | 'line': code_info['prev_line'], 120 | 'prev_record_idx': code_info['prev_record'], 121 | **torch.cuda.memory_stats()}) 122 | self._reset_cuda_stats() 123 | 124 | if event == 'line': 125 | code_info['prev_line'] = frame.f_lineno 126 | code_info['prev_record'] = len(self._raw_line_records)-1 127 | elif event == 'return': 128 | code_info['prev_line'] = code_info['first_line'] 129 | code_info['prev_record'] = -1 130 | 131 | def display(self, func: Optional[Callable] = None, columns: Tuple[str, ...] = DEFAULT_COLUMNS) -> LineRecords: 132 | """Display the profiling results on either IPython or CLI 133 | 134 | The columns are explained in the PyTorch documentation: 135 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 136 | 137 | .. note:: To work, this needs to be the last thing returned in the IPython statement or cell. 138 | 139 | Args: 140 | func (str): the function name of interest, None for all registered function 141 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 142 | 143 | Returns: 144 | RecordsDisplay: Returns an object that'll display the recorded stats in the IPython console 145 | """ 146 | return LineRecords(self._raw_line_records, self._code_infos).display(func, columns) 147 | 148 | def print_stats(self, func: Optional[Callable] = None, columns: Tuple[str, ...] 
= DEFAULT_COLUMNS, stream=sys.stdout): 149 | """Print the text profiling results to stream 150 | 151 | The columns are explained in the PyTorch documentation: 152 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 153 | 154 | Args: 155 | func (str): the function name of interest, None for all registered function 156 | columns (list of str): the column names of interest, See PyTorch's doc for available names 157 | stream (IO-like object): the stream to write to 158 | """ 159 | stream.write(str(self.display(func, columns))) 160 | -------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/line_records.py: -------------------------------------------------------------------------------- 1 | """Class and helper functions for processing and displaying line records""" 2 | import inspect 3 | from typing import Callable, Optional, Tuple, List, Dict, Any 4 | import pandas as pd 5 | 6 | from ..utils import readable_size 7 | 8 | 9 | COLORS = [ 10 | '#4878d0', '#ee854a', '#6acc64', '#d65f5f', '#956cb4', 11 | '#8c613c', '#dc7ec0', '#797979', '#d5bb67', '#82c6e2', 12 | ] 13 | 14 | 15 | 16 | def _accumulate_line_records(raw_line_records: List[Dict[str, Any]]) -> pd.DataFrame: 17 | """The raw records give the memory stats between successive lines executed by the profiler. 18 | But we want the memory stats between successive lines in our functions! The two diverge when 19 | a function we're profiling calls another function we're profiling, since then Torch will have 20 | its peak/allocated/freed memory stats reset on each line of the called function. 21 | 22 | To fix that, here we look at each line record in turn, and for peak stats we take the 23 | maximum since the last record _in the same function_. For allocated/freed stats, we take the 24 | sum since the last record in the same function. 25 | """ 26 | 27 | # We'll do this in numpy because indexing lots of rows and columns in pandas is dog-slow. 28 | raw = pd.DataFrame(raw_line_records) 29 | acc_mask = raw.columns.str.match(r'.*(allocated|freed)$') 30 | peak_mask = raw.columns.str.match(r'.*(peak)$') 31 | acc_raw, peak_raw = raw.loc[:, acc_mask].values, raw.loc[:, peak_mask].values 32 | acc_refined, peak_refined = acc_raw.copy(), peak_raw.copy() 33 | 34 | for row, record in enumerate(raw_line_records): 35 | if record['prev_record_idx'] == -1: 36 | # No previous data to accumulate from 37 | continue 38 | if record['prev_record_idx'] == row-1: 39 | # Previous record was the previous line, so no need to accumulate anything 40 | continue 41 | 42 | # Another profiled function has been called since the last record, so we need to 43 | # accumulate the allocated/freed/peaks of the intervening records into this one. 44 | acc_refined[row] = acc_raw[record['prev_record_idx']+1:row+1].sum(0) 45 | peak_refined[row] = peak_raw[record['prev_record_idx']+1:row+1].max(0) 46 | 47 | refined = raw.copy() 48 | refined.loc[:, acc_mask] = acc_refined 49 | refined.loc[:, peak_mask] = peak_refined 50 | return refined 51 | 52 | 53 | def _line_records(raw_line_records: List[Dict[str, Any]], code_infos: List[Dict[str, Any]]) -> pd.DataFrame: 54 | """Converts the raw line records to a nicely-shaped dataframe whose values reflect 55 | the memory usage of lines of _functions_ rather than lines of _execution_. 
See the 56 | `_accumulate_line_records` docstring for more detail.""" 57 | # Column spec: https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 58 | qual_names = { 59 | code_hash: info['func'].__qualname__ for code_hash, info in code_infos.items()} 60 | # pandas < 2.1.0 support (python3.8) 61 | try: 62 | records = (_accumulate_line_records(raw_line_records) 63 | .assign(qual_name=lambda df: df.code_hash.map(qual_names)) 64 | .set_index(['qual_name', 'line']) 65 | .drop(['code_hash', 'num_alloc_retries', 'num_ooms', 'prev_record_idx'], axis=1)) 66 | except AttributeError: 67 | records = (_accumulate_line_records(raw_line_records) 68 | .assign(qual_name=lambda df: df.code_hash.applymap(qual_names)) 69 | .set_index(['qual_name', 'line']) 70 | .drop(['code_hash', 'num_alloc_retries', 'num_ooms', 'prev_record_idx'], axis=1)) 71 | records.columns = pd.MultiIndex.from_tuples( 72 | [c.split('.') for c in records.columns]) 73 | 74 | return records 75 | 76 | 77 | class LineRecords: 78 | """Class for processing raw line records and display on IPython & CLI 79 | """ 80 | 81 | def __init__(self, raw_line_records: List[Dict[str, Any]], code_infos: List[Dict[str, Any]]): 82 | super().__init__() 83 | self._raw_line_records = raw_line_records 84 | self._code_infos = code_infos 85 | 86 | def display(self, func: Callable[..., Any], columns: Tuple[str, ...]): 87 | """Display the records to either notebook or CLI 88 | 89 | The columns are explained in the PyTorch documentation: 90 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 91 | 92 | .. note:: Make this call the last one in a notebook cell 93 | 94 | Args: 95 | func (str): the function name of interest, None for all registered function 96 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 97 | 98 | Returns: 99 | RecordsDisplay: a IPython friendly object which converts records to HTML or plain text 100 | """ 101 | line_records = self._filter_raw_line_records(func, columns) 102 | return RecordsDisplay(line_records, self._code_infos) 103 | 104 | def _filter_raw_line_records(self, func: Callable[..., Any], columns: Tuple[str, ...]) -> pd.DataFrame: 105 | """Get the line records 106 | 107 | The columns are explained in the PyTorch documentation: 108 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 109 | 110 | Args: 111 | func (str): the function name of interest, None for all registered function 112 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 113 | 114 | Returns: 115 | pd.DataFrame: a (line, statistic)-indexed dataframe of memory stats. 116 | """ 117 | if len(self._raw_line_records) == 0: 118 | return pd.DataFrame(index=pd.MultiIndex.from_product([[], []]), columns=columns) 119 | 120 | line_records = _line_records(self._raw_line_records, self._code_infos) 121 | line_records = _extract_line_records(line_records, func, columns) 122 | 123 | if len(line_records) > 0: 124 | line_records = line_records.groupby(level=[0, 1]).max() 125 | 126 | return line_records 127 | 128 | 129 | def _extract_line_records(line_records: LineRecords, func: Optional[Callable] = None, columns: Tuple[str, ...] 
= None): 130 | """Extracts the subset of a line_records dataframe pertinent to a given set of functions and 131 | columns""" 132 | if func is not None: 133 | # Support both passing the function directly and passing a qual name/list of qual names 134 | line_records = line_records.loc[[func.__qualname__] if callable(func) else func] 135 | 136 | if columns is not None: 137 | columns = [tuple(c.split('.')) for c in columns] 138 | if not all(len(c) == 3 for c in columns): 139 | raise ValueError('Each column name should have three dot-separated parts') 140 | if not all(c in line_records.columns for c in columns): 141 | options = ", ".join(".".join(c) 142 | for c in line_records.columns.tolist()) 143 | raise ValueError( 144 | 'The column names should be fields of torch.cuda.memory_stat(). Options are: ' + options) 145 | line_records = line_records.loc[:, columns] 146 | 147 | return line_records 148 | 149 | 150 | class RecordsDisplay: 151 | """Class for processing raw line records and display on IPython & CLI 152 | 153 | IPython's rich display functionality [requires we return](https://ipython.readthedocs.io/en/stable/config/integrating.html) 154 | an object that has a `_repr_html_` method for when HTML rendering is supported, and 155 | a `__repr__` method for when only text is available 156 | """ 157 | def __init__(self, line_records: LineRecords, code_infos: List[Dict[str, Any]]): 158 | super().__init__() 159 | self._line_records = line_records 160 | self._code_infos = code_infos 161 | self._merged_line_records = self._merge_line_records_with_code() 162 | 163 | def _merge_line_records_with_code(self) -> Dict[str, Any]: 164 | merged_records = {} 165 | for _, info in self._code_infos.items(): 166 | qual_name = info['func'].__qualname__ 167 | if qual_name in self._line_records.index.get_level_values(0): 168 | lines, start_line = inspect.getsourcelines(info['func']) 169 | lines = pd.DataFrame.from_dict({ 170 | 'line': range(start_line, start_line + len(lines)), 171 | 'code': lines}) 172 | lines.columns = pd.MultiIndex.from_product([lines.columns, [''], ['']]) 173 | 174 | merged_records[qual_name] = pd.merge( 175 | self._line_records.loc[qual_name], lines, 176 | right_on='line', left_index=True, how='right') 177 | return merged_records 178 | 179 | def __repr__(self): 180 | """Renders the stats as text""" 181 | if len(self._line_records) == 0: 182 | return 'No data collected\n' 183 | 184 | is_byte_col = self._line_records.columns.get_level_values(0).str.contains('byte') 185 | byte_cols = self._line_records.columns[is_byte_col] 186 | 187 | string = {} 188 | for qual_name, merged in self._merge_line_records_with_code().items(): 189 | maxlen = max(len(c) for c in merged.code) 190 | left_align = '{{:{maxlen}s}}'.format(maxlen=maxlen) 191 | # pandas < 2.1.0 support (python3.8) 192 | try: 193 | merged[byte_cols] = merged[byte_cols].map(readable_size) 194 | except AttributeError: 195 | merged[byte_cols] = merged[byte_cols].applymap(readable_size) 196 | 197 | # This is a mess, but I can't find any other way to left-align text strings. 
198 | code_header = (left_align.format('code'), '', '') 199 | merged[code_header] = merged['code'].apply(lambda l: left_align.format(l.rstrip('\n\r'))) 200 | merged = merged.drop('code', axis=1, level=0) 201 | 202 | string[qual_name] = merged.to_string(index=False) 203 | 204 | return '\n\n'.join(['## {q}\n\n{c}\n'.format(q=q, c=c) for q, c in string.items()]) 205 | 206 | def _repr_html_(self): 207 | """Renders the stats as HTML""" 208 | if len(self._line_records) == 0: 209 | return '
<p>No data collected</p>' 210 | 211 | is_byte_col = self._line_records.columns.get_level_values(0).str.contains('byte') 212 | byte_cols = self._line_records.columns[is_byte_col] 213 | maxes = self._line_records.max() 214 | 215 | html = {} 216 | for qual_name, merged in self._merge_line_records_with_code().items(): 217 | 218 | style = merged.style 219 | 220 | # Style the bar charts 221 | for i, c in enumerate(self._line_records.columns): 222 | style = style.bar([c], color=COLORS[i % len(COLORS)], 223 | width=99, vmin=0, vmax=maxes[c]) 224 | 225 | # Style the text 226 | html[qual_name] = (style 227 | .format({c: readable_size for c in byte_cols}) 228 | .set_properties( 229 | subset=['code'], **{ 230 | 'text-align': 'left', 231 | 'white-space': 'pre', 232 | 'font-family': 'monospace'}) 233 | .set_table_styles([{ 234 | 'selector': 'th', 235 | 'props': [('text-align', 'left')]}]) 236 | .hide(axis=0) 237 | .to_html()) 238 | 239 | template = '<h3>{q}</h3><div>{c}</div>
' 240 | return '\n'.join(template.format(q=q, c=c) for q, c in html.items()) 241 | -------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/profile.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from typing import Callable, Tuple 3 | from .line_profiler import LineProfiler, DEFAULT_COLUMNS 4 | 5 | 6 | global_line_profiler = LineProfiler() 7 | global_line_profiler.enable() 8 | 9 | 10 | def clear_global_line_profiler(): 11 | """Clears the state of the global line profiler""" 12 | global_line_profiler.clear() 13 | 14 | 15 | def set_target_gpu(gpu_id: int): 16 | """Set the target GPU id to profile memory 17 | 18 | Because of the lack of output space, only one GPU's memory usage is shown 19 | in line profiler. However you can use this function to switch target GPU 20 | to profile on. The GPU switch can be performed before profiling and even 21 | in the profiled functions. 22 | 23 | Args: 24 | - gpu_id: cuda index to profile the memory on, 25 | also accepts `torch.device` object. 26 | """ 27 | global_line_profiler.target_gpu = gpu_id 28 | 29 | 30 | def profile(func, columns: Tuple[str, ...] = DEFAULT_COLUMNS): 31 | """Profile the CUDA memory usage of target function line by line 32 | 33 | The profiling results will be printed at exiting, KeyboardInterrupt raised. 34 | The CUDA memory is collected only on the **current** cuda device. 35 | 36 | The columns are explained in the PyTorch documentation: 37 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 38 | 39 | Args: 40 | func: the function or method to profile on 41 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 42 | 43 | Usage: 44 | ```python 45 | @profile 46 | def foo(): 47 | linear = torch.nn.Linear(100, 100).cuda() 48 | 49 | foo() 50 | 51 | class Foo(torch.nn.Module): 52 | def __init__(self): 53 | super().__init__() 54 | self.linear = torch.nn.Linear(100, 100).cuda() 55 | 56 | @profile 57 | def forward(self, inp): 58 | return self.linear(inp) 59 | 60 | inp = torch.Tensor(50, 100).cuda() 61 | foo = Foo() 62 | foo(inp) 63 | ``` 64 | """ 65 | import atexit 66 | global_line_profiler.add_function(func) 67 | 68 | def print_stats_atexit(): 69 | global_line_profiler.print_stats(func, columns) 70 | atexit.register(print_stats_atexit) 71 | 72 | return func 73 | 74 | 75 | def profile_every(output_interval: int = 1, enable: bool = True, columns: Tuple[str, ...] = DEFAULT_COLUMNS): 76 | """Profile the CUDA memory usage of target function line by line 77 | 78 | Prints the profiling output every `output_interval` execution of the target 79 | function 80 | The CUDA memory is collected only on the **current** cuda device. 81 | 82 | The columns are explained in the PyTorch documentation: 83 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 84 | 85 | Args: 86 | enable (bool): whether to enable the profiling mode, so users don't have to 87 | modify any source code for enabling and disabling profiling. 88 | output_interval (int): frequency of output the profiling results 89 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 
90 | """ 91 | 92 | def inner_decorator(func: Callable): 93 | func.cur_idx = 1 94 | 95 | if enable: 96 | global_line_profiler.add_function(func) 97 | 98 | @wraps(func) 99 | def run_func(*args, **kwargs): 100 | res = func(*args, **kwargs) 101 | if enable: 102 | if func.cur_idx % output_interval == 0: 103 | global_line_profiler.print_stats(func, columns) 104 | 105 | func.cur_idx += 1 106 | return res 107 | 108 | return run_func 109 | return inner_decorator 110 | -------------------------------------------------------------------------------- /pytorch_memlab/mem_reporter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import gc 3 | from collections import defaultdict 4 | from typing import Optional, Tuple, List 5 | 6 | import torch 7 | from .utils import readable_size 8 | 9 | LEN = 79 10 | 11 | # some pytorch low-level memory management constant 12 | # the minimal allocate memory size (Byte) 13 | PYTORCH_MIN_ALLOCATE = 2 ** 9 14 | # the minimal cache memory size (Byte) 15 | PYTORCH_MIN_CACHE = 2 ** 20 16 | 17 | class MemReporter(): 18 | """A memory reporter that collects tensors and memory usages 19 | 20 | Parameters: 21 | - model: an extra nn.Module can be passed to infer the name 22 | of Tensors 23 | - pre_collect: do a garbage collection before getting remaining 24 | Tensors, this gives cleaner outputs. 25 | Caution: This is an intrusive change to your original code. 26 | 27 | """ 28 | def __init__(self, model: Optional[torch.nn.Module] = None, pre_collect: bool = False): 29 | self.tensor_name = {} 30 | self.device_mapping = defaultdict(list) 31 | self.device_tensor_stat = {} 32 | # to numbering the unknown tensors 33 | self.name_idx = 0 34 | self.pre_collect = pre_collect 35 | 36 | tensor_names = defaultdict(list) 37 | if model is not None: 38 | assert isinstance(model, torch.nn.Module) 39 | # for model with tying weight, multiple parameters may share 40 | # the same underlying tensor 41 | for name, param in model.named_parameters(): 42 | tensor_names[param].append(name) 43 | 44 | for param, name in tensor_names.items(): 45 | self.tensor_name[id(param)] = '+'.join(name) 46 | 47 | def _get_tensor_name(self, tensor: torch.Tensor) -> str: 48 | tensor_id = id(tensor) 49 | if tensor_id in self.tensor_name: 50 | name = self.tensor_name[tensor_id] 51 | # use numbering if no name can be inferred 52 | else: 53 | name = type(tensor).__name__ + str(self.name_idx) 54 | self.tensor_name[tensor_id] = name 55 | self.name_idx += 1 56 | return name 57 | 58 | def add_optimizer(self, optimizer: torch.optim.Optimizer): 59 | optimizer_name = optimizer.__class__.__name__ 60 | for param, states in optimizer.state.items(): 61 | param_name = self.tensor_name[id(param)] 62 | for name, tensor in states.items(): 63 | self.tensor_name[id(tensor)] = f'{optimizer_name}.{param_name}.{name}' 64 | # self.tensor_name[id()] 65 | # print(states) 66 | 67 | 68 | def collect_tensor(self): 69 | """Collect all tensor objects tracked by python 70 | 71 | NOTICE: 72 | - the buffers for backward which is implemented in C++ are 73 | not tracked by python's reference counting. 74 | - the gradients(.grad) of Parameters is not collected, and 75 | I don't know why. 
76 | """ 77 | #FIXME: make the grad tensor collected by gc 78 | # Do a pre-garbage collect to eliminate python garbage objects 79 | if self.pre_collect: 80 | gc.collect() 81 | objects = gc.get_objects() 82 | tensors = [obj for obj in objects if isinstance(obj, torch.Tensor)] 83 | for t in tensors: 84 | self.device_mapping[t.device].append(t) 85 | 86 | def get_stats(self): 87 | """Get the memory stat of tensors and then release them 88 | 89 | As a memory profiler, we cannot hold the reference to any tensors, which 90 | causes possibly inaccurate memory usage stats, so we delete the tensors after 91 | getting required stats""" 92 | visited_data = {} 93 | self.device_tensor_stat.clear() 94 | 95 | def get_tensor_stat(tensor: torch.Tensor) -> List[Tuple[str, int, int, int]]: 96 | """Get the stat of a single tensor 97 | 98 | Returns: 99 | - stat: a tuple containing (tensor_name, tensor_size, 100 | tensor_numel, tensor_memory) 101 | """ 102 | assert isinstance(tensor, torch.Tensor) 103 | 104 | name = self._get_tensor_name(tensor) 105 | if tensor.is_sparse: 106 | indices_stat = get_tensor_stat(tensor._indices()) 107 | values_stat = get_tensor_stat(tensor._values()) 108 | return indices_stat + values_stat 109 | 110 | numel = tensor.numel() 111 | element_size = tensor.element_size() 112 | fact_numel = tensor.untyped_storage().size() 113 | fact_memory_size = fact_numel * element_size 114 | # since pytorch allocate at least 512 Bytes for any tensor, round 115 | # up to a multiple of 512 116 | memory_size = math.ceil(fact_memory_size / PYTORCH_MIN_ALLOCATE) \ 117 | * PYTORCH_MIN_ALLOCATE 118 | 119 | # tensor.storage should be the actual object related to memory 120 | # allocation 121 | data_ptr = tensor.untyped_storage().data_ptr() 122 | if data_ptr in visited_data: 123 | name = '{}(->{})'.format( 124 | name, 125 | visited_data[data_ptr], 126 | ) 127 | # don't count the memory for reusing same underlying storage 128 | memory_size = 0 129 | else: 130 | visited_data[data_ptr] = name 131 | 132 | size = tuple(tensor.size()) 133 | # torch scalar has empty size 134 | if not size: 135 | size = (1,) 136 | 137 | return [(name, size, numel, memory_size)] 138 | 139 | for device, tensors in self.device_mapping.items(): 140 | tensor_stats = [] 141 | for tensor in tensors: 142 | 143 | if tensor.numel() == 0: 144 | continue 145 | stat = get_tensor_stat(tensor) # (name, shape, numel, memory_size) 146 | tensor_stats += stat 147 | if isinstance(tensor, torch.nn.Parameter): 148 | if tensor.grad is not None: 149 | # manually specify the name of gradient tensor 150 | self.tensor_name[id(tensor.grad)] = '{}.grad'.format( 151 | self._get_tensor_name(tensor) 152 | ) 153 | stat = get_tensor_stat(tensor.grad) 154 | tensor_stats += stat 155 | 156 | self.device_tensor_stat[device] = tensor_stats 157 | 158 | self.device_mapping.clear() 159 | 160 | def print_stats(self, verbose: bool = False, target_device: Optional[torch.device] = None) -> None: 161 | # header 162 | show_reuse = verbose 163 | template_format = '{:<40s}{:>20s}{:>10s}' 164 | print(template_format.format('Element type', 'Size', 'Used MEM') ) 165 | for device, tensor_stats in self.device_tensor_stat.items(): 166 | # By default, if the target_device is not specified, 167 | # print tensors on all devices 168 | if target_device is not None and device != target_device: 169 | continue 170 | print('-' * LEN) 171 | print('Storage on {}'.format(device)) 172 | total_mem = 0 173 | total_numel = 0 174 | for stat in tensor_stats: 175 | name, size, numel, mem = stat 176 | if not 
show_reuse: 177 | name = name.split('(')[0] 178 | print(template_format.format( 179 | str(name), 180 | str(size), 181 | readable_size(mem), 182 | )) 183 | total_mem += mem 184 | total_numel += numel 185 | 186 | print('-'*LEN) 187 | print('Total Tensors: {} \tUsed Memory: {}'.format( 188 | total_numel, readable_size(total_mem), 189 | )) 190 | 191 | if device != torch.device('cpu'): 192 | with torch.cuda.device(device): 193 | memory_allocated = torch.cuda.memory_allocated() 194 | print('The allocated memory on {}: {}'.format( 195 | device, readable_size(memory_allocated), 196 | )) 197 | if memory_allocated != total_mem: 198 | print('Memory differs due to the matrix alignment or' 199 | ' invisible gradient buffer tensors') 200 | print('-'*LEN) 201 | 202 | def report(self, verbose: bool = False, device: Optional[torch.device] = None) -> None: 203 | """Interface for end-users to directly print the memory usage 204 | 205 | args: 206 | - verbose: flag to show tensor.storage reuse information 207 | - device: `torch.device` object, specify the target device 208 | to report detailed memory usage. It will print memory usage 209 | on all devices if not specified. Usually we only want to 210 | print the memory usage on CUDA devices. 211 | 212 | """ 213 | self.collect_tensor() 214 | self.get_stats() 215 | self.print_stats(verbose, target_device=device) 216 | -------------------------------------------------------------------------------- /pytorch_memlab/utils.py: -------------------------------------------------------------------------------- 1 | from math import isnan 2 | from calmsize import size as calmsize 3 | 4 | def readable_size(num_bytes: int) -> str: 5 | return '' if isnan(num_bytes) else '{:.2f}'.format(calmsize(num_bytes)) 6 | -------------------------------------------------------------------------------- /readme-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stonesjtu/pytorch_memlab/d7ed8e0f75abaaac197c0f9271085a40f2c9083b/readme-output.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | try: 4 | long_description = open('README.md').read() 5 | except FileNotFoundError: 6 | long_description = '' 7 | 8 | setup( 9 | name='pytorch-memlab', 10 | version='0.3.0', 11 | licence='MIT', 12 | description='A lab to do simple and accurate memory experiments on pytorch', 13 | long_description=long_description, 14 | long_description_content_type='text/markdown', 15 | classifiers=[ 16 | "Programming Language :: Python", 17 | "Topic :: Software Development :: Libraries :: Python Modules", 18 | ], 19 | keywords='pytorch memory profile', 20 | author='Kaiyu Shi', 21 | author_email='skyisno.1@gmail.com', 22 | url='https://github.com/Stonesjtu/pytorch_memlab', 23 | license='MIT', 24 | include_package_data=True, 25 | zip_safe=True, 26 | python_requires='>=3.8', 27 | install_requires=[ 28 | 'setuptools', 29 | 'calmsize', 30 | 'pandas', 31 | 'torch>=2.0', 32 | ], 33 | extras_require={ 34 | 'ipython': ['IPython>=0.13'], 35 | 'test': ['pytest'], 36 | }, 37 | packages=find_packages(), 38 | ) 39 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Stonesjtu/pytorch_memlab/d7ed8e0f75abaaac197c0f9271085a40f2c9083b/test/__init__.py -------------------------------------------------------------------------------- /test/test_courtesy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from pytorch_memlab import Courtesy, MemReporter 4 | 5 | def test_reporter(): 6 | linear = torch.nn.Linear(1024, 1024).cuda() 7 | inp = torch.Tensor(512, 1024).cuda() 8 | 9 | out = linear(inp).mean() 10 | out.backward() 11 | 12 | reporter = MemReporter(linear) 13 | reporter.report() 14 | ct = Courtesy() 15 | ct.yield_memory() 16 | print('gpu>>>>>>>>>>>>>>>>>>cpu') 17 | reporter.report() 18 | ct.restore() 19 | print('cpu>>>>>>>>>>>>>>>>>>gpu') 20 | reporter.report() 21 | 22 | def test_courtesy_context(): 23 | linear = torch.nn.Linear(1024, 1024).cuda() 24 | inp = torch.Tensor(512, 1024).cuda() 25 | 26 | out = linear(inp).mean() 27 | out.backward() 28 | 29 | reporter = MemReporter(linear) 30 | with Courtesy() as ct: 31 | print('gpu>>>>>>>>>>>>>>>>>>cpu') 32 | reporter.report() 33 | -------------------------------------------------------------------------------- /test/test_line_profiler.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import pytest 5 | import torch 6 | from pytorch_memlab import (LineProfiler, clear_global_line_profiler, profile, 7 | profile_every, set_target_gpu) 8 | 9 | 10 | def test_display(): 11 | 12 | def main(): 13 | linear = torch.nn.Linear(100, 100).cuda() 14 | part1() 15 | part2() 16 | 17 | def part1(): 18 | lstm = torch.nn.LSTM(1000, 1000).cuda() 19 | subpart11() 20 | 21 | def part2(): 22 | linear_2 = torch.nn.Linear(100, 100).cuda() 23 | linear_3 = torch.nn.Linear(100, 100).cuda() 24 | 25 | def subpart11(): 26 | linear = torch.nn.Linear(100, 100).cuda() 27 | linear_2 = torch.nn.Linear(100, 100).cuda() 28 | linear_3 = torch.nn.Linear(100, 100).cuda() 29 | 30 | with LineProfiler(subpart11, part2) as prof: 31 | main() 32 | 33 | s = str(prof.display()) # cast from line_records.RecordsDisplay 34 | assert re.search("## .*subpart11", s) 35 | assert "def subpart11():" in s 36 | assert re.search("## .*part2", s) 37 | assert "def part2():" in s 38 | 39 | 40 | def test_line_report(): 41 | 42 | def work(): 43 | # comment 44 | linear = torch.nn.Linear(100, 100).cuda() 45 | linear_2 = torch.nn.Linear(100, 100).cuda() 46 | linear_3 = torch.nn.Linear(100, 100).cuda() 47 | 48 | def work_3(): 49 | lstm = torch.nn.LSTM(1000, 1000).cuda() 50 | 51 | def work_2(): 52 | # comment 53 | linear = torch.nn.Linear(100, 100).cuda() 54 | linear_2 = torch.nn.Linear(100, 100).cuda() 55 | linear_3 = torch.nn.Linear(100, 100).cuda() 56 | work_3() 57 | 58 | line_profiler = LineProfiler(work, work_2) 59 | line_profiler.enable() 60 | 61 | work() 62 | work_2() 63 | 64 | line_profiler.disable() 65 | line_profiler.print_stats() 66 | 67 | 68 | def test_line_report_decorator(): 69 | clear_global_line_profiler() 70 | 71 | @profile_every(output_interval=3) 72 | def work(): 73 | # comment 74 | linear = torch.nn.Linear(100, 100).cuda() 75 | linear_2 = torch.nn.Linear(100, 100).cuda() 76 | linear_3 = torch.nn.Linear(100, 100).cuda() 77 | 78 | @profile_every(output_interval=1) 79 | def work2(): 80 | # comment 81 | linear = torch.nn.Linear(100, 100).cuda() 82 | linear_2 = torch.nn.Linear(100, 100).cuda() 83 | linear_3 = torch.nn.Linear(100, 100).cuda() 84 | 85 | work() 86 | work2() 87 | work() 88 | work() 89 | 
90 | 91 | def test_line_report_method(): 92 | clear_global_line_profiler() 93 | 94 | class Net(torch.nn.Module): 95 | def __init__(self): 96 | super().__init__() 97 | self.linear = torch.nn.Linear(100, 100).cuda() 98 | self.drop = torch.nn.Dropout(0.1) 99 | 100 | @profile_every(1) 101 | def forward(self, inp): 102 | return self.drop(self.linear(inp)) 103 | 104 | net = Net() 105 | inp = torch.Tensor(50, 100).cuda() 106 | net(inp) 107 | 108 | 109 | def test_line_report_profile(): 110 | clear_global_line_profiler() 111 | 112 | @profile 113 | def work(): 114 | # comment 115 | linear = torch.nn.Linear(100, 100).cuda() 116 | linear_2 = torch.nn.Linear(100, 100).cuda() 117 | linear_3 = torch.nn.Linear(100, 100).cuda() 118 | 119 | work() 120 | work() 121 | 122 | 123 | def test_line_report_profile_set_gpu(): 124 | clear_global_line_profiler() 125 | 126 | @profile 127 | def work(): 128 | # comment 129 | set_target_gpu(1) 130 | linear = torch.nn.Linear(100, 100).cuda(1) 131 | set_target_gpu(0) 132 | linear_2 = torch.nn.Linear(100, 100).cuda(0) 133 | linear_3 = torch.nn.Linear(100, 100).cuda(1) 134 | 135 | work() 136 | work() 137 | 138 | 139 | def test_line_report_profile_interrupt(): 140 | clear_global_line_profiler() 141 | 142 | @profile 143 | def work(): 144 | # comment 145 | linear = torch.nn.Linear(100, 100).cuda() 146 | linear_2 = torch.nn.Linear(100, 100).cuda() 147 | linear_3 = torch.nn.Linear(100, 100).cuda() 148 | 149 | @profile_every(1) 150 | def work2(): 151 | linear_2 = torch.nn.Linear(100, 100).cuda() 152 | linear_3 = torch.nn.Linear(100, 100).cuda() 153 | 154 | work() 155 | work2() 156 | raise KeyboardInterrupt 157 | -------------------------------------------------------------------------------- /test/test_mem_reporter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim 3 | from pytorch_memlab import MemReporter 4 | 5 | import pytest 6 | 7 | 8 | concentrate_mode = False 9 | 10 | def test_reporter(): 11 | linear = torch.nn.Linear(1024, 1024) 12 | inp = torch.Tensor(512, 1024) 13 | reporter = MemReporter(linear) 14 | 15 | out = linear(inp*(inp+3)).mean() 16 | reporter.report() 17 | out.backward() 18 | 19 | reporter.report() 20 | 21 | def test_reporter_without_model(): 22 | linear = torch.nn.Linear(1024, 1024) 23 | inp = torch.Tensor(512, 1024) 24 | reporter = MemReporter() 25 | 26 | out = linear(inp*(inp+3)).mean() 27 | reporter.report() 28 | out.backward() 29 | 30 | reporter.report() 31 | 32 | def test_reporter_sparse_tensor(): 33 | emb = torch.nn.Embedding(1024, 1024, sparse=True) 34 | inp = torch.arange(0, 128) 35 | reporter = MemReporter() 36 | 37 | out = emb(inp).mean() 38 | reporter.report() 39 | out.backward() 40 | b = emb.weight.grad * 2 41 | 42 | reporter.report() 43 | 44 | @pytest.mark.skipif(concentrate_mode, reason='concentrate') 45 | def test_reporter_tie_weight(): 46 | linear = torch.nn.Linear(1024, 1024) 47 | linear_2 = torch.nn.Linear(1024, 1024) 48 | linear_2.weight = linear.weight 49 | container = torch.nn.Sequential( 50 | linear, linear_2 51 | ) 52 | reporter = MemReporter(container) 53 | inp = torch.Tensor(512, 1024) 54 | 55 | out = container(inp).mean() 56 | out.backward() 57 | 58 | reporter = MemReporter(container) 59 | reporter.report() 60 | 61 | def test_reporter_with_optimizer(): 62 | linear = torch.nn.Linear(1024, 1024) 63 | inp = torch.Tensor(512, 1024) 64 | optimizer = torch.optim.Adam(linear.parameters()) 65 | # reporter = MemReporter(linear) 66 | 67 | out = 
linear(inp*(inp+3)*(inp+2)).mean() 68 | reporter = MemReporter(linear) 69 | reporter.report() 70 | out.backward() 71 | # reporter.report() 72 | optimizer.step() 73 | 74 | reporter.add_optimizer(optimizer) 75 | reporter.report() 76 | 77 | 78 | @pytest.mark.skipif(not torch.cuda.is_available(), reason='no CUDA') 79 | @pytest.mark.skipif(concentrate_mode, reason='concentrate') 80 | def test_reporter_LSTM(): 81 | lstm = torch.nn.LSTM(256, 256, num_layers=1).cuda() 82 | # lstm.flatten_parameters() 83 | inp = torch.Tensor(256, 256, 256).cuda() 84 | out, _ = lstm(inp) 85 | out.mean().backward() 86 | 87 | reporter = MemReporter(lstm) 88 | reporter.report() 89 | 90 | @pytest.mark.skipif(not torch.cuda.is_available(), reason='no CUDA') 91 | @pytest.mark.skipif(concentrate_mode, reason='concentrate') 92 | def test_reporter_device(): 93 | lstm_cpu = torch.nn.LSTM(256, 256) 94 | lstm = torch.nn.LSTM(256, 256, num_layers=1).cuda() 95 | # lstm.flatten_parameters() 96 | inp = torch.Tensor(256, 256, 256).cuda() 97 | out, _ = lstm(inp) 98 | out.mean().backward() 99 | 100 | reporter = MemReporter(lstm) 101 | reporter.report() 102 | reporter.report(device=torch.device('cuda:0')) 103 | --------------------------------------------------------------------------------
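
The modules listed above combine naturally: `profile_every` (from `pytorch_memlab/line_profiler/profile.py`) gives line-by-line CUDA memory statistics, while `MemReporter` (from `pytorch_memlab/mem_reporter.py`) gives a per-tensor snapshot. The sketch below is not part of the repository; it is a minimal usage example assembled from the docstrings and tests shown above, and it assumes the package is installed and a CUDA device is available (the line profiler reads `torch.cuda.memory_stats`).

```python
import torch
from pytorch_memlab import MemReporter, profile_every


@profile_every(output_interval=1)
def forward_backward(model, inp):
    # Every executed line of this function is profiled; stats are
    # printed after each call because output_interval=1.
    loss = model(inp).mean()
    loss.backward()
    return loss


# Assumed setup for illustration: a single linear layer on the current CUDA device.
model = torch.nn.Linear(1024, 1024).cuda()
inp = torch.randn(512, 1024).cuda()
forward_backward(model, inp)

# Per-tensor snapshot; parameter names are inferred from `model`.
reporter = MemReporter(model)
reporter.report()
```

Passing the module to `MemReporter` is optional, but it lets the report label parameter tensors by name instead of falling back to numbered placeholders.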