├── .github └── workflows │ ├── pypi-publish.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── LICENSE_kernprof.txt ├── README.md ├── demo.ipynb ├── pytorch_memlab ├── __init__.py ├── courtesy.py ├── line_profiler │ ├── __init__.py │ ├── extension.py │ ├── line_profiler.py │ ├── line_records.py │ └── profile.py ├── mem_reporter.py └── utils.py ├── readme-output.png ├── setup.py └── test ├── __init__.py ├── test_courtesy.py ├── test_line_profiler.py └── test_mem_reporter.py /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | 16 | jobs: 17 | deploy: 18 | 19 | runs-on: ubuntu-latest 20 | 21 | steps: 22 | - uses: actions/checkout@v4.1.7 23 | - name: Set up Python 24 | uses: actions/setup-python@v5.1.1 25 | with: 26 | python-version: '3.11' 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install ipython pandas 31 | pip install .[test] 32 | - name: Build package 33 | run: python setup.py bdist 34 | - name: Publish package 35 | uses: pypa/gh-action-pypi-publish@v1.9.0 36 | with: 37 | user: __token__ 38 | password: ${{ secrets.PYPI_API_TOKEN }} 39 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies and run the test suite on every push 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Test 10 | 11 | on: push 12 | 13 | jobs: 14 | test: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ['3.8', '3.9', '3.10', '3.11'] 19 | 20 | steps: 21 | - uses: actions/checkout@v4.1.7 22 | - name: Set up Python 23 | uses: actions/setup-python@v5.1.1 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install ipython pandas 30 | pip install .[test] 31 | - name: Build package 32 | run: python setup.py bdist 33 | - name: Test 34 | run: | 35 | python -c 'import pytorch_memlab' 36 | pytest test/test_mem_reporter.py 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #### joe made this: http://goel.io/joe 2 | 3 | #####=== IPythonNotebook ===##### 4 | # Temporary data 5 | .ipynb_checkpoints/ 6 | 7 | #####=== Python ===##### 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | #####=== JetBrains ===##### 69 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 70 | 71 | *.iml 72 | 73 | ## Directory-based project format: 74 | .idea/ 75 | # if you remove the above rule, at least ignore the following: 76 | 77 | # User-specific stuff: 78 | # .idea/workspace.xml 79 | # .idea/tasks.xml 80 | # .idea/dictionaries 81 | 82 | # Sensitive or high-churn files: 83 | # .idea/dataSources.ids 84 | # .idea/dataSources.xml 85 | # .idea/sqlDataSources.xml 86 | # .idea/dynamic.xml 87 | # .idea/uiDesigner.xml 88 | 89 | # Gradle: 90 | # .idea/gradle.xml 91 | # .idea/libraries 92 | 93 | # Mongo Explorer plugin: 94 | # .idea/mongoSettings.xml 95 | 96 | ## File-based project format: 97 | *.ipr 98 | *.iws 99 | 100 | ## Plugin-specific files: 101 | 102 | # IntelliJ 103 | /out/ 104 | 105 | # mpeltonen/sbt-idea plugin 106 | .idea_modules/ 107 | 108 | # JIRA plugin 109 | atlassian-ide-plugin.xml 110 | 111 | # Crashlytics plugin (for Android Studio and IntelliJ) 112 | com_crashlytics_export_strings.xml 113 | crashlytics.properties 114 | crashlytics-build.properties 115 | 116 | .ropeproject 117 | 118 | #####=== VSCode ===##### 119 | 120 | .vscode 121 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kaiyu Shi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software 
and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE_kernprof.txt: -------------------------------------------------------------------------------- 1 | This software is OSI Certified Open Source Software. 2 | OSI Certified is a certification mark of the Open Source Initiative. 3 | 4 | Copyright (c) 2008, Enthought, Inc. 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | * Neither the name of Enthought, Inc. nor the names of its contributors may 16 | be used to endorse or promote products derived from this software without 17 | specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 23 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pytorch_memlab 2 | ====== 3 | [![Test](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/test.yml/badge.svg)](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/test.yml) 4 | [![Upload Python Package](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/pypi-publish.yml/badge.svg)](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/pypi-publish.yml) 5 | ![PyPI](https://img.shields.io/pypi/v/pytorch_memlab.svg) 6 | [![CodeQL: Python](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/github-code-scanning/codeql/badge.svg)](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/github-code-scanning/codeql) 7 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/pytorch_memlab.svg) 8 | 9 | A simple and accurate **CUDA** memory management laboratory for pytorch; 10 | it consists of several parts covering different aspects of CUDA memory: 11 | 12 | - Features: 13 | 14 | - Memory Profiler: A `line_profiler`-style CUDA memory profiler with a simple API. 15 | - Memory Reporter: A reporter to inspect tensors occupying the CUDA memory. 16 | - Courtesy: An interesting feature to temporarily move all the CUDA tensors into 17 | CPU memory for courtesy, and of course transfer them back afterwards. 18 | - IPython support through `%mlrun`/`%%mlrun` line/cell magic 19 | commands. 20 | 21 | 22 | - Table of Contents 23 | * [Installation](#installation) 24 | * [User-Doc](#user-doc) 25 | + [Memory Profiler](#memory-profiler) 26 | + [IPython support](#ipython-support) 27 | + [Memory Reporter](#memory-reporter) 28 | + [Courtesy](#courtesy) 29 | + [ACK](#ack) 30 | * [CHANGES](#changes) 31 | 32 | Installation 33 | ----- 34 | 35 | - Released version: 36 | ```bash 37 | pip install pytorch_memlab 38 | ``` 39 | 40 | - Newest version: 41 | ```bash 42 | pip install git+https://github.com/stonesjtu/pytorch_memlab 43 | ``` 44 | 45 | What it's for 46 | ----- 47 | 48 | Out-Of-Memory errors in pytorch happen frequently, for newcomers and 49 | experienced programmers alike. A common reason is that most people don't really 50 | learn the underlying memory management philosophy of pytorch and GPUs. 51 | They write memory-inefficient code and complain about pytorch eating too 52 | much CUDA memory. 53 | 54 | In this repo, I'm going to share some useful tools to help debug OOM errors, or 55 | to inspect the underlying mechanism for anyone who is interested. 56 | 57 | 58 | User-Doc 59 | ----- 60 | 61 | ### Memory Profiler 62 | 63 | The memory profiler is a modification of python's `line_profiler`; it gives 64 | the memory usage info for each line of code in the specified function/method. 65 | 66 | #### Sample: 67 | 68 | ```python 69 | import torch 70 | from pytorch_memlab import LineProfiler 71 | 72 | def inner(): 73 | torch.nn.Linear(100, 100).cuda() 74 | 75 | def outer(): 76 | linear = torch.nn.Linear(100, 100).cuda() 77 | linear2 = torch.nn.Linear(100, 100).cuda() 78 | inner() 79 | 80 | with LineProfiler(outer, inner) as prof: 81 | outer() 82 | prof.display() 83 | ``` 84 | 85 | After the script finishes or is interrupted by the keyboard, it gives the following 86 | profiling info if you're in a Jupyter notebook: 87 | 88 | ![Line profiler output in a Jupyter notebook](readme-output.png)

87 | 88 | or the following info if you're in a text-only terminal: 89 | 90 | ``` 91 | ## outer 92 | 93 | active_bytes reserved_bytes line code 94 | all all 95 | peak peak 96 | 0.00B 0.00B 7 def outer(): 97 | 40.00K 2.00M 8 linear = torch.nn.Linear(100, 100).cuda() 98 | 80.00K 2.00M 9 linear2 = torch.nn.Linear(100, 100).cuda() 99 | 120.00K 2.00M 10 inner() 100 | 101 | 102 | ## inner 103 | 104 | active_bytes reserved_bytes line code 105 | all all 106 | peak peak 107 | 80.00K 2.00M 4 def inner(): 108 | 120.00K 2.00M 5 torch.nn.Linear(100, 100).cuda() 109 | ``` 110 | 111 | An explanation of what each column means can be found in the [Torch documentation](https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats). The name of any field from `memory_stats()` 112 | can be passed to `display()` to view the corresponding statistic. 113 | 114 | If you use the `profile` decorator, the memory statistics are collected during 115 | multiple runs and only the maximum one is displayed at the end. 116 | We also provide a more flexible API called `profile_every` which prints the 117 | memory info every *N* executions of the function. You can simply replace 118 | `@profile` with `@profile_every(1)` to print the memory usage for each 119 | execution. 120 | 121 | The `@profile` and `@profile_every` decorators can also be mixed to gain finer control 122 | over the debugging granularity. 123 | 124 | - You can also add the decorator to a method of a module class: 125 | 126 | ```python 127 | class Net(torch.nn.Module): 128 | def __init__(self): 129 | super().__init__() 130 | @profile 131 | def forward(self, inp): 132 | return inp # do_something 133 | ``` 134 | 135 | - The *Line Profiler* profiles the memory usage of CUDA device 0 by default. 136 | You may want to switch the device to profile with `set_target_gpu`. The GPU 137 | selection is global, which means you have to remember which GPU you are 138 | profiling on during the whole process: 139 | 140 | ```python 141 | import torch 142 | from pytorch_memlab import profile, set_target_gpu 143 | @profile 144 | def func(): 145 | net1 = torch.nn.Linear(1024, 1024).cuda(0) 146 | set_target_gpu(1) 147 | net2 = torch.nn.Linear(1024, 1024).cuda(1) 148 | set_target_gpu(0) 149 | net3 = torch.nn.Linear(1024, 1024).cuda(0) 150 | 151 | func() 152 | ``` 153 | 154 | 155 | More samples can be found in `test/test_line_profiler.py` 156 | 157 | ### IPython support 158 | 159 | Make sure you have `IPython` installed, or have installed `pytorch-memlab` with 160 | `pip install pytorch-memlab[ipython]`. 161 | 162 | First, load the extension: 163 | 164 | ```python 165 | %load_ext pytorch_memlab 166 | ``` 167 | 168 | This makes the `%mlrun` and `%%mlrun` line/cell magics available for use. For 169 | example, in a new cell run the following to profile an entire cell: 170 | 171 | ```python 172 | %%mlrun -f func 173 | import torch 174 | from pytorch_memlab import profile, set_target_gpu 175 | def func(): 176 | net1 = torch.nn.Linear(1024, 1024).cuda(0) 177 | set_target_gpu(1) 178 | net2 = torch.nn.Linear(1024, 1024).cuda(1) 179 | set_target_gpu(0) 180 | net3 = torch.nn.Linear(1024, 1024).cuda(0) 181 | ``` 182 | 183 | Or you can invoke the profiler for a single statement via the `%mlrun` line 184 | magic: 185 | 186 | ```python 187 | import torch 188 | from pytorch_memlab import profile, set_target_gpu 189 | def func(input_size): 190 | net1 = torch.nn.Linear(input_size, 1024).cuda(0) 191 | %mlrun -f func func(2048) 192 | ``` 193 | 194 | See `%mlrun?` for help on what arguments are supported. 
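For instance, here is a small sketch that combines several of these options in a single line-magic call (the flags are the ones defined in the extension and demonstrated in the demo notebook; `func` stands for any function you have defined and registered with `-f`):

```python
# Quietly profile func() on GPU 0, dump the text report to profile.log,
# and keep the returned LineProfiler object for later inspection.
profiler = %mlrun -q -r -g 0 -T profile.log -f func func()
profiler.display()
```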
You can set the GPU 195 | device to profile, dump profiling results to a file, and return the 196 | `LineProfiler` object for post-profile inspection. 197 | 198 | Find out more by checking out the [demo Jupyter notebook](./demo.ipynb) 199 | 200 | 201 | ### Memory Reporter 202 | 203 | As the *Memory Profiler* only gives the overall memory usage information by line, 204 | lower-level memory usage information can be obtained with the *Memory Reporter*. 205 | 206 | The *Memory Reporter* iterates over all the `Tensor` objects and inspects the underlying 207 | `UntypedStorage` (previously `Storage`) objects to report the actual memory usage instead of the apparent 208 | `Tensor.size`. 209 | 210 | > See [UntypedStorage](https://pytorch.org/docs/stable/storage.html#torch.UntypedStorage) for detailed 211 | > information. 212 | 213 | #### Sample 214 | 215 | - A minimal one: 216 | 217 | ```python 218 | import torch 219 | from pytorch_memlab import MemReporter 220 | linear = torch.nn.Linear(1024, 1024).cuda() 221 | reporter = MemReporter() 222 | reporter.report() 223 | ``` 224 | outputs: 225 | ``` 226 | Element type Size Used MEM 227 | ------------------------------------------------------------------------------- 228 | Storage on cuda:0 229 | Parameter0 (1024, 1024) 4.00M 230 | Parameter1 (1024,) 4.00K 231 | ------------------------------------------------------------------------------- 232 | Total Tensors: 1049600 Used Memory: 4.00M 233 | The allocated memory on cuda:0: 4.00M 234 | ------------------------------------------------------------------------------- 235 | ``` 236 | 237 | - You can also pass in a model object for automatic name inference. 238 | 239 | ```python 240 | import torch 241 | from pytorch_memlab import MemReporter 242 | 243 | linear = torch.nn.Linear(1024, 1024).cuda() 244 | inp = torch.Tensor(512, 1024).cuda() 245 | # pass in a model to automatically infer the tensor names 246 | reporter = MemReporter(linear) 247 | out = linear(inp).mean() 248 | print('========= before backward =========') 249 | reporter.report() 250 | out.backward() 251 | print('========= after backward =========') 252 | reporter.report() 253 | ``` 254 | 255 | outputs: 256 | ``` 257 | ========= before backward ========= 258 | Element type Size Used MEM 259 | ------------------------------------------------------------------------------- 260 | Storage on cuda:0 261 | weight (1024, 1024) 4.00M 262 | bias (1024,) 4.00K 263 | Tensor0 (512, 1024) 2.00M 264 | Tensor1 (1,) 512.00B 265 | ------------------------------------------------------------------------------- 266 | Total Tensors: 1573889 Used Memory: 6.00M 267 | The allocated memory on cuda:0: 6.00M 268 | ------------------------------------------------------------------------------- 269 | ========= after backward ========= 270 | Element type Size Used MEM 271 | ------------------------------------------------------------------------------- 272 | Storage on cuda:0 273 | weight (1024, 1024) 4.00M 274 | weight.grad (1024, 1024) 4.00M 275 | bias (1024,) 4.00K 276 | bias.grad (1024,) 4.00K 277 | Tensor0 (512, 1024) 2.00M 278 | Tensor1 (1,) 512.00B 279 | ------------------------------------------------------------------------------- 280 | Total Tensors: 2623489 Used Memory: 10.01M 281 | The allocated memory on cuda:0: 10.01M 282 | ------------------------------------------------------------------------------- 283 | ``` 284 | 285 | 286 | - The reporter automatically deals with shared weight parameters: 287 | 288 | ```python 289 | import torch 290 | from pytorch_memlab import 
MemReporter 291 | 292 | linear = torch.nn.Linear(1024, 1024).cuda() 293 | linear2 = torch.nn.Linear(1024, 1024).cuda() 294 | linear2.weight = linear.weight 295 | container = torch.nn.Sequential( 296 | linear, linear2 297 | ) 298 | inp = torch.Tensor(512, 1024).cuda() 299 | # pass in a model to automatically infer the tensor names 300 | 301 | out = container(inp).mean() 302 | out.backward() 303 | 304 | # verbose shows how storage is shared across multiple Tensors 305 | reporter = MemReporter(container) 306 | reporter.report(verbose=True) 307 | ``` 308 | 309 | outputs: 310 | ``` 311 | Element type Size Used MEM 312 | ------------------------------------------------------------------------------- 313 | Storage on cuda:0 314 | 0.weight (1024, 1024) 4.00M 315 | 0.weight.grad (1024, 1024) 4.00M 316 | 0.bias (1024,) 4.00K 317 | 0.bias.grad (1024,) 4.00K 318 | 1.bias (1024,) 4.00K 319 | 1.bias.grad (1024,) 4.00K 320 | Tensor0 (512, 1024) 2.00M 321 | Tensor1 (1,) 512.00B 322 | ------------------------------------------------------------------------------- 323 | Total Tensors: 2625537 Used Memory: 10.02M 324 | The allocated memory on cuda:0: 10.02M 325 | ------------------------------------------------------------------------------- 326 | ``` 327 | 328 | - You can better understand the memory layout for more complicated module: 329 | 330 | ```python 331 | import torch 332 | from pytorch_memlab import MemReporter 333 | 334 | lstm = torch.nn.LSTM(1024, 1024).cuda() 335 | reporter = MemReporter(lstm) 336 | reporter.report(verbose=True) 337 | inp = torch.Tensor(10, 10, 1024).cuda() 338 | out, _ = lstm(inp) 339 | out.mean().backward() 340 | reporter.report(verbose=True) 341 | ``` 342 | 343 | As shown below, the `(->)` indicates the re-use of the same storage back-end 344 | outputs: 345 | ``` 346 | Element type Size Used MEM 347 | ------------------------------------------------------------------------------- 348 | Storage on cuda:0 349 | weight_ih_l0 (4096, 1024) 32.03M 350 | weight_hh_l0(->weight_ih_l0) (4096, 1024) 0.00B 351 | bias_ih_l0(->weight_ih_l0) (4096,) 0.00B 352 | bias_hh_l0(->weight_ih_l0) (4096,) 0.00B 353 | Tensor0 (10, 10, 1024) 400.00K 354 | ------------------------------------------------------------------------------- 355 | Total Tensors: 8499200 Used Memory: 32.42M 356 | The allocated memory on cuda:0: 32.52M 357 | Memory differs due to the matrix alignment 358 | ------------------------------------------------------------------------------- 359 | Element type Size Used MEM 360 | ------------------------------------------------------------------------------- 361 | Storage on cuda:0 362 | weight_ih_l0 (4096, 1024) 32.03M 363 | weight_ih_l0.grad (4096, 1024) 32.03M 364 | weight_hh_l0(->weight_ih_l0) (4096, 1024) 0.00B 365 | weight_hh_l0.grad(->weight_ih_l0.grad) (4096, 1024) 0.00B 366 | bias_ih_l0(->weight_ih_l0) (4096,) 0.00B 367 | bias_ih_l0.grad(->weight_ih_l0.grad) (4096,) 0.00B 368 | bias_hh_l0(->weight_ih_l0) (4096,) 0.00B 369 | bias_hh_l0.grad(->weight_ih_l0.grad) (4096,) 0.00B 370 | Tensor0 (10, 10, 1024) 400.00K 371 | Tensor1 (10, 10, 1024) 400.00K 372 | Tensor2 (1, 10, 1024) 40.00K 373 | Tensor3 (1, 10, 1024) 40.00K 374 | ------------------------------------------------------------------------------- 375 | Total Tensors: 17018880 Used Memory: 64.92M 376 | The allocated memory on cuda:0: 65.11M 377 | Memory differs due to the matrix alignment 378 | ------------------------------------------------------------------------------- 379 | ``` 380 | 381 | NOTICE: 382 | > When 
forwarding with `grad_mode=True`, pytorch maintains tensor buffers for 383 | > future back-propagation at the C++ level, so these buffers are not 384 | > visible to python and are not collected by the reporter. But if you store these intermediate results 385 | > as python variables, then they will be reported. 386 | 387 | - You can also filter the device to report on by passing extra arguments: 388 | `report(device=torch.device(0))` 389 | 390 | - A failed example due to pytorch's C-side tensor buffers: 391 | 392 | In the following example, a temp buffer is created at `inp * (inp + 2)` to 393 | store both `inp` and `inp + 2`. Unfortunately python only knows about the existence 394 | of `inp`, so *2M* of memory is lost, which is the same size as the Tensor `inp`. 395 | 396 | ```python 397 | import torch 398 | from pytorch_memlab import MemReporter 399 | 400 | linear = torch.nn.Linear(1024, 1024).cuda() 401 | inp = torch.Tensor(512, 1024).cuda() 402 | # pass in a model to automatically infer the tensor names 403 | reporter = MemReporter(linear) 404 | out = linear(inp * (inp + 2)).mean() 405 | reporter.report() 406 | ``` 407 | 408 | outputs: 409 | ``` 410 | Element type Size Used MEM 411 | ------------------------------------------------------------------------------- 412 | Storage on cuda:0 413 | weight (1024, 1024) 4.00M 414 | bias (1024,) 4.00K 415 | Tensor0 (512, 1024) 2.00M 416 | Tensor1 (1,) 512.00B 417 | ------------------------------------------------------------------------------- 418 | Total Tensors: 1573889 Used Memory: 6.00M 419 | The allocated memory on cuda:0: 8.00M 420 | Memory differs due to the matrix alignment or invisible gradient buffer tensors 421 | ------------------------------------------------------------------------------- 422 | ``` 423 | 424 | 425 | ### Courtesy 426 | 427 | Sometimes people would like to preempt your running task, but you don't want 428 | to save a checkpoint and then load it back. Actually, all they need is GPU resources 429 | (CPU resources and CPU memory are usually spare in GPU clusters), so 430 | you can move all your workspaces from GPU to CPU and then halt your task until 431 | a restart signal is triggered, instead of saving & loading checkpoints and 432 | bootstrapping from scratch. 433 | 434 | Still under development, but you can already have fun with: 435 | ```python 436 | from pytorch_memlab import Courtesy 437 | 438 | iamcourtesy = Courtesy() 439 | for i in range(num_iteration): 440 | if something_happens: 441 | iamcourtesy.yield_memory() 442 | wait_for_restart_signal() 443 | iamcourtesy.restore() 444 | ``` 445 | 446 | #### Known Issues 447 | 448 | - As stated above in *Memory Reporter*, intermediate tensors are not covered 449 | properly, so you may want to insert such courtesy logic after `backward` or 450 | before `forward`. 451 | - Currently the CUDA context of pytorch requires about 1 GB of CUDA memory, which 452 | means that even if all Tensors are on CPU, 1GB of CUDA memory is wasted, :-(. However 453 | it's still under investigation whether I can fully destroy the context and then 454 | re-init it. 455 | 456 | 457 | ### ACK 458 | 459 | I suffered a lot debugging weird memory usage during my 3 years of developing 460 | efficient Deep Learning models, and of course learned a lot from the great 461 | open source community. 
462 | 463 | ## CHANGES 464 | 465 | 466 | ##### 0.3.0 (2023-7-29) 467 | - Fix `DataFrame.drop` for pandas 1.5+ 468 | ##### 0.2.4 (2021-10-28) 469 | - Fix colab error (#35) 470 | - Support python3.8 (#38) 471 | - Support sparse tensor (#30) 472 | ##### 0.2.3 (2020-12-01) 473 | - Fix name mapping in `MemReporter` (#24) 474 | - Fix reporter without model input (#22 #25) 475 | ##### 0.2.2 (2020-10-23) 476 | - Fix memory leak in `MemReporter` 477 | ##### 0.2.1 (2020-06-18) 478 | - Fix `line_profiler` not found 479 | ##### 0.2.0 (2020-06-15) 480 | - Add jupyter notebook figure and ipython support 481 | ##### 0.1.0 (2020-04-17) 482 | - Add ipython magic support (#8) 483 | ##### 0.0.4 (2019-10-08) 484 | - Add gpu switch for line-profiler(#2) 485 | - Add device filter for reporter 486 | ##### 0.0.3 (2019-06-15) 487 | - Install dependency for pip installation 488 | ##### 0.0.2 (2019-06-04) 489 | - Fix statistics shift in loop 490 | ##### 0.0.1 (2019-05-28) 491 | - initial release 492 | 493 | ## Star History 494 | 495 | [![Star History Chart](https://api.star-history.com/svg?repos=stonesjtu/pytorch_memlab&type=Date)](https://star-history.com/#stonesjtu/pytorch_memlab&Date) 496 | -------------------------------------------------------------------------------- /demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Once installed, you need to load the `pytorch_memlab` IPython extensions:" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%load_ext pytorch_memlab" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "One magic is provided, `mlrun` which can act either as a line magic `%mlrun`, or as a cell magic `%%mlrun`" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "%%mlrun?" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "First we need some torch code to profile:" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import torch\n", 49 | "\n", 50 | "def x():\n", 51 | " torch.nn.Linear(100, 100).cuda()\n", 52 | " \n", 53 | "def y(gpu=0):\n", 54 | " torch.nn.Linear(1000, 100).cuda(device=gpu)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "We can profile multiple functions at the same type by repeatedly specifying `-f`" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/html": [ 72 | "

[HTML table rendering of the profiler output for "x" and "y"; the same stats appear in the "text/plain" output below.]
" 155 | ], 156 | "text/plain": [ 157 | "## x\n", 158 | "\n", 159 | "active_bytes reserved_bytes line code \n", 160 | " all all \n", 161 | " peak peak \n", 162 | " 0.00B 0.00B 3 def x(): \n", 163 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() \n", 164 | "\n", 165 | "\n", 166 | "## y\n", 167 | "\n", 168 | "active_bytes reserved_bytes line code \n", 169 | " all all \n", 170 | " peak peak \n", 171 | " 0.00B 2.00M 6 def y(gpu=0): \n", 172 | " 391.50K 2.00M 7 torch.nn.Linear(1000, 100).cuda(device=gpu) " 173 | ] 174 | }, 175 | "execution_count": 4, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "%%mlrun -f x -f y\n", 182 | "\n", 183 | "x()\n", 184 | "y()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "You can alos profile with the `%mlrun` line magic" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 5, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/html": [ 202 | "

[HTML table rendering of the profiler output for "z"; the same stats appear in the "text/plain" output below.]
" 243 | ], 244 | "text/plain": [ 245 | "## z\n", 246 | "\n", 247 | "active_bytes reserved_bytes line code \n", 248 | " all all \n", 249 | " peak peak \n", 250 | " 0.00B 0.00B 1 def z(): \n", 251 | " 40.00K 2.00M 2 torch.nn.Linear(100, 100).cuda() " 252 | ] 253 | }, 254 | "execution_count": 5, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "def z():\n", 261 | " torch.nn.Linear(100, 100).cuda()\n", 262 | "%mlrun -f z z()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "You can specify which GPU you wish to profile using `-g`:" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 6, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/html": [ 280 | "

[HTML table rendering of the profiler output for "x" and "y"; the same stats appear in the "text/plain" output below.]
" 363 | ], 364 | "text/plain": [ 365 | "## x\n", 366 | "\n", 367 | "active_bytes reserved_bytes line code \n", 368 | " all all \n", 369 | " peak peak \n", 370 | " 0.00B 0.00B 3 def x(): \n", 371 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() \n", 372 | "\n", 373 | "\n", 374 | "## y\n", 375 | "\n", 376 | "active_bytes reserved_bytes line code \n", 377 | " all all \n", 378 | " peak peak \n", 379 | " 0.00B 2.00M 6 def y(gpu=0): \n", 380 | " 391.50K 2.00M 7 torch.nn.Linear(1000, 100).cuda(device=gpu) " 381 | ] 382 | }, 383 | "execution_count": 6, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "%%mlrun -f x -f y -g 0 y\n", 390 | "\n", 391 | "x()\n", 392 | "y(gpu=0)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "You can get a handle on the `LineProfiler` object using `-r`" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 7, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "text/html": [ 410 | "

[HTML table rendering of the profiler output for "x"; the same stats appear in the "text/plain" output below.]
" 451 | ], 452 | "text/plain": [ 453 | "## x\n", 454 | "\n", 455 | "active_bytes reserved_bytes line code \n", 456 | " all all \n", 457 | " peak peak \n", 458 | " 0.00B 0.00B 3 def x(): \n", 459 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() " 460 | ] 461 | }, 462 | "execution_count": 7, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "profiler = %mlrun -q -r -f x x()\n", 469 | "profiler.display()" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "You can dump stats out to a file using `-T`:" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 8, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/html": [ 487 | "

[HTML table rendering of the profiler output for "x"; the same stats appear in the "text/plain" output below.]
" 528 | ], 529 | "text/plain": [ 530 | "## x\n", 531 | "\n", 532 | "active_bytes reserved_bytes line code \n", 533 | " all all \n", 534 | " peak peak \n", 535 | " 0.00B 0.00B 3 def x(): \n", 536 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() " 537 | ] 538 | }, 539 | "execution_count": 8, 540 | "metadata": {}, 541 | "output_type": "execute_result" 542 | } 543 | ], 544 | "source": [ 545 | "%mlrun -q -T profile.log -f x x()" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 9, 551 | "metadata": {}, 552 | "outputs": [ 553 | { 554 | "name": "stdout", 555 | "output_type": "stream", 556 | "text": [ 557 | "## x\r\n", 558 | "\r\n", 559 | "active_bytes reserved_bytes line code \r\n", 560 | " all all \r\n", 561 | " peak peak \r\n", 562 | " 0.00B 0.00B 3 def x(): \r\n", 563 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() \r\n" 564 | ] 565 | } 566 | ], 567 | "source": [ 568 | "!head profile.log" 569 | ] 570 | } 571 | ], 572 | "metadata": { 573 | "kernelspec": { 574 | "display_name": "Python 3", 575 | "language": "python", 576 | "name": "python3" 577 | }, 578 | "language_info": { 579 | "codemirror_mode": { 580 | "name": "ipython", 581 | "version": 3 582 | }, 583 | "file_extension": ".py", 584 | "mimetype": "text/x-python", 585 | "name": "python", 586 | "nbconvert_exporter": "python", 587 | "pygments_lexer": "ipython3", 588 | "version": "3.7.3" 589 | } 590 | }, 591 | "nbformat": 4, 592 | "nbformat_minor": 4 593 | } 594 | -------------------------------------------------------------------------------- /pytorch_memlab/__init__.py: -------------------------------------------------------------------------------- 1 | from .courtesy import Courtesy 2 | from .mem_reporter import MemReporter 3 | from .line_profiler import LineProfiler, profile, profile_every, set_target_gpu, clear_global_line_profiler 4 | try: 5 | from .line_profiler.extension import load_ipython_extension 6 | except ImportError: 7 | pass 8 | -------------------------------------------------------------------------------- /pytorch_memlab/courtesy.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import torch 3 | 4 | 5 | class Courtesy(): 6 | """A class to yield CUDA memory at any time in the training 7 | 8 | The whole save/load is a bit tricky because all data transfer should 9 | be inplace operation and gradient agnostic 10 | """ 11 | def __init__(self): 12 | self.loc_map = {} 13 | 14 | def yield_memory(self): 15 | """Transfer all the CUDA tensors into CPU memory""" 16 | tensors = [obj for obj in gc.get_objects() if isinstance(obj, torch.Tensor)] 17 | for t in tensors: 18 | # in case tensors appear more than once 19 | if t not in self.loc_map: 20 | self.loc_map[t] = t.device 21 | 22 | t.data = t.data.cpu() 23 | # parameters have one more wrapper for .data 24 | if isinstance(t, torch.nn.Parameter): 25 | # sometimes Parameter does not have grad 26 | try: 27 | t.grad.data = t.grad.cpu() 28 | finally: 29 | pass 30 | torch.cuda.empty_cache() 31 | 32 | def restore(self): 33 | """Restore the tensors into original CUDA devices""" 34 | for t, device in self.loc_map.items(): 35 | t.data = t.data.to(device) 36 | if isinstance(t, torch.nn.Parameter): 37 | # sometimes Parameter does not have grad 38 | try: 39 | t.grad = t.grad.to(device) 40 | finally: 41 | pass 42 | self.loc_map.clear() 43 | 44 | def __enter__(self): 45 | self.yield_memory() 46 | return self 47 | 48 | def __exit__(self, *args): 49 | self.restore() 50 | 
-------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/__init__.py: -------------------------------------------------------------------------------- 1 | from .line_profiler import LineProfiler 2 | from .profile import profile, profile_every, set_target_gpu, clear_global_line_profiler 3 | -------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/extension.py: -------------------------------------------------------------------------------- 1 | """IPython & notebook extension interface""" 2 | from IPython.core.magic import ( 3 | Magics, 4 | magics_class, 5 | line_cell_magic, 6 | needs_local_scope, 7 | ) 8 | from IPython.core.magic_arguments import magic_arguments, argument, parse_argstring 9 | 10 | from .line_profiler import LineProfiler, DEFAULT_COLUMNS 11 | 12 | 13 | class UsageError(Exception): 14 | pass 15 | 16 | 17 | @magics_class 18 | class MemlabMagics(Magics): 19 | @magic_arguments() 20 | @argument('--function', 21 | '-f', 22 | metavar='FUNC', 23 | action='append', 24 | default=[], 25 | help="""Function to profile. Can be specified multiple times to profile multiple 26 | functions""") 27 | @argument('--column', 28 | '-c', 29 | metavar='COLS', 30 | action='append', 31 | default=[], 32 | help="""Columns to display. Can be specified multiple times to profile multiple 33 | functions. See the Torch CUDA spec at 34 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats for details.""") 35 | @argument('-D', 36 | '--no_default_columns', 37 | action='store_true', 38 | help='Hide the default columns of ' + ", ".join(DEFAULT_COLUMNS)) 39 | @argument('-r', 40 | '--return-profiler', 41 | action='store_true', 42 | help='Return LineProfiler object for introspection') 43 | @argument('-g', 44 | '--gpu', 45 | metavar='GPU_ID', 46 | default=0, 47 | type=int, 48 | help='Profile memory usage of this GPU') 49 | @argument('-q', 50 | '--quiet', 51 | action='store_true', 52 | help='Don\'t print out profile results') 53 | @argument('statement', 54 | nargs='*', 55 | default=None, 56 | help='Code to run under profiler. You can omit this in cell magic mode.') 57 | @argument('-T', 58 | '--dump-profile', 59 | metavar='OUTPUT', 60 | help='Dump text profile output to file') 61 | @line_cell_magic 62 | @needs_local_scope 63 | def mlrun(self, line=None, cell=None, local_ns=None): 64 | """Execute a statement/cell under the PyTorch Memlab profiler to collect CUDA memory 65 | allocation information on a per-line basis. 
66 | """ 67 | args = parse_argstring(self.mlrun, line) 68 | global_ns = self.shell.user_global_ns 69 | 70 | funcs = [] 71 | for name in args.function: 72 | try: 73 | fn = eval(name, global_ns, local_ns) 74 | funcs.append(fn) 75 | except NameError as e: 76 | raise UsageError('Could not find function {!r}.\n{}: {}'.format( 77 | name, e.__class__.__name__, e) 78 | ) 79 | profiler = LineProfiler(*funcs, target_gpu=args.gpu) 80 | if cell is not None: 81 | code = cell 82 | else: 83 | assert args.statement is not None 84 | code = '\n'.join(args.statement) 85 | with profiler: 86 | exec(compile(code, filename='', mode='exec'), local_ns) 87 | 88 | if args.dump_profile is not None: 89 | with open(args.dump_profile, 'w') as f: 90 | profiler.print_stats(stream=f) 91 | 92 | if args.return_profiler: 93 | return profiler 94 | else: 95 | defaults = [] if args.no_default_columns else list(DEFAULT_COLUMNS) 96 | return profiler.display(columns=defaults + args.column) 97 | 98 | 99 | def load_ipython_extension(ipython): 100 | ipython.register_magics(MemlabMagics) 101 | -------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/line_profiler.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import sys 3 | from types import FrameType 4 | import warnings 5 | from typing import Any, Callable, Optional, Tuple 6 | 7 | import torch 8 | 9 | from .line_records import LineRecords 10 | 11 | # Seaborn's `muted` color cycle 12 | DEFAULT_COLUMNS = ('active_bytes.all.peak', 'reserved_bytes.all.peak') 13 | 14 | 15 | class LineProfiler: 16 | """Profile the CUDA memory usage info for each line in pytorch 17 | 18 | This class registers callbacks for added functions to profiling them line 19 | by line, and collects all the statistics in CUDA memory. Usually you may 20 | want to use simpler wrapper below `profile` or `profile_every`. 21 | 22 | The CUDA memory is collected only on the **current** cuda device. 23 | 24 | Usage: 25 | ```python 26 | with LineProfiler(func) as lp: 27 | func 28 | lp.display() 29 | 30 | ```python 31 | lp = LineProfiler(func) 32 | lp.enable() 33 | func() 34 | lp.disable() 35 | lp.display() 36 | ``` 37 | """ 38 | 39 | def __init__(self, *functions: Callable, target_gpu: int = 0): 40 | self.target_gpu = target_gpu 41 | self._code_infos = {} 42 | self._raw_line_records = [] 43 | self.enabled = False 44 | for func in functions: 45 | self.add_function(func) 46 | 47 | def add_function(self, func: Callable) -> None: 48 | """ Record line profiling information for the given Python function. 
49 | """ 50 | try: 51 | # We need to use the hash here because pandas will later expect something 52 | # orderable for its index 53 | code_hash = hash(func.__code__) 54 | except AttributeError: 55 | warnings.warn( 56 | "Could not extract a code object for the object %r" % (func,)) 57 | return 58 | if code_hash not in self._code_infos: 59 | first_line = inspect.getsourcelines(func)[1] 60 | self._code_infos[code_hash] = { 61 | 'func': func, 62 | 'first_line': first_line, 63 | 'prev_line': first_line, 64 | 'prev_record': -1, 65 | } 66 | 67 | # re-register the newer trace_callback 68 | if self.enabled: 69 | self.register_callback() 70 | 71 | def __enter__(self): 72 | self.enable() 73 | return self 74 | 75 | def __exit__(self, exc_type, exc_val, exc_tb): 76 | self.disable() 77 | 78 | def register_callback(self): 79 | """Register the trace_callback only on demand""" 80 | if self._code_infos: 81 | sys.settrace(self._trace_callback) 82 | 83 | def _reset_cuda_stats(self): 84 | torch.cuda.reset_peak_memory_stats() 85 | torch.cuda.reset_accumulated_memory_stats() 86 | 87 | def enable(self): 88 | """Enable the profiler and register trace callback""" 89 | if not torch.cuda.is_available(): 90 | print('Could not find CUDA deivces and reset CUDA stats and cache') 91 | return 92 | torch.cuda.empty_cache() 93 | self._reset_cuda_stats() 94 | self.enabled = True 95 | self.register_callback() 96 | 97 | def disable(self): 98 | """Disable the profiler and clear trace callback""" 99 | self.enabled = False 100 | sys.settrace(None) 101 | 102 | def clear(self): 103 | """Clear the state of the line profiler""" 104 | self._code_infos = {} 105 | self._raw_line_records = [] 106 | 107 | def _trace_callback(self, frame: FrameType, event: str, _unused_arg: Tuple[Any, ...]): 108 | """Trace the execution of python line-by-line""" 109 | 110 | if event == 'call': 111 | return self._trace_callback 112 | 113 | code_hash = hash(frame.f_code) 114 | if event in ['line', 'return'] and code_hash in self._code_infos: 115 | code_info = self._code_infos[code_hash] 116 | with torch.cuda.device(self.target_gpu): 117 | self._raw_line_records.append({ 118 | 'code_hash': code_hash, 119 | 'line': code_info['prev_line'], 120 | 'prev_record_idx': code_info['prev_record'], 121 | **torch.cuda.memory_stats()}) 122 | self._reset_cuda_stats() 123 | 124 | if event == 'line': 125 | code_info['prev_line'] = frame.f_lineno 126 | code_info['prev_record'] = len(self._raw_line_records)-1 127 | elif event == 'return': 128 | code_info['prev_line'] = code_info['first_line'] 129 | code_info['prev_record'] = -1 130 | 131 | def display(self, func: Optional[Callable] = None, columns: Tuple[str, ...] = DEFAULT_COLUMNS) -> LineRecords: 132 | """Display the profiling results on either IPython or CLI 133 | 134 | The columns are explained in the PyTorch documentation: 135 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 136 | 137 | .. note:: To work, this needs to be the last thing returned in the IPython statement or cell. 138 | 139 | Args: 140 | func (str): the function name of interest, None for all registered function 141 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 142 | 143 | Returns: 144 | RecordsDisplay: Returns an object that'll display the recorded stats in the IPython console 145 | """ 146 | return LineRecords(self._raw_line_records, self._code_infos).display(func, columns) 147 | 148 | def print_stats(self, func: Optional[Callable] = None, columns: Tuple[str, ...] 
= DEFAULT_COLUMNS, stream=sys.stdout): 149 | """Print the text profiling results to stream 150 | 151 | The columns are explained in the PyTorch documentation: 152 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 153 | 154 | Args: 155 | func (str): the function name of interest, None for all registered function 156 | columns (list of str): the column names of interest, See PyTorch's doc for available names 157 | stream (IO-like object): the stream to write to 158 | """ 159 | stream.write(str(self.display(func, columns))) 160 | -------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/line_records.py: -------------------------------------------------------------------------------- 1 | """Class and helper functions for processing and displaying line records""" 2 | import inspect 3 | from typing import Callable, Optional, Tuple, List, Dict, Any 4 | import pandas as pd 5 | 6 | from ..utils import readable_size 7 | 8 | 9 | COLORS = [ 10 | '#4878d0', '#ee854a', '#6acc64', '#d65f5f', '#956cb4', 11 | '#8c613c', '#dc7ec0', '#797979', '#d5bb67', '#82c6e2', 12 | ] 13 | 14 | 15 | 16 | def _accumulate_line_records(raw_line_records: List[Dict[str, Any]]) -> pd.DataFrame: 17 | """The raw records give the memory stats between successive lines executed by the profiler. 18 | But we want the memory stats between successive lines in our functions! The two diverge when 19 | a function we're profiling calls another function we're profiling, since then Torch will have 20 | its peak/allocated/freed memory stats reset on each line of the called function. 21 | 22 | To fix that, here we look at each line record in turn, and for peak stats we take the 23 | maximum since the last record _in the same function_. For allocated/freed stats, we take the 24 | sum since the last record in the same function. 25 | """ 26 | 27 | # We'll do this in numpy because indexing lots of rows and columns in pandas is dog-slow. 28 | raw = pd.DataFrame(raw_line_records) 29 | acc_mask = raw.columns.str.match(r'.*(allocated|freed)$') 30 | peak_mask = raw.columns.str.match(r'.*(peak)$') 31 | acc_raw, peak_raw = raw.loc[:, acc_mask].values, raw.loc[:, peak_mask].values 32 | acc_refined, peak_refined = acc_raw.copy(), peak_raw.copy() 33 | 34 | for row, record in enumerate(raw_line_records): 35 | if record['prev_record_idx'] == -1: 36 | # No previous data to accumulate from 37 | continue 38 | if record['prev_record_idx'] == row-1: 39 | # Previous record was the previous line, so no need to accumulate anything 40 | continue 41 | 42 | # Another profiled function has been called since the last record, so we need to 43 | # accumulate the allocated/freed/peaks of the intervening records into this one. 44 | acc_refined[row] = acc_raw[record['prev_record_idx']+1:row+1].sum(0) 45 | peak_refined[row] = peak_raw[record['prev_record_idx']+1:row+1].max(0) 46 | 47 | refined = raw.copy() 48 | refined.loc[:, acc_mask] = acc_refined 49 | refined.loc[:, peak_mask] = peak_refined 50 | return refined 51 | 52 | 53 | def _line_records(raw_line_records: List[Dict[str, Any]], code_infos: List[Dict[str, Any]]) -> pd.DataFrame: 54 | """Converts the raw line records to a nicely-shaped dataframe whose values reflect 55 | the memory usage of lines of _functions_ rather than lines of _execution_. 
See the 56 | `_accumulate_line_records` docstring for more detail.""" 57 | # Column spec: https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 58 | qual_names = { 59 | code_hash: info['func'].__qualname__ for code_hash, info in code_infos.items()} 60 | # pandas < 2.1.0 support (python3.8) 61 | try: 62 | records = (_accumulate_line_records(raw_line_records) 63 | .assign(qual_name=lambda df: df.code_hash.map(qual_names)) 64 | .set_index(['qual_name', 'line']) 65 | .drop(['code_hash', 'num_alloc_retries', 'num_ooms', 'prev_record_idx'], axis=1)) 66 | except AttributeError: 67 | records = (_accumulate_line_records(raw_line_records) 68 | .assign(qual_name=lambda df: df.code_hash.applymap(qual_names)) 69 | .set_index(['qual_name', 'line']) 70 | .drop(['code_hash', 'num_alloc_retries', 'num_ooms', 'prev_record_idx'], axis=1)) 71 | records.columns = pd.MultiIndex.from_tuples( 72 | [c.split('.') for c in records.columns]) 73 | 74 | return records 75 | 76 | 77 | class LineRecords: 78 | """Class for processing raw line records and display on IPython & CLI 79 | """ 80 | 81 | def __init__(self, raw_line_records: List[Dict[str, Any]], code_infos: List[Dict[str, Any]]): 82 | super().__init__() 83 | self._raw_line_records = raw_line_records 84 | self._code_infos = code_infos 85 | 86 | def display(self, func: Callable[..., Any], columns: Tuple[str, ...]): 87 | """Display the records to either notebook or CLI 88 | 89 | The columns are explained in the PyTorch documentation: 90 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 91 | 92 | .. note:: Make this call the last one in a notebook cell 93 | 94 | Args: 95 | func (str): the function name of interest, None for all registered function 96 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 97 | 98 | Returns: 99 | RecordsDisplay: a IPython friendly object which converts records to HTML or plain text 100 | """ 101 | line_records = self._filter_raw_line_records(func, columns) 102 | return RecordsDisplay(line_records, self._code_infos) 103 | 104 | def _filter_raw_line_records(self, func: Callable[..., Any], columns: Tuple[str, ...]) -> pd.DataFrame: 105 | """Get the line records 106 | 107 | The columns are explained in the PyTorch documentation: 108 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 109 | 110 | Args: 111 | func (str): the function name of interest, None for all registered function 112 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 113 | 114 | Returns: 115 | pd.DataFrame: a (line, statistic)-indexed dataframe of memory stats. 116 | """ 117 | if len(self._raw_line_records) == 0: 118 | return pd.DataFrame(index=pd.MultiIndex.from_product([[], []]), columns=columns) 119 | 120 | line_records = _line_records(self._raw_line_records, self._code_infos) 121 | line_records = _extract_line_records(line_records, func, columns) 122 | 123 | if len(line_records) > 0: 124 | line_records = line_records.groupby(level=[0, 1]).max() 125 | 126 | return line_records 127 | 128 | 129 | def _extract_line_records(line_records: LineRecords, func: Optional[Callable] = None, columns: Tuple[str, ...] 
= None): 130 | """Extracts the subset of a line_records dataframe pertinent to a given set of functions and 131 | columns""" 132 | if func is not None: 133 | # Support both passing the function directly and passing a qual name/list of qual names 134 | line_records = line_records.loc[[func.__qualname__] if callable(func) else func] 135 | 136 | if columns is not None: 137 | columns = [tuple(c.split('.')) for c in columns] 138 | if not all(len(c) == 3 for c in columns): 139 | raise ValueError('Each column name should have three dot-separated parts') 140 | if not all(c in line_records.columns for c in columns): 141 | options = ", ".join(".".join(c) 142 | for c in line_records.columns.tolist()) 143 | raise ValueError( 144 | 'The column names should be fields of torch.cuda.memory_stat(). Options are: ' + options) 145 | line_records = line_records.loc[:, columns] 146 | 147 | return line_records 148 | 149 | 150 | class RecordsDisplay: 151 | """Class for processing raw line records and display on IPython & CLI 152 | 153 | IPython's rich display functionality [requires we return](https://ipython.readthedocs.io/en/stable/config/integrating.html) 154 | an object that has a `_repr_html_` method for when HTML rendering is supported, and 155 | a `__repr__` method for when only text is available 156 | """ 157 | def __init__(self, line_records: LineRecords, code_infos: List[Dict[str, Any]]): 158 | super().__init__() 159 | self._line_records = line_records 160 | self._code_infos = code_infos 161 | self._merged_line_records = self._merge_line_records_with_code() 162 | 163 | def _merge_line_records_with_code(self) -> Dict[str, Any]: 164 | merged_records = {} 165 | for _, info in self._code_infos.items(): 166 | qual_name = info['func'].__qualname__ 167 | if qual_name in self._line_records.index.get_level_values(0): 168 | lines, start_line = inspect.getsourcelines(info['func']) 169 | lines = pd.DataFrame.from_dict({ 170 | 'line': range(start_line, start_line + len(lines)), 171 | 'code': lines}) 172 | lines.columns = pd.MultiIndex.from_product([lines.columns, [''], ['']]) 173 | 174 | merged_records[qual_name] = pd.merge( 175 | self._line_records.loc[qual_name], lines, 176 | right_on='line', left_index=True, how='right') 177 | return merged_records 178 | 179 | def __repr__(self): 180 | """Renders the stats as text""" 181 | if len(self._line_records) == 0: 182 | return 'No data collected\n' 183 | 184 | is_byte_col = self._line_records.columns.get_level_values(0).str.contains('byte') 185 | byte_cols = self._line_records.columns[is_byte_col] 186 | 187 | string = {} 188 | for qual_name, merged in self._merge_line_records_with_code().items(): 189 | maxlen = max(len(c) for c in merged.code) 190 | left_align = '{{:{maxlen}s}}'.format(maxlen=maxlen) 191 | # pandas < 2.1.0 support (python3.8) 192 | try: 193 | merged[byte_cols] = merged[byte_cols].map(readable_size) 194 | except AttributeError: 195 | merged[byte_cols] = merged[byte_cols].applymap(readable_size) 196 | 197 | # This is a mess, but I can't find any other way to left-align text strings. 
198 | code_header = (left_align.format('code'), '', '') 199 | merged[code_header] = merged['code'].apply(lambda l: left_align.format(l.rstrip('\n\r'))) 200 | merged = merged.drop('code', axis=1, level=0) 201 | 202 | string[qual_name] = merged.to_string(index=False) 203 | 204 | return '\n\n'.join(['## {q}\n\n{c}\n'.format(q=q, c=c) for q, c in string.items()]) 205 | 206 | def _repr_html_(self): 207 | """Renders the stats as HTML""" 208 | if len(self._line_records) == 0: 209 | return '
<p>No data collected</p>' 210 | 211 | is_byte_col = self._line_records.columns.get_level_values(0).str.contains('byte') 212 | byte_cols = self._line_records.columns[is_byte_col] 213 | maxes = self._line_records.max() 214 | 215 | html = {} 216 | for qual_name, merged in self._merge_line_records_with_code().items(): 217 | 218 | style = merged.style 219 | 220 | # Style the bar charts 221 | for i, c in enumerate(self._line_records.columns): 222 | style = style.bar([c], color=COLORS[i % len(COLORS)], 223 | width=99, vmin=0, vmax=maxes[c]) 224 | 225 | # Style the text 226 | html[qual_name] = (style 227 | .format({c: readable_size for c in byte_cols}) 228 | .set_properties( 229 | subset=['code'], **{ 230 | 'text-align': 'left', 231 | 'white-space': 'pre', 232 | 'font-family': 'monospace'}) 233 | .set_table_styles([{ 234 | 'selector': 'th', 235 | 'props': [('text-align', 'left')]}]) 236 | .hide(axis=0) 237 | .to_html()) 238 | 239 | template = '<h3>{q}</h3><div>{c}</div>
' 240 | return '\n'.join(template.format(q=q, c=c) for q, c in html.items()) 241 | -------------------------------------------------------------------------------- /pytorch_memlab/line_profiler/profile.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from typing import Callable, Tuple 3 | from .line_profiler import LineProfiler, DEFAULT_COLUMNS 4 | 5 | 6 | global_line_profiler = LineProfiler() 7 | global_line_profiler.enable() 8 | 9 | 10 | def clear_global_line_profiler(): 11 | """Clears the state of the global line profiler""" 12 | global_line_profiler.clear() 13 | 14 | 15 | def set_target_gpu(gpu_id: int): 16 | """Set the target GPU id to profile memory 17 | 18 | Because of the lack of output space, only one GPU's memory usage is shown 19 | in line profiler. However you can use this function to switch target GPU 20 | to profile on. The GPU switch can be performed before profiling and even 21 | in the profiled functions. 22 | 23 | Args: 24 | - gpu_id: cuda index to profile the memory on, 25 | also accepts `torch.device` object. 26 | """ 27 | global_line_profiler.target_gpu = gpu_id 28 | 29 | 30 | def profile(func, columns: Tuple[str, ...] = DEFAULT_COLUMNS): 31 | """Profile the CUDA memory usage of target function line by line 32 | 33 | The profiling results will be printed at exiting, KeyboardInterrupt raised. 34 | The CUDA memory is collected only on the **current** cuda device. 35 | 36 | The columns are explained in the PyTorch documentation: 37 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 38 | 39 | Args: 40 | func: the function or method to profile on 41 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 42 | 43 | Usage: 44 | ```python 45 | @profile 46 | def foo(): 47 | linear = torch.nn.Linear(100, 100).cuda() 48 | 49 | foo() 50 | 51 | class Foo(torch.nn.Module): 52 | def __init__(self): 53 | super().__init__() 54 | self.linear = torch.nn.Linear(100, 100).cuda() 55 | 56 | @profile 57 | def forward(self, inp): 58 | return self.linear(inp) 59 | 60 | inp = torch.Tensor(50, 100).cuda() 61 | foo = Foo() 62 | foo(inp) 63 | ``` 64 | """ 65 | import atexit 66 | global_line_profiler.add_function(func) 67 | 68 | def print_stats_atexit(): 69 | global_line_profiler.print_stats(func, columns) 70 | atexit.register(print_stats_atexit) 71 | 72 | return func 73 | 74 | 75 | def profile_every(output_interval: int = 1, enable: bool = True, columns: Tuple[str, ...] = DEFAULT_COLUMNS): 76 | """Profile the CUDA memory usage of target function line by line 77 | 78 | Prints the profiling output every `output_interval` execution of the target 79 | function 80 | The CUDA memory is collected only on the **current** cuda device. 81 | 82 | The columns are explained in the PyTorch documentation: 83 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats 84 | 85 | Args: 86 | enable (bool): whether to enable the profiling mode, so users don't have to 87 | modify any source code for enabling and disabling profiling. 88 | output_interval (int): frequency of output the profiling results 89 | columns (list of str): the column names of interest, See PyTorch's doc for available names. 
90 | """ 91 | 92 | def inner_decorator(func: Callable): 93 | func.cur_idx = 1 94 | 95 | if enable: 96 | global_line_profiler.add_function(func) 97 | 98 | @wraps(func) 99 | def run_func(*args, **kwargs): 100 | res = func(*args, **kwargs) 101 | if enable: 102 | if func.cur_idx % output_interval == 0: 103 | global_line_profiler.print_stats(func, columns) 104 | 105 | func.cur_idx += 1 106 | return res 107 | 108 | return run_func 109 | return inner_decorator 110 | -------------------------------------------------------------------------------- /pytorch_memlab/mem_reporter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import gc 3 | from collections import defaultdict 4 | from typing import Optional, Tuple, List 5 | 6 | import torch 7 | from .utils import readable_size 8 | 9 | LEN = 79 10 | 11 | # some pytorch low-level memory management constant 12 | # the minimal allocate memory size (Byte) 13 | PYTORCH_MIN_ALLOCATE = 2 ** 9 14 | # the minimal cache memory size (Byte) 15 | PYTORCH_MIN_CACHE = 2 ** 20 16 | 17 | class MemReporter(): 18 | """A memory reporter that collects tensors and memory usages 19 | 20 | Parameters: 21 | - model: an extra nn.Module can be passed to infer the name 22 | of Tensors 23 | - pre_collect: do a garbage collection before getting remaining 24 | Tensors, this gives cleaner outputs. 25 | Caution: This is an intrusive change to your original code. 26 | 27 | """ 28 | def __init__(self, model: Optional[torch.nn.Module] = None, pre_collect: bool = False): 29 | self.tensor_name = {} 30 | self.device_mapping = defaultdict(list) 31 | self.device_tensor_stat = {} 32 | # to numbering the unknown tensors 33 | self.name_idx = 0 34 | self.pre_collect = pre_collect 35 | 36 | tensor_names = defaultdict(list) 37 | if model is not None: 38 | assert isinstance(model, torch.nn.Module) 39 | # for model with tying weight, multiple parameters may share 40 | # the same underlying tensor 41 | for name, param in model.named_parameters(): 42 | tensor_names[param].append(name) 43 | 44 | for param, name in tensor_names.items(): 45 | self.tensor_name[id(param)] = '+'.join(name) 46 | 47 | def _get_tensor_name(self, tensor: torch.Tensor) -> str: 48 | tensor_id = id(tensor) 49 | if tensor_id in self.tensor_name: 50 | name = self.tensor_name[tensor_id] 51 | # use numbering if no name can be inferred 52 | else: 53 | name = type(tensor).__name__ + str(self.name_idx) 54 | self.tensor_name[tensor_id] = name 55 | self.name_idx += 1 56 | return name 57 | 58 | def add_optimizer(self, optimizer: torch.optim.Optimizer): 59 | optimizer_name = optimizer.__class__.__name__ 60 | for param, states in optimizer.state.items(): 61 | param_name = self.tensor_name[id(param)] 62 | for name, tensor in states.items(): 63 | self.tensor_name[id(tensor)] = f'{optimizer_name}.{param_name}.{name}' 64 | # self.tensor_name[id()] 65 | # print(states) 66 | 67 | 68 | def collect_tensor(self): 69 | """Collect all tensor objects tracked by python 70 | 71 | NOTICE: 72 | - the buffers for backward which is implemented in C++ are 73 | not tracked by python's reference counting. 74 | - the gradients(.grad) of Parameters is not collected, and 75 | I don't know why. 
76 | """ 77 | #FIXME: make the grad tensor collected by gc 78 | # Do a pre-garbage collect to eliminate python garbage objects 79 | if self.pre_collect: 80 | gc.collect() 81 | objects = gc.get_objects() 82 | tensors = [obj for obj in objects if isinstance(obj, torch.Tensor)] 83 | for t in tensors: 84 | self.device_mapping[t.device].append(t) 85 | 86 | def get_stats(self): 87 | """Get the memory stat of tensors and then release them 88 | 89 | As a memory profiler, we cannot hold the reference to any tensors, which 90 | causes possibly inaccurate memory usage stats, so we delete the tensors after 91 | getting required stats""" 92 | visited_data = {} 93 | self.device_tensor_stat.clear() 94 | 95 | def get_tensor_stat(tensor: torch.Tensor) -> List[Tuple[str, int, int, int]]: 96 | """Get the stat of a single tensor 97 | 98 | Returns: 99 | - stat: a tuple containing (tensor_name, tensor_size, 100 | tensor_numel, tensor_memory) 101 | """ 102 | assert isinstance(tensor, torch.Tensor) 103 | 104 | name = self._get_tensor_name(tensor) 105 | if tensor.is_sparse: 106 | indices_stat = get_tensor_stat(tensor._indices()) 107 | values_stat = get_tensor_stat(tensor._values()) 108 | return indices_stat + values_stat 109 | 110 | numel = tensor.numel() 111 | element_size = tensor.element_size() 112 | fact_numel = tensor.untyped_storage().size() 113 | fact_memory_size = fact_numel * element_size 114 | # since pytorch allocate at least 512 Bytes for any tensor, round 115 | # up to a multiple of 512 116 | memory_size = math.ceil(fact_memory_size / PYTORCH_MIN_ALLOCATE) \ 117 | * PYTORCH_MIN_ALLOCATE 118 | 119 | # tensor.storage should be the actual object related to memory 120 | # allocation 121 | data_ptr = tensor.untyped_storage().data_ptr() 122 | if data_ptr in visited_data: 123 | name = '{}(->{})'.format( 124 | name, 125 | visited_data[data_ptr], 126 | ) 127 | # don't count the memory for reusing same underlying storage 128 | memory_size = 0 129 | else: 130 | visited_data[data_ptr] = name 131 | 132 | size = tuple(tensor.size()) 133 | # torch scalar has empty size 134 | if not size: 135 | size = (1,) 136 | 137 | return [(name, size, numel, memory_size)] 138 | 139 | for device, tensors in self.device_mapping.items(): 140 | tensor_stats = [] 141 | for tensor in tensors: 142 | 143 | if tensor.numel() == 0: 144 | continue 145 | stat = get_tensor_stat(tensor) # (name, shape, numel, memory_size) 146 | tensor_stats += stat 147 | if isinstance(tensor, torch.nn.Parameter): 148 | if tensor.grad is not None: 149 | # manually specify the name of gradient tensor 150 | self.tensor_name[id(tensor.grad)] = '{}.grad'.format( 151 | self._get_tensor_name(tensor) 152 | ) 153 | stat = get_tensor_stat(tensor.grad) 154 | tensor_stats += stat 155 | 156 | self.device_tensor_stat[device] = tensor_stats 157 | 158 | self.device_mapping.clear() 159 | 160 | def print_stats(self, verbose: bool = False, target_device: Optional[torch.device] = None) -> None: 161 | # header 162 | show_reuse = verbose 163 | template_format = '{:<40s}{:>20s}{:>10s}' 164 | print(template_format.format('Element type', 'Size', 'Used MEM') ) 165 | for device, tensor_stats in self.device_tensor_stat.items(): 166 | # By default, if the target_device is not specified, 167 | # print tensors on all devices 168 | if target_device is not None and device != target_device: 169 | continue 170 | print('-' * LEN) 171 | print('Storage on {}'.format(device)) 172 | total_mem = 0 173 | total_numel = 0 174 | for stat in tensor_stats: 175 | name, size, numel, mem = stat 176 | if not 
show_reuse: 177 | name = name.split('(')[0] 178 | print(template_format.format( 179 | str(name), 180 | str(size), 181 | readable_size(mem), 182 | )) 183 | total_mem += mem 184 | total_numel += numel 185 | 186 | print('-'*LEN) 187 | print('Total Tensors: {} \tUsed Memory: {}'.format( 188 | total_numel, readable_size(total_mem), 189 | )) 190 | 191 | if device != torch.device('cpu'): 192 | with torch.cuda.device(device): 193 | memory_allocated = torch.cuda.memory_allocated() 194 | print('The allocated memory on {}: {}'.format( 195 | device, readable_size(memory_allocated), 196 | )) 197 | if memory_allocated != total_mem: 198 | print('Memory differs due to the matrix alignment or' 199 | ' invisible gradient buffer tensors') 200 | print('-'*LEN) 201 | 202 | def report(self, verbose: bool = False, device: Optional[torch.device] = None) -> None: 203 | """Interface for end-users to directly print the memory usage 204 | 205 | args: 206 | - verbose: flag to show tensor.storage reuse information 207 | - device: `torch.device` object, specify the target device 208 | to report detailed memory usage. It will print memory usage 209 | on all devices if not specified. Usually we only want to 210 | print the memory usage on CUDA devices. 211 | 212 | """ 213 | self.collect_tensor() 214 | self.get_stats() 215 | self.print_stats(verbose, target_device=device) 216 | -------------------------------------------------------------------------------- /pytorch_memlab/utils.py: -------------------------------------------------------------------------------- 1 | from math import isnan 2 | from calmsize import size as calmsize 3 | 4 | def readable_size(num_bytes: int) -> str: 5 | return '' if isnan(num_bytes) else '{:.2f}'.format(calmsize(num_bytes)) 6 | -------------------------------------------------------------------------------- /readme-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stonesjtu/pytorch_memlab/d7ed8e0f75abaaac197c0f9271085a40f2c9083b/readme-output.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | try: 4 | long_description = open('README.md').read() 5 | except FileNotFoundError: 6 | long_description = '' 7 | 8 | setup( 9 | name='pytorch-memlab', 10 | version='0.3.0', 11 | licence='MIT', 12 | description='A lab to do simple and accurate memory experiments on pytorch', 13 | long_description=long_description, 14 | long_description_content_type='text/markdown', 15 | classifiers=[ 16 | "Programming Language :: Python", 17 | "Topic :: Software Development :: Libraries :: Python Modules", 18 | ], 19 | keywords='pytorch memory profile', 20 | author='Kaiyu Shi', 21 | author_email='skyisno.1@gmail.com', 22 | url='https://github.com/Stonesjtu/pytorch_memlab', 23 | license='MIT', 24 | include_package_data=True, 25 | zip_safe=True, 26 | python_requires='>=3.8', 27 | install_requires=[ 28 | 'setuptools', 29 | 'calmsize', 30 | 'pandas', 31 | 'torch>=2.0', 32 | ], 33 | extras_require={ 34 | 'ipython': ['IPython>=0.13'], 35 | 'test': ['pytest'], 36 | }, 37 | packages=find_packages(), 38 | ) 39 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Stonesjtu/pytorch_memlab/d7ed8e0f75abaaac197c0f9271085a40f2c9083b/test/__init__.py -------------------------------------------------------------------------------- /test/test_courtesy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from pytorch_memlab import Courtesy, MemReporter 4 | 5 | def test_reporter(): 6 | linear = torch.nn.Linear(1024, 1024).cuda() 7 | inp = torch.Tensor(512, 1024).cuda() 8 | 9 | out = linear(inp).mean() 10 | out.backward() 11 | 12 | reporter = MemReporter(linear) 13 | reporter.report() 14 | ct = Courtesy() 15 | ct.yield_memory() 16 | print('gpu>>>>>>>>>>>>>>>>>>cpu') 17 | reporter.report() 18 | ct.restore() 19 | print('cpu>>>>>>>>>>>>>>>>>>gpu') 20 | reporter.report() 21 | 22 | def test_courtesy_context(): 23 | linear = torch.nn.Linear(1024, 1024).cuda() 24 | inp = torch.Tensor(512, 1024).cuda() 25 | 26 | out = linear(inp).mean() 27 | out.backward() 28 | 29 | reporter = MemReporter(linear) 30 | with Courtesy() as ct: 31 | print('gpu>>>>>>>>>>>>>>>>>>cpu') 32 | reporter.report() 33 | -------------------------------------------------------------------------------- /test/test_line_profiler.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import pytest 5 | import torch 6 | from pytorch_memlab import (LineProfiler, clear_global_line_profiler, profile, 7 | profile_every, set_target_gpu) 8 | 9 | 10 | def test_display(): 11 | 12 | def main(): 13 | linear = torch.nn.Linear(100, 100).cuda() 14 | part1() 15 | part2() 16 | 17 | def part1(): 18 | lstm = torch.nn.LSTM(1000, 1000).cuda() 19 | subpart11() 20 | 21 | def part2(): 22 | linear_2 = torch.nn.Linear(100, 100).cuda() 23 | linear_3 = torch.nn.Linear(100, 100).cuda() 24 | 25 | def subpart11(): 26 | linear = torch.nn.Linear(100, 100).cuda() 27 | linear_2 = torch.nn.Linear(100, 100).cuda() 28 | linear_3 = torch.nn.Linear(100, 100).cuda() 29 | 30 | with LineProfiler(subpart11, part2) as prof: 31 | main() 32 | 33 | s = str(prof.display()) # cast from line_records.RecordsDisplay 34 | assert re.search("## .*subpart11", s) 35 | assert "def subpart11():" in s 36 | assert re.search("## .*part2", s) 37 | assert "def part2():" in s 38 | 39 | 40 | def test_line_report(): 41 | 42 | def work(): 43 | # comment 44 | linear = torch.nn.Linear(100, 100).cuda() 45 | linear_2 = torch.nn.Linear(100, 100).cuda() 46 | linear_3 = torch.nn.Linear(100, 100).cuda() 47 | 48 | def work_3(): 49 | lstm = torch.nn.LSTM(1000, 1000).cuda() 50 | 51 | def work_2(): 52 | # comment 53 | linear = torch.nn.Linear(100, 100).cuda() 54 | linear_2 = torch.nn.Linear(100, 100).cuda() 55 | linear_3 = torch.nn.Linear(100, 100).cuda() 56 | work_3() 57 | 58 | line_profiler = LineProfiler(work, work_2) 59 | line_profiler.enable() 60 | 61 | work() 62 | work_2() 63 | 64 | line_profiler.disable() 65 | line_profiler.print_stats() 66 | 67 | 68 | def test_line_report_decorator(): 69 | clear_global_line_profiler() 70 | 71 | @profile_every(output_interval=3) 72 | def work(): 73 | # comment 74 | linear = torch.nn.Linear(100, 100).cuda() 75 | linear_2 = torch.nn.Linear(100, 100).cuda() 76 | linear_3 = torch.nn.Linear(100, 100).cuda() 77 | 78 | @profile_every(output_interval=1) 79 | def work2(): 80 | # comment 81 | linear = torch.nn.Linear(100, 100).cuda() 82 | linear_2 = torch.nn.Linear(100, 100).cuda() 83 | linear_3 = torch.nn.Linear(100, 100).cuda() 84 | 85 | work() 86 | work2() 87 | work() 88 | work() 89 | 
90 | 91 | def test_line_report_method(): 92 | clear_global_line_profiler() 93 | 94 | class Net(torch.nn.Module): 95 | def __init__(self): 96 | super().__init__() 97 | self.linear = torch.nn.Linear(100, 100).cuda() 98 | self.drop = torch.nn.Dropout(0.1) 99 | 100 | @profile_every(1) 101 | def forward(self, inp): 102 | return self.drop(self.linear(inp)) 103 | 104 | net = Net() 105 | inp = torch.Tensor(50, 100).cuda() 106 | net(inp) 107 | 108 | 109 | def test_line_report_profile(): 110 | clear_global_line_profiler() 111 | 112 | @profile 113 | def work(): 114 | # comment 115 | linear = torch.nn.Linear(100, 100).cuda() 116 | linear_2 = torch.nn.Linear(100, 100).cuda() 117 | linear_3 = torch.nn.Linear(100, 100).cuda() 118 | 119 | work() 120 | work() 121 | 122 | 123 | def test_line_report_profile_set_gpu(): 124 | clear_global_line_profiler() 125 | 126 | @profile 127 | def work(): 128 | # comment 129 | set_target_gpu(1) 130 | linear = torch.nn.Linear(100, 100).cuda(1) 131 | set_target_gpu(0) 132 | linear_2 = torch.nn.Linear(100, 100).cuda(0) 133 | linear_3 = torch.nn.Linear(100, 100).cuda(1) 134 | 135 | work() 136 | work() 137 | 138 | 139 | def test_line_report_profile_interrupt(): 140 | clear_global_line_profiler() 141 | 142 | @profile 143 | def work(): 144 | # comment 145 | linear = torch.nn.Linear(100, 100).cuda() 146 | linear_2 = torch.nn.Linear(100, 100).cuda() 147 | linear_3 = torch.nn.Linear(100, 100).cuda() 148 | 149 | @profile_every(1) 150 | def work2(): 151 | linear_2 = torch.nn.Linear(100, 100).cuda() 152 | linear_3 = torch.nn.Linear(100, 100).cuda() 153 | 154 | work() 155 | work2() 156 | raise KeyboardInterrupt 157 | -------------------------------------------------------------------------------- /test/test_mem_reporter.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim 3 | from pytorch_memlab import MemReporter 4 | 5 | import pytest 6 | 7 | 8 | concentrate_mode = False 9 | 10 | def test_reporter(): 11 | linear = torch.nn.Linear(1024, 1024) 12 | inp = torch.Tensor(512, 1024) 13 | reporter = MemReporter(linear) 14 | 15 | out = linear(inp*(inp+3)).mean() 16 | reporter.report() 17 | out.backward() 18 | 19 | reporter.report() 20 | 21 | def test_reporter_without_model(): 22 | linear = torch.nn.Linear(1024, 1024) 23 | inp = torch.Tensor(512, 1024) 24 | reporter = MemReporter() 25 | 26 | out = linear(inp*(inp+3)).mean() 27 | reporter.report() 28 | out.backward() 29 | 30 | reporter.report() 31 | 32 | def test_reporter_sparse_tensor(): 33 | emb = torch.nn.Embedding(1024, 1024, sparse=True) 34 | inp = torch.arange(0, 128) 35 | reporter = MemReporter() 36 | 37 | out = emb(inp).mean() 38 | reporter.report() 39 | out.backward() 40 | b = emb.weight.grad * 2 41 | 42 | reporter.report() 43 | 44 | @pytest.mark.skipif(concentrate_mode, reason='concentrate') 45 | def test_reporter_tie_weight(): 46 | linear = torch.nn.Linear(1024, 1024) 47 | linear_2 = torch.nn.Linear(1024, 1024) 48 | linear_2.weight = linear.weight 49 | container = torch.nn.Sequential( 50 | linear, linear_2 51 | ) 52 | reporter = MemReporter(container) 53 | inp = torch.Tensor(512, 1024) 54 | 55 | out = container(inp).mean() 56 | out.backward() 57 | 58 | reporter = MemReporter(container) 59 | reporter.report() 60 | 61 | def test_reporter_with_optimizer(): 62 | linear = torch.nn.Linear(1024, 1024) 63 | inp = torch.Tensor(512, 1024) 64 | optimizer = torch.optim.Adam(linear.parameters()) 65 | # reporter = MemReporter(linear) 66 | 67 | out = 
linear(inp*(inp+3)*(inp+2)).mean() 68 | reporter = MemReporter(linear) 69 | reporter.report() 70 | out.backward() 71 | # reporter.report() 72 | optimizer.step() 73 | 74 | reporter.add_optimizer(optimizer) 75 | reporter.report() 76 | 77 | 78 | @pytest.mark.skipif(not torch.cuda.is_available(), reason='no CUDA') 79 | @pytest.mark.skipif(concentrate_mode, reason='concentrate') 80 | def test_reporter_LSTM(): 81 | lstm = torch.nn.LSTM(256, 256, num_layers=1).cuda() 82 | # lstm.flatten_parameters() 83 | inp = torch.Tensor(256, 256, 256).cuda() 84 | out, _ = lstm(inp) 85 | out.mean().backward() 86 | 87 | reporter = MemReporter(lstm) 88 | reporter.report() 89 | 90 | @pytest.mark.skipif(not torch.cuda.is_available(), reason='no CUDA') 91 | @pytest.mark.skipif(concentrate_mode, reason='concentrate') 92 | def test_reporter_device(): 93 | lstm_cpu = torch.nn.LSTM(256, 256) 94 | lstm = torch.nn.LSTM(256, 256, num_layers=1).cuda() 95 | # lstm.flatten_parameters() 96 | inp = torch.Tensor(256, 256, 256).cuda() 97 | out, _ = lstm(inp) 98 | out.mean().backward() 99 | 100 | reporter = MemReporter(lstm) 101 | reporter.report() 102 | reporter.report(device=torch.device('cuda:0')) 103 | --------------------------------------------------------------------------------
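
The modules listed above combine naturally: `profile_every` (from `pytorch_memlab/line_profiler/profile.py`) gives line-by-line CUDA memory statistics, while `MemReporter` (from `pytorch_memlab/mem_reporter.py`) gives a per-tensor snapshot. The sketch below is not part of the repository; it is a minimal usage example assembled from the docstrings and tests shown above, and it assumes the package is installed and a CUDA device is available (the line profiler reads `torch.cuda.memory_stats`).

```python
import torch
from pytorch_memlab import MemReporter, profile_every


@profile_every(output_interval=1)
def forward_backward(model, inp):
    # Every executed line of this function is profiled; stats are
    # printed after each call because output_interval=1.
    loss = model(inp).mean()
    loss.backward()
    return loss


# Assumed setup for illustration: a single linear layer on the current CUDA device.
model = torch.nn.Linear(1024, 1024).cuda()
inp = torch.randn(512, 1024).cuda()
forward_backward(model, inp)

# Per-tensor snapshot; parameter names are inferred from `model`.
reporter = MemReporter(model)
reporter.report()
```

Passing the module to `MemReporter` is optional, but it lets the report label parameter tensors by name instead of falling back to numbered placeholders.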