├── .github
│   └── workflows
│       ├── pypi-publish.yml
│       └── test.yml
├── .gitignore
├── LICENSE
├── LICENSE_kernprof.txt
├── README.md
├── demo.ipynb
├── pytorch_memlab
│   ├── __init__.py
│   ├── courtesy.py
│   ├── line_profiler
│   │   ├── __init__.py
│   │   ├── extension.py
│   │   ├── line_profiler.py
│   │   ├── line_records.py
│   │   └── profile.py
│   ├── mem_reporter.py
│   └── utils.py
├── readme-output.png
├── setup.py
└── test
    ├── __init__.py
    ├── test_courtesy.py
    ├── test_line_profiler.py
    └── test_mem_reporter.py
/.github/workflows/pypi-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 |
16 | jobs:
17 | deploy:
18 |
19 | runs-on: ubuntu-latest
20 |
21 | steps:
22 | - uses: actions/checkout@v4.1.7
23 | - name: Set up Python
24 | uses: actions/setup-python@v5.1.1
25 | with:
26 |           python-version: '3.11'
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | pip install ipython pandas
31 | pip install .[test]
32 | - name: Build package
33 | run: python setup.py bdist
34 | - name: Publish package
35 | uses: pypa/gh-action-pypi-publish@v1.9.0
36 | with:
37 | user: __token__
38 | password: ${{ secrets.PYPI_API_TOKEN }}
39 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install dependencies and run the test suite on several Python versions
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Test
10 |
11 | on: push
12 |
13 | jobs:
14 | test:
15 | runs-on: ubuntu-latest
16 | strategy:
17 | matrix:
18 | python-version: ['3.8', '3.9', '3.10', '3.11']
19 |
20 | steps:
21 | - uses: actions/checkout@v4.1.7
22 | - name: Set up Python
23 | uses: actions/setup-python@v5.1.1
24 | with:
25 | python-version: ${{ matrix.python-version }}
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install ipython pandas
30 | pip install .[test]
31 | - name: Build package
32 | run: python setup.py bdist
33 | - name: Test
34 | run: |
35 | python -c 'import pytorch_memlab'
36 | pytest test/test_mem_reporter.py
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #### joe made this: http://goel.io/joe
2 |
3 | #####=== IPythonNotebook ===#####
4 | # Temporary data
5 | .ipynb_checkpoints/
6 |
7 | #####=== Python ===#####
8 |
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 | *$py.class
13 |
14 | # C extensions
15 | *.so
16 |
17 | # Distribution / packaging
18 | .Python
19 | env/
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *,cover
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 |
62 | # Sphinx documentation
63 | docs/_build/
64 |
65 | # PyBuilder
66 | target/
67 |
68 | #####=== JetBrains ===#####
69 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio
70 |
71 | *.iml
72 |
73 | ## Directory-based project format:
74 | .idea/
75 | # if you remove the above rule, at least ignore the following:
76 |
77 | # User-specific stuff:
78 | # .idea/workspace.xml
79 | # .idea/tasks.xml
80 | # .idea/dictionaries
81 |
82 | # Sensitive or high-churn files:
83 | # .idea/dataSources.ids
84 | # .idea/dataSources.xml
85 | # .idea/sqlDataSources.xml
86 | # .idea/dynamic.xml
87 | # .idea/uiDesigner.xml
88 |
89 | # Gradle:
90 | # .idea/gradle.xml
91 | # .idea/libraries
92 |
93 | # Mongo Explorer plugin:
94 | # .idea/mongoSettings.xml
95 |
96 | ## File-based project format:
97 | *.ipr
98 | *.iws
99 |
100 | ## Plugin-specific files:
101 |
102 | # IntelliJ
103 | /out/
104 |
105 | # mpeltonen/sbt-idea plugin
106 | .idea_modules/
107 |
108 | # JIRA plugin
109 | atlassian-ide-plugin.xml
110 |
111 | # Crashlytics plugin (for Android Studio and IntelliJ)
112 | com_crashlytics_export_strings.xml
113 | crashlytics.properties
114 | crashlytics-build.properties
115 |
116 | .ropeproject
117 |
118 | #####=== VSCode ===#####
119 |
120 | .vscode
121 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Kaiyu Shi
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/LICENSE_kernprof.txt:
--------------------------------------------------------------------------------
1 | This software is OSI Certified Open Source Software.
2 | OSI Certified is a certification mark of the Open Source Initiative.
3 |
4 | Copyright (c) 2008, Enthought, Inc.
5 | All rights reserved.
6 |
7 | Redistribution and use in source and binary forms, with or without
8 | modification, are permitted provided that the following conditions are met:
9 |
10 | * Redistributions of source code must retain the above copyright notice, this
11 | list of conditions and the following disclaimer.
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 | * Neither the name of Enthought, Inc. nor the names of its contributors may
16 | be used to endorse or promote products derived from this software without
17 | specific prior written permission.
18 |
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | pytorch_memlab
2 | ======
3 | [![Test](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/test.yml/badge.svg)](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/test.yml)
4 | [![Upload Python Package](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/pypi-publish.yml/badge.svg)](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/pypi-publish.yml)
5 | 
6 | [![CodeQL](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/github-code-scanning/codeql/badge.svg)](https://github.com/Stonesjtu/pytorch_memlab/actions/workflows/github-code-scanning/codeql)
7 | 
8 |
9 | A simple and accurate **CUDA** memory management laboratory for pytorch.
10 | It consists of several tools covering different aspects of GPU memory:
11 |
12 | - Features:
13 |
14 | - Memory Profiler: A `line_profiler` style CUDA memory profiler with simple API.
15 | - Memory Reporter: A reporter to inspect tensors occupying the CUDA memory.
16 | - Courtesy: An interesting feature to temporarily move all the CUDA tensors into
17 |     CPU memory as a courtesy to other users, and of course to transfer them back later.
18 | - IPython support through `%mlrun`/`%%mlrun` line/cell magic
19 | commands.
20 |
21 |
22 | - Table of Contents
23 | * [Installation](#installation)
24 | * [User-Doc](#user-doc)
25 | + [Memory Profiler](#memory-profiler)
26 | + [IPython support](#ipython-support)
27 | + [Memory Reporter](#memory-reporter)
28 | + [Courtesy](#courtesy)
29 | + [ACK](#ack)
30 | * [CHANGES](#changes)
31 |
32 | Installation
33 | -----
34 |
35 | - Released version:
36 | ```bash
37 | pip install pytorch_memlab
38 | ```
39 |
40 | - Newest version:
41 | ```bash
42 | pip install git+https://github.com/stonesjtu/pytorch_memlab
43 | ```
44 |
45 | What's for
46 | -----
47 |
48 | Out-Of-Memory errors in pytorch happen frequently, for newbies and
49 | experienced programmers alike. A common reason is that most people don't really
50 | learn the underlying memory management philosophy of pytorch and GPUs.
51 | They write memory-inefficient code and then complain about pytorch eating too
52 | much CUDA memory.
53 | 
54 | In this repo, I'm going to share some useful tools to help debug OOM errors, or
55 | to inspect the underlying mechanism for anyone who is interested.
56 |
57 |
58 | User-Doc
59 | -----
60 |
61 | ### Memory Profiler
62 |
63 | The memory profiler is a modification of python's `line_profiler`: it gives
64 | the memory usage info for each line of code in the specified function/method.
65 |
66 | #### Sample:
67 |
68 | ```python
69 | import torch
70 | from pytorch_memlab import LineProfiler
71 |
72 | def inner():
73 | torch.nn.Linear(100, 100).cuda()
74 |
75 | def outer():
76 | linear = torch.nn.Linear(100, 100).cuda()
77 | linear2 = torch.nn.Linear(100, 100).cuda()
78 |     inner()
79 | 
80 | with LineProfiler(outer, inner) as prof:
81 |     outer()
82 | prof.display()
83 | ```
82 |
83 | After the script finishes or is interrupted by keyboard, it gives the following
84 | profiling info if you're in a Jupyter notebook:
85 |
86 | ![line profiler output in notebook](readme-output.png)
87 |
88 | or the following info if you're in a text-only terminal:
89 |
90 | ```
91 | ## outer
92 |
93 | active_bytes reserved_bytes line code
94 | all all
95 | peak peak
96 | 0.00B 0.00B 7 def outer():
97 | 40.00K 2.00M 8 linear = torch.nn.Linear(100, 100).cuda()
98 | 80.00K 2.00M 9 linear2 = torch.nn.Linear(100, 100).cuda()
99 | 120.00K 2.00M 10 inner()
100 |
101 |
102 | ## inner
103 |
104 | active_bytes reserved_bytes line code
105 | all all
106 | peak peak
107 | 80.00K 2.00M 4 def inner():
108 | 120.00K 2.00M 5 torch.nn.Linear(100, 100).cuda()
109 | ```
110 |
111 | An explanation of what each column means can be found in the [Torch documentation](https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats). The name of any field from `memory_stats()`
112 | can be passed to `display()` to view the corresponding statistic.
113 |
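For example, assuming the `prof` object from the `LineProfiler` sample above, a minimal
sketch of selecting other columns could look like this:

```python
# Field names come from torch.cuda.memory_stats(); each column name has
# three dot-separated parts, e.g. metric.pool.statistic
prof.display(columns=(
    'allocated_bytes.all.current',
    'allocated_bytes.all.peak',
    'reserved_bytes.all.peak',
))
```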
114 | If you use the `profile` decorator, the memory statistics are collected during
115 | multiple runs and only the maximum one is displayed at the end.
116 | We also provide a more flexible API called `profile_every` which prints the
117 | memory info every *N* times of function execution. You can simply replace
118 | `@profile` with `@profile_every(1)` to print the memory usage for each
119 | execution.
120 |
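For example, a minimal sketch of `@profile_every` (the `train_step` function here is
purely illustrative):

```python
import torch
from pytorch_memlab import profile_every

@profile_every(1)   # print the collected memory stats after every call
def train_step():
    torch.nn.Linear(100, 100).cuda()

for _ in range(3):
    train_step()
```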
121 | The `@profile` and `@profile_every` can also be mixed to gain more control
122 | of the debugging granularity.
123 |
124 | - You can also add the decorator in the module class:
125 |
126 | ```python
127 | class Net(torch.nn.Module):
128 | def __init__(self):
129 | super().__init__()
130 | @profile
131 | def forward(self, inp):
132 |         ...  # do_something
133 | ```
134 |
135 | - The *Line Profiler* profiles the memory usage of CUDA device 0 by default.
136 |   You can switch the device to profile with `set_target_gpu`. The GPU
137 |   selection is global, which means you have to remember which GPU you are
138 |   profiling on during the whole process:
139 |
140 | ```python
141 | import torch
142 | from pytorch_memlab import profile, set_target_gpu
143 | @profile
144 | def func():
145 | net1 = torch.nn.Linear(1024, 1024).cuda(0)
146 | set_target_gpu(1)
147 | net2 = torch.nn.Linear(1024, 1024).cuda(1)
148 | set_target_gpu(0)
149 | net3 = torch.nn.Linear(1024, 1024).cuda(0)
150 |
151 | func()
152 | ```
153 |
154 |
155 | More samples can be found in `test/test_line_profiler.py`
156 |
157 | ### IPython support
158 |
159 | Make sure you have `IPython` installed, or have installed `pytorch-memlab` with
160 | `pip install pytorch-memlab[ipython]`.
161 |
162 | First, load the extension:
163 |
164 | ```python
165 | %load_ext pytorch_memlab
166 | ```
167 |
168 | This makes the `%mlrun` and `%%mlrun` line/cell magics available for use. For
169 | example, in a new cell run the following to profile an entire cell:
170 |
171 | ```python
172 | %%mlrun -f func
173 | import torch
174 | from pytorch_memlab import profile, set_target_gpu
175 | def func():
176 | net1 = torch.nn.Linear(1024, 1024).cuda(0)
177 | set_target_gpu(1)
178 | net2 = torch.nn.Linear(1024, 1024).cuda(1)
179 | set_target_gpu(0)
180 | net3 = torch.nn.Linear(1024, 1024).cuda(0)
181 | ```
182 |
183 | Or you can invoke the profiler for a single statement via the `%mlrun` line
184 | magic.
185 |
186 | ```python
187 | import torch
188 | from pytorch_memlab import profile, set_target_gpu
189 | def func(input_size):
190 | net1 = torch.nn.Linear(input_size, 1024).cuda(0)
191 | %mlrun -f func func(2048)
192 | ```
193 |
194 | See `%mlrun?` for help on what arguments are supported. You can set the GPU
195 | device to profile, dump profiling results to a file, and return the
196 | `LineProfiler` object for post-profile inspection.
197 |
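For example, a sketch along the lines of the demo notebook: run quietly, dump the text
report to a file, and keep the profiler around for later inspection (`func` is the
function defined above):

```python
# -q: don't print, -T: dump the text report to a file,
# -r: return the LineProfiler object for post-profile inspection
profiler = %mlrun -q -r -T profile.log -f func func(2048)
profiler.display()
```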
198 | Find out more by checking out the [demo Jupyter notebook](./demo.ipynb)
199 |
200 |
201 | ### Memory Reporter
202 |
203 | As the *Memory Profiler* only gives the overall memory usage information by line,
204 | more low-level memory usage information can be obtained with the *Memory Reporter*.
205 | 
206 | The *Memory Reporter* iterates over all the `Tensor` objects and gets the underlying
207 | `UntypedStorage` (previously `Storage`) object to report the actual memory usage instead
208 | of the surface `Tensor.size`.
209 |
210 | > see [UntypedStorage](https://pytorch.org/docs/stable/storage.html#torch.UntypedStorage) for detailed
211 | > information
212 |
213 | #### Sample
214 |
215 | - A minimal one:
216 |
217 | ```python
218 | import torch
219 | from pytorch_memlab import MemReporter
220 | linear = torch.nn.Linear(1024, 1024).cuda()
221 | reporter = MemReporter()
222 | reporter.report()
223 | ```
224 | outputs:
225 | ```
226 | Element type Size Used MEM
227 | -------------------------------------------------------------------------------
228 | Storage on cuda:0
229 | Parameter0 (1024, 1024) 4.00M
230 | Parameter1 (1024,) 4.00K
231 | -------------------------------------------------------------------------------
232 | Total Tensors: 1049600 Used Memory: 4.00M
233 | The allocated memory on cuda:0: 4.00M
234 | -------------------------------------------------------------------------------
235 | ```
236 |
237 | - You can also pass in a model object for automatic name inference.
238 |
239 | ```python
240 | import torch
241 | from pytorch_memlab import MemReporter
242 |
243 | linear = torch.nn.Linear(1024, 1024).cuda()
244 | inp = torch.Tensor(512, 1024).cuda()
245 | # pass in a model to automatically infer the tensor names
246 | reporter = MemReporter(linear)
247 | out = linear(inp).mean()
248 | print('========= before backward =========')
249 | reporter.report()
250 | out.backward()
251 | print('========= after backward =========')
252 | reporter.report()
253 | ```
254 |
255 | outputs:
256 | ```
257 | ========= before backward =========
258 | Element type Size Used MEM
259 | -------------------------------------------------------------------------------
260 | Storage on cuda:0
261 | weight (1024, 1024) 4.00M
262 | bias (1024,) 4.00K
263 | Tensor0 (512, 1024) 2.00M
264 | Tensor1 (1,) 512.00B
265 | -------------------------------------------------------------------------------
266 | Total Tensors: 1573889 Used Memory: 6.00M
267 | The allocated memory on cuda:0: 6.00M
268 | -------------------------------------------------------------------------------
269 | ========= after backward =========
270 | Element type Size Used MEM
271 | -------------------------------------------------------------------------------
272 | Storage on cuda:0
273 | weight (1024, 1024) 4.00M
274 | weight.grad (1024, 1024) 4.00M
275 | bias (1024,) 4.00K
276 | bias.grad (1024,) 4.00K
277 | Tensor0 (512, 1024) 2.00M
278 | Tensor1 (1,) 512.00B
279 | -------------------------------------------------------------------------------
280 | Total Tensors: 2623489 Used Memory: 10.01M
281 | The allocated memory on cuda:0: 10.01M
282 | -------------------------------------------------------------------------------
283 | ```
284 |
285 |
286 | - The reporter automatically deals with parameters that share weights:
287 |
288 | ```python
289 | import torch
290 | from pytorch_memlab import MemReporter
291 |
292 | linear = torch.nn.Linear(1024, 1024).cuda()
293 | linear2 = torch.nn.Linear(1024, 1024).cuda()
294 | linear2.weight = linear.weight
295 | container = torch.nn.Sequential(
296 | linear, linear2
297 | )
298 | inp = torch.Tensor(512, 1024).cuda()
299 | # pass in a model to automatically infer the tensor names
300 |
301 | out = container(inp).mean()
302 | out.backward()
303 |
304 | # verbose shows how storage is shared across multiple Tensors
305 | reporter = MemReporter(container)
306 | reporter.report(verbose=True)
307 | ```
308 |
309 | outputs:
310 | ```
311 | Element type Size Used MEM
312 | -------------------------------------------------------------------------------
313 | Storage on cuda:0
314 | 0.weight (1024, 1024) 4.00M
315 | 0.weight.grad (1024, 1024) 4.00M
316 | 0.bias (1024,) 4.00K
317 | 0.bias.grad (1024,) 4.00K
318 | 1.bias (1024,) 4.00K
319 | 1.bias.grad (1024,) 4.00K
320 | Tensor0 (512, 1024) 2.00M
321 | Tensor1 (1,) 512.00B
322 | -------------------------------------------------------------------------------
323 | Total Tensors: 2625537 Used Memory: 10.02M
324 | The allocated memory on cuda:0: 10.02M
325 | -------------------------------------------------------------------------------
326 | ```
327 |
328 | - You can better understand the memory layout of a more complicated module:
329 |
330 | ```python
331 | import torch
332 | from pytorch_memlab import MemReporter
333 |
334 | lstm = torch.nn.LSTM(1024, 1024).cuda()
335 | reporter = MemReporter(lstm)
336 | reporter.report(verbose=True)
337 | inp = torch.Tensor(10, 10, 1024).cuda()
338 | out, _ = lstm(inp)
339 | out.mean().backward()
340 | reporter.report(verbose=True)
341 | ```
342 |
343 | As shown below, the `(->)` indicates the re-use of the same storage back-end.
344 | outputs:
345 | ```
346 | Element type Size Used MEM
347 | -------------------------------------------------------------------------------
348 | Storage on cuda:0
349 | weight_ih_l0 (4096, 1024) 32.03M
350 | weight_hh_l0(->weight_ih_l0) (4096, 1024) 0.00B
351 | bias_ih_l0(->weight_ih_l0) (4096,) 0.00B
352 | bias_hh_l0(->weight_ih_l0) (4096,) 0.00B
353 | Tensor0 (10, 10, 1024) 400.00K
354 | -------------------------------------------------------------------------------
355 | Total Tensors: 8499200 Used Memory: 32.42M
356 | The allocated memory on cuda:0: 32.52M
357 | Memory differs due to the matrix alignment
358 | -------------------------------------------------------------------------------
359 | Element type Size Used MEM
360 | -------------------------------------------------------------------------------
361 | Storage on cuda:0
362 | weight_ih_l0 (4096, 1024) 32.03M
363 | weight_ih_l0.grad (4096, 1024) 32.03M
364 | weight_hh_l0(->weight_ih_l0) (4096, 1024) 0.00B
365 | weight_hh_l0.grad(->weight_ih_l0.grad) (4096, 1024) 0.00B
366 | bias_ih_l0(->weight_ih_l0) (4096,) 0.00B
367 | bias_ih_l0.grad(->weight_ih_l0.grad) (4096,) 0.00B
368 | bias_hh_l0(->weight_ih_l0) (4096,) 0.00B
369 | bias_hh_l0.grad(->weight_ih_l0.grad) (4096,) 0.00B
370 | Tensor0 (10, 10, 1024) 400.00K
371 | Tensor1 (10, 10, 1024) 400.00K
372 | Tensor2 (1, 10, 1024) 40.00K
373 | Tensor3 (1, 10, 1024) 40.00K
374 | -------------------------------------------------------------------------------
375 | Total Tensors: 17018880 Used Memory: 64.92M
376 | The allocated memory on cuda:0: 65.11M
377 | Memory differs due to the matrix alignment
378 | -------------------------------------------------------------------------------
379 | ```
380 |
381 | NOTICE:
382 | > When forwarding with `grad_mode=True`, pytorch maintains tensor buffers for
383 | > future back-propagation at the C++ level, so these buffers are not visible
384 | > to the python-side reporter. But if you store these intermediate results
385 | > as python variables, then they will be reported.
386 |
387 | - You can also filter the device to report on by passing extra arguments:
388 | `report(device=torch.device(0))`
389 |
390 | - A failed example due to pytorch's C side tensor buffers
391 |
392 | In the following example, a temp buffer is created at `inp * (inp + 2)` to
393 | store both `inp` and `inp + 2`. Unfortunately python only knows about the existence
394 | of `inp`, so we have *2M* of memory lost, which is the same size as Tensor `inp`.
395 |
396 | ```python
397 | import torch
398 | from pytorch_memlab import MemReporter
399 |
400 | linear = torch.nn.Linear(1024, 1024).cuda()
401 | inp = torch.Tensor(512, 1024).cuda()
402 | # pass in a model to automatically infer the tensor names
403 | reporter = MemReporter(linear)
404 | out = linear(inp * (inp + 2)).mean()
405 | reporter.report()
406 | ```
407 |
408 | outputs:
409 | ```
410 | Element type Size Used MEM
411 | -------------------------------------------------------------------------------
412 | Storage on cuda:0
413 | weight (1024, 1024) 4.00M
414 | bias (1024,) 4.00K
415 | Tensor0 (512, 1024) 2.00M
416 | Tensor1 (1,) 512.00B
417 | -------------------------------------------------------------------------------
418 | Total Tensors: 1573889 Used Memory: 6.00M
419 | The allocated memory on cuda:0: 8.00M
420 | Memory differs due to the matrix alignment or invisible gradient buffer tensors
421 | -------------------------------------------------------------------------------
422 | ```
423 |
424 |
425 | ### Courtesy
426 |
427 | Sometimes people would like to preempt your running task, but you don't want
428 | to save a checkpoint and then load it again. All they really need is the GPU
429 | resources (CPU resources and CPU memory are usually spare in GPU clusters), so
430 | you can move all your workspaces from GPU to CPU and then halt your task until
431 | a restart signal is triggered, instead of saving and loading checkpoints and
432 | bootstrapping from scratch.
433 |
434 | Still under development, but you can already have fun with:
435 | ```python
436 | from pytorch_memlab import Courtesy
437 |
438 | iamcourtesy = Courtesy()
439 | for i in range(num_iteration):
440 | if something_happens:
441 | iamcourtesy.yield_memory()
442 | wait_for_restart_signal()
443 | iamcourtesy.restore()
444 | ```
445 |
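`Courtesy` can also be used as a context manager (it implements `__enter__`/`__exit__`),
yielding the memory on entry and restoring it on exit. A minimal sketch, re-using the
placeholder `wait_for_restart_signal` from the example above:

```python
from pytorch_memlab import Courtesy

# all CUDA tensors are moved to CPU while inside the block and restored on exit
with Courtesy():
    wait_for_restart_signal()
```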
446 | #### Known Issues
447 |
448 | - As stated above in the *Memory Reporter* section, intermediate tensors are not covered
449 |   properly, so you may want to insert the courtesy logic after `backward` or
450 |   before `forward`.
451 | - Currently the CUDA context of pytorch requires about 1 GB of CUDA memory, which
452 |   means that even when all Tensors are on CPU, 1 GB of CUDA memory is wasted, :-(. However
453 |   it's still under investigation whether I can fully destroy the context and then
454 |   re-init it.
455 |
456 |
457 | ### ACK
458 |
459 | I suffered a lot debugging weird memory usage during my 3 years of developing
460 | efficient Deep Learning models, and of course learned a lot from the great
461 | open source community.
462 |
463 | ## CHANGES
464 |
465 |
466 | ##### 0.3.0 (2023-7-29)
467 | - Fix `DataFrame.drop` for pandas 1.5+
468 | ##### 0.2.4 (2021-10-28)
469 | - Fix colab error (#35)
470 | - Support python3.8 (#38)
471 | - Support sparse tensor (#30)
472 | ##### 0.2.3 (2020-12-01)
473 | - Fix name mapping in `MemReporter` (#24)
474 | - Fix reporter without model input (#22 #25)
475 | ##### 0.2.2 (2020-10-23)
476 | - Fix memory leak in `MemReporter`
477 | ##### 0.2.1 (2020-06-18)
478 | - Fix `line_profiler` not found
479 | ##### 0.2.0 (2020-06-15)
480 | - Add jupyter notebook figure and ipython support
481 | ##### 0.1.0 (2020-04-17)
482 | - Add ipython magic support (#8)
483 | ##### 0.0.4 (2019-10-08)
484 | - Add gpu switch for line-profiler(#2)
485 | - Add device filter for reporter
486 | ##### 0.0.3 (2019-06-15)
487 | - Install dependency for pip installation
488 | ##### 0.0.2 (2019-06-04)
489 | - Fix statistics shift in loop
490 | ##### 0.0.1 (2019-05-28)
491 | - initial release
492 |
493 | ## Star History
494 |
495 | [](https://star-history.com/#stonesjtu/pytorch_memlab&Date)
496 |
--------------------------------------------------------------------------------
/demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Once installed, you need to load the `pytorch_memlab` IPython extensions:"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "%load_ext pytorch_memlab"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "One magic is provided, `mlrun` which can act either as a line magic `%mlrun`, or as a cell magic `%%mlrun`"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "%%mlrun?"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "First we need some torch code to profile:"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "import torch\n",
49 | "\n",
50 | "def x():\n",
51 | " torch.nn.Linear(100, 100).cuda()\n",
52 | " \n",
53 | "def y(gpu=0):\n",
54 | " torch.nn.Linear(1000, 100).cuda(device=gpu)"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "We can profile multiple functions at the same type by repeatedly specifying `-f`"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 4,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "data": {
71 | "text/html": [
72 | "x
active_bytes | reserved_bytes | line | code |
all | all | | |
peak | peak | | |
\n",
98 | " \n",
99 | " 0.00B | \n",
100 | " 0.00B | \n",
101 | " 3 | \n",
102 | " def x():\n",
103 | " | \n",
104 | "
\n",
105 | " \n",
106 | " 40.00K | \n",
107 | " 2.00M | \n",
108 | " 4 | \n",
109 | " torch.nn.Linear(100, 100).cuda()\n",
110 | " | \n",
111 | "
\n",
112 | "
\n",
113 | "y
active_bytes | reserved_bytes | line | code |
all | all | | |
peak | peak | | |
\n",
140 | " \n",
141 | " 0.00B | \n",
142 | " 2.00M | \n",
143 | " 6 | \n",
144 | " def y(gpu=0):\n",
145 | " | \n",
146 | "
\n",
147 | " \n",
148 | " 391.50K | \n",
149 | " 2.00M | \n",
150 | " 7 | \n",
151 | " torch.nn.Linear(1000, 100).cuda(device=gpu)\n",
152 | " | \n",
153 | "
\n",
154 | "
"
155 | ],
156 | "text/plain": [
157 | "## x\n",
158 | "\n",
159 | "active_bytes reserved_bytes line code \n",
160 | " all all \n",
161 | " peak peak \n",
162 | " 0.00B 0.00B 3 def x(): \n",
163 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() \n",
164 | "\n",
165 | "\n",
166 | "## y\n",
167 | "\n",
168 | "active_bytes reserved_bytes line code \n",
169 | " all all \n",
170 | " peak peak \n",
171 | " 0.00B 2.00M 6 def y(gpu=0): \n",
172 | " 391.50K 2.00M 7 torch.nn.Linear(1000, 100).cuda(device=gpu) "
173 | ]
174 | },
175 | "execution_count": 4,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "%%mlrun -f x -f y\n",
182 | "\n",
183 | "x()\n",
184 | "y()"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "metadata": {},
190 | "source": [
191 | "You can alos profile with the `%mlrun` line magic"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 5,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/html": [
202 | "z
active_bytes | reserved_bytes | line | code |
all | all | | |
peak | peak | | |
\n",
228 | " \n",
229 | " 0.00B | \n",
230 | " 0.00B | \n",
231 | " 1 | \n",
232 | " def z():\n",
233 | " | \n",
234 | "
\n",
235 | " \n",
236 | " 40.00K | \n",
237 | " 2.00M | \n",
238 | " 2 | \n",
239 | " torch.nn.Linear(100, 100).cuda()\n",
240 | " | \n",
241 | "
\n",
242 | "
"
243 | ],
244 | "text/plain": [
245 | "## z\n",
246 | "\n",
247 | "active_bytes reserved_bytes line code \n",
248 | " all all \n",
249 | " peak peak \n",
250 | " 0.00B 0.00B 1 def z(): \n",
251 | " 40.00K 2.00M 2 torch.nn.Linear(100, 100).cuda() "
252 | ]
253 | },
254 | "execution_count": 5,
255 | "metadata": {},
256 | "output_type": "execute_result"
257 | }
258 | ],
259 | "source": [
260 | "def z():\n",
261 | " torch.nn.Linear(100, 100).cuda()\n",
262 | "%mlrun -f z z()"
263 | ]
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "You can specify which GPU you wish to profile using `-g`:"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 6,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "data": {
279 | "text/html": [
280 | "x
active_bytes | reserved_bytes | line | code |
all | all | | |
peak | peak | | |
\n",
306 | " \n",
307 | " 0.00B | \n",
308 | " 0.00B | \n",
309 | " 3 | \n",
310 | " def x():\n",
311 | " | \n",
312 | "
\n",
313 | " \n",
314 | " 40.00K | \n",
315 | " 2.00M | \n",
316 | " 4 | \n",
317 | " torch.nn.Linear(100, 100).cuda()\n",
318 | " | \n",
319 | "
\n",
320 | "
\n",
321 | "y
active_bytes | reserved_bytes | line | code |
all | all | | |
peak | peak | | |
\n",
348 | " \n",
349 | " 0.00B | \n",
350 | " 2.00M | \n",
351 | " 6 | \n",
352 | " def y(gpu=0):\n",
353 | " | \n",
354 | "
\n",
355 | " \n",
356 | " 391.50K | \n",
357 | " 2.00M | \n",
358 | " 7 | \n",
359 | " torch.nn.Linear(1000, 100).cuda(device=gpu)\n",
360 | " | \n",
361 | "
\n",
362 | "
"
363 | ],
364 | "text/plain": [
365 | "## x\n",
366 | "\n",
367 | "active_bytes reserved_bytes line code \n",
368 | " all all \n",
369 | " peak peak \n",
370 | " 0.00B 0.00B 3 def x(): \n",
371 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() \n",
372 | "\n",
373 | "\n",
374 | "## y\n",
375 | "\n",
376 | "active_bytes reserved_bytes line code \n",
377 | " all all \n",
378 | " peak peak \n",
379 | " 0.00B 2.00M 6 def y(gpu=0): \n",
380 | " 391.50K 2.00M 7 torch.nn.Linear(1000, 100).cuda(device=gpu) "
381 | ]
382 | },
383 | "execution_count": 6,
384 | "metadata": {},
385 | "output_type": "execute_result"
386 | }
387 | ],
388 | "source": [
389 | "%%mlrun -f x -f y -g 0 y\n",
390 | "\n",
391 | "x()\n",
392 | "y(gpu=0)"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "metadata": {},
398 | "source": [
399 | "You can get a handle on the `LineProfiler` object using `-r`"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 7,
405 | "metadata": {},
406 | "outputs": [
407 | {
408 | "data": {
409 | "text/html": [
410 | "x
active_bytes | reserved_bytes | line | code |
all | all | | |
peak | peak | | |
\n",
436 | " \n",
437 | " 0.00B | \n",
438 | " 0.00B | \n",
439 | " 3 | \n",
440 | " def x():\n",
441 | " | \n",
442 | "
\n",
443 | " \n",
444 | " 40.00K | \n",
445 | " 2.00M | \n",
446 | " 4 | \n",
447 | " torch.nn.Linear(100, 100).cuda()\n",
448 | " | \n",
449 | "
\n",
450 | "
"
451 | ],
452 | "text/plain": [
453 | "## x\n",
454 | "\n",
455 | "active_bytes reserved_bytes line code \n",
456 | " all all \n",
457 | " peak peak \n",
458 | " 0.00B 0.00B 3 def x(): \n",
459 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() "
460 | ]
461 | },
462 | "execution_count": 7,
463 | "metadata": {},
464 | "output_type": "execute_result"
465 | }
466 | ],
467 | "source": [
468 | "profiler = %mlrun -q -r -f x x()\n",
469 | "profiler.display()"
470 | ]
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "metadata": {},
475 | "source": [
476 | "You can dump stats out to a file using `-T`:"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 8,
482 | "metadata": {},
483 | "outputs": [
484 | {
485 | "data": {
486 | "text/html": [
487 | "x
active_bytes | reserved_bytes | line | code |
all | all | | |
peak | peak | | |
\n",
513 | " \n",
514 | " 0.00B | \n",
515 | " 0.00B | \n",
516 | " 3 | \n",
517 | " def x():\n",
518 | " | \n",
519 | "
\n",
520 | " \n",
521 | " 40.00K | \n",
522 | " 2.00M | \n",
523 | " 4 | \n",
524 | " torch.nn.Linear(100, 100).cuda()\n",
525 | " | \n",
526 | "
\n",
527 | "
"
528 | ],
529 | "text/plain": [
530 | "## x\n",
531 | "\n",
532 | "active_bytes reserved_bytes line code \n",
533 | " all all \n",
534 | " peak peak \n",
535 | " 0.00B 0.00B 3 def x(): \n",
536 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() "
537 | ]
538 | },
539 | "execution_count": 8,
540 | "metadata": {},
541 | "output_type": "execute_result"
542 | }
543 | ],
544 | "source": [
545 | "%mlrun -q -T profile.log -f x x()"
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 9,
551 | "metadata": {},
552 | "outputs": [
553 | {
554 | "name": "stdout",
555 | "output_type": "stream",
556 | "text": [
557 | "## x\r\n",
558 | "\r\n",
559 | "active_bytes reserved_bytes line code \r\n",
560 | " all all \r\n",
561 | " peak peak \r\n",
562 | " 0.00B 0.00B 3 def x(): \r\n",
563 | " 40.00K 2.00M 4 torch.nn.Linear(100, 100).cuda() \r\n"
564 | ]
565 | }
566 | ],
567 | "source": [
568 | "!head profile.log"
569 | ]
570 | }
571 | ],
572 | "metadata": {
573 | "kernelspec": {
574 | "display_name": "Python 3",
575 | "language": "python",
576 | "name": "python3"
577 | },
578 | "language_info": {
579 | "codemirror_mode": {
580 | "name": "ipython",
581 | "version": 3
582 | },
583 | "file_extension": ".py",
584 | "mimetype": "text/x-python",
585 | "name": "python",
586 | "nbconvert_exporter": "python",
587 | "pygments_lexer": "ipython3",
588 | "version": "3.7.3"
589 | }
590 | },
591 | "nbformat": 4,
592 | "nbformat_minor": 4
593 | }
594 |
--------------------------------------------------------------------------------
/pytorch_memlab/__init__.py:
--------------------------------------------------------------------------------
1 | from .courtesy import Courtesy
2 | from .mem_reporter import MemReporter
3 | from .line_profiler import LineProfiler, profile, profile_every, set_target_gpu, clear_global_line_profiler
4 | try:
5 | from .line_profiler.extension import load_ipython_extension
6 | except ImportError:
7 | pass
8 |
--------------------------------------------------------------------------------
/pytorch_memlab/courtesy.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import torch
3 |
4 |
5 | class Courtesy():
6 | """A class to yield CUDA memory at any time in the training
7 |
8 | The whole save/load is a bit tricky because all data transfer should
9 | be inplace operation and gradient agnostic
10 | """
11 | def __init__(self):
12 | self.loc_map = {}
13 |
14 | def yield_memory(self):
15 | """Transfer all the CUDA tensors into CPU memory"""
16 | tensors = [obj for obj in gc.get_objects() if isinstance(obj, torch.Tensor)]
17 | for t in tensors:
18 | # in case tensors appear more than once
19 | if t not in self.loc_map:
20 | self.loc_map[t] = t.device
21 |
22 | t.data = t.data.cpu()
23 | # parameters have one more wrapper for .data
24 | if isinstance(t, torch.nn.Parameter):
25 | # sometimes Parameter does not have grad
26 | try:
27 | t.grad.data = t.grad.cpu()
28 |                 except AttributeError:
29 | pass
30 | torch.cuda.empty_cache()
31 |
32 | def restore(self):
33 | """Restore the tensors into original CUDA devices"""
34 | for t, device in self.loc_map.items():
35 | t.data = t.data.to(device)
36 | if isinstance(t, torch.nn.Parameter):
37 | # sometimes Parameter does not have grad
38 | try:
39 | t.grad = t.grad.to(device)
40 |                 except AttributeError:
41 | pass
42 | self.loc_map.clear()
43 |
44 | def __enter__(self):
45 | self.yield_memory()
46 | return self
47 |
48 | def __exit__(self, *args):
49 | self.restore()
50 |
--------------------------------------------------------------------------------
/pytorch_memlab/line_profiler/__init__.py:
--------------------------------------------------------------------------------
1 | from .line_profiler import LineProfiler
2 | from .profile import profile, profile_every, set_target_gpu, clear_global_line_profiler
3 |
--------------------------------------------------------------------------------
/pytorch_memlab/line_profiler/extension.py:
--------------------------------------------------------------------------------
1 | """IPython & notebook extension interface"""
2 | from IPython.core.magic import (
3 | Magics,
4 | magics_class,
5 | line_cell_magic,
6 | needs_local_scope,
7 | )
8 | from IPython.core.magic_arguments import magic_arguments, argument, parse_argstring
9 |
10 | from .line_profiler import LineProfiler, DEFAULT_COLUMNS
11 |
12 |
13 | class UsageError(Exception):
14 | pass
15 |
16 |
17 | @magics_class
18 | class MemlabMagics(Magics):
19 | @magic_arguments()
20 | @argument('--function',
21 | '-f',
22 | metavar='FUNC',
23 | action='append',
24 | default=[],
25 | help="""Function to profile. Can be specified multiple times to profile multiple
26 | functions""")
27 | @argument('--column',
28 | '-c',
29 | metavar='COLS',
30 | action='append',
31 | default=[],
32 | help="""Columns to display. Can be specified multiple times to profile multiple
33 | functions. See the Torch CUDA spec at
34 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats for details.""")
35 | @argument('-D',
36 | '--no_default_columns',
37 | action='store_true',
38 | help='Hide the default columns of ' + ", ".join(DEFAULT_COLUMNS))
39 | @argument('-r',
40 | '--return-profiler',
41 | action='store_true',
42 | help='Return LineProfiler object for introspection')
43 | @argument('-g',
44 | '--gpu',
45 | metavar='GPU_ID',
46 | default=0,
47 | type=int,
48 | help='Profile memory usage of this GPU')
49 | @argument('-q',
50 | '--quiet',
51 | action='store_true',
52 | help='Don\'t print out profile results')
53 | @argument('statement',
54 | nargs='*',
55 | default=None,
56 | help='Code to run under profiler. You can omit this in cell magic mode.')
57 | @argument('-T',
58 | '--dump-profile',
59 | metavar='OUTPUT',
60 | help='Dump text profile output to file')
61 | @line_cell_magic
62 | @needs_local_scope
63 | def mlrun(self, line=None, cell=None, local_ns=None):
64 | """Execute a statement/cell under the PyTorch Memlab profiler to collect CUDA memory
65 | allocation information on a per-line basis.
66 | """
67 | args = parse_argstring(self.mlrun, line)
68 | global_ns = self.shell.user_global_ns
69 |
70 | funcs = []
71 | for name in args.function:
72 | try:
73 | fn = eval(name, global_ns, local_ns)
74 | funcs.append(fn)
75 | except NameError as e:
76 | raise UsageError('Could not find function {!r}.\n{}: {}'.format(
77 | name, e.__class__.__name__, e)
78 | )
79 | profiler = LineProfiler(*funcs, target_gpu=args.gpu)
80 | if cell is not None:
81 | code = cell
82 | else:
83 | assert args.statement is not None
84 | code = '\n'.join(args.statement)
85 | with profiler:
86 | exec(compile(code, filename='', mode='exec'), local_ns)
87 |
88 | if args.dump_profile is not None:
89 | with open(args.dump_profile, 'w') as f:
90 | profiler.print_stats(stream=f)
91 |
92 | if args.return_profiler:
93 | return profiler
94 | else:
95 | defaults = [] if args.no_default_columns else list(DEFAULT_COLUMNS)
96 | return profiler.display(columns=defaults + args.column)
97 |
98 |
99 | def load_ipython_extension(ipython):
100 | ipython.register_magics(MemlabMagics)
101 |
--------------------------------------------------------------------------------
/pytorch_memlab/line_profiler/line_profiler.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | import sys
3 | from types import FrameType
4 | import warnings
5 | from typing import Any, Callable, Optional, Tuple
6 |
7 | import torch
8 |
9 | from .line_records import LineRecords
10 |
11 | # Default fields of torch.cuda.memory_stats() to display
12 | DEFAULT_COLUMNS = ('active_bytes.all.peak', 'reserved_bytes.all.peak')
13 |
14 |
15 | class LineProfiler:
16 | """Profile the CUDA memory usage info for each line in pytorch
17 |
18 |     This class registers callbacks for the added functions to profile them line
19 |     by line, and collects all the CUDA memory statistics. Usually you will
20 |     want to use the simpler wrappers `profile` or `profile_every` below.
21 |
22 | The CUDA memory is collected only on the **current** cuda device.
23 |
24 | Usage:
25 | ```python
26 | with LineProfiler(func) as lp:
27 |             func()
28 |         lp.display()
29 |         ```
30 | ```python
31 | lp = LineProfiler(func)
32 | lp.enable()
33 | func()
34 | lp.disable()
35 | lp.display()
36 | ```
37 | """
38 |
39 | def __init__(self, *functions: Callable, target_gpu: int = 0):
40 | self.target_gpu = target_gpu
41 | self._code_infos = {}
42 | self._raw_line_records = []
43 | self.enabled = False
44 | for func in functions:
45 | self.add_function(func)
46 |
47 | def add_function(self, func: Callable) -> None:
48 | """ Record line profiling information for the given Python function.
49 | """
50 | try:
51 | # We need to use the hash here because pandas will later expect something
52 | # orderable for its index
53 | code_hash = hash(func.__code__)
54 | except AttributeError:
55 | warnings.warn(
56 | "Could not extract a code object for the object %r" % (func,))
57 | return
58 | if code_hash not in self._code_infos:
59 | first_line = inspect.getsourcelines(func)[1]
60 | self._code_infos[code_hash] = {
61 | 'func': func,
62 | 'first_line': first_line,
63 | 'prev_line': first_line,
64 | 'prev_record': -1,
65 | }
66 |
67 | # re-register the newer trace_callback
68 | if self.enabled:
69 | self.register_callback()
70 |
71 | def __enter__(self):
72 | self.enable()
73 | return self
74 |
75 | def __exit__(self, exc_type, exc_val, exc_tb):
76 | self.disable()
77 |
78 | def register_callback(self):
79 | """Register the trace_callback only on demand"""
80 | if self._code_infos:
81 | sys.settrace(self._trace_callback)
82 |
83 | def _reset_cuda_stats(self):
84 | torch.cuda.reset_peak_memory_stats()
85 | torch.cuda.reset_accumulated_memory_stats()
86 |
87 | def enable(self):
88 | """Enable the profiler and register trace callback"""
89 | if not torch.cuda.is_available():
90 |             print('Could not find CUDA devices, skipping CUDA stats reset and cache clearing')
91 | return
92 | torch.cuda.empty_cache()
93 | self._reset_cuda_stats()
94 | self.enabled = True
95 | self.register_callback()
96 |
97 | def disable(self):
98 | """Disable the profiler and clear trace callback"""
99 | self.enabled = False
100 | sys.settrace(None)
101 |
102 | def clear(self):
103 | """Clear the state of the line profiler"""
104 | self._code_infos = {}
105 | self._raw_line_records = []
106 |
107 | def _trace_callback(self, frame: FrameType, event: str, _unused_arg: Tuple[Any, ...]):
108 | """Trace the execution of python line-by-line"""
109 |
110 | if event == 'call':
111 | return self._trace_callback
112 |
113 | code_hash = hash(frame.f_code)
114 | if event in ['line', 'return'] and code_hash in self._code_infos:
115 | code_info = self._code_infos[code_hash]
116 | with torch.cuda.device(self.target_gpu):
117 | self._raw_line_records.append({
118 | 'code_hash': code_hash,
119 | 'line': code_info['prev_line'],
120 | 'prev_record_idx': code_info['prev_record'],
121 | **torch.cuda.memory_stats()})
122 | self._reset_cuda_stats()
123 |
124 | if event == 'line':
125 | code_info['prev_line'] = frame.f_lineno
126 | code_info['prev_record'] = len(self._raw_line_records)-1
127 | elif event == 'return':
128 | code_info['prev_line'] = code_info['first_line']
129 | code_info['prev_record'] = -1
130 |
131 | def display(self, func: Optional[Callable] = None, columns: Tuple[str, ...] = DEFAULT_COLUMNS) -> LineRecords:
132 | """Display the profiling results on either IPython or CLI
133 |
134 | The columns are explained in the PyTorch documentation:
135 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats
136 |
137 | .. note:: To work, this needs to be the last thing returned in the IPython statement or cell.
138 |
139 | Args:
140 |             func (str): the function name of interest, None for all registered functions
141 | columns (list of str): the column names of interest, See PyTorch's doc for available names.
142 |
143 | Returns:
144 | RecordsDisplay: Returns an object that'll display the recorded stats in the IPython console
145 | """
146 | return LineRecords(self._raw_line_records, self._code_infos).display(func, columns)
147 |
148 | def print_stats(self, func: Optional[Callable] = None, columns: Tuple[str, ...] = DEFAULT_COLUMNS, stream=sys.stdout):
149 | """Print the text profiling results to stream
150 |
151 | The columns are explained in the PyTorch documentation:
152 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats
153 |
154 | Args:
155 |             func (str): the function name of interest, None for all registered functions
156 | columns (list of str): the column names of interest, See PyTorch's doc for available names
157 | stream (IO-like object): the stream to write to
158 | """
159 | stream.write(str(self.display(func, columns)))
160 |
--------------------------------------------------------------------------------
/pytorch_memlab/line_profiler/line_records.py:
--------------------------------------------------------------------------------
1 | """Class and helper functions for processing and displaying line records"""
2 | import inspect
3 | from typing import Callable, Optional, Tuple, List, Dict, Any
4 | import pandas as pd
5 |
6 | from ..utils import readable_size
7 |
8 |
9 | COLORS = [
10 | '#4878d0', '#ee854a', '#6acc64', '#d65f5f', '#956cb4',
11 | '#8c613c', '#dc7ec0', '#797979', '#d5bb67', '#82c6e2',
12 | ]
13 |
14 |
15 |
16 | def _accumulate_line_records(raw_line_records: List[Dict[str, Any]]) -> pd.DataFrame:
17 | """The raw records give the memory stats between successive lines executed by the profiler.
18 | But we want the memory stats between successive lines in our functions! The two diverge when
19 | a function we're profiling calls another function we're profiling, since then Torch will have
20 | its peak/allocated/freed memory stats reset on each line of the called function.
21 |
22 | To fix that, here we look at each line record in turn, and for peak stats we take the
23 | maximum since the last record _in the same function_. For allocated/freed stats, we take the
24 | sum since the last record in the same function.
25 | """
26 |
27 | # We'll do this in numpy because indexing lots of rows and columns in pandas is dog-slow.
28 | raw = pd.DataFrame(raw_line_records)
29 | acc_mask = raw.columns.str.match(r'.*(allocated|freed)$')
30 | peak_mask = raw.columns.str.match(r'.*(peak)$')
31 | acc_raw, peak_raw = raw.loc[:, acc_mask].values, raw.loc[:, peak_mask].values
32 | acc_refined, peak_refined = acc_raw.copy(), peak_raw.copy()
33 |
34 | for row, record in enumerate(raw_line_records):
35 | if record['prev_record_idx'] == -1:
36 | # No previous data to accumulate from
37 | continue
38 | if record['prev_record_idx'] == row-1:
39 | # Previous record was the previous line, so no need to accumulate anything
40 | continue
41 |
42 | # Another profiled function has been called since the last record, so we need to
43 | # accumulate the allocated/freed/peaks of the intervening records into this one.
44 | acc_refined[row] = acc_raw[record['prev_record_idx']+1:row+1].sum(0)
45 | peak_refined[row] = peak_raw[record['prev_record_idx']+1:row+1].max(0)
46 |
47 | refined = raw.copy()
48 | refined.loc[:, acc_mask] = acc_refined
49 | refined.loc[:, peak_mask] = peak_refined
50 | return refined
51 |
52 |
53 | def _line_records(raw_line_records: List[Dict[str, Any]], code_infos: Dict[int, Dict[str, Any]]) -> pd.DataFrame:
54 | """Converts the raw line records to a nicely-shaped dataframe whose values reflect
55 | the memory usage of lines of _functions_ rather than lines of _execution_. See the
56 | `_accumulate_line_records` docstring for more detail."""
57 | # Column spec: https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats
58 | qual_names = {
59 | code_hash: info['func'].__qualname__ for code_hash, info in code_infos.items()}
60 | # pandas < 2.1.0 support (python3.8)
61 | try:
62 | records = (_accumulate_line_records(raw_line_records)
63 | .assign(qual_name=lambda df: df.code_hash.map(qual_names))
64 | .set_index(['qual_name', 'line'])
65 | .drop(['code_hash', 'num_alloc_retries', 'num_ooms', 'prev_record_idx'], axis=1))
66 | except AttributeError:
67 | records = (_accumulate_line_records(raw_line_records)
68 | .assign(qual_name=lambda df: df.code_hash.applymap(qual_names))
69 | .set_index(['qual_name', 'line'])
70 | .drop(['code_hash', 'num_alloc_retries', 'num_ooms', 'prev_record_idx'], axis=1))
71 | records.columns = pd.MultiIndex.from_tuples(
72 | [c.split('.') for c in records.columns])
73 |
74 | return records
75 |
76 |
77 | class LineRecords:
78 | """Class for processing raw line records and display on IPython & CLI
79 | """
80 |
81 |     def __init__(self, raw_line_records: List[Dict[str, Any]], code_infos: Dict[int, Dict[str, Any]]):
82 | super().__init__()
83 | self._raw_line_records = raw_line_records
84 | self._code_infos = code_infos
85 |
86 | def display(self, func: Callable[..., Any], columns: Tuple[str, ...]):
87 | """Display the records to either notebook or CLI
88 |
89 | The columns are explained in the PyTorch documentation:
90 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats
91 |
92 | .. note:: Make this call the last one in a notebook cell
93 |
94 | Args:
95 |             func (str): the function name of interest, None for all registered functions
96 | columns (list of str): the column names of interest, See PyTorch's doc for available names.
97 |
98 | Returns:
99 |             RecordsDisplay: an IPython-friendly object which converts records to HTML or plain text
100 | """
101 | line_records = self._filter_raw_line_records(func, columns)
102 | return RecordsDisplay(line_records, self._code_infos)
103 |
104 | def _filter_raw_line_records(self, func: Callable[..., Any], columns: Tuple[str, ...]) -> pd.DataFrame:
105 | """Get the line records
106 |
107 | The columns are explained in the PyTorch documentation:
108 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats
109 |
110 | Args:
111 |             func (str): the function name of interest, None for all registered functions
112 | columns (list of str): the column names of interest, See PyTorch's doc for available names.
113 |
114 | Returns:
115 | pd.DataFrame: a (line, statistic)-indexed dataframe of memory stats.
116 | """
117 | if len(self._raw_line_records) == 0:
118 | return pd.DataFrame(index=pd.MultiIndex.from_product([[], []]), columns=columns)
119 |
120 | line_records = _line_records(self._raw_line_records, self._code_infos)
121 | line_records = _extract_line_records(line_records, func, columns)
122 |
123 | if len(line_records) > 0:
124 | line_records = line_records.groupby(level=[0, 1]).max()
125 |
126 | return line_records
127 |
128 |
129 | def _extract_line_records(line_records: pd.DataFrame, func: Optional[Callable] = None, columns: Optional[Tuple[str, ...]] = None):
130 | """Extracts the subset of a line_records dataframe pertinent to a given set of functions and
131 | columns"""
132 | if func is not None:
133 | # Support both passing the function directly and passing a qual name/list of qual names
134 | line_records = line_records.loc[[func.__qualname__] if callable(func) else func]
135 |
136 | if columns is not None:
137 | columns = [tuple(c.split('.')) for c in columns]
138 | if not all(len(c) == 3 for c in columns):
139 | raise ValueError('Each column name should have three dot-separated parts')
140 | if not all(c in line_records.columns for c in columns):
141 | options = ", ".join(".".join(c)
142 | for c in line_records.columns.tolist())
143 | raise ValueError(
144 |                 'The column names should be fields of torch.cuda.memory_stats(). Options are: ' + options)
145 | line_records = line_records.loc[:, columns]
146 |
147 | return line_records
148 |
149 |
150 | class RecordsDisplay:
151 | """Class for processing raw line records and display on IPython & CLI
152 |
153 | IPython's rich display functionality [requires we return](https://ipython.readthedocs.io/en/stable/config/integrating.html)
154 | an object that has a `_repr_html_` method for when HTML rendering is supported, and
155 | a `__repr__` method for when only text is available
156 | """
157 |     def __init__(self, line_records: pd.DataFrame, code_infos: Dict[int, Dict[str, Any]]):
158 | super().__init__()
159 | self._line_records = line_records
160 | self._code_infos = code_infos
161 | self._merged_line_records = self._merge_line_records_with_code()
162 |
163 | def _merge_line_records_with_code(self) -> Dict[str, Any]:
164 | merged_records = {}
165 | for _, info in self._code_infos.items():
166 | qual_name = info['func'].__qualname__
167 | if qual_name in self._line_records.index.get_level_values(0):
168 | lines, start_line = inspect.getsourcelines(info['func'])
169 | lines = pd.DataFrame.from_dict({
170 | 'line': range(start_line, start_line + len(lines)),
171 | 'code': lines})
172 | lines.columns = pd.MultiIndex.from_product([lines.columns, [''], ['']])
173 |
174 | merged_records[qual_name] = pd.merge(
175 | self._line_records.loc[qual_name], lines,
176 | right_on='line', left_index=True, how='right')
177 | return merged_records
178 |
179 | def __repr__(self):
180 | """Renders the stats as text"""
181 | if len(self._line_records) == 0:
182 | return 'No data collected\n'
183 |
184 | is_byte_col = self._line_records.columns.get_level_values(0).str.contains('byte')
185 | byte_cols = self._line_records.columns[is_byte_col]
186 |
187 | string = {}
188 | for qual_name, merged in self._merge_line_records_with_code().items():
189 | maxlen = max(len(c) for c in merged.code)
190 | left_align = '{{:{maxlen}s}}'.format(maxlen=maxlen)
191 | # pandas < 2.1.0 support (python3.8)
192 | try:
193 | merged[byte_cols] = merged[byte_cols].map(readable_size)
194 | except AttributeError:
195 | merged[byte_cols] = merged[byte_cols].applymap(readable_size)
196 |
197 | # This is a mess, but I can't find any other way to left-align text strings.
198 | code_header = (left_align.format('code'), '', '')
199 | merged[code_header] = merged['code'].apply(lambda l: left_align.format(l.rstrip('\n\r')))
200 | merged = merged.drop('code', axis=1, level=0)
201 |
202 | string[qual_name] = merged.to_string(index=False)
203 |
204 | return '\n\n'.join(['## {q}\n\n{c}\n'.format(q=q, c=c) for q, c in string.items()])
205 |
206 | def _repr_html_(self):
207 | """Renders the stats as HTML"""
208 | if len(self._line_records) == 0:
209 |             return '<p>No data collected</p>'
210 |
211 | is_byte_col = self._line_records.columns.get_level_values(0).str.contains('byte')
212 | byte_cols = self._line_records.columns[is_byte_col]
213 | maxes = self._line_records.max()
214 |
215 | html = {}
216 | for qual_name, merged in self._merge_line_records_with_code().items():
217 |
218 | style = merged.style
219 |
220 | # Style the bar charts
221 | for i, c in enumerate(self._line_records.columns):
222 | style = style.bar([c], color=COLORS[i % len(COLORS)],
223 | width=99, vmin=0, vmax=maxes[c])
224 |
225 | # Style the text
226 | html[qual_name] = (style
227 | .format({c: readable_size for c in byte_cols})
228 | .set_properties(
229 | subset=['code'], **{
230 | 'text-align': 'left',
231 | 'white-space': 'pre',
232 | 'font-family': 'monospace'})
233 | .set_table_styles([{
234 | 'selector': 'th',
235 | 'props': [('text-align', 'left')]}])
236 | .hide(axis=0)
237 | .to_html())
238 |
239 |         template = '<h3>{q}</h3>\n{c}\n'
240 | return '\n'.join(template.format(q=q, c=c) for q, c in html.items())
241 |
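To make the column check in `_extract_line_records` concrete: each name must be a three-part, dot-separated key of `torch.cuda.memory_stats()`. A minimal sketch, assuming a CUDA device is available and that the recorded line records contain the standard `memory_stats()` fields:

```python
import torch
from pytorch_memlab import LineProfiler

def work():
    linear = torch.nn.Linear(100, 100).cuda()  # allocates weights on the current GPU

with LineProfiler(work) as prof:
    work()

# Column names follow the '<stat>.<pool>.<metric>' keys of torch.cuda.memory_stats(),
# e.g. 'allocated_bytes.all.peak'; anything else raises the ValueError above.
prof.print_stats(work, ('allocated_bytes.all.peak', 'reserved_bytes.all.peak'))

# In a notebook, prof.display() returns a RecordsDisplay, which IPython renders
# through _repr_html_; on a plain console, __repr__ produces the text tables.
```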
--------------------------------------------------------------------------------
/pytorch_memlab/line_profiler/profile.py:
--------------------------------------------------------------------------------
1 | from functools import wraps
2 | from typing import Callable, Tuple
3 | from .line_profiler import LineProfiler, DEFAULT_COLUMNS
4 |
5 |
6 | global_line_profiler = LineProfiler()
7 | global_line_profiler.enable()
8 |
9 |
10 | def clear_global_line_profiler():
11 | """Clears the state of the global line profiler"""
12 | global_line_profiler.clear()
13 |
14 |
15 | def set_target_gpu(gpu_id: int):
16 | """Set the target GPU id to profile memory
17 |
18 |     Because of limited output space, the line profiler only shows memory
19 |     usage for a single GPU. Use this function to switch the GPU being
20 |     profiled; the switch can be made before profiling starts or even inside
21 |     the profiled functions.
22 |
23 | Args:
24 | - gpu_id: cuda index to profile the memory on,
25 | also accepts `torch.device` object.
26 | """
27 | global_line_profiler.target_gpu = gpu_id
28 |
29 |
30 | def profile(func: Callable, columns: Tuple[str, ...] = DEFAULT_COLUMNS):
31 | """Profile the CUDA memory usage of target function line by line
32 |
33 |     The profiling results are printed when the program exits or a KeyboardInterrupt is raised.
34 | The CUDA memory is collected only on the **current** cuda device.
35 |
36 | The columns are explained in the PyTorch documentation:
37 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats
38 |
39 | Args:
40 | func: the function or method to profile on
41 |         columns (tuple of str): the column names of interest; see PyTorch's documentation for available names.
42 |
43 | Usage:
44 | ```python
45 | @profile
46 | def foo():
47 | linear = torch.nn.Linear(100, 100).cuda()
48 |
49 | foo()
50 |
51 | class Foo(torch.nn.Module):
52 | def __init__(self):
53 | super().__init__()
54 | self.linear = torch.nn.Linear(100, 100).cuda()
55 |
56 | @profile
57 | def forward(self, inp):
58 | return self.linear(inp)
59 |
60 | inp = torch.Tensor(50, 100).cuda()
61 | foo = Foo()
62 | foo(inp)
63 | ```
64 | """
65 | import atexit
66 | global_line_profiler.add_function(func)
67 |
68 | def print_stats_atexit():
69 | global_line_profiler.print_stats(func, columns)
70 | atexit.register(print_stats_atexit)
71 |
72 | return func
73 |
74 |
75 | def profile_every(output_interval: int = 1, enable: bool = True, columns: Tuple[str, ...] = DEFAULT_COLUMNS):
76 | """Profile the CUDA memory usage of target function line by line
77 |
78 |     Prints the profiling output every `output_interval` executions of the target
79 |     function.
80 | The CUDA memory is collected only on the **current** cuda device.
81 |
82 | The columns are explained in the PyTorch documentation:
83 | https://pytorch.org/docs/stable/cuda.html#torch.cuda.memory_stats
84 |
85 | Args:
86 |         output_interval (int): print the profiling results every N executions
87 |         enable (bool): whether to enable profiling, so users can toggle it
88 |             without modifying the decorated source code.
89 |         columns (tuple of str): the column names of interest; see PyTorch's documentation for available names.
90 | """
91 |
92 | def inner_decorator(func: Callable):
93 | func.cur_idx = 1
94 |
95 | if enable:
96 | global_line_profiler.add_function(func)
97 |
98 | @wraps(func)
99 | def run_func(*args, **kwargs):
100 | res = func(*args, **kwargs)
101 | if enable:
102 | if func.cur_idx % output_interval == 0:
103 | global_line_profiler.print_stats(func, columns)
104 |
105 | func.cur_idx += 1
106 | return res
107 |
108 | return run_func
109 | return inner_decorator
110 |
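The `profile_every` docstring above has no usage example; here is a minimal sketch, assuming a CUDA device is available (`set_target_gpu(0)` is only illustrative):

```python
import torch
from pytorch_memlab import profile_every, set_target_gpu

@profile_every(output_interval=2)   # print line-by-line stats on every 2nd call
def work():
    set_target_gpu(0)               # report memory usage for cuda:0
    linear = torch.nn.Linear(100, 100).cuda(0)

work()
work()  # the accumulated stats are printed after this call
```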
--------------------------------------------------------------------------------
/pytorch_memlab/mem_reporter.py:
--------------------------------------------------------------------------------
1 | import math
2 | import gc
3 | from collections import defaultdict
4 | from typing import Optional, Tuple, List
5 |
6 | import torch
7 | from .utils import readable_size
8 |
9 | LEN = 79
10 |
11 | # some pytorch low-level memory management constant
12 | # the minimal allocate memory size (Byte)
13 | PYTORCH_MIN_ALLOCATE = 2 ** 9
14 | # the minimal cache memory size (Byte)
15 | PYTORCH_MIN_CACHE = 2 ** 20
16 |
17 | class MemReporter():
18 | """A memory reporter that collects tensors and memory usages
19 |
20 |     Parameters:
21 |         - model: an optional nn.Module whose parameter names are used to
22 |           name the collected Tensors
23 |         - pre_collect: run a garbage collection before collecting the
24 |           remaining Tensors, which gives cleaner output.
25 |           Caution: this is an intrusive change to your original code.
26 |
27 | """
28 | def __init__(self, model: Optional[torch.nn.Module] = None, pre_collect: bool = False):
29 | self.tensor_name = {}
30 | self.device_mapping = defaultdict(list)
31 | self.device_tensor_stat = {}
32 |         # used to number tensors whose name cannot be inferred
33 | self.name_idx = 0
34 | self.pre_collect = pre_collect
35 |
36 | tensor_names = defaultdict(list)
37 | if model is not None:
38 | assert isinstance(model, torch.nn.Module)
39 | # for model with tying weight, multiple parameters may share
40 | # the same underlying tensor
41 | for name, param in model.named_parameters():
42 | tensor_names[param].append(name)
43 |
44 | for param, name in tensor_names.items():
45 | self.tensor_name[id(param)] = '+'.join(name)
46 |
47 | def _get_tensor_name(self, tensor: torch.Tensor) -> str:
48 | tensor_id = id(tensor)
49 | if tensor_id in self.tensor_name:
50 | name = self.tensor_name[tensor_id]
51 | # use numbering if no name can be inferred
52 | else:
53 | name = type(tensor).__name__ + str(self.name_idx)
54 | self.tensor_name[tensor_id] = name
55 | self.name_idx += 1
56 | return name
57 |
58 | def add_optimizer(self, optimizer: torch.optim.Optimizer):
59 | optimizer_name = optimizer.__class__.__name__
60 | for param, states in optimizer.state.items():
61 | param_name = self.tensor_name[id(param)]
62 | for name, tensor in states.items():
63 | self.tensor_name[id(tensor)] = f'{optimizer_name}.{param_name}.{name}'
64 | # self.tensor_name[id()]
65 | # print(states)
66 |
67 |
68 | def collect_tensor(self):
69 | """Collect all tensor objects tracked by python
70 |
71 |         NOTICE:
72 |             - the buffers for backward, which are implemented in C++, are
73 |               not tracked by Python's reference counting.
74 |             - the gradients (.grad) of Parameters are not collected, and
75 |               I don't know why.
76 | """
77 | #FIXME: make the grad tensor collected by gc
78 | # Do a pre-garbage collect to eliminate python garbage objects
79 | if self.pre_collect:
80 | gc.collect()
81 | objects = gc.get_objects()
82 | tensors = [obj for obj in objects if isinstance(obj, torch.Tensor)]
83 | for t in tensors:
84 | self.device_mapping[t.device].append(t)
85 |
86 | def get_stats(self):
87 | """Get the memory stat of tensors and then release them
88 |
89 |         As a memory profiler, we cannot hold references to the tensors ourselves,
90 |         since that would distort the memory usage stats, so we drop the tensors after
91 |         collecting the required stats"""
92 | visited_data = {}
93 | self.device_tensor_stat.clear()
94 |
95 |         def get_tensor_stat(tensor: torch.Tensor) -> List[Tuple[str, Tuple[int, ...], int, int]]:
96 | """Get the stat of a single tensor
97 |
98 | Returns:
99 |             - stat: a list of tuples, each containing (tensor_name, tensor_size,
100 |               tensor_numel, tensor_memory)
101 | """
102 | assert isinstance(tensor, torch.Tensor)
103 |
104 | name = self._get_tensor_name(tensor)
105 | if tensor.is_sparse:
106 | indices_stat = get_tensor_stat(tensor._indices())
107 | values_stat = get_tensor_stat(tensor._values())
108 | return indices_stat + values_stat
109 |
110 | numel = tensor.numel()
111 | element_size = tensor.element_size()
112 | fact_numel = tensor.untyped_storage().size()
113 | fact_memory_size = fact_numel * element_size
114 |             # since pytorch allocates at least 512 bytes for any tensor, round
115 |             # up to a multiple of 512
116 | memory_size = math.ceil(fact_memory_size / PYTORCH_MIN_ALLOCATE) \
117 | * PYTORCH_MIN_ALLOCATE
118 |
119 | # tensor.storage should be the actual object related to memory
120 | # allocation
121 | data_ptr = tensor.untyped_storage().data_ptr()
122 | if data_ptr in visited_data:
123 | name = '{}(->{})'.format(
124 | name,
125 | visited_data[data_ptr],
126 | )
127 | # don't count the memory for reusing same underlying storage
128 | memory_size = 0
129 | else:
130 | visited_data[data_ptr] = name
131 |
132 | size = tuple(tensor.size())
133 | # torch scalar has empty size
134 | if not size:
135 | size = (1,)
136 |
137 | return [(name, size, numel, memory_size)]
138 |
139 | for device, tensors in self.device_mapping.items():
140 | tensor_stats = []
141 | for tensor in tensors:
142 |
143 | if tensor.numel() == 0:
144 | continue
145 | stat = get_tensor_stat(tensor) # (name, shape, numel, memory_size)
146 | tensor_stats += stat
147 | if isinstance(tensor, torch.nn.Parameter):
148 | if tensor.grad is not None:
149 | # manually specify the name of gradient tensor
150 | self.tensor_name[id(tensor.grad)] = '{}.grad'.format(
151 | self._get_tensor_name(tensor)
152 | )
153 | stat = get_tensor_stat(tensor.grad)
154 | tensor_stats += stat
155 |
156 | self.device_tensor_stat[device] = tensor_stats
157 |
158 | self.device_mapping.clear()
159 |
160 | def print_stats(self, verbose: bool = False, target_device: Optional[torch.device] = None) -> None:
161 | # header
162 | show_reuse = verbose
163 | template_format = '{:<40s}{:>20s}{:>10s}'
164 | print(template_format.format('Element type', 'Size', 'Used MEM') )
165 | for device, tensor_stats in self.device_tensor_stat.items():
166 | # By default, if the target_device is not specified,
167 | # print tensors on all devices
168 | if target_device is not None and device != target_device:
169 | continue
170 | print('-' * LEN)
171 | print('Storage on {}'.format(device))
172 | total_mem = 0
173 | total_numel = 0
174 | for stat in tensor_stats:
175 | name, size, numel, mem = stat
176 | if not show_reuse:
177 | name = name.split('(')[0]
178 | print(template_format.format(
179 | str(name),
180 | str(size),
181 | readable_size(mem),
182 | ))
183 | total_mem += mem
184 | total_numel += numel
185 |
186 | print('-'*LEN)
187 | print('Total Tensors: {} \tUsed Memory: {}'.format(
188 | total_numel, readable_size(total_mem),
189 | ))
190 |
191 | if device != torch.device('cpu'):
192 | with torch.cuda.device(device):
193 | memory_allocated = torch.cuda.memory_allocated()
194 | print('The allocated memory on {}: {}'.format(
195 | device, readable_size(memory_allocated),
196 | ))
197 | if memory_allocated != total_mem:
198 | print('Memory differs due to the matrix alignment or'
199 | ' invisible gradient buffer tensors')
200 | print('-'*LEN)
201 |
202 | def report(self, verbose: bool = False, device: Optional[torch.device] = None) -> None:
203 | """Interface for end-users to directly print the memory usage
204 |
205 |         Args:
206 |             - verbose: flag to show tensor.storage reuse information
207 |             - device: `torch.device` object specifying the target device
208 |               to report detailed memory usage for. Memory usage is printed
209 |               for all devices if not specified. Usually we only want to
210 |               print the memory usage on CUDA devices.
211 |
212 | """
213 | self.collect_tensor()
214 | self.get_stats()
215 | self.print_stats(verbose, target_device=device)
216 |
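A minimal end-to-end sketch of the reporter API above, mirroring the optimizer test later in this repo; it assumes a CUDA device and uses Adam purely as an example:

```python
import torch
from pytorch_memlab import MemReporter

linear = torch.nn.Linear(1024, 1024).cuda()
optimizer = torch.optim.Adam(linear.parameters())
reporter = MemReporter(linear)          # parameter names are inferred from the module

out = linear(torch.randn(512, 1024, device='cuda')).mean()
out.backward()
optimizer.step()                        # creates Adam's state tensors

reporter.add_optimizer(optimizer)       # name the optimizer state tensors as well
reporter.report(verbose=True, device=torch.device('cuda:0'))
```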
--------------------------------------------------------------------------------
/pytorch_memlab/utils.py:
--------------------------------------------------------------------------------
1 | from math import isnan
2 | from calmsize import size as calmsize
3 |
4 | def readable_size(num_bytes: int) -> str:
5 | return '' if isnan(num_bytes) else '{:.2f}'.format(calmsize(num_bytes))
6 |
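For reference, a tiny sketch of what `readable_size` produces; the exact string comes from `calmsize`, so the suffix shown in the comments is approximate:

```python
from pytorch_memlab.utils import readable_size

print(readable_size(2 ** 20))       # human-readable size, roughly '1.00M' via calmsize
print(readable_size(float('nan')))  # '' -- NaN renders as an empty string
```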
--------------------------------------------------------------------------------
/readme-output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Stonesjtu/pytorch_memlab/d7ed8e0f75abaaac197c0f9271085a40f2c9083b/readme-output.png
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | try:
4 | long_description = open('README.md').read()
5 | except FileNotFoundError:
6 | long_description = ''
7 |
8 | setup(
9 | name='pytorch-memlab',
10 | version='0.3.0',
12 | description='A lab to do simple and accurate memory experiments on pytorch',
13 | long_description=long_description,
14 | long_description_content_type='text/markdown',
15 | classifiers=[
16 | "Programming Language :: Python",
17 | "Topic :: Software Development :: Libraries :: Python Modules",
18 | ],
19 | keywords='pytorch memory profile',
20 | author='Kaiyu Shi',
21 | author_email='skyisno.1@gmail.com',
22 | url='https://github.com/Stonesjtu/pytorch_memlab',
23 | license='MIT',
24 | include_package_data=True,
25 | zip_safe=True,
26 | python_requires='>=3.8',
27 | install_requires=[
28 | 'setuptools',
29 | 'calmsize',
30 | 'pandas',
31 | 'torch>=2.0',
32 | ],
33 | extras_require={
34 | 'ipython': ['IPython>=0.13'],
35 | 'test': ['pytest'],
36 | },
37 | packages=find_packages(),
38 | )
39 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Stonesjtu/pytorch_memlab/d7ed8e0f75abaaac197c0f9271085a40f2c9083b/test/__init__.py
--------------------------------------------------------------------------------
/test/test_courtesy.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from pytorch_memlab import Courtesy, MemReporter
4 |
5 | def test_reporter():
6 | linear = torch.nn.Linear(1024, 1024).cuda()
7 | inp = torch.Tensor(512, 1024).cuda()
8 |
9 | out = linear(inp).mean()
10 | out.backward()
11 |
12 | reporter = MemReporter(linear)
13 | reporter.report()
14 | ct = Courtesy()
15 | ct.yield_memory()
16 | print('gpu>>>>>>>>>>>>>>>>>>cpu')
17 | reporter.report()
18 | ct.restore()
19 | print('cpu>>>>>>>>>>>>>>>>>>gpu')
20 | reporter.report()
21 |
22 | def test_courtesy_context():
23 | linear = torch.nn.Linear(1024, 1024).cuda()
24 | inp = torch.Tensor(512, 1024).cuda()
25 |
26 | out = linear(inp).mean()
27 | out.backward()
28 |
29 | reporter = MemReporter(linear)
30 | with Courtesy() as ct:
31 | print('gpu>>>>>>>>>>>>>>>>>>cpu')
32 | reporter.report()
33 |
--------------------------------------------------------------------------------
/test/test_line_profiler.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import numpy as np
4 | import pytest
5 | import torch
6 | from pytorch_memlab import (LineProfiler, clear_global_line_profiler, profile,
7 | profile_every, set_target_gpu)
8 |
9 |
10 | def test_display():
11 |
12 | def main():
13 | linear = torch.nn.Linear(100, 100).cuda()
14 | part1()
15 | part2()
16 |
17 | def part1():
18 | lstm = torch.nn.LSTM(1000, 1000).cuda()
19 | subpart11()
20 |
21 | def part2():
22 | linear_2 = torch.nn.Linear(100, 100).cuda()
23 | linear_3 = torch.nn.Linear(100, 100).cuda()
24 |
25 | def subpart11():
26 | linear = torch.nn.Linear(100, 100).cuda()
27 | linear_2 = torch.nn.Linear(100, 100).cuda()
28 | linear_3 = torch.nn.Linear(100, 100).cuda()
29 |
30 | with LineProfiler(subpart11, part2) as prof:
31 | main()
32 |
33 | s = str(prof.display()) # cast from line_records.RecordsDisplay
34 | assert re.search("## .*subpart11", s)
35 | assert "def subpart11():" in s
36 | assert re.search("## .*part2", s)
37 | assert "def part2():" in s
38 |
39 |
40 | def test_line_report():
41 |
42 | def work():
43 | # comment
44 | linear = torch.nn.Linear(100, 100).cuda()
45 | linear_2 = torch.nn.Linear(100, 100).cuda()
46 | linear_3 = torch.nn.Linear(100, 100).cuda()
47 |
48 | def work_3():
49 | lstm = torch.nn.LSTM(1000, 1000).cuda()
50 |
51 | def work_2():
52 | # comment
53 | linear = torch.nn.Linear(100, 100).cuda()
54 | linear_2 = torch.nn.Linear(100, 100).cuda()
55 | linear_3 = torch.nn.Linear(100, 100).cuda()
56 | work_3()
57 |
58 | line_profiler = LineProfiler(work, work_2)
59 | line_profiler.enable()
60 |
61 | work()
62 | work_2()
63 |
64 | line_profiler.disable()
65 | line_profiler.print_stats()
66 |
67 |
68 | def test_line_report_decorator():
69 | clear_global_line_profiler()
70 |
71 | @profile_every(output_interval=3)
72 | def work():
73 | # comment
74 | linear = torch.nn.Linear(100, 100).cuda()
75 | linear_2 = torch.nn.Linear(100, 100).cuda()
76 | linear_3 = torch.nn.Linear(100, 100).cuda()
77 |
78 | @profile_every(output_interval=1)
79 | def work2():
80 | # comment
81 | linear = torch.nn.Linear(100, 100).cuda()
82 | linear_2 = torch.nn.Linear(100, 100).cuda()
83 | linear_3 = torch.nn.Linear(100, 100).cuda()
84 |
85 | work()
86 | work2()
87 | work()
88 | work()
89 |
90 |
91 | def test_line_report_method():
92 | clear_global_line_profiler()
93 |
94 | class Net(torch.nn.Module):
95 | def __init__(self):
96 | super().__init__()
97 | self.linear = torch.nn.Linear(100, 100).cuda()
98 | self.drop = torch.nn.Dropout(0.1)
99 |
100 | @profile_every(1)
101 | def forward(self, inp):
102 | return self.drop(self.linear(inp))
103 |
104 | net = Net()
105 | inp = torch.Tensor(50, 100).cuda()
106 | net(inp)
107 |
108 |
109 | def test_line_report_profile():
110 | clear_global_line_profiler()
111 |
112 | @profile
113 | def work():
114 | # comment
115 | linear = torch.nn.Linear(100, 100).cuda()
116 | linear_2 = torch.nn.Linear(100, 100).cuda()
117 | linear_3 = torch.nn.Linear(100, 100).cuda()
118 |
119 | work()
120 | work()
121 |
122 |
123 | def test_line_report_profile_set_gpu():
124 | clear_global_line_profiler()
125 |
126 | @profile
127 | def work():
128 | # comment
129 | set_target_gpu(1)
130 | linear = torch.nn.Linear(100, 100).cuda(1)
131 | set_target_gpu(0)
132 | linear_2 = torch.nn.Linear(100, 100).cuda(0)
133 | linear_3 = torch.nn.Linear(100, 100).cuda(1)
134 |
135 | work()
136 | work()
137 |
138 |
139 | def test_line_report_profile_interrupt():
140 | clear_global_line_profiler()
141 |
142 | @profile
143 | def work():
144 | # comment
145 | linear = torch.nn.Linear(100, 100).cuda()
146 | linear_2 = torch.nn.Linear(100, 100).cuda()
147 | linear_3 = torch.nn.Linear(100, 100).cuda()
148 |
149 | @profile_every(1)
150 | def work2():
151 | linear_2 = torch.nn.Linear(100, 100).cuda()
152 | linear_3 = torch.nn.Linear(100, 100).cuda()
153 |
154 | work()
155 | work2()
156 | raise KeyboardInterrupt
157 |
--------------------------------------------------------------------------------
/test/test_mem_reporter.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.optim
3 | from pytorch_memlab import MemReporter
4 |
5 | import pytest
6 |
7 |
8 | concentrate_mode = False
9 |
10 | def test_reporter():
11 | linear = torch.nn.Linear(1024, 1024)
12 | inp = torch.Tensor(512, 1024)
13 | reporter = MemReporter(linear)
14 |
15 | out = linear(inp*(inp+3)).mean()
16 | reporter.report()
17 | out.backward()
18 |
19 | reporter.report()
20 |
21 | def test_reporter_without_model():
22 | linear = torch.nn.Linear(1024, 1024)
23 | inp = torch.Tensor(512, 1024)
24 | reporter = MemReporter()
25 |
26 | out = linear(inp*(inp+3)).mean()
27 | reporter.report()
28 | out.backward()
29 |
30 | reporter.report()
31 |
32 | def test_reporter_sparse_tensor():
33 | emb = torch.nn.Embedding(1024, 1024, sparse=True)
34 | inp = torch.arange(0, 128)
35 | reporter = MemReporter()
36 |
37 | out = emb(inp).mean()
38 | reporter.report()
39 | out.backward()
40 | b = emb.weight.grad * 2
41 |
42 | reporter.report()
43 |
44 | @pytest.mark.skipif(concentrate_mode, reason='concentrate')
45 | def test_reporter_tie_weight():
46 | linear = torch.nn.Linear(1024, 1024)
47 | linear_2 = torch.nn.Linear(1024, 1024)
48 | linear_2.weight = linear.weight
49 | container = torch.nn.Sequential(
50 | linear, linear_2
51 | )
52 | reporter = MemReporter(container)
53 | inp = torch.Tensor(512, 1024)
54 |
55 | out = container(inp).mean()
56 | out.backward()
57 |
58 | reporter = MemReporter(container)
59 | reporter.report()
60 |
61 | def test_reporter_with_optimizer():
62 | linear = torch.nn.Linear(1024, 1024)
63 | inp = torch.Tensor(512, 1024)
64 | optimizer = torch.optim.Adam(linear.parameters())
65 | # reporter = MemReporter(linear)
66 |
67 | out = linear(inp*(inp+3)*(inp+2)).mean()
68 | reporter = MemReporter(linear)
69 | reporter.report()
70 | out.backward()
71 | # reporter.report()
72 | optimizer.step()
73 |
74 | reporter.add_optimizer(optimizer)
75 | reporter.report()
76 |
77 |
78 | @pytest.mark.skipif(not torch.cuda.is_available(), reason='no CUDA')
79 | @pytest.mark.skipif(concentrate_mode, reason='concentrate')
80 | def test_reporter_LSTM():
81 | lstm = torch.nn.LSTM(256, 256, num_layers=1).cuda()
82 | # lstm.flatten_parameters()
83 | inp = torch.Tensor(256, 256, 256).cuda()
84 | out, _ = lstm(inp)
85 | out.mean().backward()
86 |
87 | reporter = MemReporter(lstm)
88 | reporter.report()
89 |
90 | @pytest.mark.skipif(not torch.cuda.is_available(), reason='no CUDA')
91 | @pytest.mark.skipif(concentrate_mode, reason='concentrate')
92 | def test_reporter_device():
93 | lstm_cpu = torch.nn.LSTM(256, 256)
94 | lstm = torch.nn.LSTM(256, 256, num_layers=1).cuda()
95 | # lstm.flatten_parameters()
96 | inp = torch.Tensor(256, 256, 256).cuda()
97 | out, _ = lstm(inp)
98 | out.mean().backward()
99 |
100 | reporter = MemReporter(lstm)
101 | reporter.report()
102 | reporter.report(device=torch.device('cuda:0'))
103 |
--------------------------------------------------------------------------------