├── .gitignore ├── CONTRIBUTING.md ├── ICLA.txt ├── LICENSE ├── README.md ├── docs ├── Internals.md └── images │ └── LMS_Flow.png ├── examples ├── LICENSE ├── ManyModel.py ├── README.md └── lmsstats.py └── patches ├── pytorch_v1.1.0_large_model_support.patch ├── pytorch_v1.2.0_large_model_support.patch ├── pytorch_v1.3.0_large_model_support.patch ├── pytorch_v1.3.1_large_model_support.patch ├── pytorch_v1.4.0_large_model_support.patch └── pytorch_v1.5.0_large_model_support.patch /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing guidelines 2 | 3 | ## Contributor License Agreement 4 | 5 | Contributors are required to execute a contributor license agreement before 6 | their contributions can be accepted. The ICLA (individual contributor license 7 | agreement) can be found [here](ICLA.txt). 
8 | -------------------------------------------------------------------------------- /ICLA.txt: -------------------------------------------------------------------------------- 1 | Individual Contributor License Agreement 2 | In order to clarify the intellectual property license 3 | granted with Contributions from any person or entity, the IBM 4 | must have a Contributor License Agreement ("CLA") on file that has 5 | been signed by each Contributor, indicating agreement to the license 6 | terms below. This license is for your protection as a Contributor as 7 | well as the protection of IBM and users the users of this repository; it does not 8 | change your rights to use your own Contributions for any other purpose. 9 | If you have not already done so, please complete and sign, then scan 10 | and email a pdf file of this Agreement to mbrandy@us.ibm.com. 11 | Please read this document carefully before signing and keep a copy 12 | for your records. 13 | Full name: ______________________________________________________ 14 | (optional) Public name: _________________________________________ 15 | Postal Address: ________________________________________________ 16 | ________________________________________________ 17 | Country: ________________________________________________ 18 | Telephone: ______________________________________________________ 19 | E-Mail: ______________________________________________________ 20 | You accept and agree to the following terms and conditions for Your 21 | present and future Contributions submitted to this repository. In 22 | return, IBM shall not use Your Contributions in a way that 23 | is contrary to the public benefit. Except 24 | for the license granted herein to IBM and recipients of 25 | software distributed by IBM, You reserve all right, title, 26 | and interest in and to Your Contributions. 27 | 1. Definitions. 28 | "You" (or "Your") shall mean the copyright owner or legal entity 29 | authorized by the copyright owner that is making this Agreement 30 | with IBM. For legal entities, the entity making a 31 | Contribution and all other entities that control, are controlled 32 | by, or are under common control with that entity are considered to 33 | be a single Contributor. For the purposes of this definition, 34 | "control" means (i) the power, direct or indirect, to cause the 35 | direction or management of such entity, whether by contract or 36 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 37 | outstanding shares, or (iii) beneficial ownership of such entity. 38 | "Contribution" shall mean any original work of authorship, 39 | including any modifications or additions to an existing work, that 40 | is intentionally submitted by You to IBM for inclusion 41 | in, or documentation of, any of the products owned or managed by 42 | IBM (the "Work"). For the purposes of this definition, 43 | "submitted" means any form of electronic, verbal, or written 44 | communication sent to IBM or its representatives, 45 | including but not limited to communication on electronic mailing 46 | lists, source code control systems, and issue tracking systems that 47 | are managed by, or on behalf of, IBM for the purpose of 48 | discussing and improving the Work, but excluding communication that 49 | is conspicuously marked or otherwise designated in writing by You 50 | as "Not a Contribution." 51 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 52 | this Agreement, You hereby grant to IBM and to 53 | recipients of software distributed by IBM a perpetual, 54 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 55 | copyright license to reproduce, prepare derivative works of, 56 | publicly display, publicly perform, sublicense, and distribute Your 57 | Contributions and such derivative works. 58 | 3. Grant of Patent License. Subject to the terms and conditions of 59 | this Agreement, You hereby grant to IBM and to 60 | recipients of software distributed by IBM a perpetual, 61 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 62 | (except as stated in this section) patent license to make, have 63 | made, use, offer to sell, sell, import, and otherwise transfer the 64 | Work, where such license applies only to those patent claims 65 | licensable by You that are necessarily infringed by Your 66 | Contribution(s) alone or by combination of Your Contribution(s) 67 | with the Work to which such Contribution(s) was submitted. If any 68 | entity institutes patent litigation against You or any other entity 69 | (including a cross-claim or counterclaim in a lawsuit) alleging 70 | that your Contribution, or the Work to which you have contributed, 71 | constitutes direct or contributory patent infringement, then any 72 | patent licenses granted to that entity under this Agreement for 73 | that Contribution or Work shall terminate as of the date such 74 | litigation is filed. 75 | 4. You represent that you are legally entitled to grant the above 76 | license. If your employer(s) has rights to intellectual property 77 | that you create that includes your Contributions, you represent 78 | that you have received permission to make Contributions on behalf 79 | of that employer, that your employer has waived such rights for 80 | your Contributions to IBM, or that your employer has 81 | executed a separate Corporate CLA with IBM. 82 | 5. You represent that each of Your Contributions is Your original 83 | creation (see section 7 for submissions on behalf of others). You 84 | represent that Your Contribution submissions include complete 85 | details of any third-party license or other restriction (including, 86 | but not limited to, related patents and trademarks) of which you 87 | are personally aware and which are associated with any part of Your 88 | Contributions. 89 | 6. You are not expected to provide support for Your Contributions, 90 | except to the extent You desire to provide support. You may provide 91 | support for free, for a fee, or not at all. Unless required by 92 | applicable law or agreed to in writing, You provide Your 93 | Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 94 | OF ANY KIND, either express or implied, including, without 95 | limitation, any warranties or conditions of TITLE, NON- 96 | INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 97 | 7. Should You wish to submit work that is not Your original creation, 98 | You may submit it to IBM separately from any 99 | Contribution, identifying the complete details of its source and of 100 | any license or other restriction (including, but not limited to, 101 | related patents, trademarks, and license agreements) of which you 102 | are personally aware, and conspicuously marking the work as 103 | "Submitted on behalf of a third-party: [named here]". 104 | 8. 
You agree to notify IBM of any facts or circumstances of 105 | which you become aware that would make these representations 106 | inaccurate in any respect. 107 | Please sign: __________________________________ Date: ________________ 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #### <Call for Feedback> 2 | 3 | A PyTorch LMS user recently opened an issue asking for community support to integrate LMS into an official version of PyTorch: 4 | 5 | https://github.com/pytorch/pytorch/issues/35633 6 | 7 | This is a good opportunity to gather any and all user testimonials and success stories to document the value of LMS in a public place. 
Please feel free to share your support and any thoughts in that conversation. 8 | 9 | #### </Call for Feedback> 10 | 11 | *** 12 | 13 | # PyTorch Large Model Support 14 | 15 | PyTorch Large Model Support (LMS) is a feature of the PyTorch package provided 16 | by [IBM Watson Machine Learning Community Edition](https://public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda/) (WML CE) that allows the 17 | successful training of deep learning models that would otherwise exhaust GPU 18 | memory and abort with "out-of-memory" errors. LMS manages this 19 | oversubscription of GPU memory by temporarily swapping tensors to host memory 20 | when they are not needed. 21 | 22 | One or more elements of a deep learning model can lead to GPU memory exhaustion. 23 | 24 | These include: 25 | 26 | * Model depth and complexity 27 | * Base data size (for example, high-resolution images) 28 | * Batch size 29 | 30 | Traditionally, the solution to this problem has been to modify the model until 31 | it fits in GPU memory. This approach, however, can negatively impact 32 | accuracy – especially if concessions are made by reducing data 33 | fidelity or model complexity. 34 | 35 | With LMS, deep learning models can scale significantly beyond what was 36 | previously possible and, ultimately, generate more accurate results. 37 | 38 | # Installing PyTorch Large Model Support 39 | 40 | LMS is built into the `pytorch` conda package, so it is installed by 41 | default when you install the GPU-enabled PyTorch from WML CE. 42 | The support is currently available in the [WML CE conda channel](https://public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda/#/). 43 | For more information on this channel, how to add channels, and how to install 44 | frameworks, see [the WML CE install documentation](https://www.ibm.com/support/knowledgecenter/SS5SF7_1.7.0/navigation/wmlce_install.htm). 45 | 46 | 47 | # How to enable LMS 48 | 49 | The LMS functionality is disabled by default in PyTorch and needs to be 50 | enabled before your model creates tensors. Enabling LMS is 51 | as simple as calling the enablement API at the start of your program: 52 | 53 | ```python 54 | import torch 55 | torch.cuda.set_enabled_lms(True) 56 | ``` 57 | 58 | # Examples 59 | The ManyModel.py example, found in the [PyTorch LMS examples](examples/), 60 | uses synthetic random images with multiple models provided by 61 | PyTorch's torchvision to give users a fast hands-on experience with 62 | LMS. The example allows users to change the image size, explore auto-tuning, 63 | and manually set the LMS tunable parameters on various model architectures. 64 | 65 | # Usage tips 66 | 67 | ## Use NUMA pinning for single GPU use 68 | If you are using a single GPU, it is recommended to use NUMA pinning to pin 69 | the process to the CPU and memory that are on the same system socket as the 70 | GPU being used. Pinning the process allows the fastest connection paths between 71 | system memory and GPU memory, which reduces training or inferencing time. 72 | WML CE includes the numactl utility, which can be used to do this pinning. It 73 | can be installed with the `conda install numactl` command.
The following 74 | example shows how to specify a single GPU to be used and how to pin the 75 | process to use the CPU cores and memory that are on the same socket 76 | as the specified GPU: 77 | 78 | ```sh 79 | export CUDA_VISIBLE_DEVICES=0 80 | numactl --cpunodebind=0 --membind=0 python train.py 81 | ``` 82 | 83 | ## Use Horovod when using more than one GPU 84 | It is recommended to use Horovod for distribution when using more than one GPU. 85 | Horovod creates a separate process per GPU and automatically gives each 86 | process socket affinity with its GPU, which allows the fastest 87 | connection paths between system memory and GPU memory and reduces 88 | training or inferencing time. 89 | 90 | # Model memory usage analysis with allocator statistics 91 | LMS adds several statistics to the GPU memory statistics API, such as 92 | the distribution of allocation sources (free list, cudaMalloc, LMS reclaim, etc.), the amount 93 | of memory swapped, and more. For more information on the new statistics 94 | and examples of their usage, see the [PyTorch LMS examples](examples/). 95 | 96 | # Building PyTorch from source with Large Model Support 97 | The [patches](patches/) directory contains git patches for the LMS code. 98 | The file names correspond to tags in the 99 | [PyTorch source](https://github.com/pytorch/pytorch/). To build 100 | PyTorch from source with Large Model Support, check out the 101 | specific PyTorch git tag and then apply the corresponding PyTorch Large 102 | Model Support patch file. 103 | 104 | For example: 105 | ```sh 106 | git clone https://github.com/pytorch/pytorch 107 | cd pytorch 108 | git checkout v1.4.0 109 | git am /pytorch-large-model-support/patches/pytorch_v1.4.0_large_model_support.patch 110 | ``` 111 | 112 | # Contribution guidelines 113 | 114 | If you want to contribute to PyTorch Large Model Support, please read the 115 | [contribution guidelines](CONTRIBUTING.md). 116 | -------------------------------------------------------------------------------- /docs/Internals.md: -------------------------------------------------------------------------------- 1 | ## PyTorch Large Model Support Internals 2 | 3 | The implementation of Large Model Support introduces some basic tensor states. Understanding these states and the actions that trigger state transitions provides a good overview of how LMS works. 4 | 5 | ![LMS phase diagram](/docs/images/LMS_Flow.png) 6 | 7 | A newly constructed tensor starts in the Allocated state.  Tensors may be destroyed in any state other than Active.
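These states are driven from the existing `torch.cuda` controls. As a minimal sketch of the user-facing calls involved (assuming an LMS-enabled build of PyTorch; the limit value below is purely illustrative, not a recommendation):

```python
import torch

# Enable LMS before any tensors are created so that new storage is
# constructed with an LMS-capable storage implementation attached.
torch.cuda.set_enabled_lms(True)

# Optionally cap GPU allocations (in bytes); reclaim activity only occurs
# once this limit has been reached and the free list cannot satisfy a request.
torch.cuda.set_limit_lms(8 * 1024 * 1024 * 1024)  # illustrative 8 GiB cap

print(torch.cuda.get_enabled_lms(), torch.cuda.get_limit_lms())
```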
8 | Any transition to/from the Reclaimed state (crossing the dotted line in the figure above) requires a transfer of the tensor's data between GPU and host memory. 9 | 10 | Actions/Triggers: 11 | * _Pin_: Performed on all input tensors to a given operation (e.g. network layer) prior to data access and computation. 12 | * _Unpin_: Reverse of the pin operation.  Performed after operation completes. 13 | * _Reclaim_: Performed by the CUDA Caching Allocator as needed to satisfy new allocation requests.  This is done only when the free list (cache) contains no suitable allocations and the allocation limit has been met.  The operation is performed on a minimal subset of inactive tensors in order to satisfy the allocation request. 14 | * _Access_: This represents a request to access the data of an unpinned tensor. This is rare. 15 | 16 | ## Implementation Details 17 | 18 | CUDA Caching Allocator (`c10::cuda::CudaCachingAllocator` et al.) 19 | * Add per-device allocator object (`DeviceCachingAllocator`) to reduce lock contention and `BlockPool` management costs. The new device-specific allocator manages BlockPools (free blocks) and the set of currently inactive tensors (`reclaim_list`) for each GPU. 20 | * Add management of LMS settings (enabled, allocation limit) to `THCCachingAllocator` (`lms_settings`). 21 | * Provide CUDA-specific implementation of `LmsStorageImpl` (`CudaLmsStorageImpl`). This defines the low level Tensor operations required for LMS (page-out, page-in, reclaim-list-add/remove). 22 | * Provide CUDA-specific implementation of `Allocator::AsLmsStorage()` (`CudaCachingAllocator::AsLmsStorage()`). When LMS is enabled, this supplies a new `CudaLmsStorageImpl` instance during `StorageImpl` construction -- effectively enabling LMS for any associated Tensor. 23 | * Add ability to reclaim GPU memory from suitable inactive tensors in order to satisfy new allocation requests (`reclaim_block`). 24 | * Add speculative page-out mechanism. This predicts which tensors will be reclaimed and triggers early page-out (concurrent with the compute stream) to reduce the swapping latency (`predict_reclaim()`, `record_reclaim()`) 25 | * Add new statistics (pinned, reclaimed, allocation distribution). 26 | 27 | Allocator (`c10::Allocator`) 28 | * Add `AsLmsStorage()` virtual function with default implementation that simply returns `nullptr`. LMS is not enabled/supported by default. Subclasses must explicitly implement and advertise support. 29 | 30 | LmsStorageImpl (`c10::LmsStorageImpl`) 31 | * This new abstract class represents an LMS implementation. 32 | * It defines operations required for LMS (pin, unpin, page-out, page-in, reclaim-list-add/remove) -- providing common logic (applicable across different implementations) and calling out to the low-level methods implemented in the allocator's derived class otherwise. 33 | 34 | StorageImpl (`c10::StorageImpl`) 35 | * Add member, `std::unique_ptr lms_`. This provides access to the underlying LMS implementation (if any) specified by the allocator during construction. 36 | * Add high level entry points for operations required for LMS (pin, unpin, data-access). These are simply pass-throughs to the underlying LMS object. 37 | 38 | IntrusiveList and IntrusiveListHook (`c10::IntrusiveList`, `c10::IntrusiveListHook`) 39 | * These new classes are used to manage the set of inactive tensors. 
40 | * Element objects embed the `IntrustiveListHook`, which provides the following properties: 41 | * Insertion and removal operations are O(1) and require no memory allocation or deletion. 42 | * Element destruction is valid and can be performed safely regardless of list membership. 43 | 44 | TensorGuard (`at::TensorGuard`) 45 | * This new class ensures that a tensor's storage is pinned during an operation in which its data may be accessed. 46 | * This is analogous to the existing `DeviceGuard`. Like `DeviceGuard`, these objects are instantiated in the operation-specific generated code (see `function_wrapper.py`) and leverage C++ scoping to pin/unpin the storage corresponding to the set of tensors involved in the given operation. 47 | 48 | PyTorch Python API (`torch.cuda`) 49 | * Add LMS control and tuning services (enable, allocation limit). 50 | * Add LMS statistics to cuda `memory_stats` API (pinned, reclaimed, allocation distribution). 51 | 52 | Unit Tests (`test_cuda.py`) 53 | * Add `test_large_model_support`. 54 | -------------------------------------------------------------------------------- /docs/images/LMS_Flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IBM/pytorch-large-model-support/cf971a3ddbfe5643556d88be4d4723370d88c3fd/docs/images/LMS_Flow.png -------------------------------------------------------------------------------- /examples/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /examples/ManyModel.py: -------------------------------------------------------------------------------- 1 | # This example is based on: 2 | # https://github.com/pytorch/examples/blob/ee964a2/imagenet/main.py 3 | # 4 | # It provides a convenient way to test out the capabilities of Large 5 | # Model Support (LMS). It uses a variety of models from torchvision to 6 | # demonstrate LMS on model configurations that cannot fit in GPU 7 | # memory when using larger resolution data. 8 | # 9 | # To simplify the running of the model with different higher 10 | # resolution images, the code uses torchvision.datasets.FakeData to 11 | # generate synthetic image data. 12 | # 13 | # Command line parameters allow the user to test different models, 14 | # change the size of the input image data and enable or disable LMS. 15 | # 16 | # Invocation examples: 17 | # Run with LMS: 18 | # python ManyModel.py -a resnet152 --image-size 4000 19 | # Run without LMS: 20 | # python ManyModel.py -a resnet152 --image-size 2000 --no-lms 21 | # 22 | # To observe the behavior of LMS across a series of resolutions -- 23 | # logging the allocator stats to a file '-summary.csv' -- create 24 | # a bash script with the following commands and run it (passing the 25 | # model architecture of your choice): 26 | # 27 | # model=$1 28 | # let res=500 29 | # while : ; do 30 | # echo $res 31 | # python ManyModel.py -a $model --image-size $res --log summary 32 | # [[ $? -eq 0 ]] || break 33 | # ((res = res + 500)) 34 | # done 35 | # 36 | 37 | import argparse 38 | import os 39 | import random 40 | import shutil 41 | import sys 42 | import time 43 | import warnings 44 | 45 | import torch 46 | import torch.nn as nn 47 | import torch.nn.parallel 48 | import torch.backends.cudnn as cudnn 49 | import torch.distributed as dist 50 | import torch.optim 51 | import torch.multiprocessing as mp 52 | import torch.utils.data 53 | import torch.utils.data.distributed 54 | import torchvision.transforms as transforms 55 | import torchvision.datasets as datasets 56 | import torchvision.models as models 57 | 58 | from lmsstats import LMSStatsLogger, LMSStatsSummary 59 | 60 | model_names = sorted(name for name in models.__dict__ 61 | if name.islower() and not name.startswith("__") 62 | and callable(models.__dict__[name])) 63 | 64 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 65 | parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', 66 | choices=model_names, 67 | help='model architecture: ' + 68 | ' | '.join(model_names) + 69 | ' (default: resnet18)') 70 | parser.add_argument('--image-size', default=500, type=int, metavar='N') 71 | parser.add_argument('--log', default='steps', type=str) 72 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 73 | help='number of data loading workers (default: 4)') 74 | parser.add_argument('--epochs', default=1, type=int, metavar='N', 75 | help='number of total epochs to run') 76 | parser.add_argument('-b', '--batch-size', default=1, type=int, 77 | metavar='N', 78 | help='mini-batch size (default: 1), this is the total ' 79 | 'batch size of all GPUs on the current node when ' 80 | 'using Data Parallel or Distributed Data Parallel') 81 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 82 | metavar='LR', help='initial learning rate', dest='lr') 83 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 84 | help='momentum') 85 | 
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 86 | metavar='W', help='weight decay (default: 1e-4)', 87 | dest='weight_decay') 88 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 89 | help='use pre-trained model') 90 | parser.add_argument('--world-size', default=-1, type=int, 91 | help='number of nodes for distributed training') 92 | parser.add_argument('--rank', default=-1, type=int, 93 | help='node rank for distributed training') 94 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 95 | help='url used to set up distributed training') 96 | parser.add_argument('--dist-backend', default='nccl', type=str, 97 | help='distributed backend') 98 | parser.add_argument('--seed', default=None, type=int, 99 | help='seed for initializing training. ') 100 | parser.add_argument('--gpu', default=None, type=int, 101 | help='GPU id to use.') 102 | parser.add_argument('--multiprocessing-distributed', action='store_true', 103 | help='Use multi-processing distributed training to launch ' 104 | 'N processes per node, which has N GPUs. This is the ' 105 | 'fastest way to use PyTorch for either single node or ' 106 | 'multi node data parallel training') 107 | parser.add_argument('--lms', dest='userEnabledLMS', action='store_true') 108 | parser.add_argument('--no-lms', dest='userEnabledLMS', action='store_false') 109 | parser.set_defaults(userEnabledLMS=True) 110 | parser.add_argument('--lms-limit', default=0, type=int, help='limit (in MB)') 111 | 112 | def main(): 113 | args = parser.parse_args() 114 | 115 | if not torch.cuda.is_available(): 116 | print("Error: CUDA is not available. This example requires CUDA to run. Make sure you are not using a CPU-only version of PyTorch or try another example.") 117 | sys.exit(1) 118 | 119 | if args.seed is not None: 120 | random.seed(args.seed) 121 | torch.manual_seed(args.seed) 122 | cudnn.deterministic = True 123 | warnings.warn('You have chosen to seed training. ' 124 | 'This will turn on the CUDNN deterministic setting, ' 125 | 'which can slow down your training considerably!') 126 | 127 | if args.gpu is not None: 128 | warnings.warn('You have chosen a specific GPU. 
This will completely ' 129 | 'disable data parallelism.') 130 | 131 | if args.dist_url == "env://" and args.world_size == -1: 132 | args.world_size = int(os.environ["WORLD_SIZE"]) 133 | 134 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 135 | 136 | ngpus_per_node = torch.cuda.device_count() 137 | if args.multiprocessing_distributed: 138 | # Since we have ngpus_per_node processes per node, the total world_size 139 | # needs to be adjusted accordingly 140 | args.world_size = ngpus_per_node * args.world_size 141 | # Use torch.multiprocessing.spawn to launch distributed processes: the 142 | # main_worker process function 143 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 144 | else: 145 | # Simply call main_worker function 146 | main_worker(args.gpu, ngpus_per_node, args) 147 | 148 | 149 | def main_worker(gpu, ngpus_per_node, args): 150 | args.gpu = gpu 151 | 152 | if args.gpu is not None: 153 | print("Use GPU: {} for training".format(args.gpu)) 154 | 155 | if args.distributed: 156 | if args.dist_url == "env://" and args.rank == -1: 157 | args.rank = int(os.environ["RANK"]) 158 | if args.multiprocessing_distributed: 159 | # For multiprocessing distributed training, rank needs to be the 160 | # global rank among all the processes 161 | args.rank = args.rank * ngpus_per_node + gpu 162 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 163 | world_size=args.world_size, rank=args.rank) 164 | 165 | if args.userEnabledLMS: 166 | torch.cuda.set_enabled_lms(True) 167 | if (torch.cuda.get_enabled_lms()): 168 | if (args.lms_limit > 0): 169 | torch.cuda.set_limit_lms(args.lms_limit * 1024 * 1024) 170 | print('[LMS=On limit=' + str(torch.cuda.get_limit_lms()) + ']') 171 | 172 | # create model 173 | if args.pretrained: 174 | print("=> using pre-trained model '{}'".format(args.arch)) 175 | model = models.__dict__[args.arch](pretrained=True) 176 | else: 177 | print("=> creating model '{}'".format(args.arch)) 178 | model = models.__dict__[args.arch]() 179 | 180 | if args.distributed: 181 | # For multiprocessing distributed, DistributedDataParallel constructor 182 | # should always set the single device scope, otherwise, 183 | # DistributedDataParallel will use all available devices. 
184 | if args.gpu is not None: 185 | torch.cuda.set_device(args.gpu) 186 | model.cuda(args.gpu) 187 | # When using a single GPU per process and per 188 | # DistributedDataParallel, we need to divide the batch size 189 | # ourselves based on the total number of GPUs we have 190 | args.batch_size = int(args.batch_size / ngpus_per_node) 191 | args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) 192 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 193 | else: 194 | model.cuda() 195 | # DistributedDataParallel will divide and allocate batch_size to all 196 | # available GPUs if device_ids are not set 197 | model = torch.nn.parallel.DistributedDataParallel(model) 198 | elif args.gpu is not None: 199 | torch.cuda.set_device(args.gpu) 200 | model = model.cuda(args.gpu) 201 | else: 202 | # DataParallel will divide and allocate batch_size to all available GPUs 203 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 204 | model.features = torch.nn.DataParallel(model.features) 205 | model.cuda() 206 | else: 207 | model = torch.nn.DataParallel(model).cuda() 208 | 209 | # define loss function (criterion) and optimizer 210 | criterion = nn.CrossEntropyLoss().cuda(args.gpu) 211 | 212 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 213 | momentum=args.momentum, 214 | weight_decay=args.weight_decay) 215 | 216 | cudnn.benchmark = True 217 | 218 | # Data loading code 219 | image_shape = (3, args.image_size, args.image_size) 220 | num_classes = 15 221 | transform = transforms.ToTensor() 222 | 223 | train_dataset = datasets.FakeData(size=10, num_classes=num_classes, image_size=image_shape, 224 | transform=transform) 225 | val_dataset = datasets.FakeData(size=2, num_classes=num_classes, image_size=image_shape, 226 | transform=transform) 227 | 228 | if args.distributed: 229 | train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) 230 | else: 231 | train_sampler = None 232 | 233 | train_loader = torch.utils.data.DataLoader( 234 | train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), 235 | num_workers=args.workers, pin_memory=True, sampler=train_sampler) 236 | 237 | val_loader = torch.utils.data.DataLoader( 238 | val_dataset, batch_size=args.batch_size, shuffle=False, 239 | num_workers=args.workers, pin_memory=True) 240 | 241 | logger = get_logger(args) 242 | 243 | for epoch in range(0, args.epochs): 244 | logger.epoch_begin(epoch) 245 | if args.distributed: 246 | train_sampler.set_epoch(epoch) 247 | adjust_learning_rate(optimizer, epoch, args) 248 | 249 | # train for one epoch 250 | train(train_loader, model, criterion, optimizer, epoch, logger, args) 251 | 252 | # evaluate on validation set 253 | validate(val_loader, model, criterion, logger, args) 254 | 255 | logger.train_end() 256 | 257 | def train(train_loader, model, criterion, optimizer, epoch, logger, args): 258 | # switch to train mode 259 | model.train() 260 | 261 | logger.train_batch_begin(0) 262 | for i, (images, target) in enumerate(train_loader): 263 | if args.gpu is not None: 264 | images = images.cuda(args.gpu, non_blocking=True) 265 | target = target.cuda(args.gpu, non_blocking=True) 266 | 267 | # compute output 268 | output = model(images) 269 | loss = criterion(output, target) 270 | 271 | # measure accuracy and record loss 272 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 273 | 274 | # compute gradient and do SGD step 275 | optimizer.zero_grad() 276 | loss.backward() 277 | optimizer.step() 278 | 279 | 
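        # Record the LMS allocator statistics collected for the batch that just
        # completed and begin tracking the next batch (see lmsstats.py).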
logger.train_batch_end(i) 280 | logger.train_batch_begin(i+1) 281 | 282 | 283 | def validate(val_loader, model, criterion, logger, args): 284 | # switch to evaluate mode 285 | model.eval() 286 | 287 | with torch.no_grad(): 288 | logger.validation_batch_begin(0) 289 | for i, (images, target) in enumerate(val_loader): 290 | if args.gpu is not None: 291 | images = images.cuda(args.gpu, non_blocking=True) 292 | target = target.cuda(args.gpu, non_blocking=True) 293 | 294 | # compute output 295 | output = model(images) 296 | loss = criterion(output, target) 297 | 298 | # measure accuracy and record loss 299 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 300 | 301 | logger.validation_batch_end(i) 302 | logger.validation_batch_begin(i+1) 303 | 304 | 305 | def adjust_learning_rate(optimizer, epoch, args): 306 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 307 | lr = args.lr * (0.1 ** (epoch // 30)) 308 | for param_group in optimizer.param_groups: 309 | param_group['lr'] = lr 310 | 311 | 312 | def accuracy(output, target, topk=(1,)): 313 | """Computes the accuracy over the k top predictions for the specified values of k""" 314 | with torch.no_grad(): 315 | maxk = max(topk) 316 | batch_size = target.size(0) 317 | 318 | _, pred = output.topk(maxk, 1, True, True) 319 | pred = pred.t() 320 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 321 | 322 | res = [] 323 | for k in topk: 324 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 325 | res.append(correct_k.mul_(100.0 / batch_size)) 326 | return res 327 | 328 | 329 | def get_logger(args): 330 | if args.log == 'summary': 331 | return LMSStatsSummary('{}-summary.csv'.format(args.arch), 332 | input_shape=(args.image_size, args.image_size), 333 | gpu_id=args.gpu) 334 | else: 335 | return LMSStatsLogger('{}-{}.csv'.format(args.arch, args.image_size), gpu_id=args.gpu) 336 | 337 | 338 | if __name__ == '__main__': 339 | main() 340 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Large Model Support Examples 2 | 3 | This directory contains examples for using the PyTorch 4 | Large Model Support (LMS). 5 | 6 | ## Adjustable image resolution ResNet, DenseNet, and other models 7 | 8 | The [ManyModel.py](ManyModel.py) file (based on PyTorch's 9 | [imagenet example](https://github.com/pytorch/examples/blob/ee964a2/imagenet/main.py)) 10 | uses the various models from `torchvision` to demonstrate PyTorch 11 | Large Model Support in models that cannot fit in GPU memory when using 12 | larger resolution data. It provides a convenient way to test out the 13 | capabilities of LMS with various model architectures (ResNet, 14 | DenseNet, Inception, MobileNet, NASNet, etc.). Command line parameters 15 | allow the user to change the size of the input image data, enable or 16 | disable LMS, and log memory allocator statistics. 17 | 18 | The ManyModel.py example can be run by adding the `examples` directory to 19 | the PYTHONPATH and running like as shown: 20 | 21 | ```bash 22 | cd examples 23 | export PYTHONPATH=`pwd` 24 | python ManyModel.py -h 25 | ``` 26 | 27 | ## Memory Allocator statistics 28 | PyTorch provides APIs to retrieve statistics from 29 | the GPU memory allocator. These statistics provide a means to 30 | do deeper analysis of a model's memory usage, including how often LMS 31 | reclaims memory and how many bytes of memory are being reclaimed. 
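As a minimal sketch of reading those counters directly (assuming an LMS-enabled WML CE build of PyTorch; the key names below are the ones consumed by `lmsstats.py`):

```python
import torch

torch.cuda.set_enabled_lms(True)

# ... build a model and run a few training iterations here ...

stats = torch.cuda.memory_stats(0)  # allocator statistics for GPU 0
for key in ('reclaimed', 'reclaimed_bytes',
            'alloc_distribution.freelist',
            'alloc_distribution.cudamalloc',
            'alloc_distribution.cudamalloc_retry'):
    print(key, stats.get(key, 0))
```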
32 | 33 | The [statistics module](lmsstats.py) provides a working example of how the APIs 34 | can be used in used to log per-iteration and aggregate memory statistics. The 35 | `LMSStatsLogger` and `LMSStatsSummary` classes in this module are used by the ManyModel 36 | example to demonstrate how the statistics APIs can be used in model training. 37 | -------------------------------------------------------------------------------- /examples/lmsstats.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import ctypes 3 | import os 4 | import time 5 | import torch 6 | import statistics 7 | 8 | 9 | STAT_KEYS = [('reclaimed_blocks', 'reclaimed'), 10 | ('reclaimed_bytes', 'reclaimed_bytes'), 11 | ('alloc_freelist', 'alloc_distribution.freelist'), 12 | ('alloc_cudamalloc', 'alloc_distribution.cudamalloc'), 13 | ('alloc_reclaim_one', 'alloc_distribution.reclaim_one'), 14 | ('alloc_reclaim_fragments', 'alloc_distribution.reclaim_fragments'), 15 | ('alloc_reclaim_all', 'alloc_distribution.reclaim_all'), 16 | ('alloc_cudamalloc_retry', 'alloc_distribution.cudamalloc_retry')] 17 | 18 | ALL_KEYS = ['duration'] + [k[0] for k in STAT_KEYS] 19 | 20 | class LMSStats(): 21 | def __init__(self, gpu_id=0): 22 | self._gpu_id = gpu_id 23 | self._start_stats = {k:0 for k in ALL_KEYS} 24 | self._end_stats = self._start_stats.copy() 25 | self._delta = self._start_stats.copy() 26 | self._cumulative_stats = self._start_stats.copy() 27 | self._num_steps = 0 28 | self._step_times = [] 29 | 30 | def _get_stats(self): 31 | s = torch.cuda.memory_stats(self._gpu_id) 32 | stats = {k[0]:s[k[1]] for k in STAT_KEYS} 33 | stats['duration'] = time.time() 34 | return stats 35 | 36 | def step_begin(self): 37 | self._start_stats = self._get_stats() 38 | 39 | def step_end(self): 40 | self._num_steps += 1 41 | self._end_stats = self._get_stats() 42 | self._delta = {k: self._end_stats[k] - self._start_stats[k] for k in ALL_KEYS} 43 | for k in ALL_KEYS: 44 | self._cumulative_stats[k] += self._delta[k] 45 | self._cumulative_stats['num_steps'] = self._num_steps 46 | self._step_times.append(self._delta['duration']) 47 | 48 | def get_last_step_delta(self): 49 | return self._delta.copy() 50 | 51 | def get_average_stats(self): 52 | if self._num_steps: 53 | s = self._num_steps * 1.0 54 | average = {k: self._cumulative_stats[k] / s for k in ALL_KEYS} 55 | else: 56 | average = {k: 0 for k in ALL_KEYS} 57 | average['num_steps'] = self._num_steps 58 | return average 59 | 60 | def get_median_time(self): 61 | if not self._step_times: 62 | return 0 63 | return statistics.median(self._step_times) 64 | 65 | 66 | class LMSStatsLogger(): 67 | def __init__(self, logfile, gpu_id=0): 68 | self._epoch = 0 69 | self._logfile = logfile 70 | self._lms_stats = LMSStats(gpu_id=gpu_id) 71 | self._write_header() 72 | 73 | def epoch_begin(self, epoch): 74 | self._epoch = epoch 75 | 76 | def train_batch_begin(self, batch): 77 | self._lms_stats.step_begin() 78 | 79 | def train_batch_end(self, batch): 80 | self._lms_stats.step_end() 81 | self._write_step_stats('t', batch) 82 | 83 | def train_end(self): 84 | pass 85 | 86 | def validation_batch_begin(self, batch): 87 | self._lms_stats.step_begin() 88 | 89 | def validation_batch_end(self, batch): 90 | self._lms_stats.step_end() 91 | self._write_step_stats('v', batch) 92 | 93 | def _write_header(self): 94 | header = ['step type', 'epoch', 'step'] + ALL_KEYS 95 | with open(self._logfile, 'w', newline='') as csvfile: 96 | statswriter = csv.writer(csvfile) 97 | 
statswriter.writerow(header) 98 | 99 | def _write_step_stats(self, step_type, step_num): 100 | delta = self._lms_stats.get_last_step_delta() 101 | row = [step_type, self._epoch, step_num] + [delta[k] for k in ALL_KEYS] 102 | with open(self._logfile, 'a+', newline='') as csvfile: 103 | statswriter = csv.writer(csvfile) 104 | statswriter.writerow(row) 105 | 106 | 107 | class LMSStatsSummary(): 108 | def __init__(self, logfile, input_shape, gpu_id=0, 109 | batch_size=1, start_epoch=0, start_batch=2): 110 | self._epoch = 0 111 | self._logfile = logfile 112 | self._lms_stats = LMSStats(gpu_id=gpu_id) 113 | self._input_shape = input_shape 114 | self._start_epoch = start_epoch 115 | self._start_batch = start_batch 116 | self._batch_size = batch_size 117 | 118 | def _should_record(self, batch): 119 | if (batch >= self._start_batch) and (self._epoch >= self._start_epoch): 120 | return True 121 | return False 122 | 123 | def epoch_begin(self, epoch): 124 | self._epoch = epoch 125 | 126 | def train_batch_begin(self, batch): 127 | if not self._should_record(batch): 128 | return 129 | self._lms_stats.step_begin() 130 | 131 | def train_batch_end(self, batch): 132 | if not self._should_record(batch): 133 | return 134 | self._lms_stats.step_end() 135 | 136 | def train_end(self): 137 | stats_dict = self._lms_stats.get_average_stats() 138 | 139 | input_size_field = 'image_size' 140 | stats_dict[input_size_field] = self._input_shape[0] 141 | 142 | input_size = self._batch_size 143 | for dim in self._input_shape: 144 | input_size *= dim 145 | input_size /= 1000000.0 146 | 147 | rate_field = 'megapixels/sec' if len(self._input_shape) == 2 else 'megavoxels/sec' 148 | duration = stats_dict['duration'] 149 | stats_dict[rate_field] = input_size / duration if duration != 0 else 0 150 | 151 | median_rate_field = 'median ' + rate_field 152 | duration = self._lms_stats.get_median_time() 153 | stats_dict[median_rate_field] = input_size / duration if duration != 0 else 0 154 | 155 | reclaimed_field = 'reclaimed_bytes' 156 | 157 | # Put these columns first 158 | fieldnames = [input_size_field, rate_field, median_rate_field, reclaimed_field] 159 | dictkeys = list(stats_dict) 160 | for k in fieldnames: 161 | dictkeys.remove(k) 162 | fieldnames.extend(dictkeys) 163 | 164 | write_header = not os.path.exists(self._logfile) 165 | with open(self._logfile, 'a+', newline='') as csvfile: 166 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 167 | if write_header: 168 | writer.writeheader() 169 | writer.writerow(stats_dict) 170 | 171 | def validation_batch_begin(self, batch): 172 | pass 173 | 174 | def validation_batch_end(self, batch): 175 | pass 176 | -------------------------------------------------------------------------------- /patches/pytorch_v1.1.0_large_model_support.patch: -------------------------------------------------------------------------------- 1 | From 5cf3a29fd47652ab8e1c650e17f2a7a34dbdff01 Mon Sep 17 00:00:00 2001 2 | From: "Matthew T. 
Brandyberry" 3 | Date: Tue, 7 May 2019 15:58:23 -0500 4 | Subject: [PATCH] PyTorch Large Model Support for PyTorch 1.1.0 5 | 6 | This commit delivers PyTorch Large Model Support 7 | for PyTorch at version 1.1.0 8 | 9 | See: https://github.com/IBM/pytorch-large-model-support 10 | --- 11 | aten/src/ATen/TensorGuard.h | 61 ++ 12 | aten/src/ATen/function_wrapper.py | 35 +- 13 | aten/src/ATen/native/cudnn/RNN.cpp | 4 + 14 | aten/src/ATen/templates/SparseTypeDerived.cpp | 1 + 15 | aten/src/ATen/templates/TypeDefault.cpp | 1 + 16 | aten/src/ATen/templates/TypeDerived.cpp | 1 + 17 | aten/src/TH/generic/THStorage.cpp | 4 +- 18 | aten/src/TH/generic/THStorage.h | 4 +- 19 | aten/src/THC/THCGeneral.cpp | 2 + 20 | aten/src/THC/THCStorage.cpp | 14 + 21 | aten/src/THC/THCStorage.hpp | 2 + 22 | aten/src/THC/generic/THCStorage.cpp | 4 +- 23 | aten/src/THC/generic/THCStorage.h | 4 +- 24 | c10/core/Allocator.h | 5 + 25 | c10/core/LargeModelSupport.cpp | 9 + 26 | c10/core/LargeModelSupport.h | 191 +++++ 27 | c10/core/Storage.h | 6 +- 28 | c10/core/StorageImpl.cpp | 8 + 29 | c10/core/StorageImpl.h | 74 +- 30 | c10/cuda/CUDACachingAllocator.cpp | 784 +++++++++++++----- 31 | c10/cuda/CUDACachingAllocator.h | 28 + 32 | c10/cuda/CUDAStream.cpp | 50 ++ 33 | c10/cuda/CUDAStream.h | 5 + 34 | c10/util/IntrusiveList.h | 64 ++ 35 | test/test_cuda.py | 116 +++ 36 | torch/csrc/cuda/Module.cpp | 159 ++++ 37 | torch/csrc/generic/serialization.cpp | 2 +- 38 | torch/cuda/__init__.py | 175 ++++ 39 | 28 files changed, 1590 insertions(+), 223 deletions(-) 40 | create mode 100644 aten/src/ATen/TensorGuard.h 41 | create mode 100644 c10/core/LargeModelSupport.cpp 42 | create mode 100644 c10/core/LargeModelSupport.h 43 | create mode 100644 c10/util/IntrusiveList.h 44 | 45 | diff --git a/aten/src/ATen/TensorGuard.h b/aten/src/ATen/TensorGuard.h 46 | new file mode 100644 47 | index 0000000000..60b31c82f3 48 | --- /dev/null 49 | +++ b/aten/src/ATen/TensorGuard.h 50 | @@ -0,0 +1,61 @@ 51 | +#pragma once 52 | + 53 | +#include 54 | +#include 55 | +#include 56 | + 57 | +#include 58 | +#include 59 | + 60 | +namespace at { 61 | + 62 | +struct TensorGuard { 63 | + TensorGuard() = default; 64 | + 65 | + explicit TensorGuard(const Tensor& tensor) { 66 | + if (tensor.has_storage()) { 67 | + StorageImpl* storage = tensor.storage().unsafeGetStorageImpl(); 68 | + if (storage->lms_enabled()) { 69 | + storage->lms_pin(); 70 | + storage_ = storage; 71 | + } 72 | + } 73 | + } 74 | + 75 | + ~TensorGuard() { 76 | + if (storage_ != nullptr) 77 | + storage_->lms_unpin(); 78 | + } 79 | + 80 | + private: 81 | + StorageImpl* storage_ = nullptr; 82 | +}; 83 | + 84 | +struct TensorListGuard { 85 | + TensorListGuard() = default; 86 | + 87 | + explicit TensorListGuard(const TensorList& tensors) { 88 | + int len = tensors.size(); 89 | + for (int i = 0; i < len; i++) { 90 | + const Tensor &tensor = tensors[i]; 91 | + if (tensor.has_storage()) { 92 | + StorageImpl* storage = tensor.storage().unsafeGetStorageImpl(); 93 | + if (storage->lms_enabled()) { 94 | + storage->lms_pin(); 95 | + storage_.push_back(storage); 96 | + } 97 | + } 98 | + } 99 | + } 100 | + 101 | + ~TensorListGuard() { 102 | + for (auto storage : storage_) { 103 | + storage->lms_unpin(); 104 | + } 105 | + } 106 | + 107 | + private: 108 | + std::vector storage_; 109 | +}; 110 | + 111 | +} // namespace at 112 | diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py 113 | index 81d40813da..003f261ded 100644 114 | --- a/aten/src/ATen/function_wrapper.py 115 | +++ 
b/aten/src/ATen/function_wrapper.py 116 | @@ -44,7 +44,7 @@ ${return_type} ${api_name}(${type_method_formals}) const override; 117 | # 2. broadcasting functions are implemented in Type.cpp 118 | TYPE_METHOD_DEFINITION_BROADCAST = CodeTemplate("""\ 119 | ${return_type} TypeDefault::${api_name}(${type_method_formals}) const { 120 | - ${device_guard_declaration} 121 | + ${device_guard_declarations} 122 | Tensor ${broadcast_returns}; 123 | std::tie(${broadcast_returns}) = ${broadcast_function}(${broadcast_actuals}, "${api_name}"); 124 | return ${method_prefix_derived}${api_name}(${broadcast_modified_actuals}); 125 | @@ -83,7 +83,7 @@ ${return_type} ${api_name}(${type_method_formals}) const override; 126 | """) 127 | TYPE_METHOD_DEFINITION_CONCRETE = CodeTemplate("""\ 128 | ${return_type} TypeDefault::${api_name}(${type_method_formals}) const { 129 | - ${device_guard_declaration} 130 | + ${device_guard_declarations} 131 | ${type_definition_body} 132 | } 133 | """) 134 | @@ -94,7 +94,7 @@ ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const 135 | # 5. add override definition to TypeDerived.cpp 136 | TYPE_DERIVED_DEFINITION = CodeTemplate("""\ 137 | ${return_type} ${Type}::${method_prefix_derived}${api_name}(${type_method_formals}) const { 138 | - ${device_guard_declaration} 139 | + ${device_guard_declarations} 140 | ${type_definition_body} 141 | } 142 | """) 143 | @@ -114,7 +114,7 @@ case ScalarType::${ScalarName}: { 144 | """) 145 | TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ 146 | ${return_type} ${Type}::${api_name}(${type_method_formals}) const { 147 | - ${device_guard_declaration} 148 | + ${device_guard_declarations} 149 | ${dispatch_scalar_type_declaration} 150 | switch (dispatch_scalar_type) { 151 | ${cases} 152 | @@ -518,7 +518,7 @@ FunctionOption = TypedDict('FunctionOption', { 153 | 'condition': str, 154 | 'const_mark': str, 155 | 'device_guard': bool, 156 | - 'device_guard_declaration': str, 157 | + 'device_guard_declarations': List[str], 158 | 'dispatch_scalar_type_declaration': str, 159 | 'with_gil': bool, 160 | 'cpu_half': bool, 161 | @@ -591,14 +591,18 @@ OutputDeclaration = NamedTuple('OutputDeclaration', [ 162 | ]) 163 | 164 | 165 | -def device_guard(option, dispatch_options, dispatch_tensor): 166 | +def device_guards(option, dispatch_options, dispatch_tensor, formals): 167 | # For factory methods the `DeviceGuard` is already in the template. 
168 | if option.get('device_guard', True): 169 | + code = [] 170 | if dispatch_options: 171 | - return 'const DeviceGuard device_guard({}.device());'.format(dispatch_options['name']) 172 | - if dispatch_tensor: 173 | - return 'const OptionalDeviceGuard device_guard(device_of({}));'.format(dispatch_tensor) 174 | - return '// DeviceGuard omitted' 175 | + code.append('const DeviceGuard device_guard({}.device());'.format(dispatch_options['name'])) 176 | + elif dispatch_tensor: 177 | + code.append('const OptionalDeviceGuard device_guard(device_of({}));'.format(dispatch_tensor)) 178 | + for arg in [f for f in formals if f['dynamic_type'] in {'Tensor', 'TensorList'}]: 179 | + code.append('const {0}Guard {1}_tensor_guard({1});'.format(arg['dynamic_type'], arg['name'])) 180 | + return code 181 | + return ['// DeviceGuard omitted'] 182 | 183 | 184 | def dispatch_scalar_type(option, dispatch_options, dispatch_tensor): 185 | @@ -846,6 +850,7 @@ def create_generic(top_env, declarations): 186 | # arguments list between output and input arguments 187 | for buffer in option['buffers']: 188 | body.append('Tensor {} = at::empty({{0}}, this->options());'.format(buffer['name'])) 189 | + body.append('const TensorGuard {0}_tensor_guard({0});'.format(buffer['name'])) 190 | actuals = [arg['name'] for arg in option['arguments'] if arg.get('output')] 191 | actuals += [buffer['name'] for buffer in option['buffers']] 192 | actuals += [arg['name'] for arg in option['arguments'] if not arg.get('output')] 193 | @@ -891,7 +896,7 @@ def create_generic(top_env, declarations): 194 | option['method_prefix_derived'] = '' if broadcast_arg is None else 's_' 195 | if option['mode'] == 'TH': 196 | option['device_guard'] = False 197 | - option['device_guard_declaration'] = device_guard(option, False, dispatch_tensor) 198 | + option['device_guard_declarations'] = device_guards(option, False, dispatch_tensor, formals) 199 | option['dispatch_scalar_type_declaration'] = dispatch_scalar_type(option, False, dispatch_tensor) 200 | 201 | env = nested_dict(option, top_env) 202 | @@ -1124,7 +1129,7 @@ def create_generic(top_env, declarations): 203 | check_methods_do_not_start_with_underscore(option['name'], is_method) 204 | 205 | option['method_prefix_derived'] = '' 206 | - option['device_guard_declaration'] = device_guard(option, dispatch_options, dispatch_tensor) 207 | + option['device_guard_declarations'] = device_guards(option, dispatch_options, dispatch_tensor, formals) 208 | option['dispatch_scalar_type_declaration'] = dispatch_scalar_type(option, dispatch_options, dispatch_tensor) 209 | 210 | env = nested_dict(option, top_env) 211 | @@ -1366,10 +1371,14 @@ def create_derived(backend_type_env, declarations): 212 | tensor_arg = ('{}_ == nullptr ? 
(TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*){}_' 213 | .format(name, name)) 214 | intrusive_ptr_type = 'c10::intrusive_ptr' 215 | - return [ 216 | + code = [ 217 | 'auto {}_ = {};'.format(name, allocation), 218 | 'auto {} = Tensor({}::reclaim({}));'.format(name, intrusive_ptr_type, tensor_arg), 219 | ] 220 | + if is_cuda: 221 | + code.append('const TensorGuard {0}_tensor_guard({0});'.format(name)) 222 | + return code 223 | + 224 | 225 | def resize_arg(arg): 226 | # type: (THFormal) -> str 227 | diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp 228 | index 39e0e1cd49..fd999b8761 100644 229 | --- a/aten/src/ATen/native/cudnn/RNN.cpp 230 | +++ b/aten/src/ATen/native/cudnn/RNN.cpp 231 | @@ -4,6 +4,7 @@ 232 | #include 233 | #include 234 | #include 235 | +#include 236 | #include 237 | #include 238 | #include 239 | @@ -778,6 +779,7 @@ std::tuple _cudnn_rnn( 240 | &reserve_size 241 | )); 242 | reserve = at::empty(reserve_size, input.options().dtype(kByte)); 243 | + TensorListGuard rnn_tensor_guard({x, y, hy, cy}); 244 | AT_CUDNN_CHECK(cudnnRNNForwardTraining( 245 | handle, 246 | descs.rnn_desc.desc(), 247 | @@ -794,6 +796,7 @@ std::tuple _cudnn_rnn( 248 | )); 249 | } else { // inference 250 | reserve = at::empty({0}, input.options().dtype(kByte)); 251 | + TensorListGuard rnn_tensor_guard({x, y, hy, cy}); 252 | AT_CUDNN_CHECK(cudnnRNNForwardInference( 253 | handle, 254 | descs.rnn_desc.desc(), 255 | @@ -1199,6 +1202,7 @@ Tensor try_get_weight_buf( 256 | } 257 | 258 | // Get and check data pointers 259 | + TensorGuard weight_buf_tensor_guard(weight_buf); 260 | auto expected_data_ptrs = get_expected_data_ptrs( 261 | weight_buf, handle, rnn, rnn_desc, x_desc, datatype); 262 | 263 | diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp 264 | index c9a2f73ac5..07e1134789 100644 265 | --- a/aten/src/ATen/templates/SparseTypeDerived.cpp 266 | +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp 267 | @@ -10,6 +10,7 @@ 268 | #include 269 | #include 270 | #include 271 | +#include 272 | #include 273 | #include 274 | #include 275 | diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp 276 | index 78d2149642..a7b55e4d94 100644 277 | --- a/aten/src/ATen/templates/TypeDefault.cpp 278 | +++ b/aten/src/ATen/templates/TypeDefault.cpp 279 | @@ -13,6 +13,7 @@ 280 | #include 281 | #include 282 | #include 283 | +#include 284 | #include 285 | 286 | namespace at { 287 | diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp 288 | index 0d1f8b8a86..5c29bc4a4e 100644 289 | --- a/aten/src/ATen/templates/TypeDerived.cpp 290 | +++ b/aten/src/ATen/templates/TypeDerived.cpp 291 | @@ -12,6 +12,7 @@ $storage_tensor_headers 292 | #include 293 | #include 294 | #include 295 | +#include 296 | #include 297 | #include 298 | #include 299 | diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp 300 | index db45be9908..31e43752cf 100644 301 | --- a/aten/src/TH/generic/THStorage.cpp 302 | +++ b/aten/src/TH/generic/THStorage.cpp 303 | @@ -4,7 +4,7 @@ 304 | 305 | #include 306 | 307 | -scalar_t* THStorage_(data)(const THStorage *self) 308 | +scalar_t* THStorage_(data)(THStorage *self) 309 | { 310 | return self->data(); 311 | } 312 | @@ -142,7 +142,7 @@ void THStorage_(set)(THStorage *self, ptrdiff_t idx, scalar_t value) 313 | THStorage_(data)(self)[idx] = value; 314 | } 315 | 316 | -scalar_t THStorage_(get)(const THStorage *self, 
ptrdiff_t idx) 317 | +scalar_t THStorage_(get)(THStorage *self, ptrdiff_t idx) 318 | { 319 | THArgCheck((idx >= 0) && (idx < self->numel()), 2, "out of bounds"); 320 | return THStorage_(data)(self)[idx]; 321 | diff --git a/aten/src/TH/generic/THStorage.h b/aten/src/TH/generic/THStorage.h 322 | index 2e432c1daf..67f6457a7d 100644 323 | --- a/aten/src/TH/generic/THStorage.h 324 | +++ b/aten/src/TH/generic/THStorage.h 325 | @@ -35,13 +35,13 @@ 326 | #define THLongStorage THStorage 327 | #define THBoolStorage THStorage 328 | 329 | -TH_API scalar_t* THStorage_(data)(const THStorage*); 330 | +TH_API scalar_t* THStorage_(data)(THStorage*); 331 | TH_API ptrdiff_t THStorage_(size)(const THStorage*); 332 | TH_API size_t THStorage_(elementSize)(void); 333 | 334 | /* slow access -- checks everything */ 335 | TH_API void THStorage_(set)(THStorage*, ptrdiff_t, scalar_t); 336 | -TH_API scalar_t THStorage_(get)(const THStorage*, ptrdiff_t); 337 | +TH_API scalar_t THStorage_(get)(THStorage*, ptrdiff_t); 338 | 339 | TH_API THStorage* THStorage_(new)(void); 340 | TH_API THStorage* THStorage_(newWithSize)(ptrdiff_t size); 341 | diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp 342 | index 6c7d6cabbe..152008ad4e 100644 343 | --- a/aten/src/THC/THCGeneral.cpp 344 | +++ b/aten/src/THC/THCGeneral.cpp 345 | @@ -51,6 +51,8 @@ void THCudaInit(THCState* state) 346 | THCudaCheck(cudaGetDeviceCount(&numDevices)); 347 | state->numDevices = numDevices; 348 | 349 | + c10::cuda::CUDACachingAllocator::init(numDevices, state->cudaHostAllocator); 350 | + 351 | int device = 0; 352 | THCudaCheck(cudaGetDevice(&device)); 353 | 354 | diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp 355 | index af7117925e..0b22b95fb4 100644 356 | --- a/aten/src/THC/THCStorage.cpp 357 | +++ b/aten/src/THC/THCStorage.cpp 358 | @@ -25,6 +25,11 @@ void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) 359 | 360 | size_t itemsize = self->itemsize(); 361 | 362 | + if (self->lms_enabled()) { 363 | + THAssert(!self->lms_reclaimed()); 364 | + self->lms_release_resources(); 365 | + } 366 | + 367 | if(size == 0) 368 | { 369 | self->set_data_ptr(at::DataPtr(nullptr, at::Device(at::DeviceType::CUDA, device))); 370 | @@ -66,3 +71,12 @@ THC_API THCStorage* THCStorage_new( 371 | true).release(); 372 | return storage; 373 | } 374 | + 375 | +void THCStorage_copy_to_host(THCState *state, THCStorage *storage, void *dst) { 376 | + size_t size = storage->capacity(); 377 | + if (storage->lms_reclaimed()) { 378 | + storage->lms_copy_reclaimed_data(dst, size); 379 | + } else { 380 | + THCudaCheck(cudaMemcpy(dst, storage->data(), size, cudaMemcpyDeviceToHost)); 381 | + } 382 | +} 383 | diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp 384 | index 62a1d950a4..6f539274a0 100644 385 | --- a/aten/src/THC/THCStorage.hpp 386 | +++ b/aten/src/THC/THCStorage.hpp 387 | @@ -20,6 +20,8 @@ THC_API void THCStorage_retain(THCState *state, THCStorage *storage); 388 | THC_API void THCStorage_resize(THCState *state, THCStorage *storage, ptrdiff_t size); 389 | THC_API int THCStorage_getDevice(THCState* state, const THCStorage* storage); 390 | 391 | +THC_API void THCStorage_copy_to_host(THCState *state, THCStorage *storage, void *dst); 392 | + 393 | THC_API THCStorage* THCStorage_newWithDataAndAllocator( 394 | THCState *state, at::ScalarType scalar_type, 395 | at::DataPtr&& data, ptrdiff_t size, 396 | diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp 397 | index 
b5495e1296..d2dd0fd402 100644 398 | --- a/aten/src/THC/generic/THCStorage.cpp 399 | +++ b/aten/src/THC/generic/THCStorage.cpp 400 | @@ -5,7 +5,7 @@ 401 | #include 402 | #include 403 | 404 | -scalar_t* THCStorage_(data)(THCState *state, const THCStorage *self) 405 | +scalar_t* THCStorage_(data)(THCState *state, THCStorage *self) 406 | { 407 | return self->data(); 408 | } 409 | @@ -30,7 +30,7 @@ void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar 410 | THCudaCheck(cudaStreamSynchronize(stream)); 411 | } 412 | 413 | -scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) 414 | +scalar_t THCStorage_(get)(THCState *state, THCStorage *self, ptrdiff_t index) 415 | { 416 | THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); 417 | scalar_t value; 418 | diff --git a/aten/src/THC/generic/THCStorage.h b/aten/src/THC/generic/THCStorage.h 419 | index 5fdf41d560..06bcbcaff1 100644 420 | --- a/aten/src/THC/generic/THCStorage.h 421 | +++ b/aten/src/THC/generic/THCStorage.h 422 | @@ -16,13 +16,13 @@ 423 | #define THCudaLongStorage THCStorage 424 | #define THCudaBoolStorage THCStorage 425 | 426 | -THC_API scalar_t* THCStorage_(data)(THCState *state, const THCStorage*); 427 | +THC_API scalar_t* THCStorage_(data)(THCState *state, THCStorage*); 428 | THC_API ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage*); 429 | THC_API int THCStorage_(elementSize)(THCState *state); 430 | 431 | /* slow access -- checks everything */ 432 | THC_API void THCStorage_(set)(THCState *state, THCStorage*, ptrdiff_t, scalar_t); 433 | -THC_API scalar_t THCStorage_(get)(THCState *state, const THCStorage*, ptrdiff_t); 434 | +THC_API scalar_t THCStorage_(get)(THCState *state, THCStorage*, ptrdiff_t); 435 | 436 | THC_API THCStorage* THCStorage_(new)(THCState *state); 437 | THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size); 438 | diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h 439 | index 06b77c7b95..0badabd385 100644 440 | --- a/c10/core/Allocator.h 441 | +++ b/c10/core/Allocator.h 442 | @@ -124,6 +124,8 @@ inline bool operator!=(std::nullptr_t, const DataPtr& dp) noexcept { 443 | return dp; 444 | } 445 | 446 | +struct LMSImpl; 447 | + 448 | // Note [raw_allocate/raw_deallocate and Thrust] 449 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 450 | // Thrust's support for custom allocators requires us to write something 451 | @@ -157,6 +159,9 @@ struct C10_API Allocator { 452 | virtual DeleterFnPtr raw_deleter() const { 453 | return nullptr; 454 | } 455 | + virtual LMSImpl* lms() const { 456 | + return nullptr; 457 | + } 458 | void* raw_allocate(size_t n) { 459 | auto dptr = allocate(n); 460 | AT_ASSERT(dptr.get() == dptr.get_context()); 461 | diff --git a/c10/core/LargeModelSupport.cpp b/c10/core/LargeModelSupport.cpp 462 | new file mode 100644 463 | index 0000000000..5a5b59f5dc 464 | --- /dev/null 465 | +++ b/c10/core/LargeModelSupport.cpp 466 | @@ -0,0 +1,9 @@ 467 | +#include 468 | + 469 | +namespace c10 { 470 | + 471 | +c10::LMS* c10::LMS::from_list_hook(c10::IntrusiveListHook *hook) { 472 | + return (LMS *)((char *)hook - offsetof(LMS, list_hook_)); 473 | +} 474 | + 475 | +} // namespace at 476 | diff --git a/c10/core/LargeModelSupport.h b/c10/core/LargeModelSupport.h 477 | new file mode 100644 478 | index 0000000000..7e2518f9c8 479 | --- /dev/null 480 | +++ b/c10/core/LargeModelSupport.h 481 | @@ -0,0 +1,191 @@ 482 | +#pragma once 483 | + 484 | +#include 485 | +#include 486 | + 487 | +#include 488 | 
+#include 489 | + 490 | +namespace c10 { 491 | + 492 | +typedef void* LMSSyncEvent_t; 493 | + 494 | +struct LMSImpl { 495 | + LMSImpl(Allocator* allocator) : allocator_(allocator), reclaimed_(false), pincount_(0) {} 496 | + LMSImpl() = delete; 497 | + virtual ~LMSImpl() {} 498 | + 499 | + virtual void release_resources() { 500 | + data_ptr_.clear(); 501 | + reclaimed_ = false; 502 | + } 503 | + 504 | + virtual void reclaim_list_add(IntrusiveListHook* list_hook) = 0; 505 | + virtual bool reclaim_list_remove(IntrusiveListHook* list_hook) = 0; 506 | + 507 | + bool reclaimed() const { 508 | + return reclaimed_; 509 | + }; 510 | + 511 | + bool pin() { 512 | + bool initial = (++pincount_ == 1); 513 | + return initial; 514 | + } 515 | + 516 | + bool unpin() { 517 | + bool final = (--pincount_ == 0); 518 | + if (final && reclaimed_) 519 | + pagein_sync(); 520 | + return final; 521 | + } 522 | + 523 | + void pagein(void* dst, size_t size) { 524 | + AT_ASSERT(reclaimed_ == true); 525 | + void* src = data_ptr_.get(); 526 | + AT_ASSERT(dst); 527 | + AT_ASSERT(src); 528 | + 529 | + do_pagein(dst, src, size); 530 | + } 531 | + 532 | + void pagein_sync() { 533 | + do_pagein_sync(); 534 | + reclaimed_ = false; 535 | + } 536 | + 537 | + void pageout(void* src, size_t size, LMSSyncEvent_t sync_event) { 538 | + AT_ASSERT(reclaimed_ == false); 539 | + 540 | + void* dst = data_ptr_.get(); 541 | + if (!dst) { 542 | + data_ptr_ = allocator_->allocate(size); 543 | + dst = data_ptr_.get(); 544 | + } 545 | + AT_ASSERT(src); 546 | + AT_ASSERT(dst); 547 | + 548 | + do_pageout(dst, src, size, sync_event); 549 | + } 550 | + 551 | + void pageout_sync() { 552 | + do_pageout_sync(); 553 | + reclaimed_ = true; 554 | + } 555 | + 556 | + void copy_reclaimed_data(void* dst, size_t size) const { 557 | + AT_ASSERT(reclaimed_ == true); 558 | + memcpy(dst, data_ptr_.get(), size); 559 | + } 560 | + 561 | +protected: 562 | + virtual void do_pagein(void* dst, void* src, size_t size) = 0; 563 | + virtual void do_pagein_sync() = 0; 564 | + virtual void do_pageout(void* dst, void* src, size_t size, LMSSyncEvent_t sync_event) = 0; 565 | + virtual void do_pageout_sync() = 0; 566 | + 567 | + Allocator* allocator_ = nullptr; 568 | + bool reclaimed_ = false; 569 | + DataPtr data_ptr_; 570 | + mutable std::atomic pincount_; 571 | +}; 572 | + 573 | + 574 | +struct LMS { 575 | + LMS(LMSImpl* lms) { set(lms); } 576 | + LMS() = delete; 577 | + ~LMS() { unset(); } 578 | + 579 | + LMS& operator=(LMS&& other) = default; 580 | + LMS(LMS&& other) = default; 581 | + 582 | + static LMS* from_list_hook(IntrusiveListHook *hook); 583 | + 584 | + bool enabled() const { 585 | + return lms_ != nullptr; 586 | + }; 587 | + 588 | + void set(LMSImpl* lms) { 589 | + AT_ASSERT(lms_ == nullptr); 590 | + lms_ = lms; 591 | + } 592 | + 593 | + void unset() { 594 | + if (enabled()) { 595 | + reclaim_list_remove(); 596 | + delete lms_; 597 | + lms_ = nullptr; 598 | + } 599 | + } 600 | + 601 | + void release_resources() { 602 | + if (enabled()) { 603 | + reclaim_list_remove(); 604 | + lms_->release_resources(); 605 | + } 606 | + } 607 | + 608 | + bool reclaimed() const { 609 | + return enabled() && lms_->reclaimed(); 610 | + }; 611 | + 612 | + void list_add(IntrusiveList* list) { 613 | + list->append(&list_hook_); 614 | + } 615 | + 616 | + bool list_remove() { 617 | + return list_hook_.remove(); 618 | + } 619 | + 620 | + bool pin() { 621 | + bool initial = enabled() && lms_->pin(); 622 | + if (initial) 623 | + reclaim_list_remove(); 624 | + return initial; 625 | + } 626 
| + 627 | + bool unpin() { 628 | + bool final = enabled() && lms_->unpin(); 629 | + if (final) 630 | + reclaim_list_add(); 631 | + return final; 632 | + } 633 | + 634 | + void pagein(void* data_ptr, size_t size) const { 635 | + lms_->pagein(data_ptr, size); 636 | + } 637 | + 638 | + void pagein_sync() const { 639 | + lms_->pagein_sync(); 640 | + } 641 | + 642 | + void pageout(void* data_ptr, size_t size, LMSSyncEvent_t sync_event, IntrusiveList *async_queue = nullptr) { 643 | + lms_->pageout(data_ptr, size, sync_event); 644 | + if (async_queue) 645 | + list_add(async_queue); 646 | + } 647 | + 648 | + void pageout_sync(IntrusiveList *async_queue = nullptr) { 649 | + if (async_queue) 650 | + list_remove(); 651 | + lms_->pageout_sync(); 652 | + } 653 | + 654 | + void copy_reclaimed_data(void* dst, size_t size) const { 655 | + lms_->copy_reclaimed_data(dst, size); 656 | + } 657 | + 658 | + void reclaim_list_add() { 659 | + lms_->reclaim_list_add(&list_hook_); 660 | + } 661 | + 662 | + bool reclaim_list_remove() { 663 | + if (!list_hook_.attached()) return false; 664 | + 665 | + return lms_->reclaim_list_remove(&list_hook_); 666 | + } 667 | + 668 | + private: 669 | + IntrusiveListHook list_hook_; 670 | + LMSImpl* lms_ = nullptr; 671 | +}; 672 | +} // namespace c10 673 | diff --git a/c10/core/Storage.h b/c10/core/Storage.h 674 | index 6d86119eff..9b614ce929 100644 675 | --- a/c10/core/Storage.h 676 | +++ b/c10/core/Storage.h 677 | @@ -56,10 +56,10 @@ struct C10_API Storage { 678 | } 679 | 680 | template 681 | - T* data() const { return storage_impl_->data(); } 682 | + T* data() const { return storage_impl_.get()->data(); } 683 | 684 | template 685 | - T* unsafe_data() const { return storage_impl_->unsafe_data(); } 686 | + T* unsafe_data() const { return storage_impl_.get()->unsafe_data(); } 687 | 688 | size_t elementSize() const { 689 | return storage_impl_->itemsize(); 690 | @@ -104,7 +104,7 @@ struct C10_API Storage { 691 | } 692 | 693 | const at::DataPtr& data_ptr() const { 694 | - return storage_impl_->data_ptr(); 695 | + return storage_impl_.get()->data_ptr(); 696 | } 697 | 698 | // Returns the previous data_ptr 699 | diff --git a/c10/core/StorageImpl.cpp b/c10/core/StorageImpl.cpp 700 | index 797e21f079..90e3ac0f7b 100644 701 | --- a/c10/core/StorageImpl.cpp 702 | +++ b/c10/core/StorageImpl.cpp 703 | @@ -1 +1,9 @@ 704 | #include 705 | + 706 | +namespace c10 { 707 | + 708 | +c10::StorageImpl* c10::StorageImpl::from_list_hook(c10::IntrusiveListHook *hook) { 709 | + return (StorageImpl *)((char *)c10::LMS::from_list_hook(hook) - offsetof(StorageImpl, lms_)); 710 | +} 711 | + 712 | +} // namespace at 713 | diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h 714 | index 579ef00820..4bb39a4c16 100644 715 | --- a/c10/core/StorageImpl.h 716 | +++ b/c10/core/StorageImpl.h 717 | @@ -2,6 +2,7 @@ 718 | 719 | #include 720 | #include 721 | +#include 722 | 723 | #include 724 | 725 | @@ -20,7 +21,8 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { 726 | numel_(numel), 727 | resizable_(resizable), 728 | received_cuda_(false), 729 | - allocator_(allocator) { 730 | + allocator_(allocator), 731 | + lms_(allocator ? 
allocator->lms() : nullptr) { 732 | if (resizable) { 733 | AT_ASSERTM( 734 | allocator_, "For resizable storage, allocator must be provided"); 735 | @@ -53,6 +55,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { 736 | ~StorageImpl() = default; 737 | 738 | void reset() { 739 | + lms_.unset(); 740 | data_ptr_.clear(); 741 | numel_ = 0; 742 | } 743 | @@ -63,7 +66,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { 744 | } 745 | 746 | template 747 | - inline T* data() const { 748 | + inline T* data() { 749 | auto data_type = caffe2::TypeMeta::Make(); 750 | if (dtype() != data_type) { 751 | AT_ERROR( 752 | @@ -76,11 +79,13 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { 753 | } 754 | 755 | template 756 | - inline T* unsafe_data() const { 757 | + inline T* unsafe_data() { 758 | + lms_ensure_data(); 759 | return static_cast(this->data_ptr_.get()); 760 | } 761 | 762 | void release_resources() override { 763 | + lms_.release_resources(); 764 | data_ptr_.clear(); 765 | } 766 | 767 | @@ -106,10 +111,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { 768 | }; 769 | 770 | at::DataPtr& data_ptr() { 771 | - return data_ptr_; 772 | - }; 773 | - 774 | - const at::DataPtr& data_ptr() const { 775 | + lms_ensure_data(); 776 | return data_ptr_; 777 | }; 778 | 779 | @@ -130,10 +132,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { 780 | 781 | // TODO: Return const ptr eventually if possible 782 | void* data() { 783 | - return data_ptr_.get(); 784 | - } 785 | - 786 | - void* data() const { 787 | + lms_ensure_data(); 788 | return data_ptr_.get(); 789 | } 790 | 791 | @@ -192,6 +191,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { 792 | at::DataPtr&& data_ptr, 793 | const caffe2::TypeMeta& data_type, 794 | size_t capacity) { 795 | + lms_.unset(); 796 | data_type_ = data_type; 797 | // TODO: Use CAFFE_ENFORCE_WITH_CALLER equivalent 798 | // For now causes lots of redefine issues if caffe2/core/logging.h is used 799 | @@ -221,7 +221,58 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { 800 | return received_cuda_; 801 | } 802 | 803 | + // Large Model Support 804 | + bool lms_enabled() const { return lms_.enabled(); } 805 | + bool lms_reclaimed() const { return lms_.reclaimed(); } 806 | + void lms_release_resources() { lms_.release_resources(); } 807 | + void lms_list_add(IntrusiveList* list) { lms_.list_add(list); } 808 | + bool lms_list_remove() { return lms_.list_remove(); } 809 | + void* allocation_ptr() const { return data_ptr_.get(); } 810 | + static StorageImpl* from_list_hook(IntrusiveListHook *hook); 811 | + 812 | + bool lms_pin() { 813 | + bool initial = lms_.pin(); 814 | + if (initial && lms_reclaimed()) { 815 | + lms_pagein(); 816 | + } 817 | + return initial; 818 | + } 819 | + 820 | + bool lms_unpin() { 821 | + bool final = lms_.unpin(); 822 | + return final; 823 | + } 824 | + 825 | + void lms_pageout(LMSSyncEvent_t sync_event, IntrusiveList *async_queue = nullptr) { 826 | + lms_.pageout(data_ptr_.get(), capacity(), sync_event, async_queue); 827 | + } 828 | + 829 | + void lms_pageout_sync(IntrusiveList *async_queue = nullptr) { 830 | + lms_.pageout_sync(async_queue); 831 | + set_data_ptr(at::DataPtr(nullptr, device())); 832 | + } 833 | + 834 | + void lms_copy_reclaimed_data(void* dst, size_t size) { 835 | + lms_.copy_reclaimed_data(dst, size); 836 | + } 837 | + 838 | private: 839 | + void lms_pagein() { 840 | + AT_ASSERT(!data_ptr_); 841 
| + size_t size = capacity(); 842 | + set_data_ptr(allocator()->allocate(size)); 843 | + lms_.pagein(data_ptr_.get(), size); 844 | + } 845 | + 846 | + void lms_ensure_data() { 847 | + if (!lms_enabled() || lms_.reclaim_list_remove() || !lms_reclaimed()) 848 | + return; 849 | + 850 | + if (!data_ptr_) 851 | + lms_pagein(); 852 | + lms_.pagein_sync(); 853 | + } 854 | + 855 | caffe2::TypeMeta data_type_; 856 | DataPtr data_ptr_; 857 | int64_t numel_; 858 | @@ -230,5 +281,6 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target { 859 | // local to process cuda memory allocation 860 | bool received_cuda_; 861 | Allocator* allocator_; 862 | + LMS lms_; 863 | }; 864 | } // namespace c10 865 | diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp 866 | index f03ba432f2..eb608ab228 100644 867 | --- a/c10/cuda/CUDACachingAllocator.cpp 868 | +++ b/c10/cuda/CUDACachingAllocator.cpp 869 | @@ -68,14 +68,24 @@ struct DeviceStats { 870 | uint64_t max_amount_allocated; // max total amount allocated in bytes 871 | uint64_t amount_cached; // total amount in cache in bytes 872 | uint64_t max_amount_cached; // max total amount in cache in bytes 873 | + uint64_t amount_inactive; // total amount in reclaim list in bytes 874 | + uint64_t amount_active() { return amount_allocated - amount_inactive; } 875 | + uint64_t max_amount_active; // max total active in bytes 876 | + uint64_t amount_reclaimed; 877 | + uint64_t alloc_distribution[NUM_ALLOC_SOURCES]; 878 | 879 | DeviceStats() : 880 | amount_allocated(0), max_amount_allocated(0), 881 | - amount_cached(0), max_amount_cached(0) { } 882 | + amount_cached(0), max_amount_cached(0), 883 | + amount_inactive(0), max_amount_active(0), 884 | + amount_reclaimed(0) { 885 | + resetAllocStats(); 886 | + } 887 | 888 | void increaseAllocated(size_t delta) { 889 | amount_allocated += delta; 890 | max_amount_allocated = std::max(max_amount_allocated, amount_allocated); 891 | + max_amount_active = std::max(max_amount_active, amount_active()); 892 | } 893 | 894 | void decreaseAllocated(size_t delta) { 895 | @@ -90,6 +100,28 @@ struct DeviceStats { 896 | void decreaseCached(size_t delta) { 897 | amount_cached -= delta; 898 | } 899 | + 900 | + void increaseInactive(size_t delta) { 901 | + amount_inactive += delta; 902 | + } 903 | + 904 | + void decreaseInactive(size_t delta, bool reclaimed=false) { 905 | + amount_inactive -= delta; 906 | + max_amount_active = std::max(max_amount_active, amount_active()); 907 | + 908 | + if (reclaimed) 909 | + amount_reclaimed += delta; 910 | + } 911 | + 912 | + void resetAllocStats() { 913 | + memset(alloc_distribution, 0, sizeof(alloc_distribution)); 914 | + } 915 | + void getAllocStats(uint64_t* distribution) { 916 | + memcpy(distribution, alloc_distribution, sizeof(alloc_distribution)); 917 | + } 918 | + void recordAllocSource(AllocSource source) { 919 | + alloc_distribution[source] += 1; 920 | + } 921 | }; 922 | 923 | struct Block; 924 | @@ -120,9 +152,6 @@ struct Block { 925 | 926 | static bool BlockComparator(const Block* a, const Block* b) 927 | { 928 | - if (a->device != b->device) { 929 | - return a->device < b->device; 930 | - } 931 | if (a->stream != b->stream) { 932 | return (uintptr_t)a->stream < (uintptr_t)b->stream; 933 | } 934 | @@ -151,128 +180,212 @@ static std::string format_size(uint64_t size) { 935 | return os.str(); 936 | } 937 | 938 | +#define LMS_SIZE_DEFAULT (1 << 20) // 1 MB 939 | + 940 | +struct LMSSettings { 941 | + LMSSettings() : 942 | + enabled_(false), 
size_(LMS_SIZE_DEFAULT), limit_(0), host_allocator_(nullptr) {} 943 | + 944 | + bool enabled() { return enabled_; } 945 | + void set_enabled(bool enabled) { enabled_ = enabled; } 946 | + size_t size() { return size_; } 947 | + void set_size(size_t size) { size_ = size; } 948 | + size_t limit() { return limit_; } 949 | + void set_limit(size_t limit) { limit_ = limit; } 950 | + at::Allocator* host_allocator() { return host_allocator_; } 951 | + void set_host_allocator(at::Allocator* host_allocator) { host_allocator_ = host_allocator; } 952 | + 953 | + bool enabled(size_t size) { 954 | + return enabled_ && size >= size_; 955 | + } 956 | + bool limit_alloc(DeviceStats& stats, size_t alloc_size) { 957 | + return (stats.amount_cached + alloc_size) > limit_; 958 | + } 959 | + 960 | +private: 961 | + bool enabled_; 962 | + size_t size_; 963 | + size_t limit_; 964 | + at::Allocator* host_allocator_; 965 | +}; 966 | + 967 | +struct AllocParams { 968 | + AllocParams(int device, size_t size, cudaStream_t stream, BlockPool* pool, size_t alloc_size, 969 | + LMSSettings* lms, DeviceStats& stats) : 970 | + search_key(device, stream, size), 971 | + pool(pool), 972 | + alloc_size(alloc_size), 973 | + lms_enabled(lms->enabled(size)), 974 | + limit_alloc(lms_enabled && lms->limit_alloc(stats, alloc_size)), 975 | + block(nullptr), 976 | + err(cudaSuccess) {} 977 | + 978 | + int device() { return search_key.device; } 979 | + cudaStream_t stream() { return search_key.stream; } 980 | + size_t size() { return search_key.size; } 981 | + 982 | + Block search_key; 983 | + BlockPool* pool; 984 | + size_t alloc_size; 985 | + bool lms_enabled; 986 | + bool limit_alloc; 987 | + Block* block; 988 | + AllocSource source; 989 | + cudaError_t err; 990 | +}; 991 | + 992 | } // namespace 993 | 994 | -struct THCCachingAllocator 995 | +struct DeviceCachingAllocator 996 | { 997 | // device statistics 998 | - std::vector device_stats; 999 | + DeviceStats stats; 1000 | 1001 | // lock around all operations 1002 | std::recursive_mutex mutex; 1003 | 1004 | - // lock around calls to cudaFree (to prevent deadlocks with NCCL) 1005 | - std::mutex cuda_free_mutex; 1006 | - 1007 | // cached blocks larger than 1 MB 1008 | BlockPool large_blocks; 1009 | 1010 | // cached blocks 1 MB or smaller 1011 | BlockPool small_blocks; 1012 | 1013 | - // allocated blocks by device pointer 1014 | - std::unordered_map allocated_blocks; 1015 | - 1016 | // outstanding cuda events 1017 | std::deque> cuda_events; 1018 | 1019 | - THCCachingAllocator() : 1020 | + at::IntrusiveList reclaim_list; 1021 | + 1022 | + DeviceCachingAllocator() : 1023 | large_blocks(BlockComparator), 1024 | small_blocks(BlockComparator) {} 1025 | 1026 | - DeviceStats &get_stats_for_device(int device) { 1027 | - AT_ASSERT(device >= 0); 1028 | - if ((size_t) device >= device_stats.size()) { 1029 | - device_stats.resize(device + 1); 1030 | + bool get_free_block(AllocParams& p, AllocSource source) 1031 | + { 1032 | + BlockPool& pool = *p.pool; 1033 | + auto it = pool.lower_bound(&p.search_key); 1034 | + if (it == pool.end() || (*it)->stream != p.stream()) 1035 | + return false; 1036 | + p.block = *it; 1037 | + p.source = source; 1038 | + pool.erase(it); 1039 | + return true; 1040 | + } 1041 | + 1042 | + bool trigger_free_memory_callbacks(AllocParams& p) { 1043 | + bool freed_memory = false; 1044 | + for (const auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) { 1045 | + freed_memory |= 1046 | + FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute(); 1047 | } 1048 | - return 
device_stats.at(device); 1049 | + return freed_memory; 1050 | + } 1051 | + 1052 | + bool alloc_block(AllocParams& p, bool record_error, AllocSource source) 1053 | + { 1054 | + size_t size = p.alloc_size; 1055 | + void* ptr; 1056 | + cudaError_t err; 1057 | + err = cudaMalloc(&ptr, size); 1058 | + if (err != cudaSuccess) { 1059 | + if (record_error) p.err = err; else cudaGetLastError(); 1060 | + return false; 1061 | + } 1062 | + 1063 | + stats.increaseCached(size); 1064 | + p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); 1065 | + p.source = source; 1066 | + return (p.block != nullptr); 1067 | + } 1068 | + 1069 | + bool try_lms_reclaim(AllocParams& p) { 1070 | + size_t size = p.size(); 1071 | + cudaStream_t stream = p.stream(); 1072 | + cudaEvent_t sync_event; 1073 | + 1074 | + AT_ASSERT(stream == cuda::getCurrentCUDAStream().stream()); 1075 | + C10_CUDA_CHECK(cudaEventCreate(&sync_event)); 1076 | + C10_CUDA_CHECK(cudaEventRecord(sync_event, stream)); 1077 | + 1078 | + bool found = 1079 | + // a. Search reclaim list for a suitable inactive allocation 1080 | + (reclaim_one(size, sync_event) && get_free_block(p, RECLAIM_ONE)) 1081 | + // b. Reclaim fragments of suitable allocations 1082 | + || (reclaim_fragments(size, sync_event) && get_free_block(p, RECLAIM_FRAGMENTS)) 1083 | + // c. Attempt allocate (if not done earlier due to limit) 1084 | + || (p.limit_alloc && alloc_block(p, false, CUDAMALLOC_OVER_LIMIT)) 1085 | + // d. Reclaim everything else 1086 | + || (reclaim_all(sync_event) && get_free_block(p, RECLAIM_ALL)); 1087 | + 1088 | + C10_CUDA_CHECK(cudaEventDestroy(sync_event)); 1089 | + 1090 | + return found; 1091 | } 1092 | 1093 | /** allocates a block which is safe to use from the provided stream */ 1094 | - void malloc(void** devPtr, size_t size, cudaStream_t stream) 1095 | + Block* malloc(int device, size_t size, cudaStream_t stream, LMSSettings* lms) 1096 | { 1097 | std::lock_guard lock(mutex); 1098 | 1099 | - int device; 1100 | - C10_CUDA_CHECK(cudaGetDevice(&device)); 1101 | - 1102 | // process outstanding cudaEvents 1103 | process_events(); 1104 | 1105 | size = round_size(size); 1106 | - 1107 | - DeviceStats &stats = get_stats_for_device(device); 1108 | - 1109 | - Block search_key(device, stream, size); 1110 | auto& pool = get_pool(size); 1111 | - 1112 | - auto find_free_block = [&]()->Block*{ 1113 | - auto it = pool.lower_bound(&search_key); 1114 | - if (it != pool.end() && (*it)->device == device && 1115 | - (*it)->stream == stream) { 1116 | - Block* block = *it; 1117 | - pool.erase(it); 1118 | - return block; 1119 | - } 1120 | - return nullptr; 1121 | - }; 1122 | - 1123 | - Block* block = find_free_block(); 1124 | - if (block == nullptr) { 1125 | - bool freed_memory = false; 1126 | - for (const auto& name : FreeCudaMemoryCallbacksRegistry()->Keys()) { 1127 | - freed_memory |= 1128 | - FreeCudaMemoryCallbacksRegistry()->Create(name)->Execute(); 1129 | - } 1130 | - if (freed_memory) { 1131 | - block = find_free_block(); 1132 | - } 1133 | - } 1134 | - if (block == nullptr) { 1135 | - void* ptr; 1136 | - size_t alloc_size = get_allocation_size(size); 1137 | - cudaError_t err = cuda_malloc_retry(device, &ptr, alloc_size); 1138 | - if (err != cudaSuccess) { 1139 | - if (err == cudaErrorMemoryAllocation) { 1140 | - cudaGetLastError(); // clear CUDA error 1141 | - 1142 | - size_t device_free; 1143 | - size_t device_total; 1144 | - C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); 1145 | - const auto& stats = get_stats_for_device(device); 1146 | - 1147 
| - // "total capacity": total global memory on GPU 1148 | - // "already allocated": memory allocated by the program using the 1149 | - // caching allocator 1150 | - // "free": free memory as reported by the CUDA API 1151 | - // "cached": memory held by the allocator but not used by the program 1152 | - // 1153 | - // The "allocated" amount does not include memory allocated outside 1154 | - // of the caching allocator, such as memory allocated by other programs 1155 | - // or memory held by the driver. 1156 | - // 1157 | - // The sum of "allocated" + "free" + "cached" may be less than the 1158 | - // total capacity due to memory held by the driver and usage by other 1159 | - // programs. 1160 | - // 1161 | - // Note that at this point cuda_malloc_retry has already returned all 1162 | - // possible "cached" memory to the driver. The only remaining "cached" 1163 | - // memory is split from a larger block that is partially in-use. 1164 | - AT_ERROR( 1165 | - "CUDA out of memory. Tried to allocate ", format_size(alloc_size), 1166 | - " (GPU ", device, "; ", 1167 | - format_size(device_total), " total capacity; ", 1168 | - format_size(stats.amount_allocated), " already allocated; ", 1169 | - format_size(device_free), " free; ", 1170 | - format_size(stats.amount_cached - stats.amount_allocated), " cached)"); 1171 | - } else { 1172 | - C10_CUDA_CHECK(err); 1173 | - } 1174 | + const size_t alloc_size = get_allocation_size(size); 1175 | + AllocParams params(device, size, stream, &pool, alloc_size, lms, stats); 1176 | + 1177 | + bool block_found = 1178 | + // 1. Search pool 1179 | + get_free_block(params, FREELIST) 1180 | + // 2. Trigger callbacks and retry search 1181 | + || (trigger_free_memory_callbacks(params) && get_free_block(params, FREELIST)) 1182 | + // 3. Attempt allocate (if not limited by lms settings) 1183 | + || (!params.limit_alloc && alloc_block(params, false, CUDAMALLOC_UNDER_LIMIT)) 1184 | + // 4. If LMS enabled, try to reclaim inactive allocations 1185 | + || (params.lms_enabled && try_lms_reclaim(params)) 1186 | + // 5. Free all non-split cached blocks and retry alloc. 1187 | + || (free_cached_blocks() && alloc_block(params, true, CUDAMALLOC_PURGE)); 1188 | + 1189 | + AT_ASSERT((!block_found && params.err != cudaSuccess) || params.block); 1190 | + if (!block_found) { 1191 | + if (params.err == cudaErrorMemoryAllocation) { 1192 | + cudaGetLastError(); // clear CUDA error 1193 | + 1194 | + size_t device_free; 1195 | + size_t device_total; 1196 | + C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); 1197 | + 1198 | + // "total capacity": total global memory on GPU 1199 | + // "already allocated": memory allocated by the program using the 1200 | + // caching allocator 1201 | + // "free": free memory as reported by the CUDA API 1202 | + // "cached": memory held by the allocator but not used by the program 1203 | + // 1204 | + // The "allocated" amount does not include memory allocated outside 1205 | + // of the caching allocator, such as memory allocated by other programs 1206 | + // or memory held by the driver. 1207 | + // 1208 | + // The sum of "allocated" + "free" + "cached" may be less than the 1209 | + // total capacity due to memory held by the driver and usage by other 1210 | + // programs. 1211 | + // 1212 | + // Note that at this point cuda_malloc_retry has already returned all 1213 | + // possible "cached" memory to the driver. The only remaining "cached" 1214 | + // memory is split from a larger block that is partially in-use. 
1215 | + AT_ERROR( 1216 | + "CUDA out of memory. Tried to allocate ", format_size(alloc_size), 1217 | + " (GPU ", device, "; ", 1218 | + format_size(device_total), " total capacity; ", 1219 | + format_size(stats.amount_allocated), " already allocated; ", 1220 | + format_size(device_free), " free; ", 1221 | + format_size(stats.amount_cached - stats.amount_allocated), " cached; ", 1222 | + format_size(stats.amount_inactive), " inactive)"); 1223 | + } else { 1224 | + C10_CUDA_CHECK(params.err); 1225 | } 1226 | - stats.increaseCached(alloc_size); 1227 | - block = new Block(device, stream, alloc_size, &pool, ptr); 1228 | } 1229 | 1230 | + Block* block = params.block; 1231 | Block* remaining = nullptr; 1232 | AT_ASSERT(block); 1233 | if (should_split(block, size)) { 1234 | @@ -293,30 +406,19 @@ struct THCCachingAllocator 1235 | } 1236 | 1237 | block->allocated = true; 1238 | - allocated_blocks[block->ptr] = block; 1239 | - 1240 | - *devPtr = block->ptr; 1241 | 1242 | stats.increaseAllocated(block->size); 1243 | + stats.recordAllocSource(params.source); 1244 | + 1245 | + return block; 1246 | } 1247 | 1248 | - void free(void* ptr) 1249 | + void free(Block* block) 1250 | { 1251 | std::lock_guard lock(mutex); 1252 | - if (!ptr) { 1253 | - return; 1254 | - } 1255 | - 1256 | - auto it = allocated_blocks.find(ptr); 1257 | - if (it == allocated_blocks.end()) { 1258 | - AT_ERROR("invalid device pointer: ", ptr); 1259 | - } 1260 | - 1261 | - Block* block = it->second; 1262 | - allocated_blocks.erase(it); 1263 | block->allocated = false; 1264 | 1265 | - get_stats_for_device(block->device).decreaseAllocated(block->size); 1266 | + stats.decreaseAllocated(block->size); 1267 | if (!block->stream_uses.empty()) { 1268 | insert_events(block); 1269 | } else { 1270 | @@ -328,18 +430,12 @@ struct THCCachingAllocator 1271 | void emptyCache() 1272 | { 1273 | std::lock_guard lock(mutex); 1274 | - synchronize_and_free_events(nullopt); 1275 | - free_blocks(large_blocks, large_blocks.begin(), large_blocks.end()); 1276 | - free_blocks(small_blocks, small_blocks.begin(), small_blocks.end()); 1277 | + free_cached_blocks(); 1278 | } 1279 | 1280 | - void* getBaseAllocation(void* ptr, size_t* outSize) 1281 | + void* getBaseAllocation(Block* block, size_t* outSize) 1282 | { 1283 | std::lock_guard lock(mutex); 1284 | - Block* block = find_allocated_block(ptr); 1285 | - if (!block) { 1286 | - AT_ERROR("invalid device pointer: %p", ptr); 1287 | - } 1288 | while (block->prev) { 1289 | block = block->prev; 1290 | } 1291 | @@ -356,11 +452,9 @@ struct THCCachingAllocator 1292 | } 1293 | 1294 | // Accumulates sizes of all memory blocks for given device in given pool 1295 | - void cacheInfoAux(BlockPool& blocks, int dev_id, size_t* total, size_t* largest) 1296 | + void cacheInfoAux(BlockPool& blocks, size_t* total, size_t* largest) 1297 | { 1298 | - Block search_key(dev_id, 0, 0); 1299 | - auto it = blocks.lower_bound(&search_key); 1300 | - for (; it != blocks.end() && *it && (*it)->device == dev_id; ++it) { 1301 | + for (auto it = blocks.begin(); it != blocks.end(); ++it) { 1302 | size_t blocksize = (*it)->size; 1303 | *total += blocksize; 1304 | if (blocksize > *largest) { 1305 | @@ -369,20 +463,16 @@ struct THCCachingAllocator 1306 | } 1307 | } 1308 | 1309 | - void cacheInfo(int dev_id, size_t* total, size_t* largest) 1310 | + void cacheInfo(size_t* total, size_t* largest) 1311 | { 1312 | std::lock_guard lock(mutex); 1313 | - cacheInfoAux(large_blocks, dev_id, total, largest); 1314 | - cacheInfoAux(small_blocks, dev_id, total, 
largest); 1315 | + cacheInfoAux(large_blocks, total, largest); 1316 | + cacheInfoAux(small_blocks, total, largest); 1317 | } 1318 | 1319 | - void recordStream(void* ptr, cuda::CUDAStream stream) 1320 | + void recordStream(Block* block, cuda::CUDAStream stream) 1321 | { 1322 | std::lock_guard lock(mutex); 1323 | - Block* block = find_allocated_block(ptr); 1324 | - if (!block) { 1325 | - AT_ERROR("invalid device pointer: %p", ptr); 1326 | - } 1327 | if (stream.stream() == block->stream) { 1328 | // ignore uses on the allocation stream, since those don't require any 1329 | // special synchronization 1330 | @@ -443,7 +533,7 @@ struct THCCachingAllocator 1331 | } 1332 | } 1333 | 1334 | - size_t round_size(size_t size) { 1335 | + static size_t round_size(size_t size) { 1336 | if (size < kMinBlockSize) { 1337 | return kMinBlockSize; 1338 | } else { 1339 | @@ -451,7 +541,7 @@ struct THCCachingAllocator 1340 | } 1341 | } 1342 | 1343 | - size_t get_allocation_size(size_t size) { 1344 | + static size_t get_allocation_size(size_t size) { 1345 | if (size <= kSmallSize) { 1346 | return kSmallBuffer; 1347 | } else if (size < kMinLargeAlloc) { 1348 | @@ -461,51 +551,28 @@ struct THCCachingAllocator 1349 | } 1350 | } 1351 | 1352 | - cudaError_t cuda_malloc_retry(int device, void** devPtr, size_t size) 1353 | - { 1354 | - // Try cudaMalloc. If cudaMalloc fails, frees all non-split cached blocks 1355 | - // and retries. 1356 | - cudaError_t err = cudaMalloc(devPtr, size); 1357 | - if (err != cudaSuccess) { 1358 | - cudaGetLastError(); // reset the last CUDA error 1359 | - free_cached_blocks(device); 1360 | - err = cudaMalloc(devPtr, size); 1361 | - if (err != cudaSuccess) { 1362 | - return err; 1363 | - } 1364 | - } 1365 | - return cudaSuccess; 1366 | - } 1367 | - 1368 | - void free_cached_blocks(int device) 1369 | + bool free_cached_blocks() 1370 | { 1371 | // First ensure that all blocks that can't currently be allocated due to 1372 | // outstanding events are returned to the pool. 
1373 | - synchronize_and_free_events(device); 1374 | + synchronize_and_free_events(); 1375 | 1376 | - // Free all non-split cached blocks on device 1377 | - Block lower_bound(device, nullptr, 0); 1378 | - Block upper_bound(device + 1, nullptr, 0); 1379 | - 1380 | - free_blocks( 1381 | - large_blocks, 1382 | - large_blocks.lower_bound(&lower_bound), 1383 | - large_blocks.lower_bound(&upper_bound)); 1384 | - free_blocks( 1385 | - small_blocks, 1386 | - small_blocks.lower_bound(&lower_bound), 1387 | - small_blocks.lower_bound(&upper_bound)); 1388 | + // Free all non-split cached blocks 1389 | + free_blocks(large_blocks); 1390 | + free_blocks(small_blocks); 1391 | + return true; 1392 | } 1393 | 1394 | - void free_blocks(BlockPool& blocks, BlockPool::iterator it, BlockPool::iterator end) 1395 | + void free_blocks(BlockPool& blocks) 1396 | { 1397 | - // Frees all non-split blocks between `it` and `end` 1398 | - std::lock_guard lock(cuda_free_mutex); 1399 | - while (it != end) { 1400 | + // Frees all non-split blocks 1401 | + std::lock_guard lock(*CUDACachingAllocator::getFreeMutex()); 1402 | + auto it = blocks.begin(); 1403 | + while (it != blocks.end()) { 1404 | Block* block = *it; 1405 | if (!block->prev && !block->next) { 1406 | C10_CUDA_CHECK(cudaFree((void*)block->ptr)); 1407 | - get_stats_for_device(block->device).decreaseCached(block->size); 1408 | + stats.decreaseCached(block->size); 1409 | auto cur = it; 1410 | ++it; 1411 | blocks.erase(cur); 1412 | @@ -516,19 +583,12 @@ struct THCCachingAllocator 1413 | } 1414 | } 1415 | 1416 | - void synchronize_and_free_events(optional device) { 1417 | + void synchronize_and_free_events() { 1418 | // Synchronize on outstanding events and then free associated blocks. 1419 | - // Limited to blocks on the given device if specified. 
1420 | - 1421 | - auto remaining_events = decltype(cuda_events)(); 1422 | 1423 | for (auto& e : cuda_events) { 1424 | cudaEvent_t event = e.first; 1425 | Block* block = e.second; 1426 | - if (device.has_value() && block->device != *device) { 1427 | - remaining_events.push_back(e); 1428 | - continue; 1429 | - } 1430 | 1431 | C10_CUDA_CHECK(cudaEventSynchronize(event)); 1432 | C10_CUDA_CHECK(cudaEventDestroy(event)); 1433 | @@ -539,15 +599,7 @@ struct THCCachingAllocator 1434 | } 1435 | } 1436 | 1437 | - std::swap(cuda_events, remaining_events); 1438 | - } 1439 | - 1440 | - Block* find_allocated_block(void *ptr) { 1441 | - auto it = allocated_blocks.find(ptr); 1442 | - if (it == allocated_blocks.end()) { 1443 | - return nullptr; 1444 | - } 1445 | - return it->second; 1446 | + cuda_events.clear(); 1447 | } 1448 | 1449 | void insert_events(Block* block) 1450 | @@ -601,10 +653,291 @@ struct THCCachingAllocator 1451 | cuda_events.pop_front(); 1452 | } 1453 | } 1454 | + 1455 | + void reclaim_list_add(StorageImpl* storage) { 1456 | + std::lock_guard lock(mutex); 1457 | + size_t storage_size = round_size(storage->capacity()); 1458 | + stats.increaseInactive(storage_size); 1459 | + storage->lms_list_add(&reclaim_list); 1460 | + } 1461 | + 1462 | + bool reclaim_list_remove(StorageImpl* storage) { 1463 | + std::lock_guard lock(mutex); 1464 | + if (!storage->lms_list_remove()) 1465 | + return false; 1466 | + 1467 | + size_t storage_size = round_size(storage->capacity()); 1468 | + stats.decreaseInactive(storage_size); 1469 | + return true; 1470 | + } 1471 | + 1472 | + bool reclaim_one(size_t size, cudaEvent_t sync_event) { 1473 | + StorageImpl *best = nullptr; 1474 | + size_t best_size = ULONG_MAX; 1475 | + 1476 | + if (!reclaim_list.empty()) { 1477 | + auto hook = reclaim_list.head(); 1478 | + auto end = reclaim_list.terminator(); 1479 | + do { 1480 | + StorageImpl *storage = at::StorageImpl::from_list_hook(hook); 1481 | + hook = hook->next(); 1482 | + 1483 | + size_t storage_size = round_size(storage->capacity()); 1484 | + if (storage_size >= size && storage_size < best_size) { 1485 | + best = storage; 1486 | + best_size = storage_size; 1487 | + if (storage_size == size) 1488 | + break; 1489 | + } 1490 | + } while (hook != end); 1491 | + } 1492 | + 1493 | + if (best == nullptr) 1494 | + return false; 1495 | + 1496 | + stats.decreaseInactive(best_size, true); 1497 | + best->lms_list_remove(); 1498 | + best->lms_pageout(sync_event); 1499 | + best->lms_pageout_sync(); 1500 | + return true; 1501 | + } 1502 | + 1503 | + static inline void process_pageout_sync(at::IntrusiveList* iodone_queue) { 1504 | + while (!iodone_queue->empty()) { 1505 | + auto hook = iodone_queue->head(); 1506 | + StorageImpl *storage = at::StorageImpl::from_list_hook(hook); 1507 | + storage->lms_pageout_sync(iodone_queue); 1508 | + } 1509 | + } 1510 | + 1511 | + bool reclaim_fragments(size_t size, cudaEvent_t sync_event) { 1512 | + at::IntrusiveList iodone_queue; 1513 | + size_t alloc_size; 1514 | + int count = 0; 1515 | + 1516 | + if (!reclaim_list.empty()) { 1517 | + auto hook = reclaim_list.head(); 1518 | + auto end = reclaim_list.terminator(); 1519 | + do { 1520 | + StorageImpl *storage = at::StorageImpl::from_list_hook(hook); 1521 | + hook = hook->next(); 1522 | + 1523 | + CUDACachingAllocator::getBaseAllocation(storage->allocation_ptr(), &alloc_size); 1524 | + if (alloc_size >= size) { 1525 | + size_t storage_size = round_size(storage->capacity()); 1526 | + stats.decreaseInactive(storage_size, true); 1527 | + 
storage->lms_list_remove(); 1528 | + storage->lms_pageout(sync_event, &iodone_queue); 1529 | + count++; 1530 | + } 1531 | + } while (hook != end); 1532 | + } 1533 | + 1534 | + if (count == 0) 1535 | + return false; 1536 | + 1537 | + process_pageout_sync(&iodone_queue); 1538 | + return true; 1539 | + } 1540 | + 1541 | + bool reclaim_all(cudaEvent_t sync_event) { 1542 | + at::IntrusiveList iodone_queue; 1543 | + int count = 0; 1544 | + 1545 | + if (!reclaim_list.empty()) { 1546 | + auto hook = reclaim_list.head(); 1547 | + auto end = reclaim_list.terminator(); 1548 | + do { 1549 | + StorageImpl *storage = at::StorageImpl::from_list_hook(hook); 1550 | + hook = hook->next(); 1551 | + 1552 | + size_t storage_size = round_size(storage->capacity()); 1553 | + stats.decreaseInactive(storage_size, true); 1554 | + storage->lms_list_remove(); 1555 | + storage->lms_pageout(sync_event, &iodone_queue); 1556 | + count++; 1557 | + } while (hook != end); 1558 | + } 1559 | + 1560 | + if (count == 0) 1561 | + return false; 1562 | + 1563 | + process_pageout_sync(&iodone_queue); 1564 | + return true; 1565 | + } 1566 | + 1567 | + void reclaimInactive() 1568 | + { 1569 | + std::lock_guard lock(mutex); 1570 | + 1571 | + if (!reclaim_list.empty()) { 1572 | + cudaStream_t stream = cuda::getCurrentCUDAStream().stream(); 1573 | + cudaEvent_t sync_event; 1574 | + 1575 | + C10_CUDA_CHECK(cudaEventCreate(&sync_event)); 1576 | + C10_CUDA_CHECK(cudaEventRecord(sync_event, stream)); 1577 | + reclaim_all(sync_event); 1578 | + C10_CUDA_CHECK(cudaEventDestroy(sync_event)); 1579 | + } 1580 | + } 1581 | +}; 1582 | + 1583 | +struct THCCachingAllocator { 1584 | + std::mutex mutex; 1585 | + std::vector device_allocator; 1586 | + 1587 | + // allocated blocks by device pointer 1588 | + std::unordered_map allocated_blocks; 1589 | + 1590 | + // lock around calls to cudaFree (to prevent deadlocks with NCCL) 1591 | + std::mutex cuda_free_mutex; 1592 | + 1593 | + LMSSettings lms_settings; 1594 | + 1595 | + void init(int device_count, at::Allocator* host_allocator) { 1596 | + int size = device_allocator.size(); 1597 | + if (size < device_count) { 1598 | + device_allocator.resize(device_count); 1599 | + for (int i = size; i < device_count; i++) { 1600 | + device_allocator[i] = new DeviceCachingAllocator(); 1601 | + } 1602 | + } 1603 | + lms_settings.set_host_allocator(host_allocator); 1604 | + } 1605 | + 1606 | + void malloc(void** devPtr, size_t size, cudaStream_t stream) { 1607 | + int device; 1608 | + C10_CUDA_CHECK(cudaGetDevice(&device)); 1609 | + Block* block = device_allocator[device]->malloc(device, size, stream, &lms_settings); 1610 | + { 1611 | + std::lock_guard lock(mutex); 1612 | + allocated_blocks[block->ptr] = block; 1613 | + } 1614 | + *devPtr = (void*)block->ptr; 1615 | + } 1616 | + 1617 | + void free(void* ptr) { 1618 | + if (!ptr) { 1619 | + return; 1620 | + } 1621 | + Block* block = nullptr; 1622 | + { 1623 | + std::lock_guard lock(mutex); 1624 | + auto it = allocated_blocks.find(ptr); 1625 | + if (it == allocated_blocks.end()) { 1626 | + AT_ERROR("invalid device pointer: ", ptr); 1627 | + } 1628 | + block = it->second; 1629 | + allocated_blocks.erase(it); 1630 | + } 1631 | + device_allocator[block->device]->free(block); 1632 | + } 1633 | + 1634 | + void emptyCache() { 1635 | + int count = device_allocator.size(); 1636 | + for (int i = 0; i < count; i++) 1637 | + device_allocator[i]->emptyCache(); 1638 | + } 1639 | + 1640 | + Block* find_allocated_block(void *ptr) { 1641 | + std::lock_guard lock(mutex); 1642 | + auto it = 
allocated_blocks.find(ptr); 1643 | + if (it == allocated_blocks.end()) { 1644 | + return nullptr; 1645 | + } 1646 | + return it->second; 1647 | + } 1648 | + 1649 | + void* getBaseAllocation(void* ptr, size_t* outSize) 1650 | + { 1651 | + Block* block = find_allocated_block(ptr); 1652 | + if (!block) { 1653 | + AT_ERROR("invalid device pointer: %p", ptr); 1654 | + } 1655 | + return device_allocator[block->device]->getBaseAllocation(block, outSize); 1656 | + } 1657 | + 1658 | + void recordStream(void* ptr, cuda::CUDAStream stream) 1659 | + { 1660 | + Block* block = find_allocated_block(ptr); 1661 | + if (!block) { 1662 | + AT_ERROR("invalid device pointer: %p", ptr); 1663 | + } 1664 | + device_allocator[block->device]->recordStream(block, stream); 1665 | + } 1666 | + 1667 | + void cacheInfo(int dev_id, size_t* total, size_t* largest) { 1668 | + device_allocator[dev_id]->cacheInfo(total, largest); 1669 | + } 1670 | + 1671 | + void reclaimInactive() { 1672 | + int count = device_allocator.size(); 1673 | + for (int i = 0; i < count; i++) 1674 | + device_allocator[i]->reclaimInactive(); 1675 | + } 1676 | }; 1677 | 1678 | THCCachingAllocator caching_allocator; 1679 | 1680 | + 1681 | +#define LMS_INVALID_STREAM ((cudaStream_t)-1) 1682 | + 1683 | +struct CudaLMSImpl : public at::LMSImpl { 1684 | + CudaLMSImpl() : 1685 | + at::LMSImpl(caching_allocator.lms_settings.host_allocator()), 1686 | + stream_(LMS_INVALID_STREAM) {} 1687 | + ~CudaLMSImpl() {} 1688 | + 1689 | + void reclaim_list_add(at::IntrusiveListHook* hook) { 1690 | + at::StorageImpl* storage = at::StorageImpl::from_list_hook(hook); 1691 | + size_t size = storage->capacity(); 1692 | + size_t storage_size = DeviceCachingAllocator::round_size(size); 1693 | + if (size == 0 || !caching_allocator.lms_settings.enabled(storage_size)) 1694 | + return; 1695 | + int device = storage->device().index(); 1696 | + caching_allocator.device_allocator[device]->reclaim_list_add(storage); 1697 | + } 1698 | + 1699 | + bool reclaim_list_remove(at::IntrusiveListHook* hook) { 1700 | + at::StorageImpl* storage = at::StorageImpl::from_list_hook(hook); 1701 | + int device = storage->device().index(); 1702 | + return caching_allocator.device_allocator[device]->reclaim_list_remove(storage); 1703 | + } 1704 | + 1705 | + protected: 1706 | + cudaStream_t stream() const { 1707 | + AT_ASSERT(stream_ != LMS_INVALID_STREAM); 1708 | + return stream_; 1709 | + } 1710 | + 1711 | + void assign_stream() { 1712 | + if (stream_ == LMS_INVALID_STREAM) { 1713 | + stream_ = cuda::getLMSCUDAStream().stream(); 1714 | + } 1715 | + } 1716 | + 1717 | + void do_pagein(void* dst, void* src, size_t size) { 1718 | + C10_CUDA_CHECK(cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream())); 1719 | + } 1720 | + 1721 | + void do_pagein_sync() { 1722 | + C10_CUDA_CHECK(cudaStreamSynchronize(stream())); 1723 | + } 1724 | + 1725 | + void do_pageout(void* dst, void* src, size_t size, at::LMSSyncEvent_t sync_event) { 1726 | + assign_stream(); 1727 | + C10_CUDA_CHECK(cudaStreamWaitEvent(stream(), (cudaEvent_t)sync_event, 0)); 1728 | + C10_CUDA_CHECK(cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream())); 1729 | + } 1730 | + 1731 | + void do_pageout_sync() { 1732 | + C10_CUDA_CHECK(cudaStreamSynchronize(stream())); 1733 | + } 1734 | + 1735 | + cudaStream_t stream_; 1736 | +}; 1737 | + 1738 | + 1739 | static void CudaCachingDeleter(void* ptr) { 1740 | caching_allocator.free(ptr); 1741 | } 1742 | @@ -625,6 +958,9 @@ struct CudaCachingAllocator : public Allocator { 1743 | DeleterFnPtr 
raw_deleter() const override { 1744 | return &CudaCachingDeleter; 1745 | } 1746 | + at::LMSImpl* lms() const { 1747 | + return caching_allocator.lms_settings.enabled() ? new CudaLMSImpl() : nullptr; 1748 | + } 1749 | }; 1750 | 1751 | CudaCachingAllocator device_allocator; 1752 | @@ -634,6 +970,10 @@ Allocator* get(void) 1753 | return &device_allocator; 1754 | } 1755 | 1756 | +void init(int device_count, at::Allocator* host_allocator) { 1757 | + caching_allocator.init(device_count, host_allocator); 1758 | +} 1759 | + 1760 | void emptyCache(void) { 1761 | caching_allocator.emptyCache(); 1762 | } 1763 | @@ -665,37 +1005,107 @@ static inline void assertValidDevice(int device) { 1764 | uint64_t currentMemoryAllocated(int device) 1765 | { 1766 | assertValidDevice(device); 1767 | - return caching_allocator.get_stats_for_device(device).amount_allocated; 1768 | + return caching_allocator.device_allocator[device]->stats.amount_allocated; 1769 | } 1770 | 1771 | uint64_t maxMemoryAllocated(int device) { 1772 | assertValidDevice(device); 1773 | - return caching_allocator.get_stats_for_device(device).max_amount_allocated; 1774 | + return caching_allocator.device_allocator[device]->stats.max_amount_allocated; 1775 | } 1776 | 1777 | void resetMaxMemoryAllocated(int device) { 1778 | assertValidDevice(device); 1779 | - DeviceStats& stats = caching_allocator.get_stats_for_device(device); 1780 | + DeviceStats& stats = caching_allocator.device_allocator[device]->stats; 1781 | stats.max_amount_allocated = stats.amount_allocated; 1782 | } 1783 | 1784 | uint64_t currentMemoryCached(int device) 1785 | { 1786 | assertValidDevice(device); 1787 | - return caching_allocator.get_stats_for_device(device).amount_cached; 1788 | + return caching_allocator.device_allocator[device]->stats.amount_cached; 1789 | } 1790 | 1791 | uint64_t maxMemoryCached(int device) { 1792 | assertValidDevice(device); 1793 | - return caching_allocator.get_stats_for_device(device).max_amount_cached; 1794 | + return caching_allocator.device_allocator[device]->stats.max_amount_cached; 1795 | } 1796 | 1797 | void resetMaxMemoryCached(int device) { 1798 | assertValidDevice(device); 1799 | - DeviceStats& stats = caching_allocator.get_stats_for_device(device); 1800 | + DeviceStats& stats = caching_allocator.device_allocator[device]->stats; 1801 | stats.max_amount_cached = stats.amount_cached; 1802 | } 1803 | 1804 | +uint64_t currentMemoryActive(int device) 1805 | +{ 1806 | + assertValidDevice(device); 1807 | + return caching_allocator.device_allocator[device]->stats.amount_active(); 1808 | +} 1809 | + 1810 | +uint64_t maxMemoryActive(int device) { 1811 | + assertValidDevice(device); 1812 | + return caching_allocator.device_allocator[device]->stats.max_amount_active; 1813 | +} 1814 | + 1815 | +void resetMaxMemoryActive(int device) { 1816 | + assertValidDevice(device); 1817 | + DeviceStats& stats = caching_allocator.device_allocator[device]->stats; 1818 | + stats.max_amount_active = stats.amount_active(); 1819 | +} 1820 | + 1821 | +uint64_t currentMemoryReclaimed(int device) 1822 | +{ 1823 | + assertValidDevice(device); 1824 | + return caching_allocator.device_allocator[device]->stats.amount_reclaimed; 1825 | +} 1826 | + 1827 | +void resetMemoryReclaimed(int device) { 1828 | + assertValidDevice(device); 1829 | + DeviceStats& stats = caching_allocator.device_allocator[device]->stats; 1830 | + stats.amount_reclaimed = 0; 1831 | +} 1832 | + 1833 | +void currentAllocDistribution(int device, uint64_t* distribution) 1834 | +{ 1835 | + 
assertValidDevice(device); 1836 | + DeviceStats& stats = caching_allocator.device_allocator[device]->stats; 1837 | + stats.getAllocStats(distribution); 1838 | +} 1839 | + 1840 | +void resetAllocDistribution(int device) { 1841 | + assertValidDevice(device); 1842 | + DeviceStats& stats = caching_allocator.device_allocator[device]->stats; 1843 | + stats.resetAllocStats(); 1844 | +} 1845 | + 1846 | +void setUserEnabledLMS(bool enable) { 1847 | + caching_allocator.lms_settings.set_enabled(enable); 1848 | +} 1849 | + 1850 | +bool userEnabledLMS(void) { 1851 | + return caching_allocator.lms_settings.enabled(); 1852 | +} 1853 | + 1854 | +void setUserSizeLMS(size_t size) { 1855 | + caching_allocator.lms_settings.set_size(size); 1856 | +} 1857 | + 1858 | +size_t userSizeLMS(void) { 1859 | + return caching_allocator.lms_settings.size(); 1860 | +} 1861 | + 1862 | +void setUserLimitLMS(size_t limit) { 1863 | + caching_allocator.lms_settings.set_limit(limit); 1864 | +} 1865 | + 1866 | +size_t userLimitLMS(void) { 1867 | + return caching_allocator.lms_settings.limit(); 1868 | +} 1869 | + 1870 | +void reclaimInactive(void) { 1871 | + caching_allocator.reclaimInactive(); 1872 | +} 1873 | + 1874 | // 1875 | // In CUDA IPC, sender sends a tensor to receiver, getIpcDevPtr 1876 | // is called by the receiving process to map the CUDA memory from the sending 1877 | diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h 1878 | index 2376446a6f..21eed40c8d 100644 1879 | --- a/c10/cuda/CUDACachingAllocator.h 1880 | +++ b/c10/cuda/CUDACachingAllocator.h 1881 | @@ -3,6 +3,7 @@ 1882 | 1883 | #include 1884 | #include 1885 | +#include 1886 | #include 1887 | #include 1888 | 1889 | @@ -43,6 +44,7 @@ C10_CUDA_API void* raw_alloc(size_t nbytes); 1890 | C10_CUDA_API void raw_delete(void* ptr); 1891 | 1892 | C10_CUDA_API Allocator* get(); 1893 | +C10_CUDA_API void init(int device_count, at::Allocator* host_allocator); 1894 | C10_CUDA_API void emptyCache(); 1895 | C10_CUDA_API void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock); 1896 | C10_CUDA_API void* getBaseAllocation(void *ptr, size_t *size); 1897 | @@ -53,11 +55,37 @@ C10_CUDA_API void resetMaxMemoryAllocated(int device); 1898 | C10_CUDA_API uint64_t currentMemoryCached(int device); 1899 | C10_CUDA_API uint64_t maxMemoryCached(int device); 1900 | C10_CUDA_API void resetMaxMemoryCached(int device); 1901 | +C10_CUDA_API uint64_t currentMemoryActive(int device); 1902 | +C10_CUDA_API uint64_t maxMemoryActive(int device); 1903 | +C10_CUDA_API void resetMaxMemoryActive(int device); 1904 | +C10_CUDA_API uint64_t currentMemoryReclaimed(int device); 1905 | +C10_CUDA_API void resetMemoryReclaimed(int device); 1906 | +C10_CUDA_API void setUserEnabledLMS(bool enable); 1907 | +C10_CUDA_API bool userEnabledLMS(void); 1908 | +C10_CUDA_API void setUserSizeLMS(size_t size); 1909 | +C10_CUDA_API size_t userSizeLMS(void); 1910 | +C10_CUDA_API void setUserLimitLMS(size_t limit); 1911 | +C10_CUDA_API size_t userLimitLMS(void); 1912 | +C10_CUDA_API void reclaimInactive(); 1913 | 1914 | C10_CUDA_API std::mutex* getFreeMutex(); 1915 | 1916 | C10_CUDA_API std::shared_ptr getIpcDevPtr(std::string handle); 1917 | 1918 | +enum AllocSource { 1919 | + FREELIST, 1920 | + CUDAMALLOC_UNDER_LIMIT, 1921 | + RECLAIM_ONE, 1922 | + RECLAIM_FRAGMENTS, 1923 | + CUDAMALLOC_OVER_LIMIT, 1924 | + RECLAIM_ALL, 1925 | + CUDAMALLOC_PURGE, 1926 | + NUM_ALLOC_SOURCES 1927 | +}; 1928 | + 1929 | +C10_CUDA_API void currentAllocDistribution(int device, uint64_t* distribution); 
1930 | +C10_CUDA_API void resetAllocDistribution(int device); 1931 | + 1932 | } // namespace CUDACachingAllocator 1933 | 1934 | }} // namespace c10::cuda 1935 | diff --git a/c10/cuda/CUDAStream.cpp b/c10/cuda/CUDAStream.cpp 1936 | index 393826f75a..01e12ff83d 100644 1937 | --- a/c10/cuda/CUDAStream.cpp 1938 | +++ b/c10/cuda/CUDAStream.cpp 1939 | @@ -73,6 +73,13 @@ static std::array 1940 | static std::array 1941 | high_priority_streams[C10_COMPILE_TIME_MAX_GPUS]; 1942 | 1943 | +// LMS streams 1944 | +static constexpr unsigned int kLMSFlags = cudaStreamDefault; 1945 | +static std::once_flag device_flags_lms[C10_COMPILE_TIME_MAX_GPUS]; 1946 | +static std::atomic lms_counters[C10_COMPILE_TIME_MAX_GPUS]; 1947 | +static std::array 1948 | + lms_streams[C10_COMPILE_TIME_MAX_GPUS]; 1949 | + 1950 | // Note [StreamId assignment] 1951 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~ 1952 | // How do we assign stream IDs? 1953 | @@ -84,6 +91,7 @@ static std::array 1954 | // 00 = default stream 1955 | // 01 = low priority stream 1956 | // 10 = high priority stream 1957 | +// 11 = LMS stream 1958 | // 1959 | // This is not really for efficiency; it's just easier to write the code 1960 | // to extract the index if we do this with bitmasks :) 1961 | @@ -104,6 +112,7 @@ enum class StreamIdType : uint8_t { 1962 | DEFAULT = 0x0, 1963 | LOW = 0x1, 1964 | HIGH = 0x2, 1965 | + LMS = 0x3, 1966 | }; 1967 | 1968 | std::ostream& operator<<(std::ostream& stream, StreamIdType s) { 1969 | @@ -117,6 +126,9 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) { 1970 | case StreamIdType::HIGH: 1971 | stream << "HIGH"; 1972 | break; 1973 | + case StreamIdType::LMS: 1974 | + stream << "LMS"; 1975 | + break; 1976 | default: 1977 | stream << static_cast(s); 1978 | break; 1979 | @@ -178,6 +190,13 @@ static StreamId CUDAStream_getStreamId(const LeakyStreamInternals* ptr) { 1980 | StreamIdType::HIGH, ptr - high_priority_streams[device_index].data()); 1981 | } 1982 | 1983 | + // Check if it's a LMS stream 1984 | + if (pointer_within( 1985 | + ptr, lms_streams[device_index])) { 1986 | + return makeStreamId( 1987 | + StreamIdType::LMS, ptr - lms_streams[device_index].data()); 1988 | + } 1989 | + 1990 | AT_ASSERTM( 1991 | 0, 1992 | "Could not compute stream ID for ", 1993 | @@ -243,6 +262,21 @@ static void initDeviceStreamState(DeviceIndex device_index) { 1994 | } 1995 | } 1996 | 1997 | +// Creates the LMS stream pools for the specified device 1998 | +// Warning: only call once per device! 1999 | +static void initDeviceLMSStreamState(DeviceIndex device_index) { 2000 | + // Switches to the requested device so streams are properly associated 2001 | + // with it. 
2002 | + CUDAGuard device_guard{device_index}; 2003 | + 2004 | + for (auto i = decltype(kStreamsPerPool){0}; i < kStreamsPerPool; ++i) { 2005 | + auto& stream = lms_streams[device_index][i]; 2006 | + 2007 | + stream.device_index = device_index; 2008 | + C10_CUDA_CHECK(cudaStreamCreateWithFlags(&stream.stream, kLMSFlags)); 2009 | + } 2010 | +} 2011 | + 2012 | // Init front-end to ensure initialization only occurs once 2013 | static void initCUDAStreamsOnce() { 2014 | // Inits default streams (once, globally) 2015 | @@ -293,6 +327,8 @@ LeakyStreamInternals* CUDAStream_internals(CUDAStream s) { 2016 | return &low_priority_streams[device_index][si]; 2017 | case StreamIdType::HIGH: 2018 | return &high_priority_streams[device_index][si]; 2019 | + case StreamIdType::LMS: 2020 | + return &lms_streams[device_index][si]; 2021 | default: 2022 | AT_ASSERTM( 2023 | 0, 2024 | @@ -369,6 +405,20 @@ void setCurrentCUDAStream(CUDAStream stream) { 2025 | current_streams[ptr->device_index] = ptr; 2026 | } 2027 | 2028 | +CUDAStream getLMSCUDAStream(DeviceIndex device_index) { 2029 | + initCUDAStreamsOnce(); 2030 | + if (device_index == -1) 2031 | + device_index = current_device(); 2032 | + check_gpu(device_index); 2033 | + 2034 | + // Initializes the LMS stream pool (once) 2035 | + std::call_once( 2036 | + device_flags_lms[device_index], initDeviceLMSStreamState, device_index); 2037 | + 2038 | + const auto idx = get_idx(lms_counters[device_index]); 2039 | + return CUDAStream_fromInternals(&lms_streams[device_index][idx]); 2040 | +} 2041 | + 2042 | std::ostream& operator<<(std::ostream& stream, const CUDAStream& s) { 2043 | return stream << s.unwrap(); 2044 | } 2045 | diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h 2046 | index 7f7f8640ae..c3192f1063 100644 2047 | --- a/c10/cuda/CUDAStream.h 2048 | +++ b/c10/cuda/CUDAStream.h 2049 | @@ -213,6 +213,11 @@ CAFFE2_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1); 2050 | */ 2051 | CAFFE2_API void setCurrentCUDAStream(CUDAStream stream); 2052 | 2053 | +/** 2054 | + * Get a new stream from the CUDA stream pool for LMS. 
2055 | + */ 2056 | +CAFFE2_API CUDAStream getLMSCUDAStream(DeviceIndex device = -1); 2057 | + 2058 | C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s); 2059 | 2060 | } // namespace cuda 2061 | diff --git a/c10/util/IntrusiveList.h b/c10/util/IntrusiveList.h 2062 | new file mode 100644 2063 | index 0000000000..7e895416f4 2064 | --- /dev/null 2065 | +++ b/c10/util/IntrusiveList.h 2066 | @@ -0,0 +1,64 @@ 2067 | +//===--- IntrusiveList.h - --------------------------------------*- C++ -*-===// 2068 | + 2069 | +#pragma once 2070 | + 2071 | +#include "c10/util/Exception.h" 2072 | + 2073 | +namespace c10 { 2074 | + class IntrusiveListHook { 2075 | + public: 2076 | + IntrusiveListHook() { 2077 | + next_ = prev_ = this; 2078 | + } 2079 | + ~IntrusiveListHook() { 2080 | + remove(); 2081 | + } 2082 | + 2083 | + IntrusiveListHook(IntrusiveListHook&& other) : IntrusiveListHook() {} 2084 | + IntrusiveListHook& operator=(IntrusiveListHook&& other) { return *this; } 2085 | + 2086 | + bool attached() const { return next_ != this; } 2087 | + bool detached() const { return next_ == this; } 2088 | + 2089 | + void insertbefore(IntrusiveListHook *x) { 2090 | + if (x->attached()) { 2091 | + AT_ERROR("Double insertion of IntrusiveListHook"); 2092 | + } 2093 | + x->prev_ = prev_; 2094 | + x->next_ = this; 2095 | + prev_->next_ = x; 2096 | + prev_ = x; 2097 | + } 2098 | + 2099 | + bool remove() { 2100 | + if (!attached()) return false; 2101 | + 2102 | + prev_->next_ = next_; 2103 | + next_->prev_ = prev_; 2104 | + next_ = prev_ = this; 2105 | + return true; 2106 | + } 2107 | + IntrusiveListHook *next() const { return next_; } 2108 | + IntrusiveListHook *prev() const { return prev_; } 2109 | + 2110 | + private: 2111 | + IntrusiveListHook *next_; 2112 | + IntrusiveListHook *prev_; 2113 | + }; 2114 | + 2115 | + class IntrusiveList { 2116 | + public: 2117 | + IntrusiveList() {} 2118 | + ~IntrusiveList() {} 2119 | + bool empty() const { return anchor_.detached(); } 2120 | + void append(IntrusiveListHook *x) { anchor_.insertbefore(x); } 2121 | + void prepend(IntrusiveListHook *x) { anchor_.next()->insertbefore(x); } 2122 | + IntrusiveListHook *head() const { return anchor_.next(); } 2123 | + IntrusiveListHook *tail() const { return anchor_.prev(); } 2124 | + const IntrusiveListHook *terminator() const { return &anchor_; } 2125 | + 2126 | + private: 2127 | + IntrusiveListHook anchor_; 2128 | + }; 2129 | + 2130 | +} // end namespace c10 2131 | diff --git a/test/test_cuda.py b/test/test_cuda.py 2132 | index 5afcfd1b74..62aa668f01 100644 2133 | --- a/test/test_cuda.py 2134 | +++ b/test/test_cuda.py 2135 | @@ -2803,6 +2803,122 @@ class TestCuda(TestCase): 2136 | torch.DoubleTensor(a).cuda().round().cpu(), 2137 | torch.DoubleTensor(res).cpu()) 2138 | 2139 | + def test_large_model_support(self): 2140 | + device = torch.cuda.current_device() 2141 | + default_enabled = torch.cuda.get_enabled_lms() 2142 | + default_size = torch.cuda.get_size_lms() 2143 | + default_limit = torch.cuda.get_limit_lms() 2144 | + 2145 | + def alloc(*size): 2146 | + with torch.cuda.device(device): 2147 | + return torch.cuda.FloatTensor(*size).normal_() 2148 | + 2149 | + # 1. Test Inactive LMS Off 2150 | + # LMS Off / alloc multiple small and large 2151 | + # assert(active memory == allocated memory) 2152 | + # 2. 
Test Inactive LMS On 2153 | + # LMS On / alloc multiple small and large 2154 | + # assert(active memory < allocated memory) 2155 | + def _test_lms_enabled(enabled): 2156 | + torch.cuda.empty_cache() 2157 | + torch.cuda.set_enabled_lms(enabled) 2158 | + tensors = [alloc(32), alloc(128), alloc(10, 1024, 1024)] 2159 | + if not enabled: 2160 | + self.assertEqual(torch.cuda.memory_allocated(device), torch.cuda.memory_active(device)) 2161 | + else: 2162 | + self.assertGreater(torch.cuda.memory_allocated(device), torch.cuda.memory_active(device)) 2163 | + del tensors 2164 | + 2165 | + _test_lms_enabled(enabled=False) 2166 | + _test_lms_enabled(enabled=True) 2167 | + 2168 | + # 3. Test LMS Limit Swap 2169 | + # LMS On, limit 0 / alloc multiple small and large / record memory stats / alloc large 2170 | + # assert(allocated is unchanged) 2171 | + # 4. Test LMS Limit Alloc 2172 | + # LMS On, limit high / alloc multiple small and large / record memory stats / alloc large 2173 | + # assert(allocated has increased) 2174 | + def _test_lms_limit(zero): 2175 | + torch.cuda.empty_cache() 2176 | + torch.cuda.set_limit_lms(0 if zero else 1024*1024*1024) 2177 | + tensors = [alloc(32), alloc(128), alloc(10, 1024, 1024)] 2178 | + allocated = torch.cuda.memory_allocated(device) 2179 | + reclaimed = torch.cuda.memory_reclaimed(device) 2180 | + dist_before = torch.cuda.alloc_distribution(device) 2181 | + tensors.append(alloc(10, 1024, 1024)) 2182 | + dist_after = torch.cuda.alloc_distribution(device) 2183 | + if zero: 2184 | + self.assertEqual(torch.cuda.memory_allocated(device), allocated) 2185 | + self.assertGreater(torch.cuda.memory_reclaimed(device), reclaimed) 2186 | + self.assertEqual(dist_after['cudamalloc'], dist_before['cudamalloc']) 2187 | + self.assertGreater(dist_after['reclaim_one'], dist_before['reclaim_one']) 2188 | + else: 2189 | + self.assertGreater(torch.cuda.memory_allocated(device), allocated) 2190 | + self.assertEqual(torch.cuda.memory_reclaimed(device), reclaimed) 2191 | + self.assertGreater(dist_after['cudamalloc'], dist_before['cudamalloc']) 2192 | + self.assertEqual(dist_after['reclaim_one'], dist_before['reclaim_one']) 2193 | + del tensors 2194 | + 2195 | + _test_lms_limit(zero=True) 2196 | + _test_lms_limit(zero=False) 2197 | + torch.cuda.set_limit_lms(default_limit) 2198 | + 2199 | + # 5. Test LMS Size Threshold On 2200 | + # LMS On, size 1MB / record memory stats / alloc multiple small and large 2201 | + # assert(active memory has increased) 2202 | + # 6. Test LMS Size Threshold Off 2203 | + # LMS On, size 0 / record memory stats / alloc multiple small and large 2204 | + # assert(active memory is unchanged) 2205 | + def _test_lms_size(zero): 2206 | + torch.cuda.empty_cache() 2207 | + torch.cuda.set_size_lms(0 if zero else 1024*1024) 2208 | + active = torch.cuda.memory_active(device) 2209 | + tensors = [alloc(32), alloc(128), alloc(10, 1024, 1024)] 2210 | + if zero: 2211 | + self.assertEqual(torch.cuda.memory_active(device), active) 2212 | + else: 2213 | + self.assertGreater(torch.cuda.memory_active(device), active) 2214 | + del tensors 2215 | + 2216 | + _test_lms_size(zero=False) 2217 | + _test_lms_size(zero=True) 2218 | + torch.cuda.set_size_lms(default_size) 2219 | + 2220 | + # 7. 
Test LMS Page-out 2221 | + # LMS On / alloc multiple small and large / record memory stats / reclaim all 2222 | + # assert(allocated has decreased && active/cached are unchanged) 2223 | + torch.cuda.empty_cache() 2224 | + tensors = [alloc(32), alloc(128), alloc(10, 1024, 1024)] 2225 | + sums = list(map(torch.sum, tensors)) 2226 | + cached = torch.cuda.memory_cached(device) 2227 | + allocated = torch.cuda.memory_allocated(device) 2228 | + active = torch.cuda.memory_active(device) 2229 | + reclaimed = torch.cuda.memory_reclaimed(device) 2230 | + torch.cuda.reclaim_inactive() 2231 | + self.assertGreater(torch.cuda.memory_reclaimed(device), reclaimed) 2232 | + self.assertEqual(active, torch.cuda.memory_active(device)) 2233 | + self.assertEqual(cached, torch.cuda.memory_cached(device)) 2234 | + self.assertGreater(allocated, torch.cuda.memory_allocated(device)) 2235 | + 2236 | + # 8. Test LMS Page-in 2237 | + # Access tensors again 2238 | + # assert(tensor data is preserved during reclaim) 2239 | + # assert(allocated been restored && active/cached are still unchanged) 2240 | + dist_before = torch.cuda.alloc_distribution(device) 2241 | + sums2 = list(map(torch.sum, tensors)) 2242 | + dist_after = torch.cuda.alloc_distribution(device) 2243 | + self.assertEqual(sums, sums2) 2244 | + del sums2 2245 | + self.assertEqual(active, torch.cuda.memory_active(device)) 2246 | + self.assertEqual(cached, torch.cuda.memory_cached(device)) 2247 | + self.assertEqual(allocated, torch.cuda.memory_allocated(device)) 2248 | + self.assertGreater(dist_after['freelist'], dist_before['freelist']) 2249 | + self.assertEqual(dist_after['cudamalloc'], dist_before['cudamalloc']) 2250 | + del sums 2251 | + del tensors 2252 | + 2253 | + # Reset LMS state 2254 | + torch.cuda.set_enabled_lms(default_enabled) 2255 | 2256 | def load_ignore_file(): 2257 | from os.path import join, dirname 2258 | diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp 2259 | index ac46b6ca11..9ff50f16e9 100644 2260 | --- a/torch/csrc/cuda/Module.cpp 2261 | +++ b/torch/csrc/cuda/Module.cpp 2262 | @@ -334,6 +334,151 @@ PyObject * THCPModule_resetMaxMemoryCached(PyObject *_unused, PyObject *arg) 2263 | Py_RETURN_NONE; 2264 | } 2265 | 2266 | +PyObject * THCPModule_memoryActive(PyObject *_unused, PyObject *arg) 2267 | +{ 2268 | + HANDLE_TH_ERRORS 2269 | + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to memory_active"); 2270 | + int device = (int) THPUtils_unpackLong(arg); 2271 | + auto memory_active = c10::cuda::CUDACachingAllocator::currentMemoryActive(device); 2272 | + return PyLong_FromUnsignedLongLong(memory_active); 2273 | + END_HANDLE_TH_ERRORS 2274 | +} 2275 | + 2276 | +PyObject * THCPModule_maxMemoryActive(PyObject *_unused, PyObject *arg) 2277 | +{ 2278 | + HANDLE_TH_ERRORS 2279 | + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to max_memory_active"); 2280 | + int device = (int) THPUtils_unpackLong(arg); 2281 | + auto max_memory_active = c10::cuda::CUDACachingAllocator::maxMemoryActive(device); 2282 | + return PyLong_FromUnsignedLongLong(max_memory_active); 2283 | + END_HANDLE_TH_ERRORS 2284 | +} 2285 | + 2286 | +PyObject * THCPModule_resetMaxMemoryActive(PyObject *_unused, PyObject *arg) 2287 | +{ 2288 | + HANDLE_TH_ERRORS 2289 | + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to reset_max_memory_active"); 2290 | + int device = (int) THPUtils_unpackLong(arg); 2291 | + c10::cuda::CUDACachingAllocator::resetMaxMemoryActive(device); 2292 | + END_HANDLE_TH_ERRORS 2293 | + Py_RETURN_NONE; 2294 | +} 
2295 | + 2296 | +PyObject * THCPModule_memoryReclaimed(PyObject *_unused, PyObject *arg) 2297 | +{ 2298 | + HANDLE_TH_ERRORS 2299 | + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to memory_reclaimed"); 2300 | + int device = (int) THPUtils_unpackLong(arg); 2301 | + auto memory_reclaimed = c10::cuda::CUDACachingAllocator::currentMemoryReclaimed(device); 2302 | + return PyLong_FromUnsignedLongLong(memory_reclaimed); 2303 | + END_HANDLE_TH_ERRORS 2304 | +} 2305 | + 2306 | +PyObject * THCPModule_resetMemoryReclaimed(PyObject *_unused, PyObject *arg) 2307 | +{ 2308 | + HANDLE_TH_ERRORS 2309 | + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to reset_memory_reclaimed"); 2310 | + int device = (int) THPUtils_unpackLong(arg); 2311 | + c10::cuda::CUDACachingAllocator::resetMemoryReclaimed(device); 2312 | + END_HANDLE_TH_ERRORS 2313 | + Py_RETURN_NONE; 2314 | +} 2315 | + 2316 | +const char* const bucket_label[c10::cuda::CUDACachingAllocator::NUM_ALLOC_SOURCES] = { 2317 | + "freelist", 2318 | + "cudamalloc", 2319 | + "reclaim_one", 2320 | + "reclaim_fragments", 2321 | + "cudamalloc_over_limit", 2322 | + "reclaim_all", 2323 | + "cudamalloc_purge" 2324 | +}; 2325 | + 2326 | +PyObject * THCPModule_allocDistribution(PyObject *_unused, PyObject *arg) 2327 | +{ 2328 | + HANDLE_TH_ERRORS 2329 | + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to alloc_distribution"); 2330 | + int device = (int) THPUtils_unpackLong(arg); 2331 | + const int nbuckets = c10::cuda::CUDACachingAllocator::NUM_ALLOC_SOURCES; 2332 | + uint64_t counts[nbuckets]; 2333 | + c10::cuda::CUDACachingAllocator::currentAllocDistribution(device, counts); 2334 | + PyObject* distribution = PyDict_New(); 2335 | + for (int i = 0; i < nbuckets; i++) 2336 | + PyDict_SetItemString(distribution, bucket_label[i], PyLong_FromUnsignedLongLong(counts[i])); 2337 | + return distribution; 2338 | + END_HANDLE_TH_ERRORS 2339 | +} 2340 | + 2341 | +PyObject * THCPModule_resetAllocDistribution(PyObject *_unused, PyObject *arg) 2342 | +{ 2343 | + HANDLE_TH_ERRORS 2344 | + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to reset_alloc_distribution"); 2345 | + int device = (int) THPUtils_unpackLong(arg); 2346 | + c10::cuda::CUDACachingAllocator::resetAllocDistribution(device); 2347 | + END_HANDLE_TH_ERRORS 2348 | + Py_RETURN_NONE; 2349 | +} 2350 | + 2351 | +PyObject *THCPModule_setUserEnabledLMS(PyObject *_unused, PyObject *arg) 2352 | +{ 2353 | + HANDLE_TH_ERRORS 2354 | + THPUtils_assert(PyBool_Check(arg), "set_enabled_lms expects a bool, " 2355 | + "but got %s", THPUtils_typename(arg)); 2356 | + c10::cuda::CUDACachingAllocator::setUserEnabledLMS(arg == Py_True); 2357 | + Py_RETURN_NONE; 2358 | + END_HANDLE_TH_ERRORS 2359 | +} 2360 | + 2361 | +PyObject *THCPModule_userEnabledLMS(PyObject *_unused) 2362 | +{ 2363 | + HANDLE_TH_ERRORS 2364 | + if (c10::cuda::CUDACachingAllocator::userEnabledLMS()) Py_RETURN_TRUE; 2365 | + else Py_RETURN_FALSE; 2366 | + END_HANDLE_TH_ERRORS 2367 | +} 2368 | + 2369 | +PyObject *THCPModule_setUserSizeLMS(PyObject *_unused, PyObject *arg) 2370 | +{ 2371 | + HANDLE_TH_ERRORS 2372 | + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to set_size_lms"); 2373 | + size_t size = THPUtils_unpackLong(arg); 2374 | + c10::cuda::CUDACachingAllocator::setUserSizeLMS(size); 2375 | + Py_RETURN_NONE; 2376 | + END_HANDLE_TH_ERRORS 2377 | +} 2378 | + 2379 | +PyObject *THCPModule_userSizeLMS(PyObject *_unused) 2380 | +{ 2381 | + HANDLE_TH_ERRORS 2382 | + return 
PyLong_FromLong(c10::cuda::CUDACachingAllocator::userSizeLMS()); 2383 | + END_HANDLE_TH_ERRORS 2384 | +} 2385 | + 2386 | +PyObject *THCPModule_setUserLimitLMS(PyObject *_unused, PyObject *arg) 2387 | +{ 2388 | + HANDLE_TH_ERRORS 2389 | + THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to set_limit_lms"); 2390 | + size_t limit = THPUtils_unpackLong(arg); 2391 | + c10::cuda::CUDACachingAllocator::setUserLimitLMS(limit); 2392 | + Py_RETURN_NONE; 2393 | + END_HANDLE_TH_ERRORS 2394 | +} 2395 | + 2396 | +PyObject *THCPModule_userLimitLMS(PyObject *_unused) 2397 | +{ 2398 | + HANDLE_TH_ERRORS 2399 | + return PyLong_FromLong(c10::cuda::CUDACachingAllocator::userLimitLMS()); 2400 | + END_HANDLE_TH_ERRORS 2401 | +} 2402 | + 2403 | +PyObject * THCPModule_reclaimInactive(PyObject *_unused) 2404 | +{ 2405 | + HANDLE_TH_ERRORS 2406 | + c10::cuda::CUDACachingAllocator::reclaimInactive(); 2407 | + END_HANDLE_TH_ERRORS 2408 | + Py_RETURN_NONE; 2409 | +} 2410 | + 2411 | //////////////////////////////////////////////////////////////////////////////// 2412 | // Cuda module initialization 2413 | //////////////////////////////////////////////////////////////////////////////// 2414 | @@ -445,6 +590,20 @@ static struct PyMethodDef _THCPModule_methods[] = { 2415 | {"_cuda_memoryCached", (PyCFunction) THCPModule_memoryCached, METH_O, nullptr}, 2416 | {"_cuda_maxMemoryCached", (PyCFunction) THCPModule_maxMemoryCached, METH_O, nullptr}, 2417 | {"_cuda_resetMaxMemoryCached", (PyCFunction) THCPModule_resetMaxMemoryCached, METH_O, nullptr}, 2418 | + {"_cuda_memoryActive", (PyCFunction) THCPModule_memoryActive, METH_O, nullptr}, 2419 | + {"_cuda_maxMemoryActive", (PyCFunction) THCPModule_maxMemoryActive, METH_O, nullptr}, 2420 | + {"_cuda_resetMaxMemoryActive", (PyCFunction) THCPModule_resetMaxMemoryActive, METH_O, nullptr}, 2421 | + {"_cuda_memoryReclaimed", (PyCFunction) THCPModule_memoryReclaimed, METH_O, nullptr}, 2422 | + {"_cuda_resetMemoryReclaimed", (PyCFunction) THCPModule_resetMemoryReclaimed, METH_O, nullptr}, 2423 | + {"_cuda_allocDistribution", (PyCFunction) THCPModule_allocDistribution, METH_O, nullptr}, 2424 | + {"_cuda_resetAllocDistribution", (PyCFunction) THCPModule_resetAllocDistribution, METH_O, nullptr}, 2425 | + {"_cuda_getEnabledLMS", (PyCFunction)THCPModule_userEnabledLMS, METH_NOARGS, nullptr}, 2426 | + {"_cuda_setEnabledLMS", (PyCFunction)THCPModule_setUserEnabledLMS, METH_O, nullptr}, 2427 | + {"_cuda_getSizeLMS", (PyCFunction)THCPModule_userSizeLMS, METH_NOARGS, nullptr}, 2428 | + {"_cuda_setSizeLMS", (PyCFunction)THCPModule_setUserSizeLMS, METH_O, nullptr}, 2429 | + {"_cuda_getLimitLMS", (PyCFunction)THCPModule_userLimitLMS, METH_NOARGS, nullptr}, 2430 | + {"_cuda_setLimitLMS", (PyCFunction)THCPModule_setUserLimitLMS, METH_O, nullptr}, 2431 | + {"_cuda_reclaimInactive", (PyCFunction) THCPModule_reclaimInactive, METH_NOARGS, nullptr}, 2432 | {"_cuda_manualSeed", (PyCFunction)THCPModule_manualSeed, METH_O, nullptr}, 2433 | {"_cuda_manualSeedAll", (PyCFunction)THCPModule_manualSeedAll, METH_O, nullptr}, 2434 | {"_cuda_seed", (PyCFunction)THCPModule_seed, METH_NOARGS, nullptr}, 2435 | diff --git a/torch/csrc/generic/serialization.cpp b/torch/csrc/generic/serialization.cpp 2436 | index f4e47a436c..940fa8757d 100644 2437 | --- a/torch/csrc/generic/serialization.cpp 2438 | +++ b/torch/csrc/generic/serialization.cpp 2439 | @@ -20,7 +20,7 @@ void THPStorage_(writeFileRaw)(THWStorage *self, io fd) 2440 | #else 2441 | std::unique_ptr cpu_data(new char[size * sizeof(scalar_t)]); 2442 | data = 
(scalar_t*)cpu_data.get(); 2443 | - THCudaCheck(cudaMemcpy(data, THWStorage_(data)(LIBRARY_STATE self), size * sizeof(scalar_t), cudaMemcpyDeviceToHost)); 2444 | + THCStorage_copy_to_host(LIBRARY_STATE self, data); 2445 | #endif 2446 | doWrite(fd, &size, sizeof(int64_t)); 2447 | // fast track for bytes and little endian 2448 | diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py 2449 | index 94fa9b6b4f..ba7012094b 100644 2450 | --- a/torch/cuda/__init__.py 2451 | +++ b/torch/cuda/__init__.py 2452 | @@ -546,6 +546,181 @@ def reset_max_memory_cached(device=None): 2453 | return torch._C._cuda_resetMaxMemoryCached(device) 2454 | 2455 | 2456 | +def memory_active(device=None): 2457 | + r"""Returns the current GPU memory occupied by active tensors in bytes for a given 2458 | + device. 2459 | + 2460 | + Arguments: 2461 | + device (torch.device or int, optional): selected device. Returns 2462 | + statistic for the current device, given by :func:`~torch.cuda.current_device`, 2463 | + if :attr:`device` is ``None`` (default). 2464 | + 2465 | + .. note:: 2466 | + When Large Model Support is enabled, this should be less than the total amount of 2467 | + GPU memory occupied by tensors. 2468 | + """ 2469 | + device = _get_device_index(device, optional=True) 2470 | + return torch._C._cuda_memoryActive(device) 2471 | + 2472 | + 2473 | +def max_memory_active(device=None): 2474 | + r"""Returns the maximum GPU memory occupied by active tensors in bytes for a given 2475 | + device. 2476 | + 2477 | + By default, this returns the peak active memory since the beginning of 2478 | + this program. :func:`~torch.cuda.reset_max_memory_active` can be used to 2479 | + reset the starting point in tracking this metric. For example, these two 2480 | + functions can measure the peak active memory usage of each iteration in a 2481 | + training loop. 2482 | + 2483 | + Arguments: 2484 | + device (torch.device or int, optional): selected device. Returns 2485 | + statistic for the current device, given by :func:`~torch.cuda.current_device`, 2486 | + if :attr:`device` is ``None`` (default). 2487 | + """ 2488 | + device = _get_device_index(device, optional=True) 2489 | + return torch._C._cuda_maxMemoryActive(device) 2490 | + 2491 | + 2492 | +def reset_max_memory_active(device=None): 2493 | + r"""Resets the starting point in tracking maximum GPU memory occupied by 2494 | + active tensors for a given device. 2495 | + 2496 | + See :func:`~torch.cuda.max_memory_allocated` for details. 2497 | + 2498 | + Arguments: 2499 | + device (torch.device or int, optional): selected device. Returns 2500 | + statistic for the current device, given by :func:`~torch.cuda.current_device`, 2501 | + if :attr:`device` is ``None`` (default). 2502 | + """ 2503 | + device = _get_device_index(device, optional=True) 2504 | + return torch._C._cuda_resetMaxMemoryActive(device) 2505 | + 2506 | + 2507 | +def memory_reclaimed(device=None): 2508 | + r"""Returns the total GPU memory transferred to the host by Large Model Support 2509 | + in bytes for a given device. 2510 | + 2511 | + Arguments: 2512 | + device (torch.device or int, optional): selected device. Returns 2513 | + statistic for the current device, given by :func:`~torch.cuda.current_device`, 2514 | + if :attr:`device` is ``None`` (default). 2515 | + 2516 | + .. note:: 2517 | + This will be non-zero only when Large Model Support is enabled. 
2518 | + """ 2519 | + device = _get_device_index(device, optional=True) 2520 | + return torch._C._cuda_memoryReclaimed(device) 2521 | + 2522 | +def reset_memory_reclaimed(device=None): 2523 | + r"""Resets the starting point in tracking the total GPU memory transfered to the host 2524 | + by Large Model Support. 2525 | + 2526 | + See :func:`~torch.cuda.memory_reclaimed` for details. 2527 | + 2528 | + Arguments: 2529 | + device (torch.device or int, optional): selected device. Returns 2530 | + statistic for the current device, given by :func:`~torch.cuda.current_device`, 2531 | + if :attr:`device` is ``None`` (default). 2532 | + """ 2533 | + device = _get_device_index(device, optional=True) 2534 | + return torch._C._cuda_resetMemoryReclaimed(device) 2535 | + 2536 | +def alloc_distribution(device=None): 2537 | + r"""Returns a histogram (encoded as a python dictionary) showing the distribution of allocation 2538 | + sources for a given device. Each allocation satisfied by the CUDA Caching Allocator is retrieved 2539 | + from a particular source. The allocation distribution counts the number of allocations satisfied 2540 | + from each source. 2541 | + 2542 | + The set of possible sources are: 2543 | + 2544 | + * `'freelist'` 2545 | + * `'cudamalloc'` 2546 | + * `'reclaim_one'` 2547 | + * `'reclaim_fragments'` 2548 | + * `'cudamalloc_over_limit'` 2549 | + * `'reclaim_all'` 2550 | + * `'cudamalloc_purge'` 2551 | + 2552 | + Arguments: 2553 | + device (torch.device or int, optional): selected device. Returns 2554 | + statistic for the current device, given by :func:`~torch.cuda.current_device`, 2555 | + if :attr:`device` is ``None`` (default). 2556 | + 2557 | + .. note:: 2558 | + The `reclaim_one`, `reclaim_fragments`, `cudamalloc_over_limit`, and `reclaim_all` allocation 2559 | + sources are applicable only when Large Model Support is enabled. 2560 | + """ 2561 | + 2562 | + device = _get_device_index(device, optional=True) 2563 | + return torch._C._cuda_allocDistribution(device) 2564 | + 2565 | +def reset_alloc_distribution(device=None): 2566 | + r"""Resets the starting point in tracking the distribution of allocation sources. 2567 | + 2568 | + See :func:`~torch.cuda.alloc_distribution` for details. 2569 | + 2570 | + Arguments: 2571 | + device (torch.device or int, optional): selected device. Returns 2572 | + statistic for the current device, given by :func:`~torch.cuda.current_device`, 2573 | + if :attr:`device` is ``None`` (default). 2574 | + """ 2575 | + device = _get_device_index(device, optional=True) 2576 | + return torch._C._cuda_resetAllocDistribution(device) 2577 | + 2578 | +def set_enabled_lms(enable): 2579 | + r"""Enable/disable Large Model Support. 2580 | + 2581 | + Arguments: 2582 | + enable (bool): desired LMS setting. 2583 | + """ 2584 | + torch._C._cuda_setEnabledLMS(enable) 2585 | + 2586 | + 2587 | +def get_enabled_lms(): 2588 | + r"""Returns a bool indicating if Large Model Support is currently enabled.""" 2589 | + return torch._C._cuda_getEnabledLMS() 2590 | + 2591 | + 2592 | +def set_size_lms(size): 2593 | + r"""Mininum size (in bytes) for LMS. 2594 | + 2595 | + Arguments: 2596 | + size (integer): Any memory block larger than this will be subject to LMS optimization. 2597 | + """ 2598 | + torch._C._cuda_setSizeLMS(size) 2599 | + 2600 | + 2601 | +def get_size_lms(): 2602 | + r"""Returns the minimum size (in bytes) for LMS.""" 2603 | + return torch._C._cuda_getSizeLMS() 2604 | + 2605 | + 2606 | +def set_limit_lms(limit): 2607 | + r"""Allocation limit (in bytes) for LMS. 
2608 | + 2609 | + Arguments: 2610 | + limit (integer): LMS limit on device memory. 2611 | + """ 2612 | + torch._C._cuda_setLimitLMS(limit) 2613 | + 2614 | + 2615 | +def get_limit_lms(): 2616 | + r"""Returns the limit (in bytes) for LMS.""" 2617 | + return torch._C._cuda_getLimitLMS() 2618 | + 2619 | + 2620 | +def reclaim_inactive(): 2621 | + r"""Swaps the memory of all inactive tensors out to the host so that their device memory 2622 | + can be returned to the caching allocator. 2623 | + 2624 | + .. note:: 2625 | + The set of inactive tensors is maintained only when Large Model Support is enabled. 2626 | + """ 2627 | + if _initialized: 2628 | + torch._C._cuda_reclaimInactive() 2629 | + 2630 | + 2631 | def _host_allocator(): 2632 | _lazy_init() 2633 | return torch._C._cuda_cudaHostAllocator() 2634 | -- 2635 | 2.21.0 (Apple Git-122) 2636 | 2637 | --------------------------------------------------------------------------------
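
Usage note: the Python controls this patch adds (torch.cuda.set_enabled_lms, set_size_lms, set_limit_lms, memory_active, memory_reclaimed, alloc_distribution, reclaim_inactive) are easiest to understand when exercised together. The sketch below is illustrative only and assumes a CUDA build with this patch applied; the tensor shapes, the 1 MB threshold, and the zero limit are arbitrary choices modeled on test_large_model_support, not recommendations.

import torch

# Illustrative usage sketch; assumes a CUDA build with this LMS patch applied.
device = torch.cuda.current_device()

torch.cuda.set_enabled_lms(True)        # turn Large Model Support on
torch.cuda.set_size_lms(1024 * 1024)    # blocks larger than 1 MB become eligible for LMS
torch.cuda.set_limit_lms(0)             # limit 0: reclaim inactive tensors rather than grow the device pool

tensors = [torch.cuda.FloatTensor(10, 1024, 1024).normal_() for _ in range(4)]

print("allocated:", torch.cuda.memory_allocated(device))
print("active:   ", torch.cuda.memory_active(device))      # at most `allocated` once storages go inactive
print("reclaimed:", torch.cuda.memory_reclaimed(device))    # bytes paged out to the host so far

# Page all inactive tensors out to the host, then touch them again;
# the allocator pages the data back in on demand.
torch.cuda.reclaim_inactive()
checksum = sum(t.sum() for t in tensors)

print("alloc sources:", torch.cuda.alloc_distribution(device))

Per the patch's own test, a limit of 0 keeps memory_allocated flat and routes new allocations through the 'reclaim_one' source instead of additional cudaMalloc calls.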
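
The max_memory_active docstring notes that the reset_* entry points can scope these statistics to a single iteration of a training loop. A minimal sketch of that pattern follows; train_step and the random batches are placeholders belonging to this sketch, not to the patch.

import torch

def train_step(batch):
    # Placeholder for a real forward/backward/optimizer step.
    return (batch.cuda() * 2).sum()

device = torch.cuda.current_device()
torch.cuda.set_enabled_lms(True)

for step, batch in enumerate(torch.randn(4, 1024, 1024) for _ in range(3)):
    torch.cuda.reset_max_memory_active(device)
    torch.cuda.reset_memory_reclaimed(device)
    torch.cuda.reset_alloc_distribution(device)

    loss = train_step(batch)

    dist = torch.cuda.alloc_distribution(device)
    print("step %d: peak active %d B, reclaimed %d B, reclaim_one hits %d" %
          (step, torch.cuda.max_memory_active(device),
           torch.cuda.memory_reclaimed(device), dist['reclaim_one']))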
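
c10/util/IntrusiveList.h above implements the reclaim list as a sentinel-based, doubly linked intrusive list: a detached hook points at itself, insertbefore splices a hook in ahead of another, and the list's anchor is a sentinel whose prev is the tail. The Python analogue below is illustrative only (the class and method names are this sketch's, not the patch's); it mirrors the pointer manipulation rather than the C++ class.

class ListHook(object):
    """Node that links itself into a list; detached means it points at itself."""
    def __init__(self):
        self.next = self.prev = self

    def attached(self):
        return self.next is not self

    def insert_before(self, x):
        # Splice x in immediately before self (mirrors IntrusiveListHook::insertbefore).
        assert not x.attached(), "double insertion"
        x.prev, x.next = self.prev, self
        self.prev.next = x
        self.prev = x

    def remove(self):
        # Unlink and return to the detached (self-pointing) state.
        if not self.attached():
            return False
        self.prev.next, self.next.prev = self.next, self.prev
        self.next = self.prev = self
        return True

class List(object):
    """Sentinel-based list: the anchor node never carries data."""
    def __init__(self):
        self.anchor = ListHook()

    def empty(self):
        return not self.anchor.attached()

    def append(self, x):            # insert before the anchor == insert at the tail
        self.anchor.insert_before(x)

    def prepend(self, x):           # insert before the current head
        self.anchor.next.insert_before(x)

    def head(self):
        return self.anchor.next

    def tail(self):
        return self.anchor.prev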
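
CudaLMSImpl::do_pageout in the allocator changes above orders its work as: take the LMS stream, wait on the caller-provided sync event, then issue an asynchronous device-to-host copy, with do_pageout_sync synchronizing the stream afterwards. The sketch below reproduces that ordering with PyTorch's public stream/event API purely as an illustration; it is not how the patch performs the copy (the patch works inside the C++ allocator with cudaStreamWaitEvent and cudaMemcpyAsync), and the variable names are this sketch's own.

import torch

device = torch.device('cuda')
copy_stream = torch.cuda.Stream()            # stands in for a stream from the LMS pool

t = torch.randn(1024, 1024, device=device)   # produced on the current (compute) stream
ready = torch.cuda.Event()
ready.record()                               # marks the point after t's producer was enqueued

host_buf = torch.empty(t.shape, dtype=t.dtype, pin_memory=True)
with torch.cuda.stream(copy_stream):
    copy_stream.wait_event(ready)            # do not start the copy before t is valid
    host_buf.copy_(t, non_blocking=True)     # async D2H copy, like cudaMemcpyAsync in do_pageout

copy_stream.synchronize()                    # analogous to do_pageout_sync's cudaStreamSynchronize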