├── .github └── workflows │ ├── ci.yml │ └── python-publish.yml ├── .gitignore ├── .pylintrc ├── HISTORY.md ├── LICENSE ├── Makefile ├── README.md ├── examples ├── example.py └── pair_testing.sh ├── gpu_tester ├── __init__.py ├── ddp_worker.py ├── main.py ├── simple_forward_worker.py └── world_info_from_env.py ├── mypy.ini ├── requirements-test.txt ├── requirements.txt ├── setup.py └── tests └── test_main.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous integration 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python 3.8 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: 3.8 20 | - name: Install 21 | run: | 22 | python3 -m venv .env 23 | source .env/bin/activate 24 | python -m pip install -U pip 25 | make install-dev 26 | - name: Lint 27 | run: | 28 | source .env/bin/activate 29 | make lint 30 | tests: 31 | runs-on: ubuntu-latest 32 | strategy: 33 | matrix: 34 | python-version: [3.7, 3.8] 35 | 36 | steps: 37 | - uses: actions/checkout@v2 38 | - name: Set up Python ${{ matrix.python-version }} 39 | uses: actions/setup-python@v2 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | - name: Install 43 | run: | 44 | python3 -m venv .env 45 | source .env/bin/activate 46 | make install 47 | make install-dev 48 | - name: Unit tests 49 | run: | 50 | source .env/bin/activate 51 | make test 52 | 53 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions-ecosystem/action-regex-match@v2 13 | id: regex-match 14 | with: 15 | text: ${{ github.event.head_commit.message }} 16 | regex: '^Release ([^ ]+)' 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.8' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Release 26 | if: ${{ steps.regex-match.outputs.match != '' }} 27 | uses: softprops/action-gh-release@v1 28 | with: 29 | tag_name: ${{ steps.regex-match.outputs.group1 }} 30 | - name: Build and publish 31 | if: ${{ steps.regex-match.outputs.match != '' }} 32 | env: 33 | TWINE_USERNAME: __token__ 34 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 35 | run: | 36 | python setup.py sdist bdist_wheel 37 | twine upload dist/* 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | .nenv 3 | .vscode 4 | .env 5 | __pycache__ 6 | .envtest 7 | .coverage* 8 | .env* 9 | wandb 10 | *.pex 11 | .pexing 12 | .mypy_cache 13 | results 14 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Profiled execution. 
11 | profile=no 12 | 13 | # Add files or directories to the blacklist. They should be base names, not 14 | # paths. 15 | ignore=CVS 16 | 17 | # Pickle collected data for later comparisons. 18 | persistent=yes 19 | 20 | # List of plugins (as comma separated values of python modules names) to load, 21 | # usually to register additional checkers. 22 | load-plugins= 23 | 24 | 25 | [MESSAGES CONTROL] 26 | 27 | # Enable the message, report, category or checker with the given id(s). You can 28 | # either give multiple identifier separated by comma (,) or put this option 29 | # multiple time. See also the "--disable" option for examples. 30 | enable=indexing-exception,old-raise-syntax 31 | 32 | # Disable the message, report, category or checker with the given id(s). You 33 | # can either give multiple identifiers separated by comma (,) or put this 34 | # option multiple times (only on the command line, not in the configuration 35 | # file where it should appear only once).You can also use "--disable=all" to 36 | # disable everything first and then reenable specific checks. For example, if 37 | # you want to run only the similarities checker, you can use "--disable=all 38 | # --enable=similarities". If you want to run only the classes checker, but have 39 | # no Warning level messages displayed, use"--disable=all --enable=classes 40 | # --disable=W" 41 | disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330 42 | 43 | 44 | # Set the cache size for astng objects. 45 | cache-size=500 46 | 47 | 48 | [REPORTS] 49 | 50 | # Set the output format. Available formats are text, parseable, colorized, msvs 51 | # (visual studio) and html. You can also give a reporter class, eg 52 | # mypackage.mymodule.MyReporterClass. 53 | output-format=text 54 | 55 | # Put messages in a separate file for each module / package specified on the 56 | # command line instead of printing them on stdout. Reports (if any) will be 57 | # written in a file name "pylint_global.[txt|html]". 58 | files-output=no 59 | 60 | # Tells whether to display a full report or only the messages 61 | reports=no 62 | 63 | # Python expression which should return a note less than 10 (10 is the highest 64 | # note). You have access to the variables errors warning, statement which 65 | # respectively contain the number of errors / warnings messages and the total 66 | # number of statements analyzed. This is used by the global evaluation report 67 | # (RP0004). 68 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 69 | 70 | # Add a comment according to your evaluation note. This is used by the global 71 | # evaluation report (RP0004). 72 | comment=no 73 | 74 | # Template used to display messages. This is a python new-style format string 75 | # used to format the message information. See doc for all details 76 | #msg-template= 77 | 78 | 79 | [TYPECHECK] 80 | 81 | # Tells whether missing members accessed in mixin class should be ignored. A 82 | # mixin class is detected if its name ends with "mixin" (case insensitive). 
83 | ignore-mixin-members=yes 84 | 85 | # List of classes names for which member attributes should not be checked 86 | # (useful for classes with attributes dynamically set). 87 | ignored-classes=SQLObject 88 | 89 | # When zope mode is activated, add a predefined set of Zope acquired attributes 90 | # to generated-members. 91 | zope=no 92 | 93 | # List of members which are set dynamically and missed by pylint inference 94 | # system, and so shouldn't trigger E0201 when accessed. Python regular 95 | # expressions are accepted. 96 | generated-members=REQUEST,acl_users,aq_parent 97 | 98 | # List of decorators that create context managers from functions, such as 99 | # contextlib.contextmanager. 100 | contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager 101 | 102 | 103 | [VARIABLES] 104 | 105 | # Tells whether we should check for unused import in __init__ files. 106 | init-import=no 107 | 108 | # A regular expression matching the beginning of the name of dummy variables 109 | # (i.e. not used). 110 | dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) 111 | 112 | # List of additional names supposed to be defined in builtins. Remember that 113 | # you should avoid to define new builtins when possible. 114 | additional-builtins= 115 | 116 | 117 | [BASIC] 118 | 119 | # Required attributes for module, separated by a comma 120 | required-attributes= 121 | 122 | # List of builtins function names that should not be used, separated by a comma 123 | bad-functions=apply,input,reduce 124 | 125 | 126 | # Disable the report(s) with the given id(s). 127 | # All non-Google reports are disabled by default. 128 | disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923 129 | 130 | # Regular expression which should only match correct module names 131 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 132 | 133 | # Regular expression which should only match correct module level names 134 | const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ 135 | 136 | # Regular expression which should only match correct class names 137 | class-rgx=^_?[A-Z][a-zA-Z0-9]*$ 138 | 139 | # Regular expression which should only match correct function names 140 | function-rgx=^(?:(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ 141 | 142 | # Regular expression which should only match correct method names 143 | method-rgx=^(?:(?P__[a-z0-9_]+__|next)|(?P_{0,2}[A-Z][a-zA-Z0-9]*)|(?P_{0,2}[a-z][a-z0-9_]*))$ 144 | 145 | # Regular expression which should only match correct instance attribute names 146 | attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ 147 | 148 | # Regular expression which should only match correct argument names 149 | argument-rgx=^[a-z][a-z0-9_]*$ 150 | 151 | # Regular expression which should only match correct variable names 152 | variable-rgx=^[a-z][a-z0-9_]*$ 153 | 154 | # Regular expression which should only match correct attribute names in class 155 | # bodies 156 | class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ 157 | 158 | # Regular expression which should only match correct list comprehension / 159 | # generator expression variable names 160 | inlinevar-rgx=^[a-z][a-z0-9_]*$ 161 | 162 | # Good variable names which should always be accepted, separated by a comma 163 | good-names=main,_ 164 | 165 | # Bad variable names which should always be refused, separated by a comma 166 | bad-names= 167 | 168 | # Regular expression which should only match function or class 
names that do 169 | # not require a docstring. 170 | no-docstring-rgx=(__.*__|main) 171 | 172 | # Minimum line length for functions/classes that require docstrings, shorter 173 | # ones are exempt. 174 | docstring-min-length=10 175 | 176 | 177 | [FORMAT] 178 | 179 | # Maximum number of characters on a single line. 180 | max-line-length=120 181 | 182 | # Regexp for a line that is allowed to be longer than the limit. 183 | ignore-long-lines=(?x) 184 | (^\s*(import|from)\s 185 | |\$Id:\s\/\/depot\/.+#\d+\s\$ 186 | |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+') 187 | |^\s*\#\ LINT\.ThenChange 188 | |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$ 189 | |pylint 190 | |""" 191 | |\# 192 | |lambda 193 | |(https?|ftp):) 194 | 195 | # Allow the body of an if to be on the same line as the test if there is no 196 | # else. 197 | single-line-if-stmt=y 198 | 199 | # List of optional constructs for which whitespace checking is disabled 200 | no-space-check= 201 | 202 | # Maximum number of lines in a module 203 | max-module-lines=99999 204 | 205 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 206 | # tab). 207 | indent-string=' ' 208 | 209 | 210 | [SIMILARITIES] 211 | 212 | # Minimum lines number of a similarity. 213 | min-similarity-lines=4 214 | 215 | # Ignore comments when computing similarities. 216 | ignore-comments=yes 217 | 218 | # Ignore docstrings when computing similarities. 219 | ignore-docstrings=yes 220 | 221 | # Ignore imports when computing similarities. 222 | ignore-imports=no 223 | 224 | 225 | [MISCELLANEOUS] 226 | 227 | # List of note tags to take in consideration, separated by a comma. 228 | notes= 229 | 230 | 231 | [IMPORTS] 232 | 233 | # Deprecated modules which should not be used, separated by a comma 234 | deprecated-modules=regsub,TERMIOS,Bastion,rexec,sets 235 | 236 | # Create a graph of every (i.e. internal and external) dependencies in the 237 | # given file (report RP0402 must not be disabled) 238 | import-graph= 239 | 240 | # Create a graph of external dependencies in the given file (report RP0402 must 241 | # not be disabled) 242 | ext-import-graph= 243 | 244 | # Create a graph of internal dependencies in the given file (report RP0402 must 245 | # not be disabled) 246 | int-import-graph= 247 | 248 | extension-pkg-whitelist=_jsonnet 249 | 250 | 251 | [CLASSES] 252 | 253 | # List of interface methods to ignore, separated by a comma. This is used for 254 | # instance to not check methods defines in Zope's Interface base class. 255 | ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by 256 | 257 | # List of method names used to declare (i.e. assign) instance attributes. 258 | defining-attr-methods=__init__,__new__,setUp 259 | 260 | # List of valid names for the first argument in a class method. 261 | valid-classmethod-first-arg=cls,class_ 262 | 263 | # List of valid names for the first argument in a metaclass class method. 264 | valid-metaclass-classmethod-first-arg=mcs 265 | 266 | 267 | [DESIGN] 268 | 269 | # Maximum number of arguments for function / method 270 | max-args=5 271 | 272 | # Argument names that match this expression will be ignored. 
Default to name 273 | # with leading underscore 274 | ignored-argument-names=_.* 275 | 276 | # Maximum number of locals for function / method body 277 | max-locals=15 278 | 279 | # Maximum number of return / yield for function / method body 280 | max-returns=6 281 | 282 | # Maximum number of branch for function / method body 283 | max-branches=12 284 | 285 | # Maximum number of statements in function / method body 286 | max-statements=50 287 | 288 | # Maximum number of parents for a class (see R0901). 289 | max-parents=7 290 | 291 | # Maximum number of attributes for a class (see R0902). 292 | max-attributes=7 293 | 294 | # Minimum number of public methods for a class (see R0903). 295 | min-public-methods=2 296 | 297 | # Maximum number of public methods for a class (see R0904). 298 | max-public-methods=20 299 | 300 | 301 | [EXCEPTIONS] 302 | 303 | # Exceptions that will emit a warning when being caught. Defaults to 304 | # "Exception" 305 | overgeneral-exceptions=Exception,StandardError,BaseException 306 | 307 | 308 | [AST] 309 | 310 | # Maximum line length for lambdas 311 | short-func-length=1 312 | 313 | # List of module members that should be marked as deprecated. 314 | # All of the string functions are listed in 4.1.4 Deprecated string functions 315 | # in the Python 2.4 docs. 316 | deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc 317 | 318 | 319 | [DOCSTRING] 320 | 321 | # List of exceptions that do not need to be mentioned in the Raises section of 322 | # a docstring. 323 | ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError 324 | 325 | 326 | 327 | [TOKENS] 328 | 329 | # Number of spaces of indent required when the last token on the preceding line 330 | # is an open (, [, or {. 
331 | indent-after-paren=4 -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | ## 1.2.0 2 | 3 | * add job_account 4 | * change timeout a bit 5 | * avoid doing too many sbatch at once 6 | * aggregate the output per node 7 | 8 | ## 1.1.1 9 | 10 | * Make ddp worker wait a bit to avoid stopping before having time to start all 11 | 12 | ## 1.1.0 13 | 14 | * implement pair based testing 15 | 16 | ## 1.0.2 17 | 18 | * job_comment rather than sbatch_args 19 | 20 | ## 1.0.1 21 | 22 | * sbatch_args param 23 | 24 | ## 1.0.0 25 | 26 | * support forward test 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Romain Beaumont 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: ## [Local development] Upgrade pip, install requirements, install package. 2 | python -m pip install -U pip 3 | python -m pip install -e . 4 | 5 | install-dev: ## [Local development] Install test requirements 6 | python -m pip install -r requirements-test.txt 7 | 8 | lint: ## [Local development] Run mypy, pylint and black 9 | python -m mypy gpu_tester 10 | python -m pylint gpu_tester 11 | python -m black --check -l 120 gpu_tester 12 | 13 | black: ## [Local development] Auto-format python code using black 14 | python -m black -l 120 . 15 | 16 | build-pex: 17 | python3 -m venv .pexing 18 | . .pexing/bin/activate && python -m pip install -U pip && python -m pip install pex 19 | . .pexing/bin/activate && python -m pex setuptools . 
-o gpu_tester.pex -v
20 | 	rm -rf .pexing
21 | 
22 | test: ## [Local development] Run unit tests
23 | 	python -m pytest -x -s -v tests
24 | 
25 | .PHONY: help
26 | 
27 | help: # Run `make help` to get help on the make commands
28 | 	@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gpu_tester
2 | [![pypi](https://img.shields.io/pypi/v/gpu_tester.svg)](https://pypi.python.org/pypi/gpu_tester)
3 | 
4 | Gpu tester finds all your bad gpus.
5 | 
6 | Works on slurm.
7 | 
8 | Features:
9 | * does a forward pass on each gpu
10 | * checks for gpus returning incorrect results
11 | * checks for gpus failing due to ECC errors
12 | 
13 | Roadmap:
14 | * sanity check forward speed
15 | * sanity check broadcast speed
16 | 
17 | ## Install
18 | 
19 | Create a venv:
20 | 
21 | ```
22 | python3 -m venv .env
23 | source .env/bin/activate
24 | pip install -U pip
25 | ```
26 | 
27 | Then:
28 | ```
29 | pip3 install torch --extra-index-url https://download.pytorch.org/whl/cu116
30 | pip install gpu_tester
31 | ```
32 | 
33 | ## Python examples
34 | 
35 | Check out these examples to call this as a lib:
36 | * [example.py](examples/example.py)
37 | 
38 | ## Output
39 | 
40 | Output looks like this:
41 | 
42 | ```
43 | job succeeded
44 | 0 have incorrect results, 1 have gpu errors and 319 succeeded
45 | incorrect results:
46 | []
47 | gpu errors:
48 | [['gpu_error', 'compute-od-gpu-st-p4d-24xlarge-156', '3']]
49 | ```
50 | 
51 | ## Recommended testing strategy
52 | 
53 | ### Pair based strategy
54 | 
55 | The easiest way to quickly spot broken nodes is the pair-based strategy.
56 | It runs many jobs in parallel and finds which nodes can talk together.
57 | Here is one example:
58 | ```
59 | gpu_tester --nodes 2 --parallel-tests 50 --job_comment laion --partition "gpu" --test_kind "ddp" --job_timeout 45 --exclude 'gpu-st-p4d-24xlarge-[66]'
60 | ```
61 | 
62 | ### All at once strategy
63 | 
64 | Once you have validated this works, you may want to try the DDP strategy over all nodes, e.g.:
65 | ```
66 | gpu_tester --nodes 100 --parallel-tests 1 --job_comment laion --partition "gpu" --test_kind "ddp" --job_timeout 300 --exclude 'gpu-st-p4d-24xlarge-[66]'
67 | ```
68 | 
69 | ### Simple forward
70 | 
71 | If you only want to validate the forward functionality of gpus and not the communication, you may use:
72 | 
73 | ```
74 | gpu_tester --nodes 100 --parallel-tests 1 --job_comment laion --partition "gpu" --test_kind "simple_forward" --job_timeout 50 --exclude 'gpu-st-p4d-24xlarge-[66]'
75 | ```
76 | 
77 | 
78 | ## API
79 | 
80 | This module exposes a single function `gpu_tester` which takes the same arguments as the command line tool (a usage sketch follows this list):
81 | 
82 | * **cluster** the cluster. (default *slurm*)
83 | * **job_name** slurm job name. (default *gpu_tester*)
84 | * **partition** slurm partition. (default *compute-od-gpu*)
85 | * **gpu_per_node** number of gpus per node. (default *8*)
86 | * **nodes** number of gpu nodes. (default *1*)
87 | * **output_folder** the output folder. (default *None* which means current folder / results)
88 | * **job_timeout** job timeout (default *150* seconds)
89 | * **job_comment** optional comment arg given to slurm (default *None*)
90 | * **job_account** optional account arg given to slurm (default *None*)
91 | * **test_kind** simple_forward or ddp. simple_forward is a quick forward test; ddp uses pytorch DDP to check the gpu interconnect (default *simple_forward*)
92 | * **parallel_tests** number of tests to run in parallel. Recommended to use with nodes == 2 to test pair by pair (default *1*)
93 | * **nodelist** node whitelist, example 'gpu-st-p4d-24xlarge-[66-67]' (default *None*)
94 | * **exclude** node blacklist, example 'gpu-st-p4d-24xlarge-[66-67]' (default *None*)
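For instance, the pair-based ddp run shown above can also be launched from Python. This is a minimal sketch using only the parameters documented in the list; the partition, comment and exclude values are placeholders taken from the command line example and should be adapted to your cluster:

```
from gpu_tester import gpu_tester

# run 50 two-node ddp tests in parallel; slurm values below are placeholders
gpu_tester(
    cluster="slurm",
    nodes=2,
    parallel_tests=50,
    gpu_per_node=8,
    partition="gpu",  # adapt to your slurm partition
    job_comment="laion",  # optional, passed to slurm as a comment
    test_kind="ddp",
    job_timeout=45,
    exclude="gpu-st-p4d-24xlarge-[66]",  # optional node blacklist
)
```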
95 | 
96 | ## For development
97 | 
98 | Either locally, or in [gitpod](https://gitpod.io/#https://github.com/rom1504/gpu_tester) (do `export PIP_USER=false` there)
99 | 
100 | Setup a virtualenv:
101 | 
102 | ```
103 | python3 -m venv .env
104 | source .env/bin/activate
105 | pip install -e .
106 | ```
107 | 
108 | to run tests:
109 | ```
110 | pip install -r requirements-test.txt
111 | ```
112 | then
113 | ```
114 | make lint
115 | make test
116 | ```
117 | 
118 | You can use `make black` to reformat the code.
119 | 
120 | `python -m pytest -x -s -v tests -k "dummy"` runs a specific test.
121 | 
--------------------------------------------------------------------------------
/examples/example.py:
--------------------------------------------------------------------------------
1 | from gpu_tester import gpu_tester
2 | 
3 | # this call will start a test on these gpu nodes
4 | gpu_tester(
5 |     cluster="slurm",
6 |     nodelist="compute-od-gpu-st-p4d-24xlarge-[10-20]",
7 |     partition="compute-od-gpu",
8 |     gpu_per_node=8,
9 |     nodes=11,
10 |     output_folder=None,
11 | )
12 | 
--------------------------------------------------------------------------------
/examples/pair_testing.sh:
--------------------------------------------------------------------------------
1 | gpu_tester --nodes 2 --parallel-tests 60 --job_account openclip --partition "g40423" --test_kind "ddp" --job_timeout 150
2 | 
--------------------------------------------------------------------------------
/gpu_tester/__init__.py:
--------------------------------------------------------------------------------
1 | """gpu_tester"""
2 | 
3 | from gpu_tester.main import main
4 | from gpu_tester.main import gpu_tester
5 | 
--------------------------------------------------------------------------------
/gpu_tester/ddp_worker.py:
--------------------------------------------------------------------------------
1 | """ddp worker"""
2 | 
3 | import torch
4 | import torch.distributed as dist
5 | from torch import nn
6 | from torch import optim
7 | from torch.nn.parallel import DistributedDataParallel as DDP
8 | import os
9 | import socket
10 | import time
11 | from .world_info_from_env import world_info_from_env
12 | 
13 | torch.manual_seed(0)
14 | 
15 | 
16 | def main():
17 |     """example"""
18 |     local_rank, global_rank, world_size = world_info_from_env()
19 |     os.environ["LOCAL_RANK"] = str(local_rank)
20 |     os.environ["RANK"] = str(global_rank)
21 |     os.environ["WORLD_SIZE"] = str(world_size)
22 | 
23 |     hostname = socket.gethostname()
24 |     try:
25 |         # create default process group
26 |         dist.init_process_group("nccl", rank=global_rank, world_size=world_size)
27 |         # create local model
28 |         model = nn.Linear(1000, 1000).to(local_rank)
29 |         # construct DDP model
30 |         ddp_model = DDP(model, device_ids=[local_rank])
31 |         # define loss function and optimizer
32 |         loss_fn = nn.MSELoss()
33 |         optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
34 | 
35 |         inputs = torch.randn(200, 1000).to(local_rank)
36 |         labels = torch.randn(200, 1000).to(local_rank)
37 |         # warmup
38 |         for _ in range(100):
39 |             outputs = ddp_model(inputs)
40 |             loss_fn(outputs, 
labels).backward() 41 | optimizer.step() 42 | # measure 43 | t = time.time() 44 | for _ in range(1000): 45 | outputs = ddp_model(inputs) 46 | loss_fn(outputs, labels).backward() 47 | optimizer.step() 48 | d = time.time() - t 49 | print("result", hostname, local_rank, outputs.detach().cpu().numpy()[0][0], d) 50 | time.sleep(45) 51 | except RuntimeError as err: 52 | print("gpu_error", hostname, local_rank) 53 | print(err) 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /gpu_tester/main.py: -------------------------------------------------------------------------------- 1 | """gpu tester""" 2 | 3 | import os 4 | import fire 5 | import subprocess 6 | import time 7 | from multiprocessing.pool import ThreadPool 8 | 9 | 10 | def is_job_finished(job_id): 11 | status = subprocess.check_output(["squeue", "-j", job_id]).decode("utf8") 12 | print(f"job status is {status}") 13 | return status == "slurm_load_jobs error: Invalid job id specified" or len(status.split("\n")) == 2 14 | 15 | 16 | def wait_for_job_to_finish(job_id, timeout=30): 17 | t = time.time() 18 | while 1: 19 | if time.time() - t > timeout: 20 | return False 21 | time.sleep(1) 22 | if is_job_finished(job_id): 23 | return True 24 | 25 | 26 | def start_job(sbatch_file): 27 | """start job""" 28 | args = ["sbatch"] 29 | args.append(sbatch_file) 30 | sbatch_output = subprocess.check_output(args).decode("utf8") 31 | lines = sbatch_output.split("\n") 32 | 33 | lines = [line for line in lines if "Submitted" in line] 34 | if len(lines) == 0: 35 | raise ValueError(f"slurm sbatch failed: {sbatch_output}") 36 | 37 | parsed_sbatch = lines[0].split(" ") 38 | job_id = parsed_sbatch[3].strip() 39 | return job_id 40 | 41 | 42 | def get_boilerplate(): 43 | return """ 44 | module load openmpi 45 | module load cuda/11.7 46 | 47 | # sent to sub script 48 | export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` 49 | echo hosts $HOSTNAMES 50 | export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 51 | export MASTER_PORT=12802 52 | export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` 53 | 54 | echo go $COUNT_NODE 55 | """ 56 | 57 | 58 | def generate_sbatch( 59 | job_name, partition, nodes, gpu_per_node, output_file, job_comment, test_kind, nodelist, exclude, job_account 60 | ): 61 | """generate sbatch""" 62 | ntasks_per_node = gpu_per_node 63 | constant_boilerplate = get_boilerplate() 64 | venv = os.environ["VIRTUAL_ENV"] 65 | scomment = ("--comment " + job_comment) if job_comment is not None else "" 66 | sbatch_scomment = ("#SBATCH --comment " + job_comment) if job_comment is not None else "" 67 | sbatch_saccount = ("#SBATCH --account " + job_account) if job_account is not None else "" 68 | worker = test_kind + "_worker" 69 | nodelist = ("#SBATCH --nodelist " + nodelist) if nodelist is not None else "" 70 | exclude = ("#SBATCH --exclude " + exclude) if exclude is not None else "" 71 | 72 | return f"""#!/bin/bash 73 | #SBATCH --partition={partition} 74 | #SBATCH --job-name={job_name} 75 | #SBATCH --nodes {nodes} 76 | #SBATCH --ntasks-per-node {ntasks_per_node} 77 | #SBATCH --output={output_file} 78 | #SBATCH --exclusive 79 | {sbatch_scomment} 80 | {sbatch_saccount} 81 | {nodelist} 82 | {exclude} 83 | 84 | {constant_boilerplate} 85 | 86 | source {venv}/bin/activate 87 | 88 | 89 | srun {scomment} --cpu_bind=v --accel-bind=gn python -m gpu_tester.{worker} 90 | """ 91 | 92 | 93 | def run_test( 94 | output_folder, 95 | 
job_name, 96 | partition, 97 | nodes, 98 | gpu_per_node, 99 | job_comment, 100 | job_timeout, 101 | test_kind, 102 | nodelist, 103 | exclude, 104 | job_account, 105 | ): 106 | """run test""" 107 | 108 | if not os.path.isdir(output_folder): 109 | os.mkdir(output_folder) 110 | 111 | tmp_file = output_folder + "/sbatch_output" 112 | sbatch_content = generate_sbatch( 113 | job_name, partition, nodes, gpu_per_node, tmp_file, job_comment, test_kind, nodelist, exclude, job_account 114 | ) 115 | sbatch_file = output_folder + "/sbatch_file" 116 | with open(sbatch_file, "w", encoding="utf8") as f: 117 | f.write(sbatch_content) 118 | 119 | print("starting job") 120 | job_id = start_job(sbatch_file) 121 | print(f"waiting for job {job_id}") 122 | status = wait_for_job_to_finish(job_id, job_timeout) 123 | if not status: 124 | print(f"canceling {job_id}") 125 | subprocess.check_output(["scancel", job_id]).decode("utf8") 126 | status = wait_for_job_to_finish(job_id) 127 | print("job cancelled") 128 | else: 129 | print("job succeeded") 130 | 131 | with open(tmp_file, "r", encoding="utf8") as f: 132 | result_output = f.read() 133 | 134 | results = result_output.split("\n") 135 | hosts = [h for h in results if "hosts" in h] 136 | if len(hosts) == 0: 137 | raise ValueError("failed" + result_output) 138 | hosts = hosts[0].split(" ")[1:] 139 | hosts_gpus = [h + " " + str(gpu) for gpu in range(8) for h in hosts] 140 | 141 | status_dict = {} 142 | 143 | for h in hosts_gpus: 144 | status_dict[h] = ("no_answer", "") 145 | 146 | error_gpu = [r for r in results if "gpu_error" in r] 147 | error_gpus = [r.split(" ") for r in error_gpu] 148 | 149 | for r in error_gpus: 150 | status_dict[r[1] + " " + r[2]] = ("gpu_error", " ".join(r[3:])) 151 | 152 | real_results = [r for r in results if "result" in r] 153 | 154 | parsed_results = [r.split(" ") for r in real_results] 155 | 156 | if test_kind == "simple_forward": 157 | expected_value = "24954.1" 158 | expected_delay = 5 159 | elif test_kind == "ddp": 160 | expected_value = None 161 | expected_delay = 5 162 | 163 | for r in parsed_results: 164 | name = r[1] + " " + r[2] 165 | if expected_value is not None and abs(float(r[3]) - float(expected_value)) > 0.01: 166 | status_dict[name] = ("wrong", str(r[3])) 167 | elif float(r[4]) > expected_delay: 168 | status_dict[name] = ("slow", str(r[4])) 169 | else: 170 | status_dict[name] = ("success", "") 171 | 172 | return status_dict 173 | 174 | 175 | def display_results(status_dict): 176 | """display results""" 177 | 178 | per_node = {} 179 | for gpu, status in status_dict.items(): 180 | per_node[gpu.split(" ")[0]] = status 181 | success = [x for x, y in per_node.items() if y[0] == "success"] 182 | slow = [x for x, y in per_node.items() if y[0] == "slow"] 183 | wrong = [x for x, y in per_node.items() if y[0] == "wrong"] 184 | gpu_error = [x for x, y in per_node.items() if y[0] == "gpu_error"] 185 | no_answer = [x for x, y in per_node.items() if y[0] == "no_answer"] 186 | 187 | print( 188 | f"""on a total of {len(per_node)}: 189 | * {len(wrong)} have incorrect results 190 | * {len(slow)} have slow results 191 | * {len(gpu_error)} have gpu errors 192 | * {len(no_answer)} did not answer 193 | * {len(success)} succeeded""" 194 | ) 195 | 196 | print("slow results:") 197 | print(",".join(slow)) 198 | 199 | print("incorrect results:") 200 | print(",".join(wrong)) 201 | 202 | print("gpu errors:") 203 | print(",".join(gpu_error)) 204 | 205 | print("no_answer:") 206 | print(",".join(no_answer)) 207 | 208 | 209 | def gpu_tester( 210 | 
cluster="slurm", 211 | job_name="gpu_tester", 212 | partition="compute-od-gpu", 213 | gpu_per_node=8, 214 | nodes=1, 215 | output_folder=None, 216 | job_timeout=150, 217 | job_comment=None, 218 | test_kind="simple_forward", 219 | parallel_tests=1, 220 | nodelist=None, 221 | exclude=None, 222 | job_account=None, 223 | ): 224 | """gpu tester main function""" 225 | if cluster != "slurm": 226 | raise ValueError("only slurm is supported currently") 227 | if output_folder is None: 228 | output_folder = os.getcwd() + "/results" 229 | if not os.path.isdir(output_folder): 230 | os.mkdir(output_folder) 231 | 232 | def wait_then_run(wait_time, params): 233 | time.sleep(wait_time) 234 | return run_test(**params) 235 | 236 | all_results = {} 237 | with ThreadPool(parallel_tests) as p: 238 | for result in p.imap_unordered( 239 | lambda x: wait_then_run( 240 | wait_time=7 * (x // 10), # 10 concurrent, first wait 0, second wait 7, third 14, ... 241 | params={ 242 | "output_folder": output_folder + "/" + str(x), 243 | "job_name": job_name, 244 | "partition": partition, 245 | "nodes": nodes, 246 | "gpu_per_node": gpu_per_node, 247 | "job_comment": job_comment, 248 | "job_timeout": job_timeout, 249 | "test_kind": test_kind, 250 | "nodelist": nodelist, 251 | "exclude": exclude, 252 | "job_account": job_account, 253 | }, 254 | ), 255 | range(parallel_tests), 256 | ): 257 | all_results.update(result) 258 | 259 | display_results(all_results) 260 | 261 | 262 | def main(): 263 | fire.Fire(gpu_tester) 264 | 265 | 266 | if __name__ == "__main__": 267 | main() 268 | -------------------------------------------------------------------------------- /gpu_tester/simple_forward_worker.py: -------------------------------------------------------------------------------- 1 | """worker running in each gpu""" 2 | 3 | import torch 4 | import socket 5 | import time 6 | from .world_info_from_env import world_info_from_env 7 | 8 | torch.manual_seed(0) 9 | 10 | 11 | def main(): 12 | local_rank, _, _ = world_info_from_env() 13 | 14 | hostname = socket.gethostname() 15 | try: 16 | device = torch.device(f"cuda:{local_rank}") 17 | torch.cuda.set_device(device) 18 | 19 | vector1 = torch.rand(1, 100000, device=device) 20 | vector2 = torch.rand(1, 100000, device=device) 21 | t = time.time() 22 | dot = (vector1 @ vector2.T).cpu().numpy() 23 | d = time.time() - t 24 | 25 | print("result", hostname, local_rank, dot[0][0], d) 26 | except RuntimeError as _: 27 | print("gpu_error", hostname, local_rank) 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /gpu_tester/world_info_from_env.py: -------------------------------------------------------------------------------- 1 | """world info from env""" 2 | 3 | import os 4 | 5 | 6 | def world_info_from_env(): 7 | """world info from env""" 8 | local_rank = 0 9 | for v in ("LOCAL_RANK", "MPI_LOCALRANKID", "SLURM_LOCALID", "OMPI_COMM_WORLD_LOCAL_RANK"): 10 | if v in os.environ: 11 | local_rank = int(os.environ[v]) 12 | break 13 | global_rank = 0 14 | for v in ("RANK", "PMI_RANK", "SLURM_PROCID", "OMPI_COMM_WORLD_RANK"): 15 | if v in os.environ: 16 | global_rank = int(os.environ[v]) 17 | break 18 | world_size = 1 19 | for v in ("WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS", "OMPI_COMM_WORLD_SIZE"): 20 | if v in os.environ: 21 | world_size = int(os.environ[v]) 22 | break 23 | 24 | return local_rank, global_rank, world_size 25 | -------------------------------------------------------------------------------- /mypy.ini: 
--------------------------------------------------------------------------------
1 | # Global options:
2 | 
3 | [mypy]
4 | python_version = 3.8
5 | ignore_missing_imports = True
6 | 
--------------------------------------------------------------------------------
/requirements-test.txt:
--------------------------------------------------------------------------------
1 | black==22.3.0
2 | mypy==0.942
3 | pylint==2.13.4
4 | pytest-cov==3.0.0
5 | pytest-xdist==2.5.0
6 | pytest==7.0.1
7 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fire
2 | numpy
3 | torch
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from pathlib import Path
3 | import os
4 | 
5 | if __name__ == "__main__":
6 |     with Path(Path(__file__).parent, "README.md").open(encoding="utf-8") as file:
7 |         long_description = file.read()
8 | 
9 |     def _read_reqs(relpath):
10 |         fullpath = os.path.join(os.path.dirname(__file__), relpath)
11 |         with open(fullpath) as f:
12 |             return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))]
13 | 
14 |     REQUIREMENTS = _read_reqs("requirements.txt")
15 | 
16 |     setup(
17 |         name="gpu_tester",
18 |         packages=find_packages(),
19 |         include_package_data=True,
20 |         version="1.2.0",
21 |         license="MIT",
22 |         description="Gpu tester finds all your bad gpus, on slurm",
23 |         long_description=long_description,
24 |         long_description_content_type="text/markdown",
25 |         author="Romain Beaumont",
26 |         author_email="romain.rom1@gmail.com",
27 |         entry_points={"console_scripts": ["gpu_tester = gpu_tester:main"]},
28 |         url="https://github.com/rom1504/gpu_tester",
29 |         data_files=[(".", ["README.md"])],
30 |         keywords=["machine learning"],
31 |         install_requires=REQUIREMENTS,
32 |         classifiers=[
33 |             "Development Status :: 4 - Beta",
34 |             "Intended Audience :: Developers",
35 |             "Topic :: Scientific/Engineering :: Artificial Intelligence",
36 |             "License :: OSI Approved :: MIT License",
37 |             "Programming Language :: Python :: 3.6",
38 |         ],
39 |     )
40 | 
--------------------------------------------------------------------------------
/tests/test_main.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from gpu_tester import gpu_tester
3 | 
4 | 
5 | def test_fake():
6 |     print("hi!")
7 | 
--------------------------------------------------------------------------------