├── .github └── workflows │ ├── ci.yml │ └── python-publish.yml ├── .gitignore ├── .pylintrc ├── HISTORY.md ├── LICENSE ├── Makefile ├── README.md ├── examples ├── example.py └── pair_testing.sh ├── gpu_tester ├── __init__.py ├── ddp_worker.py ├── main.py ├── simple_forward_worker.py └── world_info_from_env.py ├── mypy.ini ├── requirements-test.txt ├── requirements.txt ├── setup.py └── tests └── test_main.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous integration 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python 3.8 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: 3.8 20 | - name: Install 21 | run: | 22 | python3 -m venv .env 23 | source .env/bin/activate 24 | python -m pip install -U pip 25 | make install-dev 26 | - name: Lint 27 | run: | 28 | source .env/bin/activate 29 | make lint 30 | tests: 31 | runs-on: ubuntu-latest 32 | strategy: 33 | matrix: 34 | python-version: [3.7, 3.8] 35 | 36 | steps: 37 | - uses: actions/checkout@v2 38 | - name: Set up Python ${{ matrix.python-version }} 39 | uses: actions/setup-python@v2 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | - name: Install 43 | run: | 44 | python3 -m venv .env 45 | source .env/bin/activate 46 | make install 47 | make install-dev 48 | - name: Unit tests 49 | run: | 50 | source .env/bin/activate 51 | make test 52 | 53 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions-ecosystem/action-regex-match@v2 13 | id: regex-match 14 | with: 15 | text: ${{ github.event.head_commit.message }} 16 | regex: '^Release ([^ ]+)' 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.8' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Release 26 | if: ${{ steps.regex-match.outputs.match != '' }} 27 | uses: softprops/action-gh-release@v1 28 | with: 29 | tag_name: ${{ steps.regex-match.outputs.group1 }} 30 | - name: Build and publish 31 | if: ${{ steps.regex-match.outputs.match != '' }} 32 | env: 33 | TWINE_USERNAME: __token__ 34 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 35 | run: | 36 | python setup.py sdist bdist_wheel 37 | twine upload dist/* 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | .nenv 3 | .vscode 4 | .env 5 | __pycache__ 6 | .envtest 7 | .coverage* 8 | .env* 9 | wandb 10 | *.pex 11 | .pexing 12 | .mypy_cache 13 | results 14 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Profiled execution. 
11 | profile=no 12 | 13 | # Add files or directories to the blacklist. They should be base names, not 14 | # paths. 15 | ignore=CVS 16 | 17 | # Pickle collected data for later comparisons. 18 | persistent=yes 19 | 20 | # List of plugins (as comma separated values of python modules names) to load, 21 | # usually to register additional checkers. 22 | load-plugins= 23 | 24 | 25 | [MESSAGES CONTROL] 26 | 27 | # Enable the message, report, category or checker with the given id(s). You can 28 | # either give multiple identifier separated by comma (,) or put this option 29 | # multiple time. See also the "--disable" option for examples. 30 | enable=indexing-exception,old-raise-syntax 31 | 32 | # Disable the message, report, category or checker with the given id(s). You 33 | # can either give multiple identifiers separated by comma (,) or put this 34 | # option multiple times (only on the command line, not in the configuration 35 | # file where it should appear only once).You can also use "--disable=all" to 36 | # disable everything first and then reenable specific checks. For example, if 37 | # you want to run only the similarities checker, you can use "--disable=all 38 | # --enable=similarities". If you want to run only the classes checker, but have 39 | # no Warning level messages displayed, use"--disable=all --enable=classes 40 | # --disable=W" 41 | disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330 42 | 43 | 44 | # Set the cache size for astng objects. 45 | cache-size=500 46 | 47 | 48 | [REPORTS] 49 | 50 | # Set the output format. Available formats are text, parseable, colorized, msvs 51 | # (visual studio) and html. You can also give a reporter class, eg 52 | # mypackage.mymodule.MyReporterClass. 53 | output-format=text 54 | 55 | # Put messages in a separate file for each module / package specified on the 56 | # command line instead of printing them on stdout. Reports (if any) will be 57 | # written in a file name "pylint_global.[txt|html]". 58 | files-output=no 59 | 60 | # Tells whether to display a full report or only the messages 61 | reports=no 62 | 63 | # Python expression which should return a note less than 10 (10 is the highest 64 | # note). You have access to the variables errors warning, statement which 65 | # respectively contain the number of errors / warnings messages and the total 66 | # number of statements analyzed. This is used by the global evaluation report 67 | # (RP0004). 68 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 69 | 70 | # Add a comment according to your evaluation note. This is used by the global 71 | # evaluation report (RP0004). 72 | comment=no 73 | 74 | # Template used to display messages. This is a python new-style format string 75 | # used to format the message information. See doc for all details 76 | #msg-template= 77 | 78 | 79 | [TYPECHECK] 80 | 81 | # Tells whether missing members accessed in mixin class should be ignored. A 82 | # mixin class is detected if its name ends with "mixin" (case insensitive). 
83 | ignore-mixin-members=yes 84 | 85 | # List of classes names for which member attributes should not be checked 86 | # (useful for classes with attributes dynamically set). 87 | ignored-classes=SQLObject 88 | 89 | # When zope mode is activated, add a predefined set of Zope acquired attributes 90 | # to generated-members. 91 | zope=no 92 | 93 | # List of members which are set dynamically and missed by pylint inference 94 | # system, and so shouldn't trigger E0201 when accessed. Python regular 95 | # expressions are accepted. 96 | generated-members=REQUEST,acl_users,aq_parent 97 | 98 | # List of decorators that create context managers from functions, such as 99 | # contextlib.contextmanager. 100 | contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager 101 | 102 | 103 | [VARIABLES] 104 | 105 | # Tells whether we should check for unused import in __init__ files. 106 | init-import=no 107 | 108 | # A regular expression matching the beginning of the name of dummy variables 109 | # (i.e. not used). 110 | dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) 111 | 112 | # List of additional names supposed to be defined in builtins. Remember that 113 | # you should avoid to define new builtins when possible. 114 | additional-builtins= 115 | 116 | 117 | [BASIC] 118 | 119 | # Required attributes for module, separated by a comma 120 | required-attributes= 121 | 122 | # List of builtins function names that should not be used, separated by a comma 123 | bad-functions=apply,input,reduce 124 | 125 | 126 | # Disable the report(s) with the given id(s). 127 | # All non-Google reports are disabled by default. 128 | disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923 129 | 130 | # Regular expression which should only match correct module names 131 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 132 | 133 | # Regular expression which should only match correct module level names 134 | const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ 135 | 136 | # Regular expression which should only match correct class names 137 | class-rgx=^_?[A-Z][a-zA-Z0-9]*$ 138 | 139 | # Regular expression which should only match correct function names 140 | function-rgx=^(?:(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ 141 | 142 | # Regular expression which should only match correct method names 143 | method-rgx=^(?:(?P__[a-z0-9_]+__|next)|(?P_{0,2}[A-Z][a-zA-Z0-9]*)|(?P_{0,2}[a-z][a-z0-9_]*))$ 144 | 145 | # Regular expression which should only match correct instance attribute names 146 | attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ 147 | 148 | # Regular expression which should only match correct argument names 149 | argument-rgx=^[a-z][a-z0-9_]*$ 150 | 151 | # Regular expression which should only match correct variable names 152 | variable-rgx=^[a-z][a-z0-9_]*$ 153 | 154 | # Regular expression which should only match correct attribute names in class 155 | # bodies 156 | class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ 157 | 158 | # Regular expression which should only match correct list comprehension / 159 | # generator expression variable names 160 | inlinevar-rgx=^[a-z][a-z0-9_]*$ 161 | 162 | # Good variable names which should always be accepted, separated by a comma 163 | good-names=main,_ 164 | 165 | # Bad variable names which should always be refused, separated by a comma 166 | bad-names= 167 | 168 | # Regular expression which should only match function or class 
names that do 169 | # not require a docstring. 170 | no-docstring-rgx=(__.*__|main) 171 | 172 | # Minimum line length for functions/classes that require docstrings, shorter 173 | # ones are exempt. 174 | docstring-min-length=10 175 | 176 | 177 | [FORMAT] 178 | 179 | # Maximum number of characters on a single line. 180 | max-line-length=120 181 | 182 | # Regexp for a line that is allowed to be longer than the limit. 183 | ignore-long-lines=(?x) 184 | (^\s*(import|from)\s 185 | |\$Id:\s\/\/depot\/.+#\d+\s\$ 186 | |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+') 187 | |^\s*\#\ LINT\.ThenChange 188 | |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$ 189 | |pylint 190 | |""" 191 | |\# 192 | |lambda 193 | |(https?|ftp):) 194 | 195 | # Allow the body of an if to be on the same line as the test if there is no 196 | # else. 197 | single-line-if-stmt=y 198 | 199 | # List of optional constructs for which whitespace checking is disabled 200 | no-space-check= 201 | 202 | # Maximum number of lines in a module 203 | max-module-lines=99999 204 | 205 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 206 | # tab). 207 | indent-string=' ' 208 | 209 | 210 | [SIMILARITIES] 211 | 212 | # Minimum lines number of a similarity. 213 | min-similarity-lines=4 214 | 215 | # Ignore comments when computing similarities. 216 | ignore-comments=yes 217 | 218 | # Ignore docstrings when computing similarities. 219 | ignore-docstrings=yes 220 | 221 | # Ignore imports when computing similarities. 222 | ignore-imports=no 223 | 224 | 225 | [MISCELLANEOUS] 226 | 227 | # List of note tags to take in consideration, separated by a comma. 228 | notes= 229 | 230 | 231 | [IMPORTS] 232 | 233 | # Deprecated modules which should not be used, separated by a comma 234 | deprecated-modules=regsub,TERMIOS,Bastion,rexec,sets 235 | 236 | # Create a graph of every (i.e. internal and external) dependencies in the 237 | # given file (report RP0402 must not be disabled) 238 | import-graph= 239 | 240 | # Create a graph of external dependencies in the given file (report RP0402 must 241 | # not be disabled) 242 | ext-import-graph= 243 | 244 | # Create a graph of internal dependencies in the given file (report RP0402 must 245 | # not be disabled) 246 | int-import-graph= 247 | 248 | extension-pkg-whitelist=_jsonnet 249 | 250 | 251 | [CLASSES] 252 | 253 | # List of interface methods to ignore, separated by a comma. This is used for 254 | # instance to not check methods defines in Zope's Interface base class. 255 | ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by 256 | 257 | # List of method names used to declare (i.e. assign) instance attributes. 258 | defining-attr-methods=__init__,__new__,setUp 259 | 260 | # List of valid names for the first argument in a class method. 261 | valid-classmethod-first-arg=cls,class_ 262 | 263 | # List of valid names for the first argument in a metaclass class method. 264 | valid-metaclass-classmethod-first-arg=mcs 265 | 266 | 267 | [DESIGN] 268 | 269 | # Maximum number of arguments for function / method 270 | max-args=5 271 | 272 | # Argument names that match this expression will be ignored. 
Default to name 273 | # with leading underscore 274 | ignored-argument-names=_.* 275 | 276 | # Maximum number of locals for function / method body 277 | max-locals=15 278 | 279 | # Maximum number of return / yield for function / method body 280 | max-returns=6 281 | 282 | # Maximum number of branch for function / method body 283 | max-branches=12 284 | 285 | # Maximum number of statements in function / method body 286 | max-statements=50 287 | 288 | # Maximum number of parents for a class (see R0901). 289 | max-parents=7 290 | 291 | # Maximum number of attributes for a class (see R0902). 292 | max-attributes=7 293 | 294 | # Minimum number of public methods for a class (see R0903). 295 | min-public-methods=2 296 | 297 | # Maximum number of public methods for a class (see R0904). 298 | max-public-methods=20 299 | 300 | 301 | [EXCEPTIONS] 302 | 303 | # Exceptions that will emit a warning when being caught. Defaults to 304 | # "Exception" 305 | overgeneral-exceptions=Exception,StandardError,BaseException 306 | 307 | 308 | [AST] 309 | 310 | # Maximum line length for lambdas 311 | short-func-length=1 312 | 313 | # List of module members that should be marked as deprecated. 314 | # All of the string functions are listed in 4.1.4 Deprecated string functions 315 | # in the Python 2.4 docs. 316 | deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc 317 | 318 | 319 | [DOCSTRING] 320 | 321 | # List of exceptions that do not need to be mentioned in the Raises section of 322 | # a docstring. 323 | ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError 324 | 325 | 326 | 327 | [TOKENS] 328 | 329 | # Number of spaces of indent required when the last token on the preceding line 330 | # is an open (, [, or {. 
331 | indent-after-paren=4 -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | ## 1.2.0 2 | 3 | * add job_account 4 | * change timeout a bit 5 | * avoid doing too many sbatch at once 6 | * aggregate the output per node 7 | 8 | ## 1.1.1 9 | 10 | * Make ddp worker wait a bit to avoid stopping before having time to start all 11 | 12 | ## 1.1.0 13 | 14 | * implement pair based testing 15 | 16 | ## 1.0.2 17 | 18 | * job_comment rather than sbatch_args 19 | 20 | ## 1.0.1 21 | 22 | * sbatch_args param 23 | 24 | ## 1.0.0 25 | 26 | * support forward test 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Romain Beaumont 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: ## [Local development] Upgrade pip, install requirements, install package. 2 | python -m pip install -U pip 3 | python -m pip install -e . 4 | 5 | install-dev: ## [Local development] Install test requirements 6 | python -m pip install -r requirements-test.txt 7 | 8 | lint: ## [Local development] Run mypy, pylint and black 9 | python -m mypy gpu_tester 10 | python -m pylint gpu_tester 11 | python -m black --check -l 120 gpu_tester 12 | 13 | black: ## [Local development] Auto-format python code using black 14 | python -m black -l 120 . 15 | 16 | build-pex: 17 | python3 -m venv .pexing 18 | . .pexing/bin/activate && python -m pip install -U pip && python -m pip install pex 19 | . .pexing/bin/activate && python -m pex setuptools . 
-o gpu_tester.pex -v
20 | 	rm -rf .pexing
21 | 
22 | test: ## [Local development] Run unit tests
23 | 	python -m pytest -x -s -v tests
24 | 
25 | .PHONY: help
26 | 
27 | help: # Run `make help` to get help on the make commands
28 | 	@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gpu_tester
2 | [![pypi](https://img.shields.io/pypi/v/gpu_tester.svg)](https://pypi.python.org/pypi/gpu_tester)
3 | 
4 | Gpu tester finds all your bad gpus.
5 | 
6 | Works on slurm.
7 | 
8 | Features:
9 | * does a forward pass on each gpu
10 | * checks for gpus returning incorrect results
11 | * checks for gpus failing due to ECC errors
12 | 
13 | Roadmap:
14 | * sanity check forward speed
15 | * sanity check broadcast speed
16 | 
17 | ## Install
18 | 
19 | Create a venv:
20 | 
21 | ```
22 | python3 -m venv .env
23 | source .env/bin/activate
24 | pip install -U pip
25 | ```
26 | 
27 | Then:
28 | ```
29 | pip3 install torch --extra-index-url https://download.pytorch.org/whl/cu116
30 | pip install gpu_tester
31 | ```
32 | 
33 | ## Python examples
34 | 
35 | Check out these examples to call this as a lib:
36 | * [example.py](examples/example.py)
37 | 
38 | ## Output
39 | 
40 | Output looks like this:
41 | 
42 | ```
43 | job succeeded
44 | 0 have incorrect results, 1 have gpu errors and 319 succeeded
45 | incorrect results:
46 | []
47 | gpu errors:
48 | [['gpu_error', 'compute-od-gpu-st-p4d-24xlarge-156', '3']]
49 | ```
50 | 
51 | ## Recommended testing strategy
52 | 
53 | ### Pair based strategy
54 | 
55 | The easiest way to quickly spot broken nodes is the pair-based strategy.
56 | It runs many jobs in parallel and finds which nodes can talk together.
57 | Here is one example:
58 | ```
59 | gpu_tester --nodes 2 --parallel-tests 50 --job_comment laion --partition "gpu" --test_kind "ddp" --job_timeout 45 --exclude 'gpu-st-p4d-24xlarge-[66]'
60 | ```
61 | 
62 | ### All at once strategy
63 | 
64 | Once you have validated this works, you may want to try the DDP strategy over all nodes, e.g.:
65 | ```
66 | gpu_tester --nodes 100 --parallel-tests 1 --job_comment laion --partition "gpu" --test_kind "ddp" --job_timeout 300 --exclude 'gpu-st-p4d-24xlarge-[66]'
67 | ```
68 | 
69 | ### Simple forward
70 | 
71 | If you only want to validate the forward functionality of gpus and not the communication, you may use:
72 | 
73 | ```
74 | gpu_tester --nodes 100 --parallel-tests 1 --job_comment laion --partition "gpu" --test_kind "simple_forward" --job_timeout 50 --exclude 'gpu-st-p4d-24xlarge-[66]'
75 | ```
76 | 
77 | 
78 | ## API
79 | 
80 | This module exposes a single function `gpu_tester` which takes the same arguments as the command line tool (a usage sketch follows this list):
81 | 
82 | * **cluster** the cluster. (default *slurm*)
83 | * **job_name** slurm job name. (default *gpu_tester*)
84 | * **partition** slurm partition. (default *compute-od-gpu*)
85 | * **gpu_per_node** number of gpus per node. (default *8*)
86 | * **nodes** number of gpu nodes. (default *1*)
87 | * **output_folder** the output folder. (default *None* which means current folder / results)
88 | * **job_timeout** job timeout (default *150* seconds)
89 | * **job_comment** optional comment arg given to slurm (default *None*)
90 | * **job_account** optional account arg given to slurm (default *None*)
91 | * **test_kind** simple_forward or ddp. simple_forward is a quick forward test; ddp uses pytorch DDP to check the gpu interconnect (default *simple_forward*)
92 | * **parallel_tests** number of tests to run in parallel. Recommended to use with nodes == 2 to test pair by pair (default *1*)
93 | * **nodelist** node whitelist, example 'gpu-st-p4d-24xlarge-[66-67]' (default *None*)
94 | * **exclude** node blacklist, example 'gpu-st-p4d-24xlarge-[66-67]' (default *None*)
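For instance, the pair-based ddp run shown above can also be launched from Python. This is a minimal sketch using only the parameters documented in the list; the partition, comment and exclude values are placeholders taken from the command line example and should be adapted to your cluster:

```
from gpu_tester import gpu_tester

# run 50 two-node ddp tests in parallel; slurm values below are placeholders
gpu_tester(
    cluster="slurm",
    nodes=2,
    parallel_tests=50,
    gpu_per_node=8,
    partition="gpu",  # adapt to your slurm partition
    job_comment="laion",  # optional, passed to slurm as a comment
    test_kind="ddp",
    job_timeout=45,
    exclude="gpu-st-p4d-24xlarge-[66]",  # optional node blacklist
)
```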
95 | 
96 | ## For development
97 | 
98 | Either locally, or in [gitpod](https://gitpod.io/#https://github.com/rom1504/gpu_tester) (do `export PIP_USER=false` there)
99 | 
100 | Setup a virtualenv:
101 | 
102 | ```
103 | python3 -m venv .env
104 | source .env/bin/activate
105 | pip install -e .
106 | ```
107 | 
108 | to run tests:
109 | ```
110 | pip install -r requirements-test.txt
111 | ```
112 | then
113 | ```
114 | make lint
115 | make test
116 | ```
117 | 
118 | You can use `make black` to reformat the code.
119 | 
120 | `python -m pytest -x -s -v tests -k "dummy"` runs a specific test.
121 | 
--------------------------------------------------------------------------------
/examples/example.py:
--------------------------------------------------------------------------------
1 | from gpu_tester import gpu_tester
2 | 
3 | # this call will start a test on these gpu nodes
4 | gpu_tester(
5 |     cluster="slurm",
6 |     nodelist="compute-od-gpu-st-p4d-24xlarge-[10-20]",
7 |     partition="compute-od-gpu",
8 |     gpu_per_node=8,
9 |     nodes=11,
10 |     output_folder=None,
11 | )
12 | 
--------------------------------------------------------------------------------
/examples/pair_testing.sh:
--------------------------------------------------------------------------------
1 | gpu_tester --nodes 2 --parallel-tests 60 --job_account openclip --partition "g40423" --test_kind "ddp" --job_timeout 150
2 | 
--------------------------------------------------------------------------------
/gpu_tester/__init__.py:
--------------------------------------------------------------------------------
1 | """gpu_tester"""
2 | 
3 | from gpu_tester.main import main
4 | from gpu_tester.main import gpu_tester
5 | 
--------------------------------------------------------------------------------
/gpu_tester/ddp_worker.py:
--------------------------------------------------------------------------------
1 | """ddp worker"""
2 | 
3 | import torch
4 | import torch.distributed as dist
5 | from torch import nn
6 | from torch import optim
7 | from torch.nn.parallel import DistributedDataParallel as DDP
8 | import os
9 | import socket
10 | import time
11 | from .world_info_from_env import world_info_from_env
12 | 
13 | torch.manual_seed(0)
14 | 
15 | 
16 | def main():
17 |     """example"""
18 |     local_rank, global_rank, world_size = world_info_from_env()
19 |     os.environ["LOCAL_RANK"] = str(local_rank)
20 |     os.environ["RANK"] = str(global_rank)
21 |     os.environ["WORLD_SIZE"] = str(world_size)
22 | 
23 |     hostname = socket.gethostname()
24 |     try:
25 |         # create default process group
26 |         dist.init_process_group("nccl", rank=global_rank, world_size=world_size)
27 |         # create local model
28 |         model = nn.Linear(1000, 1000).to(local_rank)
29 |         # construct DDP model
30 |         ddp_model = DDP(model, device_ids=[local_rank])
31 |         # define loss function and optimizer
32 |         loss_fn = nn.MSELoss()
33 |         optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
34 | 
35 |         inputs = torch.randn(200, 1000).to(local_rank)
36 |         labels = torch.randn(200, 1000).to(local_rank)
37 |         # warmup
38 |         for _ in range(100):
39 |             outputs = ddp_model(inputs)
40 |             loss_fn(outputs, 
labels).backward() 41 | optimizer.step() 42 | # measure 43 | t = time.time() 44 | for _ in range(1000): 45 | outputs = ddp_model(inputs) 46 | loss_fn(outputs, labels).backward() 47 | optimizer.step() 48 | d = time.time() - t 49 | print("result", hostname, local_rank, outputs.detach().cpu().numpy()[0][0], d) 50 | time.sleep(45) 51 | except RuntimeError as err: 52 | print("gpu_error", hostname, local_rank) 53 | print(err) 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /gpu_tester/main.py: -------------------------------------------------------------------------------- 1 | """gpu tester""" 2 | 3 | import os 4 | import fire 5 | import subprocess 6 | import time 7 | from multiprocessing.pool import ThreadPool 8 | 9 | 10 | def is_job_finished(job_id): 11 | status = subprocess.check_output(["squeue", "-j", job_id]).decode("utf8") 12 | print(f"job status is {status}") 13 | return status == "slurm_load_jobs error: Invalid job id specified" or len(status.split("\n")) == 2 14 | 15 | 16 | def wait_for_job_to_finish(job_id, timeout=30): 17 | t = time.time() 18 | while 1: 19 | if time.time() - t > timeout: 20 | return False 21 | time.sleep(1) 22 | if is_job_finished(job_id): 23 | return True 24 | 25 | 26 | def start_job(sbatch_file): 27 | """start job""" 28 | args = ["sbatch"] 29 | args.append(sbatch_file) 30 | sbatch_output = subprocess.check_output(args).decode("utf8") 31 | lines = sbatch_output.split("\n") 32 | 33 | lines = [line for line in lines if "Submitted" in line] 34 | if len(lines) == 0: 35 | raise ValueError(f"slurm sbatch failed: {sbatch_output}") 36 | 37 | parsed_sbatch = lines[0].split(" ") 38 | job_id = parsed_sbatch[3].strip() 39 | return job_id 40 | 41 | 42 | def get_boilerplate(): 43 | return """ 44 | module load openmpi 45 | module load cuda/11.7 46 | 47 | # sent to sub script 48 | export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` 49 | echo hosts $HOSTNAMES 50 | export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 51 | export MASTER_PORT=12802 52 | export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` 53 | 54 | echo go $COUNT_NODE 55 | """ 56 | 57 | 58 | def generate_sbatch( 59 | job_name, partition, nodes, gpu_per_node, output_file, job_comment, test_kind, nodelist, exclude, job_account 60 | ): 61 | """generate sbatch""" 62 | ntasks_per_node = gpu_per_node 63 | constant_boilerplate = get_boilerplate() 64 | venv = os.environ["VIRTUAL_ENV"] 65 | scomment = ("--comment " + job_comment) if job_comment is not None else "" 66 | sbatch_scomment = ("#SBATCH --comment " + job_comment) if job_comment is not None else "" 67 | sbatch_saccount = ("#SBATCH --account " + job_account) if job_account is not None else "" 68 | worker = test_kind + "_worker" 69 | nodelist = ("#SBATCH --nodelist " + nodelist) if nodelist is not None else "" 70 | exclude = ("#SBATCH --exclude " + exclude) if exclude is not None else "" 71 | 72 | return f"""#!/bin/bash 73 | #SBATCH --partition={partition} 74 | #SBATCH --job-name={job_name} 75 | #SBATCH --nodes {nodes} 76 | #SBATCH --ntasks-per-node {ntasks_per_node} 77 | #SBATCH --output={output_file} 78 | #SBATCH --exclusive 79 | {sbatch_scomment} 80 | {sbatch_saccount} 81 | {nodelist} 82 | {exclude} 83 | 84 | {constant_boilerplate} 85 | 86 | source {venv}/bin/activate 87 | 88 | 89 | srun {scomment} --cpu_bind=v --accel-bind=gn python -m gpu_tester.{worker} 90 | """ 91 | 92 | 93 | def run_test( 94 | output_folder, 95 | 
job_name, 96 | partition, 97 | nodes, 98 | gpu_per_node, 99 | job_comment, 100 | job_timeout, 101 | test_kind, 102 | nodelist, 103 | exclude, 104 | job_account, 105 | ): 106 | """run test""" 107 | 108 | if not os.path.isdir(output_folder): 109 | os.mkdir(output_folder) 110 | 111 | tmp_file = output_folder + "/sbatch_output" 112 | sbatch_content = generate_sbatch( 113 | job_name, partition, nodes, gpu_per_node, tmp_file, job_comment, test_kind, nodelist, exclude, job_account 114 | ) 115 | sbatch_file = output_folder + "/sbatch_file" 116 | with open(sbatch_file, "w", encoding="utf8") as f: 117 | f.write(sbatch_content) 118 | 119 | print("starting job") 120 | job_id = start_job(sbatch_file) 121 | print(f"waiting for job {job_id}") 122 | status = wait_for_job_to_finish(job_id, job_timeout) 123 | if not status: 124 | print(f"canceling {job_id}") 125 | subprocess.check_output(["scancel", job_id]).decode("utf8") 126 | status = wait_for_job_to_finish(job_id) 127 | print("job cancelled") 128 | else: 129 | print("job succeeded") 130 | 131 | with open(tmp_file, "r", encoding="utf8") as f: 132 | result_output = f.read() 133 | 134 | results = result_output.split("\n") 135 | hosts = [h for h in results if "hosts" in h] 136 | if len(hosts) == 0: 137 | raise ValueError("failed" + result_output) 138 | hosts = hosts[0].split(" ")[1:] 139 | hosts_gpus = [h + " " + str(gpu) for gpu in range(8) for h in hosts] 140 | 141 | status_dict = {} 142 | 143 | for h in hosts_gpus: 144 | status_dict[h] = ("no_answer", "") 145 | 146 | error_gpu = [r for r in results if "gpu_error" in r] 147 | error_gpus = [r.split(" ") for r in error_gpu] 148 | 149 | for r in error_gpus: 150 | status_dict[r[1] + " " + r[2]] = ("gpu_error", " ".join(r[3:])) 151 | 152 | real_results = [r for r in results if "result" in r] 153 | 154 | parsed_results = [r.split(" ") for r in real_results] 155 | 156 | if test_kind == "simple_forward": 157 | expected_value = "24954.1" 158 | expected_delay = 5 159 | elif test_kind == "ddp": 160 | expected_value = None 161 | expected_delay = 5 162 | 163 | for r in parsed_results: 164 | name = r[1] + " " + r[2] 165 | if expected_value is not None and abs(float(r[3]) - float(expected_value)) > 0.01: 166 | status_dict[name] = ("wrong", str(r[3])) 167 | elif float(r[4]) > expected_delay: 168 | status_dict[name] = ("slow", str(r[4])) 169 | else: 170 | status_dict[name] = ("success", "") 171 | 172 | return status_dict 173 | 174 | 175 | def display_results(status_dict): 176 | """display results""" 177 | 178 | per_node = {} 179 | for gpu, status in status_dict.items(): 180 | per_node[gpu.split(" ")[0]] = status 181 | success = [x for x, y in per_node.items() if y[0] == "success"] 182 | slow = [x for x, y in per_node.items() if y[0] == "slow"] 183 | wrong = [x for x, y in per_node.items() if y[0] == "wrong"] 184 | gpu_error = [x for x, y in per_node.items() if y[0] == "gpu_error"] 185 | no_answer = [x for x, y in per_node.items() if y[0] == "no_answer"] 186 | 187 | print( 188 | f"""on a total of {len(per_node)}: 189 | * {len(wrong)} have incorrect results 190 | * {len(slow)} have slow results 191 | * {len(gpu_error)} have gpu errors 192 | * {len(no_answer)} did not answer 193 | * {len(success)} succeeded""" 194 | ) 195 | 196 | print("slow results:") 197 | print(",".join(slow)) 198 | 199 | print("incorrect results:") 200 | print(",".join(wrong)) 201 | 202 | print("gpu errors:") 203 | print(",".join(gpu_error)) 204 | 205 | print("no_answer:") 206 | print(",".join(no_answer)) 207 | 208 | 209 | def gpu_tester( 210 | 
cluster="slurm", 211 | job_name="gpu_tester", 212 | partition="compute-od-gpu", 213 | gpu_per_node=8, 214 | nodes=1, 215 | output_folder=None, 216 | job_timeout=150, 217 | job_comment=None, 218 | test_kind="simple_forward", 219 | parallel_tests=1, 220 | nodelist=None, 221 | exclude=None, 222 | job_account=None, 223 | ): 224 | """gpu tester main function""" 225 | if cluster != "slurm": 226 | raise ValueError("only slurm is supported currently") 227 | if output_folder is None: 228 | output_folder = os.getcwd() + "/results" 229 | if not os.path.isdir(output_folder): 230 | os.mkdir(output_folder) 231 | 232 | def wait_then_run(wait_time, params): 233 | time.sleep(wait_time) 234 | return run_test(**params) 235 | 236 | all_results = {} 237 | with ThreadPool(parallel_tests) as p: 238 | for result in p.imap_unordered( 239 | lambda x: wait_then_run( 240 | wait_time=7 * (x // 10), # 10 concurrent, first wait 0, second wait 7, third 14, ... 241 | params={ 242 | "output_folder": output_folder + "/" + str(x), 243 | "job_name": job_name, 244 | "partition": partition, 245 | "nodes": nodes, 246 | "gpu_per_node": gpu_per_node, 247 | "job_comment": job_comment, 248 | "job_timeout": job_timeout, 249 | "test_kind": test_kind, 250 | "nodelist": nodelist, 251 | "exclude": exclude, 252 | "job_account": job_account, 253 | }, 254 | ), 255 | range(parallel_tests), 256 | ): 257 | all_results.update(result) 258 | 259 | display_results(all_results) 260 | 261 | 262 | def main(): 263 | fire.Fire(gpu_tester) 264 | 265 | 266 | if __name__ == "__main__": 267 | main() 268 | -------------------------------------------------------------------------------- /gpu_tester/simple_forward_worker.py: -------------------------------------------------------------------------------- 1 | """worker running in each gpu""" 2 | 3 | import torch 4 | import socket 5 | import time 6 | from .world_info_from_env import world_info_from_env 7 | 8 | torch.manual_seed(0) 9 | 10 | 11 | def main(): 12 | local_rank, _, _ = world_info_from_env() 13 | 14 | hostname = socket.gethostname() 15 | try: 16 | device = torch.device(f"cuda:{local_rank}") 17 | torch.cuda.set_device(device) 18 | 19 | vector1 = torch.rand(1, 100000, device=device) 20 | vector2 = torch.rand(1, 100000, device=device) 21 | t = time.time() 22 | dot = (vector1 @ vector2.T).cpu().numpy() 23 | d = time.time() - t 24 | 25 | print("result", hostname, local_rank, dot[0][0], d) 26 | except RuntimeError as _: 27 | print("gpu_error", hostname, local_rank) 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /gpu_tester/world_info_from_env.py: -------------------------------------------------------------------------------- 1 | """world info from env""" 2 | 3 | import os 4 | 5 | 6 | def world_info_from_env(): 7 | """world info from env""" 8 | local_rank = 0 9 | for v in ("LOCAL_RANK", "MPI_LOCALRANKID", "SLURM_LOCALID", "OMPI_COMM_WORLD_LOCAL_RANK"): 10 | if v in os.environ: 11 | local_rank = int(os.environ[v]) 12 | break 13 | global_rank = 0 14 | for v in ("RANK", "PMI_RANK", "SLURM_PROCID", "OMPI_COMM_WORLD_RANK"): 15 | if v in os.environ: 16 | global_rank = int(os.environ[v]) 17 | break 18 | world_size = 1 19 | for v in ("WORLD_SIZE", "PMI_SIZE", "SLURM_NTASKS", "OMPI_COMM_WORLD_SIZE"): 20 | if v in os.environ: 21 | world_size = int(os.environ[v]) 22 | break 23 | 24 | return local_rank, global_rank, world_size 25 | -------------------------------------------------------------------------------- /mypy.ini: 
--------------------------------------------------------------------------------
1 | # Global options:
2 | 
3 | [mypy]
4 | python_version = 3.8
5 | ignore_missing_imports = True
6 | 
--------------------------------------------------------------------------------
/requirements-test.txt:
--------------------------------------------------------------------------------
1 | black==22.3.0
2 | mypy==0.942
3 | pylint==2.13.4
4 | pytest-cov==3.0.0
5 | pytest-xdist==2.5.0
6 | pytest==7.0.1
7 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fire
2 | numpy
3 | torch
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from pathlib import Path
3 | import os
4 | 
5 | if __name__ == "__main__":
6 |     with Path(Path(__file__).parent, "README.md").open(encoding="utf-8") as file:
7 |         long_description = file.read()
8 | 
9 |     def _read_reqs(relpath):
10 |         fullpath = os.path.join(os.path.dirname(__file__), relpath)
11 |         with open(fullpath) as f:
12 |             return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))]
13 | 
14 |     REQUIREMENTS = _read_reqs("requirements.txt")
15 | 
16 |     setup(
17 |         name="gpu_tester",
18 |         packages=find_packages(),
19 |         include_package_data=True,
20 |         version="1.2.0",
21 |         license="MIT",
22 |         description="Gpu tester finds all your bad gpus, on slurm",
23 |         long_description=long_description,
24 |         long_description_content_type="text/markdown",
25 |         author="Romain Beaumont",
26 |         author_email="romain.rom1@gmail.com",
27 |         entry_points={"console_scripts": ["gpu_tester = gpu_tester:main"]},
28 |         url="https://github.com/rom1504/gpu_tester",
29 |         data_files=[(".", ["README.md"])],
30 |         keywords=["machine learning"],
31 |         install_requires=REQUIREMENTS,
32 |         classifiers=[
33 |             "Development Status :: 4 - Beta",
34 |             "Intended Audience :: Developers",
35 |             "Topic :: Scientific/Engineering :: Artificial Intelligence",
36 |             "License :: OSI Approved :: MIT License",
37 |             "Programming Language :: Python :: 3.6",
38 |         ],
39 |     )
40 | 
--------------------------------------------------------------------------------
/tests/test_main.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from gpu_tester import gpu_tester
3 | 
4 | 
5 | def test_fake():
6 |     print("hi!")
7 | 
--------------------------------------------------------------------------------