├── bobber ├── lib │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── run_tests.py │ ├── analysis │ │ ├── __init__.py │ │ ├── nccl.py │ │ ├── meta.py │ │ ├── fio.py │ │ ├── common.py │ │ ├── compare_baseline.py │ │ ├── dali.py │ │ ├── table.py │ │ ├── parse-mlperf.py │ │ ├── parse_results.py │ │ └── aggregate_results.py │ ├── system │ │ ├── __init__.py │ │ └── file_handler.py │ ├── docker │ │ ├── __init__.py │ │ ├── Dockerfile │ │ └── management.py │ ├── exit_codes.py │ └── constants.py ├── __version__.py ├── __init__.py └── test_scripts │ ├── fio_fill_single.sh │ ├── call_dali_multi.sh │ ├── mdtest_multi.sh │ ├── nccl_multi.sh │ ├── fio_multi.sh │ ├── dali_multi.sh │ └── setup_fio.sh ├── .gitignore ├── requirements.txt ├── LICENSE ├── .github └── workflows │ └── python-package.yml ├── docs ├── sample_baseline.yaml ├── building.md ├── troubleshooting.md ├── parsing.md ├── docker.md ├── non_dgx_support.md └── baselines.md ├── setup.py ├── .gitlab-ci.yml └── CONTRIBUTING.md /bobber/lib/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | -------------------------------------------------------------------------------- /bobber/lib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | -------------------------------------------------------------------------------- /bobber/lib/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | -------------------------------------------------------------------------------- /bobber/lib/system/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | -------------------------------------------------------------------------------- /bobber/__version__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | __version__ = '6.3.1' 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__* 2 | *.pyc 3 | build/* 4 | dist/* 5 | env/* 6 | nvidia_bobber.egg-info/ 7 | -------------------------------------------------------------------------------- /bobber/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | from bobber.__version__ import __version__ 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2020.6.20 2 | chardet==3.0.4 3 | docker==4.3.1 4 | idna==2.10 5 | numpy==1.19.5 6 | pycodestyle==2.6.0 7 | PyYAML==5.4.1 8 | requests==2.26.0 9 | six==1.15.0 10 | tabulate==0.8.7 11 | urllib3==1.26.5 12 | websocket-client==0.57.0 13 | -------------------------------------------------------------------------------- /bobber/test_scripts/fio_fill_single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | 4 | cd /storage-perf-test 5 | 6 | if [ "x$THREADS" = "x" ]; then 7 | THREADS=80 8 | fi 9 | 10 | if [ "x$DIRECTIO" = "x" ]; then 11 | DIRECTIO=0 12 | fi 13 | 14 | NO_FIO_SERVER=1 DIRECTIO=$DIRECTIO FSDIR=/mnt/fs_under_test NJOBS=$THREADS 
./run_disk_fill_test.sh 15 | -------------------------------------------------------------------------------- /bobber/lib/docker/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import docker 3 | from bobber.lib.docker.management import DockerManager 4 | 5 | manager = DockerManager() 6 | 7 | # Map the instance methods to allow importing as "bobber.docker." 8 | # in other modules. 9 | build = manager.build 10 | cast = manager.cast 11 | execute = manager.execute 12 | export = manager.export 13 | load = manager.load 14 | -------------------------------------------------------------------------------- /bobber/test_scripts/call_dali_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | BATCH_SIZE=$1 4 | DATASET=$2 5 | GPUS=$3 6 | 7 | if [[ "$DATASET" == *tfrecord* ]]; then 8 | python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --tfrecord_pipeline_paths "$DATASET" 9 | else 10 | python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --file_read_pipeline_paths "$DATASET" 11 | fi 12 | -------------------------------------------------------------------------------- /bobber/lib/exit_codes.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | # This file contains a list of exit codes for debugging 3 | SUCCESS = 0 # Successful termination 4 | BASELINE_FAILURE = 10 # Performance did not meet criteria 5 | MISSING_LOG_FILES = 20 # Parsing directory with no logs 6 | DOCKER_BUILD_FAILURE = 30 # Failure building Docker image 7 | DOCKER_COMMUNICATION_ERROR = 31 # Unable to communicate with Docker 8 | CONTAINER_NOT_RUNNING = 32 # Bobber container not running 9 | NVIDIA_RUNTIME_ERROR = 33 # NVIDIA container runtime not found 10 | CONTAINER_VERSION_MISMATCH = 34 # Container different from application 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 NVIDIA CORPORATION 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bobber/test_scripts/mdtest_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | 4 | #force threads to 44 for now - unclear on why we can't use more threads with mdtest but it blows up 5 | THREADS=44 6 | FSDIR=/mnt/fs_under_test 7 | 8 | mkdir $FSDIR/mdtest 9 | 10 | if [ "x$HOSTS" = "x" ]; then 11 | HOSTS=localhost:$THREADS 12 | fi 13 | 14 | if [ "x$NCCL_IB_HCAS" = "x" ]; then 15 | NCCL_IB_HCAS=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_6,mlx5_7,mlx5_8,mlx5_9 16 | fi 17 | 18 | if [ "x$SSH_IFACE" = "x" ]; then 19 | SSH_IFACE=enp226s0 20 | fi 21 | 22 | IFS="," read -r -a HOST_ARRAY <<< "$HOSTS" 23 | 24 | HOST_COUNT=${#HOST_ARRAY[@]} 25 | 26 | #remove trailing comma when passing the argument 27 | HOST_STRING="" 28 | for i in ${HOST_ARRAY[@]}; do 29 | HOST_STRING+="$i:$THREADS," 30 | done 31 | 32 | set -x 33 | 34 | mpirun -np $(($HOST_COUNT*$THREADS)) -H ${HOST_STRING%?} -map-by ppr:$THREADS:node --allow-run-as-root --mca btl_openib_warn_default_gid_prefix 0 --mca btl_openib_if_exclude mlx5_0,mlx5_5,mlx5_6 --mca plm_base_verbose 0 --mca plm_rsh_agent ssh -x IBV_DRIVERS -mca btl_tcp_if_include $SSH_IFACE -mca plm_rsh_args "-p 2222" /io-500-dev/bin/mdtest -i 3 -I 4 -z 3 -b 8 -u -d $FSDIR/mdtest 35 | 36 | rm -rf $FSDIR/mdtest 37 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: [push, pull_request] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: [3.6, 3.7, 3.8, 3.9] 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install -r requirements.txt 26 | pip install wheel 27 | - name: Lint with pycodestyle 28 | run: | 29 | pycodestyle bobber 30 | - name: Build and install the wheel 31 | run: | 32 | python setup.py bdist_wheel sdist 33 | pip install dist/nvidia_bobber-*.whl 34 | - name: Build the latest image 35 | run: | 36 | bobber build 37 | docker images | grep nvidia/bobber 38 | -------------------------------------------------------------------------------- /docs/sample_baseline.yaml: -------------------------------------------------------------------------------- 1 | systems: 2 | 1: 3 | bandwidth: 4 | # FIO BW speed in bytes/second 5 | read: 1200000000 6 | write: 1000000000 7 | iops: 8 | # FIO IOPS speed in ops/second 9 | read: 100000 10 | write: 100000 11 | nccl: 12 | # NCCL maximum bus bandwidth in GB/s 13 | max_bus_bw: 230 14 | dali: 15 | # DALI average speed in images/second 16 | 800x600 standard jpg: 2000 17 | 3840x2160 standard jpg: 300 18 | 800x600 tfrecord: 2000 19 | 3840x2160 tfrecord: 300 20 | 2: 21 | bandwidth: 22 | # FIO BW speed in bytes/second 23 | read: 2400000000 24 | write: 2000000000 25 | iops: 26 | # FIO IOPS speed in ops/second 27 | read: 200000 28 | write: 200000 29 | nccl: 30 | # NCCL 
maximum bus bandwidth in GB/s 31 | max_bus_bw: 185 32 | dali: 33 | # DALI average speed in images/second 34 | 800x600 standard jpg: 4000 35 | 3840x2160 standard jpg: 600 36 | 800x600 tfrecord: 4000 37 | 3840x2160 tfrecord: 600 38 | -------------------------------------------------------------------------------- /bobber/test_scripts/nccl_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | 4 | if [ "x$GPUS" = "x" ]; then 5 | GPUS=8 6 | fi 7 | 8 | if [ "x$HOSTS" = "x" ]; then 9 | HOSTS=localhost:$GPUS 10 | fi 11 | 12 | if [ "x$NCCL_IB_HCAS" = "x" ]; then 13 | NCCL_IB_HCAS=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_6,mlx5_7,mlx5_8,mlx5_9 14 | fi 15 | 16 | if [ "x$SSH_IFACE" = "x" ]; then 17 | SSH_IFACE=enp226s0 18 | fi 19 | 20 | if [ "x$NCCL_MAX" = "x" ]; then 21 | NCCL_MAX=1 22 | fi 23 | 24 | if [ "x$COMPUTE_GID" = "x" ]; then 25 | COMPUTE_GID=0 26 | fi 27 | 28 | if [ "x$NCCL_TC" = "x" ]; then 29 | NCCL_TC='' 30 | fi 31 | 32 | IFS="," read -r -a HOST_ARRAY <<< "$HOSTS" 33 | 34 | HOST_COUNT=${#HOST_ARRAY[@]} 35 | 36 | #remove trailing comma when passing the argument 37 | for i in ${HOST_ARRAY[@]}; do 38 | HOST_STRING+="$i:$GPUS," 39 | done 40 | 41 | mpirun -report-uri -display-allocation -v --allow-run-as-root --np $(($GPUS*$HOST_COUNT)) -H ${HOST_STRING%?} -bind-to none -map-by slot -x IBV_DRIVERS -x LD_LIBRARY_PATH -x PATH -x NCCL_IB_HCA=$NCCL_IB_HCAS -x NCCL_IB_TC=$NCCL_TC -x NCCL_IB_GID_INDEX=$COMPUTE_GID -x NCCL_IB_CUDA_SUPPORT=1 -mca orte_base_help_aggregate 0 -mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include $SSH_IFACE -mca btl_openib_verbose 1 /nccl-tests/build/all_reduce_perf -b 8 -e ${NCCL_MAX}G -f 2 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | from bobber import __version__ 3 | from setuptools import setup 4 | 5 | with open('README.md', 'r') as f: 6 | long_description = f.read() 7 | 8 | setup( 9 | name='nvidia-bobber', 10 | version=__version__, 11 | description='Containerized testing of system components that impact AI workload performance', 12 | long_description=long_description, 13 | packages=['bobber', 14 | 'bobber/lib', 15 | 'bobber/lib/analysis', 16 | 'bobber/lib/docker', 17 | 'bobber/lib/system', 18 | 'bobber/lib/tests'], 19 | include_package_data=True, 20 | package_data={'': ['lib/docker/Dockerfile', 21 | 'test_scripts/call_dali_multi.sh', 22 | 'test_scripts/dali_multi.sh', 23 | 'test_scripts/fio_fill_single.sh', 24 | 'test_scripts/fio_multi.sh', 25 | 'test_scripts/mdtest_multi.sh', 26 | 'test_scripts/nccl_multi.sh', 27 | 'test_scripts/setup_fio.sh']}, 28 | license='MIT', 29 | python_requires='>=3.6', 30 | entry_points={ 31 | 'console_scripts': ['bobber=bobber.bobber:main'] 32 | }, 33 | install_requires=[ 34 | 'docker >= 4.3.1', 35 | 'numpy >= 1.9.5', 36 | 'pyyaml >= 5.4.0', 37 | 'tabulate >= 0.8.7', 38 | 'six>=1.15.0' 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /bobber/lib/analysis/nccl.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | from typing import Tuple 4 | 5 | 6 | def parse_nccl_file(log_files: list, systems: int) -> Tuple[list, list]: 7 | """ 8 | Find the maximum bus bandwidth and bus bytes from NCCL tests. 
9 | 10 | Parse the bandwidth at all byte sizes achieved during NCCL tests and match 11 | the maximum bus bandwidth with the corresponding byte size from the 12 | results. Only the maximum and corresponding byte size from each log are 13 | returned to later find the overall average. 14 | 15 | Parameters 16 | ---------- 17 | log_files : list 18 | A ``list`` of ``strings`` of the filenames for all NCCL log files in 19 | the results directory. 20 | systems : int 21 | An ``integer`` of the number of systems used during the current test. 22 | 23 | Returns 24 | ------- 25 | tuple 26 | Returns a ``tuple`` of (``list``, ``list``) containing the maximum bus 27 | bandwidth and the bus bytes, respectively. 28 | """ 29 | max_bus_bw_list = [] 30 | bus_bytes_list = [] 31 | 32 | for log in log_files: 33 | with open(log, 'r') as f: 34 | log_contents = f.read() 35 | out_of_place_results = re.findall('.*float sum.*', log_contents) 36 | results = [line.split() for line in out_of_place_results] 37 | bytes_array = [float(result[0]) for result in results] 38 | bus_bw_array = [float(result[6]) for result in results] 39 | max_bus_bw_list.append(max(bus_bw_array)) 40 | max_index = bus_bw_array.index(max(bus_bw_array)) 41 | bus_bytes_list.append(bytes_array[max_index]) 42 | return max_bus_bw_list, bus_bytes_list 43 | -------------------------------------------------------------------------------- /bobber/lib/system/file_handler.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import os 3 | import yaml 4 | from typing import NoReturn 5 | 6 | 7 | def create_directory(directory: str) -> NoReturn: 8 | """ 9 | Create a directory if it doesn't exist. 10 | 11 | Parameters 12 | ---------- 13 | directory : string 14 | A ``string`` of the full directory path to create if it doesn't exist. 15 | """ 16 | if not os.path.exists(directory): 17 | os.makedirs(directory) 18 | 19 | 20 | def update_log(logfile: str, contents: str) -> NoReturn: 21 | """ 22 | Append a log with new output from a test. 23 | 24 | Parameters 25 | ---------- 26 | logfile : string 27 | A ``string`` of the logfile to write data to. 28 | contents : string 29 | A ``string`` of the contents to append the log file with. 30 | """ 31 | with open(logfile, 'a') as log: 32 | log.write(contents) 33 | 34 | 35 | def write_file(filename: str, contents: str) -> NoReturn: 36 | """ 37 | Write data to a file. 38 | 39 | Parameters 40 | ---------- 41 | filename : string 42 | A ``string`` of the file to write data to. 43 | contents : string 44 | A ``string`` of the contents to write to the file. 45 | """ 46 | with open(filename, 'w') as fp: 47 | fp.write(contents) 48 | 49 | 50 | def read_yaml(filename: str) -> dict: 51 | """ 52 | Read a YAML file and return the contents. 53 | 54 | Parameters 55 | ---------- 56 | filename : string 57 | A ``string`` of the full file path to read. 58 | 59 | Returns 60 | ------- 61 | dict 62 | Returns a ``dict`` representing the entire contents of the file. 
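    Examples
    --------
    A minimal illustration; the path below is only an example and assumes the
    repository's sample baseline file is reachable from the working directory.

    >>> baseline = read_yaml('docs/sample_baseline.yaml')
    >>> baseline['systems'][1]['nccl']['max_bus_bw']
    230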
63 | """ 64 | with open(filename, 'r') as handler: 65 | return yaml.safe_load(handler) 66 | -------------------------------------------------------------------------------- /bobber/test_scripts/fio_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "x$THREADS" = "x" ]; then 4 | THREADS=80 5 | fi 6 | 7 | if [ "x$DIRECTIO" = "x" ]; then 8 | DIRECTIO=0 9 | fi 10 | 11 | if [ "x$HOSTS" = "x" ]; then 12 | HOSTS='' 13 | fi 14 | 15 | if [ "x$IO_DEPTH" = "x" ]; then 16 | IO_DEPTH=16 17 | fi 18 | 19 | if [ "x$EXTRA_FLAGS" = "x" ]; then 20 | EXTRA_FLAGS='' 21 | fi 22 | 23 | if [ "x$READ_PATTERN" = "x" ]; then 24 | READ_PATTERN="read" 25 | fi 26 | 27 | if [ "x$WRITE_PATTERN" = "x" ]; then 28 | WRITE_PATTERN="write" 29 | fi 30 | 31 | HOSTS_WITH_SPACES=`echo $HOSTS | sed "s/,/ /g"` 32 | 33 | FSDIR=/mnt/fs_under_test 34 | 35 | IODEPTH=$IO_DEPTH 36 | NJOBS=$THREADS 37 | 38 | # Process all settings 39 | source /tests/setup_fio.sh 40 | 41 | # Clean up old jobs 42 | stop_servers 43 | 44 | # Start servers 45 | start_servers 46 | 47 | RUNOPTS="--invalidate=${INVALIDATE} --blocksize=${IOSIZE}k --size=${SIZE}k --numjobs=${NJOBS} --directory=${WORKDIR} ${FSYNC}" 48 | CREATEOPTS="--invalidate=${INVALIDATE} --blocksize=${CREATE_IOSIZE}k --size=${SIZE}k --numjobs=${NJOBS} --directory=${WORKDIR} ${FSYNC}" 49 | 50 | # List of commands 51 | ## Run create only first as it has been said it improves performance 52 | ## Run create with a large blocksize, because using a smaller blocksize will take an inordinate amount of time 53 | launch_fio --create_only=1 --rw=write ${IOSETTINGS} ${STDOPTS} ${CREATEOPTS} 54 | 55 | launch_fio --rw=${WRITE_PATTERN} ${IOSETTINGS} ${STDOPTS} ${RUNOPTS} ${EXTRA_FLAGS} 56 | drop_caches 57 | 58 | launch_fio --rw=${READ_PATTERN} ${IOSETTINGS} ${STDOPTS} ${RUNOPTS} ${EXTRA_FLAGS} 59 | drop_caches 60 | 61 | # Clean up the job 62 | stop_servers 63 | 64 | echo "Cleaning workspace" 65 | rm -f $JOBFN 66 | if [ "x$NORMDATA" == "x" ]; then 67 | rm -rf $WORKDIR 68 | fi 69 | 70 | echo "Done Running FIO Test" 71 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - pre_clean 3 | - docker_build 4 | - test 5 | 6 | variables: 7 | GIT_SUBMODULE_STRATEGY: recursive 8 | 9 | pre_clean: 10 | stage: pre_clean 11 | script: 12 | - echo "Cleaning all Docker containers, images, and volumes" 13 | - if [[ $(docker ps -q) ]]; then docker kill $(docker ps -q); fi 14 | - if [[ $(docker ps -q -a) ]]; then docker rm $(docker ps -q -a); fi 15 | - docker system prune --all --force 16 | - echo "Removing old results" 17 | - rm -rf ~/build_output 18 | - echo "Removing old virtual environments" 19 | - rm -rf env/ 20 | 21 | docker_build: 22 | stage: docker_build 23 | script: 24 | - echo "Testing all containers to verify successful building" 25 | - echo "Building Python wheel" 26 | - virtualenv --python python3 env 27 | - source env/bin/activate 28 | - python setup.py bdist_wheel sdist 29 | - pip install dist/nvidia_bobber-*-none-any.whl 30 | - echo "Building latest Bobber image" 31 | - bobber build 32 | # Capture the build ID during the image build process and ensure it is listed in the system 33 | - docker images | grep `bobber build | grep "Successfully built" | awk '{print $3}'` 34 | 35 | test: 36 | stage: test 37 | script: 38 | - echo "Running a single-node test to verify functionality" 39 | - virtualenv --python 
python3 env 40 | - source env/bin/activate 41 | - python setup.py bdist_wheel sdist 42 | - pip install dist/nvidia_bobber-*-none-any.whl 43 | - bobber cast /raid 44 | - bobber run-all --ssh-iface enp2s0f0 --iterations 2 --batch-size-sm 512 --batch-size-lg 256 --gpus 4 --bw-threads 16 --125k-threads 32 --iops-threads 96 --read-pattern randread test_results localhost 45 | - bobber parse-results --compare-baseline single-dgx-station-baseline test_results/ 46 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribute to Bobber 2 | 3 | Before contributing to Bobber, we require all users to sign-off on their work. 4 | 5 | ## Sign your work 6 | 7 | The sign-off is a simple line at the end of the explanation for the patch. Your 8 | signature certifies that you wrote the patch or otherwise have the right to pass 9 | it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 1 Letterman Drive 18 | Suite D4700 19 | San Francisco, CA, 94129 20 | 21 | Everyone is permitted to copy and distribute verbatim copies of this 22 | license document, but changing it is not allowed. 23 | 24 | Developer's Certificate of Origin 1.1 25 | 26 | By making a contribution to this project, I certify that: 27 | 28 | (a) The contribution was created in whole or in part by me and I 29 | have the right to submit it under the open source license 30 | indicated in the file; or 31 | 32 | (b) The contribution is based upon previous work that, to the best 33 | of my knowledge, is covered under an appropriate open source 34 | license and I have the right under that license to submit that 35 | work with modifications, whether created in whole or in part 36 | by me, under the same open source license (unless I am 37 | permitted to submit under a different license), as indicated 38 | in the file; or 39 | 40 | (c) The contribution was provided directly to me by some other 41 | person who certified (a), (b) or (c) and I have not modified 42 | it. 43 | 44 | (d) I understand and agree that this project and the contribution 45 | are public and that a record of the contribution (including all 46 | personal information I submit with it, including my sign-off) is 47 | maintained indefinitely and may be redistributed consistent with 48 | this project or the open source license(s) involved. 49 | ``` 50 | 51 | Then you just add a line to every git commit message: 52 | 53 | Signed-off-by: Joe Smith 54 | 55 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 56 | 57 | If you set your `user.name` and `user.email` git configs, you can sign your 58 | commit automatically with `git commit -s`. 59 | -------------------------------------------------------------------------------- /docs/building.md: -------------------------------------------------------------------------------- 1 | # Building and running from source 2 | While it is recommended to run Bobber using the latest Python wheel available on 3 | the [GitHub Releases](https://github.com/NVIDIA/Bobber/releases) page, it is 4 | possible to run Bobber directly from source, either by building and installing a 5 | wheel locally or using Python to call specific Bobber modules. 
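As a quick reference, the two entry points described above are equivalent. The short sketch below assumes the repository root as the working directory and uses `--version` purely as an illustrative subcommand.

```
# From an installed wheel (building and installing a wheel is covered below):
bobber --version

# Directly from a source checkout, without installing a wheel:
python3 -m bobber.bobber --version
```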
6 | 7 | **NOTE:** If any changes are made to the application, any Bobber containers must 8 | be killed and re-launched to pickup the changes. This can be done with the 9 | following which is expected to be run after building and installing a new wheel 10 | or modifying code locally: 11 | 12 | ``` 13 | docker kill bobber # Only necessary if Bobber is already running 14 | bobber cast /path/to/storage # If a new wheel was built/installed 15 | # OR 16 | python3 -m bobber.bobber cast /path/to/storage # If modifying and running code directly 17 | ``` 18 | 19 | ## Running the Python modules 20 | To run the code directly using Python, first ensure all dependencies are 21 | installed using PIP. This can be done globally using `sudo`, or in a virtual 22 | environment with `virtualenv` or Anaconda. 23 | 24 | ``` 25 | sudo pip3 install -r requirements.txt 26 | ``` 27 | 28 | Once installed, Bobber can be called directly by calling the `bobber` package: 29 | 30 | ``` 31 | python3 -m bobber.bobber ... 32 | ``` 33 | 34 | Using `python3 -m bobber.bobber ...` is analogous to running `bobber ...` from 35 | the installed wheel with the exception of calling the code directly. The above 36 | command needs to be run from the root `bobber` directory of the repo. 37 | 38 | For example, to build the Bobber image directly from the code, run the 39 | following: 40 | 41 | ``` 42 | cd ~/bobber 43 | python3 -m bobber.bobber build 44 | ``` 45 | 46 | ## Building a Python wheel 47 | A Python wheel can be built directly from the source and installed to replace 48 | any existing Bobber wheels and run Bobber as normal without calling the code. A 49 | bash script has been created which automatically builds a development version of 50 | the Python wheel based on the local changes. Running the `./build-dev-wheel` 51 | script will update the version number to a dev version with a timestamp and 52 | build a new wheel of the current code with the updated version. By adding 53 | `minor` or `patch` to the script as an argument, the minor and patch version 54 | will be updated in addition to the timestamp, respectively. 55 | 56 | For example, to build a dev wheel without updating the minor or patch versions, 57 | run: 58 | 59 | ``` 60 | ./build-dev-wheel 61 | ``` 62 | 63 | If the current version of the package is `6.3.1`, this will generated a new 64 | wheel in the local `dist/` directory (which will be created if not already done) 65 | with the version `6.3.1.dev20210323084016` depending on the time the script was 66 | run. 67 | 68 | Likewise, running 69 | 70 | ``` 71 | ./build-dev-wheel patch 72 | ``` 73 | 74 | will generate a wheel in `dist/` with version `6.3.2.dev20210323084016` and 75 | 76 | ``` 77 | ./build-dev-wheel minor 78 | ``` 79 | 80 | will generate a wheel in `dist/` with version `6.4.0.dev20210323084016`. 81 | 82 | To generate a wheel manually without altering the version number, run 83 | 84 | ``` 85 | python3 setup.py bdist_wheel sdist 86 | ``` 87 | 88 | ### Installing the built wheel 89 | Install the generated wheel by ignoring any existing packages using PIP: 90 | 91 | ``` 92 | sudo pip3 install --ignore-installed dist/nvidia_bobber-.whl 93 | ``` 94 | 95 | Bobber can now be used normally with `bobber ...` targeting the code as written 96 | when the wheel was built. 97 | -------------------------------------------------------------------------------- /docs/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | Things don't always go as planned. 
This guide provides some steps to 3 | troubleshoot Bobber when it doesn't work as expected. 4 | 5 | ## General troubleshooting 6 | The first item to check when something goes wrong is the Docker image and 7 | containers across the cluster. On all hosts, verify the Docker image is built 8 | and matches the version of the Bobber wheel. Check the Bobber version with: 9 | 10 | ``` 11 | $ bobber --version 12 | 6.3.1 13 | ``` 14 | 15 | The version number is listed in the first line. Check the Bobber image is built 16 | and matches the version above with: 17 | 18 | ``` 19 | $ docker images | grep nvidia/bobber 20 | nvidia/bobber 6.3.1 a467a25ff008 10 minutes ago 5.23GB 21 | ``` 22 | 23 | If the above command does not contain output or the second column (`6.3.1` in 24 | the example) does not match the version of Bobber from the first step, the image 25 | needs to be built. Run `bobber build` to build the image and verify using the 26 | steps above once complete. 27 | 28 | Before any tests can be run, the container needs to be launched on all nodes. 29 | This can be verified with: 30 | 31 | ``` 32 | $ docker ps | grep bobber 33 | 1ab2b10f8eb1 nvidia/bobber:6.3.1 "/usr/local/bin/nvid..." 4 days ago Up 4 days bobber 34 | ``` 35 | 36 | If the above command does not contain output, the container needs to be launched 37 | using the `bobber cast /path/to/storage` command. 38 | 39 | ## Exit codes 40 | When the application terminates after a handled issue, various exit codes may be 41 | thrown depending on the situation. The following list provides extra context on 42 | these codes: 43 | 44 | * `0`: Exit Success - The application terminated successfully. 45 | * `10`: Baseline Failure - This is thrown while comparing results from a test run against a baseline (either one of the defaults or a custom baseline) using the `bobber parse-results --compare-baseline ...` or `bobber parse-results --custom-baseline ...` command. If at least one result doesn't exceed the baseline performance, it will be marked as a failure. Check the output of the command for a list of the results that don't exceed baseline performance and verify connectivity and configuration. 46 | * `20`: Missing Log Files - Thrown while attempting to parse results while specifying a directory that does not contain valid log files. Verify the directory being parsed contains log files with data. 47 | * `30`: Docker Build Failure - Thrown while trying to build the Bobber image with `bobber build`. Look at the output from the command to see if there are any specific issues while building. This is commonly seen when networking on host and/or Docker levels are down. 48 | * `31`: Docker Communication Error - Bobber was unable to communicate with the Docker daemon. Ensure Docker is running `systemctl start docker` and verify it is working properly with `docker images`. This command should not throw errors if Docker can communicate properly. 49 | * `32`: Container Not Running - The Bobber container needs to be running on all nodes prior to starting any tests. Use the `bobber cast` command to launch the container on all hosts. 50 | * `33`: NVIDIA Runtime Error - The Bobber container is unable to be launched with NVIDIA runtime capabilities. Ensure the latest NVIDIA drivers are installed as well as the latest nvidia-docker libraries. Verify GPUs can be accessed inside containers by running `docker run --rm -it nvcr.io/nvidia/cuda:11.2.1-runtime nvidia-smi`. 
This should display the list of GPUs installed in the system if the NVIDIA container runtime is installed properly. 51 | * `34`: Container Version Mismatch - The Bobber container and application version need to match to ensure proper functionality of the tests. To rectify the situation, first kill the running Bobber container with `docker kill bobber` then re-cast a new container with the same version as the Bobber application with the `bobber cast` command. If an image isn't already built for that version of Bobber, it will be built automatically with `bobber cast`. Note that if a new image is built, it will need to be re-copied to all hosts in the cluster for multi-node tests and subsequently killed/launched on all nodes using the above commands. 52 | -------------------------------------------------------------------------------- /bobber/test_scripts/dali_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | 4 | if [ "x$GPUS" = "x" ]; then 5 | GPUS=8 6 | fi 7 | 8 | if [ "x$BATCH_SIZE_SM" = "x" ]; then 9 | BATCH_SIZE_SM=150 10 | fi 11 | 12 | if [ "x$BATCH_SIZE_LG" = "x" ]; then 13 | BATCH_SIZE_LG=150 14 | fi 15 | 16 | GPUS_ZERO_BASE=$(($GPUS-1)) 17 | 18 | if [ "x$HOSTS" = "x" ]; then 19 | HOSTS=localhost:1 20 | fi 21 | 22 | if [ "x$SSH_IFACE" = "x" ]; then 23 | SSH_IFACE=enp226s0 24 | fi 25 | 26 | IFS=',' read -r -a HOST_ARRAY <<< "$HOSTS" 27 | 28 | HOST_COUNT=${#HOST_ARRAY[@]} 29 | 30 | #remove trailing comma when passing the argument 31 | for i in ${HOST_ARRAY[@]}; do 32 | HOST_STRING+="$i:$GPUS," 33 | done 34 | 35 | mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images 36 | mkdir -p /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images 37 | mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline 38 | mkdir -p /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline 39 | mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline.idx 40 | mkdir -p /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline.idx 41 | 42 | imagine create-images --width 3840 --height 2160 --count $(($GPUS*1000)) --size /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images 4k_image_ jpg 43 | imagine create-images --width 800 --height 600 --count $(($GPUS*1000)) --size /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images small_image_ jpg 44 | 45 | imagine create-tfrecord --img-per-file 1000 /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline tfrecord- 46 | imagine create-tfrecord --img-per-file 1000 /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline tfrecord- 47 | 48 | for i in $(seq 0 $GPUS_ZERO_BASE); do /dali/tools/tfrecord2idx /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-$i /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline.idx/tfrecord-$i; done 49 | for i in $(seq 0 $GPUS_ZERO_BASE); do /dali/tools/tfrecord2idx /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-$i /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline.idx/tfrecord-$i; done 50 | 51 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE /tests/call_dali_multi.sh 
$BATCH_SIZE_SM /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images $GPUS 52 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE sysctl vm.drop_caches=3 53 | 54 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE /tests/call_dali_multi.sh $BATCH_SIZE_LG /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images $GPUS 55 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE sysctl vm.drop_caches=3 56 | 57 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE /tests/call_dali_multi.sh $BATCH_SIZE_SM "/mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-*" $GPUS 58 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE sysctl vm.drop_caches=3 59 | 60 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE /tests/call_dali_multi.sh $BATCH_SIZE_LG "/mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-*" $GPUS 61 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE sysctl vm.drop_caches=3 62 | 63 | rm -r /mnt/fs_under_test/imageinary_data 64 | -------------------------------------------------------------------------------- /bobber/lib/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | # Larger base stage with required items for building various tools 3 | FROM nvcr.io/nvidia/cuda:11.2.0-devel-ubuntu20.04 as build 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # Install all required build dependencies 8 | RUN apt-get update && apt-get -y install apt-utils && rm -rf /var/lib/apt/lists/* 9 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 10 | swig \ 11 | bison \ 12 | gcc \ 13 | libgfortran4 \ 14 | pkg-config \ 15 | autotools-dev \ 16 | debhelper \ 17 | automake \ 18 | m4 \ 19 | gfortran \ 20 | tk \ 21 | flex \ 22 | libltdl-dev \ 23 | autoconf \ 24 | dpatch \ 25 | graphviz \ 26 | tcl \ 27 | chrpath \ 28 | libglib2.0-0 \ 29 | python-libxml2 \ 30 | build-essential \ 31 | cmake \ 32 | git \ 33 | curl \ 34 | wget \ 35 | ca-certificates \ 36 | iputils-ping \ 37 | net-tools \ 38 | ethtool \ 39 | perl \ 40 | lsb-release \ 41 | iproute2 \ 42 | pciutils \ 43 | kmod \ 44 | libnuma1 \ 45 | lsof \ 46 | libopenmpi-dev && \ 47 | rm -rf /var/lib/apt/lists/* 48 | 49 | # Compile NVIDIA's NCCL tests 50 | RUN git clone https://github.com/NVIDIA/nccl-tests && \ 51 | cd nccl-tests/ && \ 52 | git reset --hard ec1b5e22e618d342698fda659efdd5918da6bd9f && \ 53 | make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi 54 | 55 | # Compile OSU microbenchmarks 56 | RUN wget --no-check-certificate https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.6.2.tar.gz && \ 57 | tar zxf 
osu-micro-benchmarks-5.6.2.tar.gz && \ 58 | cd osu-micro-benchmarks-5.6.2 && \ 59 | ./configure CC=/usr/bin/mpicc CXX=/usr/bin/mpicxx --enable-cuda --with-cuda-include=/usr/local/cuda/include --with-cuda-libpath=/usr/local/cuda/lib64 && \ 60 | make && \ 61 | make install && \ 62 | rm -rf ../*.tar.gz 63 | 64 | # Build IO500, IOR, and mdtest 65 | RUN git clone https://github.com/jyvet/io-500-dev && \ 66 | cd io-500-dev && \ 67 | git reset --hard 0232acfa8e64f7c543db8930dd279009ec9c32bc && \ 68 | utilities/prepare.sh 69 | 70 | # Lighter runtime stage copying only necessary build artifacts from earlier 71 | FROM nvcr.io/nvidia/cuda:11.2.0-runtime-ubuntu20.04 72 | 73 | ENV DEBIAN_FRONTEND=noninteractive 74 | 75 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 76 | openssh-client \ 77 | openssh-server \ 78 | git \ 79 | fio \ 80 | psmisc \ 81 | libopenmpi-dev \ 82 | openmpi-bin \ 83 | python \ 84 | python3-dev \ 85 | python3-pip \ 86 | python3-distutils && \ 87 | rm -rf /var/lib/apt/lists/* 88 | 89 | # Set default NCCL parameters 90 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 91 | 92 | # Install OpenSSH for MPI to communicate between containers 93 | RUN mkdir -p /var/run/sshd && \ 94 | mkdir -p /root/.ssh && \ 95 | echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config && \ 96 | echo "UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 97 | sed -i 's/^#*Port 22/Port 2222/' /etc/ssh/sshd_config && \ 98 | echo "HOST *" >> /root/.ssh/config && \ 99 | echo "PORT 2222" >> /root/.ssh/config && \ 100 | ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" && \ 101 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 102 | chmod 700 /root/.ssh && \ 103 | chmod 600 /root/.ssh/* 104 | 105 | WORKDIR / 106 | 107 | # Copy the compiled nccl-tests binaries to the runtime image 108 | COPY --from=build /nccl-tests/build /nccl-tests/build 109 | 110 | # Copy the compiled OSU microbenchmarks to the runtime image 111 | COPY --from=build /usr/local/libexec/osu-micro-benchmarks/mpi/collective/ /usr/local/libexec/osu-micro-benchmarks/mpi/collective/ 112 | 113 | # Copy the compiled IO500 binaries to the runtime image 114 | COPY --from=build /io-500-dev/bin /io-500-dev/bin 115 | 116 | RUN git clone https://github.com/NVIDIA/DALI dali && \ 117 | cd dali/ && \ 118 | git reset --hard fd30786d773d08185d78988b2903dce2ace0a00b 119 | 120 | RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools && \ 121 | python3 -m pip install --no-cache-dir nvidia-pyindex && \ 122 | python3 -m pip install --no-cache-dir \ 123 | nvidia-imageinary['tfrecord']>=1.1.2 \ 124 | nvidia-dali-cuda110 125 | 126 | COPY test_scripts /tests/ 127 | 128 | EXPOSE 2222 129 | -------------------------------------------------------------------------------- /docs/parsing.md: -------------------------------------------------------------------------------- 1 | # Parsing 2 | Bobber includes a couple different parsers which can be used to easily verify 3 | performance results from a test. By pointing Bobber to the directory where 4 | results were saved, aggregate values per system-count level will be displayed. 5 | 6 | The output displays a table with the aggregate results among all iterations per 7 | number of nodes tested. For example, if 10 iterations were run, the 1-node 8 | results will be the average values for all test runs for a single node. As the 9 | node count goes up, the results reflect the aggregate value for all nodes that 10 | were tested. 
The table is automatically generated based on the values above to 11 | make it possible to view how results scale with additional node counts. 12 | 13 | ``` 14 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 15 | | Test | 1 Node(s) | 2 Node(s) | 3 Node(s) | 4 Node(s) | 5 Node(s) | 6 Node(s) | 7 Node(s) | 8 Node(s) | Scale | 16 | +====================================================+=============+=============+=============+=============+=============+=============+=============+=============+=========+ 17 | | FIO Read (GB/s) - 1MB BS | 7.996 | 18.208 | 22.707 | 23.43 | 34.916 | 37.28 | 44.941 | 45.316 | 1.67X | 18 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 19 | | FIO Write (GB/s) - 1MB BS | 4.439 | 5.291 | 5.46 | 5.6 | 7.116 | 7.444 | 7.588 | 7.486 | 1.11X | 20 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 21 | | FIO Read (k IOPS) - 4K BS | 306.9 | 515.2 | 546.9 | 566.1 | 625 | 638.6 | 790.8 | 978.8 | 1.25X | 22 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 23 | | FIO Write (k IOPS) - 4K BS | 295.5 | 437.9 | 445.5 | 427 | 474.4 | 474.7 | 502.4 | 484.2 | 1.07X | 24 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 25 | | NCCL Max BW (GB/s) | 235.253 | 141.883 | 140.335 | 140.731 | 140.083 | 140.966 | 139.593 | 140.715 | N/A | 26 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 27 | | DALI Standard 800x600 throughput (images/second) | 5821.49 | 11849.7 | 17719.4 | 23654.6 | 29508.7 | 35501.1 | 41282.8 | 47250.2 | 2.02X | 28 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 29 | ``` 30 | 31 | ### Parser Assumptions 32 | The auto-parser makes the following assumptions: 33 | * The Bobber version must match for all files in a directory so results aren't 34 | getting mixed. This can be overriden with `--override-version-check` while 35 | calling the script. 36 | * If a result file is invalid or missing data, it is skipped and not included 37 | with the results. The average results will reflect the limited number of valid 38 | results. 39 | * The lowest N-results in DALI tests are dropped for N-nodes. These results 40 | are part of a known warm-up period for DALI and do not indicate actual 41 | performance. 42 | * The scale 43 | 44 | ## Parsing MLPerf 45 | This repository includes a Python package that can quickly and easily parse 46 | MLPerf results. Note that MLPerf is **not** included in Bobber though results 47 | from the ResNet50 image classification benchmarks can be parsed here. 
48 | 49 | ```bash 50 | $ python3 bobber/lib/analysis/parse-mlperf.py path_to_results/ 51 | MLPerf Results: 52 | Directory name: path_to_results/ 53 | Number of iterations: 5 54 | Nodes tested: 8 55 | Epoch 0: 56 | Speed: 80113.842 images/second 57 | Average time: 15.901 seconds 58 | Overall: 59 | Speed: 148134.457 images/second 60 | Average time: 7.116 minutes 61 | ``` 62 | 63 | The output displays the aggregate results among all MLPerf test passes and finds 64 | the average speed and times for all runs. Results for both Epoch 0 and overall 65 | numbers are displayed to provide different insights. Epoch 0 is helpful to best 66 | identify the storage performance as images are likely not to be cached in the 67 | system. 68 | 69 | ### Parser Assumptions 70 | The parser makes the following assumptions: 71 | * The parser assumes the directory only contains results for a single test 72 | sweep for a set number of nodes (ie. all results are from a 10-iteration test 73 | for 4 nodes and no results from different node counts are included). 74 | * The elapsed time is found by taking the difference between the start time 75 | for the first epoch (Epoch 0) and the stop time for the last epoch. 76 | * All results are averaged together based on the number of results in the 77 | directory. 78 | -------------------------------------------------------------------------------- /bobber/test_scripts/setup_fio.sh: -------------------------------------------------------------------------------- 1 | drop_caches () { 2 | 3 | DC=0 4 | case $FSTYPE in 5 | gpfs) 6 | echo "Cannot drop cache on GPFS" 7 | ;; 8 | lustre|nfs|ext4|wekafs) 9 | DC=1 10 | ;; 11 | *) 12 | echo "Unable to determine how to drop cache on FSTYPE: $FSTYPE, dropping anyway" 13 | DC=1 14 | ;; 15 | esac 16 | if [ $DC -eq 1 ]; then 17 | echo "Starting Drop Caches: $(date)" 18 | declare -a pidlist 19 | unset pidlist 20 | for N in ${FIO_NODELIST}; do 21 | ssh $N $SSHOPTS /sbin/sysctl vm.drop_caches=3 & 22 | p=$! 23 | pidlist=(${pidlist[@]} $p) 24 | done 25 | wait ${pidlist[@]} 26 | echo "Ending Drop Caches: $(date)" 27 | fi 28 | } 29 | 30 | stop_servers () { 31 | 32 | declare -a pidlist 33 | pidlist="" 34 | for N in $FIO_NODELIST; do 35 | echo "Killing Server on $N" 36 | 37 | if [ "$N" == "localhost" ]; then 38 | killall fio 39 | else 40 | ssh ${SSHOPTS} $N killall fio > /dev/null 2>&1 & 41 | fi 42 | p=$! 43 | pidlist=(${pidlist[@]} $p) 44 | done 45 | wait ${pidlist[@]} 46 | } 47 | 48 | start_servers () { 49 | 50 | if [ x"$NO_FIO_SERVER" != x"1" ]; then 51 | declare -a pidlist 52 | pidlist="" 53 | for N in $FIO_NODELIST; do 54 | echo "Launching Server on $N" 55 | 56 | if [ "$N" == "localhost" ]; then 57 | $FIOBIN --server --daemonize=/tmp/pidfile.$$ > /dev/null 2>&1 & 58 | else 59 | ssh ${SSHOPTS} $N $FIOBIN --server --daemonize=/tmp/pidfile.$$ > /dev/null 2>&1 & 60 | fi 61 | p=$! 
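        # $! holds the PID of the fio server launch that was just backgrounded
        # (either locally or over ssh); each PID is collected so that all of the
        # launches can be awaited together with `wait` once the loop finishes.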
62 | pidlist=(${pidlist[@]} $p) 63 | done 64 | wait ${pidlist[@]} 65 | else 66 | echo "Not Starting FIO Server" 67 | fi 68 | } 69 | 70 | create_jobfile () { 71 | 72 | # Write job to stdout 73 | echo "[${NAME}]" 74 | for O in $@; do 75 | if [ "$O" != "--create_jobfile" ]; then 76 | echo $O | sed 's/^\-\-//g' 77 | fi 78 | done 79 | } 80 | 81 | launch_fio () { 82 | 83 | echo "Command: " 84 | echo $FIOBIN $@ 85 | 86 | # Create Job File 87 | JOBFN=.jobfn.$$ 88 | create_jobfile $@ > $JOBFN 89 | cat $JOBFN 90 | 91 | if [ x"$NO_FIO_SERVER" != x"1" ]; then 92 | 93 | # Run Jobfile 94 | MFILE=/tmp/mfile.$$ 95 | rm -f $MFILE 96 | echo $FIO_NODELIST | tr ' ' '\n' > $MFILE 97 | 98 | $FIOBIN --client=$MFILE $JOBFN 99 | 100 | # Cleanup job file 101 | rm -rf $JOBFN 102 | rm -f $MFILE 103 | else 104 | taskset -c 0-23,48-71 $FIOBIN $JOBFN 105 | fi 106 | } 107 | 108 | #Filesystem type 109 | export FSTYPE=$(df -T $FSDIR | tail -1 | awk '{print $2}') 110 | # Set Size of file per thread 111 | export SIZE=${SIZE:-$(( 4096 * 1024 ))} 112 | # Set Size of each IO in KB 113 | export IOSIZE=${IOSIZE:-1024} 114 | # Set size of the IOs for file creation in KB 115 | export CREATE_IOSIZE=${CREATE_IOSIZE:-1024} 116 | # Number of Files per job 117 | export NRFILES=${NRFILES:-256} 118 | # Use DirectIO? 119 | export DIRECTIO=${DIRECTIO:-0} 120 | # Use MMAP IO? 121 | export MMAPIO=${MMAPIO:-0} 122 | # Set IODepth for DirectIO cases 123 | export IODEPTH=${IODEPTH:-16} 124 | # Set the invalidate flag or not, default is yes 125 | export INVALIDATE=${INVALIDATE:-1} 126 | # Set SSH options 127 | export SSHOPTS=${SSHOPTS:-"-o StrictHostKeyChecking=no"} 128 | # Set extra flags, if present 129 | export EXTRA_FLAGS=${EXTRA_FLAGS:-""} 130 | # Set JobName 131 | export NAME=${NAME:-iotest} 132 | # Set DirectIO settings if needed, allow for IOENGINE flexibility 133 | export IOENGINE=${IOENGINE:-posixaio} 134 | IOSETTINGS="" 135 | 136 | if [ $DIRECTIO -eq 1 ] && [ $MMAPIO -eq 1 ]; then 137 | echo "ERROR, unable to use both Direct IO and MMAP I/O simultaenously. Exiting" 138 | exit 1 139 | fi 140 | 141 | if [ $DIRECTIO -eq 1 ]; then 142 | IOSETTINGS="--direct=${DIRECTIO} --ioengine=${IOENGINE} --iodepth=${IODEPTH}" 143 | fi 144 | 145 | if [ $MMAPIO -eq 1 ]; then 146 | IOSETTINGS="--ioengine=mmap" 147 | fi 148 | 149 | # Set FSYNC if needed 150 | if [ x"$FSYNC" != x"" ]; then 151 | FSYNC="--fsync=${FSYNC}" 152 | fi 153 | 154 | #### Settings to run 155 | FIOBIN=${FIOBIN:-fio} 156 | 157 | if [ x"$(which $FIOBIN)" == x"" ]; then 158 | echo "ERROR: Enable to find fio binary at <$FIOBIN>. Set with FIOBIN. 
Exiting" 159 | exit 160 | fi 161 | 162 | FIODIR=$(cd $(dirname $(which $FIOBIN)) && pwd) 163 | FIOBIN=$FIODIR/$(basename $FIOBIN) 164 | 165 | DATETAG=$(date +%Y%m%d%H%M%S) 166 | 167 | export STDOPTS="--create_serialize=0 --fallocate=none --group_reporting=1 --disable_lat=1 --disable_clat=1 --disable_slat=1 --startdelay=5 --ramp_time=3 --runtime=180 --time_based=1" 168 | 169 | echo "IOTEST Settings:" 170 | for E in FSDIR FSTYPE NJOBS SIZE IOSIZE NRFILES DIRECTIO MMAPIO IOSETTINGS INVALIDATE FSYNC STDOPTS FIOBIN DATETAG SSHOPTS RUNTIME EXTRA_FLAGS; do 171 | eval V=\$$E 172 | echo $E | awk '{printf("%-12s: ", $1);}' 173 | echo $V 174 | done 175 | echo "" 176 | 177 | ########## Create 178 | WORKDIR=$FSDIR/fiodir.$DATETAG 179 | echo "Creating output directory $WORKDIR" 180 | mkdir $WORKDIR 181 | 182 | ########## Use nodelist from Bobber 183 | FIO_NODELIST=$HOSTS_WITH_SPACES 184 | echo "NCOUNT : $(echo $FIO_NODELIST | wc -w)" 185 | 186 | if [ x"$(which numactl)" != x"" ]; then 187 | numactl --show 188 | fi 189 | 190 | echo "FIO_NODELIST: $FIO_NODELIST" 191 | -------------------------------------------------------------------------------- /bobber/lib/analysis/meta.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | 4 | 5 | def avg(stats: list) -> float: 6 | """ 7 | Find the average of a list. 8 | 9 | Given a list of numbers, calculate the average of all values in the list. 10 | If the list is empty, default to 0.0. 11 | 12 | Parameters 13 | ---------- 14 | input_list : list 15 | A ``list`` of ``floats`` to find an average of. 16 | 17 | Returns 18 | ------- 19 | float 20 | Returns a ``float`` of the average value of the list. 21 | """ 22 | if len(stats) > 0: 23 | return sum(stats) / len(stats) 24 | else: 25 | return 0.0 26 | 27 | 28 | def pull_stats(summary: list) -> dict: 29 | """ 30 | Convert stats to a dictionary. 31 | 32 | Each line in the summary table in the log file needs to be parsed by first 33 | converting the table to a comma-separated list for easy parsing, then 34 | taking the first column as the statistical category and placing the 35 | remaining values into maximum, minimum, mean, and standard deviation. 36 | 37 | Parameters 38 | ---------- 39 | summary : list 40 | A ``list`` of ``strings`` representing each line in the summary table 41 | of the metadata file. 42 | 43 | Returns 44 | ------- 45 | dict 46 | Returns a ``dictionary`` of the converted table. 47 | """ 48 | results = {} 49 | 50 | for stat in summary: 51 | # Convert the table to a comma-separated list to make it easier to 52 | # parse. 53 | stat = stat.replace(':', '') 54 | stat_csv = re.sub(' +', ',', stat.strip()) 55 | components = stat_csv.split(',') 56 | key, max_val, min_val, mean, stdev = components 57 | results[key] = { 58 | 'max': float(max_val), 59 | 'min': float(min_val), 60 | 'mean': float(mean), 61 | 'stdev': float(stdev) 62 | } 63 | return results 64 | 65 | 66 | def parse_summary(log_contents: str) -> list: 67 | """ 68 | Pull the summary table from the metadata log. 69 | 70 | The bottom of the metadata log contains a summary table with all of the 71 | individual metadata operations and the results from the test. This table is 72 | denoted by a line of '-' signs and is ended with '-- finished'. Since these 73 | lines are used to make parsing easier, they should be dropped in the end. 74 | 75 | Parameters 76 | ---------- 77 | log_contents : str 78 | A ``string`` of the contents of the entire contents of a metadata log 79 | file. 
80 | 81 | Returns 82 | ------- 83 | list 84 | Returns a ``list`` of ``strings`` representing each line in the summary 85 | table. 86 | """ 87 | summary = re.findall('--------- .*-- finished', 88 | log_contents, re.DOTALL) 89 | if len(summary) == 0: 90 | return None 91 | # `summary` is a single-element list where the element is a list of all of 92 | # the metadata stats. The first and last lines are unecessary as they are 93 | # only used to parse the table and can be dropped. 94 | summary = summary[0].split('\n')[1:-1] 95 | return summary 96 | 97 | 98 | def aggregate_results(combined_results: list) -> dict: 99 | """ 100 | Find the aggregate results for all categories. 101 | 102 | Parse every result from the metadata log files and capture the min, max, 103 | and mean for each operation for all iterations in a single object. 104 | 105 | Parameters 106 | ---------- 107 | combined_results : list 108 | A ``list`` of ``dictionaries`` containing the results from each summary 109 | table in each log file. 110 | 111 | Returns 112 | ------- 113 | dict 114 | Returns a ``dictionary`` of the final aggregate results for each 115 | operation in the summary tables of all logs. 116 | """ 117 | final_aggregate = {} 118 | 119 | if len(combined_results) == 0: 120 | return final_aggregate 121 | 122 | for key, stats in combined_results[0].items(): 123 | key_metrics = [stat[key] for stat in combined_results] 124 | final_aggregate[key] = { 125 | 'max': max([result['max'] for result in key_metrics]), 126 | 'min': min([result['min'] for result in key_metrics]), 127 | 'mean': avg([result['mean'] for result in key_metrics]) 128 | } 129 | return final_aggregate 130 | 131 | 132 | def parse_meta_file(log_files: list, systems: int, results: dict) -> dict: 133 | """ 134 | Parse the metadata results from the metadata logs. 135 | 136 | Search through each metadata log and extract the operations in the summary 137 | table, saving the aggregate results in a dictionary. 138 | 139 | Parameters 140 | ---------- 141 | log_files : list 142 | A ``list`` of ``strings`` of the filename of each metadata log file in 143 | the results directory. 144 | systems : int 145 | An ``integer`` of the number of systems used during the current test. 146 | results : dict 147 | A ``dictionary`` of the aggregate metadata results for each system 148 | count. 149 | 150 | Returns 151 | ------- 152 | dict 153 | Returns an updated ``dictionary`` including the aggregate metadata 154 | results for N-systems. 155 | """ 156 | combined_results = [] 157 | 158 | for log in log_files: 159 | with open(log, 'r') as f: 160 | log_contents = f.read() 161 | summary = parse_summary(log_contents) 162 | if not summary: 163 | print(f'Warning: Invalid results found in {log} log file.') 164 | print('Skipping...') 165 | continue 166 | stats = pull_stats(summary) 167 | combined_results.append(stats) 168 | results[systems] = aggregate_results(combined_results) 169 | return results 170 | -------------------------------------------------------------------------------- /docs/docker.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | This document demonstrates how to verify Docker installations and proper GPU 3 | functionality for Docker containers. 4 | 5 | ## Docker installation/upgrade 6 | This project requires Docker version 19.03 or newer to be installed. 
Check the 7 | version of Docker installed on the system with 8 | 9 | ```bash 10 | docker --version 11 | ``` 12 | 13 | If the version is 19.03 or newer, you may continue to the next sub-section. 14 | 15 | If your Docker version is older than 19.03, or Docker is not installed, follow 16 | the steps listed on Docker's website for 17 | [upgrading the Docker client](https://docs.docker.com/engine/install/ubuntu/), 18 | which are copied below for reference: 19 | 20 | First, remove any existing installations: 21 | 22 | ```bash 23 | sudo apt-get remove docker docker-engine docker.io containerd runc 24 | ``` 25 | 26 | Next, install required dependencies and add the Docker GPG key: 27 | 28 | ```bash 29 | sudo apt-get update 30 | 31 | sudo apt-get install \ 32 | apt-transport-https \ 33 | ca-certificates \ 34 | curl \ 35 | gnupg-agent \ 36 | software-properties-common 37 | 38 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 39 | ``` 40 | 41 | Lastly, add the stable repository and install Docker: 42 | 43 | ```bash 44 | sudo add-apt-repository \ 45 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ 46 | $(lsb_release -cs) \ 47 | stable" 48 | 49 | sudo apt-get update 50 | 51 | sudo apt-get install docker-ce docker-ce-cli containerd.io 52 | ``` 53 | 54 | ## Docker permissions 55 | By default, only the `root` user is able to use Docker. To enable other users to 56 | use Docker without `sudo`, the user must be added to the `docker` group: 57 | 58 | ```bash 59 | sudo usermod -aG docker $USER 60 | newgrp docker 61 | ``` 62 | 63 | Verify your user is now able to interact directly with Docker without `sudo`: 64 | 65 | ```bash 66 | $ docker images 67 | REPOSITORY TAG IMAGE ID CREATED SIZE 68 | ``` 69 | 70 | ## Install NVIDIA-Docker 71 | In order to access GPUs inside Docker containers, the `nvidia-docker` package 72 | needs to be installed on all systems. The following installs the package 73 | (taken from [the nvidia-docker docs](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker)): 74 | 75 | ```bash 76 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) && \ 77 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - && \ 78 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 79 | sudo apt update 80 | sudo apt install nvidia-docker2 81 | sudo systemctl restart docker 82 | ``` 83 | 84 | ## Testing containers 85 | Once Docker is fully installed, ensure GPUs are accessible from containers by 86 | pulling a CUDA container and running `nvidia-smi`: 87 | 88 | ```bash 89 | docker run --rm -it --gpus all nvidia/cuda:11.0-base nvidia-smi 90 | ``` 91 | 92 | This should output information on the GPUs installed on a system, similar to 93 | below: 94 | 95 | ``` 96 | +-----------------------------------------------------------------------------+ 97 | | NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 | 98 | |-------------------------------+----------------------+----------------------+ 99 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 100 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 101 | | | | MIG M. 
| 102 | |===============================+======================+======================| 103 | | 0 A100-SXM4-40GB On | 00000000:07:00.0 Off | 0 | 104 | | N/A 31C P0 62W / 400W | 0MiB / 40537MiB | 0% Default | 105 | | | | Disabled | 106 | +-------------------------------+----------------------+----------------------+ 107 | | 1 A100-SXM4-40GB On | 00000000:0F:00.0 Off | 0 | 108 | | N/A 29C P0 60W / 400W | 0MiB / 40537MiB | 0% Default | 109 | | | | Disabled | 110 | +-------------------------------+----------------------+----------------------+ 111 | | 2 A100-SXM4-40GB On | 00000000:47:00.0 Off | 0 | 112 | | N/A 30C P0 63W / 400W | 0MiB / 40537MiB | 0% Default | 113 | | | | Disabled | 114 | +-------------------------------+----------------------+----------------------+ 115 | | 3 A100-SXM4-40GB On | 00000000:4E:00.0 Off | 0 | 116 | | N/A 30C P0 60W / 400W | 0MiB / 40537MiB | 0% Default | 117 | | | | Disabled | 118 | +-------------------------------+----------------------+----------------------+ 119 | | 4 A100-SXM4-40GB On | 00000000:87:00.0 Off | 0 | 120 | | N/A 34C P0 64W / 400W | 0MiB / 40537MiB | 0% Default | 121 | | | | Disabled | 122 | +-------------------------------+----------------------+----------------------+ 123 | | 5 A100-SXM4-40GB On | 00000000:90:00.0 Off | 0 | 124 | | N/A 33C P0 66W / 400W | 0MiB / 40537MiB | 0% Default | 125 | | | | Disabled | 126 | +-------------------------------+----------------------+----------------------+ 127 | | 6 A100-SXM4-40GB On | 00000000:B7:00.0 Off | 0 | 128 | | N/A 34C P0 61W / 400W | 0MiB / 40537MiB | 0% Default | 129 | | | | Disabled | 130 | +-------------------------------+----------------------+----------------------+ 131 | | 7 A100-SXM4-40GB On | 00000000:BD:00.0 Off | 0 | 132 | | N/A 33C P0 58W / 400W | 0MiB / 40537MiB | 0% Default | 133 | | | | Disabled | 134 | +-------------------------------+----------------------+----------------------+ 135 | 136 | +-----------------------------------------------------------------------------+ 137 | | Processes: | 138 | | GPU GI CI PID Type Process name GPU Memory | 139 | | ID ID Usage | 140 | |=============================================================================| 141 | | No running processes found | 142 | +-----------------------------------------------------------------------------+ 143 | ``` -------------------------------------------------------------------------------- /docs/non_dgx_support.md: -------------------------------------------------------------------------------- 1 | # Non-DGX Support 2 | While Bobber supports both the NVIDIA DGX A100 and the DGX-2 platforms out of 3 | the box, it can also be run on most non-DGX Linux-based platforms with at least 4 | one NVIDIA Turing-based architecture or newer GPU installed. Depending on the 5 | systems being tested, a few parameters will need to be updated for Bobber to run 6 | as intended. This guide provides information on how to find the parameters to 7 | use: 8 | 9 | ## GPU Count 10 | By default, Bobber expects 8 GPUs in a system, similar to the DGX A100. If a 11 | system has a different number of GPUs installed than the default value, it will 12 | need to be specified by passing the `--gpus N` flag to any of the test Bobber 13 | test commands. To find the number of NVIDIA GPUs available, use `nvidia-smi` to 14 | list system-level GPU information. 
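If only the total count is needed (for example, to decide what to pass to
`--gpus`), the same query can be piped through `wc -l`. This is a generic shell
one-liner rather than a Bobber command:

```bash
nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l
```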
The following will list the GPUs installed on 15 | a system: 16 | 17 | ```bash 18 | $ nvidia-smi --query-gpu=gpu_name --format=csv,noheader 19 | Quadro RTX 8000 20 | Quadro RTX 8000 21 | ``` 22 | 23 | In the example above, the system has two RTX 8000 GPUs available. To run a test 24 | with this system, the `--gpus 2` flag will need to be passed, similar to the 25 | following: 26 | 27 | ```bash 28 | $ bobber run-all --gpus 2 /home/user/logs test-machine-1 29 | ``` 30 | 31 | At present, Bobber assumes all test systems in a cluster have the **same** 32 | number of GPUs available. To run a test pass with multiple nodes that all have 33 | two GPUs, run the following: 34 | 35 | ```bash 36 | $ bobber run-all --gpus 2 /home/user/logs test-machine-1,test-machine-2,... 37 | ``` 38 | 39 | ## SSH Interface 40 | While not important for single-node tests, the `--ssh-iface` flag is used to 41 | tell Bobber which network interface to use to communicate with other test nodes 42 | for multi-node tests. This can be found by using the `ip link show` command to 43 | list the active network interfaces on a system: 44 | 45 | ```bash 46 | $ ip link show | grep "state UP" 47 | 2: enp67s0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 48 | 4: wlo2: mtu 1500 qdisc noqueue state UP mode DORMANT group default qlen 1000 49 | ``` 50 | 51 | The example above shows two interfaces are "UP" - `enp67s0` which is a wired 52 | connection and `wlo2` which is a wireless connection. For this system, the wired 53 | connection is desired as it should provide better stability and performance. In 54 | general, the chosen interface should also be the primary management interface 55 | used by the operating system. 56 | 57 | To use `enp67s0` as the interface per the example above, tests can be started 58 | with: 59 | 60 | ```bash 61 | bobber run-all --ssh-iface enp67s0 /home/user/logs test-machine-1,test-machine-2,... 62 | ``` 63 | 64 | ## DALI Batch Sizes 65 | The DALI preprocesses large and small images which is typical of ResNet50 66 | workflows. Depending on the amount of GPU memory available, the DALI tests could 67 | run out of memory and terminate prematurely. Lowering the batch size for DALI 68 | allows the GPUs to allocate less memory to the test, enabling the process to 69 | complete as intended. It is recommended to attempt the tests once with the 70 | default batch sizes to verify successful completion. If the GPUs ran out of 71 | memory, a line similar to the following will be shown in the test log (note 72 | that the line number (`11`) and process number (`5349`) may differ): 73 | 74 | ``` 75 | /tests/call_dali_multi.sh: line 11: 5349 Killed 76 | ``` 77 | 78 | This will also be accompanied by the following mpirun error: 79 | 80 | ``` 81 | -------------------------------------------------------------------------- 82 | Primary job terminated normally, but 1 process returned 83 | a non-zero exit code. Per user-direction, the job has been aborted. 84 | -------------------------------------------------------------------------- 85 | -------------------------------------------------------------------------- 86 | mpirun detected that one or more processes exited with non-zero status, thus causing 87 | the job to be terminated. 
The first process to do so was: 88 | Process name: [[37996,1],0] 89 | Exit code: 137 90 | -------------------------------------------------------------------------- 91 | ``` 92 | 93 | If this error is shown, drop the batch size by half and restart the test to see 94 | if it completes successfully. Continue this process until the test is able to 95 | run. While it is possible both the small and large image sizes are causing the 96 | GPUs to run out of memory, it is much more likely that the large image batch 97 | size needs to be dropped. The default batch size for large images is `256` and 98 | for small images it is `512`. Specify the batch sizes with the following: 99 | 100 | ```bash 101 | $ bobber run-dali --batch-size-lg 128 --batch-size-sm 256 /home/user/logs test-machine-1,test-machine-2,... 102 | # OR 103 | $ bobber run-all --batch-size-lg 128 --batch-size-sm 256 /home/user/logs test-machine-1,test-machine-2,... 104 | ``` 105 | 106 | ## FIO Thread Flags 107 | Depending on the performance of the filesystem under test in addition to the 108 | CPUs, the FIO tests might stall, though this is very unlikely. If any of the FIO 109 | tests are stuck for a long time (10 minutes or more), the thread counts for both 110 | IOPS and bandwidth tests can be dropped to a lower level. These can be specified 111 | with the `--iops-threads` and `--bw-threads` flags. Note that for high 112 | performance filesystems and beefy compute nodes, these values can also be 113 | increased to attempt to achieve higher test results. The flags can be specified 114 | as follows: 115 | 116 | ```bash 117 | $ bobber run-all --iops-threads 100 --bw-threads 32 /home/user/logs test-machine-1,test-machine-2,... 118 | ``` 119 | 120 | ## NCCL HCAs 121 | The NCCL tests use specific HCAs to communicate across nodes. At present, this 122 | requires NVIDIA Mellanox network adapters connected between nodes either 123 | directly or via a network switch. For most server configurations, there will be 124 | a dedicated compute fabric used for high-speed communication between nodes. 125 | These adapters should be targeted for the NCCL tests. To find the appropriate 126 | adapters, run `ibdev2netdev` to find the HCA device name that corresponds to the 127 | network device name for the compute network. For example, consider the following 128 | output: 129 | 130 | ```bash 131 | $ ibdev2netdev 132 | mlx5_0 ==> ib0 (Up) 133 | mlx5_1 ==> ib1 (Up) 134 | mlx5_10 ==> ib10 (Up) 135 | mlx5_11 ==> ib11 (Up) 136 | mlx5_2 ==> ib2 (Up) 137 | mlx5_3 ==> ib3 (Up) 138 | mlx5_4 ==> ib4 (Up) 139 | mlx5_5 ==> ib5 (Up) 140 | mlx5_6 ==> ib6 (Up) 141 | mlx5_7 ==> ib7 (Up) 142 | mlx5_8 ==> ib8 (Up) 143 | mlx5_9 ==> ib9 (Up) 144 | ``` 145 | 146 | If the compute network is on adapters ib0-ib7, the devices to use for NCCL are 147 | `mlx5_0`, `mlx5_1`, ..., `mlx5_7`, matching the list above. To use these devices 148 | for the NCCL tests, the `--nccl-ib-hcas` flag needs to be passed, similar to the 149 | following. Note that the devices are separated with commas and no spaces: 150 | 151 | ```bash 152 | $ bobber run-all --nccl-ib-hcas mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 /home/user/logs test-machine-1,test-machine-2,... 153 | # OR 154 | $ bobber run-nccl --nccl-ib-hcas mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 /home/user/logs test-machine-1,test-machine-2,... 
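# If the compute fabric NICs follow the ib0-ib7 naming shown above, the HCA
# list can also be assembled with a generic shell sketch (not a Bobber feature;
# adjust the interface pattern to match your fabric):
#   HCAS=$(ibdev2netdev | awk '$3 ~ /^ib[0-7]$/ {print $1}' | paste -sd, -)
#   bobber run-nccl --nccl-ib-hcas "$HCAS" /home/user/logs test-machine-1,test-machine-2,...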
155 | ``` 156 | -------------------------------------------------------------------------------- /docs/baselines.md: -------------------------------------------------------------------------------- 1 | # Baselines 2 | The results parser included with Bobber is able to compare results against a 3 | pre-defined baseline or a custom baseline passed in as a YAML file. By comparing 4 | results against a baseline, it is possible to easily check if a round of tests 5 | meets the expected performance level or if any tests are underperforming. This 6 | is useful to verify new systems are performing as expected or to view if any 7 | changes to the hardware or software affect stability. 8 | 9 | ## Running the baseline comparison 10 | There are two main paths to compare baselines in Bobber, either by using a 11 | built-in baseline config or using a custom file. 12 | 13 | ### Using built-in baselines 14 | To compare against a built-in baseline, use the `--compare-baseline` flag with 15 | the `parse-results` command. To list the possible choices, pass the `--help` 16 | flag as below. The choices are listed in the curly brackets (`{}`): 17 | 18 | ``` 19 | bobber parse-results --help 20 | ... 21 | --compare-baseline {single-dgx-station-baseline} 22 | Compare the values produced by a test run against a pre-defined baseline to verify performance meets an acceptable threshold. 23 | ``` 24 | 25 | To run the comparison against existing results, run the following while updating 26 | the baseline and log directory, if applicable. 27 | 28 | ``` 29 | bobber parse-results --compare-baseline single-dgx-station-baseline results_logs/ 30 | ``` 31 | 32 | ### Using custom baselines 33 | To use a custom baseline, a YAML file needs to be created which specifies the 34 | expected performance for every test. This can be done by running the 35 | `parse-results` command against a directory which will automatically generate a 36 | YAML baseline in the directory named `baseline.yaml`. A 37 | [sample file](sample_baseline.yaml) has also been created as a reference if a 38 | custom baseline is desired. Every custom baseline file must have the following 39 | structure: 40 | 41 | ``` 42 | systems: # This should always be the first line 43 | 1: # This designates all results in the sub-block are specific to a single compute node 44 | bandwidth: # This section is for the FIO bandwidth results in bytes/second 45 | read: 1200000000 # The FIO bandwidth read results in bytes/second 46 | write: 1000000000 # The FIO bandwidth write results in bytes/second 47 | iops: # This section is for the FIO IOPS results in ops/second 48 | read: 100000 # The FIO IOPS read speed in ops/second 49 | write: 100000 # The FIO IOPS write speed in ops/second 50 | nccl: # The maximum bus bandwidth in GB/s for NCCL 51 | max_bus_bw: 230 # The maximum bus bandwidth in GB/s for NCCL 52 | dali: # The average speed in images/second from DALI tests 53 | 800x600 standard jpg: 2000 # The speed in images/second for 800x600 standard JPG images in DALI 54 | 3840x2160 standard jpg: 300 # The speed in images/second for 4K standard JPG images in DALI 55 | 800x600 tfrecord: 2000 # The speed in images/second for 800x600 TFRecords in DALI 56 | 3840x2160 tfrecord: 300 # The speed in images/second for 4K TFRecords in DALI 57 | 2: # Continue the same pattern as above for results specific to two compute nodes, if applicable 58 | ... 
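  # Purely as an illustration (placeholder numbers, not recommended targets),
  # a two-node block follows the same shape as the one-node block above:
  #
  # 2:
  #   bandwidth:
  #     read: 2400000000
  #     write: 2000000000
  #   iops:
  #     read: 200000
  #     write: 200000
  #   nccl:
  #     max_bus_bw: 180
  #   dali:
  #     800x600 standard jpg: 4000
  #     3840x2160 standard jpg: 600
  #     800x600 tfrecord: 4000
  #     3840x2160 tfrecord: 600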
59 | ``` 60 | 61 | The custom results parser will only compare against the system counts that are 62 | provided in the YAML file, meaning if only results for 8 compute nodes are 63 | included in the YAML file, only those results will be compared. As many or as 64 | few system counts as desired can be added to the YAML file to more extensively 65 | compare results at all levels. 66 | 67 | After saving the YAML file locally, run the comparison as follows while updating 68 | the YAML file location and log directory, if applicable: 69 | 70 | ``` 71 | bobber parse-results --custom-baseline baseline.yaml results_log/ 72 | ``` 73 | 74 | ### Adding a tolerance 75 | Both of the baseline methods above allow a custom tolerance to be specified to 76 | give some wiggle-room in the results. Pass a percentage amount to allow below 77 | the baseline. 78 | 79 | Take for example a baseline that expects 10 GB/s from reads using FIO. If the 80 | test results yield 9.8 GB/s, this will be marked as a FAIL. However, if the 81 | tolerance is 5%, this will instead be marked as a PASS as 9.8 GB/s is within 5% 82 | of the expected value of 10 GB/s. 83 | 84 | To add a tolerance, add the `--baseline-tolerance` flag to either of the 85 | commands above. The default tolerance is 0% if not specified, meaning the test 86 | will fail if it is exactly at or below the baseline value. 87 | 88 | ## Baseline results output 89 | Regardless of which baseline method from above is chosen, the results will 90 | compare the performance from the requested results file with the baseline of 91 | choice. The comparison does a simple PASS/FAIL for every result depending on 92 | whether it surpasses performance or not. If at least one result does not meet 93 | performance expectations, the comparison will be marked as failed. 94 | 95 | Example of results that pass every threshold: 96 | 97 | ``` 98 | bobber parse-results --compare-baseline single-dgx-station-baseline log_files/ 99 | 100 | ... 
101 | 102 | ================================================================================ 103 | Baseline assessment 104 | Comparing against "single-dgx-station-baseline" 105 | ================================================================================ 106 | 1 System(s) 107 | -------------------------------------------------------------------------------- 108 | FIO Bandwidth Read (GB/s) 109 | Expected: 1.2, Got: 1.595, Result: PASS 110 | FIO Bandwidth Write (GB/s) 111 | Expected: 1.0, Got: 1.232, Result: PASS 112 | -------------------------------------------------------------------------------- 113 | FIO IOPS Read (k IOPS) 114 | Expected: 100.0, Got: 136.5, Result: PASS 115 | FIO IOPS Write (k IOPS) 116 | Expected: 100.0, Got: 135.0, Result: PASS 117 | -------------------------------------------------------------------------------- 118 | NCCL Max Bus Bandwidth (GB/s) 119 | Expected: 70, Got: 79.86500000000001, Result: PASS 120 | -------------------------------------------------------------------------------- 121 | DALI 800x600 standard jpg (images/second) 122 | Expected: 2000, Got: 2694.595, Result: PASS 123 | DALI 3840x2160 standard jpg (images/second) 124 | Expected: 300, Got: 430.854, Result: PASS 125 | DALI 800x600 tfrecord (images/second) 126 | Expected: 2000, Got: 2665.653, Result: PASS 127 | DALI 3840x2160 tfrecord (images/second) 128 | Expected: 300, Got: 376.862, Result: PASS 129 | ================================================================================ 130 | ``` 131 | 132 | Example of results that fail one or more thresholds: 133 | 134 | ``` 135 | bobber parse-results --custom-baseline sample_baseline.yaml log_files/ 136 | 137 | ... 138 | 139 | ================================================================================ 140 | Baseline assessment 141 | Comparing against a custom config 142 | ================================================================================ 143 | 1 System(s) 144 | -------------------------------------------------------------------------------- 145 | FIO Bandwidth Read (GB/s) 146 | Expected: 7.0, Got: 1.595, Result: FAIL 147 | FIO Bandwidth Write (GB/s) 148 | Expected: 3.0, Got: 1.232, Result: FAIL 149 | -------------------------------------------------------------------------------- 150 | FIO IOPS Read (k IOPS) 151 | Expected: 300.0, Got: 136.5, Result: FAIL 152 | FIO IOPS Write (k IOPS) 153 | Expected: 200.0, Got: 135.0, Result: FAIL 154 | -------------------------------------------------------------------------------- 155 | NCCL Max Bus Bandwidth (GB/s) 156 | Expected: 230, Got: 79.86500000000001, Result: FAIL 157 | -------------------------------------------------------------------------------- 158 | DALI 800x600 standard jpg (images/second) 159 | Expected: 2000, Got: 2694.595, Result: PASS 160 | DALI 3840x2160 standard jpg (images/second) 161 | Expected: 300, Got: 430.854, Result: PASS 162 | DALI 800x600 tfrecord (images/second) 163 | Expected: 2000, Got: 2665.653, Result: PASS 164 | DALI 3840x2160 tfrecord (images/second) 165 | Expected: 300, Got: 376.862, Result: PASS 166 | -------------------------------------------------------------------------------- 167 | 5 tests did not meet the suggested criteria! 168 | See results above for failed tests and verify setup. 
169 | ``` 170 | -------------------------------------------------------------------------------- /bobber/lib/constants.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | BUILD = 'build' 3 | EXPORT = 'export' 4 | CAST = 'cast' 5 | LOAD = 'load' 6 | PARSE_RESULTS = 'parse-results' 7 | RUN_ALL = 'run-all' 8 | RUN_DALI = 'run-dali' 9 | RUN_NCCL = 'run-nccl' 10 | RUN_STG_BW = 'run-stg-bw' 11 | RUN_STG_IOPS = 'run-stg-iops' 12 | RUN_STG_125K = 'run-stg-125k' 13 | RUN_STG_META = 'run-stg-meta' 14 | 15 | DGX_A100_SINGLE = { 16 | 'gpus': 8, 17 | 'bw_threads': 16, 18 | 'stg_125k_threads': 16, 19 | 'iops_threads': 200, 20 | 'batch_size_sm': 512, 21 | 'batch_size_lg': 256, 22 | 'ssh_iface': 'enp226s0', 23 | 'nccl_ib_hcas': 'mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7', 24 | 'nccl_max': 4 25 | } 26 | 27 | DGX_A100_DUAL = { 28 | 'gpus': 8, 29 | 'bw_threads': 16, 30 | 'stg_125k_threads': 16, 31 | 'iops_threads': 200, 32 | 'batch_size_sm': 512, 33 | 'batch_size_lg': 256, 34 | 'ssh_iface': 'enp226s0', 35 | 'nccl_ib_hcas': 'mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_6,mlx5_7,mlx5_8,mlx5_9', 36 | 'nccl_max': 4 37 | } 38 | 39 | DGX_2 = { 40 | 'gpus': 16, 41 | 'bw-threads': 16, 42 | 'stg_125k_threads': 16, 43 | 'batch-size-sm': 150, 44 | 'batch-size-lg': 75, 45 | 'iops-threads': 80, 46 | 'ssh-iface': 'enp6s0', 47 | 'nccl-ib-hcas': 48 | 'mlx5_13,mlx5_15,mlx5_17,mlx5_19,mlx5_3,mlx5_5,mlx5_7,mlx5_9', 49 | 'nccl-max': 1 50 | } 51 | 52 | SYSTEMS = { 53 | 'dgx-a100-single': DGX_A100_SINGLE, 54 | 'dgx-a100-dual': DGX_A100_DUAL, 55 | 'dgx-2': DGX_2 56 | } 57 | 58 | READ_PATTERNS = { 59 | 'read', 60 | 'randread' 61 | } 62 | 63 | WRITE_PATTERNS = { 64 | 'write', 65 | 'randwrite' 66 | } 67 | 68 | # Baseline Results 69 | # This is considered a minimum value that tests should hit in order to be 70 | # verified the system has been configured properly for HPC and AI workloads. 
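# Layout note: keys under 'systems' are node counts stored as strings, and each
# node count maps the test categories ('bandwidth', 'iops', 'nccl', 'dali') to
# minimum expected values in the units given in the inline comments below. This
# mirrors the structure expected from a custom baseline YAML file
# (see docs/baselines.md).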
71 | SINGLE_DGX_STATION_BASELINE = { 72 | 'systems': { 73 | '1': { 74 | 'bandwidth': { 75 | # FIO BW speed in bytes/second 76 | 'read': 1200000000, 77 | 'write': 1000000000 78 | }, 79 | 'iops': { 80 | # FIO IOPS speed in ops/second 81 | 'read': 100000, 82 | 'write': 100000 83 | }, 84 | 'nccl': { 85 | # NCCL maximum bus bandwidth in GB/s 86 | 'max_bus_bw': 70 87 | }, 88 | 'dali': { 89 | # DALI average speed in images/second 90 | '800x600 standard jpg': 2000, 91 | '3840x2160 standard jpg': 300, 92 | '800x600 tfrecord': 2000, 93 | '3840x2160 tfrecord': 250 94 | } 95 | } 96 | } 97 | } 98 | 99 | DGX_A100_POD_BASELINE = { 100 | 'systems': { 101 | '1': { 102 | 'bandwidth': { 103 | # FIO BW speed in bytes/second 104 | 'read': 2250000000, 105 | 'write': 875000000 106 | }, 107 | 'iops': { 108 | # FIO IOPS speed in ops/second 109 | 'read': 87500, 110 | 'write': 16250 111 | }, 112 | 'nccl': { 113 | # NCCL maximum bus bandwidth in GB/s 114 | 'max_bus_bw': 230 115 | }, 116 | 'dali': { 117 | # DALI average speed in images/second 118 | '800x600 standard jpg': 2000, 119 | '3840x2160 standard jpg': 1000, 120 | '800x600 tfrecord': 4000, 121 | '3840x2160 tfrecord': 1000 122 | } 123 | }, 124 | '2': { 125 | 'bandwidth': { 126 | # FIO BW speed in bytes/second 127 | 'read': 4500000000, 128 | 'write': 1750000000 129 | }, 130 | 'iops': { 131 | # FIO IOPS speed in ops/second 132 | 'read': 175000, 133 | 'write': 32500 134 | }, 135 | 'nccl': { 136 | # NCCL maximum bus bandwidth in GB/s 137 | 'max_bus_bw': 180 138 | }, 139 | 'dali': { 140 | # DALI average speed in images/second 141 | '800x600 standard jpg': 4000, 142 | '3840x2160 standard jpg': 2000, 143 | '800x600 tfrecord': 8000, 144 | '3840x2160 tfrecord': 2000 145 | } 146 | }, 147 | '3': { 148 | 'bandwidth': { 149 | # FIO BW speed in bytes/second 150 | 'read': 6750000000, 151 | 'write': 2625000000 152 | }, 153 | 'iops': { 154 | # FIO IOPS speed in ops/second 155 | 'read': 262500, 156 | 'write': 48750 157 | }, 158 | 'nccl': { 159 | # NCCL maximum bus bandwidth in GB/s 160 | 'max_bus_bw': 180 161 | }, 162 | 'dali': { 163 | # DALI average speed in images/second 164 | '800x600 standard jpg': 6000, 165 | '3840x2160 standard jpg': 3000, 166 | '800x600 tfrecord': 12000, 167 | '3840x2160 tfrecord': 3000 168 | } 169 | }, 170 | '4': { 171 | 'bandwidth': { 172 | # FIO BW speed in bytes/second 173 | 'read': 9000000000, 174 | 'write': 3500000000 175 | }, 176 | 'iops': { 177 | # FIO IOPS speed in ops/second 178 | 'read': 350000, 179 | 'write': 65000 180 | }, 181 | 'nccl': { 182 | # NCCL maximum bus bandwidth in GB/s 183 | 'max_bus_bw': 180 184 | }, 185 | 'dali': { 186 | # DALI average speed in images/second 187 | '800x600 standard jpg': 8000, 188 | '3840x2160 standard jpg': 4000, 189 | '800x600 tfrecord': 16000, 190 | '3840x2160 tfrecord': 4000 191 | } 192 | }, 193 | '5': { 194 | 'bandwidth': { 195 | # FIO BW speed in bytes/second 196 | 'read': 11250000000, 197 | 'write': 4375000000 198 | }, 199 | 'iops': { 200 | # FIO IOPS speed in ops/second 201 | 'read': 437500, 202 | 'write': 81250 203 | }, 204 | 'nccl': { 205 | # NCCL maximum bus bandwidth in GB/s 206 | 'max_bus_bw': 180 207 | }, 208 | 'dali': { 209 | # DALI average speed in images/second 210 | '800x600 standard jpg': 20000, 211 | '3840x2160 standard jpg': 5000, 212 | '800x600 tfrecord': 20000, 213 | '3840x2160 tfrecord': 5000 214 | } 215 | }, 216 | '6': { 217 | 'bandwidth': { 218 | # FIO BW speed in bytes/second 219 | 'read': 13500000000, 220 | 'write': 5250000000 221 | }, 222 | 'iops': { 223 | # FIO IOPS speed in 
ops/second 224 | 'read': 525000, 225 | 'write': 97500 226 | }, 227 | 'nccl': { 228 | # NCCL maximum bus bandwidth in GB/s 229 | 'max_bus_bw': 180 230 | }, 231 | 'dali': { 232 | # DALI average speed in images/second 233 | '800x600 standard jpg': 24000, 234 | '3840x2160 standard jpg': 6000, 235 | '800x600 tfrecord': 24000, 236 | '3840x2160 tfrecord': 6000 237 | } 238 | }, 239 | '7': { 240 | 'bandwidth': { 241 | # FIO BW speed in bytes/second 242 | 'read': 15750000000, 243 | 'write': 6125000000 244 | }, 245 | 'iops': { 246 | # FIO IOPS speed in ops/second 247 | 'read': 612500, 248 | 'write': 113750 249 | }, 250 | 'nccl': { 251 | # NCCL maximum bus bandwidth in GB/s 252 | 'max_bus_bw': 180 253 | }, 254 | 'dali': { 255 | # DALI average speed in images/second 256 | '800x600 standard jpg': 28000, 257 | '3840x2160 standard jpg': 7000, 258 | '800x600 tfrecord': 28000, 259 | '3840x2160 tfrecord': 7000 260 | } 261 | }, 262 | '8': { 263 | 'bandwidth': { 264 | # FIO BW speed in bytes/second 265 | 'read': 18000000000, 266 | 'write': 7000000000 267 | }, 268 | 'iops': { 269 | # FIO IOPS speed in ops/second 270 | 'read': 700000, 271 | 'write': 130000 272 | }, 273 | 'nccl': { 274 | # NCCL maximum bus bandwidth in GB/s 275 | 'max_bus_bw': 180 276 | }, 277 | 'dali': { 278 | # DALI average speed in images/second 279 | '800x600 standard jpg': 32000, 280 | '3840x2160 standard jpg': 8000, 281 | '800x600 tfrecord': 32000, 282 | '3840x2160 tfrecord': 8000 283 | } 284 | } 285 | } 286 | } 287 | 288 | BASELINES = { 289 | 'single-dgx-station-baseline': SINGLE_DGX_STATION_BASELINE, 290 | 'dgx-a100-pod-baseline': DGX_A100_POD_BASELINE 291 | } 292 | -------------------------------------------------------------------------------- /bobber/lib/analysis/fio.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | from bobber.lib.analysis.common import fio_command_details 4 | from typing import Tuple 5 | 6 | 7 | def clean_iops(iops: str) -> float: 8 | """ 9 | Convert the IOPS into an equivalent operations/second result. 10 | 11 | Parse the IOPS value from the input string and convert the value from a 12 | larger unit to an equivalent operations/second, if applicable. 13 | 14 | Parameters 15 | ---------- 16 | iops : str 17 | A ``string`` of the number of operations/second and resulting unit. 18 | 19 | Returns 20 | ------- 21 | float 22 | Returns a ``float`` of the final IOPS value in operations/second. 23 | """ 24 | number = float(re.findall(r'\d+', iops)[0]) 25 | if 'G' in iops: 26 | ops_per_second = number * 1e9 27 | elif 'M' in iops: 28 | ops_per_second = number * 1e6 29 | elif 'k' in iops: 30 | ops_per_second = number * 1e3 31 | else: 32 | ops_per_second = number 33 | return ops_per_second 34 | 35 | 36 | def clean_bw(bandwidth: str) -> float: 37 | """ 38 | Convert the bandwidth into an equivalent bytes/second result. 39 | 40 | Parse the bandwidth value from the input string and convert the value from 41 | a larger unit to an equivalent operations/second, if applicable. 42 | 43 | Parameters 44 | ---------- 45 | bandwidth : str 46 | A ``string`` of the bandwidth and unit from the test. 47 | 48 | Returns 49 | ------- 50 | float 51 | Returns a ``float`` of the final bandwidth in bytes/second. 
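
    Examples
    --------
    A bandwidth token captured from an FIO summary line, for instance
    ``(1.5GB/s)``, converts as follows:

    >>> clean_bw('(1.5GB/s)')
    1500000000.0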
52 | """ 53 | number = float(re.findall(r'(\d+(?:\.\d+)?)', bandwidth)[0]) 54 | if 'GB/s' in bandwidth: 55 | bytes_per_second = number * 1e9 56 | elif 'MB/s' in bandwidth: 57 | bytes_per_second = number * 1e6 58 | elif 'kb/s' in bandwidth.lower(): 59 | bytes_per_second = number * 1e3 60 | else: 61 | bytes_per_second = number 62 | return bytes_per_second 63 | 64 | 65 | def fio_bw_results(log_contents: str, systems: int, string_to_match: str, 66 | log: str) -> list: 67 | """ 68 | Capture the bandwidth results from the log files. 69 | 70 | Search the log for any lines containing a bandwidth value and return a 71 | final list of all of the parsed values. 72 | 73 | Parameters 74 | ---------- 75 | log_contents : str 76 | A ``string`` of the contents from an FIO log file. 77 | systems : int 78 | An ``integer`` of the number of systems used during the current test. 79 | string_to_match : str 80 | A regex ``string`` of the line to pull from the log file to match any 81 | bandwidth lines. 82 | log : str 83 | A ``string`` of the name of the log file being parsed. 84 | 85 | Returns 86 | ------- 87 | list 88 | Returns a ``list`` of ``floats`` representing all of the bandwidth 89 | values parsed from the log. 90 | 91 | Raises 92 | ------ 93 | ValueError 94 | Raises a ``ValueError`` if the bandwidth cannot be parsed from the log 95 | file. 96 | """ 97 | final_bw = [] 98 | 99 | match = re.findall(string_to_match, log_contents) 100 | if len(match) != systems: 101 | print(f'Warning: Invalid number of results found in {log} log file. ' 102 | 'Skipping...') 103 | return [] 104 | for result in match: 105 | bw = re.findall(r'\(\d+[kMG]B/s\)', result) 106 | if len(bw) != 1: 107 | bw = re.findall(r'\(\d+\.\d+[kMG]B/s\)', result) 108 | if len(bw) != 1: 109 | raise ValueError('Bandwidth cannot be parsed from FIO log!') 110 | bw = clean_bw(bw[0]) 111 | final_bw.append(bw) 112 | return final_bw 113 | 114 | 115 | def fio_iops_results(log_contents: str, systems: int, string_to_match: str, 116 | log: str) -> list: 117 | """ 118 | Capture the IOPS results from the log files. 119 | 120 | Search the log for any lines containing IOPS values and return a final list 121 | of all of the parsed values. The FIO IOPS tests print an extra line for 122 | multi-node tests and are subsequently dropped. 123 | 124 | Parameters 125 | ---------- 126 | log_contents : str 127 | A ``string`` of the contents from an FIO log file. 128 | systems : int 129 | An ``integer`` of the number of systems used during the current test. 130 | string_to_match : str 131 | A regex ``string`` of the line to pull from the log file to match any 132 | IOPS lines. 133 | log : str 134 | A ``string`` of the name of the log file being parsed. 135 | 136 | Returns 137 | ------- 138 | list 139 | Returns a ``list`` of ``floats`` representing all of the IOPS values 140 | parsed from the log. 141 | 142 | Raises 143 | ------ 144 | ValueError 145 | Raises a ``ValueError`` if the IOPS cannot be parsed from the log 146 | file. 147 | """ 148 | final_iops = [] 149 | 150 | match = re.findall(string_to_match, log_contents) 151 | if (systems == 1 and len(match) != systems) or \ 152 | (systems != 1 and len(match) != systems + 1): 153 | print(f'Warning: Invalid number of results found in {log} log file. 
' 154 | 'Skipping...') 155 | return [] 156 | for result in match: 157 | iops = re.findall(r'[-+]?\d*\.\d+[kMG]|\d+[kMG]|\d+', result) 158 | if len(iops) not in [5, 6]: 159 | raise ValueError('IOPS cannot be parsed from FIO log!') 160 | iops = clean_iops(iops[0]) 161 | final_iops.append(iops) 162 | # For multi-system benchmarks, an extra IOPS line is included with 163 | # semi-aggregate results, but needs to be dropped from our results for a 164 | # more accurate analysis. 165 | if systems != 1: 166 | final_iops = final_iops[:-1] 167 | return final_iops 168 | 169 | 170 | def parse_fio_bw_file(log_files: list, systems: int, read_system_results: dict, 171 | write_system_results: dict) -> Tuple[dict, dict, dict, 172 | dict]: 173 | """ 174 | Parse the FIO bandwidth results and test parameters. 175 | 176 | Search all log files for read and write parameters used to initiate the 177 | test and the final results and return the resulting objects. 178 | 179 | Parameters 180 | ---------- 181 | log_files : list 182 | A ``list`` of ``strings`` of the filenames of all FIO bandwidth logs in 183 | the results directory. 184 | systems : int 185 | An ``integer`` of the number of systems used during the current test. 186 | read_system_results : dict 187 | A ``dictionary`` of the final read results for N-systems. 188 | write_system_results : dict 189 | A ``dictionary`` of the final write results for N-systems. 190 | 191 | Returns 192 | ------- 193 | tuple 194 | A ``tuple`` of four dictionaries containing the read results, write 195 | results, read parameters, and write parameters, respectively. 196 | """ 197 | read_params, write_params = None, None 198 | 199 | for log in log_files: 200 | with open(log, 'r') as f: 201 | log_contents = f.read() 202 | read_params, write_params = fio_command_details(log_contents, 203 | read_params, 204 | write_params) 205 | write_bw = fio_bw_results(log_contents, systems, 'WRITE: bw=.*', log) 206 | if write_bw == []: 207 | continue 208 | read_bw = fio_bw_results(log_contents, systems, 'READ: bw=.*', log) 209 | write_system_results[systems].append(sum(write_bw)) 210 | read_system_results[systems].append(sum(read_bw)) 211 | return read_system_results, write_system_results, read_params, write_params 212 | 213 | 214 | def parse_fio_iops_file(log_files: list, systems: int, 215 | read_system_results: dict, 216 | write_system_results: dict) -> Tuple[dict, dict, dict, 217 | dict]: 218 | """ 219 | Parse the FIO IOPS results and test parameters. 220 | 221 | Search all log files for read and write parameters used to initiate the 222 | test and the final results and return the resulting objects. 223 | 224 | Parameters 225 | ---------- 226 | log_files : list 227 | A ``list`` of ``strings`` of the filenames of all FIO IOPS logs in the 228 | results directory. 229 | systems : int 230 | An ``integer`` of the number of systems used during the current test. 231 | read_system_results : dict 232 | A ``dictionary`` of the final read results for N-systems. 233 | write_system_results : dict 234 | A ``dictionary`` of the final write results for N-systems. 235 | 236 | Returns 237 | ------- 238 | tuple 239 | A ``tuple`` of four dictionaries containing the read results, write 240 | results, read parameters, and write parameters, respectively. 
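
    Examples
    --------
    A typical call, shown with hypothetical log filenames and result
    dictionaries pre-seeded for a single-node run::

        reads, writes, read_params, write_params = parse_fio_iops_file(
            ['stg_iops_systems_1_run1.log'], 1, {1: []}, {1: []})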
241 | """ 242 | read_params, write_params = None, None 243 | 244 | for log in log_files: 245 | with open(log, 'r') as f: 246 | log_contents = f.read() 247 | read_params, write_params = fio_command_details(log_contents, 248 | read_params, 249 | write_params) 250 | write_iops = fio_iops_results(log_contents, systems, 'write: IOPS=.*', 251 | log) 252 | read_iops = fio_iops_results(log_contents, systems, 'read: IOPS=.*', 253 | log) 254 | write_system_results[systems].append(sum(write_iops)) 255 | read_system_results[systems].append(sum(read_iops)) 256 | return read_system_results, write_system_results, read_params, write_params 257 | -------------------------------------------------------------------------------- /bobber/lib/analysis/common.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | from collections import defaultdict 4 | from typing import Tuple 5 | 6 | 7 | class bcolors: 8 | """ 9 | A helper class to annotate text with colors. 10 | """ 11 | PASS = '\033[92m' # nosec 12 | WARNING = '\033[93m' 13 | FAIL = '\033[91m' 14 | BOLD = '\033[1m' 15 | ENDC = '\033[0m' 16 | 17 | 18 | def num_systems(log: str) -> int: 19 | """ 20 | Returns an ``integer`` of the number of systems that were tested during a 21 | particular run. 22 | 23 | Parameters 24 | ---------- 25 | log : str 26 | A ``string`` of the filename for a single log. 27 | 28 | Returns 29 | ------- 30 | int 31 | Returns an ``int`` of the number of systems that were tested for the 32 | given logfile. Defaults to None if not found. 33 | """ 34 | try: 35 | systems = re.findall(r'systems_\d+_', log) 36 | systems = re.findall(r'\d+', systems[0]) 37 | return int(systems[0]) 38 | except ValueError: 39 | return None 40 | 41 | 42 | def _bobber_version(log: str) -> str: 43 | """ 44 | Returns a ``string`` representation of the Bobber version tested, such as 45 | '6.3.1'. 46 | 47 | Parameters 48 | ---------- 49 | log : str 50 | A ``string`` of the filename for a single log. 51 | 52 | Returns 53 | ------- 54 | str 55 | Returns a ``string`` of the Bobber version tested, such as '6.3.1'. 56 | 57 | Raises 58 | ------ 59 | ValueError 60 | Raises a ``ValueError`` if the version cannot be parsed from the log 61 | file. 62 | """ 63 | version = re.findall(r'version_\d+_\d+_\d+', log) 64 | if len(version) != 1: 65 | raise ValueError(f'Could not parse Bobber version from {log} file!') 66 | version = version[0].replace('version_', '') 67 | return version.replace('_', '.') 68 | 69 | 70 | def check_bobber_version(logs: list, override: bool) -> str: 71 | """ 72 | Ensure the Bobber version matches in all logs being parsed. 73 | 74 | As a safeguard to mixing results from different Bobber versions, the 75 | version needs to be checked for all logs to ensure they are equal. By 76 | comparing each new log version with the previous version captured, if all 77 | are equal in the list of logs, then it is guaranteed that the logs are all 78 | the same. 79 | 80 | Parameters 81 | ---------- 82 | logs : list 83 | A ``list`` of strings of all of the log filenames in the directory that 84 | was passed. 85 | override : bool 86 | A ``boolean`` which evaluates to ``True`` when the version-checking 87 | should be skipped. 88 | 89 | Returns 90 | ------- 91 | str 92 | Returns a ``string`` of the Bobber version being tested. 93 | 94 | Raises 95 | ------ 96 | ValueError 97 | Raises a ``ValueError`` when any log versions don't match. 
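
    Examples
    --------
    With an illustrative (hypothetical) log filename that embeds the version:

    >>> check_bobber_version(['stg_iops_version_6_3_1_systems_1.log'],
    ...                      override=False)
    '6.3.1'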
98 | """ 99 | last_version = None 100 | 101 | for log in logs: 102 | version = _bobber_version(log) 103 | if override: 104 | return version 105 | if last_version and version != last_version: 106 | raise ValueError('Error: Only logs using the same Bobber version ' 107 | 'are allowed in the results directory.') 108 | last_version = version 109 | return version 110 | 111 | 112 | def _convert_to_bytes(value: str) -> float: 113 | """ 114 | Convert a number to bytes. 115 | 116 | Convert a passed number to bytes by parsing the number from the passed 117 | string and multiplying by the appropriate multiplier to convert from a 118 | larger unit to bytes. 119 | 120 | Parameters 121 | ---------- 122 | value : str 123 | A ``string`` of the value to convert to bytes. 124 | 125 | Returns 126 | ------- 127 | float 128 | Returns a ``float`` of the final value in bytes. 129 | """ 130 | number = float(re.sub('[a-zA-Z]*', '', value)) 131 | if 'gib' in value.lower(): 132 | return number * 1024 * 1024 * 1024 133 | elif 'g' in value.lower(): 134 | return number * 1e9 135 | elif 'mib' in value.lower(): 136 | return number * 1024 * 1024 137 | elif 'm' in value.lower(): 138 | return number * 1e6 139 | elif 'kib' in value.lower(): 140 | return number * 1024 141 | elif 'k' in value.lower(): 142 | return number * 1e3 143 | 144 | 145 | def _fio_command_parse(command: str) -> dict: 146 | """ 147 | Parse the command parameters for fio. 148 | 149 | Pull all of the flags and parameters used during a fio run and save them as 150 | a dictionary to make it easier to reference what was used during a test. 151 | 152 | Parameters 153 | ---------- 154 | command : str 155 | A ``string`` of the command used during the fio run. 156 | 157 | Returns 158 | ------- 159 | dict 160 | Returns a ``dictionary`` of the parameters used during the fio run. 161 | """ 162 | parameter_dict = {} 163 | 164 | for parameter in command.split(): 165 | # Skip the following parameters as they don't provide meaningful data. 166 | if parameter == '/usr/bin/fio': 167 | continue 168 | key, value = parameter.split('=') 169 | key = key.replace('--', '') 170 | if key in ['blocksize', 'size']: 171 | value = _convert_to_bytes(value) 172 | else: 173 | # Attempt to convert to a int for numerical values. If it fails, 174 | # keep as a string as that's likely intended type. 175 | try: 176 | value = int(value) 177 | except ValueError: 178 | value = str(value) 179 | parameter_dict[key] = value 180 | return parameter_dict 181 | 182 | 183 | def _compare_dicts(old_results: dict, new_results: dict) -> bool: 184 | """ 185 | Compare testing dictionaries for equality. 186 | 187 | Compare the dictionaries for equality while ignoring the 'directory' and 188 | 'command' keys since these will always differ amongst tests. If all other 189 | parameters are equal, it is assumed the tests used the same parameters. 190 | 191 | Parameters 192 | ---------- 193 | old_results : dict 194 | A ``dictionary`` of the test parameters used during the 195 | previously-parsed test log. 196 | new_results : dict 197 | A ``dictionary`` of the test parameters used during the test log 198 | currently being parsed. 199 | 200 | Returns 201 | ------- 202 | bool 203 | Returns a ``boolean`` which evaluates to `True` when all of the 204 | parameters are equal between the two dictionaries and `False` if at 205 | least on parameter is different. 
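
    Examples
    --------
    Two hypothetical parameter dictionaries that differ only in the ignored
    'directory' key compare as equal:

    >>> _compare_dicts({'blocksize': 1048576.0, 'directory': '/mnt/a'},
    ...                {'blocksize': 1048576.0, 'directory': '/mnt/b'})
    True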
206 |     """
207 |     ignore_keys = ['directory', 'command']
208 | 
209 |     old = dict((k, v) for k, v in old_results.items() if k not in ignore_keys)
210 |     new = dict((k, v) for k, v in new_results.items() if k not in ignore_keys)
211 |     return old == new
212 | 
213 | 
214 | def fio_command_details(log_contents: str, old_reads: dict,
215 |                         old_writes: dict) -> Tuple[dict, dict]:
216 |     """
217 |     Parse the command parameters and compare with the previous log.
218 | 
219 |     Pull the fio parameters used for both the read and write commands during
220 |     the tests and compare them with the previous log file that was parsed to
221 |     ensure all tests being parsed are using the same parameters.
222 | 
223 |     Parameters
224 |     ----------
225 |     log_contents : str
226 |         A ``string`` of all the output inside a log file.
227 |     old_reads : dict
228 |         A ``dictionary`` of the previous read test parameters that were parsed.
229 |     old_writes : dict
230 |         A ``dictionary`` of the previous write test parameters that were
231 |         parsed.
232 | 
233 |     Returns
234 |     -------
235 |     tuple
236 |         Returns a ``tuple`` of (``dict``, ``dict``) where the two dictionaries
237 |         are the parsed read and write parameters, respectively, from the tests.
238 | 
239 |     Raises
240 |     ------
241 |     ValueError
242 |         Raises a ``ValueError`` if no fio command is found, the command type
243 |         is unexpected, or the parameters differ between two or more tests.
244 |     """
245 |     commands = re.findall(r'/usr/bin/fio --rw.*', log_contents)
246 |     if len(commands) < 2:
247 |         raise ValueError('FIO command not found in log file!')
248 | 
249 |     for command in commands:
250 |         if '--rw=read' in command:
251 |             read_params = _fio_command_parse(command)
252 |             read_params['command'] = command
253 |         elif '--rw=write' in command:
254 |             write_params = _fio_command_parse(command)
255 |             write_params['command'] = command
256 |         elif '--rw=randread' in command:
257 |             read_params = _fio_command_parse(command)
258 |             read_params['command'] = command
259 |         elif '--rw=randwrite' in command:
260 |             write_params = _fio_command_parse(command)
261 |             write_params['command'] = command
262 |         else:
263 |             raise ValueError('Unexpected FIO test type. Expected '
264 |                              'read, write, randread, or randwrite.')
265 |     if old_reads and old_writes:
266 |         if not _compare_dicts(old_reads, read_params) or \
267 |                 not _compare_dicts(old_writes, write_params):
268 |             raise ValueError('Parameters differ between tests. Ensure only '
269 |                              'tests with the same parameters are used.')
270 |     return read_params, write_params
271 | 
272 | 
273 | def divide_logs_by_systems(log_files: list, log_to_match: str) -> dict:
274 |     """
275 |     Extract logs on a per-system basis.
276 | 
277 |     Given a list of all logs in a directory and a string to match for the log
278 |     files, extract all of the requested logs and group them together on a
279 |     per-system basis. For example, matching 'stg_iops' will pull all of the
280 |     IOPS test logs and combine all of the one-node IOPS logs in a list, then
281 |     all of the two-node IOPS logs in another list, and so on.
282 | 
283 |     Parameters
284 |     ----------
285 |     log_files : list
286 |         A ``list`` of log filenames from the passed directory to parse.
287 |     log_to_match : str
288 |         A ``string`` of the logs to match in the directory. 'stg_iops' will
289 |         match all logs that begin with 'stg_iops'.
290 | 
291 |     Returns
292 |     -------
293 |     dict
294 |         Returns a ``dictionary`` of all results where the key is the number of
295 |         nodes being tested and the value is a list of all of the logs that
296 |         match the filter for that system count.
297 |     """
298 |     # Divide the results based on the number of systems tested.
299 |     num_systems_dict = defaultdict(list)
300 | 
301 |     for log in log_files:
302 |         if log_to_match not in log:
303 |             continue
304 |         systems = num_systems(log)
305 |         num_systems_dict[systems].append(log)
306 |     return num_systems_dict
307 | 
--------------------------------------------------------------------------------
/bobber/lib/analysis/compare_baseline.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: MIT
2 | import sys
3 | from bobber.lib.constants import BASELINES
4 | from bobber.lib.exit_codes import BASELINE_FAILURE
5 | from bobber.lib.analysis.common import bcolors
6 | from bobber.lib.system.file_handler import read_yaml
7 | from typing import NoReturn, Optional, Tuple
8 | 
9 | 
10 | # Map the dictionary keys in the baseline to human-readable names.
11 | TEST_MAPPING = {
12 |     'bandwidth': 'FIO Bandwidth',
13 |     'iops': 'FIO IOPS',
14 |     'nccl': 'NCCL',
15 |     'dali': 'DALI'
16 | }
17 | 
18 | 
19 | def metric_passes(expected: float, got: float, tolerance: int) -> bool:
20 |     """
21 |     Determine if a test result meets a particular threshold.
22 | 
23 |     Compare the parsed value with the requested baseline for the same test and
24 |     return a boolean indicating whether it is greater than expected. If a
25 |     tolerance of N percent is passed, any value that falls no more than N
26 |     percent below the baseline will still be marked as passing.
27 | 
28 |     Parameters
29 |     ----------
30 |     expected : float
31 |         A ``float`` of the baseline value to compare against.
32 |     got : float
33 |         A ``float`` of the test result that was parsed.
34 |     tolerance : int
35 |         An ``int`` of the percentage below the threshold to still mark as
36 |         passing.
37 | 
38 |     Returns
39 |     -------
40 |     bool
41 |         Returns a ``boolean`` which evaluates to `True` when the parsed value
42 |         is greater than the baseline and `False` otherwise.
43 |     """
44 |     if tolerance > 0:
45 |         # If the user passes a 5% tolerance, scale the expected value down by
46 |         # 5% (multiply by 0.95) to get the adjusted threshold.
47 |         expected = (1 - tolerance / 100) * expected
48 |     if got > expected:
49 |         return True
50 |     else:
51 |         return False
52 | 
53 | 
54 | def result_text(result: bool, failures: int) -> Tuple[str, int]:
55 |     """
56 |     Color-code the result output.
57 | 
58 |     If the result passes the threshold, it will be marked as PASSing in green
59 |     text. Otherwise, it will be marked as FAILing in red text.
60 | 
61 |     Parameters
62 |     ----------
63 |     result : bool
64 |         A ``boolean`` which evaluates to `True` when the value meets the
65 |         threshold and `False` if not.
66 |     failures : int
67 |         An ``integer`` of the number of results that have not met the
68 |         threshold.
69 | 
70 |     Returns
71 |     -------
72 |     tuple
73 |         Returns a ``tuple`` of (``str``, ``int``) representing the color-coded
74 |         text and the number of failures found, respectively.
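
    Examples
    --------
    The returned string carries terminal color codes, so only the failure
    counter is shown here; a failing result increments the counter while a
    passing result leaves it unchanged:

    >>> result_text(False, 0)[1]
    1
    >>> result_text(True, 1)[1]
    1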
75 | """ 76 | if result: 77 | output = f'{bcolors.PASS}PASS{bcolors.ENDC}' 78 | else: 79 | failures += 1 80 | output = f'{bcolors.FAIL}FAIL{bcolors.ENDC}' 81 | return output, failures 82 | 83 | 84 | def evaluate_fio(baselines: dict, results: dict, test_name: str, failures: int, 85 | tolerance: int) -> int: 86 | """ 87 | Evaluate the fio test results against the baseline. 88 | 89 | Determine if the fio test results meet the expected threshold and display 90 | the outcome with appropriate units. 91 | 92 | Parameters 93 | ---------- 94 | baselines : dict 95 | A ``dictionary`` of the baseline to compare results against. 96 | results : dict 97 | A ``dictionary`` of the parsed results. 98 | test_name : str 99 | A ``string`` of the name of the test being parsed. 100 | failures : int 101 | An ``integer`` of the number of results that have not met the 102 | threshold. 103 | tolerance : int 104 | An ``int`` of the percentage below the threshold to still mark as 105 | passing. 106 | 107 | Returns 108 | ------- 109 | int 110 | Returns an ``integer`` of the number of results that have not met the 111 | threshold. 112 | """ 113 | for test, value in baselines.items(): 114 | if test_name not in results.keys(): 115 | continue 116 | if test_name == 'bandwidth': 117 | unit = '(GB/s)' 118 | expected = value / 1000000000 119 | got = round(results[test_name][test] / 1000000000, 3) 120 | elif test_name == 'iops': 121 | unit = '(k IOPS)' 122 | expected = value / 1000 123 | got = round(results[test_name][test] / 1000, 3) 124 | print(f' {TEST_MAPPING[test_name]} {test.title()} {unit}') 125 | text = f' Expected: {expected}, Got: {got}' 126 | result = metric_passes(expected, got, tolerance) 127 | output, failures = result_text(result, failures) 128 | text += f', Result: {output}' 129 | print(text) 130 | return failures 131 | 132 | 133 | def evaluate_nccl(baseline: dict, results: dict, failures: int, 134 | tolerance: int) -> int: 135 | """ 136 | Evaluate the NCCL test results against the baseline. 137 | 138 | Determine if the NCCL test results meet the expected threshold and display 139 | the outcome with appropriate units. 140 | 141 | Parameters 142 | ---------- 143 | baselines : dict 144 | A ``dictionary`` of the baseline to compare results against. 145 | results : dict 146 | A ``dictionary`` of the parsed results. 147 | failures : int 148 | An ``integer`` of the number of results that have not met the 149 | threshold. 150 | tolerance : int 151 | An ``int`` of the percentage below the threshold to still mark as 152 | passing. 153 | 154 | Returns 155 | ------- 156 | int 157 | Returns an ``integer`` of the number of results that have not met the 158 | threshold. 159 | """ 160 | if 'max_bus_bw' not in baseline.keys(): 161 | return failures 162 | print(' NCCL Max Bus Bandwidth (GB/s)') 163 | expected = baseline['max_bus_bw'] 164 | got = results['nccl']['max_bus_bw'] 165 | text = f' Expected: {expected}, Got: {got}' 166 | result = metric_passes(expected, got, tolerance) 167 | output, failures = result_text(result, failures) 168 | text += f', Result: {output}' 169 | print(text) 170 | return failures 171 | 172 | 173 | def evaluate_dali(baselines: dict, results: dict, test_name: str, 174 | failures: int, tolerance: int) -> int: 175 | """ 176 | Evaluate the DALI test results against the baseline. 177 | 178 | Determine if the DALI test results meet the expected threshold and display 179 | the outcome with appropriate units. 
180 | 181 | Parameters 182 | ---------- 183 | baselines : dict 184 | A ``dictionary`` of the baseline to compare results against. 185 | results : dict 186 | A ``dictionary`` of the parsed results. 187 | test_name : str 188 | A ``string`` of the name of the test being parsed. 189 | failures : int 190 | An ``integer`` of the number of results that have not met the 191 | threshold. 192 | tolerance : int 193 | An ``int`` of the percentage below the threshold to still mark as 194 | passing. 195 | 196 | Returns 197 | ------- 198 | int 199 | Returns an ``integer`` of the number of results that have not met the 200 | threshold. 201 | """ 202 | for test, value in baselines.items(): 203 | if test not in results.keys(): 204 | continue 205 | print(f' DALI {test} (images/second)') 206 | expected = value 207 | got = round(results[test]['average images/second'], 3) 208 | text = f' Expected: {expected}, Got: {got}' 209 | result = metric_passes(expected, got, tolerance) 210 | output, failures = result_text(result, failures) 211 | text += f', Result: {output}' 212 | print(text) 213 | return failures 214 | 215 | 216 | def evaluate_test(baseline: dict, results: dict, system_count: int, 217 | tolerance: int, failures: int) -> int: 218 | """ 219 | Evaluate all tests for N-nodes and compare against the baseline. 220 | 221 | The comparison verifies results meet a certain threshold for each system 222 | count in a sweep. For example, in an 8-node sweep, compare the one-node 223 | results to the baseline before comparing the two-node results and so on. 224 | 225 | Parameters 226 | ---------- 227 | baseline : dict 228 | A ``dictionary`` of the baseline to compare results against. 229 | results : dict 230 | A ``dictionary`` of the parsed results. 231 | system_count : int 232 | An ``int`` of the number of systems that were tested for each 233 | comparison level. 234 | tolerance : int 235 | An ``int`` of the percentage below the threshold to still mark as 236 | passing. 237 | failures : int 238 | An ``integer`` of the number of results that have not met the 239 | threshold. 240 | 241 | Returns 242 | ------- 243 | int 244 | Returns an ``integer`` of the number of results that have not met the 245 | threshold. 246 | """ 247 | for test_name, test_values in baseline.items(): 248 | print('-' * 80) 249 | if test_name in ['bandwidth', 'iops']: 250 | failures = evaluate_fio(test_values, results, test_name, failures, 251 | tolerance) 252 | elif test_name == 'nccl': 253 | failures = evaluate_nccl(test_values, results, failures, tolerance) 254 | elif test_name == 'dali': 255 | failures = evaluate_dali(test_values, 256 | results['dali'], 257 | test_name, 258 | failures, 259 | tolerance) 260 | return failures 261 | 262 | 263 | def compare_baseline(results: dict, baseline: str, tolerance: int, 264 | custom: Optional[bool] = False) -> NoReturn: 265 | """ 266 | Compare a baseline against parsed results. 267 | 268 | Pull the requested baseline either from a custom YAML file or one of the 269 | existing baselines included with the application and compare against the 270 | parsed results by checking if the parsed result is greater than the 271 | baseline on a per-system basis. 272 | 273 | Parameters 274 | ---------- 275 | results : dict 276 | A ``dictionary`` of the complete set of results from a parsed 277 | dictionary. 278 | baseline : str 279 | A ``string`` of the baseline to use. This either represents a key from 280 | the included baselines, or a filename to a custom YAML config file to 281 | read. 
282 | tolerance : int 283 | An ``int`` of the tolerance as a percentage below the baseline to allow 284 | results to still be marked as passing. 285 | custom : bool (optional) 286 | An optional ``boolean`` that, when `True`, will read in a baseline 287 | passed from a YAML file. If `False`, it will compare against an 288 | included baseline. 289 | """ 290 | failures = 0 291 | 292 | print('=' * 80) 293 | print('Baseline assessment') 294 | if custom: 295 | print('Comparing against a custom config') 296 | baseline = read_yaml(baseline) 297 | else: 298 | print(f'Comparing against "{baseline}"') 299 | baseline = BASELINES[baseline] 300 | if tolerance > 0: 301 | print(f'Allowing a tolerance of {tolerance}% below expected to PASS') 302 | 303 | for system_count, baseline_results in baseline['systems'].items(): 304 | print('=' * 80) 305 | if str(system_count) not in results['systems'].keys(): 306 | print(f'No results found for {system_count} system(s)') 307 | print('Skipping...') 308 | continue 309 | print(f' {system_count} System(s)') 310 | failures = evaluate_test(baseline_results, 311 | results['systems'][str(system_count)], 312 | system_count, 313 | tolerance, 314 | failures) 315 | 316 | if failures > 0: 317 | print('-' * 80) 318 | print(f'{failures} test(s) did not meet the suggested criteria!') 319 | print('See results above for failed tests and verify setup.') 320 | # Throw a non-zero exit status so any tools that read codes will catch 321 | # that the baseline was not met. 322 | sys.exit(BASELINE_FAILURE) 323 | 324 | print('=' * 80) 325 | -------------------------------------------------------------------------------- /bobber/lib/analysis/dali.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | 4 | 5 | def _clean_sizes(sizes: list) -> list: 6 | """ 7 | Remove all text from sizes. 8 | 9 | The parser to capture sizes of various objects includes 'in bytes: ' in the 10 | string which should be stripped, leaving only numbers. 11 | 12 | Parameters 13 | ---------- 14 | sizes : list 15 | A ``list`` of ``strings`` of sizes of various objects. 16 | 17 | Returns 18 | ------- 19 | list 20 | Returns a ``list`` of ``integers`` of sizes of various objects. 21 | """ 22 | return [int(size.replace('in bytes: ', '')) for size in sizes] 23 | 24 | 25 | def _size_parsing(log_contents: str) -> dict: 26 | """ 27 | Capture the image and directory size for image data. 28 | 29 | Parse the image and directory size for all images generated using 30 | Imageinary. It is assumed that the image and directory size are identical 31 | for both the TFRecord and standard JPEG images of similar sizes. 32 | 33 | Parameters 34 | ---------- 35 | log_contents : str 36 | A ``string`` of the contents from a DALI log file. 37 | 38 | Returns 39 | ------- 40 | dict 41 | Returns a ``dictionary`` of image size information for all image sizes 42 | and formats. 43 | 44 | Raises 45 | ------ 46 | ValueError 47 | Raises a ``ValueError`` if the log file does not contain size 48 | information. 
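    A minimal sketch of the size extraction performed below (the wording of
    the log lines is hypothetical; only the ``in bytes: <number>`` markers
    matter to the regular expressions used here)::

        line_pair = ('First image size from the 800x600 image set\n'
                     'size in bytes: 1440000, directory in bytes: 144000000')
        _clean_sizes(re.findall(r'in bytes: \d+', line_pair))
        # -> [1440000, 144000000]  (image size, then directory size)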
49 | """ 50 | results_sub_dict = { 51 | 'image size': 0, 52 | 'size unit': 'B', 53 | 'directory size': 0, 54 | 'min images/second': 0, 55 | 'average images/second': 0, 56 | 'min bandwidth': 0, 57 | 'average bandwidth': 0, 58 | 'bandwidth unit': 'bytes/second' 59 | } 60 | results = { 61 | '800x600 standard jpg': results_sub_dict.copy(), 62 | '3840x2160 standard jpg': results_sub_dict.copy(), 63 | '800x600 tfrecord': results_sub_dict.copy(), 64 | '3840x2160 tfrecord': results_sub_dict.copy() 65 | } 66 | 67 | image_size = re.findall('First image size from .*\n.*', log_contents) 68 | if len(image_size) != 4: 69 | raise ValueError('Error: Incomplete DALI file. Missing information on' 70 | ' file sizes') 71 | for line in image_size: 72 | sizes = re.findall(r'in bytes: \d+', line) 73 | if len(sizes) != 2: 74 | raise ValueError('Error: Missing data sizes in DALI log file.') 75 | image_size, directory_size = _clean_sizes(sizes) 76 | if '3840x2160' in line: 77 | results['3840x2160 standard jpg']['image size'] = image_size 78 | results['3840x2160 standard jpg']['directory size'] = \ 79 | directory_size 80 | results['3840x2160 tfrecord']['image size'] = image_size 81 | results['3840x2160 tfrecord']['directory size'] = directory_size 82 | elif '800x600' in line: 83 | results['800x600 standard jpg']['image size'] = image_size 84 | results['800x600 standard jpg']['directory size'] = directory_size 85 | results['800x600 tfrecord']['image size'] = image_size 86 | results['800x600 tfrecord']['directory size'] = directory_size 87 | return results 88 | 89 | 90 | def _average(input_list: list) -> float: 91 | """ 92 | Find the average of a list. 93 | 94 | Given a list of numbers, calculate the average of all values in the list. 95 | If the list is empty, default to 0.0. 96 | 97 | Parameters 98 | ---------- 99 | input_list : list 100 | A ``list`` of ``floats`` to find an average of. 101 | 102 | Returns 103 | ------- 104 | float 105 | Returns a ``float`` of the average value of the list. 106 | """ 107 | try: 108 | return float(sum(input_list) / len(input_list)) 109 | except ZeroDivisionError: 110 | return 0.0 111 | 112 | 113 | def _update_results(image_type_match: dict, results: list) -> dict: 114 | """ 115 | Update image dictionary with throughput and bandwidth. 116 | 117 | Find the minimum and average throughput and bandwdith for a particular 118 | image size and type by processing a list of all corresponding results. 119 | 120 | Parameters 121 | ---------- 122 | image_type_match : dict 123 | A ``dictionary`` of the throughput and bandwidth for a particular image 124 | size and type. 125 | results : list 126 | A ``list`` of ``floats`` representing results from the experiment runs. 127 | 128 | Returns 129 | ------- 130 | dict 131 | An updated ``dictionary`` of the throughput and bandwidth for a 132 | particular image size and type. 133 | """ 134 | size = image_type_match['image size'] 135 | image_type_match['min images/second'] = min(results) 136 | image_type_match['average images/second'] = _average(results) 137 | image_type_match['min bandwidth'] = size * min(results) 138 | image_type_match['average bandwidth'] = size * _average(results) 139 | return image_type_match 140 | 141 | 142 | def _result_parsing(log_contents: str, systems: int, image_results: dict, 143 | log_file: str) -> dict: 144 | """ 145 | Parse the throughput results from the log file. 
146 | 
147 |     Given a log file, find all of the results for each of the four test runs
148 |     including both standard JPEG and TFRecord formats for 800x600 and 4K
149 |     images. Each section starts with 'RUN 1/1' and runs for 11 epochs before
150 |     printing 'OK' once complete. The result sections are in a strict order,
151 |     allowing us to deterministically match results with the corresponding
152 |     image size and type:
153 |         0: 800x600 Standard File Read
154 |         1: 3840x2160 Standard File Read
155 |         2: 800x600 TFRecord
156 |         3: 3840x2160 TFRecord
157 | 
158 |     Parameters
159 |     ----------
160 |     log_contents : str
161 |         A ``string`` of the contents from a DALI log file.
162 |     systems : int
163 |         An ``integer`` of the number of systems used during the current test.
164 |     image_results : dict
165 |         A ``dictionary`` of image size information for all image sizes and
166 |         formats.
167 |     log_file : str
168 |         A ``string`` of the name of the log file being parsed.
169 | 
170 |     Returns
171 |     -------
172 |     dict
173 |         Returns an updated ``dictionary`` of image size information for all
174 |         image sizes and formats.
175 |     """
176 |     # The result sections are in a strict order, allowing us to
177 |     # deterministically match results with the corresponding image size and
178 |     # type:
179 |     # 0: 800x600 Standard File Read
180 |     # 1: 3840x2160 Standard File Read
181 |     # 2: 800x600 TFRecord
182 |     # 3: 3840x2160 TFRecord
183 |     image_type_match = [
184 |         image_results['800x600 standard jpg'],
185 |         image_results['3840x2160 standard jpg'],
186 |         image_results['800x600 tfrecord'],
187 |         image_results['3840x2160 tfrecord']
188 |     ]
189 | 
190 |     test_sections = re.findall(r'RUN 1/1.*?OK', log_contents, re.DOTALL)
191 |     if len(test_sections) != 4:
192 |         print(f'Warning: Invalid number of results found in {log_file} log '
193 |               'file. Skipping...')
194 |         return {}
195 | 
196 |     for num, section in enumerate(test_sections):
197 |         result_lines = re.findall('.*img/s', section)
198 |         all_speeds = []
199 | 
200 |         for line in result_lines:
201 |             speed = re.sub('.*speed: ', '', line)
202 |             speed = float(speed.replace(' [img/s', ''))
203 |             all_speeds.append(speed)
204 | 
205 |         # Per standard practices, the first N results for N systems are treated
206 |         # as a warmup and discarded. Occasionally, the timing of results will
207 |         # be off, and one node will showcase the 2nd test pass before all nodes
208 |         # have finished the first. To accommodate this, the lowest N results
209 |         # are assumed to be the first test pass and are dropped.
210 |         all_speeds = sorted(all_speeds)[systems:]
211 |         image_type_match[num] = _update_results(image_type_match[num],
212 |                                                 all_speeds)
213 | 
214 |     # Rebuild the dictionary based on the updated results.
215 |     image_results = {
216 |         '800x600 standard jpg': image_type_match[0],
217 |         '3840x2160 standard jpg': image_type_match[1],
218 |         '800x600 tfrecord': image_type_match[2],
219 |         '3840x2160 tfrecord': image_type_match[3]
220 |     }
221 |     return image_results
222 | 
223 | 
224 | def _combine_results(results: list, systems: int) -> dict:
225 |     """
226 |     Aggregate all results for N-systems.
227 | 
228 |     Find the average throughput, bandwidth, and size for all iterations
229 |     combined and create a single object which can be used to easily reference
230 |     results.
231 | 
232 |     Parameters
233 |     ----------
234 |     results : list
235 |         A ``list`` of ``dicts`` for all results from a particular test.
236 |     systems : int
237 |         An ``integer`` of the number of systems used during the current test.
238 | 239 | Returns 240 | ------- 241 | dict 242 | Returns a ``dictionary`` of the final aggregate results for all 243 | iterations for N-nodes for all image types and sizes. 244 | """ 245 | system_results = {} 246 | 247 | for image_type in ['800x600 standard jpg', 248 | '3840x2160 standard jpg', 249 | '800x600 tfrecord', 250 | '3840x2160 tfrecord']: 251 | avg_min_speed, avg_avg_speed = [], [] 252 | avg_min_bw, avg_avg_bw = [], [] 253 | avg_img_size, avg_dir_size = [], [] 254 | 255 | for result in results: 256 | if image_type not in result: 257 | continue 258 | avg_min_speed.append(result[image_type]['min images/second']) 259 | avg_avg_speed.append(result[image_type]['average images/second']) 260 | avg_min_bw.append(result[image_type]['min bandwidth']) 261 | avg_avg_bw.append(result[image_type]['average bandwidth']) 262 | avg_img_size.append(result[image_type]['image size']) 263 | avg_dir_size.append(result[image_type]['directory size']) 264 | 265 | # Multiply the average in all performance categories by the number of 266 | # systems tested to get an average aggregate throughput result for the 267 | # cluster. 268 | system_results[image_type] = { 269 | 'image size': _average(avg_img_size), 270 | 'size unit': 'B', 271 | 'directory size': _average(avg_dir_size), 272 | 'min images/second': _average(avg_min_speed) * systems, 273 | 'average images/second': _average(avg_avg_speed) * systems, 274 | 'min bandwidth': _average(avg_min_bw) * systems, 275 | 'average bandwidth': _average(avg_avg_bw) * systems, 276 | 'bandwidth unit': 'bytes/second' 277 | } 278 | return system_results 279 | 280 | 281 | def parse_dali_file(log_files: list, systems: int, results_dict: dict) -> dict: 282 | """ 283 | Parse the aggregate DALI results for N-systems. 284 | 285 | Search through each DALI log for N-systems and find the minimum and average 286 | throughput and bandwidth for all four of the DALI tests of various image 287 | sizes and formats. 288 | 289 | Parameters 290 | ---------- 291 | log_files : list 292 | A ``list`` of ``strings`` where each element is a filepath to a log 293 | file. 294 | systems : int 295 | An ``integer`` of the current number of systems to aggregate results 296 | for. 297 | results_dict : dict 298 | A ``dictionary`` of the aggregate test results for all system counts. 299 | 300 | Returns 301 | ------- 302 | dict 303 | An updated ``dictionary`` of the aggregate test results including the 304 | newly-parsed results for N-systems. 
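    A hypothetical invocation for a two-system sweep (the log file name is a
    placeholder following the naming convention used elsewhere in Bobber)::

        results = parse_dali_file(
            ['logs/dali_iteration_1_systems_2_version_5.0.0.log'],
            systems=2,
            results_dict={})
        # results[2]['800x600 standard jpg']['average images/second'], etc.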
305 | """ 306 | results = [] 307 | 308 | for log in log_files: 309 | with open(log, 'r') as f: 310 | log_contents = f.read() 311 | image_results = _size_parsing(log_contents) 312 | results.append(_result_parsing(log_contents, 313 | systems, 314 | image_results, 315 | log)) 316 | results_dict[systems] = _combine_results(results, systems) 317 | return results_dict 318 | -------------------------------------------------------------------------------- /bobber/lib/analysis/table.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import numpy as np 3 | import operator 4 | from bobber.lib.analysis.common import bcolors 5 | from tabulate import tabulate 6 | from typing import NoReturn, Tuple 7 | 8 | 9 | FIO_READ_BW = f'{bcolors.BOLD}FIO Read (GB/s) - 1MB BS{bcolors.ENDC}' 10 | FIO_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 1MB BS{bcolors.ENDC}' 11 | FIO_READ_IOP = f'{bcolors.BOLD}FIO Read (k IOPS) - 4K BS{bcolors.ENDC}' 12 | FIO_WRITE_IOP = f'{bcolors.BOLD}FIO Write (k IOPS) - 4K BS{bcolors.ENDC}' 13 | FIO_125K_READ_BW = f'{bcolors.BOLD}FIO Read (GB/s) - 125K BS{bcolors.ENDC}' 14 | FIO_125K_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 125K BS{bcolors.ENDC}' 15 | NCCL = f'{bcolors.BOLD}NCCL Max BW (GB/s){bcolors.ENDC}' 16 | DALI_IMG_SM = (f'{bcolors.BOLD}DALI Standard 800x600 throughput ' 17 | f'(images/second){bcolors.ENDC}') 18 | DALI_IMG_SM_BW = (f'{bcolors.BOLD}DALI Standard 800x600 bandwidth ' 19 | f'(GB/s){bcolors.ENDC}') 20 | DALI_IMG_LG = (f'{bcolors.BOLD}DALI Standard 3840x2160 throughput ' 21 | f'(images/second){bcolors.ENDC}') 22 | DALI_IMG_LG_BW = (f'{bcolors.BOLD}DALI Standard 3840x2160 bandwidth ' 23 | f'(GB/s){bcolors.ENDC}') 24 | DALI_TF_SM = (f'{bcolors.BOLD}DALI TFRecord 800x600 throughput ' 25 | f'(images/second){bcolors.ENDC}') 26 | DALI_TF_SM_BW = (f'{bcolors.BOLD}DALI TFRecord 800x600 bandwidth ' 27 | f'(GB/s){bcolors.ENDC}') 28 | DALI_TF_LG = (f'{bcolors.BOLD}DALI TFRecord 3840x2160 throughput ' 29 | f'(images/second){bcolors.ENDC}') 30 | DALI_TF_LG_BW = (f'{bcolors.BOLD}DALI TFRecord 3840x2160 bandwidth ' 31 | f'(GB/s){bcolors.ENDC}') 32 | 33 | 34 | def bytes_to_gb(number: float) -> float: 35 | """ 36 | Convert bytes to gigabytes. 37 | 38 | Parameters 39 | ---------- 40 | number : float 41 | A ``float`` in bytes. 42 | 43 | Returns 44 | ------- 45 | float 46 | Returns a ``float`` of the number in gigabytes. 47 | """ 48 | return round(number * 1e-9, 3) 49 | 50 | 51 | def iops_to_kiops(number: float) -> float: 52 | """ 53 | Convert iops to k-iops. 54 | 55 | Parameters 56 | ---------- 57 | number : float 58 | A ``float`` in iops. 59 | 60 | Returns 61 | ------- 62 | float 63 | Returns a ``float`` of the number in k-iops. 64 | """ 65 | return round(number * 1e-3, 3) 66 | 67 | 68 | def scale(values: list) -> float: 69 | """ 70 | Calculate the scaling factor of results. 71 | 72 | Calculate the scale by determining the slope of the line of best fit and 73 | dividing by the first value in the results, plus 1. 74 | 75 | Parameters 76 | ---------- 77 | values : list 78 | A ``list`` of ``floats`` to calculate the scale factor for. 79 | 80 | Returns 81 | ------- 82 | float 83 | Returns a ``float`` of the scaling factor. 84 | """ 85 | x = np.array(range(1, len(values) + 1)) 86 | y = np.array(values) 87 | slope, _ = np.polyfit(x, y, 1) 88 | return slope / values[0] + 1.0 89 | 90 | 91 | def fio_bw(results: list) -> Tuple[list, list]: 92 | """ 93 | Save the FIO bandwidth read and write results. 
94 | 95 | Save the read and write results from the FIO bandwidth tests on an 96 | increasing per-system basis with the first element in the list being the 97 | column header. 98 | 99 | Parameters 100 | ---------- 101 | results : list 102 | A ``list`` of ``dictionaries`` containing all results from the tests. 103 | 104 | Returns 105 | ------- 106 | tuple 107 | Returns a ``tuple`` of (``list``, ``list``) containing the read and 108 | write bandwidth results, respectively. 109 | """ 110 | try: 111 | read = [FIO_READ_BW] + [bytes_to_gb(result[1]['bandwidth']['read']) 112 | for result in results] 113 | write = [FIO_WRITE_BW] + [bytes_to_gb(result[1]['bandwidth']['write']) 114 | for result in results] 115 | except KeyError: 116 | return [] 117 | else: 118 | return [read, write] 119 | 120 | 121 | def fio_iops(results: list) -> Tuple[list, list]: 122 | """ 123 | Save the FIO IOPS read and write results. 124 | 125 | Save the read and write results from the FIO IOPS tests on an increasing 126 | per-system basis with the first element in the list being the column 127 | header. 128 | 129 | Parameters 130 | ---------- 131 | results : list 132 | A ``list`` of ``dictionaries`` containing all results from the tests. 133 | 134 | Returns 135 | ------- 136 | tuple 137 | Returns a ``tuple`` of (``list``, ``list``) containing the read and 138 | write IOPS results, respectively. 139 | """ 140 | try: 141 | read = [FIO_READ_IOP] + [iops_to_kiops(result[1]['iops']['read']) 142 | for result in results] 143 | write = [FIO_WRITE_IOP] + [iops_to_kiops(result[1]['iops']['write']) 144 | for result in results] 145 | except KeyError: 146 | return [] 147 | else: 148 | return [read, write] 149 | 150 | 151 | def fio_125k_bw(results: list) -> Tuple[list, list]: 152 | """ 153 | Save the FIO 125k bandwidth read and write results. 154 | 155 | Save the read and write results from the FIO 125k bandwidth tests on an 156 | increasing per-system basis with the first element in the list being the 157 | column header. 158 | 159 | Parameters 160 | ---------- 161 | results : list 162 | A ``list`` of ``dictionaries`` containing all results from the tests. 163 | 164 | Returns 165 | ------- 166 | tuple 167 | Returns a ``tuple`` of (``list``, ``list``) containing the read and 168 | write 125k bandwidth results, respectively. 169 | """ 170 | try: 171 | read = [FIO_125K_READ_BW] + [bytes_to_gb(result[1]['125k_bandwidth'] 172 | ['read']) 173 | for result in results] 174 | write = [FIO_125K_WRITE_BW] + [bytes_to_gb(result[1]['125k_bandwidth'] 175 | ['write']) 176 | for result in results] 177 | except KeyError: 178 | return [] 179 | else: 180 | return [read, write] 181 | 182 | 183 | def nccl(results: list) -> list: 184 | """ 185 | Save the NCCL results. 186 | 187 | Save the maximum bus bandwidth results from the NCCL tests on an increasing 188 | per-system basis with the first element in the list being the column 189 | header. 190 | 191 | Parameters 192 | ---------- 193 | results : list 194 | A ``list`` of dictionaries containing all results from the tests. 195 | 196 | Returns 197 | ------- 198 | list 199 | Returns a ``list`` of the NCCL max bus bandwidth results. 200 | """ 201 | try: 202 | nccl = [NCCL] + [round(result[1]['nccl']['max_bus_bw'], 3) 203 | for result in results] 204 | except KeyError: 205 | return [] 206 | else: 207 | return [nccl] 208 | 209 | 210 | def dali(results: list) -> Tuple[list, list, list, list, list, list, list, 211 | list]: 212 | """ 213 | Save the DALI results. 
214 | 215 | Save the throughput and bandwidth results from the DALI tests on an 216 | increasing per-system basis with the first element in the list being the 217 | column header. 218 | 219 | Parameters 220 | ---------- 221 | results : list 222 | A ``list`` of dictionaries containing all results from the tests. 223 | 224 | Returns 225 | ------- 226 | tuple 227 | Returns a ``tuple`` of eight ``lists`` containing the throughput 228 | followed by bandwidth for small and large standard images, then small 229 | and large TFRecords. 230 | """ 231 | try: 232 | img_sm = [DALI_IMG_SM] + [result[1]['dali']['800x600 standard jpg'] 233 | ['average images/second'] 234 | for result in results] 235 | img_sm_bw = [DALI_IMG_SM_BW] + [bytes_to_gb(result[1]['dali'] 236 | ['800x600 standard jpg'] 237 | ['average bandwidth']) 238 | for result in results] 239 | img_lg = [DALI_IMG_LG] + [result[1]['dali']['3840x2160 standard jpg'] 240 | ['average images/second'] 241 | for result in results] 242 | img_lg_bw = [DALI_IMG_LG_BW] + [bytes_to_gb(result[1]['dali'] 243 | ['3840x2160 standard jpg'] 244 | ['average bandwidth']) 245 | for result in results] 246 | tf_sm = [DALI_TF_SM] + [result[1]['dali']['800x600 tfrecord'] 247 | ['average images/second'] 248 | for result in results] 249 | tf_sm_bw = [DALI_TF_SM_BW] + [bytes_to_gb(result[1]['dali'] 250 | ['800x600 tfrecord'] 251 | ['average bandwidth']) 252 | for result in results] 253 | tf_lg = [DALI_TF_LG] + [result[1]['dali']['3840x2160 tfrecord'] 254 | ['average images/second'] 255 | for result in results] 256 | tf_lg_bw = [DALI_TF_LG_BW] + [bytes_to_gb(result[1]['dali'][ 257 | '3840x2160 tfrecord'] 258 | ['average bandwidth']) 259 | for result in results] 260 | except KeyError: 261 | return [] 262 | else: 263 | return [img_sm, img_sm_bw, img_lg, img_lg_bw, tf_sm, tf_sm_bw, tf_lg, 264 | tf_lg_bw] 265 | 266 | 267 | def add_scale(data: list) -> NoReturn: 268 | """ 269 | Add the scaling factor to results. 270 | 271 | Iterate through all results and append the scaling factor to each of the 272 | categories, if applicable. Results that have a scaling factor greater than 273 | 1.9x are marked GREEN, results greater than 1.5 are marked YELLOW, and all 274 | other results are RED. 275 | 276 | Parameters 277 | ---------- 278 | data : list 279 | A ``list`` of ``lists`` of all categories of results. 280 | """ 281 | for subset in data: 282 | # No results in the data - just the test category name 283 | if len(subset) < 2: 284 | continue 285 | # Scaling can't be calculated for NCCL as it has a different behavior 286 | # from other tests. For single-node only tests, there is nothing to 287 | # measure for scaling. Both scenarios should be ignored for calculating 288 | # scale factor. 289 | if 'nccl' in subset[0].lower() or len(subset) == 2: 290 | subset += ['N/A'] 291 | continue 292 | values = subset[1:] 293 | scale_val = round(scale(values), 2) 294 | if scale_val > 1.9: 295 | scale_text = f'{bcolors.PASS}{scale_val}X{bcolors.ENDC}' 296 | elif scale_val > 1.5: 297 | scale_text = f'{bcolors.WARNING}{scale_val}X{bcolors.ENDC}' 298 | else: 299 | scale_text = f'{bcolors.FAIL}{scale_val}X{bcolors.ENDC}' 300 | subset += [scale_text] 301 | 302 | 303 | def display_table(json_results: dict) -> NoReturn: 304 | """ 305 | Display results in tabular format. 306 | 307 | Find the results on a per-system basis for all categories and display the 308 | resulting scaling factor. 
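    The scale column comes from ``scale`` above, which fits a line to the
    per-node values and normalises the slope by the single-node result; for
    instance (made-up numbers), perfectly linear results double with every
    added node and report a 2.0X factor::

        round(scale([10.0, 20.0, 30.0]), 2)   # -> 2.0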
309 | 
310 |     Parameters
311 |     ----------
312 |     json_results : dict
313 |         A ``dictionary`` of the final results that have been parsed from the
314 |         results directory.
315 |     """
316 |     data = []
317 |     headers = [f'{bcolors.BOLD}Test{bcolors.ENDC}'] + \
318 |               [f'{bcolors.BOLD}{num} Node(s){bcolors.ENDC}'
319 |                for num in sorted(json_results['systems'])] + \
320 |               [f'{bcolors.BOLD}Scale{bcolors.ENDC}']
321 |     results = sorted(json_results['systems'].items())
322 | 
323 |     data += fio_bw(results)
324 |     data += fio_iops(results)
325 |     data += fio_125k_bw(results)
326 |     data += nccl(results)
327 |     data += dali(results)
328 | 
329 |     add_scale(data)
330 | 
331 |     print(tabulate(data, headers=headers, tablefmt='grid', numalign='right'))
332 |     print()
333 | 
--------------------------------------------------------------------------------
/bobber/lib/docker/management.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: MIT
2 | import docker
3 | import os
4 | import sys
5 | from bobber.__version__ import __version__ as version
6 | from bobber.lib.exit_codes import (CONTAINER_NOT_RUNNING,
7 |                                    CONTAINER_VERSION_MISMATCH,
8 |                                    DOCKER_BUILD_FAILURE,
9 |                                    DOCKER_COMMUNICATION_ERROR,
10 |                                    NVIDIA_RUNTIME_ERROR)
11 | from bobber.lib.system.file_handler import update_log
12 | from docker.models.containers import Container
13 | from typing import NoReturn, Optional
14 | 
15 | 
16 | class DockerManager:
17 |     """
18 |     Build, launch, and execute commands for Docker containers.
19 | 
20 |     The DockerManager provides a single interface accessible from the entire
21 |     Bobber package through which to communicate with Docker containers. The
22 |     class provides the ability to build new images from the provided
23 |     Dockerfile, launch the container with necessary settings for tests, and
24 |     execute commands inside the launched container to run tests. An instance
25 |     of this class is created in the bobber.lib.docker.__init__.py module and
26 |     can be accessed from other modules without re-instantiating the class.
27 |     """
28 |     def __init__(self) -> NoReturn:
29 |         try:
30 |             self.client = docker.from_env()
31 |             self.cli = docker.APIClient(timeout=600)
32 |         except docker.errors.DockerException as e:
33 |             if 'error while fetching server api version' in str(e).lower():
34 |                 print('Error: Could not communicate with the Docker daemon.')
35 |                 print('Ensure Docker is running with "systemctl start docker"')
36 |                 sys.exit(DOCKER_COMMUNICATION_ERROR)
37 | 
38 |     def _build_if_not_built(self, tag: str, bobber_version: str) -> NoReturn:
39 |         """
40 |         Build the image if not built already.
41 | 
42 |         Check if an image exists for the local version of Bobber. If not, build
43 |         the image immediately.
44 | 
45 |         Parameters
46 |         ----------
47 |         tag : string
48 |             A ``string`` of the Bobber image name, such as
49 |             'nvidia/bobber:5.0.0'.
50 |         bobber_version : string
51 |             A ``string`` of the local version of Bobber, such as '5.0.0'.
52 |         """
53 |         try:
54 |             self.client.images.get(tag)
55 |         except docker.errors.ImageNotFound:
56 |             print(f'Image {tag} not built, building now...')
57 |             self.build(bobber_version)
58 | 
59 |     def get_tag(self, bobber_version: str) -> str:
60 |         """
61 |         Create the image name.
62 | 
63 |         Build the full image name including the tag, such as
64 |         'nvidia/bobber:5.0.0'.
65 | 
66 |         Parameters
67 |         ----------
68 |         bobber_version : string
69 |             A ``string`` of the local version of Bobber, such as '5.0.0'.
70 | 71 | Returns 72 | ------- 73 | str 74 | Returns a ``string`` of the full image name plus tag, such as 75 | 'nvidia/bobber:5.0.0'. 76 | """ 77 | return f'nvidia/bobber:{bobber_version}' 78 | 79 | def cast(self, storage_path: str, ignore_gpu: bool, 80 | bobber_version: str) -> NoReturn: 81 | """ 82 | Launch a container with necessary settings. 83 | 84 | Launch a Bobber image with various settings required to initiate the 85 | testing framework, including attaching GPUs, starting an SSH daemon, 86 | setting the container to privileged mode, and attaching a filesystem 87 | to be accessible inside the container. 88 | 89 | The launched container will be based off of the Bobber image for the 90 | current version of the application. If the image does not yet exist, 91 | it will be built automatically. The launched container is named 92 | 'bobber'. 93 | 94 | Parameters 95 | ---------- 96 | storage_path : string 97 | A ``string`` of the absolute path to the storage location to test 98 | against, such as `/mnt/storage`. 99 | ignore_gpu : boolean 100 | When `True`, launches the container without GPU resources. Defaults 101 | to `False`. 102 | bobber_version : string 103 | A ``string`` of the local version of Bobber, such as '5.0.0'. 104 | """ 105 | tag = self.get_tag(bobber_version) 106 | self._build_if_not_built(tag, bobber_version) 107 | runtime = None 108 | if not ignore_gpu: 109 | runtime = 'nvidia' 110 | try: 111 | self.client.containers.run( 112 | tag, 113 | 'bash -c "/usr/sbin/sshd; sleep infinity"', 114 | detach=True, 115 | auto_remove=True, 116 | ipc_mode='host', 117 | name='bobber', 118 | network_mode='host', 119 | privileged=True, 120 | shm_size='1G', 121 | runtime=runtime, 122 | ulimits=[ 123 | docker.types.Ulimit(name='memlock', 124 | soft=-1, 125 | hard=-1), 126 | docker.types.Ulimit(name='stack', 127 | soft=67108864, 128 | hard=67108864) 129 | ], 130 | volumes={ 131 | f'{storage_path}': { 132 | 'bind': '/mnt/fs_under_test', 133 | 'mode': 'rw' 134 | } 135 | } 136 | ) 137 | except docker.errors.APIError as e: 138 | if 'Unknown runtime specified nvidia' in str(e): 139 | print('NVIDIA container runtime not found. Ensure the latest ' 140 | 'nvidia-docker libraries and NVIDIA drivers are ' 141 | 'installed.') 142 | sys.exit(NVIDIA_RUNTIME_ERROR) 143 | 144 | def export(self, bobber_version: str) -> NoReturn: 145 | """ 146 | Save an image as a tarball. 147 | 148 | To make it easy to transfer an image to multiple machines, the image 149 | can be saved as a tarball which can be copied directly to a remote 150 | device. On the other device, run the "load" command to load the copied 151 | tarball. 152 | 153 | Parameters 154 | ---------- 155 | bobber_version : string 156 | A ``string`` of the local version of Bobber, such as '5.0.0'. 157 | """ 158 | tag = self.get_tag(bobber_version) 159 | self._build_if_not_built(tag, bobber_version) 160 | filename = tag.replace('/', '_').replace(':', '_') 161 | print(f'Exporting {tag} to "{filename}.tar". This may take a while...') 162 | image = self.cli.get_image(tag) 163 | with open(f'{filename}.tar', 'wb') as image_file: 164 | for chunk in image: 165 | image_file.write(chunk) 166 | print(f'{tag} saved to {filename}.tar') 167 | 168 | def build(self, bobber_version: str) -> NoReturn: 169 | """ 170 | Build the image on the Dockerfile. 171 | 172 | Build a new image based on the Dockerfile named 173 | 'nvidia/bobber:{version}'. 
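    A hypothetical call through the shared manager instance (the version
    string is only an example)::

        from bobber.lib.docker import manager
        manager.build('5.0.0')   # builds and tags nvidia/bobber:5.0.0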
174 | 175 | Parameters 176 | ---------- 177 | bobber_version : string 178 | A ``string`` of the local version of Bobber, such as '5.0.0'. 179 | """ 180 | tag = self.get_tag(bobber_version) 181 | print('Building a new image. This may take a while...') 182 | # Set the path to the repository's parent directory. 183 | path = os.path.dirname(os.path.abspath(__file__)) 184 | path = '/'.join(path.split('/')[:-2]) 185 | output = self.cli.build(path=path, 186 | dockerfile='lib/docker/Dockerfile', 187 | tag=tag, 188 | decode=True) 189 | for line in output: 190 | if 'error' in line.keys(): 191 | print(line['error'].rstrip()) 192 | print(f'{tag} build failed. See error above.') 193 | sys.exit(DOCKER_BUILD_FAILURE) 194 | if 'stream' in line.keys() and line['stream'].strip() != '': 195 | print(line['stream'].rstrip()) 196 | print(f'{tag} successfully built') 197 | 198 | def load(self, filename: str) -> NoReturn: 199 | """ 200 | Load a Docker image from a tarball. 201 | 202 | If a Bobber image was saved as a tarball using the "export" command, it 203 | can be loaded on the system using the "load" command. 204 | 205 | Parameters 206 | ---------- 207 | filename : string 208 | A ``string`` of the filename for the local tarball to load, such as 209 | './nvidia_bobber_5.0.0.tar'. 210 | """ 211 | print(f'Importing {filename}. This may take a while...') 212 | with open(filename, 'rb') as image_file: 213 | self.client.images.load(image_file) 214 | 215 | def execute(self, command: str, environment: Optional[dict] = None, 216 | log_file: Optional[str] = None) -> NoReturn: 217 | """ 218 | Execute a command against the running container. 219 | 220 | Assuming the Bobber container is already launched from the "cast" 221 | command, execute a specific command and stream the output to the 222 | terminal. Optionally specify a dictionary with any necessary 223 | environment variables and a log file to save the output to. 224 | 225 | Parameters 226 | ---------- 227 | command : string 228 | A ``string`` of the command to run inside the container. 229 | environment : dict (Optional) 230 | A ``dictionary`` of environment variables to use where the keys are 231 | the name of the variable and the values are the corresponding value 232 | to set. 233 | log_file : string (Optional) 234 | A ``string`` of the path and filename to optionally save output to. 235 | """ 236 | if not self.running: 237 | print('Bobber container not running. 
Launch a container with ' 238 | '"bobber cast" prior to running any tests.') 239 | sys.exit(CONTAINER_NOT_RUNNING) 240 | bobber = self.client.containers.get('bobber') 241 | if not self.version_match(bobber): 242 | print('Bobber container version mismatch.') 243 | print('Kill the running Bobber container with "docker kill bobber"' 244 | ' and re-cast a new container with "bobber cast" prior to ' 245 | 'running any tests.') 246 | sys.exit(CONTAINER_VERSION_MISMATCH) 247 | result = bobber.exec_run( 248 | command, 249 | demux=False, 250 | environment=environment, 251 | stream=True 252 | ) 253 | # Continually print STDOUT and STDERR until there is nothing left 254 | while True: 255 | try: 256 | output = next(result.output).decode('ascii') 257 | print(output.rstrip()) 258 | if log_file: 259 | update_log(log_file, output) 260 | # Usually only happens for terminating characters at the end of 261 | # streams 262 | except UnicodeDecodeError: 263 | print(result.output) 264 | except StopIteration: 265 | break 266 | 267 | def version_match(self, container: Container) -> bool: 268 | """ 269 | Determine if the Bobber container version matches the application. 270 | 271 | The running Bobber container version must match the local Bobber 272 | application version to ensure all tests will function properly. 273 | 274 | Parameters 275 | ---------- 276 | container : Container 277 | A ``Container`` object representing the running Bobber image. 278 | 279 | Returns 280 | ------- 281 | bool 282 | Returns `True` when the versions match and `False` when not. 283 | """ 284 | if f'nvidia/bobber:{version}' not in container.image.tags: 285 | return False 286 | return True 287 | 288 | @property 289 | def running(self) -> bool: 290 | """ 291 | Determine if the Bobber container is running on the system. 292 | 293 | Check to see if the current version of the Bobber container is running 294 | on the local machine and return the status. This method can be used to 295 | determine whether or not to run a command that depends on the container 296 | being launched. 297 | 298 | Returns 299 | ------- 300 | bool 301 | Returns `True` when the container is running and `False` when not. 302 | """ 303 | try: 304 | bobber = self.client.containers.get('bobber') 305 | except docker.errors.NotFound: 306 | return False 307 | else: 308 | return True 309 | -------------------------------------------------------------------------------- /bobber/lib/tests/run_tests.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import os 3 | from argparse import Namespace 4 | from bobber.lib.constants import ( 5 | RUN_ALL, 6 | RUN_DALI, 7 | RUN_NCCL, 8 | RUN_STG_BW, 9 | RUN_STG_IOPS, 10 | RUN_STG_125K, 11 | RUN_STG_META 12 | ) 13 | from bobber.lib.docker import manager 14 | from time import sleep 15 | from typing import NoReturn 16 | 17 | 18 | def run_dali(args: Namespace, bobber_version: str, iteration: int, 19 | hosts: str) -> NoReturn: 20 | """ 21 | Run single or multi-node DALI tests. 22 | 23 | Run a single or multi-node DALI test which reads random image data in from 24 | designated storage and loads it onto local resources after preprocessing 25 | that is typically done for ResNet50 pipelines. 26 | 27 | Parameters 28 | ---------- 29 | args : Namespace 30 | A ``Namespace`` of all settings specified by the user for the test. 31 | bobber_version : string 32 | A ``string`` of the local version of Bobber, such as '5.0.0'. 
33 | iteration : int 34 | An ``int`` of the local test number, starting at 1. 35 | hosts : string 36 | A comma-separated list of hostnames to test against, such as 37 | 'host1,host2,host3,host4'. 38 | """ 39 | dali_log = os.path.join(args.log_path, 40 | f'dali_iteration_{iteration}_' 41 | f'gpus_{args.gpus}_' 42 | f'batch_size_lg_{args.batch_size_lg}_' 43 | f'batch_size_sm_{args.batch_size_sm}_' 44 | f'systems_{len(hosts.split(","))}_' 45 | f'version_{bobber_version}.log') 46 | environment = { 47 | 'BATCH_SIZE_LG': args.batch_size_lg, 48 | 'BATCH_SIZE_SM': args.batch_size_sm, 49 | 'GPUS': args.gpus, 50 | 'HOSTS': hosts, 51 | 'SSH_IFACE': args.ssh_iface 52 | } 53 | manager.execute('tests/dali_multi.sh', 54 | environment=environment, 55 | log_file=dali_log) 56 | 57 | if args.pause > 0: 58 | sleep(args.pause) 59 | 60 | 61 | def run_stg_bw(args: Namespace, bobber_version: str, iteration: int, 62 | hosts: str) -> NoReturn: 63 | """ 64 | Run single or multi-node storage bandwidth tests with FIO. 65 | 66 | Run a single or multi-node storage bandwidth test with FIO which first 67 | writes data to the filesystem with 1MB block size and 4GB file size, 68 | followed by reading the data back. 69 | 70 | Parameters 71 | ---------- 72 | args : Namespace 73 | A ``Namespace`` of all settings specified by the user for the test. 74 | bobber_version : string 75 | A ``string`` of the local version of Bobber, such as '5.0.0'. 76 | iteration : int 77 | An ``int`` of the local test number, starting at 1. 78 | hosts : string 79 | A comma-separated list of hostnames to test against, such as 80 | 'host1,host2,host3,host4'. 81 | """ 82 | stg_bw_log = os.path.join(args.log_path, 83 | f'stg_bw_iteration_{iteration}_' 84 | f'threads_{args.bw_threads}_' 85 | f'direct_{args.direct}_' 86 | f'depth_{args.io_depth}_' 87 | f'read_pattern_{args.read_pattern}_' 88 | f'write_pattern_{args.write_pattern}_' 89 | f'systems_{len(hosts.split(","))}_' 90 | f'version_{bobber_version}.log') 91 | environment = { 92 | 'EXTRA_FLAGS': args.stg_extra_flags, 93 | 'IO_DEPTH': args.io_depth, 94 | 'DIRECTIO': args.direct, 95 | 'THREADS': args.bw_threads, 96 | 'READ_PATTERN': args.read_pattern, 97 | 'WRITE_PATTERN': args.write_pattern, 98 | 'HOSTS': hosts 99 | } 100 | manager.execute('tests/fio_multi.sh', 101 | environment=environment, 102 | log_file=stg_bw_log) 103 | 104 | if args.pause > 0: 105 | sleep(args.pause) 106 | 107 | 108 | def run_stg_125k(args: Namespace, bobber_version: str, iteration: int, 109 | hosts: str) -> NoReturn: 110 | """ 111 | Run single or multi-node storage 125KB IO size tests with FIO. 112 | 113 | Run a single or multi-node storage bandwidth test with FIO which first 114 | writes data to the filesystem with 125KB block size and 4GB file size, 115 | followed by reading the data back. 116 | 117 | Parameters 118 | ---------- 119 | args : Namespace 120 | A ``Namespace`` of all settings specified by the user for the test. 121 | bobber_version : string 122 | A ``string`` of the local version of Bobber, such as '5.0.0'. 123 | iteration : int 124 | An ``int`` of the local test number, starting at 1. 125 | hosts : string 126 | A comma-separated list of hostnames to test against, such as 127 | 'host1,host2,host3,host4'. 
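    The resulting log file name encodes the key parameters, for example
    (hypothetical values)::

        stg_125k_iteration_1_threads_16_direct_1_depth_16_systems_2_version_5.0.0.log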
128 |     """
129 |     stg_125k_log = os.path.join(args.log_path,
130 |                                 f'stg_125k_iteration_{iteration}_'
131 |                                 f'threads_{args.stg_125k_threads}_'
132 |                                 f'direct_{args.direct}_'
133 |                                 f'depth_{args.io_depth}_'
134 |                                 f'systems_{len(hosts.split(","))}_'
135 |                                 f'version_{bobber_version}.log')
136 |     environment = {
137 |         'EXTRA_FLAGS': args.stg_extra_flags,
138 |         'IO_DEPTH': args.io_depth,
139 |         'IOSIZE': 125,
140 |         'DIRECTIO': args.direct,
141 |         'THREADS': args.stg_125k_threads,
142 |         'READ_PATTERN': args.read_pattern,
143 |         'WRITE_PATTERN': args.write_pattern,
144 |         'HOSTS': hosts
145 |     }
146 |     manager.execute('tests/fio_multi.sh',
147 |                     environment=environment,
148 |                     log_file=stg_125k_log)
149 | 
150 |     if args.pause > 0:
151 |         sleep(args.pause)
152 | 
153 | 
154 | def run_stg_iops(args: Namespace, bobber_version: str, iteration: int,
155 |                  hosts: str) -> NoReturn:
156 |     """
157 |     Run single or multi-node storage IOPS tests with FIO.
158 | 
159 |     Run a single or multi-node storage IOPS test with FIO which first writes
160 |     data to the filesystem with 4KB block size and 4GB file size, followed by
161 |     reading the data back.
162 | 
163 |     Parameters
164 |     ----------
165 |     args : Namespace
166 |         A ``Namespace`` of all settings specified by the user for the test.
167 |     bobber_version : string
168 |         A ``string`` of the local version of Bobber, such as '5.0.0'.
169 |     iteration : int
170 |         An ``int`` of the local test number, starting at 1.
171 |     hosts : string
172 |         A comma-separated list of hostnames to test against, such as
173 |         'host1,host2,host3,host4'.
174 |     """
175 |     stg_iops_log = os.path.join(args.log_path,
176 |                                 f'stg_iops_iteration_{iteration}_'
177 |                                 f'threads_{args.iops_threads}_'
178 |                                 f'direct_{args.direct}_'
179 |                                 f'depth_{args.io_depth}_'
180 |                                 f'read_pattern_{args.read_pattern}_'
181 |                                 f'write_pattern_{args.write_pattern}_'
182 |                                 f'systems_{len(hosts.split(","))}_'
183 |                                 f'version_{bobber_version}.log')
184 |     environment = {
185 |         'EXTRA_FLAGS': args.stg_extra_flags,
186 |         'IO_DEPTH': args.io_depth,
187 |         'DIRECTIO': args.direct,
188 |         'THREADS': args.iops_threads,
189 |         'IOSIZE': 4,
190 |         'READ_PATTERN': args.read_pattern,
191 |         'WRITE_PATTERN': args.write_pattern,
192 |         'HOSTS': hosts
193 |     }
194 |     manager.execute('tests/fio_multi.sh',
195 |                     environment=environment,
196 |                     log_file=stg_iops_log)
197 | 
198 |     if args.pause > 0:
199 |         sleep(args.pause)
200 | 
201 | 
202 | def run_stg_meta(args: Namespace, bobber_version: str, iteration: int,
203 |                  hosts: str) -> NoReturn:
204 |     """
205 |     Run single or multi-node storage metadata test with mdtest.
206 | 
207 |     Run a single or multi-node storage metadata test with mdtest, which
208 |     exercises metadata operation performance for the filesystem.
209 | 
210 |     Parameters
211 |     ----------
212 |     args : Namespace
213 |         A ``Namespace`` of all settings specified by the user for the test.
214 |     bobber_version : string
215 |         A ``string`` of the local version of Bobber, such as '5.0.0'.
216 |     iteration : int
217 |         An ``int`` of the local test number, starting at 1.
218 |     hosts : string
219 |         A comma-separated list of hostnames to test against, such as
220 |         'host1,host2,host3,host4'.
221 | """ 222 | stg_meta_log = os.path.join(args.log_path, 223 | f'stg_meta_iteration_{iteration}_' 224 | f'systems_{len(hosts.split(","))}_' 225 | f'version_{bobber_version}.log') 226 | environment = { 227 | 'HOSTS': hosts, 228 | 'SSH_IFACE': args.ssh_iface, 229 | 'NCCL_IB_HCAS': args.nccl_ib_hcas 230 | } 231 | manager.execute('tests/mdtest_multi.sh', 232 | environment=environment, 233 | log_file=stg_meta_log) 234 | 235 | if args.pause > 0: 236 | sleep(args.pause) 237 | 238 | 239 | def run_nccl(args: Namespace, bobber_version: str, iteration: int, 240 | hosts: str) -> NoReturn: 241 | """ 242 | Run single or multi-node NCCL test. 243 | 244 | Run a single or multi-node NCCL test which verifies network and GPU 245 | performance and communication. 246 | 247 | Parameters 248 | ---------- 249 | args : Namespace 250 | A ``Namespace`` of all settings specified by the user for the test. 251 | bobber_version : string 252 | A ``string`` of the local version of Bobber, such as '5.0.0'. 253 | iteration : int 254 | An ``int`` of the local test number, starting at 1. 255 | hosts : string 256 | A comma-separated list of hostnames to test against, such as 257 | 'host1,host2,host3,host4'. 258 | """ 259 | nccl_log = os.path.join(args.log_path, 260 | f'nccl_iteration_{iteration}_' 261 | f'gpus_{args.gpus}_' 262 | f'nccl_max_{args.nccl_max}_' 263 | f'gid_{args.compute_gid}_' 264 | f'nccl_tc_{args.nccl_tc}_' 265 | f'systems_{len(hosts.split(","))}_' 266 | f'version_{bobber_version}.log') 267 | environment = { 268 | 'GPUS': args.gpus, 269 | 'NCCL_MAX': args.nccl_max, 270 | 'NCCL_TC': args.nccl_tc, 271 | 'COMPUTE_GID': args.compute_gid, 272 | 'HOSTS': hosts, 273 | 'SSH_IFACE': args.ssh_iface, 274 | 'NCCL_IB_HCAS': args.nccl_ib_hcas 275 | } 276 | manager.execute('tests/nccl_multi.sh', 277 | environment=environment, 278 | log_file=nccl_log) 279 | 280 | if args.pause > 0: 281 | sleep(args.pause) 282 | 283 | 284 | def kickoff_test(args: Namespace, bobber_version: str, iteration: int, 285 | hosts: str) -> NoReturn: 286 | """ 287 | Start a specified test. 288 | 289 | Launch a test as requested from the CLI for the given iteration. 290 | 291 | Parameters 292 | ---------- 293 | args : Namespace 294 | A ``Namespace`` of all settings specified by the user for the test. 295 | bobber_version : string 296 | A ``string`` of the local version of Bobber, such as '5.0.0'. 297 | iteration : int 298 | An ``int`` of the local test number, starting at 1. 299 | hosts : string 300 | A comma-separated list of hostnames to test against, such as 301 | 'host1,host2,host3,host4'. 
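    A rough sketch of how the sweep in ``test_selector`` below feeds this
    dispatcher (host names and iteration count are hypothetical)::

        hosts = []
        for host in 'host1,host2'.split(','):
            hosts.append(host)
            for iteration in range(1, 3):
                kickoff_test(args, '5.0.0', iteration, ','.join(hosts))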
302 | """ 303 | if args.command == RUN_DALI: 304 | run_dali(args, bobber_version, iteration, hosts) 305 | elif args.command == RUN_NCCL: 306 | run_nccl(args, bobber_version, iteration, hosts) 307 | elif args.command == RUN_STG_BW: 308 | run_stg_bw(args, bobber_version, iteration, hosts) 309 | elif args.command == RUN_STG_IOPS: 310 | run_stg_iops(args, bobber_version, iteration, hosts) 311 | elif args.command == RUN_STG_125K: 312 | run_stg_125k(args, bobber_version, iteration, hosts) 313 | elif args.command == RUN_STG_META: 314 | run_stg_meta(args, bobber_version, iteration, hosts) 315 | elif args.command == RUN_ALL: 316 | run_nccl(args, bobber_version, iteration, hosts) 317 | run_stg_meta(args, bobber_version, iteration, hosts) 318 | run_stg_bw(args, bobber_version, iteration, hosts) 319 | run_dali(args, bobber_version, iteration, hosts) 320 | run_stg_iops(args, bobber_version, iteration, hosts) 321 | run_stg_125k(args, bobber_version, iteration, hosts) 322 | 323 | 324 | def test_selector(args: Namespace, bobber_version: str) -> NoReturn: 325 | """ 326 | Start a test iteration. 327 | 328 | If the user requested to run a sweep of the hosts, the tests will begin 329 | with the first node in the hosts list for a single-node test, then 330 | progressively add the next host in the list until all nodes are tested 331 | together. During each iteration, one run of each requested test will be 332 | executed before going to the next iteration. 333 | 334 | Parameters 335 | ---------- 336 | args : Namespace 337 | A ``Namespace`` of all settings specified by the user for the test. 338 | bobber_version : string 339 | A ``string`` of the local version of Bobber, such as '5.0.0'. 340 | """ 341 | if args.sweep: 342 | hosts = [] 343 | 344 | for host in args.hosts.split(','): 345 | hosts.append(host) 346 | for iteration in range(1, args.iterations + 1): 347 | host_string = ','.join(hosts) 348 | kickoff_test(args, bobber_version, iteration, host_string) 349 | else: 350 | for iteration in range(1, args.iterations + 1): 351 | kickoff_test(args, bobber_version, iteration, args.hosts) 352 | -------------------------------------------------------------------------------- /bobber/lib/analysis/parse-mlperf.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | from argparse import ArgumentParser, Namespace 4 | from glob import glob 5 | from os.path import join 6 | from typing import NoReturn, Tuple 7 | 8 | 9 | class Aggregate: 10 | """ 11 | Find the aggregate results for from multiple iterations. 12 | 13 | Parameters 14 | ---------- 15 | epoch_zero_speeds : list 16 | A ``list`` of ``floats`` of the first epoch speeds. 17 | epoch_zero_times : list 18 | A ``list`` of ``floats`` of the epoch zero times. 19 | elapsed_times : list 20 | A ``list`` of ``floats`` of the overall elapsed time. 21 | average_speeds : list 22 | A ``list`` of ``floats`` of the overall average speeds. 23 | """ 24 | def __init__(self, epoch_zero_speeds: list, epoch_zero_times: list, 25 | elapsed_times: list, average_speeds: list) -> NoReturn: 26 | self.epoch_zero_speeds = epoch_zero_speeds 27 | self.epoch_zero_times = epoch_zero_times 28 | self.elapsed_times = elapsed_times 29 | self.average_speeds = average_speeds 30 | 31 | 32 | class Results: 33 | """ 34 | The results from a single test run. 35 | 36 | Parameters 37 | ---------- 38 | epoch_zero_speed : float 39 | A ``float`` of the first epoch speed. 
40 | epoch_zero_time : float 41 | A ``float`` of the epoch zero time. 42 | elapsed_time : float 43 | A ``float`` of the overall elapsed time. 44 | average_speed : float 45 | A ``float`` of the overall average speed. 46 | """ 47 | def __init__(self, epoch_zero_speed: float, epoch_zero_time: float, 48 | elapsed_time: float, average_speed: float) -> NoReturn: 49 | self.epoch_zero_speed = epoch_zero_speed 50 | self.epoch_zero_time = epoch_zero_time 51 | self.elapsed_time = elapsed_time 52 | self.average_speed = average_speed 53 | 54 | 55 | def parse_args() -> Namespace: 56 | """ 57 | Parse arguments passed to the MLPerf parser. 58 | 59 | Returns 60 | ------- 61 | Namespace 62 | Returns a ``Namespace`` of all of the arguments that were parsed from 63 | the application during runtime. 64 | """ 65 | parser = ArgumentParser(description='Parse MLPerf results') 66 | parser.add_argument('directory', type=str, help='The directory where ' 67 | 'MLPerf log results are saved.') 68 | return parser.parse_args() 69 | 70 | 71 | def average(list_to_average: list) -> float: 72 | """ 73 | Find the average of a list. 74 | 75 | Given a list of numbers, calculate the average of all values in the list. 76 | If the list is empty, default to 0.0. 77 | 78 | Parameters 79 | ---------- 80 | list_to_average : list 81 | A ``list`` of ``floats`` to find an average of. 82 | 83 | Returns 84 | ------- 85 | float 86 | Returns a ``float`` of the average value of the list. 87 | """ 88 | try: 89 | return round(sum(list_to_average) / len(list_to_average), 3) 90 | except ZeroDivisionError: 91 | return 0.0 92 | 93 | 94 | def ms_to_seconds(time: float) -> float: 95 | """ 96 | Convert milliseconds to seconds. 97 | 98 | Parameters 99 | ---------- 100 | time : float 101 | A ``float`` of time in milliseconds. 102 | 103 | Returns 104 | ------- 105 | float 106 | Returns a ``float`` of the converted time in seconds. 107 | """ 108 | return round(time / 1000, 3) 109 | 110 | 111 | def ms_to_minutes(time: float) -> float: 112 | """ 113 | Convert milliseconds to minutes. 114 | 115 | Parameters 116 | ---------- 117 | time : float 118 | A ``float`` of time in milliseconds. 119 | 120 | Returns 121 | ------- 122 | float 123 | Returns a ``float`` of the converted time in minutes. 124 | """ 125 | return round(time / 1000 / 60, 3) 126 | 127 | 128 | def get_files(directory: str) -> list: 129 | """ 130 | Read all log files. 131 | 132 | Given an input directory as a string, read all log files and return the 133 | filenames including the directory as a list. 134 | 135 | Parameters 136 | ---------- 137 | directory : str 138 | A ``string`` pointing to the results directory. 139 | 140 | Returns 141 | ------- 142 | list 143 | Returns a ``list`` of ``strings`` of the paths to each log file in the 144 | results directory. 145 | """ 146 | return glob(join(directory, '*.log')) 147 | 148 | 149 | def parse_epoch_line(line: str) -> Tuple[int, float]: 150 | """ 151 | Parse the throughput for each epoch. 152 | 153 | Pull the images/second and epoch for each results line in an MLPerf log. 154 | 155 | Parameters 156 | ---------- 157 | line : str 158 | A ``string`` of a results line in an MLPerf log. 159 | 160 | Returns 161 | ------- 162 | tuple 163 | Returns a ``tuple`` of (``int``, ``float``) of the epoch number and 164 | resulting speed in images/second. 
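    A minimal sketch against the documented line format (values are made up)::

        parse_epoch_line(
            'Epoch[0] Batch [20-40] Speed: 2500.5 samples/sec accuracy=0.92')
        # -> (0, 2500.5)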
165 | """ 166 | # Lines are in the format: 167 | # "Epoch[NUM] Batch [NUM-NUM] Speed: NUM.NUM samples/sec accuracy=NUM.NUM" 168 | epoch = re.findall(r'\[\d+\]', line)[0].replace('[', '').replace(']', '') 169 | speed = re.findall(r'Speed: .* samples', line) 170 | if len(speed) == 1: 171 | speed = speed[0].replace('Speed: ', '').replace(' samples', '') 172 | return int(epoch), float(speed) 173 | 174 | 175 | def parse_time(line: str) -> int: 176 | """ 177 | Parse the timestamp from a line in the log. 178 | 179 | Parameters 180 | ---------- 181 | line : str 182 | A ``string`` of a line in an MLPerf log file. 183 | 184 | Returns 185 | ------- 186 | int 187 | Returns an ``int`` of the parsed timestamp. 188 | """ 189 | return int(re.findall(r'\d+', line)[0]) 190 | 191 | 192 | def parse_epoch_values(logfile: str) -> Tuple[list, list]: 193 | """ 194 | Parse the epoch and throughput lines. 195 | 196 | Find all of the lines that contain a throughput and save the first epoch 197 | and overall epoch results in lists. 198 | 199 | Parameters 200 | ---------- 201 | logfile : str 202 | A ``string`` of all contents from a logfile. 203 | 204 | Returns 205 | ------- 206 | tuple 207 | Returns a ``tuple`` of (``list``, ``list``) containing the first epoch 208 | results followed by all results. 209 | """ 210 | epoch_zero_vals, all_epoch_vals = [], [] 211 | epoch_values = re.findall(r'Epoch\[\d+\] Batch.*', logfile) 212 | 213 | for value in epoch_values: 214 | epoch, speed = parse_epoch_line(value) 215 | all_epoch_vals.append(speed) 216 | if epoch == 0: 217 | epoch_zero_vals.append(speed) 218 | return epoch_zero_vals, all_epoch_vals 219 | 220 | 221 | def parse_epoch_times(logfile: str) -> Tuple[list, list]: 222 | """ 223 | Parse the time for each epoch. 224 | 225 | Find the overall time it takes to complete each epoch by finding the 226 | difference in milliseconds. 227 | 228 | Parameters 229 | ---------- 230 | logfile : str 231 | A ``string`` of all contents from a logfile. 232 | 233 | Returns 234 | ------- 235 | tuple 236 | Returns a ``tuple`` of (``list``, ``list``) representing the time taken 237 | during the first epoch and the overall elapsed time for the test. 238 | """ 239 | epoch_start_times = re.findall(r'time_ms.*?epoch_start', logfile) 240 | epoch_stop_times = re.findall(r'time_ms.*?epoch_stop', logfile) 241 | # The epoch 0 time is the difference between the timestamp where epoch 0 242 | # ended, and the timestamp where epoch 0 began. 243 | epoch_zero_time = parse_time(epoch_stop_times[0]) - \ 244 | parse_time(epoch_start_times[0]) 245 | # The total elapsed time is the difference between the timestamp of when 246 | # the final epoch ended, and the timestamp where epoch 0 began. 247 | elapsed_time = parse_time(epoch_stop_times[-1]) - \ 248 | parse_time(epoch_start_times[0]) 249 | return epoch_zero_time, elapsed_time 250 | 251 | 252 | def parse_file(logfile: str) -> object: 253 | """ 254 | Parse a single MLPerf file. 255 | 256 | Find the first epoch and overall results for a single MLPerf file and 257 | create a singular object to represent the results. 258 | 259 | Parameters 260 | ---------- 261 | logfile : str 262 | A ``string`` of all contents from a logfile. 263 | 264 | Returns 265 | ------- 266 | Results instance 267 | Returns an instance of the Results class. 
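    A minimal sketch using synthetic, simplified lines that merely satisfy the
    regular expressions used below (real MLPerf logs use MLLOG-formatted
    entries)::

        lines = ['time_ms 1000 epoch_start',
                 'Epoch[0] Batch [0-20] Speed: 100.0 samples/sec accuracy=0.1',
                 'Epoch[0] Batch [20-40] Speed: 110.0 samples/sec accuracy=0.2',
                 'time_ms 61000 epoch_stop']
        res = parse_file('\n'.join(lines))
        # res.epoch_zero_speed -> 105.0, res.epoch_zero_time -> 60000 (ms)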
268 | """ 269 | epoch_zero_vals, all_epoch_vals = parse_epoch_values(logfile) 270 | epoch_zero_time, elapsed_time = parse_epoch_times(logfile) 271 | results = Results(average(epoch_zero_vals), 272 | epoch_zero_time, 273 | elapsed_time, 274 | average(all_epoch_vals)) 275 | return results 276 | 277 | 278 | def find_num_nodes(logfile: str) -> int: 279 | """ 280 | Find the number of nodes tested. 281 | 282 | Parameters 283 | ---------- 284 | logfile : str 285 | A ``string`` of all contents from a logfile. 286 | 287 | Returns 288 | ------- 289 | int 290 | Returns an ``integer`` of the number of nodes tested. 291 | """ 292 | clear_cache_command = re.findall(r'srun.*Clearing cache on ', logfile) 293 | if len(clear_cache_command) == 0: 294 | print('Unable to find number of nodes tested. Assuming single node.') 295 | return 1 296 | n_tasks = re.findall(r'ntasks=\d+', clear_cache_command[0]) 297 | num_nodes = n_tasks[0].replace('ntasks=', '') 298 | return num_nodes 299 | 300 | 301 | def find_filesystem_test_path(logfile: str) -> str: 302 | """ 303 | Parse the filesystem path from the log file. 304 | 305 | The 'container-mounts=...' line in each log file contains the location of 306 | the shared filesystem. 307 | 308 | Parameters 309 | ---------- 310 | logfiles : str 311 | A ``string`` of all contents from a logfile. 312 | 313 | Returns 314 | ------- 315 | str 316 | Returns a ``string`` of the location of the filesystem. 317 | """ 318 | container_mounts_line = re.findall(r'container-mounts=\S*:/data', logfile) 319 | if len(container_mounts_line) == 0: 320 | print('Unable to find container mount directory. Leaving empty.') 321 | return '' 322 | container_data_mount = container_mounts_line[0].replace( 323 | 'container-mounts=', '') 324 | return container_data_mount 325 | 326 | 327 | def read_files(logfiles: list) -> Tuple[object, int, str]: 328 | """ 329 | Read all MLPerf files and find aggregate results. 330 | 331 | Read all log files in a directory and determine the average speed and time 332 | taken to process images for both the first epoch and all results combined. 333 | 334 | Parameters 335 | ---------- 336 | logfiles : list 337 | A ``list`` of the filepaths for all log files in an input directory. 338 | 339 | Returns 340 | ------- 341 | tuple 342 | Returns a ``tuple`` of an instance of the Aggregate class, the number 343 | of nodes tested, and the path to the filesystem under test. 
344 | """ 345 | all_results = [] 346 | prev_nodes_found = None 347 | prev_filesystem_test_path = None 348 | 349 | for filename in logfiles: 350 | with open(filename, 'r') as logpointer: 351 | log = logpointer.read() 352 | results = parse_file(log) 353 | all_results.append(results) 354 | nodes_tested = find_num_nodes(log) 355 | filesystem_test_path = find_filesystem_test_path(log) 356 | if prev_nodes_found and nodes_tested != prev_nodes_found: 357 | raise ValueError('Error: Mixed node sizes found in log files!') 358 | if prev_filesystem_test_path and \ 359 | filesystem_test_path != prev_filesystem_test_path: 360 | raise ValueError('Error: Mixed test paths found in log files!') 361 | prev_nodes_found = nodes_tested 362 | prev_filesystem_test_path = filesystem_test_path 363 | aggregate = Aggregate( 364 | [result.epoch_zero_speed for result in all_results], 365 | [result.epoch_zero_time for result in all_results], 366 | [result.elapsed_time for result in all_results], 367 | [result.average_speed for result in all_results] 368 | ) 369 | return aggregate, nodes_tested, filesystem_test_path 370 | 371 | 372 | def print_averages(results: object, directory: str, nodes_tested: int, 373 | filesystem_test_path: str) -> NoReturn: 374 | """ 375 | Print the average results. 376 | 377 | Print the average time and speed for epoch 0 and all results, plus test 378 | information including the log directory and the location of the filesystem 379 | under test. 380 | 381 | Parameters 382 | ---------- 383 | results : object 384 | An instance of the Results class containing the results from a single 385 | test. 386 | directory : str 387 | A ``string`` of the passed directory where results were saved. 388 | nodes_tested : int 389 | An ``int`` of the number of nodes that were tested for a file. 390 | filesystem_test_path : str 391 | A ``string`` of the path to the filesystem under test. 392 | """ 393 | e_zero_speed = average(results.epoch_zero_speeds) 394 | e_zero_time = ms_to_seconds(average(results.epoch_zero_times)) 395 | overall_speed = average(results.average_speeds) 396 | overall_time = ms_to_minutes(average(results.elapsed_times)) 397 | 398 | output = f"""MLPerf Results: 399 | Log directory name: {directory} 400 | Filesystem test path: {filesystem_test_path} 401 | Number of iterations: {len(results.epoch_zero_speeds)} 402 | Nodes tested: {nodes_tested} 403 | Epoch 0: 404 | Speed: {e_zero_speed} images/second 405 | Average time: {e_zero_time} seconds 406 | Overall: 407 | Speed: {overall_speed} images/second 408 | Average time: {overall_time} minutes""" 409 | print(output) 410 | 411 | 412 | def main() -> NoReturn: 413 | """ 414 | Parse MLPerf test results. 
415 | """ 416 | args = parse_args() 417 | logfiles = get_files(args.directory) 418 | aggregate, nodes_tested, filesystem_test_path = read_files(logfiles) 419 | print_averages(aggregate, args.directory, nodes_tested, 420 | filesystem_test_path) 421 | 422 | 423 | if __name__ == '__main__': 424 | main() 425 | -------------------------------------------------------------------------------- /bobber/lib/analysis/parse_results.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import json 3 | import sys 4 | from collections import defaultdict 5 | from glob import glob 6 | from os.path import join 7 | from bobber.lib.exit_codes import MISSING_LOG_FILES, SUCCESS 8 | from bobber.lib.analysis.aggregate_results import AggregateResults 9 | from bobber.lib.analysis.common import (check_bobber_version, 10 | divide_logs_by_systems) 11 | from bobber.lib.analysis.compare_baseline import compare_baseline 12 | from bobber.lib.analysis.dali import parse_dali_file 13 | from bobber.lib.analysis.fio import parse_fio_bw_file, parse_fio_iops_file 14 | from bobber.lib.analysis.meta import parse_meta_file 15 | from bobber.lib.analysis.nccl import parse_nccl_file 16 | from bobber.lib.analysis.table import display_table 17 | from bobber.lib.system.file_handler import write_file 18 | from typing import NoReturn, Optional, Tuple 19 | 20 | 21 | def get_files(directory: str) -> list: 22 | """ 23 | Read all log files. 24 | 25 | Given an input directory as a string, read all log files and return the 26 | filenames including the directory as a list. 27 | 28 | Parameters 29 | ---------- 30 | directory : str 31 | A ``string`` pointing to the results directory. 32 | 33 | Returns 34 | ------- 35 | list 36 | Returns a ``list`` of ``strings`` of the paths to each log file in the 37 | results directory. 38 | """ 39 | return glob(join(directory, '*.log')) 40 | 41 | 42 | def parse_fio_bw(log_files: list) -> Tuple[dict, dict, dict, dict]: 43 | """ 44 | Parse all FIO bandwidth logs. 45 | 46 | Find each FIO bandwidth log in the results directory and parse the read and 47 | write results and parameters from each log for all system counts. 48 | 49 | Parameters 50 | ---------- 51 | log_files : list 52 | A ``list`` of ``strings`` of the paths to each log file in the results 53 | directory. 54 | 55 | Returns 56 | ------- 57 | tuple 58 | A ``tuple`` of four dictionaries containing the read results, write 59 | results, read parameters, and write parameters, respectively for all 60 | system counts. 61 | """ 62 | read_sys_results = defaultdict(list) 63 | write_sys_results = defaultdict(list) 64 | read_params, write_params = None, None 65 | 66 | fio_logs_by_systems = divide_logs_by_systems(log_files, 'stg_bw_iteration') 67 | 68 | for systems, files in fio_logs_by_systems.items(): 69 | read_sys_results, write_sys_results, read_params, write_params = \ 70 | parse_fio_bw_file(files, 71 | systems, 72 | read_sys_results, 73 | write_sys_results) 74 | return read_sys_results, write_sys_results, read_params, write_params 75 | 76 | 77 | def parse_fio_iops(log_files: list) -> Tuple[dict, dict, dict, dict]: 78 | """ 79 | Parse all FIO IOPS logs. 80 | 81 | Find each FIO IOPS log in the results directory and parse the read and 82 | write results and parameters from each log for all system counts. 83 | 84 | Parameters 85 | ---------- 86 | log_files : list 87 | A ``list`` of ``strings`` of the paths to each log file in the results 88 | directory. 
89 | 90 | Returns 91 | ------- 92 | tuple 93 | A ``tuple`` of four dictionaries containing the read results, write 94 | results, read parameters, and write parameters, respectively for all 95 | system counts. 96 | """ 97 | read_sys_results = defaultdict(list) 98 | write_sys_results = defaultdict(list) 99 | read_params, write_params = None, None 100 | 101 | fio_logs_by_systems = divide_logs_by_systems(log_files, 102 | 'stg_iops_iteration') 103 | 104 | for systems, files in fio_logs_by_systems.items(): 105 | read_sys_results, write_sys_results, read_params, write_params = \ 106 | parse_fio_iops_file(files, 107 | systems, 108 | read_sys_results, 109 | write_sys_results) 110 | return read_sys_results, write_sys_results, read_params, write_params 111 | 112 | 113 | def parse_fio_125k_bw(log_files: list) -> Tuple[dict, dict, dict, dict]: 114 | """ 115 | Parse all FIO 125k bandwidth logs. 116 | 117 | Find each FIO 125k bandwidth log in the results directory and parse the 118 | read and write results and parameters from each log for all system counts. 119 | 120 | Parameters 121 | ---------- 122 | log_files : list 123 | A ``list`` of ``strings`` of the paths to each log file in the results 124 | directory. 125 | 126 | Returns 127 | ------- 128 | tuple 129 | A ``tuple`` of four dictionaries containing the 125k read results, 125k 130 | write results, 125k read parameters, and 125k write parameters for all 131 | system counts. 132 | """ 133 | read_sys_results = defaultdict(list) 134 | write_sys_results = defaultdict(list) 135 | read_params, write_params = None, None 136 | 137 | fio_logs_by_systems = divide_logs_by_systems(log_files, 138 | 'stg_125k_iteration') 139 | 140 | for systems, files in fio_logs_by_systems.items(): 141 | read_sys_results, write_sys_results, read_params, write_params = \ 142 | parse_fio_bw_file(files, 143 | systems, 144 | read_sys_results, 145 | write_sys_results) 146 | return read_sys_results, write_sys_results, read_params, write_params 147 | 148 | 149 | def parse_nccl(log_files: list) -> Tuple[dict, dict]: 150 | """ 151 | Parse all NCCL logs. 152 | 153 | Find the maximum bus bandwidth and resulting byte size for all NCCL files 154 | for all system counts. 155 | 156 | Parameters 157 | ---------- 158 | log_files : list 159 | A ``list`` of ``strings`` of the paths to each log file in the results 160 | directory. 161 | 162 | Returns 163 | ------- 164 | tuple 165 | Returns a ``tuple`` of (``dict``, ``dict``) representing the maximum 166 | bus bandwidth and corresponding byte size for all system counts. 167 | """ 168 | bw_results = defaultdict(list) 169 | bytes_results = defaultdict(list) 170 | 171 | nccl_logs_by_systems = divide_logs_by_systems(log_files, 'nccl') 172 | 173 | for systems, files in nccl_logs_by_systems.items(): 174 | max_bw, byte_size = parse_nccl_file(files, systems) 175 | bw_results[systems] = max_bw 176 | bytes_results[systems] = byte_size 177 | return bw_results, bytes_results 178 | 179 | 180 | def parse_dali(log_files: list) -> dict: 181 | """ 182 | Parse all DALI logs. 183 | 184 | Parse the bandwidth and throughput for all image types and sizes from all 185 | DALI log files. 186 | 187 | Parameters 188 | ---------- 189 | log_files : list 190 | A ``list`` of ``strings`` of the paths to each log file in the results 191 | directory. 192 | 193 | Returns 194 | ------- 195 | dict 196 | Returns a ``dictionary`` of the throughput and bandwidth for all system 197 | counts. 
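Examples
--------
An illustrative sketch (the results directory is hypothetical); ``get_files`` is the helper defined earlier in this module::

    dali_results = parse_dali(get_files('/path/to/results'))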
198 | """ 199 | results_dict = {} 200 | 201 | dali_logs_by_systems = divide_logs_by_systems(log_files, 'dali') 202 | 203 | for systems, files in dali_logs_by_systems.items(): 204 | results_dict = parse_dali_file(files, systems, results_dict) 205 | return results_dict 206 | 207 | 208 | def parse_meta(log_files: list) -> dict: 209 | """ 210 | Parse all metadata logs. 211 | 212 | Parse the minimum, maximum, and mean values for all operations in the 213 | metadata log files. 214 | 215 | Parameters 216 | ---------- 217 | log_files : list 218 | A ``list`` of ``strings`` of the paths to each log file in the results 219 | directory. 220 | 221 | Returns 222 | ------- 223 | dict 224 | Returns a ``dictionary`` of the results from various metadata 225 | operations for all system counts. 226 | """ 227 | results_dict = {} 228 | 229 | meta_logs_by_systems = divide_logs_by_systems(log_files, 'stg_meta') 230 | 231 | for systems, files in meta_logs_by_systems.items(): 232 | results_dict = parse_meta_file(files, systems, results_dict) 233 | return results_dict 234 | 235 | 236 | def save_json(final_dictionary_output: dict, filename: str) -> NoReturn: 237 | """ 238 | Save results to a file. 239 | 240 | Save the final JSON data to a file for future reference. If the filename is 241 | not specified, don't save the file. 242 | 243 | Parameters 244 | ---------- 245 | final_dictionary_output : dict 246 | A ``dictionary`` of the final JSON output to save. 247 | filename : str 248 | A ``string`` of the filename to write the JSON data to. 249 | """ 250 | if not filename: 251 | return 252 | with open(filename, 'w') as json_file: 253 | json.dump(final_dictionary_output, json_file) 254 | print(f'JSON data saved to {filename}') 255 | 256 | 257 | def save_yaml_baseline(final_dictionary_output: dict, 258 | directory: str) -> NoReturn: 259 | """ 260 | Save results as a YAML baseline file. 261 | 262 | The parsed results should be saved as a YAML baseline file which can be 263 | used to compare similar systems against existing results. The YAML file 264 | will be saved in the results directory as "baseline.yaml". 265 | 266 | Parameters 267 | ---------- 268 | final_dictionary_output : dict 269 | A ``dictionary`` of the parsed results on a per-system level. 270 | directory : str 271 | A ``string`` of the directory where results are saved. 
272 | """ 273 | contents = 'systems:\n' 274 | 275 | for systems, results in final_dictionary_output['systems'].items(): 276 | dali = results.get('dali', {}) 277 | small_jpg = dali.get('800x600 standard jpg', {}) 278 | large_jpg = dali.get('3840x2160 standard jpg', {}) 279 | small_tf = dali.get('800x600 tfrecord', {}) 280 | large_tf = dali.get('3840x2160 tfrecord', {}) 281 | contents += f""" {systems}: 282 | bandwidth: 283 | # FIO BW speed in bytes/second 284 | read: {results.get('bandwidth', {}).get('read', 0)} 285 | write: {results.get('bandwidth', {}).get('write', 0)} 286 | iops: 287 | # FIO IOPS speed in ops/second 288 | read: {results.get('iops', {}).get('read', 0)} 289 | write: {results.get('iops', {}).get('write', 0)} 290 | 125k_bandwidth: 291 | # FIO 125k BW speed in bytes/second 292 | read: {results.get('125k_bandwidth', {}).get('read', 0)} 293 | write: {results.get('125k_bandwidth', {}).get('write', 0)} 294 | nccl: 295 | # NCCL maximum bus bandwidth in GB/s 296 | max_bus_bw: {results.get('nccl', {}).get('max_bus_bw', 0)} 297 | dali: 298 | # DALI average speed in images/second 299 | 800x600 standard jpg: {small_jpg.get('average images/second', 0)} 300 | 3840x2160 standard jpg: {large_jpg.get('average images/second', 0)} 301 | 800x600 tfrecord: {small_tf.get('average images/second', 0)} 302 | 3840x2160 tfrecord: {large_tf.get('average images/second', 0)} 303 | """ 304 | write_file(f'{directory}/baseline.yaml', contents) 305 | 306 | 307 | def main(directory: str, 308 | baseline: Optional[str] = None, 309 | custom_baseline: Optional[str] = None, 310 | tolerance: Optional[int] = 0, 311 | verbose: Optional[bool] = False, 312 | override_version_check: Optional[bool] = False, 313 | json_filename: Optional[str] = None) -> NoReturn: 314 | """ 315 | Parse all results on a per-system level. 316 | 317 | Read all log files from a results directory and iterate through the results 318 | on a per-system level. The results displayed are of the aggregate value for 319 | each system count. 320 | 321 | A baseline can be optionally included to compare the results in the output 322 | directory against pre-configured results to verify performance meets 323 | desired levels. 324 | 325 | Parameters 326 | ---------- 327 | directory : str 328 | A ``string`` of the directory where results are located. 329 | baseline : str (optional) 330 | A ``string`` representing the key from the included baselines to 331 | compare results to. 332 | custom_baseline : str (optional) 333 | A ``string`` of the filename to a custom YAML config file to read and 334 | compare results to. 335 | tolerance : int (optional) 336 | An ``integer`` of the tolerance as a percentage below the baseline to 337 | allow results to still be marked as passing. 338 | verbose : bool (optional) 339 | A ``boolean`` that prints additional textual output when `True`. 340 | override_version_check : bool (optional) 341 | A ``boolean`` which skips checking the Bobber version tested when 342 | `True`. 343 | json_filename : str (optional) 344 | A ``string`` of the filename to save JSON data to. 345 | """ 346 | final_dictionary_output = {'systems': {}} 347 | 348 | log_files = get_files(directory) 349 | if len(log_files) < 1: 350 | print('No log files found. 
Please specify a directory containing '
351 | 'valid logs.')
352 | print('Exiting...')
353 | sys.exit(MISSING_LOG_FILES)
354 | bobber_version = check_bobber_version(log_files,
355 | override_version_check)
356 | bw_results = parse_fio_bw(log_files)
357 | read_bw, write_bw, read_bw_params, write_bw_params = bw_results
358 | bw_125k_results = parse_fio_125k_bw(log_files)
359 | read_125k_bw, write_125k_bw, read_125k_bw_params, write_125k_bw_params = \
360 | bw_125k_results
361 | iops_results = parse_fio_iops(log_files)
362 | read_iops, write_iops, read_iops_params, write_iops_params = iops_results
363 | metadata = parse_meta(log_files)
364 | max_bw, bytes_sizes = parse_nccl(log_files)
365 | dali_results = parse_dali(log_files)
366 | total_systems = 0
367 | systems = []
368 |
369 | for result in [read_bw, read_iops, read_125k_bw, max_bw, dali_results,
370 | metadata]:
371 | try:
372 | total_systems = max(result.keys())
373 | systems = sorted(result.keys())
374 | except ValueError:
375 | continue
376 | else:
377 | break
378 |
379 | for system_num in systems:
380 | aggregate = AggregateResults(read_bw,
381 | write_bw,
382 | read_bw_params,
383 | write_bw_params,
384 | read_iops,
385 | write_iops,
386 | read_iops_params,
387 | write_iops_params,
388 | read_125k_bw,
389 | write_125k_bw,
390 | read_125k_bw_params,
391 | write_125k_bw_params,
392 | max_bw,
393 | bytes_sizes,
394 | dali_results,
395 | metadata,
396 | system_num)
397 | final_dictionary_output['systems'][str(system_num)] = aggregate.json
398 | if verbose:
399 | print(aggregate)
400 |
401 | final_dictionary_output['total_systems'] = total_systems
402 | final_dictionary_output['bobber_version'] = bobber_version
403 | display_table(final_dictionary_output)
404 | save_yaml_baseline(final_dictionary_output, directory)
405 | save_json(final_dictionary_output, json_filename)
406 |
407 | if custom_baseline:
408 | compare_baseline(final_dictionary_output, custom_baseline, tolerance,
409 | custom=True)
410 | elif baseline:
411 | compare_baseline(final_dictionary_output, baseline, tolerance)
412 |
--------------------------------------------------------------------------------
/bobber/lib/analysis/aggregate_results.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: MIT
2 | from functools import wraps
3 | from typing import NoReturn
4 |
5 |
6 | def average_decorator(func: 'method') -> 'method':
7 | """
8 | A simple wrapper to calculate the average of a list.
9 |
10 | This wrapper can be used on any function or method which returns a list of
11 | ints or floats and calculates the average of those values. If the average
12 | can't be calculated for any reason, the value will default to 0.0.
13 |
14 | Parameters
15 | ----------
16 | func : function/method
17 | A function to be wrapped with the average decorator.
18 |
19 | Returns
20 | -------
21 | function/method
22 | Returns the wrapped function, which returns the ``float`` average.
23 | """
24 | @wraps(func)
25 | def wrapper(*args):
26 | value = func(*args)
27 | try:
28 | return sum(value) / len(value)
29 | except (TypeError, ValueError, ZeroDivisionError):
30 | return 0.0
31 | return wrapper
32 |
33 |
34 | class AggregateResults:
35 | """
36 | Determine the aggregate values for all results.
37 |
38 | Bobber test runs typically include multiple iterations of all tests in an
39 | attempt to eliminate noise. In order to find the true result, all
40 | iterations from a single test pass are averaged together. This is done on a
41 | per-system count level where all N-iterations of the single-node tests are
42 | aggregated together, then all N-iterations of the two-node tests (if
43 | applicable) are aggregated together, and so on.
44 |
45 | This class has a few helper methods to make it easy to output all data to
46 | both JSON format and a string representing the results.
47 |
48 | Parameters
49 | ----------
50 | read_bw : dict
51 | A ``dictionary`` containing all of the fio read bandwidth results for
52 | N-systems.
53 | write_bw : dict
54 | A ``dictionary`` containing all of the fio write bandwidth results for
55 | N-systems.
56 | read_bw_params : dict
57 | A ``dictionary`` of the parameters used during the fio read bandwidth
58 | tests.
59 | write_bw_params : dict
60 | A ``dictionary`` of the parameters used during the fio write bandwidth
61 | tests.
62 | read_iops : dict
63 | A ``dictionary`` containing all of the fio read iops results for
64 | N-systems.
65 | write_iops : dict
66 | A ``dictionary`` containing all of the fio write iops results for
67 | N-systems.
68 | read_iops_params : dict
69 | A ``dictionary`` of the parameters used during the fio read iops tests.
70 | write_iops_params : dict
71 | A ``dictionary`` of the parameters used during the fio write iops
72 | tests.
73 | read_125k_bw : dict
74 | A ``dictionary`` containing all of the fio 125k read bandwidth results
75 | for N-systems.
76 | write_125k_bw : dict
77 | A ``dictionary`` containing all of the fio 125k write bandwidth results
78 | for N-systems.
79 | read_125k_bw_params : dict
80 | A ``dictionary`` of the parameters used during the fio 125k read
81 | bandwidth tests.
82 | write_125k_bw_params : dict
83 | A ``dictionary`` of the parameters used during the fio 125k write
84 | bandwidth tests.
85 | max_bw : dict
86 | A ``dictionary`` of the maximum bus bandwidth achieved from NCCL tests.
87 | bytes_sizes : dict
88 | A ``dictionary`` of the byte size used when the maximum bus bandwidth
89 | was achieved for NCCL tests.
90 | dali_results : dict
91 | A ``dictionary`` of the DALI throughput for all image sizes and types
92 | in images/second.
93 | metadata : dict
94 | A ``dictionary`` of the max, min, and mean values for all metadata
95 | operations.
96 | systems : int
97 | An ``int`` for the number of systems the current results represent.
98 | """ 99 | def __init__(self, 100 | read_bw: dict, 101 | write_bw: dict, 102 | read_bw_params: dict, 103 | write_bw_params: dict, 104 | read_iops: dict, 105 | write_iops: dict, 106 | read_iops_params: dict, 107 | write_iops_params: dict, 108 | read_125k_bw: dict, 109 | write_125k_bw: dict, 110 | read_125k_bw_params: dict, 111 | write_125k_bw_params: dict, 112 | max_bw: dict, 113 | bytes_sizes: dict, 114 | dali_results: dict, 115 | metadata: dict, 116 | systems: int) -> NoReturn: 117 | self._read_bw = read_bw 118 | self._read_bw_params = read_bw_params 119 | self._read_iops = read_iops 120 | self._read_iops_params = read_iops_params 121 | self._125k_read_bw = read_125k_bw 122 | self._125k_read_bw_params = read_125k_bw_params 123 | self._write_bw = write_bw 124 | self._write_bw_params = write_bw_params 125 | self._write_iops = write_iops 126 | self._write_iops_params = write_iops_params 127 | self._125k_write_bw = write_125k_bw 128 | self._125k_write_bw_params = write_125k_bw_params 129 | self._max_bw = max_bw 130 | self._bytes_sizes = bytes_sizes 131 | self._dali_results = dali_results 132 | self._metadata = metadata 133 | self._num_systems = systems 134 | 135 | def __str__(self) -> str: 136 | """ 137 | A helper function to display results in human-readable text. 138 | 139 | Find the aggregate results for each test for N-systems and return the 140 | final output as a string, similar to the following: 141 | 142 | Systems tested: 1 143 | Aggregate Read Bandwidth: 1.595 GB/s 144 | Aggregate Write Bandwidth: 1.232 GB/s 145 | Aggregate Read IOPS: 136.5 k IOPS 146 | Aggregate Write IOPS: 135.0 k IOPS 147 | Aggregate 125k Read Bandwidth: 1.595 GB/s 148 | Aggregate 125k Write Bandwidth: 1.232 GB/s 149 | NCCL Max Bus Bandwidth: 79.865 at 512.0 MB 150 | Mdtest 151 | Directory creation: 71406.29550000001 ops 152 | Directory stat: 2698234.1525 ops 153 | Directory removal: 16016.5275 ops 154 | File creation: 137218.586 ops 155 | File stat: 2705405.084 ops 156 | File read: 2230275.9365 ops 157 | File removal: 175736.5435 ops 158 | Tree creation: 1546.792 ops 159 | Tree removal: 5878.747 ops 160 | 161 | DALI Standard 800x600 162 | Min Speed: 2509.35 images/second (0.727 GB/s) 163 | Avg Speed: 2694.595 images/second (0.78 GB/s) 164 | DALI Standard 3840x2160 165 | Min Speed: 344.078 images/second (1.712 GB/s) 166 | Avg Speed: 430.854 images/second (2.144 GB/s) 167 | DALI TFRecord 800x600 168 | Min Speed: 2508.069 images/second (0.726 GB/s) 169 | Avg Speed: 2665.653 images/second (0.772 GB/s) 170 | DALI TFRecord 3840x2160 171 | Min Speed: 317.276 images/second (1.579 GB/s) 172 | Avg Speed: 376.862 images/second (1.875 GB/s) 173 | 174 | Returns 175 | ------- 176 | str 177 | Returns a ``string`` of the final aggregate results for N-systems. 
178 | """ 179 | values_to_print = [ 180 | # [Field name, value, unit] 181 | ['Systems tested:', self._num_systems, ''], 182 | ['Aggregate Read Bandwidth:', self.average_read_bw, ' GB/s'], 183 | ['Aggregate Write Bandwidth:', self.average_write_bw, ' GB/s'], 184 | ['Aggregate 125k Read Bandwidth:', self.average_125k_read_bw, 185 | ' GB/s'], 186 | ['Aggregate 125k Write Bandwidth:', self.average_125k_write_bw, 187 | ' GB/s'], 188 | ['Aggregate Read IOPS:', self.average_read_iops, 'k IOPS'], 189 | ['Aggregate Write IOPS:', self.average_write_iops, 'k IOPS'], 190 | ] 191 | output = '' 192 | for item in values_to_print: 193 | field, value, unit = item 194 | if value: 195 | output += f'{field} {value} {unit}\n' 196 | if round(self.max_bus_bandwidth, 3) != 0.0: 197 | output += ('NCCL Max Bus Bandwidth: ' 198 | f'{round(self.max_bus_bandwidth, 3)} ' 199 | f'at {self.max_bus_bytes / 1024 / 1024} MB') 200 | 201 | if self._metadata: 202 | output += '\n' 203 | output += self._metadata_print() 204 | 205 | if self._dali_results_print('800x600 standard jpg'): 206 | output += (f""" 207 | DALI Standard 800x600{self._dali_results_print('800x600 standard jpg')} 208 | DALI Standard 3840x2160{self._dali_results_print('3840x2160 standard jpg')} 209 | DALI TFRecord 800x600{self._dali_results_print('800x600 tfrecord')} 210 | DALI TFRecord 3840x2160{self._dali_results_print('3840x2160 tfrecord')} 211 | """) 212 | else: 213 | output += '\n' 214 | return output 215 | 216 | def _metadata_print(self) -> str: 217 | """ 218 | Determine and return the metadata results. 219 | 220 | Iterate through all of the final metadata results for each operation 221 | type and generate the aggregate number of operations for all 222 | iterations. 223 | 224 | Returns 225 | ------- 226 | str 227 | Returns a ``string`` of the formated metadata results. 228 | """ 229 | output = 'Mdtest\n' 230 | 231 | if self._metadata[self._num_systems] == '': 232 | return '' 233 | for key, values in self._metadata[self._num_systems].items(): 234 | output += (f" {key}: {values['mean']} ops\n") 235 | return output 236 | 237 | def _dali_results_print(self, size: str) -> str: 238 | """ 239 | Determine and return the DALI results. 240 | 241 | Calculate the minimum and average speed in images/second and the 242 | resulting bandwidth for each image type and format and return the 243 | result as a string. 244 | 245 | Parameters 246 | ---------- 247 | size : str 248 | The size and type of image to parse, such as '800x600 tfrecord'. 249 | 250 | Returns 251 | ------- 252 | str 253 | Returns a ``string`` of the formated DALI results. 254 | """ 255 | try: 256 | dali_results = self._dali_results[self._num_systems] 257 | except KeyError: 258 | return '' 259 | min_speed = round(dali_results[size]['min images/second'], 3) 260 | min_bw = round(dali_results[size]['min bandwidth'] * 1e-9, 3) 261 | avg_speed = round(dali_results[size]['average images/second'], 3) 262 | avg_bw = round(dali_results[size]['average bandwidth'] * 1e-9, 3) 263 | 264 | output = (f""" 265 | Min Speed: {min_speed} images/second ({min_bw} GB/s) 266 | Avg Speed: {avg_speed} images/second ({avg_bw} GB/s)""") 267 | return output 268 | 269 | @property 270 | def json(self) -> dict: 271 | """ 272 | Generate a JSON representation of the results. 273 | 274 | Creating a JSON dump of the results makes it easier for remote tools to 275 | archive or display results in an easily ingestible format, such as 276 | webpages or databases. 
277 | 278 | Returns 279 | ------- 280 | dict 281 | Returns a JSON-parsable ``dictionary`` representation of all of the 282 | results including parameters and units where applicable. 283 | """ 284 | results = { 285 | 'systems_tested': self._num_systems, 286 | 'bandwidth': { 287 | 'read': self._average_read_bw(), 288 | 'write': self._average_write_bw(), 289 | 'unit': 'bytes/second', 290 | 'parameters': { 291 | 'read': self._read_bw_params, 292 | 'write': self._write_bw_params 293 | } 294 | }, 295 | 'iops': { 296 | 'read': self._average_read_iops(), 297 | 'write': self._average_write_iops(), 298 | 'unit': 'operations/second', 299 | 'parameters': { 300 | 'read': self._read_iops_params, 301 | 'write': self._write_iops_params 302 | } 303 | }, 304 | '125k_bandwidth': { 305 | 'read': self._average_125k_read_bw(), 306 | 'write': self._average_125k_write_bw(), 307 | 'unit': 'operations/second', 308 | 'parameters': { 309 | 'read': self._125k_read_bw_params, 310 | 'write': self._125k_write_bw_params 311 | } 312 | }, 313 | 'nccl': { 314 | 'max_bus_bw': self.max_bus_bandwidth, 315 | 'max_bus_bytes': self.max_bus_bytes, 316 | 'max_bus_bw_units': 'GB/s' 317 | } 318 | } 319 | try: 320 | results['dali'] = self._dali_results[self._num_systems] 321 | except KeyError: 322 | results['dali'] = {} 323 | return results 324 | 325 | @average_decorator 326 | def _average_read_bw(self) -> float: 327 | """ 328 | Returns the average read bandwidth as a ``float`` for all iterations 329 | in B/s. Defaults to 0.0. 330 | """ 331 | try: 332 | return self._read_bw[self._num_systems] 333 | except KeyError: 334 | return 0.0 335 | 336 | @property 337 | def average_read_bw(self) -> float: 338 | """ 339 | Returns the average read bandwidth as a ``float`` for all iterations 340 | in GB/s, rounded to the nearest thousandth. 341 | """ 342 | return round(self._average_read_bw() * 1e-9, 3) 343 | 344 | @average_decorator 345 | def _average_write_bw(self) -> float: 346 | """ 347 | Returns the average write bandwidth as a ``float`` for all iterations 348 | in B/s. Defaults to 0.0 349 | """ 350 | try: 351 | return self._write_bw[self._num_systems] 352 | except KeyError: 353 | return 0.0 354 | 355 | @property 356 | def average_write_bw(self) -> float: 357 | """ 358 | Returns the average write bandwidth as a ``float`` for all iterations 359 | in GB/s, rounded to the nearest thousandth. 360 | """ 361 | return round(self._average_write_bw() * 1e-9, 3) 362 | 363 | @average_decorator 364 | def _average_125k_read_bw(self) -> float: 365 | """ 366 | Returns the average 125k read bandwidth as a ``float`` for all 367 | iterations in B/s. Defaults to 0.0. 368 | """ 369 | try: 370 | return self._125k_read_bw[self._num_systems] 371 | except KeyError: 372 | return 0.0 373 | 374 | @property 375 | def average_125k_read_bw(self) -> float: 376 | """ 377 | Returns the average 125k read bandwidth as a ``float`` for all 378 | iterations in GB/s, rounded to the nearest thousandth. 379 | """ 380 | return round(self._average_125k_read_bw() * 1e-9, 3) 381 | 382 | @average_decorator 383 | def _average_125k_write_bw(self) -> float: 384 | """ 385 | Returns the average 125k write bandwidth as a ``float`` for all 386 | iterations in B/s. 
Defaults to 0.0 387 | """ 388 | try: 389 | return self._125k_write_bw[self._num_systems] 390 | except KeyError: 391 | return 0.0 392 | 393 | @property 394 | def average_125k_write_bw(self) -> float: 395 | """ 396 | Returns the average 125k write bandwidth as a ``float`` for all 397 | iterations in GB/s, rounded to the nearest thousandth. 398 | """ 399 | return round(self._average_125k_write_bw() * 1e-9, 3) 400 | 401 | @average_decorator 402 | def _average_read_iops(self) -> float: 403 | """ 404 | Returns the average read IOPS as a ``float`` for all iterations in 405 | ops/second. Defaults to 0.0. 406 | """ 407 | try: 408 | return self._read_iops[self._num_systems] 409 | except KeyError: 410 | return 0.0 411 | 412 | @property 413 | def average_read_iops(self) -> float: 414 | """ 415 | Returns the average read IOPS as a ``float`` for all iterations in K 416 | ops/second. 417 | """ 418 | return round(self._average_read_iops() * 1e-3, 3) 419 | 420 | @average_decorator 421 | def _average_write_iops(self) -> float: 422 | """ 423 | Returns the average write IOPS as a ``float`` for all iterations in 424 | ops/second. Defaults to 0.0. 425 | """ 426 | try: 427 | return self._write_iops[self._num_systems] 428 | except KeyError: 429 | return 0.0 430 | 431 | @property 432 | def average_write_iops(self) -> float: 433 | """ 434 | Returns the average write IOPS as a ``float`` for all iterations in K 435 | ops/second. 436 | """ 437 | return round(self._average_write_iops() * 1e-3, 3) 438 | 439 | @property 440 | @average_decorator 441 | def max_bus_bandwidth(self) -> float: 442 | """ 443 | Returns the average of the maximum bandwidth achieved as a ``float`` 444 | in NCCL in GB/s. Defaults to 0.0 445 | """ 446 | try: 447 | return self._max_bw[self._num_systems] 448 | except KeyError: 449 | return 0.0 450 | 451 | @property 452 | def max_bus_bytes(self) -> float: 453 | """ 454 | Returns the associated byte size for the maximum bandwidth achieved in 455 | NCCL as a ``float``. Defaults to 0.0 456 | """ 457 | try: 458 | return int(max(self._bytes_sizes[self._num_systems], 459 | key=self._bytes_sizes[self._num_systems].count)) 460 | except (ValueError, KeyError): 461 | return 0.0 462 | --------------------------------------------------------------------------------