├── bobber ├── lib │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── run_tests.py │ ├── analysis │ │ ├── __init__.py │ │ ├── nccl.py │ │ ├── meta.py │ │ ├── fio.py │ │ ├── common.py │ │ ├── compare_baseline.py │ │ ├── dali.py │ │ ├── table.py │ │ ├── parse-mlperf.py │ │ ├── parse_results.py │ │ └── aggregate_results.py │ ├── system │ │ ├── __init__.py │ │ └── file_handler.py │ ├── docker │ │ ├── __init__.py │ │ ├── Dockerfile │ │ └── management.py │ ├── exit_codes.py │ └── constants.py ├── __version__.py ├── __init__.py └── test_scripts │ ├── fio_fill_single.sh │ ├── call_dali_multi.sh │ ├── mdtest_multi.sh │ ├── nccl_multi.sh │ ├── fio_multi.sh │ ├── dali_multi.sh │ └── setup_fio.sh ├── .gitignore ├── requirements.txt ├── LICENSE ├── .github └── workflows │ └── python-package.yml ├── docs ├── sample_baseline.yaml ├── building.md ├── troubleshooting.md ├── parsing.md ├── docker.md ├── non_dgx_support.md └── baselines.md ├── setup.py ├── .gitlab-ci.yml └── CONTRIBUTING.md /bobber/lib/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | -------------------------------------------------------------------------------- /bobber/lib/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | -------------------------------------------------------------------------------- /bobber/lib/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | -------------------------------------------------------------------------------- /bobber/lib/system/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | -------------------------------------------------------------------------------- /bobber/__version__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | __version__ = '6.3.1' 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__* 2 | *.pyc 3 | build/* 4 | dist/* 5 | env/* 6 | nvidia_bobber.egg-info/ 7 | -------------------------------------------------------------------------------- /bobber/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | from bobber.__version__ import __version__ 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2020.6.20 2 | chardet==3.0.4 3 | docker==4.3.1 4 | idna==2.10 5 | numpy==1.19.5 6 | pycodestyle==2.6.0 7 | PyYAML==5.4.1 8 | requests==2.26.0 9 | six==1.15.0 10 | tabulate==0.8.7 11 | urllib3==1.26.5 12 | websocket-client==0.57.0 13 | -------------------------------------------------------------------------------- /bobber/test_scripts/fio_fill_single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | 4 | cd /storage-perf-test 5 | 6 | if [ "x$THREADS" = "x" ]; then 7 | THREADS=80 8 | fi 9 | 10 | if [ "x$DIRECTIO" = "x" ]; then 11 | DIRECTIO=0 12 | fi 13 | 14 | NO_FIO_SERVER=1 DIRECTIO=$DIRECTIO FSDIR=/mnt/fs_under_test NJOBS=$THREADS 
./run_disk_fill_test.sh 15 | -------------------------------------------------------------------------------- /bobber/lib/docker/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import docker 3 | from bobber.lib.docker.management import DockerManager 4 | 5 | manager = DockerManager() 6 | 7 | # Map the instance methods to allow importing as "bobber.docker." 8 | # in other modules. 9 | build = manager.build 10 | cast = manager.cast 11 | execute = manager.execute 12 | export = manager.export 13 | load = manager.load 14 | -------------------------------------------------------------------------------- /bobber/test_scripts/call_dali_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | BATCH_SIZE=$1 4 | DATASET=$2 5 | GPUS=$3 6 | 7 | if [[ "$DATASET" == *tfrecord* ]]; then 8 | python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --tfrecord_pipeline_paths "$DATASET" 9 | else 10 | python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --file_read_pipeline_paths "$DATASET" 11 | fi 12 | -------------------------------------------------------------------------------- /bobber/lib/exit_codes.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | # This file contains a list of exit codes for debugging 3 | SUCCESS = 0 # Successful termination 4 | BASELINE_FAILURE = 10 # Performance did not meet criteria 5 | MISSING_LOG_FILES = 20 # Parsing directory with no logs 6 | DOCKER_BUILD_FAILURE = 30 # Failure building Docker image 7 | DOCKER_COMMUNICATION_ERROR = 31 # Unable to communicate with Docker 8 | CONTAINER_NOT_RUNNING = 32 # Bobber container not running 9 | NVIDIA_RUNTIME_ERROR = 33 # NVIDIA container runtime not found 10 | CONTAINER_VERSION_MISMATCH = 34 # Container different from application 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 NVIDIA CORPORATION 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bobber/test_scripts/mdtest_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | 4 | #force threads to 44 for now - unclear on why we can't use more threads with mdtest but it blows up 5 | THREADS=44 6 | FSDIR=/mnt/fs_under_test 7 | 8 | mkdir $FSDIR/mdtest 9 | 10 | if [ "x$HOSTS" = "x" ]; then 11 | HOSTS=localhost:$THREADS 12 | fi 13 | 14 | if [ "x$NCCL_IB_HCAS" = "x" ]; then 15 | NCCL_IB_HCAS=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_6,mlx5_7,mlx5_8,mlx5_9 16 | fi 17 | 18 | if [ "x$SSH_IFACE" = "x" ]; then 19 | SSH_IFACE=enp226s0 20 | fi 21 | 22 | IFS="," read -r -a HOST_ARRAY <<< "$HOSTS" 23 | 24 | HOST_COUNT=${#HOST_ARRAY[@]} 25 | 26 | #remove trailing comma when passing the argument 27 | HOST_STRING="" 28 | for i in ${HOST_ARRAY[@]}; do 29 | HOST_STRING+="$i:$THREADS," 30 | done 31 | 32 | set -x 33 | 34 | mpirun -np $(($HOST_COUNT*$THREADS)) -H ${HOST_STRING%?} -map-by ppr:$THREADS:node --allow-run-as-root --mca btl_openib_warn_default_gid_prefix 0 --mca btl_openib_if_exclude mlx5_0,mlx5_5,mlx5_6 --mca plm_base_verbose 0 --mca plm_rsh_agent ssh -x IBV_DRIVERS -mca btl_tcp_if_include $SSH_IFACE -mca plm_rsh_args "-p 2222" /io-500-dev/bin/mdtest -i 3 -I 4 -z 3 -b 8 -u -d $FSDIR/mdtest 35 | 36 | rm -rf $FSDIR/mdtest 37 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: [push, pull_request] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: [3.6, 3.7, 3.8, 3.9] 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install -r requirements.txt 26 | pip install wheel 27 | - name: Lint with pycodestyle 28 | run: | 29 | pycodestyle bobber 30 | - name: Build and install the wheel 31 | run: | 32 | python setup.py bdist_wheel sdist 33 | pip install dist/nvidia_bobber-*.whl 34 | - name: Build the latest image 35 | run: | 36 | bobber build 37 | docker images | grep nvidia/bobber 38 | -------------------------------------------------------------------------------- /docs/sample_baseline.yaml: -------------------------------------------------------------------------------- 1 | systems: 2 | 1: 3 | bandwidth: 4 | # FIO BW speed in bytes/second 5 | read: 1200000000 6 | write: 1000000000 7 | iops: 8 | # FIO IOPS speed in ops/second 9 | read: 100000 10 | write: 100000 11 | nccl: 12 | # NCCL maximum bus bandwidth in GB/s 13 | max_bus_bw: 230 14 | dali: 15 | # DALI average speed in images/second 16 | 800x600 standard jpg: 2000 17 | 3840x2160 standard jpg: 300 18 | 800x600 tfrecord: 2000 19 | 3840x2160 tfrecord: 300 20 | 2: 21 | bandwidth: 22 | # FIO BW speed in bytes/second 23 | read: 2400000000 24 | write: 2000000000 25 | iops: 26 | # FIO IOPS speed in ops/second 27 | read: 200000 28 | write: 200000 29 | nccl: 30 | # NCCL 
maximum bus bandwidth in GB/s 31 | max_bus_bw: 185 32 | dali: 33 | # DALI average speed in images/second 34 | 800x600 standard jpg: 4000 35 | 3840x2160 standard jpg: 600 36 | 800x600 tfrecord: 4000 37 | 3840x2160 tfrecord: 600 38 | -------------------------------------------------------------------------------- /bobber/test_scripts/nccl_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | 4 | if [ "x$GPUS" = "x" ]; then 5 | GPUS=8 6 | fi 7 | 8 | if [ "x$HOSTS" = "x" ]; then 9 | HOSTS=localhost:$GPUS 10 | fi 11 | 12 | if [ "x$NCCL_IB_HCAS" = "x" ]; then 13 | NCCL_IB_HCAS=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_6,mlx5_7,mlx5_8,mlx5_9 14 | fi 15 | 16 | if [ "x$SSH_IFACE" = "x" ]; then 17 | SSH_IFACE=enp226s0 18 | fi 19 | 20 | if [ "x$NCCL_MAX" = "x" ]; then 21 | NCCL_MAX=1 22 | fi 23 | 24 | if [ "x$COMPUTE_GID" = "x" ]; then 25 | COMPUTE_GID=0 26 | fi 27 | 28 | if [ "x$NCCL_TC" = "x" ]; then 29 | NCCL_TC='' 30 | fi 31 | 32 | IFS="," read -r -a HOST_ARRAY <<< "$HOSTS" 33 | 34 | HOST_COUNT=${#HOST_ARRAY[@]} 35 | 36 | #remove trailing comma when passing the argument 37 | for i in ${HOST_ARRAY[@]}; do 38 | HOST_STRING+="$i:$GPUS," 39 | done 40 | 41 | mpirun -report-uri -display-allocation -v --allow-run-as-root --np $(($GPUS*$HOST_COUNT)) -H ${HOST_STRING%?} -bind-to none -map-by slot -x IBV_DRIVERS -x LD_LIBRARY_PATH -x PATH -x NCCL_IB_HCA=$NCCL_IB_HCAS -x NCCL_IB_TC=$NCCL_TC -x NCCL_IB_GID_INDEX=$COMPUTE_GID -x NCCL_IB_CUDA_SUPPORT=1 -mca orte_base_help_aggregate 0 -mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include $SSH_IFACE -mca btl_openib_verbose 1 /nccl-tests/build/all_reduce_perf -b 8 -e ${NCCL_MAX}G -f 2 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | from bobber import __version__ 3 | from setuptools import setup 4 | 5 | with open('README.md', 'r') as f: 6 | long_description = f.read() 7 | 8 | setup( 9 | name='nvidia-bobber', 10 | version=__version__, 11 | description='Containerized testing of system components that impact AI workload performance', 12 | long_description=long_description, 13 | packages=['bobber', 14 | 'bobber/lib', 15 | 'bobber/lib/analysis', 16 | 'bobber/lib/docker', 17 | 'bobber/lib/system', 18 | 'bobber/lib/tests'], 19 | include_package_data=True, 20 | package_data={'': ['lib/docker/Dockerfile', 21 | 'test_scripts/call_dali_multi.sh', 22 | 'test_scripts/dali_multi.sh', 23 | 'test_scripts/fio_fill_single.sh', 24 | 'test_scripts/fio_multi.sh', 25 | 'test_scripts/mdtest_multi.sh', 26 | 'test_scripts/nccl_multi.sh', 27 | 'test_scripts/setup_fio.sh']}, 28 | license='MIT', 29 | python_requires='>=3.6', 30 | entry_points={ 31 | 'console_scripts': ['bobber=bobber.bobber:main'] 32 | }, 33 | install_requires=[ 34 | 'docker >= 4.3.1', 35 | 'numpy >= 1.9.5', 36 | 'pyyaml >= 5.4.0', 37 | 'tabulate >= 0.8.7', 38 | 'six>=1.15.0' 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /bobber/lib/analysis/nccl.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | from typing import Tuple 4 | 5 | 6 | def parse_nccl_file(log_files: list, systems: int) -> Tuple[list, list]: 7 | """ 8 | Find the maximum bus bandwidth and bus bytes from NCCL tests. 
9 | 10 | Parse the bandwidth at all byte sizes achieved during NCCL tests and match 11 | the maximum bus bandwidth with the corresponding byte size from the 12 | results. Only the maximum and corresponding byte size from each log are 13 | returned to later find the overall average. 14 | 15 | Parameters 16 | ---------- 17 | log_files : list 18 | A ``list`` of ``strings`` of the filenames for all NCCL log files in 19 | the results directory. 20 | systems : int 21 | An ``integer`` of the number of systems used during the current test. 22 | 23 | Returns 24 | ------- 25 | tuple 26 | Returns a ``tuple`` of (``list``, ``list``) containing the maximum bus 27 | bandwidth and the bus bytes, respectively. 28 | """ 29 | max_bus_bw_list = [] 30 | bus_bytes_list = [] 31 | 32 | for log in log_files: 33 | with open(log, 'r') as f: 34 | log_contents = f.read() 35 | out_of_place_results = re.findall('.*float sum.*', log_contents) 36 | results = [line.split() for line in out_of_place_results] 37 | bytes_array = [float(result[0]) for result in results] 38 | bus_bw_array = [float(result[6]) for result in results] 39 | max_bus_bw_list.append(max(bus_bw_array)) 40 | max_index = bus_bw_array.index(max(bus_bw_array)) 41 | bus_bytes_list.append(bytes_array[max_index]) 42 | return max_bus_bw_list, bus_bytes_list 43 | -------------------------------------------------------------------------------- /bobber/lib/system/file_handler.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import os 3 | import yaml 4 | from typing import NoReturn 5 | 6 | 7 | def create_directory(directory: str) -> NoReturn: 8 | """ 9 | Create a directory if it doesn't exist. 10 | 11 | Parameters 12 | ---------- 13 | directory : string 14 | A ``string`` of the full directory path to create if it doesn't exist. 15 | """ 16 | if not os.path.exists(directory): 17 | os.makedirs(directory) 18 | 19 | 20 | def update_log(logfile: str, contents: str) -> NoReturn: 21 | """ 22 | Append a log with new output from a test. 23 | 24 | Parameters 25 | ---------- 26 | logfile : string 27 | A ``string`` of the logfile to write data to. 28 | contents : string 29 | A ``string`` of the contents to append the log file with. 30 | """ 31 | with open(logfile, 'a') as log: 32 | log.write(contents) 33 | 34 | 35 | def write_file(filename: str, contents: str) -> NoReturn: 36 | """ 37 | Write data to a file. 38 | 39 | Parameters 40 | ---------- 41 | filename : string 42 | A ``string`` of the file to write data to. 43 | contents : string 44 | A ``string`` of the contents to write to the file. 45 | """ 46 | with open(filename, 'w') as fp: 47 | fp.write(contents) 48 | 49 | 50 | def read_yaml(filename: str) -> dict: 51 | """ 52 | Read a YAML file and return the contents. 53 | 54 | Parameters 55 | ---------- 56 | filename : string 57 | A ``string`` of the full file path to read. 58 | 59 | Returns 60 | ------- 61 | dict 62 | Returns a ``dict`` representing the entire contents of the file. 
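    Examples
    --------
    A minimal illustration; the path below is only an example and assumes the
    repository's sample baseline file is reachable from the working directory.

    >>> baseline = read_yaml('docs/sample_baseline.yaml')
    >>> baseline['systems'][1]['nccl']['max_bus_bw']
    230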
63 | """ 64 | with open(filename, 'r') as handler: 65 | return yaml.safe_load(handler) 66 | -------------------------------------------------------------------------------- /bobber/test_scripts/fio_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "x$THREADS" = "x" ]; then 4 | THREADS=80 5 | fi 6 | 7 | if [ "x$DIRECTIO" = "x" ]; then 8 | DIRECTIO=0 9 | fi 10 | 11 | if [ "x$HOSTS" = "x" ]; then 12 | HOSTS='' 13 | fi 14 | 15 | if [ "x$IO_DEPTH" = "x" ]; then 16 | IO_DEPTH=16 17 | fi 18 | 19 | if [ "x$EXTRA_FLAGS" = "x" ]; then 20 | EXTRA_FLAGS='' 21 | fi 22 | 23 | if [ "x$READ_PATTERN" = "x" ]; then 24 | READ_PATTERN="read" 25 | fi 26 | 27 | if [ "x$WRITE_PATTERN" = "x" ]; then 28 | WRITE_PATTERN="write" 29 | fi 30 | 31 | HOSTS_WITH_SPACES=`echo $HOSTS | sed "s/,/ /g"` 32 | 33 | FSDIR=/mnt/fs_under_test 34 | 35 | IODEPTH=$IO_DEPTH 36 | NJOBS=$THREADS 37 | 38 | # Process all settings 39 | source /tests/setup_fio.sh 40 | 41 | # Clean up old jobs 42 | stop_servers 43 | 44 | # Start servers 45 | start_servers 46 | 47 | RUNOPTS="--invalidate=${INVALIDATE} --blocksize=${IOSIZE}k --size=${SIZE}k --numjobs=${NJOBS} --directory=${WORKDIR} ${FSYNC}" 48 | CREATEOPTS="--invalidate=${INVALIDATE} --blocksize=${CREATE_IOSIZE}k --size=${SIZE}k --numjobs=${NJOBS} --directory=${WORKDIR} ${FSYNC}" 49 | 50 | # List of commands 51 | ## Run create only first as it has been said it improves performance 52 | ## Run create with a large blocksize, because using a smaller blocksize will take an inordinate amount of time 53 | launch_fio --create_only=1 --rw=write ${IOSETTINGS} ${STDOPTS} ${CREATEOPTS} 54 | 55 | launch_fio --rw=${WRITE_PATTERN} ${IOSETTINGS} ${STDOPTS} ${RUNOPTS} ${EXTRA_FLAGS} 56 | drop_caches 57 | 58 | launch_fio --rw=${READ_PATTERN} ${IOSETTINGS} ${STDOPTS} ${RUNOPTS} ${EXTRA_FLAGS} 59 | drop_caches 60 | 61 | # Clean up the job 62 | stop_servers 63 | 64 | echo "Cleaning workspace" 65 | rm -f $JOBFN 66 | if [ "x$NORMDATA" == "x" ]; then 67 | rm -rf $WORKDIR 68 | fi 69 | 70 | echo "Done Running FIO Test" 71 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - pre_clean 3 | - docker_build 4 | - test 5 | 6 | variables: 7 | GIT_SUBMODULE_STRATEGY: recursive 8 | 9 | pre_clean: 10 | stage: pre_clean 11 | script: 12 | - echo "Cleaning all Docker containers, images, and volumes" 13 | - if [[ $(docker ps -q) ]]; then docker kill $(docker ps -q); fi 14 | - if [[ $(docker ps -q -a) ]]; then docker rm $(docker ps -q -a); fi 15 | - docker system prune --all --force 16 | - echo "Removing old results" 17 | - rm -rf ~/build_output 18 | - echo "Removing old virtual environments" 19 | - rm -rf env/ 20 | 21 | docker_build: 22 | stage: docker_build 23 | script: 24 | - echo "Testing all containers to verify successful building" 25 | - echo "Building Python wheel" 26 | - virtualenv --python python3 env 27 | - source env/bin/activate 28 | - python setup.py bdist_wheel sdist 29 | - pip install dist/nvidia_bobber-*-none-any.whl 30 | - echo "Building latest Bobber image" 31 | - bobber build 32 | # Capture the build ID during the image build process and ensure it is listed in the system 33 | - docker images | grep `bobber build | grep "Successfully built" | awk '{print $3}'` 34 | 35 | test: 36 | stage: test 37 | script: 38 | - echo "Running a single-node test to verify functionality" 39 | - virtualenv --python 
python3 env 40 | - source env/bin/activate 41 | - python setup.py bdist_wheel sdist 42 | - pip install dist/nvidia_bobber-*-none-any.whl 43 | - bobber cast /raid 44 | - bobber run-all --ssh-iface enp2s0f0 --iterations 2 --batch-size-sm 512 --batch-size-lg 256 --gpus 4 --bw-threads 16 --125k-threads 32 --iops-threads 96 --read-pattern randread test_results localhost 45 | - bobber parse-results --compare-baseline single-dgx-station-baseline test_results/ 46 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribute to Bobber 2 | 3 | Before contributing to Bobber, we require all users to sign-off on their work. 4 | 5 | ## Sign your work 6 | 7 | The sign-off is a simple line at the end of the explanation for the patch. Your 8 | signature certifies that you wrote the patch or otherwise have the right to pass 9 | it on as an open-source patch. The rules are pretty simple: if you can certify 10 | the below (from [developercertificate.org](http://developercertificate.org/)): 11 | 12 | ``` 13 | Developer Certificate of Origin 14 | Version 1.1 15 | 16 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 17 | 1 Letterman Drive 18 | Suite D4700 19 | San Francisco, CA, 94129 20 | 21 | Everyone is permitted to copy and distribute verbatim copies of this 22 | license document, but changing it is not allowed. 23 | 24 | Developer's Certificate of Origin 1.1 25 | 26 | By making a contribution to this project, I certify that: 27 | 28 | (a) The contribution was created in whole or in part by me and I 29 | have the right to submit it under the open source license 30 | indicated in the file; or 31 | 32 | (b) The contribution is based upon previous work that, to the best 33 | of my knowledge, is covered under an appropriate open source 34 | license and I have the right under that license to submit that 35 | work with modifications, whether created in whole or in part 36 | by me, under the same open source license (unless I am 37 | permitted to submit under a different license), as indicated 38 | in the file; or 39 | 40 | (c) The contribution was provided directly to me by some other 41 | person who certified (a), (b) or (c) and I have not modified 42 | it. 43 | 44 | (d) I understand and agree that this project and the contribution 45 | are public and that a record of the contribution (including all 46 | personal information I submit with it, including my sign-off) is 47 | maintained indefinitely and may be redistributed consistent with 48 | this project or the open source license(s) involved. 49 | ``` 50 | 51 | Then you just add a line to every git commit message: 52 | 53 | Signed-off-by: Joe Smith 54 | 55 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 56 | 57 | If you set your `user.name` and `user.email` git configs, you can sign your 58 | commit automatically with `git commit -s`. 59 | -------------------------------------------------------------------------------- /docs/building.md: -------------------------------------------------------------------------------- 1 | # Building and running from source 2 | While it is recommended to run Bobber using the latest Python wheel available on 3 | the [GitHub Releases](https://github.com/NVIDIA/Bobber/releases) page, it is 4 | possible to run Bobber directly from source, either by building and installing a 5 | wheel locally or using Python to call specific Bobber modules. 
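As a quick reference, the two entry points described above are equivalent. The short sketch below assumes the repository root as the working directory and uses `--version` purely as an illustrative subcommand.

```
# From an installed wheel (building and installing a wheel is covered below):
bobber --version

# Directly from a source checkout, without installing a wheel:
python3 -m bobber.bobber --version
```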
6 | 7 | **NOTE:** If any changes are made to the application, any Bobber containers must 8 | be killed and re-launched to pickup the changes. This can be done with the 9 | following which is expected to be run after building and installing a new wheel 10 | or modifying code locally: 11 | 12 | ``` 13 | docker kill bobber # Only necessary if Bobber is already running 14 | bobber cast /path/to/storage # If a new wheel was built/installed 15 | # OR 16 | python3 -m bobber.bobber cast /path/to/storage # If modifying and running code directly 17 | ``` 18 | 19 | ## Running the Python modules 20 | To run the code directly using Python, first ensure all dependencies are 21 | installed using PIP. This can be done globally using `sudo`, or in a virtual 22 | environment with `virtualenv` or Anaconda. 23 | 24 | ``` 25 | sudo pip3 install -r requirements.txt 26 | ``` 27 | 28 | Once installed, Bobber can be called directly by calling the `bobber` package: 29 | 30 | ``` 31 | python3 -m bobber.bobber ... 32 | ``` 33 | 34 | Using `python3 -m bobber.bobber ...` is analogous to running `bobber ...` from 35 | the installed wheel with the exception of calling the code directly. The above 36 | command needs to be run from the root `bobber` directory of the repo. 37 | 38 | For example, to build the Bobber image directly from the code, run the 39 | following: 40 | 41 | ``` 42 | cd ~/bobber 43 | python3 -m bobber.bobber build 44 | ``` 45 | 46 | ## Building a Python wheel 47 | A Python wheel can be built directly from the source and installed to replace 48 | any existing Bobber wheels and run Bobber as normal without calling the code. A 49 | bash script has been created which automatically builds a development version of 50 | the Python wheel based on the local changes. Running the `./build-dev-wheel` 51 | script will update the version number to a dev version with a timestamp and 52 | build a new wheel of the current code with the updated version. By adding 53 | `minor` or `patch` to the script as an argument, the minor and patch version 54 | will be updated in addition to the timestamp, respectively. 55 | 56 | For example, to build a dev wheel without updating the minor or patch versions, 57 | run: 58 | 59 | ``` 60 | ./build-dev-wheel 61 | ``` 62 | 63 | If the current version of the package is `6.3.1`, this will generated a new 64 | wheel in the local `dist/` directory (which will be created if not already done) 65 | with the version `6.3.1.dev20210323084016` depending on the time the script was 66 | run. 67 | 68 | Likewise, running 69 | 70 | ``` 71 | ./build-dev-wheel patch 72 | ``` 73 | 74 | will generate a wheel in `dist/` with version `6.3.2.dev20210323084016` and 75 | 76 | ``` 77 | ./build-dev-wheel minor 78 | ``` 79 | 80 | will generate a wheel in `dist/` with version `6.4.0.dev20210323084016`. 81 | 82 | To generate a wheel manually without altering the version number, run 83 | 84 | ``` 85 | python3 setup.py bdist_wheel sdist 86 | ``` 87 | 88 | ### Installing the built wheel 89 | Install the generated wheel by ignoring any existing packages using PIP: 90 | 91 | ``` 92 | sudo pip3 install --ignore-installed dist/nvidia_bobber-.whl 93 | ``` 94 | 95 | Bobber can now be used normally with `bobber ...` targeting the code as written 96 | when the wheel was built. 97 | -------------------------------------------------------------------------------- /docs/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | Things don't always go as planned. 
This guide provides some steps to 3 | troubleshoot Bobber when it doesn't work as expected. 4 | 5 | ## General troubleshooting 6 | The first item to check when something goes wrong is the Docker image and 7 | containers across the cluster. On all hosts, verify the Docker image is built 8 | and matches the version of the Bobber wheel. Check the Bobber version with: 9 | 10 | ``` 11 | $ bobber --version 12 | 6.3.1 13 | ``` 14 | 15 | The version number is listed in the first line. Check the Bobber image is built 16 | and matches the version above with: 17 | 18 | ``` 19 | $ docker images | grep nvidia/bobber 20 | nvidia/bobber 6.3.1 a467a25ff008 10 minutes ago 5.23GB 21 | ``` 22 | 23 | If the above command does not contain output or the second column (`6.3.1` in 24 | the example) does not match the version of Bobber from the first step, the image 25 | needs to be built. Run `bobber build` to build the image and verify using the 26 | steps above once complete. 27 | 28 | Before any tests can be run, the container needs to be launched on all nodes. 29 | This can be verified with: 30 | 31 | ``` 32 | $ docker ps | grep bobber 33 | 1ab2b10f8eb1 nvidia/bobber:6.3.1 "/usr/local/bin/nvid..." 4 days ago Up 4 days bobber 34 | ``` 35 | 36 | If the above command does not contain output, the container needs to be launched 37 | using the `bobber cast /path/to/storage` command. 38 | 39 | ## Exit codes 40 | When the application terminates after a handled issue, various exit codes may be 41 | thrown depending on the situation. The following list provides extra context on 42 | these codes: 43 | 44 | * `0`: Exit Success - The application terminated successfully. 45 | * `10`: Baseline Failure - This is thrown while comparing results from a test run against a baseline (either one of the defaults or a custom baseline) using the `bobber parse-results --compare-baseline ...` or `bobber parse-results --custom-baseline ...` command. If at least one result doesn't exceed the baseline performance, it will be marked as a failure. Check the output of the command for a list of the results that don't exceed baseline performance and verify connectivity and configuration. 46 | * `20`: Missing Log Files - Thrown while attempting to parse results while specifying a directory that does not contain valid log files. Verify the directory being parsed contains log files with data. 47 | * `30`: Docker Build Failure - Thrown while trying to build the Bobber image with `bobber build`. Look at the output from the command to see if there are any specific issues while building. This is commonly seen when networking on host and/or Docker levels are down. 48 | * `31`: Docker Communication Error - Bobber was unable to communicate with the Docker daemon. Ensure Docker is running `systemctl start docker` and verify it is working properly with `docker images`. This command should not throw errors if Docker can communicate properly. 49 | * `32`: Container Not Running - The Bobber container needs to be running on all nodes prior to starting any tests. Use the `bobber cast` command to launch the container on all hosts. 50 | * `33`: NVIDIA Runtime Error - The Bobber container is unable to be launched with NVIDIA runtime capabilities. Ensure the latest NVIDIA drivers are installed as well as the latest nvidia-docker libraries. Verify GPUs can be accessed inside containers by running `docker run --rm -it nvcr.io/nvidia/cuda:11.2.1-runtime nvidia-smi`. 
This should display the list of GPUs installed in the system if the NVIDIA container runtime is installed properly. 51 | * `34`: Container Version Mismatch - The Bobber container and application version need to match to ensure proper functionality of the tests. To rectify the situation, first kill the running Bobber container with `docker kill bobber` then re-cast a new container with the same version as the Bobber application with the `bobber cast` command. If an image isn't already built for that version of Bobber, it will be built automatically with `bobber cast`. Note that if a new image is built, it will need to be re-copied to all hosts in the cluster for multi-node tests and subsequently killed/launched on all nodes using the above commands. 52 | -------------------------------------------------------------------------------- /bobber/test_scripts/dali_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # SPDX-License-Identifier: MIT 3 | 4 | if [ "x$GPUS" = "x" ]; then 5 | GPUS=8 6 | fi 7 | 8 | if [ "x$BATCH_SIZE_SM" = "x" ]; then 9 | BATCH_SIZE_SM=150 10 | fi 11 | 12 | if [ "x$BATCH_SIZE_LG" = "x" ]; then 13 | BATCH_SIZE_LG=150 14 | fi 15 | 16 | GPUS_ZERO_BASE=$(($GPUS-1)) 17 | 18 | if [ "x$HOSTS" = "x" ]; then 19 | HOSTS=localhost:1 20 | fi 21 | 22 | if [ "x$SSH_IFACE" = "x" ]; then 23 | SSH_IFACE=enp226s0 24 | fi 25 | 26 | IFS=',' read -r -a HOST_ARRAY <<< "$HOSTS" 27 | 28 | HOST_COUNT=${#HOST_ARRAY[@]} 29 | 30 | #remove trailing comma when passing the argument 31 | for i in ${HOST_ARRAY[@]}; do 32 | HOST_STRING+="$i:$GPUS," 33 | done 34 | 35 | mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images 36 | mkdir -p /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images 37 | mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline 38 | mkdir -p /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline 39 | mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline.idx 40 | mkdir -p /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline.idx 41 | 42 | imagine create-images --width 3840 --height 2160 --count $(($GPUS*1000)) --size /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images 4k_image_ jpg 43 | imagine create-images --width 800 --height 600 --count $(($GPUS*1000)) --size /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images small_image_ jpg 44 | 45 | imagine create-tfrecord --img-per-file 1000 /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline tfrecord- 46 | imagine create-tfrecord --img-per-file 1000 /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline tfrecord- 47 | 48 | for i in $(seq 0 $GPUS_ZERO_BASE); do /dali/tools/tfrecord2idx /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-$i /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline.idx/tfrecord-$i; done 49 | for i in $(seq 0 $GPUS_ZERO_BASE); do /dali/tools/tfrecord2idx /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-$i /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline.idx/tfrecord-$i; done 50 | 51 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE /tests/call_dali_multi.sh 
$BATCH_SIZE_SM /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images $GPUS 52 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE sysctl vm.drop_caches=3 53 | 54 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE /tests/call_dali_multi.sh $BATCH_SIZE_LG /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images $GPUS 55 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE sysctl vm.drop_caches=3 56 | 57 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE /tests/call_dali_multi.sh $BATCH_SIZE_SM "/mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-*" $GPUS 58 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE sysctl vm.drop_caches=3 59 | 60 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE /tests/call_dali_multi.sh $BATCH_SIZE_LG "/mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-*" $GPUS 61 | mpirun --allow-run-as-root -H ${HOST_STRING%?} -bind-to none -map-by ppr:1:node --mca plm_rsh_agent ssh -mca plm_rsh_args "-p 2222" -mca btl_tcp_if_include $SSH_IFACE sysctl vm.drop_caches=3 62 | 63 | rm -r /mnt/fs_under_test/imageinary_data 64 | -------------------------------------------------------------------------------- /bobber/lib/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | # Larger base stage with required items for building various tools 3 | FROM nvcr.io/nvidia/cuda:11.2.0-devel-ubuntu20.04 as build 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # Install all required build dependencies 8 | RUN apt-get update && apt-get -y install apt-utils && rm -rf /var/lib/apt/lists/* 9 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 10 | swig \ 11 | bison \ 12 | gcc \ 13 | libgfortran4 \ 14 | pkg-config \ 15 | autotools-dev \ 16 | debhelper \ 17 | automake \ 18 | m4 \ 19 | gfortran \ 20 | tk \ 21 | flex \ 22 | libltdl-dev \ 23 | autoconf \ 24 | dpatch \ 25 | graphviz \ 26 | tcl \ 27 | chrpath \ 28 | libglib2.0-0 \ 29 | python-libxml2 \ 30 | build-essential \ 31 | cmake \ 32 | git \ 33 | curl \ 34 | wget \ 35 | ca-certificates \ 36 | iputils-ping \ 37 | net-tools \ 38 | ethtool \ 39 | perl \ 40 | lsb-release \ 41 | iproute2 \ 42 | pciutils \ 43 | kmod \ 44 | libnuma1 \ 45 | lsof \ 46 | libopenmpi-dev && \ 47 | rm -rf /var/lib/apt/lists/* 48 | 49 | # Compile NVIDIA's NCCL tests 50 | RUN git clone https://github.com/NVIDIA/nccl-tests && \ 51 | cd nccl-tests/ && \ 52 | git reset --hard ec1b5e22e618d342698fda659efdd5918da6bd9f && \ 53 | make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi 54 | 55 | # Compile OSU microbenchmarks 56 | RUN wget --no-check-certificate https://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.6.2.tar.gz && \ 57 | tar zxf 
osu-micro-benchmarks-5.6.2.tar.gz && \ 58 | cd osu-micro-benchmarks-5.6.2 && \ 59 | ./configure CC=/usr/bin/mpicc CXX=/usr/bin/mpicxx --enable-cuda --with-cuda-include=/usr/local/cuda/include --with-cuda-libpath=/usr/local/cuda/lib64 && \ 60 | make && \ 61 | make install && \ 62 | rm -rf ../*.tar.gz 63 | 64 | # Build IO500, IOR, and mdtest 65 | RUN git clone https://github.com/jyvet/io-500-dev && \ 66 | cd io-500-dev && \ 67 | git reset --hard 0232acfa8e64f7c543db8930dd279009ec9c32bc && \ 68 | utilities/prepare.sh 69 | 70 | # Lighter runtime stage copying only necessary build artifacts from earlier 71 | FROM nvcr.io/nvidia/cuda:11.2.0-runtime-ubuntu20.04 72 | 73 | ENV DEBIAN_FRONTEND=noninteractive 74 | 75 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 76 | openssh-client \ 77 | openssh-server \ 78 | git \ 79 | fio \ 80 | psmisc \ 81 | libopenmpi-dev \ 82 | openmpi-bin \ 83 | python \ 84 | python3-dev \ 85 | python3-pip \ 86 | python3-distutils && \ 87 | rm -rf /var/lib/apt/lists/* 88 | 89 | # Set default NCCL parameters 90 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 91 | 92 | # Install OpenSSH for MPI to communicate between containers 93 | RUN mkdir -p /var/run/sshd && \ 94 | mkdir -p /root/.ssh && \ 95 | echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config && \ 96 | echo "UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ 97 | sed -i 's/^#*Port 22/Port 2222/' /etc/ssh/sshd_config && \ 98 | echo "HOST *" >> /root/.ssh/config && \ 99 | echo "PORT 2222" >> /root/.ssh/config && \ 100 | ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" && \ 101 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 102 | chmod 700 /root/.ssh && \ 103 | chmod 600 /root/.ssh/* 104 | 105 | WORKDIR / 106 | 107 | # Copy the compiled nccl-tests binaries to the runtime image 108 | COPY --from=build /nccl-tests/build /nccl-tests/build 109 | 110 | # Copy the compiled OSU microbenchmarks to the runtime image 111 | COPY --from=build /usr/local/libexec/osu-micro-benchmarks/mpi/collective/ /usr/local/libexec/osu-micro-benchmarks/mpi/collective/ 112 | 113 | # Copy the compiled IO500 binaries to the runtime image 114 | COPY --from=build /io-500-dev/bin /io-500-dev/bin 115 | 116 | RUN git clone https://github.com/NVIDIA/DALI dali && \ 117 | cd dali/ && \ 118 | git reset --hard fd30786d773d08185d78988b2903dce2ace0a00b 119 | 120 | RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools && \ 121 | python3 -m pip install --no-cache-dir nvidia-pyindex && \ 122 | python3 -m pip install --no-cache-dir \ 123 | nvidia-imageinary['tfrecord']>=1.1.2 \ 124 | nvidia-dali-cuda110 125 | 126 | COPY test_scripts /tests/ 127 | 128 | EXPOSE 2222 129 | -------------------------------------------------------------------------------- /docs/parsing.md: -------------------------------------------------------------------------------- 1 | # Parsing 2 | Bobber includes a couple different parsers which can be used to easily verify 3 | performance results from a test. By pointing Bobber to the directory where 4 | results were saved, aggregate values per system-count level will be displayed. 5 | 6 | The output displays a table with the aggregate results among all iterations per 7 | number of nodes tested. For example, if 10 iterations were run, the 1-node 8 | results will be the average values for all test runs for a single node. As the 9 | node count goes up, the results reflect the aggregate value for all nodes that 10 | were tested. 
The table is automatically generated based on the values above to 11 | make it possible to view how results scale with additional node counts. 12 | 13 | ``` 14 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 15 | | Test | 1 Node(s) | 2 Node(s) | 3 Node(s) | 4 Node(s) | 5 Node(s) | 6 Node(s) | 7 Node(s) | 8 Node(s) | Scale | 16 | +====================================================+=============+=============+=============+=============+=============+=============+=============+=============+=========+ 17 | | FIO Read (GB/s) - 1MB BS | 7.996 | 18.208 | 22.707 | 23.43 | 34.916 | 37.28 | 44.941 | 45.316 | 1.67X | 18 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 19 | | FIO Write (GB/s) - 1MB BS | 4.439 | 5.291 | 5.46 | 5.6 | 7.116 | 7.444 | 7.588 | 7.486 | 1.11X | 20 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 21 | | FIO Read (k IOPS) - 4K BS | 306.9 | 515.2 | 546.9 | 566.1 | 625 | 638.6 | 790.8 | 978.8 | 1.25X | 22 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 23 | | FIO Write (k IOPS) - 4K BS | 295.5 | 437.9 | 445.5 | 427 | 474.4 | 474.7 | 502.4 | 484.2 | 1.07X | 24 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 25 | | NCCL Max BW (GB/s) | 235.253 | 141.883 | 140.335 | 140.731 | 140.083 | 140.966 | 139.593 | 140.715 | N/A | 26 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 27 | | DALI Standard 800x600 throughput (images/second) | 5821.49 | 11849.7 | 17719.4 | 23654.6 | 29508.7 | 35501.1 | 41282.8 | 47250.2 | 2.02X | 28 | +----------------------------------------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+---------+ 29 | ``` 30 | 31 | ### Parser Assumptions 32 | The auto-parser makes the following assumptions: 33 | * The Bobber version must match for all files in a directory so results aren't 34 | getting mixed. This can be overriden with `--override-version-check` while 35 | calling the script. 36 | * If a result file is invalid or missing data, it is skipped and not included 37 | with the results. The average results will reflect the limited number of valid 38 | results. 39 | * The lowest N-results in DALI tests are dropped for N-nodes. These results 40 | are part of a known warm-up period for DALI and do not indicate actual 41 | performance. 42 | * The scale 43 | 44 | ## Parsing MLPerf 45 | This repository includes a Python package that can quickly and easily parse 46 | MLPerf results. Note that MLPerf is **not** included in Bobber though results 47 | from the ResNet50 image classification benchmarks can be parsed here. 
48 | 49 | ```bash 50 | $ python3 bobber/lib/analysis/parse-mlperf.py path_to_results/ 51 | MLPerf Results: 52 | Directory name: path_to_results/ 53 | Number of iterations: 5 54 | Nodes tested: 8 55 | Epoch 0: 56 | Speed: 80113.842 images/second 57 | Average time: 15.901 seconds 58 | Overall: 59 | Speed: 148134.457 images/second 60 | Average time: 7.116 minutes 61 | ``` 62 | 63 | The output displays the aggregate results among all MLPerf test passes and finds 64 | the average speed and times for all runs. Results for both Epoch 0 and overall 65 | numbers are displayed to provide different insights. Epoch 0 is helpful to best 66 | identify the storage performance as images are likely not to be cached in the 67 | system. 68 | 69 | ### Parser Assumptions 70 | The parser makes the following assumptions: 71 | * The parser assumes the directory only contains results for a single test 72 | sweep for a set number of nodes (ie. all results are from a 10-iteration test 73 | for 4 nodes and no results from different node counts are included). 74 | * The elapsed time is found by taking the difference between the start time 75 | for the first epoch (Epoch 0) and the stop time for the last epoch. 76 | * All results are averaged together based on the number of results in the 77 | directory. 78 | -------------------------------------------------------------------------------- /bobber/test_scripts/setup_fio.sh: -------------------------------------------------------------------------------- 1 | drop_caches () { 2 | 3 | DC=0 4 | case $FSTYPE in 5 | gpfs) 6 | echo "Cannot drop cache on GPFS" 7 | ;; 8 | lustre|nfs|ext4|wekafs) 9 | DC=1 10 | ;; 11 | *) 12 | echo "Unable to determine how to drop cache on FSTYPE: $FSTYPE, dropping anyway" 13 | DC=1 14 | ;; 15 | esac 16 | if [ $DC -eq 1 ]; then 17 | echo "Starting Drop Caches: $(date)" 18 | declare -a pidlist 19 | unset pidlist 20 | for N in ${FIO_NODELIST}; do 21 | ssh $N $SSHOPTS /sbin/sysctl vm.drop_caches=3 & 22 | p=$! 23 | pidlist=(${pidlist[@]} $p) 24 | done 25 | wait ${pidlist[@]} 26 | echo "Ending Drop Caches: $(date)" 27 | fi 28 | } 29 | 30 | stop_servers () { 31 | 32 | declare -a pidlist 33 | pidlist="" 34 | for N in $FIO_NODELIST; do 35 | echo "Killing Server on $N" 36 | 37 | if [ "$N" == "localhost" ]; then 38 | killall fio 39 | else 40 | ssh ${SSHOPTS} $N killall fio > /dev/null 2>&1 & 41 | fi 42 | p=$! 43 | pidlist=(${pidlist[@]} $p) 44 | done 45 | wait ${pidlist[@]} 46 | } 47 | 48 | start_servers () { 49 | 50 | if [ x"$NO_FIO_SERVER" != x"1" ]; then 51 | declare -a pidlist 52 | pidlist="" 53 | for N in $FIO_NODELIST; do 54 | echo "Launching Server on $N" 55 | 56 | if [ "$N" == "localhost" ]; then 57 | $FIOBIN --server --daemonize=/tmp/pidfile.$$ > /dev/null 2>&1 & 58 | else 59 | ssh ${SSHOPTS} $N $FIOBIN --server --daemonize=/tmp/pidfile.$$ > /dev/null 2>&1 & 60 | fi 61 | p=$! 
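        # $! holds the PID of the fio server launch that was just backgrounded
        # (either locally or over ssh); each PID is collected so that all of the
        # launches can be awaited together with `wait` once the loop finishes.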
62 | pidlist=(${pidlist[@]} $p) 63 | done 64 | wait ${pidlist[@]} 65 | else 66 | echo "Not Starting FIO Server" 67 | fi 68 | } 69 | 70 | create_jobfile () { 71 | 72 | # Write job to stdout 73 | echo "[${NAME}]" 74 | for O in $@; do 75 | if [ "$O" != "--create_jobfile" ]; then 76 | echo $O | sed 's/^\-\-//g' 77 | fi 78 | done 79 | } 80 | 81 | launch_fio () { 82 | 83 | echo "Command: " 84 | echo $FIOBIN $@ 85 | 86 | # Create Job File 87 | JOBFN=.jobfn.$$ 88 | create_jobfile $@ > $JOBFN 89 | cat $JOBFN 90 | 91 | if [ x"$NO_FIO_SERVER" != x"1" ]; then 92 | 93 | # Run Jobfile 94 | MFILE=/tmp/mfile.$$ 95 | rm -f $MFILE 96 | echo $FIO_NODELIST | tr ' ' '\n' > $MFILE 97 | 98 | $FIOBIN --client=$MFILE $JOBFN 99 | 100 | # Cleanup job file 101 | rm -rf $JOBFN 102 | rm -f $MFILE 103 | else 104 | taskset -c 0-23,48-71 $FIOBIN $JOBFN 105 | fi 106 | } 107 | 108 | #Filesystem type 109 | export FSTYPE=$(df -T $FSDIR | tail -1 | awk '{print $2}') 110 | # Set Size of file per thread 111 | export SIZE=${SIZE:-$(( 4096 * 1024 ))} 112 | # Set Size of each IO in KB 113 | export IOSIZE=${IOSIZE:-1024} 114 | # Set size of the IOs for file creation in KB 115 | export CREATE_IOSIZE=${CREATE_IOSIZE:-1024} 116 | # Number of Files per job 117 | export NRFILES=${NRFILES:-256} 118 | # Use DirectIO? 119 | export DIRECTIO=${DIRECTIO:-0} 120 | # Use MMAP IO? 121 | export MMAPIO=${MMAPIO:-0} 122 | # Set IODepth for DirectIO cases 123 | export IODEPTH=${IODEPTH:-16} 124 | # Set the invalidate flag or not, default is yes 125 | export INVALIDATE=${INVALIDATE:-1} 126 | # Set SSH options 127 | export SSHOPTS=${SSHOPTS:-"-o StrictHostKeyChecking=no"} 128 | # Set extra flags, if present 129 | export EXTRA_FLAGS=${EXTRA_FLAGS:-""} 130 | # Set JobName 131 | export NAME=${NAME:-iotest} 132 | # Set DirectIO settings if needed, allow for IOENGINE flexibility 133 | export IOENGINE=${IOENGINE:-posixaio} 134 | IOSETTINGS="" 135 | 136 | if [ $DIRECTIO -eq 1 ] && [ $MMAPIO -eq 1 ]; then 137 | echo "ERROR, unable to use both Direct IO and MMAP I/O simultaenously. Exiting" 138 | exit 1 139 | fi 140 | 141 | if [ $DIRECTIO -eq 1 ]; then 142 | IOSETTINGS="--direct=${DIRECTIO} --ioengine=${IOENGINE} --iodepth=${IODEPTH}" 143 | fi 144 | 145 | if [ $MMAPIO -eq 1 ]; then 146 | IOSETTINGS="--ioengine=mmap" 147 | fi 148 | 149 | # Set FSYNC if needed 150 | if [ x"$FSYNC" != x"" ]; then 151 | FSYNC="--fsync=${FSYNC}" 152 | fi 153 | 154 | #### Settings to run 155 | FIOBIN=${FIOBIN:-fio} 156 | 157 | if [ x"$(which $FIOBIN)" == x"" ]; then 158 | echo "ERROR: Enable to find fio binary at <$FIOBIN>. Set with FIOBIN. 
Exiting" 159 | exit 160 | fi 161 | 162 | FIODIR=$(cd $(dirname $(which $FIOBIN)) && pwd) 163 | FIOBIN=$FIODIR/$(basename $FIOBIN) 164 | 165 | DATETAG=$(date +%Y%m%d%H%M%S) 166 | 167 | export STDOPTS="--create_serialize=0 --fallocate=none --group_reporting=1 --disable_lat=1 --disable_clat=1 --disable_slat=1 --startdelay=5 --ramp_time=3 --runtime=180 --time_based=1" 168 | 169 | echo "IOTEST Settings:" 170 | for E in FSDIR FSTYPE NJOBS SIZE IOSIZE NRFILES DIRECTIO MMAPIO IOSETTINGS INVALIDATE FSYNC STDOPTS FIOBIN DATETAG SSHOPTS RUNTIME EXTRA_FLAGS; do 171 | eval V=\$$E 172 | echo $E | awk '{printf("%-12s: ", $1);}' 173 | echo $V 174 | done 175 | echo "" 176 | 177 | ########## Create 178 | WORKDIR=$FSDIR/fiodir.$DATETAG 179 | echo "Creating output directory $WORKDIR" 180 | mkdir $WORKDIR 181 | 182 | ########## Use nodelist from Bobber 183 | FIO_NODELIST=$HOSTS_WITH_SPACES 184 | echo "NCOUNT : $(echo $FIO_NODELIST | wc -w)" 185 | 186 | if [ x"$(which numactl)" != x"" ]; then 187 | numactl --show 188 | fi 189 | 190 | echo "FIO_NODELIST: $FIO_NODELIST" 191 | -------------------------------------------------------------------------------- /bobber/lib/analysis/meta.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | 4 | 5 | def avg(stats: list) -> float: 6 | """ 7 | Find the average of a list. 8 | 9 | Given a list of numbers, calculate the average of all values in the list. 10 | If the list is empty, default to 0.0. 11 | 12 | Parameters 13 | ---------- 14 | input_list : list 15 | A ``list`` of ``floats`` to find an average of. 16 | 17 | Returns 18 | ------- 19 | float 20 | Returns a ``float`` of the average value of the list. 21 | """ 22 | if len(stats) > 0: 23 | return sum(stats) / len(stats) 24 | else: 25 | return 0.0 26 | 27 | 28 | def pull_stats(summary: list) -> dict: 29 | """ 30 | Convert stats to a dictionary. 31 | 32 | Each line in the summary table in the log file needs to be parsed by first 33 | converting the table to a comma-separated list for easy parsing, then 34 | taking the first column as the statistical category and placing the 35 | remaining values into maximum, minimum, mean, and standard deviation. 36 | 37 | Parameters 38 | ---------- 39 | summary : list 40 | A ``list`` of ``strings`` representing each line in the summary table 41 | of the metadata file. 42 | 43 | Returns 44 | ------- 45 | dict 46 | Returns a ``dictionary`` of the converted table. 47 | """ 48 | results = {} 49 | 50 | for stat in summary: 51 | # Convert the table to a comma-separated list to make it easier to 52 | # parse. 53 | stat = stat.replace(':', '') 54 | stat_csv = re.sub(' +', ',', stat.strip()) 55 | components = stat_csv.split(',') 56 | key, max_val, min_val, mean, stdev = components 57 | results[key] = { 58 | 'max': float(max_val), 59 | 'min': float(min_val), 60 | 'mean': float(mean), 61 | 'stdev': float(stdev) 62 | } 63 | return results 64 | 65 | 66 | def parse_summary(log_contents: str) -> list: 67 | """ 68 | Pull the summary table from the metadata log. 69 | 70 | The bottom of the metadata log contains a summary table with all of the 71 | individual metadata operations and the results from the test. This table is 72 | denoted by a line of '-' signs and is ended with '-- finished'. Since these 73 | lines are used to make parsing easier, they should be dropped in the end. 74 | 75 | Parameters 76 | ---------- 77 | log_contents : str 78 | A ``string`` of the contents of the entire contents of a metadata log 79 | file. 
80 | 81 | Returns 82 | ------- 83 | list 84 | Returns a ``list`` of ``strings`` representing each line in the summary 85 | table. 86 | """ 87 | summary = re.findall('--------- .*-- finished', 88 | log_contents, re.DOTALL) 89 | if len(summary) == 0: 90 | return None 91 | # `summary` is a single-element list where the element is a list of all of 92 | # the metadata stats. The first and last lines are unecessary as they are 93 | # only used to parse the table and can be dropped. 94 | summary = summary[0].split('\n')[1:-1] 95 | return summary 96 | 97 | 98 | def aggregate_results(combined_results: list) -> dict: 99 | """ 100 | Find the aggregate results for all categories. 101 | 102 | Parse every result from the metadata log files and capture the min, max, 103 | and mean for each operation for all iterations in a single object. 104 | 105 | Parameters 106 | ---------- 107 | combined_results : list 108 | A ``list`` of ``dictionaries`` containing the results from each summary 109 | table in each log file. 110 | 111 | Returns 112 | ------- 113 | dict 114 | Returns a ``dictionary`` of the final aggregate results for each 115 | operation in the summary tables of all logs. 116 | """ 117 | final_aggregate = {} 118 | 119 | if len(combined_results) == 0: 120 | return final_aggregate 121 | 122 | for key, stats in combined_results[0].items(): 123 | key_metrics = [stat[key] for stat in combined_results] 124 | final_aggregate[key] = { 125 | 'max': max([result['max'] for result in key_metrics]), 126 | 'min': min([result['min'] for result in key_metrics]), 127 | 'mean': avg([result['mean'] for result in key_metrics]) 128 | } 129 | return final_aggregate 130 | 131 | 132 | def parse_meta_file(log_files: list, systems: int, results: dict) -> dict: 133 | """ 134 | Parse the metadata results from the metadata logs. 135 | 136 | Search through each metadata log and extract the operations in the summary 137 | table, saving the aggregate results in a dictionary. 138 | 139 | Parameters 140 | ---------- 141 | log_files : list 142 | A ``list`` of ``strings`` of the filename of each metadata log file in 143 | the results directory. 144 | systems : int 145 | An ``integer`` of the number of systems used during the current test. 146 | results : dict 147 | A ``dictionary`` of the aggregate metadata results for each system 148 | count. 149 | 150 | Returns 151 | ------- 152 | dict 153 | Returns an updated ``dictionary`` including the aggregate metadata 154 | results for N-systems. 155 | """ 156 | combined_results = [] 157 | 158 | for log in log_files: 159 | with open(log, 'r') as f: 160 | log_contents = f.read() 161 | summary = parse_summary(log_contents) 162 | if not summary: 163 | print(f'Warning: Invalid results found in {log} log file.') 164 | print('Skipping...') 165 | continue 166 | stats = pull_stats(summary) 167 | combined_results.append(stats) 168 | results[systems] = aggregate_results(combined_results) 169 | return results 170 | -------------------------------------------------------------------------------- /docs/docker.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | This document demonstrates how to verify Docker installations and proper GPU 3 | functionality for Docker containers. 4 | 5 | ## Docker installation/upgrade 6 | This project requires Docker version 19.03 or newer to be installed. 
Check the 7 | version of Docker installed on the system with 8 | 9 | ```bash 10 | docker --version 11 | ``` 12 | 13 | If the version is 19.03 or newer, you may continue to the next sub-section. 14 | 15 | If your Docker version is older than 19.03, or Docker is not installed, follow 16 | the steps listed on Docker's website for 17 | [upgrading the Docker client](https://docs.docker.com/engine/install/ubuntu/), 18 | which are copied below for reference: 19 | 20 | First, remove any existing installations: 21 | 22 | ```bash 23 | sudo apt-get remove docker docker-engine docker.io containerd runc 24 | ``` 25 | 26 | Next, install required dependencies and add the Docker GPG key: 27 | 28 | ```bash 29 | sudo apt-get update 30 | 31 | sudo apt-get install \ 32 | apt-transport-https \ 33 | ca-certificates \ 34 | curl \ 35 | gnupg-agent \ 36 | software-properties-common 37 | 38 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 39 | ``` 40 | 41 | Lastly, add the stable repository and install Docker: 42 | 43 | ```bash 44 | sudo add-apt-repository \ 45 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ 46 | $(lsb_release -cs) \ 47 | stable" 48 | 49 | sudo apt-get update 50 | 51 | sudo apt-get install docker-ce docker-ce-cli containerd.io 52 | ``` 53 | 54 | ## Docker permissions 55 | By default, only the `root` user is able to use Docker. To enable other users to 56 | use Docker without `sudo`, the user must be added to the `docker` group: 57 | 58 | ```bash 59 | sudo usermod -aG docker $USER 60 | newgrp docker 61 | ``` 62 | 63 | Verify your user is now able to interact directly with Docker without `sudo`: 64 | 65 | ```bash 66 | $ docker images 67 | REPOSITORY TAG IMAGE ID CREATED SIZE 68 | ``` 69 | 70 | ## Install NVIDIA-Docker 71 | In order to access GPUs inside Docker containers, the `nvidia-docker` package 72 | needs to be installed on all systems. The following installs the package 73 | (taken from [the nvidia-docker docs](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker)): 74 | 75 | ```bash 76 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) && \ 77 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - && \ 78 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 79 | sudo apt update 80 | sudo apt install nvidia-docker2 81 | sudo systemctl restart docker 82 | ``` 83 | 84 | ## Testing containers 85 | Once Docker is fully installed, ensure GPUs are accessible from containers by 86 | pulling a CUDA container and running `nvidia-smi`: 87 | 88 | ```bash 89 | docker run --rm -it --gpus all nvidia/cuda:11.0-base nvidia-smi 90 | ``` 91 | 92 | This should output information on the GPUs installed on a system, similar to 93 | below: 94 | 95 | ``` 96 | +-----------------------------------------------------------------------------+ 97 | | NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 | 98 | |-------------------------------+----------------------+----------------------+ 99 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 100 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 101 | | | | MIG M. 
| 102 | |===============================+======================+======================| 103 | | 0 A100-SXM4-40GB On | 00000000:07:00.0 Off | 0 | 104 | | N/A 31C P0 62W / 400W | 0MiB / 40537MiB | 0% Default | 105 | | | | Disabled | 106 | +-------------------------------+----------------------+----------------------+ 107 | | 1 A100-SXM4-40GB On | 00000000:0F:00.0 Off | 0 | 108 | | N/A 29C P0 60W / 400W | 0MiB / 40537MiB | 0% Default | 109 | | | | Disabled | 110 | +-------------------------------+----------------------+----------------------+ 111 | | 2 A100-SXM4-40GB On | 00000000:47:00.0 Off | 0 | 112 | | N/A 30C P0 63W / 400W | 0MiB / 40537MiB | 0% Default | 113 | | | | Disabled | 114 | +-------------------------------+----------------------+----------------------+ 115 | | 3 A100-SXM4-40GB On | 00000000:4E:00.0 Off | 0 | 116 | | N/A 30C P0 60W / 400W | 0MiB / 40537MiB | 0% Default | 117 | | | | Disabled | 118 | +-------------------------------+----------------------+----------------------+ 119 | | 4 A100-SXM4-40GB On | 00000000:87:00.0 Off | 0 | 120 | | N/A 34C P0 64W / 400W | 0MiB / 40537MiB | 0% Default | 121 | | | | Disabled | 122 | +-------------------------------+----------------------+----------------------+ 123 | | 5 A100-SXM4-40GB On | 00000000:90:00.0 Off | 0 | 124 | | N/A 33C P0 66W / 400W | 0MiB / 40537MiB | 0% Default | 125 | | | | Disabled | 126 | +-------------------------------+----------------------+----------------------+ 127 | | 6 A100-SXM4-40GB On | 00000000:B7:00.0 Off | 0 | 128 | | N/A 34C P0 61W / 400W | 0MiB / 40537MiB | 0% Default | 129 | | | | Disabled | 130 | +-------------------------------+----------------------+----------------------+ 131 | | 7 A100-SXM4-40GB On | 00000000:BD:00.0 Off | 0 | 132 | | N/A 33C P0 58W / 400W | 0MiB / 40537MiB | 0% Default | 133 | | | | Disabled | 134 | +-------------------------------+----------------------+----------------------+ 135 | 136 | +-----------------------------------------------------------------------------+ 137 | | Processes: | 138 | | GPU GI CI PID Type Process name GPU Memory | 139 | | ID ID Usage | 140 | |=============================================================================| 141 | | No running processes found | 142 | +-----------------------------------------------------------------------------+ 143 | ``` -------------------------------------------------------------------------------- /docs/non_dgx_support.md: -------------------------------------------------------------------------------- 1 | # Non-DGX Support 2 | While Bobber supports both the NVIDIA DGX A100 and the DGX-2 platforms out of 3 | the box, it can also be run on most non-DGX Linux-based platforms with at least 4 | one NVIDIA Turing-based architecture or newer GPU installed. Depending on the 5 | systems being tested, a few parameters will need to be updated for Bobber to run 6 | as intended. This guide provides information on how to find the parameters to 7 | use: 8 | 9 | ## GPU Count 10 | By default, Bobber expects 8 GPUs in a system, similar to the DGX A100. If a 11 | system has a different number of GPUs installed than the default value, it will 12 | need to be specified by passing the `--gpus N` flag to any of the test Bobber 13 | test commands. To find the number of NVIDIA GPUs available, use `nvidia-smi` to 14 | list system-level GPU information. 
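If only the total count is needed (for example, to decide what to pass to
`--gpus`), the same query can be piped through `wc -l`. This is a generic shell
one-liner rather than a Bobber command:

```bash
nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l
```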
The following will list the GPUs installed on 15 | a system: 16 | 17 | ```bash 18 | $ nvidia-smi --query-gpu=gpu_name --format=csv,noheader 19 | Quadro RTX 8000 20 | Quadro RTX 8000 21 | ``` 22 | 23 | In the example above, the system has two RTX 8000 GPUs available. To run a test 24 | with this system, the `--gpus 2` flag will need to be passed, similar to the 25 | following: 26 | 27 | ```bash 28 | $ bobber run-all --gpus 2 /home/user/logs test-machine-1 29 | ``` 30 | 31 | At present, Bobber assumes all test systems in a cluster have the **same** 32 | number of GPUs available. To run a test pass with multiple nodes that all have 33 | two GPUs, run the following: 34 | 35 | ```bash 36 | $ bobber run-all --gpus 2 /home/user/logs test-machine-1,test-machine-2,... 37 | ``` 38 | 39 | ## SSH Interface 40 | While not important for single-node tests, the `--ssh-iface` flag is used to 41 | tell Bobber which network interface to use to communicate with other test nodes 42 | for multi-node tests. This can be found by using the `ip link show` command to 43 | list the active network interfaces on a system: 44 | 45 | ```bash 46 | $ ip link show | grep "state UP" 47 | 2: enp67s0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 48 | 4: wlo2: mtu 1500 qdisc noqueue state UP mode DORMANT group default qlen 1000 49 | ``` 50 | 51 | The example above shows two interfaces are "UP" - `enp67s0` which is a wired 52 | connection and `wlo2` which is a wireless connection. For this system, the wired 53 | connection is desired as it should provide better stability and performance. In 54 | general, the chosen interface should also be the primary management interface 55 | used by the operating system. 56 | 57 | To use `enp67s0` as the interface per the example above, tests can be started 58 | with: 59 | 60 | ```bash 61 | bobber run-all --ssh-iface enp67s0 /home/user/logs test-machine-1,test-machine-2,... 62 | ``` 63 | 64 | ## DALI Batch Sizes 65 | The DALI preprocesses large and small images which is typical of ResNet50 66 | workflows. Depending on the amount of GPU memory available, the DALI tests could 67 | run out of memory and terminate prematurely. Lowering the batch size for DALI 68 | allows the GPUs to allocate less memory to the test, enabling the process to 69 | complete as intended. It is recommended to attempt the tests once with the 70 | default batch sizes to verify successful completion. If the GPUs ran out of 71 | memory, a line similar to the following will be shown in the test log (note 72 | that the line number (`11`) and process number (`5349`) may differ): 73 | 74 | ``` 75 | /tests/call_dali_multi.sh: line 11: 5349 Killed 76 | ``` 77 | 78 | This will also be accompanied by the following mpirun error: 79 | 80 | ``` 81 | -------------------------------------------------------------------------- 82 | Primary job terminated normally, but 1 process returned 83 | a non-zero exit code. Per user-direction, the job has been aborted. 84 | -------------------------------------------------------------------------- 85 | -------------------------------------------------------------------------- 86 | mpirun detected that one or more processes exited with non-zero status, thus causing 87 | the job to be terminated. 
The first process to do so was: 88 | Process name: [[37996,1],0] 89 | Exit code: 137 90 | -------------------------------------------------------------------------- 91 | ``` 92 | 93 | If this error is shown, drop the batch size by half and restart the test to see 94 | if it completes successfully. Continue this process until the test is able to 95 | run. While it is possible both the small and large image sizes are causing the 96 | GPUs to run out of memory, it is much more likely that the large image batch 97 | size needs to be dropped. The default batch size for large images is `256` and 98 | for small images it is `512`. Specify the batch sizes with the following: 99 | 100 | ```bash 101 | $ bobber run-dali --batch-size-lg 128 --batch-size-sm 256 /home/user/logs test-machine-1,test-machine-2,... 102 | # OR 103 | $ bobber run-all --batch-size-lg 128 --batch-size-sm 256 /home/user/logs test-machine-1,test-machine-2,... 104 | ``` 105 | 106 | ## FIO Thread Flags 107 | Depending on the performance of the filesystem under test in addition to the 108 | CPUs, the FIO tests might stall, though this is very unlikely. If any of the FIO 109 | tests are stuck for a long time (10 minutes or more), the thread counts for both 110 | IOPS and bandwidth tests can be dropped to a lower level. These can be specified 111 | with the `--iops-threads` and `--bw-threads` flags. Note that for high 112 | performance filesystems and beefy compute nodes, these values can also be 113 | increased to attempt to achieve higher test results. The flags can be specified 114 | as follows: 115 | 116 | ```bash 117 | $ bobber run-all --iops-threads 100 --bw-threads 32 /home/user/logs test-machine-1,test-machine-2,... 118 | ``` 119 | 120 | ## NCCL HCAs 121 | The NCCL tests use specific HCAs to communicate across nodes. At present, this 122 | requires NVIDIA Mellanox network adapters connected between nodes either 123 | directly or via a network switch. For most server configurations, there will be 124 | a dedicated compute fabric used for high-speed communication between nodes. 125 | These adapters should be targeted for the NCCL tests. To find the appropriate 126 | adapters, run `ibdev2netdev` to find the HCA device name that corresponds to the 127 | network device name for the compute network. For example, consider the following 128 | output: 129 | 130 | ```bash 131 | $ ibdev2netdev 132 | mlx5_0 ==> ib0 (Up) 133 | mlx5_1 ==> ib1 (Up) 134 | mlx5_10 ==> ib10 (Up) 135 | mlx5_11 ==> ib11 (Up) 136 | mlx5_2 ==> ib2 (Up) 137 | mlx5_3 ==> ib3 (Up) 138 | mlx5_4 ==> ib4 (Up) 139 | mlx5_5 ==> ib5 (Up) 140 | mlx5_6 ==> ib6 (Up) 141 | mlx5_7 ==> ib7 (Up) 142 | mlx5_8 ==> ib8 (Up) 143 | mlx5_9 ==> ib9 (Up) 144 | ``` 145 | 146 | If the compute network is on adapters ib0-ib7, the devices to use for NCCL are 147 | `mlx5_0`, `mlx5_1`, ..., `mlx5_7`, matching the list above. To use these devices 148 | for the NCCL tests, the `--nccl-ib-hcas` flag needs to be passed, similar to the 149 | following. Note that the devices are separated with commas and no spaces: 150 | 151 | ```bash 152 | $ bobber run-all --nccl-ib-hcas mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 /home/user/logs test-machine-1,test-machine-2,... 153 | # OR 154 | $ bobber run-nccl --nccl-ib-hcas mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 /home/user/logs test-machine-1,test-machine-2,... 
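# If the compute fabric NICs follow the ib0-ib7 naming shown above, the HCA
# list can also be assembled with a generic shell sketch (not a Bobber feature;
# adjust the interface pattern to match your fabric):
#   HCAS=$(ibdev2netdev | awk '$3 ~ /^ib[0-7]$/ {print $1}' | paste -sd, -)
#   bobber run-nccl --nccl-ib-hcas "$HCAS" /home/user/logs test-machine-1,test-machine-2,...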
155 | ``` 156 | -------------------------------------------------------------------------------- /docs/baselines.md: -------------------------------------------------------------------------------- 1 | # Baselines 2 | The results parser included with Bobber is able to compare results against a 3 | pre-defined baseline or a custom baseline passed in as a YAML file. By comparing 4 | results against a baseline, it is possible to easily check if a round of tests 5 | meets the expected performance level or if any tests are underperforming. This 6 | is useful to verify new systems are performing as expected or to view if any 7 | changes to the hardware or software affect stability. 8 | 9 | ## Running the baseline comparison 10 | There are two main paths to compare baselines in Bobber, either by using a 11 | built-in baseline config or using a custom file. 12 | 13 | ### Using built-in baselines 14 | To compare against a built-in baseline, use the `--compare-baseline` flag with 15 | the `parse-results` command. To list the possible choices, pass the `--help` 16 | flag as below. The choices are listed in the curly brackets (`{}`): 17 | 18 | ``` 19 | bobber parse-results --help 20 | ... 21 | --compare-baseline {single-dgx-station-baseline} 22 | Compare the values produced by a test run against a pre-defined baseline to verify performance meets an acceptable threshold. 23 | ``` 24 | 25 | To run the comparison against existing results, run the following while updating 26 | the baseline and log directory, if applicable. 27 | 28 | ``` 29 | bobber parse-results --compare-baseline single-dgx-station-baseline results_logs/ 30 | ``` 31 | 32 | ### Using custom baselines 33 | To use a custom baseline, a YAML file needs to be created which specifies the 34 | expected performance for every test. This can be done by running the 35 | `parse-results` command against a directory which will automatically generate a 36 | YAML baseline in the directory named `baseline.yaml`. A 37 | [sample file](sample_baseline.yaml) has also been created as a reference if a 38 | custom baseline is desired. Every custom baseline file must have the following 39 | structure: 40 | 41 | ``` 42 | systems: # This should always be the first line 43 | 1: # This designates all results in the sub-block are specific to a single compute node 44 | bandwidth: # This section is for the FIO bandwidth results in bytes/second 45 | read: 1200000000 # The FIO bandwidth read results in bytes/second 46 | write: 1000000000 # The FIO bandwidth write results in bytes/second 47 | iops: # This section is for the FIO IOPS results in ops/second 48 | read: 100000 # The FIO IOPS read speed in ops/second 49 | write: 100000 # The FIO IOPS write speed in ops/second 50 | nccl: # The maximum bus bandwidth in GB/s for NCCL 51 | max_bus_bw: 230 # The maximum bus bandwidth in GB/s for NCCL 52 | dali: # The average speed in images/second from DALI tests 53 | 800x600 standard jpg: 2000 # The speed in images/second for 800x600 standard JPG images in DALI 54 | 3840x2160 standard jpg: 300 # The speed in images/second for 4K standard JPG images in DALI 55 | 800x600 tfrecord: 2000 # The speed in images/second for 800x600 TFRecords in DALI 56 | 3840x2160 tfrecord: 300 # The speed in images/second for 4K TFRecords in DALI 57 | 2: # Continue the same pattern as above for results specific to two compute nodes, if applicable 58 | ... 
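  # Purely as an illustration (placeholder numbers, not recommended targets),
  # a two-node block follows the same shape as the one-node block above:
  #
  # 2:
  #   bandwidth:
  #     read: 2400000000
  #     write: 2000000000
  #   iops:
  #     read: 200000
  #     write: 200000
  #   nccl:
  #     max_bus_bw: 180
  #   dali:
  #     800x600 standard jpg: 4000
  #     3840x2160 standard jpg: 600
  #     800x600 tfrecord: 4000
  #     3840x2160 tfrecord: 600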
59 | ``` 60 | 61 | The custom results parser will only compare against the system counts that are 62 | provided in the YAML file, meaning if only results for 8 compute nodes are 63 | included in the YAML file, only those results will be compared. As many or as 64 | few system counts as desired can be added to the YAML file to more extensively 65 | compare results at all levels. 66 | 67 | After saving the YAML file locally, run the comparison as follows while updating 68 | the YAML file location and log directory, if applicable: 69 | 70 | ``` 71 | bobber parse-results --custom-baseline baseline.yaml results_log/ 72 | ``` 73 | 74 | ### Adding a tolerance 75 | Both of the baseline methods above allow a custom tolerance to be specified to 76 | give some wiggle-room in the results. Pass a percentage amount to allow below 77 | the baseline. 78 | 79 | Take for example a baseline that expects 10 GB/s from reads using FIO. If the 80 | test results yield 9.8 GB/s, this will be marked as a FAIL. However, if the 81 | tolerance is 5%, this will instead be marked as a PASS as 9.8 GB/s is within 5% 82 | of the expected value of 10 GB/s. 83 | 84 | To add a tolerance, add the `--baseline-tolerance` flag to either of the 85 | commands above. The default tolerance is 0% if not specified, meaning the test 86 | will fail if it is exactly at or below the baseline value. 87 | 88 | ## Baseline results output 89 | Regardless of which baseline method from above is chosen, the results will 90 | compare the performance from the requested results file with the baseline of 91 | choice. The comparison does a simple PASS/FAIL for every result depending on 92 | whether it surpasses performance or not. If at least one result does not meet 93 | performance expectations, the comparison will be marked as failed. 94 | 95 | Example of results that pass every threshold: 96 | 97 | ``` 98 | bobber parse-results --compare-baseline single-dgx-station-baseline log_files/ 99 | 100 | ... 
101 | 102 | ================================================================================ 103 | Baseline assessment 104 | Comparing against "single-dgx-station-baseline" 105 | ================================================================================ 106 | 1 System(s) 107 | -------------------------------------------------------------------------------- 108 | FIO Bandwidth Read (GB/s) 109 | Expected: 1.2, Got: 1.595, Result: PASS 110 | FIO Bandwidth Write (GB/s) 111 | Expected: 1.0, Got: 1.232, Result: PASS 112 | -------------------------------------------------------------------------------- 113 | FIO IOPS Read (k IOPS) 114 | Expected: 100.0, Got: 136.5, Result: PASS 115 | FIO IOPS Write (k IOPS) 116 | Expected: 100.0, Got: 135.0, Result: PASS 117 | -------------------------------------------------------------------------------- 118 | NCCL Max Bus Bandwidth (GB/s) 119 | Expected: 70, Got: 79.86500000000001, Result: PASS 120 | -------------------------------------------------------------------------------- 121 | DALI 800x600 standard jpg (images/second) 122 | Expected: 2000, Got: 2694.595, Result: PASS 123 | DALI 3840x2160 standard jpg (images/second) 124 | Expected: 300, Got: 430.854, Result: PASS 125 | DALI 800x600 tfrecord (images/second) 126 | Expected: 2000, Got: 2665.653, Result: PASS 127 | DALI 3840x2160 tfrecord (images/second) 128 | Expected: 300, Got: 376.862, Result: PASS 129 | ================================================================================ 130 | ``` 131 | 132 | Example of results that fail one or more thresholds: 133 | 134 | ``` 135 | bobber parse-results --custom-baseline sample_baseline.yaml log_files/ 136 | 137 | ... 138 | 139 | ================================================================================ 140 | Baseline assessment 141 | Comparing against a custom config 142 | ================================================================================ 143 | 1 System(s) 144 | -------------------------------------------------------------------------------- 145 | FIO Bandwidth Read (GB/s) 146 | Expected: 7.0, Got: 1.595, Result: FAIL 147 | FIO Bandwidth Write (GB/s) 148 | Expected: 3.0, Got: 1.232, Result: FAIL 149 | -------------------------------------------------------------------------------- 150 | FIO IOPS Read (k IOPS) 151 | Expected: 300.0, Got: 136.5, Result: FAIL 152 | FIO IOPS Write (k IOPS) 153 | Expected: 200.0, Got: 135.0, Result: FAIL 154 | -------------------------------------------------------------------------------- 155 | NCCL Max Bus Bandwidth (GB/s) 156 | Expected: 230, Got: 79.86500000000001, Result: FAIL 157 | -------------------------------------------------------------------------------- 158 | DALI 800x600 standard jpg (images/second) 159 | Expected: 2000, Got: 2694.595, Result: PASS 160 | DALI 3840x2160 standard jpg (images/second) 161 | Expected: 300, Got: 430.854, Result: PASS 162 | DALI 800x600 tfrecord (images/second) 163 | Expected: 2000, Got: 2665.653, Result: PASS 164 | DALI 3840x2160 tfrecord (images/second) 165 | Expected: 300, Got: 376.862, Result: PASS 166 | -------------------------------------------------------------------------------- 167 | 5 tests did not meet the suggested criteria! 168 | See results above for failed tests and verify setup. 
169 | ``` 170 | -------------------------------------------------------------------------------- /bobber/lib/constants.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | BUILD = 'build' 3 | EXPORT = 'export' 4 | CAST = 'cast' 5 | LOAD = 'load' 6 | PARSE_RESULTS = 'parse-results' 7 | RUN_ALL = 'run-all' 8 | RUN_DALI = 'run-dali' 9 | RUN_NCCL = 'run-nccl' 10 | RUN_STG_BW = 'run-stg-bw' 11 | RUN_STG_IOPS = 'run-stg-iops' 12 | RUN_STG_125K = 'run-stg-125k' 13 | RUN_STG_META = 'run-stg-meta' 14 | 15 | DGX_A100_SINGLE = { 16 | 'gpus': 8, 17 | 'bw_threads': 16, 18 | 'stg_125k_threads': 16, 19 | 'iops_threads': 200, 20 | 'batch_size_sm': 512, 21 | 'batch_size_lg': 256, 22 | 'ssh_iface': 'enp226s0', 23 | 'nccl_ib_hcas': 'mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7', 24 | 'nccl_max': 4 25 | } 26 | 27 | DGX_A100_DUAL = { 28 | 'gpus': 8, 29 | 'bw_threads': 16, 30 | 'stg_125k_threads': 16, 31 | 'iops_threads': 200, 32 | 'batch_size_sm': 512, 33 | 'batch_size_lg': 256, 34 | 'ssh_iface': 'enp226s0', 35 | 'nccl_ib_hcas': 'mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_6,mlx5_7,mlx5_8,mlx5_9', 36 | 'nccl_max': 4 37 | } 38 | 39 | DGX_2 = { 40 | 'gpus': 16, 41 | 'bw-threads': 16, 42 | 'stg_125k_threads': 16, 43 | 'batch-size-sm': 150, 44 | 'batch-size-lg': 75, 45 | 'iops-threads': 80, 46 | 'ssh-iface': 'enp6s0', 47 | 'nccl-ib-hcas': 48 | 'mlx5_13,mlx5_15,mlx5_17,mlx5_19,mlx5_3,mlx5_5,mlx5_7,mlx5_9', 49 | 'nccl-max': 1 50 | } 51 | 52 | SYSTEMS = { 53 | 'dgx-a100-single': DGX_A100_SINGLE, 54 | 'dgx-a100-dual': DGX_A100_DUAL, 55 | 'dgx-2': DGX_2 56 | } 57 | 58 | READ_PATTERNS = { 59 | 'read', 60 | 'randread' 61 | } 62 | 63 | WRITE_PATTERNS = { 64 | 'write', 65 | 'randwrite' 66 | } 67 | 68 | # Baseline Results 69 | # This is considered a minimum value that tests should hit in order to be 70 | # verified the system has been configured properly for HPC and AI workloads. 
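# Layout note: keys under 'systems' are node counts stored as strings, and each
# node count maps the test categories ('bandwidth', 'iops', 'nccl', 'dali') to
# minimum expected values in the units given in the inline comments below. This
# mirrors the structure expected from a custom baseline YAML file
# (see docs/baselines.md).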
71 | SINGLE_DGX_STATION_BASELINE = { 72 | 'systems': { 73 | '1': { 74 | 'bandwidth': { 75 | # FIO BW speed in bytes/second 76 | 'read': 1200000000, 77 | 'write': 1000000000 78 | }, 79 | 'iops': { 80 | # FIO IOPS speed in ops/second 81 | 'read': 100000, 82 | 'write': 100000 83 | }, 84 | 'nccl': { 85 | # NCCL maximum bus bandwidth in GB/s 86 | 'max_bus_bw': 70 87 | }, 88 | 'dali': { 89 | # DALI average speed in images/second 90 | '800x600 standard jpg': 2000, 91 | '3840x2160 standard jpg': 300, 92 | '800x600 tfrecord': 2000, 93 | '3840x2160 tfrecord': 250 94 | } 95 | } 96 | } 97 | } 98 | 99 | DGX_A100_POD_BASELINE = { 100 | 'systems': { 101 | '1': { 102 | 'bandwidth': { 103 | # FIO BW speed in bytes/second 104 | 'read': 2250000000, 105 | 'write': 875000000 106 | }, 107 | 'iops': { 108 | # FIO IOPS speed in ops/second 109 | 'read': 87500, 110 | 'write': 16250 111 | }, 112 | 'nccl': { 113 | # NCCL maximum bus bandwidth in GB/s 114 | 'max_bus_bw': 230 115 | }, 116 | 'dali': { 117 | # DALI average speed in images/second 118 | '800x600 standard jpg': 2000, 119 | '3840x2160 standard jpg': 1000, 120 | '800x600 tfrecord': 4000, 121 | '3840x2160 tfrecord': 1000 122 | } 123 | }, 124 | '2': { 125 | 'bandwidth': { 126 | # FIO BW speed in bytes/second 127 | 'read': 4500000000, 128 | 'write': 1750000000 129 | }, 130 | 'iops': { 131 | # FIO IOPS speed in ops/second 132 | 'read': 175000, 133 | 'write': 32500 134 | }, 135 | 'nccl': { 136 | # NCCL maximum bus bandwidth in GB/s 137 | 'max_bus_bw': 180 138 | }, 139 | 'dali': { 140 | # DALI average speed in images/second 141 | '800x600 standard jpg': 4000, 142 | '3840x2160 standard jpg': 2000, 143 | '800x600 tfrecord': 8000, 144 | '3840x2160 tfrecord': 2000 145 | } 146 | }, 147 | '3': { 148 | 'bandwidth': { 149 | # FIO BW speed in bytes/second 150 | 'read': 6750000000, 151 | 'write': 2625000000 152 | }, 153 | 'iops': { 154 | # FIO IOPS speed in ops/second 155 | 'read': 262500, 156 | 'write': 48750 157 | }, 158 | 'nccl': { 159 | # NCCL maximum bus bandwidth in GB/s 160 | 'max_bus_bw': 180 161 | }, 162 | 'dali': { 163 | # DALI average speed in images/second 164 | '800x600 standard jpg': 6000, 165 | '3840x2160 standard jpg': 3000, 166 | '800x600 tfrecord': 12000, 167 | '3840x2160 tfrecord': 3000 168 | } 169 | }, 170 | '4': { 171 | 'bandwidth': { 172 | # FIO BW speed in bytes/second 173 | 'read': 9000000000, 174 | 'write': 3500000000 175 | }, 176 | 'iops': { 177 | # FIO IOPS speed in ops/second 178 | 'read': 350000, 179 | 'write': 65000 180 | }, 181 | 'nccl': { 182 | # NCCL maximum bus bandwidth in GB/s 183 | 'max_bus_bw': 180 184 | }, 185 | 'dali': { 186 | # DALI average speed in images/second 187 | '800x600 standard jpg': 8000, 188 | '3840x2160 standard jpg': 4000, 189 | '800x600 tfrecord': 16000, 190 | '3840x2160 tfrecord': 4000 191 | } 192 | }, 193 | '5': { 194 | 'bandwidth': { 195 | # FIO BW speed in bytes/second 196 | 'read': 11250000000, 197 | 'write': 4375000000 198 | }, 199 | 'iops': { 200 | # FIO IOPS speed in ops/second 201 | 'read': 437500, 202 | 'write': 81250 203 | }, 204 | 'nccl': { 205 | # NCCL maximum bus bandwidth in GB/s 206 | 'max_bus_bw': 180 207 | }, 208 | 'dali': { 209 | # DALI average speed in images/second 210 | '800x600 standard jpg': 20000, 211 | '3840x2160 standard jpg': 5000, 212 | '800x600 tfrecord': 20000, 213 | '3840x2160 tfrecord': 5000 214 | } 215 | }, 216 | '6': { 217 | 'bandwidth': { 218 | # FIO BW speed in bytes/second 219 | 'read': 13500000000, 220 | 'write': 5250000000 221 | }, 222 | 'iops': { 223 | # FIO IOPS speed in 
ops/second 224 | 'read': 525000, 225 | 'write': 97500 226 | }, 227 | 'nccl': { 228 | # NCCL maximum bus bandwidth in GB/s 229 | 'max_bus_bw': 180 230 | }, 231 | 'dali': { 232 | # DALI average speed in images/second 233 | '800x600 standard jpg': 24000, 234 | '3840x2160 standard jpg': 6000, 235 | '800x600 tfrecord': 24000, 236 | '3840x2160 tfrecord': 6000 237 | } 238 | }, 239 | '7': { 240 | 'bandwidth': { 241 | # FIO BW speed in bytes/second 242 | 'read': 15750000000, 243 | 'write': 6125000000 244 | }, 245 | 'iops': { 246 | # FIO IOPS speed in ops/second 247 | 'read': 612500, 248 | 'write': 113750 249 | }, 250 | 'nccl': { 251 | # NCCL maximum bus bandwidth in GB/s 252 | 'max_bus_bw': 180 253 | }, 254 | 'dali': { 255 | # DALI average speed in images/second 256 | '800x600 standard jpg': 28000, 257 | '3840x2160 standard jpg': 7000, 258 | '800x600 tfrecord': 28000, 259 | '3840x2160 tfrecord': 7000 260 | } 261 | }, 262 | '8': { 263 | 'bandwidth': { 264 | # FIO BW speed in bytes/second 265 | 'read': 18000000000, 266 | 'write': 7000000000 267 | }, 268 | 'iops': { 269 | # FIO IOPS speed in ops/second 270 | 'read': 700000, 271 | 'write': 130000 272 | }, 273 | 'nccl': { 274 | # NCCL maximum bus bandwidth in GB/s 275 | 'max_bus_bw': 180 276 | }, 277 | 'dali': { 278 | # DALI average speed in images/second 279 | '800x600 standard jpg': 32000, 280 | '3840x2160 standard jpg': 8000, 281 | '800x600 tfrecord': 32000, 282 | '3840x2160 tfrecord': 8000 283 | } 284 | } 285 | } 286 | } 287 | 288 | BASELINES = { 289 | 'single-dgx-station-baseline': SINGLE_DGX_STATION_BASELINE, 290 | 'dgx-a100-pod-baseline': DGX_A100_POD_BASELINE 291 | } 292 | -------------------------------------------------------------------------------- /bobber/lib/analysis/fio.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | from bobber.lib.analysis.common import fio_command_details 4 | from typing import Tuple 5 | 6 | 7 | def clean_iops(iops: str) -> float: 8 | """ 9 | Convert the IOPS into an equivalent operations/second result. 10 | 11 | Parse the IOPS value from the input string and convert the value from a 12 | larger unit to an equivalent operations/second, if applicable. 13 | 14 | Parameters 15 | ---------- 16 | iops : str 17 | A ``string`` of the number of operations/second and resulting unit. 18 | 19 | Returns 20 | ------- 21 | float 22 | Returns a ``float`` of the final IOPS value in operations/second. 23 | """ 24 | number = float(re.findall(r'\d+', iops)[0]) 25 | if 'G' in iops: 26 | ops_per_second = number * 1e9 27 | elif 'M' in iops: 28 | ops_per_second = number * 1e6 29 | elif 'k' in iops: 30 | ops_per_second = number * 1e3 31 | else: 32 | ops_per_second = number 33 | return ops_per_second 34 | 35 | 36 | def clean_bw(bandwidth: str) -> float: 37 | """ 38 | Convert the bandwidth into an equivalent bytes/second result. 39 | 40 | Parse the bandwidth value from the input string and convert the value from 41 | a larger unit to an equivalent operations/second, if applicable. 42 | 43 | Parameters 44 | ---------- 45 | bandwidth : str 46 | A ``string`` of the bandwidth and unit from the test. 47 | 48 | Returns 49 | ------- 50 | float 51 | Returns a ``float`` of the final bandwidth in bytes/second. 
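
    Examples
    --------
    A bandwidth token captured from an FIO summary line, for instance
    ``(1.5GB/s)``, converts as follows:

    >>> clean_bw('(1.5GB/s)')
    1500000000.0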
52 | """ 53 | number = float(re.findall(r'(\d+(?:\.\d+)?)', bandwidth)[0]) 54 | if 'GB/s' in bandwidth: 55 | bytes_per_second = number * 1e9 56 | elif 'MB/s' in bandwidth: 57 | bytes_per_second = number * 1e6 58 | elif 'kb/s' in bandwidth.lower(): 59 | bytes_per_second = number * 1e3 60 | else: 61 | bytes_per_second = number 62 | return bytes_per_second 63 | 64 | 65 | def fio_bw_results(log_contents: str, systems: int, string_to_match: str, 66 | log: str) -> list: 67 | """ 68 | Capture the bandwidth results from the log files. 69 | 70 | Search the log for any lines containing a bandwidth value and return a 71 | final list of all of the parsed values. 72 | 73 | Parameters 74 | ---------- 75 | log_contents : str 76 | A ``string`` of the contents from an FIO log file. 77 | systems : int 78 | An ``integer`` of the number of systems used during the current test. 79 | string_to_match : str 80 | A regex ``string`` of the line to pull from the log file to match any 81 | bandwidth lines. 82 | log : str 83 | A ``string`` of the name of the log file being parsed. 84 | 85 | Returns 86 | ------- 87 | list 88 | Returns a ``list`` of ``floats`` representing all of the bandwidth 89 | values parsed from the log. 90 | 91 | Raises 92 | ------ 93 | ValueError 94 | Raises a ``ValueError`` if the bandwidth cannot be parsed from the log 95 | file. 96 | """ 97 | final_bw = [] 98 | 99 | match = re.findall(string_to_match, log_contents) 100 | if len(match) != systems: 101 | print(f'Warning: Invalid number of results found in {log} log file. ' 102 | 'Skipping...') 103 | return [] 104 | for result in match: 105 | bw = re.findall(r'\(\d+[kMG]B/s\)', result) 106 | if len(bw) != 1: 107 | bw = re.findall(r'\(\d+\.\d+[kMG]B/s\)', result) 108 | if len(bw) != 1: 109 | raise ValueError('Bandwidth cannot be parsed from FIO log!') 110 | bw = clean_bw(bw[0]) 111 | final_bw.append(bw) 112 | return final_bw 113 | 114 | 115 | def fio_iops_results(log_contents: str, systems: int, string_to_match: str, 116 | log: str) -> list: 117 | """ 118 | Capture the IOPS results from the log files. 119 | 120 | Search the log for any lines containing IOPS values and return a final list 121 | of all of the parsed values. The FIO IOPS tests print an extra line for 122 | multi-node tests and are subsequently dropped. 123 | 124 | Parameters 125 | ---------- 126 | log_contents : str 127 | A ``string`` of the contents from an FIO log file. 128 | systems : int 129 | An ``integer`` of the number of systems used during the current test. 130 | string_to_match : str 131 | A regex ``string`` of the line to pull from the log file to match any 132 | IOPS lines. 133 | log : str 134 | A ``string`` of the name of the log file being parsed. 135 | 136 | Returns 137 | ------- 138 | list 139 | Returns a ``list`` of ``floats`` representing all of the IOPS values 140 | parsed from the log. 141 | 142 | Raises 143 | ------ 144 | ValueError 145 | Raises a ``ValueError`` if the IOPS cannot be parsed from the log 146 | file. 147 | """ 148 | final_iops = [] 149 | 150 | match = re.findall(string_to_match, log_contents) 151 | if (systems == 1 and len(match) != systems) or \ 152 | (systems != 1 and len(match) != systems + 1): 153 | print(f'Warning: Invalid number of results found in {log} log file. 
' 154 | 'Skipping...') 155 | return [] 156 | for result in match: 157 | iops = re.findall(r'[-+]?\d*\.\d+[kMG]|\d+[kMG]|\d+', result) 158 | if len(iops) not in [5, 6]: 159 | raise ValueError('IOPS cannot be parsed from FIO log!') 160 | iops = clean_iops(iops[0]) 161 | final_iops.append(iops) 162 | # For multi-system benchmarks, an extra IOPS line is included with 163 | # semi-aggregate results, but needs to be dropped from our results for a 164 | # more accurate analysis. 165 | if systems != 1: 166 | final_iops = final_iops[:-1] 167 | return final_iops 168 | 169 | 170 | def parse_fio_bw_file(log_files: list, systems: int, read_system_results: dict, 171 | write_system_results: dict) -> Tuple[dict, dict, dict, 172 | dict]: 173 | """ 174 | Parse the FIO bandwidth results and test parameters. 175 | 176 | Search all log files for read and write parameters used to initiate the 177 | test and the final results and return the resulting objects. 178 | 179 | Parameters 180 | ---------- 181 | log_files : list 182 | A ``list`` of ``strings`` of the filenames of all FIO bandwidth logs in 183 | the results directory. 184 | systems : int 185 | An ``integer`` of the number of systems used during the current test. 186 | read_system_results : dict 187 | A ``dictionary`` of the final read results for N-systems. 188 | write_system_results : dict 189 | A ``dictionary`` of the final write results for N-systems. 190 | 191 | Returns 192 | ------- 193 | tuple 194 | A ``tuple`` of four dictionaries containing the read results, write 195 | results, read parameters, and write parameters, respectively. 196 | """ 197 | read_params, write_params = None, None 198 | 199 | for log in log_files: 200 | with open(log, 'r') as f: 201 | log_contents = f.read() 202 | read_params, write_params = fio_command_details(log_contents, 203 | read_params, 204 | write_params) 205 | write_bw = fio_bw_results(log_contents, systems, 'WRITE: bw=.*', log) 206 | if write_bw == []: 207 | continue 208 | read_bw = fio_bw_results(log_contents, systems, 'READ: bw=.*', log) 209 | write_system_results[systems].append(sum(write_bw)) 210 | read_system_results[systems].append(sum(read_bw)) 211 | return read_system_results, write_system_results, read_params, write_params 212 | 213 | 214 | def parse_fio_iops_file(log_files: list, systems: int, 215 | read_system_results: dict, 216 | write_system_results: dict) -> Tuple[dict, dict, dict, 217 | dict]: 218 | """ 219 | Parse the FIO IOPS results and test parameters. 220 | 221 | Search all log files for read and write parameters used to initiate the 222 | test and the final results and return the resulting objects. 223 | 224 | Parameters 225 | ---------- 226 | log_files : list 227 | A ``list`` of ``strings`` of the filenames of all FIO IOPS logs in the 228 | results directory. 229 | systems : int 230 | An ``integer`` of the number of systems used during the current test. 231 | read_system_results : dict 232 | A ``dictionary`` of the final read results for N-systems. 233 | write_system_results : dict 234 | A ``dictionary`` of the final write results for N-systems. 235 | 236 | Returns 237 | ------- 238 | tuple 239 | A ``tuple`` of four dictionaries containing the read results, write 240 | results, read parameters, and write parameters, respectively. 
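
    Examples
    --------
    A typical call, shown with hypothetical log filenames and result
    dictionaries pre-seeded for a single-node run::

        reads, writes, read_params, write_params = parse_fio_iops_file(
            ['stg_iops_systems_1_run1.log'], 1, {1: []}, {1: []})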
241 | """ 242 | read_params, write_params = None, None 243 | 244 | for log in log_files: 245 | with open(log, 'r') as f: 246 | log_contents = f.read() 247 | read_params, write_params = fio_command_details(log_contents, 248 | read_params, 249 | write_params) 250 | write_iops = fio_iops_results(log_contents, systems, 'write: IOPS=.*', 251 | log) 252 | read_iops = fio_iops_results(log_contents, systems, 'read: IOPS=.*', 253 | log) 254 | write_system_results[systems].append(sum(write_iops)) 255 | read_system_results[systems].append(sum(read_iops)) 256 | return read_system_results, write_system_results, read_params, write_params 257 | -------------------------------------------------------------------------------- /bobber/lib/analysis/common.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | from collections import defaultdict 4 | from typing import Tuple 5 | 6 | 7 | class bcolors: 8 | """ 9 | A helper class to annotate text with colors. 10 | """ 11 | PASS = '\033[92m' # nosec 12 | WARNING = '\033[93m' 13 | FAIL = '\033[91m' 14 | BOLD = '\033[1m' 15 | ENDC = '\033[0m' 16 | 17 | 18 | def num_systems(log: str) -> int: 19 | """ 20 | Returns an ``integer`` of the number of systems that were tested during a 21 | particular run. 22 | 23 | Parameters 24 | ---------- 25 | log : str 26 | A ``string`` of the filename for a single log. 27 | 28 | Returns 29 | ------- 30 | int 31 | Returns an ``int`` of the number of systems that were tested for the 32 | given logfile. Defaults to None if not found. 33 | """ 34 | try: 35 | systems = re.findall(r'systems_\d+_', log) 36 | systems = re.findall(r'\d+', systems[0]) 37 | return int(systems[0]) 38 | except ValueError: 39 | return None 40 | 41 | 42 | def _bobber_version(log: str) -> str: 43 | """ 44 | Returns a ``string`` representation of the Bobber version tested, such as 45 | '6.3.1'. 46 | 47 | Parameters 48 | ---------- 49 | log : str 50 | A ``string`` of the filename for a single log. 51 | 52 | Returns 53 | ------- 54 | str 55 | Returns a ``string`` of the Bobber version tested, such as '6.3.1'. 56 | 57 | Raises 58 | ------ 59 | ValueError 60 | Raises a ``ValueError`` if the version cannot be parsed from the log 61 | file. 62 | """ 63 | version = re.findall(r'version_\d+_\d+_\d+', log) 64 | if len(version) != 1: 65 | raise ValueError(f'Could not parse Bobber version from {log} file!') 66 | version = version[0].replace('version_', '') 67 | return version.replace('_', '.') 68 | 69 | 70 | def check_bobber_version(logs: list, override: bool) -> str: 71 | """ 72 | Ensure the Bobber version matches in all logs being parsed. 73 | 74 | As a safeguard to mixing results from different Bobber versions, the 75 | version needs to be checked for all logs to ensure they are equal. By 76 | comparing each new log version with the previous version captured, if all 77 | are equal in the list of logs, then it is guaranteed that the logs are all 78 | the same. 79 | 80 | Parameters 81 | ---------- 82 | logs : list 83 | A ``list`` of strings of all of the log filenames in the directory that 84 | was passed. 85 | override : bool 86 | A ``boolean`` which evaluates to ``True`` when the version-checking 87 | should be skipped. 88 | 89 | Returns 90 | ------- 91 | str 92 | Returns a ``string`` of the Bobber version being tested. 93 | 94 | Raises 95 | ------ 96 | ValueError 97 | Raises a ``ValueError`` when any log versions don't match. 
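
    Examples
    --------
    With an illustrative (hypothetical) log filename that embeds the version:

    >>> check_bobber_version(['stg_iops_version_6_3_1_systems_1.log'],
    ...                      override=False)
    '6.3.1'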
98 | """ 99 | last_version = None 100 | 101 | for log in logs: 102 | version = _bobber_version(log) 103 | if override: 104 | return version 105 | if last_version and version != last_version: 106 | raise ValueError('Error: Only logs using the same Bobber version ' 107 | 'are allowed in the results directory.') 108 | last_version = version 109 | return version 110 | 111 | 112 | def _convert_to_bytes(value: str) -> float: 113 | """ 114 | Convert a number to bytes. 115 | 116 | Convert a passed number to bytes by parsing the number from the passed 117 | string and multiplying by the appropriate multiplier to convert from a 118 | larger unit to bytes. 119 | 120 | Parameters 121 | ---------- 122 | value : str 123 | A ``string`` of the value to convert to bytes. 124 | 125 | Returns 126 | ------- 127 | float 128 | Returns a ``float`` of the final value in bytes. 129 | """ 130 | number = float(re.sub('[a-zA-Z]*', '', value)) 131 | if 'gib' in value.lower(): 132 | return number * 1024 * 1024 * 1024 133 | elif 'g' in value.lower(): 134 | return number * 1e9 135 | elif 'mib' in value.lower(): 136 | return number * 1024 * 1024 137 | elif 'm' in value.lower(): 138 | return number * 1e6 139 | elif 'kib' in value.lower(): 140 | return number * 1024 141 | elif 'k' in value.lower(): 142 | return number * 1e3 143 | 144 | 145 | def _fio_command_parse(command: str) -> dict: 146 | """ 147 | Parse the command parameters for fio. 148 | 149 | Pull all of the flags and parameters used during a fio run and save them as 150 | a dictionary to make it easier to reference what was used during a test. 151 | 152 | Parameters 153 | ---------- 154 | command : str 155 | A ``string`` of the command used during the fio run. 156 | 157 | Returns 158 | ------- 159 | dict 160 | Returns a ``dictionary`` of the parameters used during the fio run. 161 | """ 162 | parameter_dict = {} 163 | 164 | for parameter in command.split(): 165 | # Skip the following parameters as they don't provide meaningful data. 166 | if parameter == '/usr/bin/fio': 167 | continue 168 | key, value = parameter.split('=') 169 | key = key.replace('--', '') 170 | if key in ['blocksize', 'size']: 171 | value = _convert_to_bytes(value) 172 | else: 173 | # Attempt to convert to a int for numerical values. If it fails, 174 | # keep as a string as that's likely intended type. 175 | try: 176 | value = int(value) 177 | except ValueError: 178 | value = str(value) 179 | parameter_dict[key] = value 180 | return parameter_dict 181 | 182 | 183 | def _compare_dicts(old_results: dict, new_results: dict) -> bool: 184 | """ 185 | Compare testing dictionaries for equality. 186 | 187 | Compare the dictionaries for equality while ignoring the 'directory' and 188 | 'command' keys since these will always differ amongst tests. If all other 189 | parameters are equal, it is assumed the tests used the same parameters. 190 | 191 | Parameters 192 | ---------- 193 | old_results : dict 194 | A ``dictionary`` of the test parameters used during the 195 | previously-parsed test log. 196 | new_results : dict 197 | A ``dictionary`` of the test parameters used during the test log 198 | currently being parsed. 199 | 200 | Returns 201 | ------- 202 | bool 203 | Returns a ``boolean`` which evaluates to `True` when all of the 204 | parameters are equal between the two dictionaries and `False` if at 205 | least on parameter is different. 
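
    Examples
    --------
    Two hypothetical parameter dictionaries that differ only in the ignored
    'directory' key compare as equal:

    >>> _compare_dicts({'blocksize': 1048576.0, 'directory': '/mnt/a'},
    ...                {'blocksize': 1048576.0, 'directory': '/mnt/b'})
    True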
206 |     """
207 |     ignore_keys = ['directory', 'command']
208 | 
209 |     old = dict((k, v) for k, v in old_results.items() if k not in ignore_keys)
210 |     new = dict((k, v) for k, v in new_results.items() if k not in ignore_keys)
211 |     return old == new
212 | 
213 | 
214 | def fio_command_details(log_contents: str, old_reads: dict,
215 |                         old_writes: dict) -> Tuple[dict, dict]:
216 |     """
217 |     Parse the command parameters and compare with the previous log.
218 | 
219 |     Pull the fio parameters used for both the read and write commands during
220 |     the tests and compare them with the previous log file that was parsed to
221 |     ensure all tests being parsed are using the same parameters.
222 | 
223 |     Parameters
224 |     ----------
225 |     log_contents : str
226 |         A ``string`` of all the output inside a log file.
227 |     old_reads : dict
228 |         A ``dictionary`` of the previous read test parameters that were parsed.
229 |     old_writes : dict
230 |         A ``dictionary`` of the previous write test parameters that were
231 |         parsed.
232 | 
233 |     Returns
234 |     -------
235 |     tuple
236 |         Returns a ``tuple`` of (``dict``, ``dict``) where the two dictionaries
237 |         are the parsed read and write parameters, respectively, from the tests.
238 | 
239 |     Raises
240 |     ------
241 |     ValueError
242 |         Raises a ``ValueError`` if no fio command is found, the command type
243 |         is unexpected, or the parameters differ between two or more tests.
244 |     """
245 |     commands = re.findall(r'/usr/bin/fio --rw.*', log_contents)
246 |     if len(commands) < 2:
247 |         raise ValueError('FIO command not found in log file!')
248 | 
249 |     for command in commands:
250 |         if '--rw=read' in command:
251 |             read_params = _fio_command_parse(command)
252 |             read_params['command'] = command
253 |         elif '--rw=write' in command:
254 |             write_params = _fio_command_parse(command)
255 |             write_params['command'] = command
256 |         elif '--rw=randread' in command:
257 |             read_params = _fio_command_parse(command)
258 |             read_params['command'] = command
259 |         elif '--rw=randwrite' in command:
260 |             write_params = _fio_command_parse(command)
261 |             write_params['command'] = command
262 |         else:
263 |             raise ValueError('Unexpected FIO test type. Expected '
264 |                              'read, write, randread, or randwrite.')
265 |     if old_reads and old_writes:
266 |         if not _compare_dicts(old_reads, read_params) or \
267 |                 not _compare_dicts(old_writes, write_params):
268 |             raise ValueError('Parameters differ between tests. Ensure only '
269 |                              'tests with the same parameters are used.')
270 |     return read_params, write_params
271 | 
272 | 
273 | def divide_logs_by_systems(log_files: list, log_to_match: str) -> dict:
274 |     """
275 |     Extract logs on a per-system basis.
276 | 
277 |     Given a list of all logs in a directory and a string to match for the log
278 |     files, extract all of the requested logs and group them together on a
279 |     per-system basis. For example, matching 'stg_iops' will pull all of the
280 |     IOPS test logs and combine all of the one-node IOPS logs in a list, then
281 |     all of the two-node IOPS logs in another list, and so on.
282 | 
283 |     Parameters
284 |     ----------
285 |     log_files : list
286 |         A ``list`` of log filenames from the passed directory to parse.
287 |     log_to_match : str
288 |         A ``string`` of the logs to match in the directory. 'stg_iops' will
289 |         match all logs that begin with 'stg_iops'.
290 | 
291 |     Returns
292 |     -------
293 |     dict
294 |         Returns a ``dictionary`` of all results where the key is the number of
295 |         nodes being tested and the value is a list of all of the logs that
296 |         match the filter for that system count.
297 |     """
298 |     # Divide the results based on the number of systems tested.
299 |     num_systems_dict = defaultdict(list)
300 | 
301 |     for log in log_files:
302 |         if log_to_match not in log:
303 |             continue
304 |         systems = num_systems(log)
305 |         num_systems_dict[systems].append(log)
306 |     return num_systems_dict
307 | 
--------------------------------------------------------------------------------
/bobber/lib/analysis/compare_baseline.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: MIT
2 | import sys
3 | from bobber.lib.constants import BASELINES
4 | from bobber.lib.exit_codes import BASELINE_FAILURE
5 | from bobber.lib.analysis.common import bcolors
6 | from bobber.lib.system.file_handler import read_yaml
7 | from typing import NoReturn, Optional, Tuple
8 | 
9 | 
10 | # Map the dictionary keys in the baseline to human-readable names.
11 | TEST_MAPPING = {
12 |     'bandwidth': 'FIO Bandwidth',
13 |     'iops': 'FIO IOPS',
14 |     'nccl': 'NCCL',
15 |     'dali': 'DALI'
16 | }
17 | 
18 | 
19 | def metric_passes(expected: float, got: float, tolerance: int) -> bool:
20 |     """
21 |     Determine if a test result meets a particular threshold.
22 | 
23 |     Compare the parsed value with the requested baseline for the same test and
24 |     return a boolean indicating whether it is greater than expected. If a
25 |     tolerance of N percent is passed, any value that falls no more than N
26 |     percent below the baseline will still be marked as passing.
27 | 
28 |     Parameters
29 |     ----------
30 |     expected : float
31 |         A ``float`` of the baseline value to compare against.
32 |     got : float
33 |         A ``float`` of the test result that was parsed.
34 |     tolerance : int
35 |         An ``int`` of the percentage below the threshold to still mark as
36 |         passing.
37 | 
38 |     Returns
39 |     -------
40 |     bool
41 |         Returns a ``boolean`` which evaluates to `True` when the parsed value
42 |         is greater than the baseline and `False` otherwise.
43 |     """
44 |     if tolerance > 0:
45 |         # If the user passes a 5% tolerance, scale the expected value down by
46 |         # 5% (multiply by 0.95) to get the adjusted threshold.
47 |         expected = (1 - tolerance / 100) * expected
48 |     if got > expected:
49 |         return True
50 |     else:
51 |         return False
52 | 
53 | 
54 | def result_text(result: bool, failures: int) -> Tuple[str, int]:
55 |     """
56 |     Color-code the result output.
57 | 
58 |     If the result passes the threshold, it will be marked as PASSing in green
59 |     text. Otherwise, it will be marked as FAILing in red text.
60 | 
61 |     Parameters
62 |     ----------
63 |     result : bool
64 |         A ``boolean`` which evaluates to `True` when the value meets the
65 |         threshold and `False` if not.
66 |     failures : int
67 |         An ``integer`` of the number of results that have not met the
68 |         threshold.
69 | 
70 |     Returns
71 |     -------
72 |     tuple
73 |         Returns a ``tuple`` of (``str``, ``int``) representing the color-coded
74 |         text and the number of failures found, respectively.
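
    Examples
    --------
    The returned string carries terminal color codes, so only the failure
    counter is shown here; a failing result increments the counter while a
    passing result leaves it unchanged:

    >>> result_text(False, 0)[1]
    1
    >>> result_text(True, 1)[1]
    1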
75 | """ 76 | if result: 77 | output = f'{bcolors.PASS}PASS{bcolors.ENDC}' 78 | else: 79 | failures += 1 80 | output = f'{bcolors.FAIL}FAIL{bcolors.ENDC}' 81 | return output, failures 82 | 83 | 84 | def evaluate_fio(baselines: dict, results: dict, test_name: str, failures: int, 85 | tolerance: int) -> int: 86 | """ 87 | Evaluate the fio test results against the baseline. 88 | 89 | Determine if the fio test results meet the expected threshold and display 90 | the outcome with appropriate units. 91 | 92 | Parameters 93 | ---------- 94 | baselines : dict 95 | A ``dictionary`` of the baseline to compare results against. 96 | results : dict 97 | A ``dictionary`` of the parsed results. 98 | test_name : str 99 | A ``string`` of the name of the test being parsed. 100 | failures : int 101 | An ``integer`` of the number of results that have not met the 102 | threshold. 103 | tolerance : int 104 | An ``int`` of the percentage below the threshold to still mark as 105 | passing. 106 | 107 | Returns 108 | ------- 109 | int 110 | Returns an ``integer`` of the number of results that have not met the 111 | threshold. 112 | """ 113 | for test, value in baselines.items(): 114 | if test_name not in results.keys(): 115 | continue 116 | if test_name == 'bandwidth': 117 | unit = '(GB/s)' 118 | expected = value / 1000000000 119 | got = round(results[test_name][test] / 1000000000, 3) 120 | elif test_name == 'iops': 121 | unit = '(k IOPS)' 122 | expected = value / 1000 123 | got = round(results[test_name][test] / 1000, 3) 124 | print(f' {TEST_MAPPING[test_name]} {test.title()} {unit}') 125 | text = f' Expected: {expected}, Got: {got}' 126 | result = metric_passes(expected, got, tolerance) 127 | output, failures = result_text(result, failures) 128 | text += f', Result: {output}' 129 | print(text) 130 | return failures 131 | 132 | 133 | def evaluate_nccl(baseline: dict, results: dict, failures: int, 134 | tolerance: int) -> int: 135 | """ 136 | Evaluate the NCCL test results against the baseline. 137 | 138 | Determine if the NCCL test results meet the expected threshold and display 139 | the outcome with appropriate units. 140 | 141 | Parameters 142 | ---------- 143 | baselines : dict 144 | A ``dictionary`` of the baseline to compare results against. 145 | results : dict 146 | A ``dictionary`` of the parsed results. 147 | failures : int 148 | An ``integer`` of the number of results that have not met the 149 | threshold. 150 | tolerance : int 151 | An ``int`` of the percentage below the threshold to still mark as 152 | passing. 153 | 154 | Returns 155 | ------- 156 | int 157 | Returns an ``integer`` of the number of results that have not met the 158 | threshold. 159 | """ 160 | if 'max_bus_bw' not in baseline.keys(): 161 | return failures 162 | print(' NCCL Max Bus Bandwidth (GB/s)') 163 | expected = baseline['max_bus_bw'] 164 | got = results['nccl']['max_bus_bw'] 165 | text = f' Expected: {expected}, Got: {got}' 166 | result = metric_passes(expected, got, tolerance) 167 | output, failures = result_text(result, failures) 168 | text += f', Result: {output}' 169 | print(text) 170 | return failures 171 | 172 | 173 | def evaluate_dali(baselines: dict, results: dict, test_name: str, 174 | failures: int, tolerance: int) -> int: 175 | """ 176 | Evaluate the DALI test results against the baseline. 177 | 178 | Determine if the DALI test results meet the expected threshold and display 179 | the outcome with appropriate units. 
180 | 181 | Parameters 182 | ---------- 183 | baselines : dict 184 | A ``dictionary`` of the baseline to compare results against. 185 | results : dict 186 | A ``dictionary`` of the parsed results. 187 | test_name : str 188 | A ``string`` of the name of the test being parsed. 189 | failures : int 190 | An ``integer`` of the number of results that have not met the 191 | threshold. 192 | tolerance : int 193 | An ``int`` of the percentage below the threshold to still mark as 194 | passing. 195 | 196 | Returns 197 | ------- 198 | int 199 | Returns an ``integer`` of the number of results that have not met the 200 | threshold. 201 | """ 202 | for test, value in baselines.items(): 203 | if test not in results.keys(): 204 | continue 205 | print(f' DALI {test} (images/second)') 206 | expected = value 207 | got = round(results[test]['average images/second'], 3) 208 | text = f' Expected: {expected}, Got: {got}' 209 | result = metric_passes(expected, got, tolerance) 210 | output, failures = result_text(result, failures) 211 | text += f', Result: {output}' 212 | print(text) 213 | return failures 214 | 215 | 216 | def evaluate_test(baseline: dict, results: dict, system_count: int, 217 | tolerance: int, failures: int) -> int: 218 | """ 219 | Evaluate all tests for N-nodes and compare against the baseline. 220 | 221 | The comparison verifies results meet a certain threshold for each system 222 | count in a sweep. For example, in an 8-node sweep, compare the one-node 223 | results to the baseline before comparing the two-node results and so on. 224 | 225 | Parameters 226 | ---------- 227 | baseline : dict 228 | A ``dictionary`` of the baseline to compare results against. 229 | results : dict 230 | A ``dictionary`` of the parsed results. 231 | system_count : int 232 | An ``int`` of the number of systems that were tested for each 233 | comparison level. 234 | tolerance : int 235 | An ``int`` of the percentage below the threshold to still mark as 236 | passing. 237 | failures : int 238 | An ``integer`` of the number of results that have not met the 239 | threshold. 240 | 241 | Returns 242 | ------- 243 | int 244 | Returns an ``integer`` of the number of results that have not met the 245 | threshold. 246 | """ 247 | for test_name, test_values in baseline.items(): 248 | print('-' * 80) 249 | if test_name in ['bandwidth', 'iops']: 250 | failures = evaluate_fio(test_values, results, test_name, failures, 251 | tolerance) 252 | elif test_name == 'nccl': 253 | failures = evaluate_nccl(test_values, results, failures, tolerance) 254 | elif test_name == 'dali': 255 | failures = evaluate_dali(test_values, 256 | results['dali'], 257 | test_name, 258 | failures, 259 | tolerance) 260 | return failures 261 | 262 | 263 | def compare_baseline(results: dict, baseline: str, tolerance: int, 264 | custom: Optional[bool] = False) -> NoReturn: 265 | """ 266 | Compare a baseline against parsed results. 267 | 268 | Pull the requested baseline either from a custom YAML file or one of the 269 | existing baselines included with the application and compare against the 270 | parsed results by checking if the parsed result is greater than the 271 | baseline on a per-system basis. 272 | 273 | Parameters 274 | ---------- 275 | results : dict 276 | A ``dictionary`` of the complete set of results from a parsed 277 | dictionary. 278 | baseline : str 279 | A ``string`` of the baseline to use. This either represents a key from 280 | the included baselines, or a filename to a custom YAML config file to 281 | read. 
282 | tolerance : int 283 | An ``int`` of the tolerance as a percentage below the baseline to allow 284 | results to still be marked as passing. 285 | custom : bool (optional) 286 | An optional ``boolean`` that, when `True`, will read in a baseline 287 | passed from a YAML file. If `False`, it will compare against an 288 | included baseline. 289 | """ 290 | failures = 0 291 | 292 | print('=' * 80) 293 | print('Baseline assessment') 294 | if custom: 295 | print('Comparing against a custom config') 296 | baseline = read_yaml(baseline) 297 | else: 298 | print(f'Comparing against "{baseline}"') 299 | baseline = BASELINES[baseline] 300 | if tolerance > 0: 301 | print(f'Allowing a tolerance of {tolerance}% below expected to PASS') 302 | 303 | for system_count, baseline_results in baseline['systems'].items(): 304 | print('=' * 80) 305 | if str(system_count) not in results['systems'].keys(): 306 | print(f'No results found for {system_count} system(s)') 307 | print('Skipping...') 308 | continue 309 | print(f' {system_count} System(s)') 310 | failures = evaluate_test(baseline_results, 311 | results['systems'][str(system_count)], 312 | system_count, 313 | tolerance, 314 | failures) 315 | 316 | if failures > 0: 317 | print('-' * 80) 318 | print(f'{failures} test(s) did not meet the suggested criteria!') 319 | print('See results above for failed tests and verify setup.') 320 | # Throw a non-zero exit status so any tools that read codes will catch 321 | # that the baseline was not met. 322 | sys.exit(BASELINE_FAILURE) 323 | 324 | print('=' * 80) 325 | -------------------------------------------------------------------------------- /bobber/lib/analysis/dali.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | 4 | 5 | def _clean_sizes(sizes: list) -> list: 6 | """ 7 | Remove all text from sizes. 8 | 9 | The parser to capture sizes of various objects includes 'in bytes: ' in the 10 | string which should be stripped, leaving only numbers. 11 | 12 | Parameters 13 | ---------- 14 | sizes : list 15 | A ``list`` of ``strings`` of sizes of various objects. 16 | 17 | Returns 18 | ------- 19 | list 20 | Returns a ``list`` of ``integers`` of sizes of various objects. 21 | """ 22 | return [int(size.replace('in bytes: ', '')) for size in sizes] 23 | 24 | 25 | def _size_parsing(log_contents: str) -> dict: 26 | """ 27 | Capture the image and directory size for image data. 28 | 29 | Parse the image and directory size for all images generated using 30 | Imageinary. It is assumed that the image and directory size are identical 31 | for both the TFRecord and standard JPEG images of similar sizes. 32 | 33 | Parameters 34 | ---------- 35 | log_contents : str 36 | A ``string`` of the contents from a DALI log file. 37 | 38 | Returns 39 | ------- 40 | dict 41 | Returns a ``dictionary`` of image size information for all image sizes 42 | and formats. 43 | 44 | Raises 45 | ------ 46 | ValueError 47 | Raises a ``ValueError`` if the log file does not contain size 48 | information. 
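    A minimal sketch of the size extraction performed below (the wording of
    the log lines is hypothetical; only the ``in bytes: <number>`` markers
    matter to the regular expressions used here)::

        line_pair = ('First image size from the 800x600 image set\n'
                     'size in bytes: 1440000, directory in bytes: 144000000')
        _clean_sizes(re.findall(r'in bytes: \d+', line_pair))
        # -> [1440000, 144000000]  (image size, then directory size)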
49 | """ 50 | results_sub_dict = { 51 | 'image size': 0, 52 | 'size unit': 'B', 53 | 'directory size': 0, 54 | 'min images/second': 0, 55 | 'average images/second': 0, 56 | 'min bandwidth': 0, 57 | 'average bandwidth': 0, 58 | 'bandwidth unit': 'bytes/second' 59 | } 60 | results = { 61 | '800x600 standard jpg': results_sub_dict.copy(), 62 | '3840x2160 standard jpg': results_sub_dict.copy(), 63 | '800x600 tfrecord': results_sub_dict.copy(), 64 | '3840x2160 tfrecord': results_sub_dict.copy() 65 | } 66 | 67 | image_size = re.findall('First image size from .*\n.*', log_contents) 68 | if len(image_size) != 4: 69 | raise ValueError('Error: Incomplete DALI file. Missing information on' 70 | ' file sizes') 71 | for line in image_size: 72 | sizes = re.findall(r'in bytes: \d+', line) 73 | if len(sizes) != 2: 74 | raise ValueError('Error: Missing data sizes in DALI log file.') 75 | image_size, directory_size = _clean_sizes(sizes) 76 | if '3840x2160' in line: 77 | results['3840x2160 standard jpg']['image size'] = image_size 78 | results['3840x2160 standard jpg']['directory size'] = \ 79 | directory_size 80 | results['3840x2160 tfrecord']['image size'] = image_size 81 | results['3840x2160 tfrecord']['directory size'] = directory_size 82 | elif '800x600' in line: 83 | results['800x600 standard jpg']['image size'] = image_size 84 | results['800x600 standard jpg']['directory size'] = directory_size 85 | results['800x600 tfrecord']['image size'] = image_size 86 | results['800x600 tfrecord']['directory size'] = directory_size 87 | return results 88 | 89 | 90 | def _average(input_list: list) -> float: 91 | """ 92 | Find the average of a list. 93 | 94 | Given a list of numbers, calculate the average of all values in the list. 95 | If the list is empty, default to 0.0. 96 | 97 | Parameters 98 | ---------- 99 | input_list : list 100 | A ``list`` of ``floats`` to find an average of. 101 | 102 | Returns 103 | ------- 104 | float 105 | Returns a ``float`` of the average value of the list. 106 | """ 107 | try: 108 | return float(sum(input_list) / len(input_list)) 109 | except ZeroDivisionError: 110 | return 0.0 111 | 112 | 113 | def _update_results(image_type_match: dict, results: list) -> dict: 114 | """ 115 | Update image dictionary with throughput and bandwidth. 116 | 117 | Find the minimum and average throughput and bandwdith for a particular 118 | image size and type by processing a list of all corresponding results. 119 | 120 | Parameters 121 | ---------- 122 | image_type_match : dict 123 | A ``dictionary`` of the throughput and bandwidth for a particular image 124 | size and type. 125 | results : list 126 | A ``list`` of ``floats`` representing results from the experiment runs. 127 | 128 | Returns 129 | ------- 130 | dict 131 | An updated ``dictionary`` of the throughput and bandwidth for a 132 | particular image size and type. 133 | """ 134 | size = image_type_match['image size'] 135 | image_type_match['min images/second'] = min(results) 136 | image_type_match['average images/second'] = _average(results) 137 | image_type_match['min bandwidth'] = size * min(results) 138 | image_type_match['average bandwidth'] = size * _average(results) 139 | return image_type_match 140 | 141 | 142 | def _result_parsing(log_contents: str, systems: int, image_results: dict, 143 | log_file: str) -> dict: 144 | """ 145 | Parse the throughput results from the log file. 
146 | 
147 |     Given a log file, find all of the results for each of the four test runs
148 |     including both standard JPEG and TFRecord formats for 800x600 and 4K
149 |     images. Each section starts with 'RUN 1/1' and runs for 11 epochs before
150 |     printing 'OK' once complete. The result sections are in a strict order,
151 |     allowing us to deterministically match results with the corresponding
152 |     image size and type:
153 |         0: 800x600 Standard File Read
154 |         1: 3840x2160 Standard File Read
155 |         2: 800x600 TFRecord
156 |         3: 3840x2160 TFRecord
157 | 
158 |     Parameters
159 |     ----------
160 |     log_contents : str
161 |         A ``string`` of the contents from a DALI log file.
162 |     systems : int
163 |         An ``integer`` of the number of systems used during the current test.
164 |     image_results : dict
165 |         A ``dictionary`` of image size information for all image sizes and
166 |         formats.
167 |     log_file : str
168 |         A ``string`` of the name of the log file being parsed.
169 | 
170 |     Returns
171 |     -------
172 |     dict
173 |         Returns an updated ``dictionary`` of image size information for all
174 |         image sizes and formats.
175 |     """
176 |     # The result sections are in a strict order, allowing us to
177 |     # deterministically match results with the corresponding image size and
178 |     # type:
179 |     # 0: 800x600 Standard File Read
180 |     # 1: 3840x2160 Standard File Read
181 |     # 2: 800x600 TFRecord
182 |     # 3: 3840x2160 TFRecord
183 |     image_type_match = [
184 |         image_results['800x600 standard jpg'],
185 |         image_results['3840x2160 standard jpg'],
186 |         image_results['800x600 tfrecord'],
187 |         image_results['3840x2160 tfrecord']
188 |     ]
189 | 
190 |     test_sections = re.findall(r'RUN 1/1.*?OK', log_contents, re.DOTALL)
191 |     if len(test_sections) != 4:
192 |         print(f'Warning: Invalid number of results found in {log_file} log '
193 |               'file. Skipping...')
194 |         return {}
195 | 
196 |     for num, section in enumerate(test_sections):
197 |         result_lines = re.findall('.*img/s', section)
198 |         all_speeds = []
199 | 
200 |         for line in result_lines:
201 |             speed = re.sub('.*speed: ', '', line)
202 |             speed = float(speed.replace(' [img/s', ''))
203 |             all_speeds.append(speed)
204 | 
205 |         # Per standard practices, the first N results for N systems are treated
206 |         # as a warmup and discarded. Occasionally, the timing of results will
207 |         # be off, and one node will showcase the 2nd test pass before all nodes
208 |         # have finished the first. To accommodate this, the lowest N results
209 |         # are assumed to be the first test pass and are dropped.
210 |         all_speeds = sorted(all_speeds)[systems:]
211 |         image_type_match[num] = _update_results(image_type_match[num],
212 |                                                 all_speeds)
213 | 
214 |     # Rebuild the dictionary based on the updated results.
215 |     image_results = {
216 |         '800x600 standard jpg': image_type_match[0],
217 |         '3840x2160 standard jpg': image_type_match[1],
218 |         '800x600 tfrecord': image_type_match[2],
219 |         '3840x2160 tfrecord': image_type_match[3]
220 |     }
221 |     return image_results
222 | 
223 | 
224 | def _combine_results(results: list, systems: int) -> dict:
225 |     """
226 |     Aggregate all results for N-systems.
227 | 
228 |     Find the average throughput, bandwidth, and size for all iterations
229 |     combined and create a single object which can be used to easily reference
230 |     results.
231 | 
232 |     Parameters
233 |     ----------
234 |     results : list
235 |         A ``list`` of ``dicts`` for all results from a particular test.
236 |     systems : int
237 |         An ``integer`` of the number of systems used during the current test.
238 | 239 | Returns 240 | ------- 241 | dict 242 | Returns a ``dictionary`` of the final aggregate results for all 243 | iterations for N-nodes for all image types and sizes. 244 | """ 245 | system_results = {} 246 | 247 | for image_type in ['800x600 standard jpg', 248 | '3840x2160 standard jpg', 249 | '800x600 tfrecord', 250 | '3840x2160 tfrecord']: 251 | avg_min_speed, avg_avg_speed = [], [] 252 | avg_min_bw, avg_avg_bw = [], [] 253 | avg_img_size, avg_dir_size = [], [] 254 | 255 | for result in results: 256 | if image_type not in result: 257 | continue 258 | avg_min_speed.append(result[image_type]['min images/second']) 259 | avg_avg_speed.append(result[image_type]['average images/second']) 260 | avg_min_bw.append(result[image_type]['min bandwidth']) 261 | avg_avg_bw.append(result[image_type]['average bandwidth']) 262 | avg_img_size.append(result[image_type]['image size']) 263 | avg_dir_size.append(result[image_type]['directory size']) 264 | 265 | # Multiply the average in all performance categories by the number of 266 | # systems tested to get an average aggregate throughput result for the 267 | # cluster. 268 | system_results[image_type] = { 269 | 'image size': _average(avg_img_size), 270 | 'size unit': 'B', 271 | 'directory size': _average(avg_dir_size), 272 | 'min images/second': _average(avg_min_speed) * systems, 273 | 'average images/second': _average(avg_avg_speed) * systems, 274 | 'min bandwidth': _average(avg_min_bw) * systems, 275 | 'average bandwidth': _average(avg_avg_bw) * systems, 276 | 'bandwidth unit': 'bytes/second' 277 | } 278 | return system_results 279 | 280 | 281 | def parse_dali_file(log_files: list, systems: int, results_dict: dict) -> dict: 282 | """ 283 | Parse the aggregate DALI results for N-systems. 284 | 285 | Search through each DALI log for N-systems and find the minimum and average 286 | throughput and bandwidth for all four of the DALI tests of various image 287 | sizes and formats. 288 | 289 | Parameters 290 | ---------- 291 | log_files : list 292 | A ``list`` of ``strings`` where each element is a filepath to a log 293 | file. 294 | systems : int 295 | An ``integer`` of the current number of systems to aggregate results 296 | for. 297 | results_dict : dict 298 | A ``dictionary`` of the aggregate test results for all system counts. 299 | 300 | Returns 301 | ------- 302 | dict 303 | An updated ``dictionary`` of the aggregate test results including the 304 | newly-parsed results for N-systems. 
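    A hypothetical invocation for a two-system sweep (the log file name is a
    placeholder following the naming convention used elsewhere in Bobber)::

        results = parse_dali_file(
            ['logs/dali_iteration_1_systems_2_version_5.0.0.log'],
            systems=2,
            results_dict={})
        # results[2]['800x600 standard jpg']['average images/second'], etc.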
305 | """ 306 | results = [] 307 | 308 | for log in log_files: 309 | with open(log, 'r') as f: 310 | log_contents = f.read() 311 | image_results = _size_parsing(log_contents) 312 | results.append(_result_parsing(log_contents, 313 | systems, 314 | image_results, 315 | log)) 316 | results_dict[systems] = _combine_results(results, systems) 317 | return results_dict 318 | -------------------------------------------------------------------------------- /bobber/lib/analysis/table.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import numpy as np 3 | import operator 4 | from bobber.lib.analysis.common import bcolors 5 | from tabulate import tabulate 6 | from typing import NoReturn, Tuple 7 | 8 | 9 | FIO_READ_BW = f'{bcolors.BOLD}FIO Read (GB/s) - 1MB BS{bcolors.ENDC}' 10 | FIO_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 1MB BS{bcolors.ENDC}' 11 | FIO_READ_IOP = f'{bcolors.BOLD}FIO Read (k IOPS) - 4K BS{bcolors.ENDC}' 12 | FIO_WRITE_IOP = f'{bcolors.BOLD}FIO Write (k IOPS) - 4K BS{bcolors.ENDC}' 13 | FIO_125K_READ_BW = f'{bcolors.BOLD}FIO Read (GB/s) - 125K BS{bcolors.ENDC}' 14 | FIO_125K_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 125K BS{bcolors.ENDC}' 15 | NCCL = f'{bcolors.BOLD}NCCL Max BW (GB/s){bcolors.ENDC}' 16 | DALI_IMG_SM = (f'{bcolors.BOLD}DALI Standard 800x600 throughput ' 17 | f'(images/second){bcolors.ENDC}') 18 | DALI_IMG_SM_BW = (f'{bcolors.BOLD}DALI Standard 800x600 bandwidth ' 19 | f'(GB/s){bcolors.ENDC}') 20 | DALI_IMG_LG = (f'{bcolors.BOLD}DALI Standard 3840x2160 throughput ' 21 | f'(images/second){bcolors.ENDC}') 22 | DALI_IMG_LG_BW = (f'{bcolors.BOLD}DALI Standard 3840x2160 bandwidth ' 23 | f'(GB/s){bcolors.ENDC}') 24 | DALI_TF_SM = (f'{bcolors.BOLD}DALI TFRecord 800x600 throughput ' 25 | f'(images/second){bcolors.ENDC}') 26 | DALI_TF_SM_BW = (f'{bcolors.BOLD}DALI TFRecord 800x600 bandwidth ' 27 | f'(GB/s){bcolors.ENDC}') 28 | DALI_TF_LG = (f'{bcolors.BOLD}DALI TFRecord 3840x2160 throughput ' 29 | f'(images/second){bcolors.ENDC}') 30 | DALI_TF_LG_BW = (f'{bcolors.BOLD}DALI TFRecord 3840x2160 bandwidth ' 31 | f'(GB/s){bcolors.ENDC}') 32 | 33 | 34 | def bytes_to_gb(number: float) -> float: 35 | """ 36 | Convert bytes to gigabytes. 37 | 38 | Parameters 39 | ---------- 40 | number : float 41 | A ``float`` in bytes. 42 | 43 | Returns 44 | ------- 45 | float 46 | Returns a ``float`` of the number in gigabytes. 47 | """ 48 | return round(number * 1e-9, 3) 49 | 50 | 51 | def iops_to_kiops(number: float) -> float: 52 | """ 53 | Convert iops to k-iops. 54 | 55 | Parameters 56 | ---------- 57 | number : float 58 | A ``float`` in iops. 59 | 60 | Returns 61 | ------- 62 | float 63 | Returns a ``float`` of the number in k-iops. 64 | """ 65 | return round(number * 1e-3, 3) 66 | 67 | 68 | def scale(values: list) -> float: 69 | """ 70 | Calculate the scaling factor of results. 71 | 72 | Calculate the scale by determining the slope of the line of best fit and 73 | dividing by the first value in the results, plus 1. 74 | 75 | Parameters 76 | ---------- 77 | values : list 78 | A ``list`` of ``floats`` to calculate the scale factor for. 79 | 80 | Returns 81 | ------- 82 | float 83 | Returns a ``float`` of the scaling factor. 84 | """ 85 | x = np.array(range(1, len(values) + 1)) 86 | y = np.array(values) 87 | slope, _ = np.polyfit(x, y, 1) 88 | return slope / values[0] + 1.0 89 | 90 | 91 | def fio_bw(results: list) -> Tuple[list, list]: 92 | """ 93 | Save the FIO bandwidth read and write results. 
94 | 95 | Save the read and write results from the FIO bandwidth tests on an 96 | increasing per-system basis with the first element in the list being the 97 | column header. 98 | 99 | Parameters 100 | ---------- 101 | results : list 102 | A ``list`` of ``dictionaries`` containing all results from the tests. 103 | 104 | Returns 105 | ------- 106 | tuple 107 | Returns a ``tuple`` of (``list``, ``list``) containing the read and 108 | write bandwidth results, respectively. 109 | """ 110 | try: 111 | read = [FIO_READ_BW] + [bytes_to_gb(result[1]['bandwidth']['read']) 112 | for result in results] 113 | write = [FIO_WRITE_BW] + [bytes_to_gb(result[1]['bandwidth']['write']) 114 | for result in results] 115 | except KeyError: 116 | return [] 117 | else: 118 | return [read, write] 119 | 120 | 121 | def fio_iops(results: list) -> Tuple[list, list]: 122 | """ 123 | Save the FIO IOPS read and write results. 124 | 125 | Save the read and write results from the FIO IOPS tests on an increasing 126 | per-system basis with the first element in the list being the column 127 | header. 128 | 129 | Parameters 130 | ---------- 131 | results : list 132 | A ``list`` of ``dictionaries`` containing all results from the tests. 133 | 134 | Returns 135 | ------- 136 | tuple 137 | Returns a ``tuple`` of (``list``, ``list``) containing the read and 138 | write IOPS results, respectively. 139 | """ 140 | try: 141 | read = [FIO_READ_IOP] + [iops_to_kiops(result[1]['iops']['read']) 142 | for result in results] 143 | write = [FIO_WRITE_IOP] + [iops_to_kiops(result[1]['iops']['write']) 144 | for result in results] 145 | except KeyError: 146 | return [] 147 | else: 148 | return [read, write] 149 | 150 | 151 | def fio_125k_bw(results: list) -> Tuple[list, list]: 152 | """ 153 | Save the FIO 125k bandwidth read and write results. 154 | 155 | Save the read and write results from the FIO 125k bandwidth tests on an 156 | increasing per-system basis with the first element in the list being the 157 | column header. 158 | 159 | Parameters 160 | ---------- 161 | results : list 162 | A ``list`` of ``dictionaries`` containing all results from the tests. 163 | 164 | Returns 165 | ------- 166 | tuple 167 | Returns a ``tuple`` of (``list``, ``list``) containing the read and 168 | write 125k bandwidth results, respectively. 169 | """ 170 | try: 171 | read = [FIO_125K_READ_BW] + [bytes_to_gb(result[1]['125k_bandwidth'] 172 | ['read']) 173 | for result in results] 174 | write = [FIO_125K_WRITE_BW] + [bytes_to_gb(result[1]['125k_bandwidth'] 175 | ['write']) 176 | for result in results] 177 | except KeyError: 178 | return [] 179 | else: 180 | return [read, write] 181 | 182 | 183 | def nccl(results: list) -> list: 184 | """ 185 | Save the NCCL results. 186 | 187 | Save the maximum bus bandwidth results from the NCCL tests on an increasing 188 | per-system basis with the first element in the list being the column 189 | header. 190 | 191 | Parameters 192 | ---------- 193 | results : list 194 | A ``list`` of dictionaries containing all results from the tests. 195 | 196 | Returns 197 | ------- 198 | list 199 | Returns a ``list`` of the NCCL max bus bandwidth results. 200 | """ 201 | try: 202 | nccl = [NCCL] + [round(result[1]['nccl']['max_bus_bw'], 3) 203 | for result in results] 204 | except KeyError: 205 | return [] 206 | else: 207 | return [nccl] 208 | 209 | 210 | def dali(results: list) -> Tuple[list, list, list, list, list, list, list, 211 | list]: 212 | """ 213 | Save the DALI results. 
214 | 215 | Save the throughput and bandwidth results from the DALI tests on an 216 | increasing per-system basis with the first element in the list being the 217 | column header. 218 | 219 | Parameters 220 | ---------- 221 | results : list 222 | A ``list`` of dictionaries containing all results from the tests. 223 | 224 | Returns 225 | ------- 226 | tuple 227 | Returns a ``tuple`` of eight ``lists`` containing the throughput 228 | followed by bandwidth for small and large standard images, then small 229 | and large TFRecords. 230 | """ 231 | try: 232 | img_sm = [DALI_IMG_SM] + [result[1]['dali']['800x600 standard jpg'] 233 | ['average images/second'] 234 | for result in results] 235 | img_sm_bw = [DALI_IMG_SM_BW] + [bytes_to_gb(result[1]['dali'] 236 | ['800x600 standard jpg'] 237 | ['average bandwidth']) 238 | for result in results] 239 | img_lg = [DALI_IMG_LG] + [result[1]['dali']['3840x2160 standard jpg'] 240 | ['average images/second'] 241 | for result in results] 242 | img_lg_bw = [DALI_IMG_LG_BW] + [bytes_to_gb(result[1]['dali'] 243 | ['3840x2160 standard jpg'] 244 | ['average bandwidth']) 245 | for result in results] 246 | tf_sm = [DALI_TF_SM] + [result[1]['dali']['800x600 tfrecord'] 247 | ['average images/second'] 248 | for result in results] 249 | tf_sm_bw = [DALI_TF_SM_BW] + [bytes_to_gb(result[1]['dali'] 250 | ['800x600 tfrecord'] 251 | ['average bandwidth']) 252 | for result in results] 253 | tf_lg = [DALI_TF_LG] + [result[1]['dali']['3840x2160 tfrecord'] 254 | ['average images/second'] 255 | for result in results] 256 | tf_lg_bw = [DALI_TF_LG_BW] + [bytes_to_gb(result[1]['dali'][ 257 | '3840x2160 tfrecord'] 258 | ['average bandwidth']) 259 | for result in results] 260 | except KeyError: 261 | return [] 262 | else: 263 | return [img_sm, img_sm_bw, img_lg, img_lg_bw, tf_sm, tf_sm_bw, tf_lg, 264 | tf_lg_bw] 265 | 266 | 267 | def add_scale(data: list) -> NoReturn: 268 | """ 269 | Add the scaling factor to results. 270 | 271 | Iterate through all results and append the scaling factor to each of the 272 | categories, if applicable. Results that have a scaling factor greater than 273 | 1.9x are marked GREEN, results greater than 1.5 are marked YELLOW, and all 274 | other results are RED. 275 | 276 | Parameters 277 | ---------- 278 | data : list 279 | A ``list`` of ``lists`` of all categories of results. 280 | """ 281 | for subset in data: 282 | # No results in the data - just the test category name 283 | if len(subset) < 2: 284 | continue 285 | # Scaling can't be calculated for NCCL as it has a different behavior 286 | # from other tests. For single-node only tests, there is nothing to 287 | # measure for scaling. Both scenarios should be ignored for calculating 288 | # scale factor. 289 | if 'nccl' in subset[0].lower() or len(subset) == 2: 290 | subset += ['N/A'] 291 | continue 292 | values = subset[1:] 293 | scale_val = round(scale(values), 2) 294 | if scale_val > 1.9: 295 | scale_text = f'{bcolors.PASS}{scale_val}X{bcolors.ENDC}' 296 | elif scale_val > 1.5: 297 | scale_text = f'{bcolors.WARNING}{scale_val}X{bcolors.ENDC}' 298 | else: 299 | scale_text = f'{bcolors.FAIL}{scale_val}X{bcolors.ENDC}' 300 | subset += [scale_text] 301 | 302 | 303 | def display_table(json_results: dict) -> NoReturn: 304 | """ 305 | Display results in tabular format. 306 | 307 | Find the results on a per-system basis for all categories and display the 308 | resulting scaling factor. 
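    The scale column comes from ``scale`` above, which fits a line to the
    per-node values and normalises the slope by the single-node result; for
    instance (made-up numbers), perfectly linear results double with every
    added node and report a 2.0X factor::

        round(scale([10.0, 20.0, 30.0]), 2)   # -> 2.0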
309 | 
310 |     Parameters
311 |     ----------
312 |     json_results : dict
313 |         A ``dictionary`` of the final results that have been parsed from the
314 |         results directory.
315 |     """
316 |     data = []
317 |     headers = [f'{bcolors.BOLD}Test{bcolors.ENDC}'] + \
318 |               [f'{bcolors.BOLD}{num} Node(s){bcolors.ENDC}'
319 |                for num in sorted(json_results['systems'])] + \
320 |               [f'{bcolors.BOLD}Scale{bcolors.ENDC}']
321 |     results = sorted(json_results['systems'].items())
322 | 
323 |     data += fio_bw(results)
324 |     data += fio_iops(results)
325 |     data += fio_125k_bw(results)
326 |     data += nccl(results)
327 |     data += dali(results)
328 | 
329 |     add_scale(data)
330 | 
331 |     print(tabulate(data, headers=headers, tablefmt='grid', numalign='right'))
332 |     print()
333 | 
--------------------------------------------------------------------------------
/bobber/lib/docker/management.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: MIT
2 | import docker
3 | import os
4 | import sys
5 | from bobber.__version__ import __version__ as version
6 | from bobber.lib.exit_codes import (CONTAINER_NOT_RUNNING,
7 |                                    CONTAINER_VERSION_MISMATCH,
8 |                                    DOCKER_BUILD_FAILURE,
9 |                                    DOCKER_COMMUNICATION_ERROR,
10 |                                    NVIDIA_RUNTIME_ERROR)
11 | from bobber.lib.system.file_handler import update_log
12 | from docker.models.containers import Container
13 | from typing import NoReturn, Optional
14 | 
15 | 
16 | class DockerManager:
17 |     """
18 |     Build, launch, and execute commands for Docker containers.
19 | 
20 |     The DockerManager provides a single interface accessible from the entire
21 |     Bobber package through which to communicate with Docker containers. The
22 |     class provides the ability to build new images from the provided
23 |     Dockerfile, launch the container with necessary settings for tests, and
24 |     execute commands inside the launched container to run tests. An instance
25 |     of this class is created in the bobber.lib.docker.__init__.py module and
26 |     can be accessed from other modules without re-instantiating the class.
27 |     """
28 |     def __init__(self) -> NoReturn:
29 |         try:
30 |             self.client = docker.from_env()
31 |             self.cli = docker.APIClient(timeout=600)
32 |         except docker.errors.DockerException as e:
33 |             if 'error while fetching server api version' in str(e).lower():
34 |                 print('Error: Could not communicate with the Docker daemon.')
35 |                 print('Ensure Docker is running with "systemctl start docker"')
36 |                 sys.exit(DOCKER_COMMUNICATION_ERROR)
37 | 
38 |     def _build_if_not_built(self, tag: str, bobber_version: str) -> NoReturn:
39 |         """
40 |         Build the image if not built already.
41 | 
42 |         Check if an image exists for the local version of Bobber. If not, build
43 |         the image immediately.
44 | 
45 |         Parameters
46 |         ----------
47 |         tag : string
48 |             A ``string`` of the Bobber image name, such as
49 |             'nvidia/bobber:5.0.0'.
50 |         bobber_version : string
51 |             A ``string`` of the local version of Bobber, such as '5.0.0'.
52 |         """
53 |         try:
54 |             self.client.images.get(tag)
55 |         except docker.errors.ImageNotFound:
56 |             print(f'Image {tag} not built, building now...')
57 |             self.build(bobber_version)
58 | 
59 |     def get_tag(self, bobber_version: str) -> str:
60 |         """
61 |         Create the image name.
62 | 
63 |         Build the full image name including the tag, such as
64 |         'nvidia/bobber:5.0.0'.
65 | 
66 |         Parameters
67 |         ----------
68 |         bobber_version : string
69 |             A ``string`` of the local version of Bobber, such as '5.0.0'.
70 | 71 | Returns 72 | ------- 73 | str 74 | Returns a ``string`` of the full image name plus tag, such as 75 | 'nvidia/bobber:5.0.0'. 76 | """ 77 | return f'nvidia/bobber:{bobber_version}' 78 | 79 | def cast(self, storage_path: str, ignore_gpu: bool, 80 | bobber_version: str) -> NoReturn: 81 | """ 82 | Launch a container with necessary settings. 83 | 84 | Launch a Bobber image with various settings required to initiate the 85 | testing framework, including attaching GPUs, starting an SSH daemon, 86 | setting the container to privileged mode, and attaching a filesystem 87 | to be accessible inside the container. 88 | 89 | The launched container will be based off of the Bobber image for the 90 | current version of the application. If the image does not yet exist, 91 | it will be built automatically. The launched container is named 92 | 'bobber'. 93 | 94 | Parameters 95 | ---------- 96 | storage_path : string 97 | A ``string`` of the absolute path to the storage location to test 98 | against, such as `/mnt/storage`. 99 | ignore_gpu : boolean 100 | When `True`, launches the container without GPU resources. Defaults 101 | to `False`. 102 | bobber_version : string 103 | A ``string`` of the local version of Bobber, such as '5.0.0'. 104 | """ 105 | tag = self.get_tag(bobber_version) 106 | self._build_if_not_built(tag, bobber_version) 107 | runtime = None 108 | if not ignore_gpu: 109 | runtime = 'nvidia' 110 | try: 111 | self.client.containers.run( 112 | tag, 113 | 'bash -c "/usr/sbin/sshd; sleep infinity"', 114 | detach=True, 115 | auto_remove=True, 116 | ipc_mode='host', 117 | name='bobber', 118 | network_mode='host', 119 | privileged=True, 120 | shm_size='1G', 121 | runtime=runtime, 122 | ulimits=[ 123 | docker.types.Ulimit(name='memlock', 124 | soft=-1, 125 | hard=-1), 126 | docker.types.Ulimit(name='stack', 127 | soft=67108864, 128 | hard=67108864) 129 | ], 130 | volumes={ 131 | f'{storage_path}': { 132 | 'bind': '/mnt/fs_under_test', 133 | 'mode': 'rw' 134 | } 135 | } 136 | ) 137 | except docker.errors.APIError as e: 138 | if 'Unknown runtime specified nvidia' in str(e): 139 | print('NVIDIA container runtime not found. Ensure the latest ' 140 | 'nvidia-docker libraries and NVIDIA drivers are ' 141 | 'installed.') 142 | sys.exit(NVIDIA_RUNTIME_ERROR) 143 | 144 | def export(self, bobber_version: str) -> NoReturn: 145 | """ 146 | Save an image as a tarball. 147 | 148 | To make it easy to transfer an image to multiple machines, the image 149 | can be saved as a tarball which can be copied directly to a remote 150 | device. On the other device, run the "load" command to load the copied 151 | tarball. 152 | 153 | Parameters 154 | ---------- 155 | bobber_version : string 156 | A ``string`` of the local version of Bobber, such as '5.0.0'. 157 | """ 158 | tag = self.get_tag(bobber_version) 159 | self._build_if_not_built(tag, bobber_version) 160 | filename = tag.replace('/', '_').replace(':', '_') 161 | print(f'Exporting {tag} to "{filename}.tar". This may take a while...') 162 | image = self.cli.get_image(tag) 163 | with open(f'{filename}.tar', 'wb') as image_file: 164 | for chunk in image: 165 | image_file.write(chunk) 166 | print(f'{tag} saved to {filename}.tar') 167 | 168 | def build(self, bobber_version: str) -> NoReturn: 169 | """ 170 | Build the image on the Dockerfile. 171 | 172 | Build a new image based on the Dockerfile named 173 | 'nvidia/bobber:{version}'. 
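    A hypothetical call through the shared manager instance (the version
    string is only an example)::

        from bobber.lib.docker import manager
        manager.build('5.0.0')   # builds and tags nvidia/bobber:5.0.0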
174 | 175 | Parameters 176 | ---------- 177 | bobber_version : string 178 | A ``string`` of the local version of Bobber, such as '5.0.0'. 179 | """ 180 | tag = self.get_tag(bobber_version) 181 | print('Building a new image. This may take a while...') 182 | # Set the path to the repository's parent directory. 183 | path = os.path.dirname(os.path.abspath(__file__)) 184 | path = '/'.join(path.split('/')[:-2]) 185 | output = self.cli.build(path=path, 186 | dockerfile='lib/docker/Dockerfile', 187 | tag=tag, 188 | decode=True) 189 | for line in output: 190 | if 'error' in line.keys(): 191 | print(line['error'].rstrip()) 192 | print(f'{tag} build failed. See error above.') 193 | sys.exit(DOCKER_BUILD_FAILURE) 194 | if 'stream' in line.keys() and line['stream'].strip() != '': 195 | print(line['stream'].rstrip()) 196 | print(f'{tag} successfully built') 197 | 198 | def load(self, filename: str) -> NoReturn: 199 | """ 200 | Load a Docker image from a tarball. 201 | 202 | If a Bobber image was saved as a tarball using the "export" command, it 203 | can be loaded on the system using the "load" command. 204 | 205 | Parameters 206 | ---------- 207 | filename : string 208 | A ``string`` of the filename for the local tarball to load, such as 209 | './nvidia_bobber_5.0.0.tar'. 210 | """ 211 | print(f'Importing {filename}. This may take a while...') 212 | with open(filename, 'rb') as image_file: 213 | self.client.images.load(image_file) 214 | 215 | def execute(self, command: str, environment: Optional[dict] = None, 216 | log_file: Optional[str] = None) -> NoReturn: 217 | """ 218 | Execute a command against the running container. 219 | 220 | Assuming the Bobber container is already launched from the "cast" 221 | command, execute a specific command and stream the output to the 222 | terminal. Optionally specify a dictionary with any necessary 223 | environment variables and a log file to save the output to. 224 | 225 | Parameters 226 | ---------- 227 | command : string 228 | A ``string`` of the command to run inside the container. 229 | environment : dict (Optional) 230 | A ``dictionary`` of environment variables to use where the keys are 231 | the name of the variable and the values are the corresponding value 232 | to set. 233 | log_file : string (Optional) 234 | A ``string`` of the path and filename to optionally save output to. 235 | """ 236 | if not self.running: 237 | print('Bobber container not running. 
Launch a container with ' 238 | '"bobber cast" prior to running any tests.') 239 | sys.exit(CONTAINER_NOT_RUNNING) 240 | bobber = self.client.containers.get('bobber') 241 | if not self.version_match(bobber): 242 | print('Bobber container version mismatch.') 243 | print('Kill the running Bobber container with "docker kill bobber"' 244 | ' and re-cast a new container with "bobber cast" prior to ' 245 | 'running any tests.') 246 | sys.exit(CONTAINER_VERSION_MISMATCH) 247 | result = bobber.exec_run( 248 | command, 249 | demux=False, 250 | environment=environment, 251 | stream=True 252 | ) 253 | # Continually print STDOUT and STDERR until there is nothing left 254 | while True: 255 | try: 256 | output = next(result.output).decode('ascii') 257 | print(output.rstrip()) 258 | if log_file: 259 | update_log(log_file, output) 260 | # Usually only happens for terminating characters at the end of 261 | # streams 262 | except UnicodeDecodeError: 263 | print(result.output) 264 | except StopIteration: 265 | break 266 | 267 | def version_match(self, container: Container) -> bool: 268 | """ 269 | Determine if the Bobber container version matches the application. 270 | 271 | The running Bobber container version must match the local Bobber 272 | application version to ensure all tests will function properly. 273 | 274 | Parameters 275 | ---------- 276 | container : Container 277 | A ``Container`` object representing the running Bobber image. 278 | 279 | Returns 280 | ------- 281 | bool 282 | Returns `True` when the versions match and `False` when not. 283 | """ 284 | if f'nvidia/bobber:{version}' not in container.image.tags: 285 | return False 286 | return True 287 | 288 | @property 289 | def running(self) -> bool: 290 | """ 291 | Determine if the Bobber container is running on the system. 292 | 293 | Check to see if the current version of the Bobber container is running 294 | on the local machine and return the status. This method can be used to 295 | determine whether or not to run a command that depends on the container 296 | being launched. 297 | 298 | Returns 299 | ------- 300 | bool 301 | Returns `True` when the container is running and `False` when not. 302 | """ 303 | try: 304 | bobber = self.client.containers.get('bobber') 305 | except docker.errors.NotFound: 306 | return False 307 | else: 308 | return True 309 | -------------------------------------------------------------------------------- /bobber/lib/tests/run_tests.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import os 3 | from argparse import Namespace 4 | from bobber.lib.constants import ( 5 | RUN_ALL, 6 | RUN_DALI, 7 | RUN_NCCL, 8 | RUN_STG_BW, 9 | RUN_STG_IOPS, 10 | RUN_STG_125K, 11 | RUN_STG_META 12 | ) 13 | from bobber.lib.docker import manager 14 | from time import sleep 15 | from typing import NoReturn 16 | 17 | 18 | def run_dali(args: Namespace, bobber_version: str, iteration: int, 19 | hosts: str) -> NoReturn: 20 | """ 21 | Run single or multi-node DALI tests. 22 | 23 | Run a single or multi-node DALI test which reads random image data in from 24 | designated storage and loads it onto local resources after preprocessing 25 | that is typically done for ResNet50 pipelines. 26 | 27 | Parameters 28 | ---------- 29 | args : Namespace 30 | A ``Namespace`` of all settings specified by the user for the test. 31 | bobber_version : string 32 | A ``string`` of the local version of Bobber, such as '5.0.0'. 
33 | iteration : int 34 | An ``int`` of the local test number, starting at 1. 35 | hosts : string 36 | A comma-separated list of hostnames to test against, such as 37 | 'host1,host2,host3,host4'. 38 | """ 39 | dali_log = os.path.join(args.log_path, 40 | f'dali_iteration_{iteration}_' 41 | f'gpus_{args.gpus}_' 42 | f'batch_size_lg_{args.batch_size_lg}_' 43 | f'batch_size_sm_{args.batch_size_sm}_' 44 | f'systems_{len(hosts.split(","))}_' 45 | f'version_{bobber_version}.log') 46 | environment = { 47 | 'BATCH_SIZE_LG': args.batch_size_lg, 48 | 'BATCH_SIZE_SM': args.batch_size_sm, 49 | 'GPUS': args.gpus, 50 | 'HOSTS': hosts, 51 | 'SSH_IFACE': args.ssh_iface 52 | } 53 | manager.execute('tests/dali_multi.sh', 54 | environment=environment, 55 | log_file=dali_log) 56 | 57 | if args.pause > 0: 58 | sleep(args.pause) 59 | 60 | 61 | def run_stg_bw(args: Namespace, bobber_version: str, iteration: int, 62 | hosts: str) -> NoReturn: 63 | """ 64 | Run single or multi-node storage bandwidth tests with FIO. 65 | 66 | Run a single or multi-node storage bandwidth test with FIO which first 67 | writes data to the filesystem with 1MB block size and 4GB file size, 68 | followed by reading the data back. 69 | 70 | Parameters 71 | ---------- 72 | args : Namespace 73 | A ``Namespace`` of all settings specified by the user for the test. 74 | bobber_version : string 75 | A ``string`` of the local version of Bobber, such as '5.0.0'. 76 | iteration : int 77 | An ``int`` of the local test number, starting at 1. 78 | hosts : string 79 | A comma-separated list of hostnames to test against, such as 80 | 'host1,host2,host3,host4'. 81 | """ 82 | stg_bw_log = os.path.join(args.log_path, 83 | f'stg_bw_iteration_{iteration}_' 84 | f'threads_{args.bw_threads}_' 85 | f'direct_{args.direct}_' 86 | f'depth_{args.io_depth}_' 87 | f'read_pattern_{args.read_pattern}_' 88 | f'write_pattern_{args.write_pattern}_' 89 | f'systems_{len(hosts.split(","))}_' 90 | f'version_{bobber_version}.log') 91 | environment = { 92 | 'EXTRA_FLAGS': args.stg_extra_flags, 93 | 'IO_DEPTH': args.io_depth, 94 | 'DIRECTIO': args.direct, 95 | 'THREADS': args.bw_threads, 96 | 'READ_PATTERN': args.read_pattern, 97 | 'WRITE_PATTERN': args.write_pattern, 98 | 'HOSTS': hosts 99 | } 100 | manager.execute('tests/fio_multi.sh', 101 | environment=environment, 102 | log_file=stg_bw_log) 103 | 104 | if args.pause > 0: 105 | sleep(args.pause) 106 | 107 | 108 | def run_stg_125k(args: Namespace, bobber_version: str, iteration: int, 109 | hosts: str) -> NoReturn: 110 | """ 111 | Run single or multi-node storage 125KB IO size tests with FIO. 112 | 113 | Run a single or multi-node storage bandwidth test with FIO which first 114 | writes data to the filesystem with 125KB block size and 4GB file size, 115 | followed by reading the data back. 116 | 117 | Parameters 118 | ---------- 119 | args : Namespace 120 | A ``Namespace`` of all settings specified by the user for the test. 121 | bobber_version : string 122 | A ``string`` of the local version of Bobber, such as '5.0.0'. 123 | iteration : int 124 | An ``int`` of the local test number, starting at 1. 125 | hosts : string 126 | A comma-separated list of hostnames to test against, such as 127 | 'host1,host2,host3,host4'. 
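    The resulting log file name encodes the key parameters, for example
    (hypothetical values)::

        stg_125k_iteration_1_threads_16_direct_1_depth_16_systems_2_version_5.0.0.log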
128 |     """
129 |     stg_125k_log = os.path.join(args.log_path,
130 |                                 f'stg_125k_iteration_{iteration}_'
131 |                                 f'threads_{args.stg_125k_threads}_'
132 |                                 f'direct_{args.direct}_'
133 |                                 f'depth_{args.io_depth}_'
134 |                                 f'systems_{len(hosts.split(","))}_'
135 |                                 f'version_{bobber_version}.log')
136 |     environment = {
137 |         'EXTRA_FLAGS': args.stg_extra_flags,
138 |         'IO_DEPTH': args.io_depth,
139 |         'IOSIZE': 125,
140 |         'DIRECTIO': args.direct,
141 |         'THREADS': args.stg_125k_threads,
142 |         'READ_PATTERN': args.read_pattern,
143 |         'WRITE_PATTERN': args.write_pattern,
144 |         'HOSTS': hosts
145 |     }
146 |     manager.execute('tests/fio_multi.sh',
147 |                     environment=environment,
148 |                     log_file=stg_125k_log)
149 | 
150 |     if args.pause > 0:
151 |         sleep(args.pause)
152 | 
153 | 
154 | def run_stg_iops(args: Namespace, bobber_version: str, iteration: int,
155 |                  hosts: str) -> NoReturn:
156 |     """
157 |     Run single or multi-node storage IOPS tests with FIO.
158 | 
159 |     Run a single or multi-node storage IOPS test with FIO which first writes
160 |     data to the filesystem with 4KB block size and 4GB file size, followed by
161 |     reading the data back.
162 | 
163 |     Parameters
164 |     ----------
165 |     args : Namespace
166 |         A ``Namespace`` of all settings specified by the user for the test.
167 |     bobber_version : string
168 |         A ``string`` of the local version of Bobber, such as '5.0.0'.
169 |     iteration : int
170 |         An ``int`` of the local test number, starting at 1.
171 |     hosts : string
172 |         A comma-separated list of hostnames to test against, such as
173 |         'host1,host2,host3,host4'.
174 |     """
175 |     stg_iops_log = os.path.join(args.log_path,
176 |                                 f'stg_iops_iteration_{iteration}_'
177 |                                 f'threads_{args.iops_threads}_'
178 |                                 f'direct_{args.direct}_'
179 |                                 f'depth_{args.io_depth}_'
180 |                                 f'read_pattern_{args.read_pattern}_'
181 |                                 f'write_pattern_{args.write_pattern}_'
182 |                                 f'systems_{len(hosts.split(","))}_'
183 |                                 f'version_{bobber_version}.log')
184 |     environment = {
185 |         'EXTRA_FLAGS': args.stg_extra_flags,
186 |         'IO_DEPTH': args.io_depth,
187 |         'DIRECTIO': args.direct,
188 |         'THREADS': args.iops_threads,
189 |         'IOSIZE': 4,
190 |         'READ_PATTERN': args.read_pattern,
191 |         'WRITE_PATTERN': args.write_pattern,
192 |         'HOSTS': hosts
193 |     }
194 |     manager.execute('tests/fio_multi.sh',
195 |                     environment=environment,
196 |                     log_file=stg_iops_log)
197 | 
198 |     if args.pause > 0:
199 |         sleep(args.pause)
200 | 
201 | 
202 | def run_stg_meta(args: Namespace, bobber_version: str, iteration: int,
203 |                  hosts: str) -> NoReturn:
204 |     """
205 |     Run single or multi-node storage metadata test with mdtest.
206 | 
207 |     Run a single or multi-node storage metadata test with mdtest, which
208 |     exercises metadata operation performance for the filesystem.
209 | 
210 |     Parameters
211 |     ----------
212 |     args : Namespace
213 |         A ``Namespace`` of all settings specified by the user for the test.
214 |     bobber_version : string
215 |         A ``string`` of the local version of Bobber, such as '5.0.0'.
216 |     iteration : int
217 |         An ``int`` of the local test number, starting at 1.
218 |     hosts : string
219 |         A comma-separated list of hostnames to test against, such as
220 |         'host1,host2,host3,host4'.
221 | """ 222 | stg_meta_log = os.path.join(args.log_path, 223 | f'stg_meta_iteration_{iteration}_' 224 | f'systems_{len(hosts.split(","))}_' 225 | f'version_{bobber_version}.log') 226 | environment = { 227 | 'HOSTS': hosts, 228 | 'SSH_IFACE': args.ssh_iface, 229 | 'NCCL_IB_HCAS': args.nccl_ib_hcas 230 | } 231 | manager.execute('tests/mdtest_multi.sh', 232 | environment=environment, 233 | log_file=stg_meta_log) 234 | 235 | if args.pause > 0: 236 | sleep(args.pause) 237 | 238 | 239 | def run_nccl(args: Namespace, bobber_version: str, iteration: int, 240 | hosts: str) -> NoReturn: 241 | """ 242 | Run single or multi-node NCCL test. 243 | 244 | Run a single or multi-node NCCL test which verifies network and GPU 245 | performance and communication. 246 | 247 | Parameters 248 | ---------- 249 | args : Namespace 250 | A ``Namespace`` of all settings specified by the user for the test. 251 | bobber_version : string 252 | A ``string`` of the local version of Bobber, such as '5.0.0'. 253 | iteration : int 254 | An ``int`` of the local test number, starting at 1. 255 | hosts : string 256 | A comma-separated list of hostnames to test against, such as 257 | 'host1,host2,host3,host4'. 258 | """ 259 | nccl_log = os.path.join(args.log_path, 260 | f'nccl_iteration_{iteration}_' 261 | f'gpus_{args.gpus}_' 262 | f'nccl_max_{args.nccl_max}_' 263 | f'gid_{args.compute_gid}_' 264 | f'nccl_tc_{args.nccl_tc}_' 265 | f'systems_{len(hosts.split(","))}_' 266 | f'version_{bobber_version}.log') 267 | environment = { 268 | 'GPUS': args.gpus, 269 | 'NCCL_MAX': args.nccl_max, 270 | 'NCCL_TC': args.nccl_tc, 271 | 'COMPUTE_GID': args.compute_gid, 272 | 'HOSTS': hosts, 273 | 'SSH_IFACE': args.ssh_iface, 274 | 'NCCL_IB_HCAS': args.nccl_ib_hcas 275 | } 276 | manager.execute('tests/nccl_multi.sh', 277 | environment=environment, 278 | log_file=nccl_log) 279 | 280 | if args.pause > 0: 281 | sleep(args.pause) 282 | 283 | 284 | def kickoff_test(args: Namespace, bobber_version: str, iteration: int, 285 | hosts: str) -> NoReturn: 286 | """ 287 | Start a specified test. 288 | 289 | Launch a test as requested from the CLI for the given iteration. 290 | 291 | Parameters 292 | ---------- 293 | args : Namespace 294 | A ``Namespace`` of all settings specified by the user for the test. 295 | bobber_version : string 296 | A ``string`` of the local version of Bobber, such as '5.0.0'. 297 | iteration : int 298 | An ``int`` of the local test number, starting at 1. 299 | hosts : string 300 | A comma-separated list of hostnames to test against, such as 301 | 'host1,host2,host3,host4'. 
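    A rough sketch of how the sweep in ``test_selector`` below feeds this
    dispatcher (host names and iteration count are hypothetical)::

        hosts = []
        for host in 'host1,host2'.split(','):
            hosts.append(host)
            for iteration in range(1, 3):
                kickoff_test(args, '5.0.0', iteration, ','.join(hosts))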
302 | """ 303 | if args.command == RUN_DALI: 304 | run_dali(args, bobber_version, iteration, hosts) 305 | elif args.command == RUN_NCCL: 306 | run_nccl(args, bobber_version, iteration, hosts) 307 | elif args.command == RUN_STG_BW: 308 | run_stg_bw(args, bobber_version, iteration, hosts) 309 | elif args.command == RUN_STG_IOPS: 310 | run_stg_iops(args, bobber_version, iteration, hosts) 311 | elif args.command == RUN_STG_125K: 312 | run_stg_125k(args, bobber_version, iteration, hosts) 313 | elif args.command == RUN_STG_META: 314 | run_stg_meta(args, bobber_version, iteration, hosts) 315 | elif args.command == RUN_ALL: 316 | run_nccl(args, bobber_version, iteration, hosts) 317 | run_stg_meta(args, bobber_version, iteration, hosts) 318 | run_stg_bw(args, bobber_version, iteration, hosts) 319 | run_dali(args, bobber_version, iteration, hosts) 320 | run_stg_iops(args, bobber_version, iteration, hosts) 321 | run_stg_125k(args, bobber_version, iteration, hosts) 322 | 323 | 324 | def test_selector(args: Namespace, bobber_version: str) -> NoReturn: 325 | """ 326 | Start a test iteration. 327 | 328 | If the user requested to run a sweep of the hosts, the tests will begin 329 | with the first node in the hosts list for a single-node test, then 330 | progressively add the next host in the list until all nodes are tested 331 | together. During each iteration, one run of each requested test will be 332 | executed before going to the next iteration. 333 | 334 | Parameters 335 | ---------- 336 | args : Namespace 337 | A ``Namespace`` of all settings specified by the user for the test. 338 | bobber_version : string 339 | A ``string`` of the local version of Bobber, such as '5.0.0'. 340 | """ 341 | if args.sweep: 342 | hosts = [] 343 | 344 | for host in args.hosts.split(','): 345 | hosts.append(host) 346 | for iteration in range(1, args.iterations + 1): 347 | host_string = ','.join(hosts) 348 | kickoff_test(args, bobber_version, iteration, host_string) 349 | else: 350 | for iteration in range(1, args.iterations + 1): 351 | kickoff_test(args, bobber_version, iteration, args.hosts) 352 | -------------------------------------------------------------------------------- /bobber/lib/analysis/parse-mlperf.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import re 3 | from argparse import ArgumentParser, Namespace 4 | from glob import glob 5 | from os.path import join 6 | from typing import NoReturn, Tuple 7 | 8 | 9 | class Aggregate: 10 | """ 11 | Find the aggregate results for from multiple iterations. 12 | 13 | Parameters 14 | ---------- 15 | epoch_zero_speeds : list 16 | A ``list`` of ``floats`` of the first epoch speeds. 17 | epoch_zero_times : list 18 | A ``list`` of ``floats`` of the epoch zero times. 19 | elapsed_times : list 20 | A ``list`` of ``floats`` of the overall elapsed time. 21 | average_speeds : list 22 | A ``list`` of ``floats`` of the overall average speeds. 23 | """ 24 | def __init__(self, epoch_zero_speeds: list, epoch_zero_times: list, 25 | elapsed_times: list, average_speeds: list) -> NoReturn: 26 | self.epoch_zero_speeds = epoch_zero_speeds 27 | self.epoch_zero_times = epoch_zero_times 28 | self.elapsed_times = elapsed_times 29 | self.average_speeds = average_speeds 30 | 31 | 32 | class Results: 33 | """ 34 | The results from a single test run. 35 | 36 | Parameters 37 | ---------- 38 | epoch_zero_speed : float 39 | A ``float`` of the first epoch speed. 
40 | epoch_zero_time : float 41 | A ``float`` of the epoch zero time. 42 | elapsed_time : float 43 | A ``float`` of the overall elapsed time. 44 | average_speed : float 45 | A ``float`` of the overall average speed. 46 | """ 47 | def __init__(self, epoch_zero_speed: float, epoch_zero_time: float, 48 | elapsed_time: float, average_speed: float) -> NoReturn: 49 | self.epoch_zero_speed = epoch_zero_speed 50 | self.epoch_zero_time = epoch_zero_time 51 | self.elapsed_time = elapsed_time 52 | self.average_speed = average_speed 53 | 54 | 55 | def parse_args() -> Namespace: 56 | """ 57 | Parse arguments passed to the MLPerf parser. 58 | 59 | Returns 60 | ------- 61 | Namespace 62 | Returns a ``Namespace`` of all of the arguments that were parsed from 63 | the application during runtime. 64 | """ 65 | parser = ArgumentParser(description='Parse MLPerf results') 66 | parser.add_argument('directory', type=str, help='The directory where ' 67 | 'MLPerf log results are saved.') 68 | return parser.parse_args() 69 | 70 | 71 | def average(list_to_average: list) -> float: 72 | """ 73 | Find the average of a list. 74 | 75 | Given a list of numbers, calculate the average of all values in the list. 76 | If the list is empty, default to 0.0. 77 | 78 | Parameters 79 | ---------- 80 | list_to_average : list 81 | A ``list`` of ``floats`` to find an average of. 82 | 83 | Returns 84 | ------- 85 | float 86 | Returns a ``float`` of the average value of the list. 87 | """ 88 | try: 89 | return round(sum(list_to_average) / len(list_to_average), 3) 90 | except ZeroDivisionError: 91 | return 0.0 92 | 93 | 94 | def ms_to_seconds(time: float) -> float: 95 | """ 96 | Convert milliseconds to seconds. 97 | 98 | Parameters 99 | ---------- 100 | time : float 101 | A ``float`` of time in milliseconds. 102 | 103 | Returns 104 | ------- 105 | float 106 | Returns a ``float`` of the converted time in seconds. 107 | """ 108 | return round(time / 1000, 3) 109 | 110 | 111 | def ms_to_minutes(time: float) -> float: 112 | """ 113 | Convert milliseconds to minutes. 114 | 115 | Parameters 116 | ---------- 117 | time : float 118 | A ``float`` of time in milliseconds. 119 | 120 | Returns 121 | ------- 122 | float 123 | Returns a ``float`` of the converted time in minutes. 124 | """ 125 | return round(time / 1000 / 60, 3) 126 | 127 | 128 | def get_files(directory: str) -> list: 129 | """ 130 | Read all log files. 131 | 132 | Given an input directory as a string, read all log files and return the 133 | filenames including the directory as a list. 134 | 135 | Parameters 136 | ---------- 137 | directory : str 138 | A ``string`` pointing to the results directory. 139 | 140 | Returns 141 | ------- 142 | list 143 | Returns a ``list`` of ``strings`` of the paths to each log file in the 144 | results directory. 145 | """ 146 | return glob(join(directory, '*.log')) 147 | 148 | 149 | def parse_epoch_line(line: str) -> Tuple[int, float]: 150 | """ 151 | Parse the throughput for each epoch. 152 | 153 | Pull the images/second and epoch for each results line in an MLPerf log. 154 | 155 | Parameters 156 | ---------- 157 | line : str 158 | A ``string`` of a results line in an MLPerf log. 159 | 160 | Returns 161 | ------- 162 | tuple 163 | Returns a ``tuple`` of (``int``, ``float``) of the epoch number and 164 | resulting speed in images/second. 
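    A minimal sketch against the documented line format (values are made up)::

        parse_epoch_line(
            'Epoch[0] Batch [20-40] Speed: 2500.5 samples/sec accuracy=0.92')
        # -> (0, 2500.5)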
165 | """ 166 | # Lines are in the format: 167 | # "Epoch[NUM] Batch [NUM-NUM] Speed: NUM.NUM samples/sec accuracy=NUM.NUM" 168 | epoch = re.findall(r'\[\d+\]', line)[0].replace('[', '').replace(']', '') 169 | speed = re.findall(r'Speed: .* samples', line) 170 | if len(speed) == 1: 171 | speed = speed[0].replace('Speed: ', '').replace(' samples', '') 172 | return int(epoch), float(speed) 173 | 174 | 175 | def parse_time(line: str) -> int: 176 | """ 177 | Parse the timestamp from a line in the log. 178 | 179 | Parameters 180 | ---------- 181 | line : str 182 | A ``string`` of a line in an MLPerf log file. 183 | 184 | Returns 185 | ------- 186 | int 187 | Returns an ``int`` of the parsed timestamp. 188 | """ 189 | return int(re.findall(r'\d+', line)[0]) 190 | 191 | 192 | def parse_epoch_values(logfile: str) -> Tuple[list, list]: 193 | """ 194 | Parse the epoch and throughput lines. 195 | 196 | Find all of the lines that contain a throughput and save the first epoch 197 | and overall epoch results in lists. 198 | 199 | Parameters 200 | ---------- 201 | logfile : str 202 | A ``string`` of all contents from a logfile. 203 | 204 | Returns 205 | ------- 206 | tuple 207 | Returns a ``tuple`` of (``list``, ``list``) containing the first epoch 208 | results followed by all results. 209 | """ 210 | epoch_zero_vals, all_epoch_vals = [], [] 211 | epoch_values = re.findall(r'Epoch\[\d+\] Batch.*', logfile) 212 | 213 | for value in epoch_values: 214 | epoch, speed = parse_epoch_line(value) 215 | all_epoch_vals.append(speed) 216 | if epoch == 0: 217 | epoch_zero_vals.append(speed) 218 | return epoch_zero_vals, all_epoch_vals 219 | 220 | 221 | def parse_epoch_times(logfile: str) -> Tuple[list, list]: 222 | """ 223 | Parse the time for each epoch. 224 | 225 | Find the overall time it takes to complete each epoch by finding the 226 | difference in milliseconds. 227 | 228 | Parameters 229 | ---------- 230 | logfile : str 231 | A ``string`` of all contents from a logfile. 232 | 233 | Returns 234 | ------- 235 | tuple 236 | Returns a ``tuple`` of (``list``, ``list``) representing the time taken 237 | during the first epoch and the overall elapsed time for the test. 238 | """ 239 | epoch_start_times = re.findall(r'time_ms.*?epoch_start', logfile) 240 | epoch_stop_times = re.findall(r'time_ms.*?epoch_stop', logfile) 241 | # The epoch 0 time is the difference between the timestamp where epoch 0 242 | # ended, and the timestamp where epoch 0 began. 243 | epoch_zero_time = parse_time(epoch_stop_times[0]) - \ 244 | parse_time(epoch_start_times[0]) 245 | # The total elapsed time is the difference between the timestamp of when 246 | # the final epoch ended, and the timestamp where epoch 0 began. 247 | elapsed_time = parse_time(epoch_stop_times[-1]) - \ 248 | parse_time(epoch_start_times[0]) 249 | return epoch_zero_time, elapsed_time 250 | 251 | 252 | def parse_file(logfile: str) -> object: 253 | """ 254 | Parse a single MLPerf file. 255 | 256 | Find the first epoch and overall results for a single MLPerf file and 257 | create a singular object to represent the results. 258 | 259 | Parameters 260 | ---------- 261 | logfile : str 262 | A ``string`` of all contents from a logfile. 263 | 264 | Returns 265 | ------- 266 | Results instance 267 | Returns an instance of the Results class. 
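    A minimal sketch using synthetic, simplified lines that merely satisfy the
    regular expressions used below (real MLPerf logs use MLLOG-formatted
    entries)::

        lines = ['time_ms 1000 epoch_start',
                 'Epoch[0] Batch [0-20] Speed: 100.0 samples/sec accuracy=0.1',
                 'Epoch[0] Batch [20-40] Speed: 110.0 samples/sec accuracy=0.2',
                 'time_ms 61000 epoch_stop']
        res = parse_file('\n'.join(lines))
        # res.epoch_zero_speed -> 105.0, res.epoch_zero_time -> 60000 (ms)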
268 | """ 269 | epoch_zero_vals, all_epoch_vals = parse_epoch_values(logfile) 270 | epoch_zero_time, elapsed_time = parse_epoch_times(logfile) 271 | results = Results(average(epoch_zero_vals), 272 | epoch_zero_time, 273 | elapsed_time, 274 | average(all_epoch_vals)) 275 | return results 276 | 277 | 278 | def find_num_nodes(logfile: str) -> int: 279 | """ 280 | Find the number of nodes tested. 281 | 282 | Parameters 283 | ---------- 284 | logfile : str 285 | A ``string`` of all contents from a logfile. 286 | 287 | Returns 288 | ------- 289 | int 290 | Returns an ``integer`` of the number of nodes tested. 291 | """ 292 | clear_cache_command = re.findall(r'srun.*Clearing cache on ', logfile) 293 | if len(clear_cache_command) == 0: 294 | print('Unable to find number of nodes tested. Assuming single node.') 295 | return 1 296 | n_tasks = re.findall(r'ntasks=\d+', clear_cache_command[0]) 297 | num_nodes = n_tasks[0].replace('ntasks=', '') 298 | return num_nodes 299 | 300 | 301 | def find_filesystem_test_path(logfile: str) -> str: 302 | """ 303 | Parse the filesystem path from the log file. 304 | 305 | The 'container-mounts=...' line in each log file contains the location of 306 | the shared filesystem. 307 | 308 | Parameters 309 | ---------- 310 | logfiles : str 311 | A ``string`` of all contents from a logfile. 312 | 313 | Returns 314 | ------- 315 | str 316 | Returns a ``string`` of the location of the filesystem. 317 | """ 318 | container_mounts_line = re.findall(r'container-mounts=\S*:/data', logfile) 319 | if len(container_mounts_line) == 0: 320 | print('Unable to find container mount directory. Leaving empty.') 321 | return '' 322 | container_data_mount = container_mounts_line[0].replace( 323 | 'container-mounts=', '') 324 | return container_data_mount 325 | 326 | 327 | def read_files(logfiles: list) -> Tuple[object, int, str]: 328 | """ 329 | Read all MLPerf files and find aggregate results. 330 | 331 | Read all log files in a directory and determine the average speed and time 332 | taken to process images for both the first epoch and all results combined. 333 | 334 | Parameters 335 | ---------- 336 | logfiles : list 337 | A ``list`` of the filepaths for all log files in an input directory. 338 | 339 | Returns 340 | ------- 341 | tuple 342 | Returns a ``tuple`` of an instance of the Aggregate class, the number 343 | of nodes tested, and the path to the filesystem under test. 
344 | """ 345 | all_results = [] 346 | prev_nodes_found = None 347 | prev_filesystem_test_path = None 348 | 349 | for filename in logfiles: 350 | with open(filename, 'r') as logpointer: 351 | log = logpointer.read() 352 | results = parse_file(log) 353 | all_results.append(results) 354 | nodes_tested = find_num_nodes(log) 355 | filesystem_test_path = find_filesystem_test_path(log) 356 | if prev_nodes_found and nodes_tested != prev_nodes_found: 357 | raise ValueError('Error: Mixed node sizes found in log files!') 358 | if prev_filesystem_test_path and \ 359 | filesystem_test_path != prev_filesystem_test_path: 360 | raise ValueError('Error: Mixed test paths found in log files!') 361 | prev_nodes_found = nodes_tested 362 | prev_filesystem_test_path = filesystem_test_path 363 | aggregate = Aggregate( 364 | [result.epoch_zero_speed for result in all_results], 365 | [result.epoch_zero_time for result in all_results], 366 | [result.elapsed_time for result in all_results], 367 | [result.average_speed for result in all_results] 368 | ) 369 | return aggregate, nodes_tested, filesystem_test_path 370 | 371 | 372 | def print_averages(results: object, directory: str, nodes_tested: int, 373 | filesystem_test_path: str) -> NoReturn: 374 | """ 375 | Print the average results. 376 | 377 | Print the average time and speed for epoch 0 and all results, plus test 378 | information including the log directory and the location of the filesystem 379 | under test. 380 | 381 | Parameters 382 | ---------- 383 | results : object 384 | An instance of the Results class containing the results from a single 385 | test. 386 | directory : str 387 | A ``string`` of the passed directory where results were saved. 388 | nodes_tested : int 389 | An ``int`` of the number of nodes that were tested for a file. 390 | filesystem_test_path : str 391 | A ``string`` of the path to the filesystem under test. 392 | """ 393 | e_zero_speed = average(results.epoch_zero_speeds) 394 | e_zero_time = ms_to_seconds(average(results.epoch_zero_times)) 395 | overall_speed = average(results.average_speeds) 396 | overall_time = ms_to_minutes(average(results.elapsed_times)) 397 | 398 | output = f"""MLPerf Results: 399 | Log directory name: {directory} 400 | Filesystem test path: {filesystem_test_path} 401 | Number of iterations: {len(results.epoch_zero_speeds)} 402 | Nodes tested: {nodes_tested} 403 | Epoch 0: 404 | Speed: {e_zero_speed} images/second 405 | Average time: {e_zero_time} seconds 406 | Overall: 407 | Speed: {overall_speed} images/second 408 | Average time: {overall_time} minutes""" 409 | print(output) 410 | 411 | 412 | def main() -> NoReturn: 413 | """ 414 | Parse MLPerf test results. 
415 | """ 416 | args = parse_args() 417 | logfiles = get_files(args.directory) 418 | aggregate, nodes_tested, filesystem_test_path = read_files(logfiles) 419 | print_averages(aggregate, args.directory, nodes_tested, 420 | filesystem_test_path) 421 | 422 | 423 | if __name__ == '__main__': 424 | main() 425 | -------------------------------------------------------------------------------- /bobber/lib/analysis/parse_results.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: MIT 2 | import json 3 | import sys 4 | from collections import defaultdict 5 | from glob import glob 6 | from os.path import join 7 | from bobber.lib.exit_codes import MISSING_LOG_FILES, SUCCESS 8 | from bobber.lib.analysis.aggregate_results import AggregateResults 9 | from bobber.lib.analysis.common import (check_bobber_version, 10 | divide_logs_by_systems) 11 | from bobber.lib.analysis.compare_baseline import compare_baseline 12 | from bobber.lib.analysis.dali import parse_dali_file 13 | from bobber.lib.analysis.fio import parse_fio_bw_file, parse_fio_iops_file 14 | from bobber.lib.analysis.meta import parse_meta_file 15 | from bobber.lib.analysis.nccl import parse_nccl_file 16 | from bobber.lib.analysis.table import display_table 17 | from bobber.lib.system.file_handler import write_file 18 | from typing import NoReturn, Optional, Tuple 19 | 20 | 21 | def get_files(directory: str) -> list: 22 | """ 23 | Read all log files. 24 | 25 | Given an input directory as a string, read all log files and return the 26 | filenames including the directory as a list. 27 | 28 | Parameters 29 | ---------- 30 | directory : str 31 | A ``string`` pointing to the results directory. 32 | 33 | Returns 34 | ------- 35 | list 36 | Returns a ``list`` of ``strings`` of the paths to each log file in the 37 | results directory. 38 | """ 39 | return glob(join(directory, '*.log')) 40 | 41 | 42 | def parse_fio_bw(log_files: list) -> Tuple[dict, dict, dict, dict]: 43 | """ 44 | Parse all FIO bandwidth logs. 45 | 46 | Find each FIO bandwidth log in the results directory and parse the read and 47 | write results and parameters from each log for all system counts. 48 | 49 | Parameters 50 | ---------- 51 | log_files : list 52 | A ``list`` of ``strings`` of the paths to each log file in the results 53 | directory. 54 | 55 | Returns 56 | ------- 57 | tuple 58 | A ``tuple`` of four dictionaries containing the read results, write 59 | results, read parameters, and write parameters, respectively for all 60 | system counts. 61 | """ 62 | read_sys_results = defaultdict(list) 63 | write_sys_results = defaultdict(list) 64 | read_params, write_params = None, None 65 | 66 | fio_logs_by_systems = divide_logs_by_systems(log_files, 'stg_bw_iteration') 67 | 68 | for systems, files in fio_logs_by_systems.items(): 69 | read_sys_results, write_sys_results, read_params, write_params = \ 70 | parse_fio_bw_file(files, 71 | systems, 72 | read_sys_results, 73 | write_sys_results) 74 | return read_sys_results, write_sys_results, read_params, write_params 75 | 76 | 77 | def parse_fio_iops(log_files: list) -> Tuple[dict, dict, dict, dict]: 78 | """ 79 | Parse all FIO IOPS logs. 80 | 81 | Find each FIO IOPS log in the results directory and parse the read and 82 | write results and parameters from each log for all system counts. 83 | 84 | Parameters 85 | ---------- 86 | log_files : list 87 | A ``list`` of ``strings`` of the paths to each log file in the results 88 | directory. 
89 | 90 | Returns 91 | ------- 92 | tuple 93 | A ``tuple`` of four dictionaries containing the read results, write 94 | results, read parameters, and write parameters, respectively for all 95 | system counts. 96 | """ 97 | read_sys_results = defaultdict(list) 98 | write_sys_results = defaultdict(list) 99 | read_params, write_params = None, None 100 | 101 | fio_logs_by_systems = divide_logs_by_systems(log_files, 102 | 'stg_iops_iteration') 103 | 104 | for systems, files in fio_logs_by_systems.items(): 105 | read_sys_results, write_sys_results, read_params, write_params = \ 106 | parse_fio_iops_file(files, 107 | systems, 108 | read_sys_results, 109 | write_sys_results) 110 | return read_sys_results, write_sys_results, read_params, write_params 111 | 112 | 113 | def parse_fio_125k_bw(log_files: list) -> Tuple[dict, dict, dict, dict]: 114 | """ 115 | Parse all FIO 125k bandwidth logs. 116 | 117 | Find each FIO 125k bandwidth log in the results directory and parse the 118 | read and write results and parameters from each log for all system counts. 119 | 120 | Parameters 121 | ---------- 122 | log_files : list 123 | A ``list`` of ``strings`` of the paths to each log file in the results 124 | directory. 125 | 126 | Returns 127 | ------- 128 | tuple 129 | A ``tuple`` of four dictionaries containing the 125k read results, 125k 130 | write results, 125k read parameters, and 125k write parameters for all 131 | system counts. 132 | """ 133 | read_sys_results = defaultdict(list) 134 | write_sys_results = defaultdict(list) 135 | read_params, write_params = None, None 136 | 137 | fio_logs_by_systems = divide_logs_by_systems(log_files, 138 | 'stg_125k_iteration') 139 | 140 | for systems, files in fio_logs_by_systems.items(): 141 | read_sys_results, write_sys_results, read_params, write_params = \ 142 | parse_fio_bw_file(files, 143 | systems, 144 | read_sys_results, 145 | write_sys_results) 146 | return read_sys_results, write_sys_results, read_params, write_params 147 | 148 | 149 | def parse_nccl(log_files: list) -> Tuple[dict, dict]: 150 | """ 151 | Parse all NCCL logs. 152 | 153 | Find the maximum bus bandwidth and resulting byte size for all NCCL files 154 | for all system counts. 155 | 156 | Parameters 157 | ---------- 158 | log_files : list 159 | A ``list`` of ``strings`` of the paths to each log file in the results 160 | directory. 161 | 162 | Returns 163 | ------- 164 | tuple 165 | Returns a ``tuple`` of (``dict``, ``dict``) representing the maximum 166 | bus bandwidth and corresponding byte size for all system counts. 167 | """ 168 | bw_results = defaultdict(list) 169 | bytes_results = defaultdict(list) 170 | 171 | nccl_logs_by_systems = divide_logs_by_systems(log_files, 'nccl') 172 | 173 | for systems, files in nccl_logs_by_systems.items(): 174 | max_bw, byte_size = parse_nccl_file(files, systems) 175 | bw_results[systems] = max_bw 176 | bytes_results[systems] = byte_size 177 | return bw_results, bytes_results 178 | 179 | 180 | def parse_dali(log_files: list) -> dict: 181 | """ 182 | Parse all DALI logs. 183 | 184 | Parse the bandwidth and throughput for all image types and sizes from all 185 | DALI log files. 186 | 187 | Parameters 188 | ---------- 189 | log_files : list 190 | A ``list`` of ``strings`` of the paths to each log file in the results 191 | directory. 192 | 193 | Returns 194 | ------- 195 | dict 196 | Returns a ``dictionary`` of the throughput and bandwidth for all system 197 | counts. 
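Examples
--------
An illustrative sketch (the results directory is hypothetical); ``get_files`` is the helper defined earlier in this module::

    dali_results = parse_dali(get_files('/path/to/results'))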
198 | """ 199 | results_dict = {} 200 | 201 | dali_logs_by_systems = divide_logs_by_systems(log_files, 'dali') 202 | 203 | for systems, files in dali_logs_by_systems.items(): 204 | results_dict = parse_dali_file(files, systems, results_dict) 205 | return results_dict 206 | 207 | 208 | def parse_meta(log_files: list) -> dict: 209 | """ 210 | Parse all metadata logs. 211 | 212 | Parse the minimum, maximum, and mean values for all operations in the 213 | metadata log files. 214 | 215 | Parameters 216 | ---------- 217 | log_files : list 218 | A ``list`` of ``strings`` of the paths to each log file in the results 219 | directory. 220 | 221 | Returns 222 | ------- 223 | dict 224 | Returns a ``dictionary`` of the results from various metadata 225 | operations for all system counts. 226 | """ 227 | results_dict = {} 228 | 229 | meta_logs_by_systems = divide_logs_by_systems(log_files, 'stg_meta') 230 | 231 | for systems, files in meta_logs_by_systems.items(): 232 | results_dict = parse_meta_file(files, systems, results_dict) 233 | return results_dict 234 | 235 | 236 | def save_json(final_dictionary_output: dict, filename: str) -> NoReturn: 237 | """ 238 | Save results to a file. 239 | 240 | Save the final JSON data to a file for future reference. If the filename is 241 | not specified, don't save the file. 242 | 243 | Parameters 244 | ---------- 245 | final_dictionary_output : dict 246 | A ``dictionary`` of the final JSON output to save. 247 | filename : str 248 | A ``string`` of the filename to write the JSON data to. 249 | """ 250 | if not filename: 251 | return 252 | with open(filename, 'w') as json_file: 253 | json.dump(final_dictionary_output, json_file) 254 | print(f'JSON data saved to {filename}') 255 | 256 | 257 | def save_yaml_baseline(final_dictionary_output: dict, 258 | directory: str) -> NoReturn: 259 | """ 260 | Save results as a YAML baseline file. 261 | 262 | The parsed results should be saved as a YAML baseline file which can be 263 | used to compare similar systems against existing results. The YAML file 264 | will be saved in the results directory as "baseline.yaml". 265 | 266 | Parameters 267 | ---------- 268 | final_dictionary_output : dict 269 | A ``dictionary`` of the parsed results on a per-system level. 270 | directory : str 271 | A ``string`` of the directory where results are saved. 
272 | """ 273 | contents = 'systems:\n' 274 | 275 | for systems, results in final_dictionary_output['systems'].items(): 276 | dali = results.get('dali', {}) 277 | small_jpg = dali.get('800x600 standard jpg', {}) 278 | large_jpg = dali.get('3840x2160 standard jpg', {}) 279 | small_tf = dali.get('800x600 tfrecord', {}) 280 | large_tf = dali.get('3840x2160 tfrecord', {}) 281 | contents += f""" {systems}: 282 | bandwidth: 283 | # FIO BW speed in bytes/second 284 | read: {results.get('bandwidth', {}).get('read', 0)} 285 | write: {results.get('bandwidth', {}).get('write', 0)} 286 | iops: 287 | # FIO IOPS speed in ops/second 288 | read: {results.get('iops', {}).get('read', 0)} 289 | write: {results.get('iops', {}).get('write', 0)} 290 | 125k_bandwidth: 291 | # FIO 125k BW speed in bytes/second 292 | read: {results.get('125k_bandwidth', {}).get('read', 0)} 293 | write: {results.get('125k_bandwidth', {}).get('write', 0)} 294 | nccl: 295 | # NCCL maximum bus bandwidth in GB/s 296 | max_bus_bw: {results.get('nccl', {}).get('max_bus_bw', 0)} 297 | dali: 298 | # DALI average speed in images/second 299 | 800x600 standard jpg: {small_jpg.get('average images/second', 0)} 300 | 3840x2160 standard jpg: {large_jpg.get('average images/second', 0)} 301 | 800x600 tfrecord: {small_tf.get('average images/second', 0)} 302 | 3840x2160 tfrecord: {large_tf.get('average images/second', 0)} 303 | """ 304 | write_file(f'{directory}/baseline.yaml', contents) 305 | 306 | 307 | def main(directory: str, 308 | baseline: Optional[str] = None, 309 | custom_baseline: Optional[str] = None, 310 | tolerance: Optional[int] = 0, 311 | verbose: Optional[bool] = False, 312 | override_version_check: Optional[bool] = False, 313 | json_filename: Optional[str] = None) -> NoReturn: 314 | """ 315 | Parse all results on a per-system level. 316 | 317 | Read all log files from a results directory and iterate through the results 318 | on a per-system level. The results displayed are of the aggregate value for 319 | each system count. 320 | 321 | A baseline can be optionally included to compare the results in the output 322 | directory against pre-configured results to verify performance meets 323 | desired levels. 324 | 325 | Parameters 326 | ---------- 327 | directory : str 328 | A ``string`` of the directory where results are located. 329 | baseline : str (optional) 330 | A ``string`` representing the key from the included baselines to 331 | compare results to. 332 | custom_baseline : str (optional) 333 | A ``string`` of the filename to a custom YAML config file to read and 334 | compare results to. 335 | tolerance : int (optional) 336 | An ``integer`` of the tolerance as a percentage below the baseline to 337 | allow results to still be marked as passing. 338 | verbose : bool (optional) 339 | A ``boolean`` that prints additional textual output when `True`. 340 | override_version_check : bool (optional) 341 | A ``boolean`` which skips checking the Bobber version tested when 342 | `True`. 343 | json_filename : str (optional) 344 | A ``string`` of the filename to save JSON data to. 345 | """ 346 | final_dictionary_output = {'systems': {}} 347 | 348 | log_files = get_files(directory) 349 | if len(log_files) < 1: 350 | print('No log files found. 
Please specify a directory containing '
351 | 'valid logs.')
352 | print('Exiting...')
353 | sys.exit(MISSING_LOG_FILES)
354 | bobber_version = check_bobber_version(log_files,
355 | override_version_check)
356 | bw_results = parse_fio_bw(log_files)
357 | read_bw, write_bw, read_bw_params, write_bw_params = bw_results
358 | bw_125k_results = parse_fio_125k_bw(log_files)
359 | read_125k_bw, write_125k_bw, read_125k_bw_params, write_125k_bw_params = \
360 | bw_125k_results
361 | iops_results = parse_fio_iops(log_files)
362 | read_iops, write_iops, read_iops_params, write_iops_params = iops_results
363 | metadata = parse_meta(log_files)
364 | max_bw, bytes_sizes = parse_nccl(log_files)
365 | dali_results = parse_dali(log_files)
366 | total_systems = 0
367 | systems = []
368 |
369 | for result in [read_bw, read_iops, read_125k_bw, max_bw, dali_results,
370 | metadata]:
371 | try:
372 | total_systems = max(result.keys())
373 | systems = sorted(result.keys())
374 | except ValueError:
375 | continue
376 | else:
377 | break
378 |
379 | for system_num in systems:
380 | aggregate = AggregateResults(read_bw,
381 | write_bw,
382 | read_bw_params,
383 | write_bw_params,
384 | read_iops,
385 | write_iops,
386 | read_iops_params,
387 | write_iops_params,
388 | read_125k_bw,
389 | write_125k_bw,
390 | read_125k_bw_params,
391 | write_125k_bw_params,
392 | max_bw,
393 | bytes_sizes,
394 | dali_results,
395 | metadata,
396 | system_num)
397 | final_dictionary_output['systems'][str(system_num)] = aggregate.json
398 | if verbose:
399 | print(aggregate)
400 |
401 | final_dictionary_output['total_systems'] = total_systems
402 | final_dictionary_output['bobber_version'] = bobber_version
403 | display_table(final_dictionary_output)
404 | save_yaml_baseline(final_dictionary_output, directory)
405 | save_json(final_dictionary_output, json_filename)
406 |
407 | if custom_baseline:
408 | compare_baseline(final_dictionary_output, custom_baseline, tolerance,
409 | custom=True)
410 | elif baseline:
411 | compare_baseline(final_dictionary_output, baseline, tolerance)
412 |
--------------------------------------------------------------------------------
/bobber/lib/analysis/aggregate_results.py:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: MIT
2 | from functools import wraps
3 | from typing import NoReturn
4 |
5 |
6 | def average_decorator(func: 'method') -> 'method':
7 | """
8 | A simple wrapper to calculate the average of a list.
9 |
10 | This wrapper can be used on any function or method which returns a list of
11 | ints or floats and calculates the average of those values. If the average
12 | can't be calculated for any reason, the value will default to 0.0.
13 |
14 | Parameters
15 | ----------
16 | func : function/method
17 | A function to be wrapped with the average decorator.
18 |
19 | Returns
20 | -------
21 | function/method
22 | Returns the wrapped function, which returns the ``float`` average.
23 | """
24 | @wraps(func)
25 | def wrapper(*args):
26 | value = func(*args)
27 | try:
28 | return sum(value) / len(value)
29 | except (TypeError, ValueError, ZeroDivisionError):
30 | return 0.0
31 | return wrapper
32 |
33 |
34 | class AggregateResults:
35 | """
36 | Determine the aggregate values for all results.
37 |
38 | Bobber test runs typically include multiple iterations of all tests in an
39 | attempt to eliminate noise. In order to find the true result, all
40 | iterations from a single test pass are averaged together. This is done on a
41 | per-system count level where all N-iterations of the single-node tests are
42 | aggregated together, then all N-iterations of the two-node tests (if
43 | applicable) are aggregated together, and so on.
44 |
45 | This class has a few helper methods to make it easy to output all data to
46 | both JSON format and a string representing the results.
47 |
48 | Parameters
49 | ----------
50 | read_bw : dict
51 | A ``dictionary`` containing all of the fio read bandwidth results for
52 | N-systems.
53 | write_bw : dict
54 | A ``dictionary`` containing all of the fio write bandwidth results for
55 | N-systems.
56 | read_bw_params : dict
57 | A ``dictionary`` of the parameters used during the fio read bandwidth
58 | tests.
59 | write_bw_params : dict
60 | A ``dictionary`` of the parameters used during the fio write bandwidth
61 | tests.
62 | read_iops : dict
63 | A ``dictionary`` containing all of the fio read iops results for
64 | N-systems.
65 | write_iops : dict
66 | A ``dictionary`` containing all of the fio write iops results for
67 | N-systems.
68 | read_iops_params : dict
69 | A ``dictionary`` of the parameters used during the fio read iops tests.
70 | write_iops_params : dict
71 | A ``dictionary`` of the parameters used during the fio write iops
72 | tests.
73 | read_125k_bw : dict
74 | A ``dictionary`` containing all of the fio 125k read bandwidth results
75 | for N-systems.
76 | write_125k_bw : dict
77 | A ``dictionary`` containing all of the fio 125k write bandwidth results
78 | for N-systems.
79 | read_125k_bw_params : dict
80 | A ``dictionary`` of the parameters used during the fio 125k read
81 | bandwidth tests.
82 | write_125k_bw_params : dict
83 | A ``dictionary`` of the parameters used during the fio 125k write
84 | bandwidth tests.
85 | max_bw : dict
86 | A ``dictionary`` of the maximum bus bandwidth achieved from NCCL tests.
87 | bytes_sizes : dict
88 | A ``dictionary`` of the byte size used when the maximum bus bandwidth
89 | was achieved for NCCL tests.
90 | dali_results : dict
91 | A ``dictionary`` of the DALI throughput for all image sizes and types
92 | in images/second.
93 | metadata : dict
94 | A ``dictionary`` of the max, min, and mean values for all metadata
95 | operations.
96 | systems : int
97 | An ``int`` for the number of systems the current results represent.
98 | """ 99 | def __init__(self, 100 | read_bw: dict, 101 | write_bw: dict, 102 | read_bw_params: dict, 103 | write_bw_params: dict, 104 | read_iops: dict, 105 | write_iops: dict, 106 | read_iops_params: dict, 107 | write_iops_params: dict, 108 | read_125k_bw: dict, 109 | write_125k_bw: dict, 110 | read_125k_bw_params: dict, 111 | write_125k_bw_params: dict, 112 | max_bw: dict, 113 | bytes_sizes: dict, 114 | dali_results: dict, 115 | metadata: dict, 116 | systems: int) -> NoReturn: 117 | self._read_bw = read_bw 118 | self._read_bw_params = read_bw_params 119 | self._read_iops = read_iops 120 | self._read_iops_params = read_iops_params 121 | self._125k_read_bw = read_125k_bw 122 | self._125k_read_bw_params = read_125k_bw_params 123 | self._write_bw = write_bw 124 | self._write_bw_params = write_bw_params 125 | self._write_iops = write_iops 126 | self._write_iops_params = write_iops_params 127 | self._125k_write_bw = write_125k_bw 128 | self._125k_write_bw_params = write_125k_bw_params 129 | self._max_bw = max_bw 130 | self._bytes_sizes = bytes_sizes 131 | self._dali_results = dali_results 132 | self._metadata = metadata 133 | self._num_systems = systems 134 | 135 | def __str__(self) -> str: 136 | """ 137 | A helper function to display results in human-readable text. 138 | 139 | Find the aggregate results for each test for N-systems and return the 140 | final output as a string, similar to the following: 141 | 142 | Systems tested: 1 143 | Aggregate Read Bandwidth: 1.595 GB/s 144 | Aggregate Write Bandwidth: 1.232 GB/s 145 | Aggregate Read IOPS: 136.5 k IOPS 146 | Aggregate Write IOPS: 135.0 k IOPS 147 | Aggregate 125k Read Bandwidth: 1.595 GB/s 148 | Aggregate 125k Write Bandwidth: 1.232 GB/s 149 | NCCL Max Bus Bandwidth: 79.865 at 512.0 MB 150 | Mdtest 151 | Directory creation: 71406.29550000001 ops 152 | Directory stat: 2698234.1525 ops 153 | Directory removal: 16016.5275 ops 154 | File creation: 137218.586 ops 155 | File stat: 2705405.084 ops 156 | File read: 2230275.9365 ops 157 | File removal: 175736.5435 ops 158 | Tree creation: 1546.792 ops 159 | Tree removal: 5878.747 ops 160 | 161 | DALI Standard 800x600 162 | Min Speed: 2509.35 images/second (0.727 GB/s) 163 | Avg Speed: 2694.595 images/second (0.78 GB/s) 164 | DALI Standard 3840x2160 165 | Min Speed: 344.078 images/second (1.712 GB/s) 166 | Avg Speed: 430.854 images/second (2.144 GB/s) 167 | DALI TFRecord 800x600 168 | Min Speed: 2508.069 images/second (0.726 GB/s) 169 | Avg Speed: 2665.653 images/second (0.772 GB/s) 170 | DALI TFRecord 3840x2160 171 | Min Speed: 317.276 images/second (1.579 GB/s) 172 | Avg Speed: 376.862 images/second (1.875 GB/s) 173 | 174 | Returns 175 | ------- 176 | str 177 | Returns a ``string`` of the final aggregate results for N-systems. 
178 | """ 179 | values_to_print = [ 180 | # [Field name, value, unit] 181 | ['Systems tested:', self._num_systems, ''], 182 | ['Aggregate Read Bandwidth:', self.average_read_bw, ' GB/s'], 183 | ['Aggregate Write Bandwidth:', self.average_write_bw, ' GB/s'], 184 | ['Aggregate 125k Read Bandwidth:', self.average_125k_read_bw, 185 | ' GB/s'], 186 | ['Aggregate 125k Write Bandwidth:', self.average_125k_write_bw, 187 | ' GB/s'], 188 | ['Aggregate Read IOPS:', self.average_read_iops, 'k IOPS'], 189 | ['Aggregate Write IOPS:', self.average_write_iops, 'k IOPS'], 190 | ] 191 | output = '' 192 | for item in values_to_print: 193 | field, value, unit = item 194 | if value: 195 | output += f'{field} {value} {unit}\n' 196 | if round(self.max_bus_bandwidth, 3) != 0.0: 197 | output += ('NCCL Max Bus Bandwidth: ' 198 | f'{round(self.max_bus_bandwidth, 3)} ' 199 | f'at {self.max_bus_bytes / 1024 / 1024} MB') 200 | 201 | if self._metadata: 202 | output += '\n' 203 | output += self._metadata_print() 204 | 205 | if self._dali_results_print('800x600 standard jpg'): 206 | output += (f""" 207 | DALI Standard 800x600{self._dali_results_print('800x600 standard jpg')} 208 | DALI Standard 3840x2160{self._dali_results_print('3840x2160 standard jpg')} 209 | DALI TFRecord 800x600{self._dali_results_print('800x600 tfrecord')} 210 | DALI TFRecord 3840x2160{self._dali_results_print('3840x2160 tfrecord')} 211 | """) 212 | else: 213 | output += '\n' 214 | return output 215 | 216 | def _metadata_print(self) -> str: 217 | """ 218 | Determine and return the metadata results. 219 | 220 | Iterate through all of the final metadata results for each operation 221 | type and generate the aggregate number of operations for all 222 | iterations. 223 | 224 | Returns 225 | ------- 226 | str 227 | Returns a ``string`` of the formated metadata results. 228 | """ 229 | output = 'Mdtest\n' 230 | 231 | if self._metadata[self._num_systems] == '': 232 | return '' 233 | for key, values in self._metadata[self._num_systems].items(): 234 | output += (f" {key}: {values['mean']} ops\n") 235 | return output 236 | 237 | def _dali_results_print(self, size: str) -> str: 238 | """ 239 | Determine and return the DALI results. 240 | 241 | Calculate the minimum and average speed in images/second and the 242 | resulting bandwidth for each image type and format and return the 243 | result as a string. 244 | 245 | Parameters 246 | ---------- 247 | size : str 248 | The size and type of image to parse, such as '800x600 tfrecord'. 249 | 250 | Returns 251 | ------- 252 | str 253 | Returns a ``string`` of the formated DALI results. 254 | """ 255 | try: 256 | dali_results = self._dali_results[self._num_systems] 257 | except KeyError: 258 | return '' 259 | min_speed = round(dali_results[size]['min images/second'], 3) 260 | min_bw = round(dali_results[size]['min bandwidth'] * 1e-9, 3) 261 | avg_speed = round(dali_results[size]['average images/second'], 3) 262 | avg_bw = round(dali_results[size]['average bandwidth'] * 1e-9, 3) 263 | 264 | output = (f""" 265 | Min Speed: {min_speed} images/second ({min_bw} GB/s) 266 | Avg Speed: {avg_speed} images/second ({avg_bw} GB/s)""") 267 | return output 268 | 269 | @property 270 | def json(self) -> dict: 271 | """ 272 | Generate a JSON representation of the results. 273 | 274 | Creating a JSON dump of the results makes it easier for remote tools to 275 | archive or display results in an easily ingestible format, such as 276 | webpages or databases. 
277 | 278 | Returns 279 | ------- 280 | dict 281 | Returns a JSON-parsable ``dictionary`` representation of all of the 282 | results including parameters and units where applicable. 283 | """ 284 | results = { 285 | 'systems_tested': self._num_systems, 286 | 'bandwidth': { 287 | 'read': self._average_read_bw(), 288 | 'write': self._average_write_bw(), 289 | 'unit': 'bytes/second', 290 | 'parameters': { 291 | 'read': self._read_bw_params, 292 | 'write': self._write_bw_params 293 | } 294 | }, 295 | 'iops': { 296 | 'read': self._average_read_iops(), 297 | 'write': self._average_write_iops(), 298 | 'unit': 'operations/second', 299 | 'parameters': { 300 | 'read': self._read_iops_params, 301 | 'write': self._write_iops_params 302 | } 303 | }, 304 | '125k_bandwidth': { 305 | 'read': self._average_125k_read_bw(), 306 | 'write': self._average_125k_write_bw(), 307 | 'unit': 'operations/second', 308 | 'parameters': { 309 | 'read': self._125k_read_bw_params, 310 | 'write': self._125k_write_bw_params 311 | } 312 | }, 313 | 'nccl': { 314 | 'max_bus_bw': self.max_bus_bandwidth, 315 | 'max_bus_bytes': self.max_bus_bytes, 316 | 'max_bus_bw_units': 'GB/s' 317 | } 318 | } 319 | try: 320 | results['dali'] = self._dali_results[self._num_systems] 321 | except KeyError: 322 | results['dali'] = {} 323 | return results 324 | 325 | @average_decorator 326 | def _average_read_bw(self) -> float: 327 | """ 328 | Returns the average read bandwidth as a ``float`` for all iterations 329 | in B/s. Defaults to 0.0. 330 | """ 331 | try: 332 | return self._read_bw[self._num_systems] 333 | except KeyError: 334 | return 0.0 335 | 336 | @property 337 | def average_read_bw(self) -> float: 338 | """ 339 | Returns the average read bandwidth as a ``float`` for all iterations 340 | in GB/s, rounded to the nearest thousandth. 341 | """ 342 | return round(self._average_read_bw() * 1e-9, 3) 343 | 344 | @average_decorator 345 | def _average_write_bw(self) -> float: 346 | """ 347 | Returns the average write bandwidth as a ``float`` for all iterations 348 | in B/s. Defaults to 0.0 349 | """ 350 | try: 351 | return self._write_bw[self._num_systems] 352 | except KeyError: 353 | return 0.0 354 | 355 | @property 356 | def average_write_bw(self) -> float: 357 | """ 358 | Returns the average write bandwidth as a ``float`` for all iterations 359 | in GB/s, rounded to the nearest thousandth. 360 | """ 361 | return round(self._average_write_bw() * 1e-9, 3) 362 | 363 | @average_decorator 364 | def _average_125k_read_bw(self) -> float: 365 | """ 366 | Returns the average 125k read bandwidth as a ``float`` for all 367 | iterations in B/s. Defaults to 0.0. 368 | """ 369 | try: 370 | return self._125k_read_bw[self._num_systems] 371 | except KeyError: 372 | return 0.0 373 | 374 | @property 375 | def average_125k_read_bw(self) -> float: 376 | """ 377 | Returns the average 125k read bandwidth as a ``float`` for all 378 | iterations in GB/s, rounded to the nearest thousandth. 379 | """ 380 | return round(self._average_125k_read_bw() * 1e-9, 3) 381 | 382 | @average_decorator 383 | def _average_125k_write_bw(self) -> float: 384 | """ 385 | Returns the average 125k write bandwidth as a ``float`` for all 386 | iterations in B/s. 
Defaults to 0.0 387 | """ 388 | try: 389 | return self._125k_write_bw[self._num_systems] 390 | except KeyError: 391 | return 0.0 392 | 393 | @property 394 | def average_125k_write_bw(self) -> float: 395 | """ 396 | Returns the average 125k write bandwidth as a ``float`` for all 397 | iterations in GB/s, rounded to the nearest thousandth. 398 | """ 399 | return round(self._average_125k_write_bw() * 1e-9, 3) 400 | 401 | @average_decorator 402 | def _average_read_iops(self) -> float: 403 | """ 404 | Returns the average read IOPS as a ``float`` for all iterations in 405 | ops/second. Defaults to 0.0. 406 | """ 407 | try: 408 | return self._read_iops[self._num_systems] 409 | except KeyError: 410 | return 0.0 411 | 412 | @property 413 | def average_read_iops(self) -> float: 414 | """ 415 | Returns the average read IOPS as a ``float`` for all iterations in K 416 | ops/second. 417 | """ 418 | return round(self._average_read_iops() * 1e-3, 3) 419 | 420 | @average_decorator 421 | def _average_write_iops(self) -> float: 422 | """ 423 | Returns the average write IOPS as a ``float`` for all iterations in 424 | ops/second. Defaults to 0.0. 425 | """ 426 | try: 427 | return self._write_iops[self._num_systems] 428 | except KeyError: 429 | return 0.0 430 | 431 | @property 432 | def average_write_iops(self) -> float: 433 | """ 434 | Returns the average write IOPS as a ``float`` for all iterations in K 435 | ops/second. 436 | """ 437 | return round(self._average_write_iops() * 1e-3, 3) 438 | 439 | @property 440 | @average_decorator 441 | def max_bus_bandwidth(self) -> float: 442 | """ 443 | Returns the average of the maximum bandwidth achieved as a ``float`` 444 | in NCCL in GB/s. Defaults to 0.0 445 | """ 446 | try: 447 | return self._max_bw[self._num_systems] 448 | except KeyError: 449 | return 0.0 450 | 451 | @property 452 | def max_bus_bytes(self) -> float: 453 | """ 454 | Returns the associated byte size for the maximum bandwidth achieved in 455 | NCCL as a ``float``. Defaults to 0.0 456 | """ 457 | try: 458 | return int(max(self._bytes_sizes[self._num_systems], 459 | key=self._bytes_sizes[self._num_systems].count)) 460 | except (ValueError, KeyError): 461 | return 0.0 462 | --------------------------------------------------------------------------------