├── mlpstorage ├── __init__.py ├── benchmarks │ ├── __init__.py │ ├── vectordbbench.py │ ├── base.py │ └── dlio.py ├── debug.py ├── config.py ├── main.py ├── mlps_logging.py ├── DEFINING_RULES_CHECKS.md ├── history.py ├── reporting.py ├── utils.py └── cli.py ├── mlpstorage.yaml ├── ansible ├── collections │ └── requirements.yml ├── inventory ├── README.md └── setup.yml ├── .github ├── CODEOWNERS └── workflows │ └── cla.yml ├── configs ├── dlio │ ├── config.yaml │ ├── hydra │ │ └── job_logging │ │ │ └── custom.yaml │ └── workload │ │ ├── cosmoflow_datagen.yaml │ │ ├── resnet50_datagen.yaml │ │ ├── unet3d_datagen.yaml │ │ ├── resnet50_h100.yaml │ │ ├── cosmoflow_a100.yaml │ │ ├── cosmoflow_h100.yaml │ │ ├── resnet50_a100.yaml │ │ ├── llama3_8b.yaml │ │ ├── llama3_70b.yaml │ │ ├── llama3_405b.yaml │ │ ├── llama3_1t.yaml │ │ ├── unet3d_a100.yaml │ │ └── unet3d_h100.yaml └── vectordbbench │ ├── default.yaml │ └── 10m.yaml ├── pyproject.toml ├── CONTRIBUTING.md ├── DEVELOPMENT.md ├── system_configuration.yaml ├── test └── run_tests.sh ├── LICENSE.md └── README.md /mlpstorage/__init__.py: -------------------------------------------------------------------------------- 1 | # VERSION 2 | VERSION = "2.0.0b1" 3 | __version__ = VERSION -------------------------------------------------------------------------------- /mlpstorage.yaml: -------------------------------------------------------------------------------- 1 | # Example configuration for global options to all commands 2 | hosts: 127.0.0.1 3 | data_dir: /mnt/nvme 4 | results_dir: /root/mlpstorage_results/ -------------------------------------------------------------------------------- /ansible/collections/requirements.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Copyright (c) 2022 Dell Inc, or its subsidiaries. 3 | --- 4 | collections: 5 | - community.general 6 | -------------------------------------------------------------------------------- /ansible/inventory: -------------------------------------------------------------------------------- 1 | [clients] 2 | worker1 ansible_host=172.22.X.X ansible_connection=ssh ansible_user=root 3 | worker2 ansible_host=172.22.X.X ansible_connection=ssh ansible_user=root -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in the repo. 2 | # Unless a later match takes precedence,they will be requested for review when someone opens a pull request. 
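# For example (hypothetical pattern, not in the original file): a later, more specific entry
# such as "/configs/ @mlcommons/<some-team>" would take precedence over the default rule below
# for files under configs/, since the last matching CODEOWNERS pattern wins.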
3 | * @mlcommons/wg-storage 4 | -------------------------------------------------------------------------------- /mlpstorage/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | from mlpstorage.benchmarks.dlio import TrainingBenchmark, CheckpointingBenchmark 2 | from mlpstorage.benchmarks.vectordbbench import VectorDBBenchmark 3 | 4 | __all__ = ['TrainingBenchmark', 'VectorDBBenchmark', 'CheckpointingBenchmark'] 5 | -------------------------------------------------------------------------------- /configs/dlio/config.yaml: -------------------------------------------------------------------------------- 1 | # A set of configuration 2 | defaults: 3 | - _self_ 4 | - workload: 5 | - override hydra/job_logging: disabled 6 | - override hydra/hydra_logging: disabled 7 | hydra: 8 | run: 9 | dir: ./results/${workload.model}/${now:%Y-%m-%d}-${now:%H-%M-%S} 10 | -------------------------------------------------------------------------------- /configs/dlio/hydra/job_logging/custom.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | simple: 4 | format: '[%(levelname)s] - %(message)s [%(pathname)s:%(lineno)d]' 5 | handlers: 6 | console: 7 | class: logging.StreamHandler 8 | formatter: simple 9 | stream: ext://sys.stdout 10 | root: 11 | handlers: [console] 12 | 13 | disable_existing_loggers: false -------------------------------------------------------------------------------- /configs/dlio/workload/cosmoflow_datagen.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: cosmoflow 3 | 4 | framework: tensorflow 5 | 6 | workflow: 7 | generate_data: True 8 | train: False 9 | checkpoint: False 10 | 11 | dataset: 12 | data_folder: data/cosmoflow 13 | num_files_train: 524288 14 | num_samples_per_file: 1 15 | record_length_bytes: 2828486 16 | record_length_bytes_stdev: 71311 17 | format: tfrecord 18 | -------------------------------------------------------------------------------- /configs/dlio/workload/resnet50_datagen.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: resnet50 3 | type: cnn 4 | 5 | framework: tensorflow 6 | 7 | workflow: 8 | generate_data: True 9 | train: False 10 | checkpoint: False 11 | 12 | dataset: 13 | num_files_train: 1024 14 | num_samples_per_file: 1251 15 | record_length_bytes: 114660.07 16 | record_length_bytes_resize: 150528 17 | data_folder: data/resnet50 18 | format: tfrecord 19 | -------------------------------------------------------------------------------- /configs/dlio/workload/unet3d_datagen.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: unet3d 3 | type: cnn 4 | model_size: 499153191 5 | 6 | framework: pytorch 7 | 8 | workflow: 9 | generate_data: True 10 | train: False 11 | checkpoint: False 12 | 13 | dataset: 14 | data_folder: data/unet3d/ 15 | format: npz 16 | num_files_train: 168 17 | num_samples_per_file: 1 18 | record_length_bytes: 146600628 19 | record_length_bytes_stdev: 68341808 20 | record_length_bytes_resize: 2097152 21 | -------------------------------------------------------------------------------- /configs/dlio/workload/resnet50_h100.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: resnet50 3 | type: cnn 4 | 5 | framework: tensorflow 6 | 7 | workflow: 8 | generate_data: False 9 | train: True 10 | 11 | dataset: 12 
| num_files_train: 1024 13 | num_samples_per_file: 1251 14 | record_length_bytes: 114660.07 15 | record_length_bytes_resize: 150528 16 | data_folder: data/resnet50 17 | format: tfrecord 18 | 19 | train: 20 | computation_time: 0.224 21 | epochs: 5 22 | 23 | reader: 24 | data_loader: tensorflow 25 | read_threads: 8 26 | computation_threads: 8 27 | batch_size: 400 28 | 29 | metric: 30 | au: 0.90 31 | -------------------------------------------------------------------------------- /configs/vectordbbench/default.yaml: -------------------------------------------------------------------------------- 1 | database: 2 | host: 127.0.0.1 3 | port: 19530 4 | database: milvus 5 | max_receive_message_length: 514_983_574 6 | max_send_message_length: 514_983_574 7 | 8 | dataset: 9 | collection_name: mlps_1m_1shards_1536dim_uniform 10 | num_vectors: 1_000_000 11 | dimension: 1536 12 | distribution: uniform 13 | batch_size: 10 14 | chunk_size: 100_000 15 | num_shards: 1 16 | vector_dtype: FLOAT_VECTOR 17 | 18 | index: 19 | index_type: DISKANN 20 | metric_type: COSINE 21 | index_params: 22 | M: 64 23 | ef_construction: 200 24 | 25 | workflow: 26 | compact: True -------------------------------------------------------------------------------- /configs/vectordbbench/10m.yaml: -------------------------------------------------------------------------------- 1 | database: 2 | host: 127.0.0.1 3 | port: 19530 4 | database: milvus 5 | max_receive_message_length: 514_983_574 6 | max_send_message_length: 514_983_574 7 | 8 | dataset: 9 | collection_name: mlps_10m_10shards_1536dim_uniform 10 | num_vectors: 10_000_000 11 | dimension: 1536 12 | distribution: uniform 13 | batch_size: 10 14 | chunk_size: 1_000_000 15 | num_shards: 10 16 | vector_dtype: FLOAT_VECTOR 17 | 18 | index: 19 | index_type: DISKANN 20 | metric_type: COSINE 21 | index_params: 22 | M: 64 23 | ef_construction: 200 24 | 25 | workflow: 26 | compact: True -------------------------------------------------------------------------------- /configs/dlio/workload/cosmoflow_a100.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: cosmoflow 3 | 4 | framework: tensorflow 5 | 6 | workflow: 7 | generate_data: False 8 | train: True 9 | 10 | dataset: 11 | data_folder: data/cosmoflow 12 | num_files_train: 524288 13 | num_samples_per_file: 1 14 | record_length_bytes: 2828486 15 | record_length_bytes_stdev: 71311 16 | format: tfrecord 17 | 18 | reader: 19 | data_loader: tensorflow 20 | read_threads: 4 21 | batch_size: 1 22 | file_shuffle: seed 23 | sample_shuffle: seed 24 | shuffle_size: 2 25 | 26 | train: 27 | epochs: 5 28 | computation_time: 0.00551 29 | 30 | metric: 31 | au: 0.70 32 | -------------------------------------------------------------------------------- /configs/dlio/workload/cosmoflow_h100.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: cosmoflow 3 | 4 | framework: tensorflow 5 | 6 | workflow: 7 | generate_data: False 8 | train: True 9 | 10 | dataset: 11 | data_folder: data/cosmoflow 12 | num_files_train: 524288 13 | num_samples_per_file: 1 14 | record_length_bytes: 2828486 15 | record_length_bytes_stdev: 71311 16 | format: tfrecord 17 | 18 | reader: 19 | data_loader: tensorflow 20 | read_threads: 4 21 | batch_size: 1 22 | file_shuffle: seed 23 | sample_shuffle: seed 24 | shuffle_size: 2 25 | 26 | train: 27 | epochs: 5 28 | computation_time: 0.00350 29 | 30 | metric: 31 | au: 0.70 32 | 
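The training workload configs above (for example `cosmoflow_h100.yaml`) are not invoked directly; they are exercised through the `mlpstorage` CLI, which appears to select the matching YAML from the `--model` and `--accelerator-type` flags. A minimal sketch, adapted from the commands in `ansible/README.md` further down in this repository — host IPs, directories, and process/accelerator counts are placeholders:

```bash
# Generate a reduced cosmoflow dataset once, then run the training benchmark against it.
mlpstorage training datagen --hosts 172.X.X.1 172.X.X.2 --num-processes 8 \
    --model cosmoflow --data-dir /mnt/nfs/data --results-dir /mnt/nfs/result \
    --param dataset.num_files_train=100

mlpstorage training run --hosts 172.X.X.1 172.X.X.2 --num-client-hosts 2 \
    --client-host-memory-in-gb 64 --num-accelerators 8 --accelerator-type h100 \
    --model cosmoflow --data-dir /mnt/nfs/data --results-dir /mnt/nfs/result \
    --param dataset.num_files_train=100
```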
-------------------------------------------------------------------------------- /configs/dlio/workload/resnet50_a100.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: resnet50 3 | type: cnn 4 | 5 | framework: tensorflow 6 | 7 | workflow: 8 | generate_data: False 9 | train: True 10 | 11 | dataset: 12 | num_files_train: 1024 13 | num_samples_per_file: 1251 14 | record_length_bytes: 114660.07 15 | record_length_bytes_resize: 150528 16 | data_folder: data/resnet50 17 | format: tfrecord 18 | 19 | train: 20 | computation_time: 0.435 21 | epochs: 5 22 | 23 | reader: 24 | data_loader: tensorflow 25 | read_threads: 8 26 | computation_threads: 8 27 | batch_size: 400 28 | dont_use_mmap: True 29 | 30 | metric: 31 | au: 0.90 32 | -------------------------------------------------------------------------------- /configs/dlio/workload/llama3_8b.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: llama_8b 3 | type: transformer 4 | num_layers: 32 5 | model_datatype: fp16 6 | optimizer_datatype: fp32 7 | parallelism: 8 | pipeline: 1 9 | tensor: 1 10 | zero_stage: 3 11 | transformer: 12 | vocab_size: 128256 13 | hidden_size: 4096 14 | ffn_hidden_size: 14336 15 | num_attention_heads: 32 16 | num_kv_heads: 8 17 | 18 | framework: pytorch 19 | 20 | workflow: 21 | generate_data: False 22 | train: False 23 | checkpoint: True 24 | 25 | checkpoint: 26 | checkpoint_folder: checkpoints/llama_8b 27 | time_between_checkpoints: 5 28 | num_checkpoints_write: 10 29 | num_checkpoints_read: 10 30 | fsync: True -------------------------------------------------------------------------------- /configs/dlio/workload/llama3_70b.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: llama_70b 3 | type: transformer 4 | num_layers: 80 5 | model_datatype: fp16 6 | optimizer_datatype: fp32 7 | parallelism: 8 | tensor: 8 9 | pipeline: 1 10 | zero_stage: 3 11 | transformer: 12 | vocab_size: 128256 13 | hidden_size: 8192 14 | ffn_hidden_size: 28672 15 | num_attention_heads: 128 16 | num_kv_heads: 8 17 | 18 | framework: pytorch 19 | 20 | workflow: 21 | generate_data: False 22 | train: False 23 | checkpoint: True 24 | 25 | checkpoint: 26 | checkpoint_folder: checkpoints/llama_70b 27 | time_between_checkpoints: 5 28 | num_checkpoints_write: 10 29 | num_checkpoints_read: 10 30 | fsync: True -------------------------------------------------------------------------------- /configs/dlio/workload/llama3_405b.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: llama_405b 3 | type: transformer 4 | num_layers: 126 5 | model_datatype: fp16 6 | optimizer_datatype: fp32 7 | parallelism: 8 | tensor: 8 9 | pipeline: 32 10 | zero_stage: 1 11 | transformer: 12 | vocab_size: 128256 13 | hidden_size: 16384 14 | ffn_hidden_size: 53248 15 | num_attention_heads: 128 16 | num_kv_heads: 8 17 | 18 | framework: pytorch 19 | 20 | workflow: 21 | generate_data: False 22 | train: False 23 | checkpoint: True 24 | 25 | checkpoint: 26 | checkpoint_folder: checkpoints/llama_405b 27 | time_between_checkpoints: 5 28 | num_checkpoints_write: 10 29 | num_checkpoints_read: 10 30 | fsync: True -------------------------------------------------------------------------------- /configs/dlio/workload/llama3_1t.yaml: -------------------------------------------------------------------------------- 1 | # we mimic the checkpoint data for megatron-deepspeed 2 | model: 
3 | name: llama_1t 4 | type: transformer 5 | num_layers: 128 6 | model_datatype: fp16 7 | optimizer_datatype: fp32 8 | parallelism: 9 | tensor: 8 10 | pipeline: 64 11 | zero_stage: 1 12 | transformer: 13 | vocab_size: 128256 14 | hidden_size: 25872 15 | ffn_hidden_size: 98304 16 | num_attention_heads: 192 17 | num_kv_heads: 32 18 | 19 | framework: pytorch 20 | 21 | workflow: 22 | generate_data: False 23 | train: False 24 | checkpoint: True 25 | 26 | checkpoint: 27 | checkpoint_folder: checkpoints/llama_1t 28 | time_between_checkpoints: 5 29 | num_checkpoints_write: 10 30 | num_checkpoints_read: 10 31 | fsync: True -------------------------------------------------------------------------------- /configs/dlio/workload/unet3d_a100.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: unet3d 3 | type: cnn 4 | model_size: 499153191 5 | 6 | framework: pytorch 7 | 8 | workflow: 9 | generate_data: False 10 | train: True 11 | checkpoint: False 12 | 13 | dataset: 14 | data_folder: data/unet3d/ 15 | format: npz 16 | num_files_train: 168 17 | num_samples_per_file: 1 18 | record_length_bytes: 146600628 19 | record_length_bytes_stdev: 68341808 20 | record_length_bytes_resize: 2097152 21 | 22 | reader: 23 | data_loader: pytorch 24 | batch_size: 7 25 | read_threads: 4 26 | file_shuffle: seed 27 | sample_shuffle: seed 28 | 29 | train: 30 | epochs: 5 31 | computation_time: 0.636 32 | 33 | checkpoint: 34 | checkpoint_folder: checkpoints/unet3d 35 | checkpoint_after_epoch: 5 36 | epochs_between_checkpoints: 2 37 | 38 | metric: 39 | au: 0.90 -------------------------------------------------------------------------------- /configs/dlio/workload/unet3d_h100.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | name: unet3d 3 | type: cnn 4 | model_size: 499153191 5 | 6 | framework: pytorch 7 | 8 | workflow: 9 | generate_data: False 10 | train: True 11 | checkpoint: False 12 | 13 | dataset: 14 | data_folder: data/unet3d/ 15 | format: npz 16 | num_files_train: 168 17 | num_samples_per_file: 1 18 | record_length_bytes: 146600628 19 | record_length_bytes_stdev: 68341808 20 | record_length_bytes_resize: 2097152 21 | 22 | reader: 23 | data_loader: pytorch 24 | batch_size: 7 25 | read_threads: 4 26 | file_shuffle: seed 27 | sample_shuffle: seed 28 | 29 | train: 30 | epochs: 5 31 | computation_time: 0.323 32 | 33 | checkpoint: 34 | checkpoint_folder: checkpoints/unet3d 35 | checkpoint_after_epoch: 5 36 | epochs_between_checkpoints: 2 37 | 38 | metric: 39 | au: 0.90 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "mlpstorage" 7 | version = "2.0.0b1" 8 | description = "MLPerf Storage Benchmark Suite" 9 | readme = "README.md" 10 | authors = [ 11 | {name = "MLCommons Storage Working Group"} 12 | ] 13 | requires-python = ">=3.10.0" 14 | dependencies = [ 15 | "dlio-benchmark @ git+https://github.com/argonne-lcf/dlio_benchmark.git@mlperf_storage_v2.0", 16 | "psutil>=5.9", 17 | "pyarrow" 18 | ] 19 | 20 | [project.urls] 21 | "Homepage" = "https://github.com/mlcommons/storage" 22 | "Bug Tracker" = "https://github.com/mlcommons/storage/issues" 23 | 24 | [tool.setuptools] 25 | packages = {find = {}} 26 | 27 | [tool.setuptools.package-data] 28 | "mlpstorage" = 
["../configs/dlio/workload/*.yaml"] 29 | 30 | [project.scripts] 31 | mlpstorage = 'mlpstorage.main:main' -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | The best way to contribute to the MLCommons is to get involved with one of our many project communities. You find more information about getting involved with MLCommons [here](https://mlcommons.org/en/get-involved/#getting-started). 4 | 5 | Generally we encourage people to become a MLCommons member if they wish to contribute to MLCommons projects, but outside pull requests are very welcome too. 6 | 7 | Regardless of if you are a member, your organization needs to sign the MLCommons CLA. Please fill out this [CLA sign up form](https://forms.gle/Ew1KkBVpyeJDuRw67) form to get started. 8 | 9 | MLCommons project work is tracked with issue trackers and pull requests. Modify the project in your own fork and issue a pull request once you want other developers to take a look at what you have done and discuss the proposed changes. Ensure that cla-bot and other checks pass for your Pull requests. -------------------------------------------------------------------------------- /mlpstorage/debug.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import types 3 | import traceback 4 | 5 | from functools import partialmethod, wraps 6 | 7 | from mlpstorage.config import MLPS_DEBUG 8 | 9 | 10 | def debug_tryer_wrapper(on_error, debug, logger, func): 11 | @wraps(func) 12 | def debug_tryed(*args, **kwargs): 13 | with DebugTryer(on_error=on_error, debug=debug, logger=logger, description=func.__name__): 14 | return func(*args, **kwargs) 15 | 16 | return debug_tryed 17 | 18 | 19 | def debugger_hook(type, value, tb): 20 | """ 21 | This hook is enabled with: 22 | 23 | `sys.excepthook = debugger_hook` 24 | 25 | This will result in exceptions dropping the user into the pdb debugger 26 | 27 | https://stackoverflow.com/questions/242485/starting-python-debugger-automatically-on-error 28 | 29 | :param type: 30 | :param value: 31 | :param tb: 32 | :return: 33 | """ 34 | if hasattr(sys, 'ps1') or not sys.stderr.isatty(): 35 | # we are in interactive mode or we don't have a tty-like 36 | # device, so we call the default hook 37 | sys.__excepthook__(type, value, tb) 38 | else: 39 | import pdb 40 | import traceback 41 | # we are NOT in interactive mode, print the exception... 42 | traceback.print_exception(type, value, tb) 43 | print() 44 | # ...then start the debugger in post-mortem mode. 
45 | # pdb.pm() # deprecated 46 | pdb.post_mortem(tb) # more "modern" 47 | -------------------------------------------------------------------------------- /.github/workflows/cla.yml: -------------------------------------------------------------------------------- 1 | 2 | name: "cla-bot" 3 | on: 4 | issue_comment: 5 | types: [created] 6 | pull_request_target: 7 | types: [opened,closed,synchronize] 8 | 9 | jobs: 10 | cla-check: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: "MLCommons CLA bot check" 14 | if: (github.event.comment.body == 'recheck') || github.event_name == 'pull_request_target' 15 | # Alpha Release 16 | uses: mlcommons/cla-bot@master 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | # the below token should have repo scope and must be manually added by you in the repository's secret 20 | PERSONAL_ACCESS_TOKEN : ${{ secrets.MLCOMMONS_BOT_CLA_TOKEN }} 21 | with: 22 | path-to-signatures: 'cla-bot/v1/cla.json' 23 | # branch should not be protected 24 | branch: 'main' 25 | allowlist: user1,bot* 26 | remote-organization-name: mlcommons 27 | remote-repository-name: systems 28 | 29 | #below are the optional inputs - If the optional inputs are not given, then default values will be taken 30 | #remote-organization-name: enter the remote organization name where the signatures should be stored (Default is storing the signatures in the same repository) 31 | #remote-repository-name: enter the remote repository name where the signatures should be stored (Default is storing the signatures in the same repository) 32 | #create-file-commit-message: 'For example: Creating file for storing CLA Signatures' 33 | #signed-commit-message: 'For example: $contributorName has signed the CLA in #$pullRequestNo' 34 | #custom-notsigned-prcomment: 'pull request comment with Introductory message to ask new contributors to sign' 35 | #custom-pr-sign-comment: 'The signature to be committed in order to sign the CLA' 36 | #custom-allsigned-prcomment: 'pull request comment when all contributors has signed, defaults to **CLA Assistant Lite bot** All Contributors have signed the CLA.' 37 | -------------------------------------------------------------------------------- /ansible/README.md: -------------------------------------------------------------------------------- 1 | # MLPerf™ Storage Benchmark Suite Setup (Ansible Playbook) 2 | 3 | This repository contains an Ansible playbook to automate the setup of the **MLPerf™ Storage Benchmark Suite** on **bare-metal nodes** across multiple operating systems (Ubuntu, Red Hat, SLES). 4 | 5 | ## 📋 Prerequisites 6 | 7 | - **One node must act as the control node** — this is where Ansible is installed and run. 8 | All other nodes are *target nodes* and **do not need Ansible installed**. 9 | - The control node must be able to SSH into all target nodes. 10 | - Ensure the `ansible_user` has **sudo** privileges on all nodes. 11 | - Set up **password-less SSH and sudo** to avoid authentication interruptions. 12 | 13 | ### Set up password-less SSH (on control node): 14 | 15 | ```bash 16 | ssh-keygen 17 | ssh-copy-id root@172.22.X.X # Run for each target node 18 | ``` 19 | 20 | ## 📦 Installation 21 | 22 | ### Install Python dependencies: 23 | 24 | ```bash 25 | uv venv 26 | source venv/bin/activate 27 | uv pip install ansible ansible-pylibssh 28 | ansible-galaxy collection install -r collections/requirements.yml 29 | ``` 30 | 31 | ## 🗂️ Preparing Mount Directories 32 | 33 | Before running benchmarks, **ensure dataset folders exist** on **every node**. 
34 | 35 | ### Option 1: Local Directory 36 | 37 | Create local folders: 38 | 39 | ```bash 40 | mkdir -p /mnt/nfs/train /mnt/nfs/valid 41 | ``` 42 | 43 | ### Option 2: NFS Mounted Directory 44 | 45 | Use `/mnt/nfs` as a shared mount location across nodes. 46 | 47 | ```bash 48 | sudo apt install nfs-common # Or use zypper/yum based on OS 49 | sudo mkdir -p /mnt/nfs 50 | sudo mount -t nfs 172.22.X.X:/shared-path /mnt/nfs 51 | ``` 52 | 53 | To persist it, add to `/etc/fstab`: 54 | 55 | ```bash 56 | 172.22.X.X:/shared-path /mnt/nfs nfs defaults 0 0 57 | ``` 58 | 59 | ## 🚀 Running Bare-Metal Ansible Setup 60 | 61 | ### 1. Update the inventory file with the target IPs: 62 | 63 | ```bash 64 | nano inventory 65 | ``` 66 | 67 | ### 2. Run the playbook: 68 | 69 | ```bash 70 | cd storage/ansible/ 71 | ansible-playbook -i inventory setup.yml 72 | ``` 73 | 74 | ### 3. Activate virtual environment 75 | 76 | ```bash 77 | source venv/bin/activate 78 | ``` 79 | 80 | ### 4. Data Generation 81 | 82 | This step should run only once per model, as data generation is time-consuming. 83 | 84 | ```bash 85 | mlpstorage training datagen --hosts 172.X.X.1 172.X.X.2 --num-processes 8 --model cosmoflow --data-dir /mnt/nfs/data --results-dir /mnt/nfs/result --param dataset.num_files_train=100 86 | ``` 87 | 88 | ### 5. Run training Benchmark 89 | 90 | ```bash 91 | mlpstorage training run --hosts 172.X.X.1 172.X.X.2 --num-client-hosts 2 --client-host-memory-in-gb 64 --num-accelerators 8 --accelerator-type h100 --model cosmoflow --data-dir /mnt/nfs/data --results-dir /mnt/nfs/result --param dataset.num_files_train=100 92 | ``` -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | # Rules Updates 2 | 3 | - [ ] Define filesystem caching rules in detail 4 | - [ ] Define system json schema and creation process 5 | - [ ] Define allowed time between runs 6 | - [ ] Define rules that use local SSD for caching data 7 | - [ ] Define rules for hyperconverged and local cache 8 | 9 | # Code Updates 10 | - [ ] Configure datasize to collect the memory information from the hosts instead of getting a number of hosts for the calculation 11 | 12 | - [ ] Determine method to use cgroups for memory limitation in the benchmark script. 13 | 14 | - [x] Add a log block at the start of datagen & run that output all the parms being used to be clear on what a run is. 15 | 16 | - [x] Remove accelerator type from datagen 17 | - [x] datasize should output the datagen command to copy and paste 18 | 19 | - [ ] Add autosize parameter for run_benchmark and datasize 20 | - [ ] for run it's just size of dataset based on memory capacity 21 | - [ ] For datasize it needs an input of GB/s for the cluster and list of hosts 22 | - 23 | - [x] Keep a log of mlperfstorage commands executed in a mlperf.history file in results_dir 24 | 25 | - [ ] Add support for datagen to use subdirectories 26 | - [x] Capture cluster information and write to a json document in outputdir. 
27 | - [ ] Figure out how to get all clients for milvus 28 | 29 | ## benchmark[.py | .sh] script 30 | - [x] Unique names for files and directories with structure for benchmark, accelerator, count, run-sequence, run-number 31 | - [x] Better installer that manages dependencies 32 | - [ ] Containerization 33 | - - [ ] Ease of Deployment of Benchmark (just get it working) 34 | - - [ ] Cgroups and resource limits (better cache management) 35 | - [ ] Flush Cache before a run 36 | - [ ] Validate inputs for –closed runs (eg: don’t allow runs against datasets that are too small) 37 | - [ ] Reportgen should run validation against outputs 38 | - [ ] Add better system.json creation to automate the system description for consistency 39 | - - [ ] Add json schema checker for system documents that submitters create 40 | - [ ] Automate execution of multiple runs 41 | - [ ] ~~Add support for code changes in closed to supported categories [ data loader, s3 connector, etc]~~ 42 | - - [ ] ~~Add patches directory that gets applied before execution~~ 43 | - [ ] Add runtime estimation 44 | - [x] and --what-if or --dry-run flag 45 | - [ ] Automate selection of minimum required dataset 46 | - [ ] ~~Determine if batch sizes in MLPerf Training are representative of batch sizes for realistically sized datasets~~ 47 | - [ ] Split system.json into automatically capturable (clients) and manual (storage) 48 | - [ ] Define system.json schema and add schema checker to the tool for reportgen 49 | - [ ] Add report-dir csv of results from tests as they are run 50 | - [ ] Collect versions of all prerequisite packages for storage and dlio 51 | 52 | ## DLIO Improvements 53 | - [ ] Reduce verbosity of logging 54 | - [ ] Add callback handler for custom monitoring 55 | - - [ ] SPECStorage uses a “PRIME_MON_SCRIPT” environment variable that will execute at different times 56 | - - [ ] Checkpoint_bench uses RPC to call execution which can be wrapped externally 57 | - [ ] Add support for DIRECTIO 58 | - [ ] Add seed for dataset creation so that distribution of sizes is the same for all submitters (file 1 = mean + x bytes, file 2 = mean + y bytes, etc) 59 | - [ ] Determine if global barrier for each batch matches industry behavior 60 | 61 | ## Results Presentation 62 | - [ ] Better linking and presentation of system diagrams (add working links to system diagrams to supplementals) 63 | - [ ] Define presentation and rules for hyperconverged or systems with local cache -------------------------------------------------------------------------------- /system_configuration.yaml: -------------------------------------------------------------------------------- 1 | System: 2 | name: FastAmazingAcmeStorage 9000 3 | description: 4 | storage_location: [ remote | local | hyper-converged ] 5 | client_software: [ in-box | proprietary ] 6 | storage_interface: [ block | file | object ] 7 | required_rack_units: 8 | shared_capabilities: 9 | multi_host_support: True # False is used for local storage 10 | simultaneous_write_support: False # Are simultaneous writes by multiple hosts supported in the submitted configuration 11 | simultaneous_read__support: True # Are simultaneous reads by multiple hosts supported in the submitted configuration 12 | max_sequential_read: # Optional - GiB/s 13 | max_sequential_write: # Optional - GiB/s 14 | max_random_read: # Optional - GiB/s 15 | max_random_write: # Optional - GiB/s 16 | 17 | PowerRequirements: 18 | dlio_client: 19 | quantity: # number of dlio_client nodes 20 | psu1_nameplate_power: # power in watts 21 | 
psu2_nameplate_power: # power in watts 22 | psu3_nameplate_power: # power in watts 23 | design_power: 2400 24 | num_active_psus: 2 25 | num_passive_psus: 1 26 | 27 | # All storage nodes need to be listed (data, metadata, etc) as well as any required backed switching 28 | storage_data_node: 29 | quantity: # number of storage data nodes 30 | psu1_nameplate_power: # power in watts 31 | psu2_nameplate_power: # power in watts 32 | psu3_nameplate_power: # power in watts 33 | design_power: 2400 34 | num_active_psus: 2 35 | num_passive_psus: 1 36 | backend_switch: 37 | quantity: 1 38 | psu1_nameplate_power: 700 # network PSU 39 | psu2_nameplate_power: 700 # network PSU 40 | design_power: 700 41 | num_active_psus: 1 42 | num_passive_psus: 1 43 | 44 | 45 | # All nodes used need to be listed. Clients, Data storage, metadata, front-end, back-end, etc 46 | Nodes: 47 | # Useful name for the client describing it's role in the system under test 48 | dlio_client: # This can be DLIO Client, storage node, storage controller, AwesomeMarketingName_Type1, etc 49 | quantity: 8 # How many of this node 50 | hardware: 51 | model: SMC 52 | rack_units: 2 53 | power_supplies: 2 54 | psu_configuration: active/passive 55 | psu_rating: 1200 56 | memory_capacity: 256GB 57 | memory_configuration: 8x32GB 58 | cpu_qty: 2 59 | cpu_model: AMD 9555 60 | cpu_cores: 96 61 | networking: 62 | management: 63 | model: intel i210 64 | speed: 1Gbps 65 | qty: 1 66 | operating_system: 67 | name: Ubuntu 68 | version: 22.04 LTS 69 | release_date: 2022-04-12 70 | kernel_version: 5.15.0-56-generic 71 | cpu_architecture: x86_64 72 | tuning: 73 | # All non-default tunings for OS need to be listed 74 | mpi_configuration: 75 | environment_variables: 76 | version: Open MPI 4.1.4 77 | sysctl_parameters: 78 | 79 | storage_data_node: # This can be DLIO Client, storage node, storage controller, AwesomeMarketingName_Type1, etc 80 | quantity: 8 # How many of this node 81 | hardware: 82 | model: SMC 83 | rack_units: 2 84 | power_supplies: 2 85 | psu_configuration: active/passive 86 | psu_rating: 1200 87 | memory_capacity: 256GB 88 | memory_configuration: 8x32GB 89 | cpu_qty: 2 90 | cpu_model: AMD 9555 91 | cpu_cores: 96 92 | networking: 93 | management: 94 | model: intel i210 95 | speed: 1Gbps 96 | qty: 1 97 | operating_system: 98 | name: Ubuntu 99 | version: 22.04 LTS 100 | release_date: 2022-04-12 101 | kernel_version: 5.15.0-56-generic 102 | cpu_architecture: x86_64 103 | tuning: 104 | mpi_configuration: 105 | environment_variables: 106 | version: Open MPI 4.1.4 107 | sysctl_parameters: 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /mlpstorage/benchmarks/vectordbbench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from mlpstorage.benchmarks.base import Benchmark 5 | from mlpstorage.config import CONFIGS_ROOT_DIR, BENCHMARK_TYPES 6 | from mlpstorage.utils import read_config_from_file 7 | 8 | 9 | class VectorDBBenchmark(Benchmark): 10 | 11 | VECTORDB_CONFIG_PATH = "vectordbbench" 12 | VDBBENCH_BIN = "vdbbench" 13 | BENCHMARK_TYPE = BENCHMARK_TYPES.vector_database 14 | 15 | def __init__(self, args): 16 | super().__init__(args) 17 | self.command_method_map = { 18 | "datagen": self.execute_datagen, 19 | "run-search": self.execute_run 20 | } 21 | 22 | self.command = args.command 23 | self.category = args.category if hasattr(args, 'category') else None 24 | self.config_path = os.path.join(CONFIGS_ROOT_DIR, 
self.VECTORDB_CONFIG_PATH) 25 | self.config_name = args.config if hasattr(args, 'config') and args.config else "default" 26 | self.yaml_params = read_config_from_file(os.path.join(self.config_path, f"{self.config_name}.yaml")) 27 | 28 | self.verify_benchmark() 29 | 30 | self.logger.status(f'Instantiated the VectorDB Benchmark...') 31 | 32 | def _run(self): 33 | """Execute the appropriate command based on the command_method_map""" 34 | if self.command in self.command_method_map: 35 | self.logger.verboser(f"Executing command: {self.command}") 36 | self.command_method_map[self.command]() 37 | else: 38 | self.logger.error(f"Unsupported command: {self.command}") 39 | sys.exit(1) 40 | 41 | def build_command(self, script_name, additional_params=None): 42 | """ 43 | Build a command string for executing a script with appropriate parameters 44 | 45 | Args: 46 | script_name (str): Name of the script to execute (e.g., "load_vdb.py" or "simple_bench.py") 47 | additional_params (dict, optional): Additional parameters to add to the command 48 | 49 | Returns: 50 | str: The complete command string 51 | """ 52 | # Ensure output directory exists 53 | os.makedirs(self.run_result_output, exist_ok=True) 54 | 55 | # Build the base command 56 | config_file = os.path.join(self.config_path, f"{self.config_name}.yaml") 57 | 58 | cmd = f"{script_name}" 59 | cmd += f" --config {config_file}" 60 | 61 | if script_name == "load-vdb": 62 | if self.args.force: 63 | cmd += " --force" 64 | 65 | # Add host and port if provided (common to both datagen and run) 66 | if hasattr(self.args, 'host') and self.args.host: 67 | cmd += f" --host {self.args.host}" 68 | if hasattr(self.args, 'port') and self.args.port: 69 | cmd += f" --port {self.args.port}" 70 | 71 | # Add any additional parameters 72 | if additional_params: 73 | for param, attr in additional_params.items(): 74 | if attr: 75 | cmd += f" --{param} {attr}" 76 | 77 | return cmd 78 | 79 | def execute_datagen(self): 80 | """Execute the data generation command using load_vdb.py""" 81 | additional_params = { 82 | "dimension": self.args.dimension, 83 | "num-shards": self.args.num_shards, 84 | "vector-dtype": self.args.vector_dtype, 85 | "num-vectors": self.args.num_vectors, 86 | "distribution": self.args.distribution, 87 | "batch-size": self.args.batch_size, 88 | "chunk-size": self.args.chunk_size, 89 | } 90 | cmd = self.build_command("load-vdb", additional_params) 91 | 92 | self.logger.verbose(f'Executing data generation.') 93 | self._execute_command(cmd) 94 | 95 | def execute_run(self): 96 | """Execute the benchmark run command using simple_bench.py""" 97 | # Define additional parameters specific to the run command 98 | additional_params = { 99 | "processes": self.args.num_query_processes, 100 | "batch-size": self.args.batch_size, 101 | "runtime": self.args.runtime, 102 | "queries": self.args.queries, 103 | "report-count": self.args.report_count, 104 | "output-dir": self.run_result_output, 105 | } 106 | 107 | cmd = self.build_command("vdbbench", additional_params) 108 | self.logger.verbose(f'Execuging benchmark run.') 109 | self._execute_command(cmd, output_file_prefix=f"{self.BENCHMARK_TYPE.value}_{self.args.command}") 110 | 111 | -------------------------------------------------------------------------------- /ansible/setup.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Copyright (c) 2022 Dell Inc, or its subsidiaries. 
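# Overview of the tasks below: install curl/git/mpich, bootstrap uv, recreate a Python 3.12
# virtualenv at /root/venv, replace Open MPI with MPICH on Debian/RHEL/SLES hosts, and
# finally install mlpstorage from GitHub into that environment with uv.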
3 | --- 4 | - name: Install MLPerf Storage v2.0 5 | hosts: clients 6 | become: true 7 | tasks: 8 | - name: COMMON | Ensure curl, git and python build dependencies are installed 9 | ansible.builtin.package: 10 | name: 11 | - curl 12 | - git 13 | - mpich 14 | state: present 15 | 16 | - name: COMMON | Download UV installer 17 | ansible.builtin.get_url: 18 | url: https://astral.sh/uv/install.sh 19 | dest: /tmp/install.sh 20 | mode: '0755' 21 | 22 | - name: COMMON | Install UV 23 | ansible.builtin.command: "sh /tmp/install.sh" 24 | 25 | - name: COMMON | Recursively remove directory 26 | ansible.builtin.file: 27 | path: /root/venv/ 28 | state: absent 29 | 30 | - name: COMMON | Create and activate Python virtual environment 31 | ansible.builtin.command: /root/.local/bin/uv venv --python 3.12 /root/venv 32 | 33 | - name: DEBIAN | Install OS-specific system dependencies 34 | when: ansible_os_family == 'Debian' 35 | block: 36 | - name: DEBIAN | Remove Open MPI 37 | ansible.builtin.package: 38 | state: absent 39 | name: 40 | - libopenmpi-dev 41 | - openmpi-bin 42 | - openmpi-common 43 | - name: DEBIAN | Install Python and MPI packages 44 | ansible.builtin.package: 45 | name: 46 | - python3-dev 47 | state: present 48 | 49 | - name: RHEL | Install OS-specific system dependencies 50 | when: ansible_os_family == 'RedHat' 51 | block: 52 | - name: RHEL | Remove Open MPI 53 | ansible.builtin.package: 54 | state: absent 55 | name: 56 | - openmpi-devel 57 | - openmpi 58 | - name: RHEL | Install Python and MPI packages 59 | ansible.builtin.package: 60 | name: 61 | - python3-devel 62 | - mpich-devel 63 | state: present 64 | 65 | - name: RHEL | Export MPI paths 66 | shell: | 67 | echo 'export LD_LIBRARY_PATH=/usr/lib64/mpich/lib:$LD_LIBRARY_PATH' >> /root/.bashrc 68 | echo 'export PATH=/usr/lib64/mpich/bin:$PATH' >> /root/.bashrc 69 | args: 70 | executable: /bin/bash 71 | 72 | # Required: newer GCC needed to compile mpi4py correctly 73 | # MLPerf Storage depends on mpi4py which in turn requires a valid MPI compiler and development environment. 74 | # On SLES, the default GCC may not be available in PATH or may be outdated. Installing GCC 12 and explicitly setting it as the default resolves these issues. 
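# The alternatives task below pins gcc/g++/cpp to the GCC 12 binaries, so the MPICH compiler
# wrapper resolves to the newer toolchain when mpi4py is built during the later pip install.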
75 | - name: SLES | Install OS-specific system dependencies 76 | when: ansible_os_family == 'Suse' 77 | block: 78 | - name: SLES | Remove Open MPI 79 | ansible.builtin.package: 80 | state: absent 81 | name: 82 | - openmpi-devel 83 | - openmpi 84 | - name: SLES | Install base system packages 85 | ansible.builtin.package: 86 | name: 87 | - python3-devel 88 | - mpich-devel 89 | - gcc12 90 | - gcc12-c++ 91 | - cpp12 92 | state: present 93 | 94 | - name: SLES | Export MPI paths 95 | shell: | 96 | echo 'export PATH=/usr/lib64/mpi/gcc/mpich/bin:$PATH' >> /root/.bashrc 97 | echo 'export LD_LIBRARY_PATH=/usr/lib64/mpi/gcc/mpich/lib64:$LD_LIBRARY_PATH' >> /root/.bashrc 98 | echo 'export CC=/usr/lib64/mpi/gcc/mpich/bin/mpicc' >> /root/.bashrc 99 | args: 100 | executable: /bin/bash 101 | 102 | - name: SLES | Set GCC 12 as default 103 | community.general.alternatives: 104 | name: "{{ item.name }}" 105 | link: "{{ item.link }}" 106 | path: "{{ item.path }}" 107 | priority: 80 108 | loop: 109 | - { name: gcc, link: /usr/bin/gcc, path: "/usr/bin/gcc-12" } 110 | - { name: g++, link: /usr/bin/g++, path: "/usr/bin/g++-12" } 111 | - { name: cpp, link: /usr/bin/cpp, path: "/usr/bin/cpp-12" } 112 | - { name: c++, link: /usr/bin/c++, path: "/usr/bin/g++-12" } 113 | 114 | - name: COMMON | Install MLPerf Storage from GitHub using UV inside venv 115 | ansible.builtin.shell: | 116 | /root/.local/bin/uv pip install --native-tls --index https://pypi.nvidia.com/ "git+https://github.com/mlcommons/storage.git@main" 117 | args: 118 | executable: /bin/bash 119 | chdir: /root/venv -------------------------------------------------------------------------------- /mlpstorage/config.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import enum 3 | import os 4 | import pathlib 5 | import tempfile 6 | 7 | 8 | def check_env(setting, default_value=None): 9 | """ 10 | This function checks the config, the default value, and the environment variables in the correct order for setting 11 | our constants. 
Lower position overrides a higher position 12 | - default_value 13 | - value_from_config 14 | - environment variable 15 | """ 16 | value_from_environment = os.environ.get(setting) 17 | if type(value_from_environment) is str: 18 | if value_from_environment.lower() == 'true': 19 | value_from_environment = True 20 | elif value_from_environment.lower() == 'false': 21 | value_from_environment = False 22 | 23 | set_value = None 24 | if value_from_environment: 25 | set_value = value_from_environment 26 | elif default_value: 27 | set_value = default_value 28 | else: 29 | set_value = None 30 | 31 | return set_value 32 | 33 | 34 | MLPS_DEBUG = check_env('MLPS_DEBUG', False) 35 | HISTFILE = os.path.join(pathlib.Path.home(), "mlps_history") 36 | 37 | def get_datetime_string(): 38 | return datetime.datetime.now().strftime("%Y%m%d_%H%M%S") 39 | 40 | # Define constants: 41 | DATETIME_STR = get_datetime_string() 42 | CONFIGS_ROOT_DIR = os.path.join(os.path.split(os.path.abspath(os.path.dirname(__file__)))[0], "configs") 43 | 44 | MLPSTORAGE_BIN_NAME = "mlpstorage" 45 | 46 | HYDRA_OUTPUT_SUBDIR = "dlio_config" 47 | 48 | COSMOFLOW = "cosmoflow" 49 | RESNET = "resnet50" 50 | UNET = "unet3d" 51 | MODELS = [COSMOFLOW, RESNET, UNET] 52 | 53 | H100 = "h100" 54 | A100 = "a100" 55 | ACCELERATORS = [H100, A100] 56 | 57 | OPEN = "open" 58 | CLOSED = "closed" 59 | CATEGORIES = [OPEN, CLOSED] 60 | 61 | LLAMA3_8B = "llama3-8b" 62 | LLAMA3_70B = 'llama3-70b' 63 | LLAMA3_405B = 'llama3-405b' 64 | LLAMA3_1T = 'llama3-1t' 65 | LLM_MODELS = [LLAMA3_70B, LLAMA3_405B, LLAMA3_1T, LLAMA3_8B] 66 | 67 | LLM_SUBSET_PROCS = 8 68 | # Defined as (MinProcs, ZeroLevel, GPU per Data Parallel Instance, Closed GPU Count) 69 | LLM_ALLOWED_VALUES = { 70 | LLAMA3_1T: (LLM_SUBSET_PROCS, 1, 8*64, 8*64*2), # 8*64*2 = 1,024 processes 71 | LLAMA3_405B: (LLM_SUBSET_PROCS, 1, 8*32, 8*32*2), # 8*32*2 = 512 processes 72 | LLAMA3_70B: (LLM_SUBSET_PROCS, 3, 8, 8*8), # 8*8*1 = 64 processes 73 | LLAMA3_8B: (LLM_SUBSET_PROCS, 3, 8, 8) # 8*1*1 = 8 processes 74 | } 75 | 76 | # Defined as (Model GB, Optimizer GB) 77 | # These need to be updated with actual values 78 | LLM_SIZE_BY_RANK = { 79 | LLAMA3_1T: (2571, 15426), 80 | LLAMA3_405B: (755, 4533), 81 | LLAMA3_70B: (130, 781), 82 | LLAMA3_8B: (15, 90) 83 | } 84 | 85 | CHECKPOINT_RANKS_STRINGS = "\n ".join( 86 | [f'{key}: CLOSED in [{value[0]} || {value[3]}], OPEN allows a multiple of {value[2]}' for key, value in LLM_ALLOWED_VALUES.items()]) 87 | 88 | LLM_MODELS_STRINGS = "\n ".join(LLM_MODELS) 89 | 90 | MPIRUN = "mpirun" 91 | MPIEXEC = "mpiexec" 92 | MPI_CMDS = [MPIRUN, MPIEXEC] 93 | 94 | STEPS_PER_EPOCH = 500 95 | MOST_MEMORY_MULTIPLIER = 5 96 | MAX_READ_THREADS_TRAINING = 32 97 | 98 | DEFAULT_HOSTS = ["127.0.0.1",] 99 | 100 | MPI_RUN_BIN = os.environ.get("MPI_RUN_BIN", MPIRUN) 101 | MPI_EXEC_BIN = os.environ.get("MPI_EXEC_BIN", MPIEXEC) 102 | ALLOW_RUN_AS_ROOT = True 103 | 104 | MAX_NUM_FILES_TRAIN = 128*1024 105 | 106 | DEFAULT_RESULTS_DIR = os.path.join(tempfile.gettempdir(), f"mlperf_storage_results") 107 | 108 | import enum 109 | 110 | class EXIT_CODE(enum.IntEnum): 111 | SUCCESS = 0 112 | GENERAL_ERROR = 1 113 | INVALID_ARGUMENTS = 2 114 | FILE_NOT_FOUND = 3 115 | PERMISSION_DENIED = 4 116 | CONFIGURATION_ERROR = 5 117 | FAILURE = 6 118 | TIMEOUT = 7 119 | # Add more as needed 120 | 121 | def __str__(self): 122 | return f"{self.name} ({self.value})" 123 | class EXEC_TYPE(enum.Enum): 124 | MPI = "mpi" 125 | DOCKER = "docker" 126 | def __str__(self): 127 | return self.value 128 | 129 | 130 | class 
PARAM_VALIDATION(enum.Enum): 131 | CLOSED = "closed" 132 | OPEN = "open" 133 | INVALID = "invalid" 134 | 135 | 136 | class BENCHMARK_TYPES(enum.Enum): 137 | training = "training" 138 | vector_database = "vector_database" 139 | checkpointing = "checkpointing" 140 | 141 | # Enum for supported search metric types of COSINE, L2, IP 142 | SEARCH_METRICS = ["COSINE", "L2", "IP"] 143 | 144 | # Supported Index Types is only DISKANN but more could be supported in the future 145 | INDEX_TYPES = ["DISKANN"] 146 | 147 | # Supported vector data types is currently only FLOAT_VECTOR but more could be supported in the future 148 | VECTOR_DTYPES = ["FLOAT_VECTOR"] 149 | 150 | # Supported distributions are currently uniform, normal, or zipfian 151 | DISTRIBUTIONS = ["uniform", "normal", "zipfian"] 152 | 153 | # Default runtime for vector database benchmarks if not defined 154 | VECTORDB_DEFAULT_RUNTIME = 60 -------------------------------------------------------------------------------- /mlpstorage/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.9 2 | #!/usr/bin/env python3 3 | import signal 4 | import sys 5 | 6 | from mlpstorage.benchmarks import TrainingBenchmark, VectorDBBenchmark, CheckpointingBenchmark 7 | from mlpstorage.cli import parse_arguments, validate_args, update_args 8 | from mlpstorage.config import HISTFILE, DATETIME_STR, EXIT_CODE, DEFAULT_RESULTS_DIR, get_datetime_string, HYDRA_OUTPUT_SUBDIR 9 | from mlpstorage.debug import debugger_hook, MLPS_DEBUG 10 | from mlpstorage.history import HistoryTracker 11 | from mlpstorage.mlps_logging import setup_logging, apply_logging_options 12 | from mlpstorage.reporting import ReportGenerator 13 | 14 | logger = setup_logging("MLPerfStorage") 15 | signal_received = False 16 | 17 | 18 | def signal_handler(sig, frame): 19 | """Handle signals like SIGINT (Ctrl+C) and SIGTERM.""" 20 | global signal_received 21 | 22 | signal_name = signal.Signals(sig).name 23 | logger.warning(f"Received signal {signal_name} ({sig})") 24 | 25 | # Set the flag to indicate we've received a signal 26 | signal_received = True 27 | 28 | # For SIGTERM, exit immediately 29 | if sig in (signal.SIGTERM, signal.SIGINT): 30 | logger.info("Exiting immediately due to SIGTERM") 31 | sys.exit(EXIT_CODE.INTERRUPTED) 32 | 33 | 34 | def run_benchmark(args, run_datetime): 35 | """Run a benchmark based on the provided args.""" 36 | program_switch_dict = dict( 37 | training=TrainingBenchmark, 38 | checkpointing=CheckpointingBenchmark, 39 | vectordb=VectorDBBenchmark, 40 | ) 41 | 42 | benchmark_class = program_switch_dict.get(args.program) 43 | if not benchmark_class: 44 | print(f"Unsupported program: {args.program}") 45 | return 1 46 | 47 | benchmark = benchmark_class(args, run_datetime=run_datetime, logger=logger) 48 | try: 49 | ret_code = benchmark.run() 50 | except Exception as e: 51 | logger.error(f"Error running benchmark: {str(e)}") 52 | ret_code = EXIT_CODE.ERROR 53 | finally: 54 | logger.status(f'Writing metadata for benchmark to: {benchmark.metadata_file_path}') 55 | try: 56 | benchmark.write_metadata() 57 | return ret_code 58 | except Exception as e: 59 | logger.error(f"Error writing metadata: {str(e)}") 60 | return ret_code 61 | 62 | 63 | def main(): 64 | signal.signal(signal.SIGINT, signal_handler) 65 | signal.signal(signal.SIGTERM, signal_handler) 66 | global signal_received 67 | 68 | args = parse_arguments() 69 | if args.debug or MLPS_DEBUG: 70 | sys.excepthook = debugger_hook 71 | 72 | apply_logging_options(logger, 
args) 73 | 74 | datetime_str = DATETIME_STR 75 | 76 | hist = HistoryTracker(history_file=HISTFILE, logger=logger) 77 | if args.program != "history": 78 | # Don't save history commands 79 | hist.add_entry(sys.argv, datetime_str=datetime_str) 80 | 81 | # Handle history command separately 82 | if args.program == 'history': 83 | new_args = hist.handle_history_command(args) 84 | 85 | # Check if we got new args back (not just an exit code) 86 | if isinstance(new_args, EXIT_CODE): 87 | # We got an exit code, so return it 88 | return new_args 89 | 90 | elif isinstance(new_args, object) and hasattr(new_args, 'program'): 91 | # Check if logging options have changed 92 | if (hasattr(new_args, 'debug') and new_args.debug != args.debug) or \ 93 | (hasattr(new_args, 'verbose') and new_args.verbose != args.verbose) or \ 94 | (hasattr(new_args, 'stream_log_level') and new_args.stream_log_level != args.stream_log_level): 95 | # Apply the new logging options 96 | apply_logging_options(logger, new_args) 97 | 98 | args = new_args 99 | else: 100 | # If handle_history_command returned an exit code, return it 101 | return new_args 102 | 103 | if args.program == "reports": 104 | results_dir = args.results_dir if hasattr(args, 'results_dir') else DEFAULT_RESULTS_DIR 105 | report_generator = ReportGenerator(results_dir, args, logger=logger) 106 | return report_generator.generate_reports() 107 | 108 | run_datetime = datetime_str 109 | 110 | # Handle vdb end conditions, num_process standardization, and args.params flattening 111 | update_args(args) 112 | 113 | # For other commands, run the benchmark 114 | for i in range(args.loops): 115 | if signal_received: 116 | print(f'Caught signal, exiting...') 117 | return EXIT_CODE.INTERRUPTED 118 | 119 | ret_code = run_benchmark(args, run_datetime) 120 | if ret_code != EXIT_CODE.SUCCESS: 121 | logger.error(f"Benchmark failed after {i+1} iterations") 122 | return EXIT_CODE.FAILURE 123 | 124 | # Set datetime for next iteration 125 | run_datetime = get_datetime_string() 126 | 127 | return EXIT_CODE.SUCCESS 128 | 129 | if __name__ == "__main__": 130 | sys.exit(main()) 131 | -------------------------------------------------------------------------------- /mlpstorage/mlps_logging.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import datetime 3 | import enum 4 | import logging 5 | import sys 6 | 7 | # Define the custom log levels 8 | CRITICAL = logging.CRITICAL 9 | FATAL = CRITICAL 10 | ERROR = logging.ERROR 11 | RESULT = 35 12 | WARNING = logging.WARNING # 30 13 | WARN = WARNING 14 | STATUS = 25 15 | INFO = logging.INFO # 20 16 | VERBOSE = 19 17 | VERBOSER = 18 18 | VERBOSEST = 17 19 | DEBUG = logging.DEBUG # 10 20 | RIDICULOUS = 7 21 | LUDICROUS = 5 22 | PLAID = 3 23 | NOTSET = logging.NOTSET 24 | 25 | DEFAULT_STREAM_LOG_LEVEL = logging.INFO 26 | 27 | custom_levels = { 28 | 'RESULT': RESULT, 29 | 'STATUS': STATUS, 30 | 'VERBOSE': VERBOSE, 31 | 'VERBOSER': VERBOSER, 32 | 'VERBOSEST': VERBOSEST, 33 | 'RIDICULOUS': RIDICULOUS, 34 | 'LUDICROUS': LUDICROUS, 35 | 'PLAID': PLAID 36 | } 37 | 38 | 39 | # Custom colors for various logging levels 40 | class COLORS(enum.Enum): 41 | grey = "\033[0;30m" 42 | red = "\033[0;31m" 43 | green = "\033[0;32m" 44 | yellow = "\033[0;33m" 45 | blue = "\033[0;34m" 46 | purple = "\033[0;35m" 47 | cyan = "\033[0;36m" 48 | white = "\033[0;37m" 49 | igrey = "\033[0;90m" 50 | ired = "\033[0;91m" 51 | igreen = "\033[0;92m" 52 | iyellow = "\033[0;93m" 53 | iblue = "\033[0;94m" 54 | ipurple = 
"\033[0;95m" 55 | icyan = "\033[0;96m" 56 | iwhite = "\033[0;97m" 57 | bgrey = "\033[1;30m" 58 | bred = "\033[1;31m" 59 | bgreen = "\033[1;32m" 60 | byellow = "\033[1;33m" 61 | bblue = "\033[1;34m" 62 | bpurple = "\033[1;35m" 63 | bcyan = "\033[1;36m" 64 | bwhite = "\033[1;37m" 65 | bigrey = "\033[1;90m" 66 | bired = "\033[1;91m" 67 | bigreen = "\033[1;92m" 68 | biyellow = "\033[1;93m" 69 | biblue = "\033[1;94m" 70 | bipurple = "\033[1;95m" 71 | bicyan = "\033[1;96m" 72 | biwhite = "\033[1;97m" 73 | normal = "\033[0m" 74 | 75 | 76 | level_to_color_map = { 77 | ERROR: COLORS.bred, 78 | CRITICAL: COLORS.bred, 79 | WARNING: COLORS.yellow, 80 | RESULT: COLORS.green, 81 | STATUS: COLORS.bblue, 82 | INFO: COLORS.normal, 83 | VERBOSE: COLORS.normal, 84 | VERBOSER: COLORS.normal, 85 | VERBOSEST: COLORS.normal, 86 | DEBUG: COLORS.normal, 87 | RIDICULOUS: COLORS.normal, 88 | LUDICROUS: COLORS.normal, 89 | PLAID: COLORS.bipurple, 90 | } 91 | 92 | 93 | def get_level_color(level): 94 | return level_to_color_map.get(level, COLORS.normal).value 95 | 96 | 97 | def log_level_factory(level_name): 98 | level_num = custom_levels.get(level_name, logging.NOTSET) 99 | 100 | def log_func(self, message, *args, **kwargs): 101 | self._log(level_num, message, args, **kwargs) 102 | return log_func 103 | 104 | 105 | # Add the custom levels to the logger 106 | for custom_name, custom_num in custom_levels.items(): 107 | logging.addLevelName(custom_num, custom_name) 108 | setattr(logging.Logger, custom_name.lower(), log_level_factory(custom_name)) 109 | 110 | 111 | class ColoredStandardFormatter(logging.Formatter): 112 | def format(self, record): 113 | formatted_time = f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" 114 | color = get_level_color(record.levelno) 115 | return f"{color}{formatted_time}|{record.levelname}: {record.getMessage()}{COLORS['normal'].value}" 116 | 117 | 118 | class ColoredDebugFormatter(logging.Formatter): 119 | def format(self, record): 120 | formatted_time = f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" 121 | color = get_level_color(record.levelno) 122 | return f"{color}{formatted_time}|{record.levelname}:{record.module}:{record.lineno}: " \ 123 | f"{record.getMessage()}{COLORS['normal'].value}" 124 | 125 | 126 | def setup_logging(name=__name__, stream_log_level=DEFAULT_STREAM_LOG_LEVEL): 127 | if isinstance(stream_log_level, str): 128 | stream_log_level = logging.getLevelName(stream_log_level.upper()) 129 | 130 | _logger = logging.getLogger(name) 131 | _logger.setLevel(logging.DEBUG) 132 | 133 | stream_handler = logging.StreamHandler() 134 | stream_handler.setFormatter(ColoredStandardFormatter()) 135 | stream_handler.setLevel(stream_log_level) # Adjust this level as needed 136 | _logger.addHandler(stream_handler) 137 | 138 | return _logger 139 | 140 | 141 | def apply_logging_options(_logger, args): 142 | if args is None: 143 | return 144 | # Set log level to VERBOSE unless the current log level is higher. 
In which case set it 1 level higher 145 | stream_handlers = [h for h in _logger.handlers if not hasattr(h, 'baseFilename')] 146 | log_levels = sorted([v for k, v in sys.modules[__name__].__dict__.items() if type(v) is int]) 147 | 148 | if hasattr(args, "stream_log_level") and args.stream_log_level: 149 | for stream_handler in stream_handlers: 150 | stream_handler.setLevel(args.stream_log_level.upper()) 151 | 152 | if hasattr(args, "verbose") and args.verbose: 153 | for stream_handler in stream_handlers: 154 | if stream_handler.level > VERBOSE: 155 | stream_handler.setLevel(VERBOSE) 156 | 157 | if hasattr(args, "debug") and args.debug: 158 | for stream_handler in stream_handlers: 159 | stream_handler.setFormatter(ColoredDebugFormatter()) 160 | if stream_handler.level > DEBUG: 161 | stream_handler.setLevel(DEBUG) 162 | -------------------------------------------------------------------------------- /mlpstorage/DEFINING_RULES_CHECKS.md: -------------------------------------------------------------------------------- 1 | # Defining rules checks in rules.py 2 | 3 | Short Version: Add new checks by adding methods to `TrainingRulesChecker` and `CheckpointingRulesChecker` in `rules.py` that start with `check_*` and return an `Issue` object or a list of `Issue` objects. 4 | 5 | ## RulesCheckers 6 | In rules.py, there is a class per workload for checking rules: 7 | - `TrainingRulesChecker` 8 | - `CheckpointingRulesChecker` 9 | 10 | These classes are subclasses of `RulesChecker`. The parent class has the following attributes: 11 | - `self.benchmark_run`, a `BenchmarkRun` object. 12 | - `self.issues`. a list of `Issue` objects 13 | 14 | When a `RulesChecker` instance is run, all methods starting with `check_*` will run. Each `check_*` method is expected to operate on `self.benchmark_run` and return an `Issue` object or a list of `Issue` objects. 15 | 16 | ## Issues 17 | `Issue` defines the results of a rules check and may be a "Non-Issue" or an issue that has verified rules compatibility. 18 | 19 | ```python 20 | @dataclass 21 | class Issue: 22 | validation: PARAM_VALIDATION 23 | message: str 24 | parameter: Optional[str] = None 25 | expected: Optional[Any] = None 26 | actual: Optional[Any] = None 27 | ``` 28 | 29 | Here's an example of `Issue` creation that results in 3 types of issues, OPEN, CLOSED, and INVALID: 30 | 31 | ```python 32 | def check_allowed_params(self) -> Optional[Issue]: 33 | """ 34 | This method will verify that the only parameters that were set were the allowed parameters. 
35 | Allowed for closed: 36 | - dataset.num_files_train 37 | - dataset.num_subfolders_train 38 | - 39 | :return: 40 | """ 41 | closed_allowed_params = ['dataset.num_files_train', 'dataset.num_subfolders_train', 'dataset.data_folder', 42 | 'reader.read_threads', 'reader.computation_threads', 'reader.transfer_size', 43 | 'reader.prefetch_size', 'checkpoint.checkpoint_folder', 44 | 'storage.storage_type', 'storage.storage_root'] 45 | open_allowed_params = ['framework', 'dataset.format', 'dataset.num_samples_per_file', 'reader.data_loader'] 46 | issues = [] 47 | for param, value in self.benchmark_run.override_parameters.items(): 48 | self.logger.debug(f"Processing override parameter: {param} = {value}") 49 | if param in closed_allowed_params: 50 | issues.append(Issue( 51 | validation=PARAM_VALIDATION.CLOSED, 52 | message=f"Closed parameter override allowed: {param} = {value}", 53 | parameter="Overrode Parameters", 54 | actual=value 55 | )) 56 | elif param in open_allowed_params: 57 | issues.append(Issue( 58 | validation=PARAM_VALIDATION.OPEN, 59 | message=f"Open parameter override allowed: {param} = {value}", 60 | parameter="Overrode Parameters", 61 | actual=value 62 | )) 63 | else: 64 | issues.append(Issue( 65 | validation=PARAM_VALIDATION.INVALID, 66 | message=f"Disallowed parameter override: {param} = {value}", 67 | parameter="Overrode Parameters", 68 | expected="None", 69 | actual=value 70 | )) 71 | return issues 72 | ``` 73 | 74 | ## BenchmarkRun 75 | A `BenchmarkRun` object maps the logs of a completed run and the instance of an executing run to the same object. This allows checkers to run before a test is executed and as part of the submission checker after tests have executed. 76 | 77 | The relevant attributes on a `BenchmarkRun`: 78 | ```python 79 | class BenchmarkRun: 80 | """ 81 | Represents a benchmark run with all parameters and system information. 82 | Can be constructed either from a benchmark instance or from result files. 83 | """ 84 | def __init__(self, benchmark_result=None, benchmark_instance=None, logger=None): 85 | self.logger = logger 86 | 87 | # These will be set when the result or instance are processed 88 | self.benchmark_type = None 89 | self.model = None 90 | self.command = None 91 | self.num_processes = None 92 | self.parameters = dict() 93 | self.override_parameters = dict() 94 | self.system_info = None 95 | self.metrics = {} 96 | self._run_id = None 97 | self.run_datetime = None 98 | self.result_root_dir = None 99 | 100 | self.benchmark_result = benchmark_result 101 | self.benchmark_instance = benchmark_instance 102 | 103 | if benchmark_instance: 104 | self._process_benchmark_instance(benchmark_instance) 105 | self.post_execution = False 106 | elif benchmark_result: 107 | self._process_benchmark_result(benchmark_result) 108 | self.post_execution = True 109 | 110 | self._run_id = RunID(program=self.benchmark_type.name, command=self.command, model=self.model, 111 | run_datetime=self.run_datetime) 112 | ``` 113 | 114 | `.benchmark_type` is the enum: 115 | ```python 116 | class BENCHMARK_TYPES(enum.Enum): 117 | training = "training" 118 | vector_database = "vector_database" 119 | checkpointing = "checkpointing" 120 | ``` 121 | 122 | - `.parameters` (Dict) 123 | - the parameters as were run and include params from the config files and from the CLI. 124 | - `.override_parameters` (Dict) 125 | - the parameters that overrode the config file and were set by the user. 
126 | - `.system_info` (ClusterInformation) 127 | - a `ClusterInformation` object with information on the number of clients and amount of memory per client. 128 | - `.metrics` (Dict) 129 | - DLIO metrics from the test 130 | 131 | `self.benchmark_result` and `self.benchmark_instance` are the associated objects when deeper inspection needs to be run with the context of pre- vs post- execution. 132 | -------------------------------------------------------------------------------- /test/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Fail if any command fails: 4 | set -e 5 | 6 | # Function to run a command and handle errors 7 | run_command() { 8 | local cmd="$1" 9 | local expected_exit_code="${2:-0}" # Default expected exit code is 0 (success) 10 | 11 | echo "Running: $cmd" 12 | echo "Expected exit code: $expected_exit_code" 13 | 14 | # Create temporary files for stdout and stderr 15 | local stdout_file=$(mktemp) 16 | local stderr_file=$(mktemp) 17 | 18 | # Run the command, capturing stdout, stderr, and exit status 19 | set +e # Temporarily disable automatic exit on error 20 | eval "$cmd" > "$stdout_file" 2> "$stderr_file" 21 | local exit_status=$? 22 | set -e # Re-enable automatic exit on error 23 | 24 | # Display stdout regardless of success/failure 25 | cat "$stdout_file" 26 | 27 | # Display stderr if there is any 28 | if [ -s "$stderr_file" ]; then 29 | echo "STDERR output:" 30 | cat "$stderr_file" 31 | fi 32 | 33 | # Check if exit code matches expected exit code 34 | if [ $exit_status -eq $expected_exit_code ]; then 35 | echo "Command completed with expected exit status $exit_status" 36 | else 37 | echo "ERROR: Command failed with unexpected exit status" 38 | echo "Command: $cmd" 39 | echo "Actual exit code: $exit_status" 40 | echo "Expected exit code: $expected_exit_code" 41 | 42 | # Clean up temp files 43 | rm -f "$stdout_file" "$stderr_file" 44 | 45 | # Exit with the actual error code 46 | exit $exit_status 47 | fi 48 | 49 | # Clean up temp files 50 | rm -f "$stdout_file" "$stderr_file" 51 | echo "----------------------------------------" 52 | } 53 | 54 | # Define the list of commands to run with their expected exit codes 55 | # Format: "command|expected_exit_code" 56 | # If expected_exit_code is omitted, 0 (success) is assumed 57 | commands=( 58 | "mlpstorage --version|0" 59 | "mlpstorage history show|0" 60 | 61 | # Example of a command expected to fail with exit code 2 (INVALID_ARGUMENTS) 62 | "mlpstorage training datasize --invalid-flag|2" 63 | 64 | "mlpstorage training datasize --model resnet50 --client-host-memory-in-gb 256 --max-accelerators 80 --num-client-hosts 2 --accelerator-type a100|0" 65 | "mlpstorage training datasize --model resnet50 --client-host-memory-in-gb 256 --max-accelerators 80 --num-client-hosts 2 --accelerator-type h100|0" 66 | "mlpstorage training datasize --model cosmoflow --client-host-memory-in-gb 256 --max-accelerators 80 --num-client-hosts 2 --accelerator-type h100|0" 67 | "mlpstorage training datasize --model unet3d --client-host-memory-in-gb 256 --max-accelerators 80 --num-client-hosts 2 --accelerator-type h100|0" 68 | 69 | "mlpstorage training datagen --hosts 127.0.0.1,127.0.0.1 --model resnet50 --num-processes 96 --param dataset.num_files_train=192 --data-dir /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --allow-run-as-root|0" 70 | "mlpstorage training datagen --hosts 127.0.0.1,127.0.0.1 --model cosmoflow --num-processes 96 --param 
dataset.num_files_train=192 --data-dir /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --allow-run-as-root|0" 71 | "mlpstorage training datagen --hosts 127.0.0.1,127.0.0.1 --model unet3d --num-processes 96 --param dataset.num_files_train=192 --data-dir /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --allow-run-as-root|0" 72 | 73 | "mlpstorage training run --hosts 127.0.0.1,127.0.0.1 --num-client-hosts 2 --client-host-memory-in-gb 256 --num-accelerators 8 --accelerator-type a100 --model resnet50 --param dataset.num_files_train=192 --data-dir /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --allow-run-as-root|0" 74 | "mlpstorage training run --hosts 127.0.0.1,127.0.0.1 --num-client-hosts 2 --client-host-memory-in-gb 256 --num-accelerators 8 --accelerator-type a100 --model cosmoflow --param dataset.num_files_train=192 --data-dir /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --allow-run-as-root|0" 75 | 76 | # Checkpoint folder required for unet3d 77 | "mlpstorage training run --hosts 127.0.0.1,127.0.0.1 --num-client-hosts 2 --client-host-memory-in-gb 256 --num-accelerators 8 --accelerator-type a100 --model unet3d --param dataset.num_files_train=192 --data-dir /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --allow-run-as-root|2" 78 | "mlpstorage training run --hosts 127.0.0.1,127.0.0.1 --num-client-hosts 2 --client-host-memory-in-gb 256 --num-accelerators 8 --accelerator-type a100 --model unet3d --param dataset.num_files_train=192 --data-dir /mnt/nvme/test_data --checkpoint-folder /mnt/nvme/test_data/unet3d_checkpoints --results-dir /root/mlpstorage_test_results --allow-run-as-root|0" 79 | 80 | "mlpstorage checkpointing datasize --hosts 127.0.0.1,127.0.0.1 --client-host-memory-in-gb 256 --model llama3-8b --num-processes 8 --checkpoint-folder /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results|0" 81 | "mlpstorage checkpointing datasize --hosts 127.0.0.1,127.0.0.1 --client-host-memory-in-gb 256 --model llama3-70b --num-processes 8 --checkpoint-folder /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results|0" 82 | "mlpstorage checkpointing datasize --hosts 127.0.0.1,127.0.0.1 --client-host-memory-in-gb 256 --model llama3-405b --num-processes 8 --checkpoint-folder /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results|0" 83 | "mlpstorage checkpointing datasize --hosts 127.0.0.1,127.0.0.1 --client-host-memory-in-gb 256 --model llama3-1t --num-processes 8 --checkpoint-folder /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results|0" 84 | 85 | "mlpstorage checkpointing run --hosts 127.0.0.1 --model llama3-8b --client-host-memory-in-gb 512 --num-processes 8 --checkpoint-folder /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --num-checkpoints-read 1 --num-checkpoints-write 1 --allow-run-as-root|0" 86 | "mlpstorage checkpointing run --hosts 127.0.0.1 --model llama3-70b --client-host-memory-in-gb 512 --num-processes 8 --checkpoint-folder /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --num-checkpoints-read 1 --num-checkpoints-write 1 --allow-run-as-root|0" 87 | "mlpstorage checkpointing run --hosts 127.0.0.1 --model llama3-405b --client-host-memory-in-gb 512 --num-processes 8 --checkpoint-folder /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --num-checkpoints-read 1 --num-checkpoints-write 1 --allow-run-as-root|0" 88 | "mlpstorage checkpointing run --hosts 127.0.0.1 --model llama3-1t --client-host-memory-in-gb 512 --num-processes 8 
--checkpoint-folder /mnt/nvme/test_data --results-dir /root/mlpstorage_test_results --num-checkpoints-read 1 --num-checkpoints-write 1 --allow-run-as-root|0" 89 | 90 | "mlpstorage reports reportgen --results-dir ./test_results|3" 91 | "mlpstorage reports reportgen --results-dir /root/mlpstorage_test_results|0" 92 | ) 93 | 94 | # Loop through all commands and run them 95 | for cmd_with_code in "${commands[@]}"; do 96 | # Split the command and expected exit code 97 | IFS='|' read -r cmd expected_code <<< "$cmd_with_code" 98 | 99 | # If no expected code was provided, default to 0 100 | if [ -z "$expected_code" ]; then 101 | expected_code=0 102 | fi 103 | 104 | run_command "$cmd" "$expected_code" 105 | done 106 | 107 | echo "All tests completed successfully!" 108 | -------------------------------------------------------------------------------- /mlpstorage/benchmarks/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import json 3 | import os 4 | import pprint 5 | import signal 6 | import sys 7 | import time 8 | import types 9 | 10 | from typing import Tuple 11 | from functools import wraps 12 | 13 | from pyarrow.ipc import open_stream 14 | 15 | from mlpstorage.config import PARAM_VALIDATION, DATETIME_STR, MLPS_DEBUG 16 | from mlpstorage.debug import debug_tryer_wrapper 17 | from mlpstorage.mlps_logging import setup_logging, apply_logging_options 18 | from mlpstorage.rules import BenchmarkVerifier, generate_output_location 19 | from mlpstorage.utils import CommandExecutor, MLPSJsonEncoder 20 | 21 | 22 | class Benchmark(abc.ABC): 23 | 24 | BENCHMARK_TYPE = None 25 | 26 | def __init__(self, args, logger=None, run_datetime=None, run_number=0) -> None: 27 | self.args = args 28 | self.debug = self.args.debug or MLPS_DEBUG 29 | if logger: 30 | self.logger = logger 31 | else: 32 | # Ensure there is always a logger available 33 | self.logger = setup_logging(name=f"{self.BENCHMARK_TYPE}_benchmark", stream_log_level=args.stream_log_level) 34 | self.logger.warning(f'Benchmark did not get a logger passed. Using default logger.') 35 | apply_logging_options(self.logger, args) 36 | 37 | if not run_datetime: 38 | self.logger.warning('No run datetime provided. Using current datetime.') 39 | self.run_datetime = run_datetime if run_datetime else DATETIME_STR 40 | self.run_number = run_number 41 | self.runtime = 0 42 | 43 | self.benchmark_run_verifier = None 44 | self.verification = None 45 | self.cmd_executor = CommandExecutor(logger=self.logger, debug=args.debug) 46 | 47 | self.command_output_files = list() 48 | self.run_result_output = self.generate_output_location() 49 | os.makedirs(self.run_result_output, exist_ok=True) 50 | 51 | self.metadata_filename = f"{self.BENCHMARK_TYPE.value}_{self.run_datetime}_metadata.json" 52 | self.metadata_file_path = os.path.join(self.run_result_output, self.metadata_filename) 53 | 54 | self.logger.status(f'Benchmark results directory: {self.run_result_output}') 55 | 56 | def _execute_command(self, command, output_file_prefix=None, print_stdout=True, print_stderr=True) -> Tuple[str, str, int]: 57 | """ 58 | Execute the given command and return stdout, stderr, and return code. 
59 | :param command: Command to execute 60 | :param print_stdout: Whether to print stdout 61 | :param print_stderr: Whether to print stderr 62 | :return: (stdout, stderr, return code) 63 | """ 64 | 65 | self.__dict__.update({'executed_command': command}) 66 | 67 | if self.args.what_if: 68 | self.logger.debug(f'Executing command in --what-if mode means no execution will be performed.') 69 | log_message = f'What-if mode: \nCommand: {command}' 70 | if self.debug: 71 | log_message += f'\n\nParameters: \n{pprint.pformat(vars(self.args))}' 72 | self.logger.info(log_message) 73 | return "", "", 0 74 | else: 75 | watch_signals = {signal.SIGINT, signal.SIGTERM} 76 | stdout, stderr, return_code = self.cmd_executor.execute(command, watch_signals=watch_signals, 77 | print_stdout=print_stdout, 78 | print_stderr=print_stderr) 79 | 80 | if output_file_prefix: 81 | stdout_filename = f"{output_file_prefix}.stdout.log" 82 | stderr_filename = f"{output_file_prefix}.stderr.log" 83 | 84 | stdout_file = os.path.join(self.run_result_output, stdout_filename) 85 | stderr_file = os.path.join(self.run_result_output, stderr_filename) 86 | 87 | with open(stdout_file, 'w+') as fd: 88 | self.logger.verbose(f'Command stdout saved to: {stdout_filename}') 89 | fd.write(stdout) 90 | 91 | with open(stderr_file, 'w+') as fd: 92 | self.logger.verbose(f'Command stderr saved to: {stderr_filename}') 93 | fd.write(stderr) 94 | 95 | self.command_output_files.append(dict(command=command, stdout=stdout_file, stderr=stderr_file)) 96 | 97 | return stdout, stderr, return_code 98 | 99 | @property 100 | def metadata(self): 101 | metadata = dict() 102 | keys_to_skip = ["command_method_map", "logger", 'benchmark_run_verifier', "cmd_executor"] 103 | for k, v in self.__dict__.items(): 104 | if not k in keys_to_skip and not k.startswith("__"): 105 | metadata[k] = v 106 | 107 | metadata['benchmark_type'] = self.BENCHMARK_TYPE.name 108 | 109 | return metadata 110 | 111 | def write_metadata(self): 112 | with open(self.metadata_file_path, 'w+') as fd: 113 | json.dump(self.metadata, fd, indent=2, cls=MLPSJsonEncoder) 114 | 115 | if self.args.verbose or self.args.debug or self.debug: 116 | json.dump(self.metadata, sys.stdout, indent=2, cls=MLPSJsonEncoder) 117 | 118 | def generate_output_location(self) -> str: 119 | if not self.BENCHMARK_TYPE: 120 | raise ValueError(f'No benchmark specified. Unable to generate output location') 121 | return generate_output_location(self, self.run_datetime) 122 | 123 | def verify_benchmark(self) -> bool: 124 | self.logger.verboser(f'Verifying benchmark parameters: {self.args}') 125 | if not self.benchmark_run_verifier: 126 | self.benchmark_run_verifier = BenchmarkVerifier(self, logger=self.logger) 127 | 128 | self.verification = self.benchmark_run_verifier.verify() 129 | self.logger.verboser(f'Benchmark verification result: {self.verification}') 130 | 131 | if not self.args.closed and not hasattr(self.args, "open"): 132 | self.logger.warning(f'Running the benchmark without verification for open or closed configurations. These results are not valid for submission. Use --open or --closed to specify a configuration.') 133 | return True 134 | if not self.BENCHMARK_TYPE: 135 | raise ValueError(f'No benchmark specified. Unable to verify benchmark') 136 | 137 | if not self.verification: 138 | self.logger.error(f'Verification did not return a result. 
Contact the developer') 139 | sys.exit(1) 140 | if self.verification == PARAM_VALIDATION.CLOSED: 141 | return True 142 | elif self.verification == PARAM_VALIDATION.INVALID: 143 | if self.args.allow_invalid_params: 144 | self.logger.warning(f'Invalid configuration found. Allowing the benchmark to proceed.') 145 | return True 146 | else: 147 | self.logger.error(f'Invalid configuration found. Aborting benchmark run.') 148 | sys.exit(1) 149 | 150 | if self.verification == PARAM_VALIDATION.OPEN: 151 | if self.args.closed == False: 152 | # "--open" was passed 153 | self.logger.status(f'Running as allowed open configuration') 154 | return True 155 | else: 156 | self.logger.warning(f'Parameters allowed for open but not closed. Use --open and rerun the benchmark.') 157 | sys.exit(1) 158 | 159 | @abc.abstractmethod 160 | def _run(self): 161 | """ 162 | Run the command for the given benchmark. 163 | :return: 164 | """ 165 | raise NotImplementedError 166 | 167 | def run(self): 168 | start_time = time.time() 169 | result = self._run() 170 | self.runtime = time.time() - start_time 171 | return result 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /mlpstorage/history.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import sys 4 | from typing import Union 5 | 6 | from mlpstorage.config import HISTFILE, DATETIME_STR, EXIT_CODE 7 | from mlpstorage.mlps_logging import setup_logging 8 | 9 | 10 | class HistoryTracker: 11 | """ 12 | Tracks the history of command executions in a file. 13 | Each line contains sequence_id, datetime, and the full command separated by commas. 14 | """ 15 | 16 | def __init__(self, history_file=None, logger=None): 17 | self.history_file = history_file or HISTFILE 18 | self.logger = logger or setup_logging(name="HistoryTracker", stream_log_level="INFO") 19 | self._ensure_history_file_exists() 20 | 21 | def _ensure_history_file_exists(self): 22 | """Create the history file if it doesn't exist.""" 23 | if not os.path.exists(self.history_file): 24 | os.makedirs(os.path.dirname(self.history_file), exist_ok=True) 25 | with open(self.history_file, 'w') as f: 26 | pass 27 | 28 | def _parse_history_line(self, line): 29 | """Parse a history line into sequence_id, timestamp, and command.""" 30 | try: 31 | self.logger.debug(f"Parsing history line: {line}") 32 | sequence_id, timestamp, command = line.strip().split(',', 2) 33 | return int(sequence_id), timestamp, command 34 | except (ValueError, IndexError): 35 | self.logger.error(f"Invalid history line: {line}") 36 | return None 37 | 38 | def get_next_sequence_id(self): 39 | """Get the next sequence ID by reading the last line of the history file.""" 40 | try: 41 | with open(self.history_file, 'r') as f: 42 | lines = f.readlines() 43 | if len(lines) == 0: 44 | return 1 45 | 46 | last_line = lines[-1] 47 | if not last_line: 48 | return 1 49 | 50 | try: 51 | # Parse the sequence_id from the first part of the line 52 | sequence_id, _, _ = self._parse_history_line(last_line) 53 | return sequence_id + 1 54 | except (ValueError, IndexError): 55 | return 1 56 | except FileNotFoundError: 57 | return 1 58 | 59 | def add_entry(self, command: Union[str, list[str]], datetime_str=None): 60 | """ 61 | Add a new entry to the history file. 
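Each entry is appended as a single comma-separated line of the form `sequence_id,timestamp,command`; a hypothetical entry would look like `7,20250101_120000,mlpstorage training run --model unet3d ...`.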
62 | 63 | Args: 64 | command (str): The full command that was executed 65 | """ 66 | if isinstance(command, list): 67 | command = ' '.join(command) 68 | 69 | sequence_id = self.get_next_sequence_id() 70 | timestamp = datetime_str if datetime_str else DATETIME_STR 71 | 72 | # Format the line as: sequence_id, timestamp, command 73 | history_line = f"{sequence_id},{timestamp},{command}" 74 | self.logger.verboser(f"Adding command to history: {history_line}") 75 | with open(self.history_file, 'a') as f: 76 | f.write(history_line + '\n') 77 | 78 | return sequence_id 79 | 80 | def get_command_by_id(self, sequence_id) -> Union[str, None]: 81 | """ 82 | Retrieve a command by its sequence ID. 83 | 84 | Args: 85 | sequence_id (int): The sequence ID to look for 86 | 87 | Returns: 88 | str or None: The command string if found, None otherwise 89 | """ 90 | try: 91 | with open(self.history_file, 'r') as f: 92 | for line in f: 93 | self.logger.ridiculous(f"Parsing history line: {line}") 94 | line = line.strip() 95 | if not line: 96 | continue 97 | 98 | try: 99 | cid, _, command = self._parse_history_line(line) 100 | if sequence_id == cid: 101 | return command 102 | except (ValueError, IndexError): 103 | continue 104 | 105 | self.logger.debug(f"Command not found for sequence_id: {sequence_id}") 106 | return None # Command not found 107 | except FileNotFoundError: 108 | return None 109 | 110 | def get_history_entries(self, limit=None): 111 | """ 112 | Retrieve history entries, optionally limited to a specific number. 113 | 114 | Args: 115 | limit (int, optional): Maximum number of entries to return, starting from most recent 116 | 117 | Returns: 118 | list: List of tuples containing (sequence_id, timestamp, command) 119 | """ 120 | entries = [] 121 | try: 122 | with open(self.history_file, 'r') as f: 123 | for line in f: 124 | line = line.strip() 125 | if not line: 126 | continue 127 | 128 | try: 129 | sequence_id, timestamp, command = self._parse_history_line(line) 130 | entries.append((sequence_id, timestamp, command)) 131 | except (ValueError, IndexError): 132 | continue 133 | 134 | # Return the most recent entries if limit is specified 135 | if limit is not None and limit > 0: 136 | return entries[-limit:] 137 | return entries 138 | except FileNotFoundError: 139 | return [] 140 | 141 | def print_history(self, limit=None, sequence_id=None): 142 | """ 143 | Print history entries to stdout. 144 | 145 | Args: 146 | limit (int, optional): Maximum number of entries to print, starting from most recent 147 | sequence_id (int, optional): Specific sequence ID to print 148 | 149 | Returns: 150 | bool: True if entries were found and printed, False otherwise 151 | """ 152 | if sequence_id is not None: 153 | command = self.get_command_by_id(sequence_id) 154 | if command: 155 | print(f"{sequence_id}: {command}") 156 | return EXIT_CODE.SUCCESS 157 | else: 158 | print(f"No command found with ID {sequence_id}") 159 | return EXIT_CODE.INVALID_ARGUMENTS 160 | 161 | entries = self.get_history_entries(limit) 162 | if not entries: 163 | print("No history entries found") 164 | return EXIT_CODE.INVALID_ARGUMENTS 165 | 166 | for seq_id, timestamp, command in entries: 167 | print(f"{seq_id} : {timestamp} : {command}") 168 | 169 | return EXIT_CODE.SUCCESS 170 | 171 | def create_args_from_command(self, sequence_id): 172 | """ 173 | Create an args object from a command in the history. 
174 | 175 | Args: 176 | sequence_id (int): The sequence ID of the command to use 177 | 178 | Returns: 179 | argparse.Namespace or None: The args object if command found, None otherwise 180 | """ 181 | command = self.get_command_by_id(sequence_id) 182 | if not command: 183 | return None 184 | 185 | # Remove the script name if present 186 | command_parts = shlex.split(command) 187 | if command_parts and os.path.basename(command_parts[0]) == os.path.basename(sys.argv[0]): 188 | command_parts = command_parts[1:] 189 | 190 | # Import here to avoid circular imports 191 | from mlpstorage.cli import parse_arguments 192 | 193 | # Save original argv and restore after parsing 194 | original_argv = sys.argv 195 | try: 196 | sys.argv = [original_argv[0]] + command_parts 197 | args = parse_arguments() 198 | return args 199 | except Exception as e: 200 | print(f"Error parsing command: {e}") 201 | return None 202 | 203 | def handle_history_command(self, args): 204 | """ 205 | Handle the history command based on CLI arguments. 206 | 207 | Args: 208 | args: The parsed command-line arguments 209 | 210 | Returns: 211 | int: Exit code (0 for success, non-zero for failure) 212 | """ 213 | 214 | if hasattr(args, 'id') and args.id is not None: 215 | # Print specific history entry 216 | return self.print_history(sequence_id=args.id) 217 | elif hasattr(args, 'limit') and args.limit is not None: 218 | # Print limited history entries 219 | return self.print_history(limit=args.limit) 220 | elif hasattr(args, 'rerun_id') and args.rerun_id is not None: 221 | # Return args from a specific history entry 222 | new_args = self.create_args_from_command(args.rerun_id) 223 | if new_args is None: 224 | print(f"Command with ID {args.rerun_id} not found or could not be parsed") 225 | return EXIT_CODE.INVALID_ARGUMENTS 226 | return new_args 227 | else: 228 | # Print all history entries 229 | return self.print_history() -------------------------------------------------------------------------------- /mlpstorage/reporting.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import os.path 4 | import pprint 5 | import sys 6 | 7 | from dataclasses import dataclass 8 | from typing import List, Dict, Any 9 | 10 | from mlpstorage.mlps_logging import setup_logging, apply_logging_options 11 | from mlpstorage.config import MLPS_DEBUG, BENCHMARK_TYPES, EXIT_CODE, PARAM_VALIDATION, LLM_MODELS, MODELS, ACCELERATORS 12 | from mlpstorage.rules import get_runs_files, BenchmarkVerifier, BenchmarkRun, Issue 13 | from mlpstorage.utils import flatten_nested_dict, remove_nan_values 14 | 15 | @dataclass 16 | class Result: 17 | multi: bool 18 | benchmark_type: BENCHMARK_TYPES 19 | benchmark_command: str 20 | benchmark_model: [LLM_MODELS, MODELS] 21 | benchmark_run: BenchmarkRun 22 | issues: List[Issue] 23 | category: PARAM_VALIDATION 24 | metrics: Dict[str, Any] 25 | 26 | 27 | class ReportGenerator: 28 | 29 | def __init__(self, results_dir, args=None, logger=None): 30 | self.args = args 31 | if self.args is not None: 32 | self.debug = self.args.debug or MLPS_DEBUG 33 | else: 34 | self.debug = MLPS_DEBUG 35 | 36 | if logger: 37 | self.logger = logger 38 | else: 39 | # Ensure there is always a logger available 40 | self.logger = setup_logging(name=f"mlpstorage_reporter") 41 | apply_logging_options(self.logger, args) 42 | 43 | self.results_dir = results_dir 44 | if not os.path.exists(self.results_dir): 45 | self.logger.error(f'Results directory {self.results_dir} does not exist') 46 | 
sys.exit(EXIT_CODE.FILE_NOT_FOUND) 47 | 48 | self.run_results = dict() # {run_id : result_dict } 49 | self.workload_results = dict() # {(model) | (model, accelerator) : result_dict } 50 | self.accumulate_results() 51 | self.print_results() 52 | 53 | def generate_reports(self): 54 | # Verify the results directory exists: 55 | self.logger.info(f'Generating reports for {self.results_dir}') 56 | run_result_dicts = [report.benchmark_run.as_dict() for report in self.run_results.values()] 57 | 58 | self.write_csv_file(run_result_dicts) 59 | self.write_json_file(run_result_dicts) 60 | 61 | return EXIT_CODE.SUCCESS 62 | 63 | def accumulate_results(self): 64 | """ 65 | This function will look through the result_files and generate a result dictionary for each run by reading the metadata.json and summary.json files. 66 | 67 | If the metadata.json file does not exist, log an error and continue 68 | If summary.json files does not exist, set status=Failed and only use data from metadata.json the run_info from the result_files dictionary 69 | :return: 70 | """ 71 | benchmark_runs = get_runs_files(self.results_dir, logger=self.logger) 72 | 73 | self.logger.info(f'Accumulating results from {len(benchmark_runs)} runs') 74 | for benchmark_run in benchmark_runs: 75 | self.logger.ridiculous(f'Processing run: \n{pprint.pformat(benchmark_run)}') 76 | verifier = BenchmarkVerifier(benchmark_run, logger=self.logger) 77 | category = verifier.verify() 78 | issues = verifier.issues 79 | result_dict = dict( 80 | multi=False, 81 | benchmark_run=benchmark_run, 82 | benchmark_type=benchmark_run.benchmark_type, 83 | benchmark_command=benchmark_run.command, 84 | benchmark_model=benchmark_run.model, 85 | issues=issues, 86 | category=category, 87 | metrics=benchmark_run.metrics 88 | ) 89 | self.run_results[benchmark_run.run_id] = Result(**result_dict) 90 | 91 | # Group runs for workload to run additional verifiers 92 | # These will be manually defined as these checks align with a specific submission version 93 | # I need to group by model. For training workloads we also group by accelerator but the same checker 94 | # is used based on model. 
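        # Illustrative sketch of the grouping built below (hypothetical run objects):
        #   {('unet3d', 'h100'): [run_a, run_b], ('resnet50', 'a100'): [run_c]}
        # Each key is a (model, accelerator) tuple and each value is the list of BenchmarkRun
        # objects that belong to that workload.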
95 | workload_runs = dict() 96 | 97 | for benchmark_run in benchmark_runs: 98 | workload_key = (benchmark_run.model, benchmark_run.accelerator) 99 | if workload_key not in workload_runs.keys(): 100 | workload_runs[workload_key] = [] 101 | workload_runs[workload_key].append(benchmark_run) 102 | 103 | for workload_key, runs in workload_runs.items(): 104 | model, accelerator = workload_key 105 | if not runs: 106 | continue 107 | self.logger.info(f'Running additional verifiers for model: {model}, accelerator: {accelerator}') 108 | verifier = BenchmarkVerifier(*runs, logger=self.logger) 109 | category = verifier.verify() 110 | issues = verifier.issues 111 | result_dict = dict( 112 | multi=True, 113 | benchmark_run=runs, 114 | benchmark_type=runs[0].benchmark_type, 115 | benchmark_command=runs[0].command, 116 | benchmark_model=runs[0].model, 117 | issues=issues, 118 | category=category, 119 | metrics=dict() # Add function to aggregate metrics 120 | ) 121 | self.workload_results[workload_key] = Result(**result_dict) 122 | 123 | def print_results(self): 124 | print("\n========================= Results Report =========================") 125 | for category in [PARAM_VALIDATION.CLOSED, PARAM_VALIDATION.OPEN, PARAM_VALIDATION.INVALID]: 126 | print(f"\n------------------------- {category.value.upper()} Report -------------------------") 127 | for result in self.run_results.values(): 128 | if result.category == category: 129 | print(f'\tRunID: {result.benchmark_run.run_id}') 130 | print(f'\t Benchmark Type: {result.benchmark_type.value}') 131 | print(f'\t Command: {result.benchmark_command}') 132 | print(f'\t Model: {result.benchmark_model}') 133 | if result.issues: 134 | print(f'\t Issues:') 135 | for issue in result.issues: 136 | print(f'\t\t- {issue}') 137 | else: 138 | print(f'\t\t- No issues found') 139 | 140 | if result.metrics: 141 | print(f'\t Metrics:') 142 | for metric, value in result.metrics.items(): 143 | if type(value) in (int, float): 144 | if "percentage" in metric.lower(): 145 | print(f'\t\t- {metric}: {value:,.1f}%') 146 | else: 147 | print(f'\t\t- {metric}: {value:,.1f}') 148 | elif type(value) in (list, tuple): 149 | if "percentage" in metric.lower(): 150 | print(f'\t\t- {metric}: {", ".join(f"{v:,.1f}%" for v in value)}') 151 | else: 152 | print(f'\t\t- {metric}: {", ".join(f"{v:,.1f}" for v in value)}') 153 | else: 154 | print(f'\t\t- {metric}: {value}') 155 | 156 | print("\n") 157 | 158 | print("\n========================= Submissions Report =========================") 159 | for category in [PARAM_VALIDATION.CLOSED, PARAM_VALIDATION.OPEN, PARAM_VALIDATION.INVALID]: 160 | print(f"\n------------------------- {category.value.upper()} Report -------------------------") 161 | for workload_key, workload_result in self.workload_results.items(): 162 | if workload_result.category == category: 163 | if workload_result.benchmark_model in LLM_MODELS: 164 | workload_id = f"Checkpointing - {workload_result.benchmark_model}" 165 | elif workload_result.benchmark_model in MODELS: 166 | accelerator = workload_result.benchmark_run[0].accelerator 167 | workload_id = (f"Training - {workload_result.benchmark_model}, " 168 | f"Accelerator: {accelerator}") 169 | else: 170 | print(f'Unknown workload type: {workload_result.benchmark_model}') 171 | 172 | print(f'\tWorkloadID: {workload_id}') 173 | print(f'\t Benchmark Type: {workload_result.benchmark_type.value}') 174 | if workload_result.benchmark_command: 175 | print(f'\t Command: {workload_result.benchmark_command}') 176 | 177 | print(f'\t Runs: ') 178 | 
for run in workload_result.benchmark_run: 179 | print(f'\t\t- {run.run_id} - [{self.run_results[run.run_id].category.value.upper()}]') 180 | 181 | if workload_result.issues: 182 | print(f'\t Issues:') 183 | for issue in workload_result.issues: 184 | print(f'\t\t- {issue}') 185 | else: 186 | print(f'\t\t- No issues found') 187 | 188 | print("\n") 189 | 190 | 191 | def write_json_file(self, results): 192 | json_file = os.path.join(self.results_dir,'results.json') 193 | self.logger.info(f'Writing results to {json_file}') 194 | with open(json_file, 'w') as f: 195 | json.dump(results, f, indent=2) 196 | 197 | def write_csv_file(self, results): 198 | csv_file = os.path.join(self.results_dir,'results.csv') 199 | self.logger.info(f'Writing results to {csv_file}') 200 | flattened_results = [flatten_nested_dict(r) for r in results] 201 | flattened_results = [remove_nan_values(r) for r in flattened_results] 202 | fieldnames = set() 203 | for l in flattened_results: 204 | fieldnames.update(l.keys()) 205 | 206 | with open(csv_file, 'w+', newline='') as file_object: 207 | csv_writer = csv.DictWriter(f=file_object, fieldnames=sorted(fieldnames), lineterminator='\n') 208 | csv_writer.writeheader() 209 | csv_writer.writerows(flattened_results) 210 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /mlpstorage/utils.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import enum 3 | import io 4 | import json 5 | import logging 6 | import math 7 | import os 8 | import pprint 9 | import psutil 10 | import subprocess 11 | import shlex 12 | import select 13 | import signal 14 | import sys 15 | import threading 16 | import yaml 17 | 18 | from typing import List, Union, Optional, Dict, Tuple, Set 19 | 20 | from mlpstorage.config import CONFIGS_ROOT_DIR, MPIRUN, MPIEXEC, MPI_RUN_BIN, MPI_EXEC_BIN 21 | 22 | 23 | class MLPSJsonEncoder(json.JSONEncoder): 24 | 25 | def default(self, obj): 26 | try: 27 | if isinstance(obj, (float, int, str, list, tuple, dict)): 28 | return super().default(obj) 29 | if isinstance(obj, set): 30 | return list(obj) 31 | elif "Logger" in str(type(obj)): 32 | return "Logger object" 33 | elif 'ClusterInformation' in str(type(obj)): 34 | return obj.info 35 | elif isinstance(obj, enum.Enum): 36 | return obj.value 37 | elif hasattr(obj, '__dict__'): 38 | return obj.__dict__ 39 | else: 40 | return super().default(obj) 41 | except Exception as e: 42 | return str(obj) 43 | 44 | 45 | def is_valid_datetime_format(datetime_str): 46 | """ 47 | Check if a string is a valid datetime in the format "yyyymmdd_hhMMss" 48 | 49 | :param datetime_str: String to check 50 | :return: True if the string is a valid datetime, False otherwise 51 | """ 52 | try: 53 | # Check if the string has the correct length and format 54 | if len(datetime_str) != 15 or datetime_str[8] != '_': 55 | return False 56 | 57 | # Try to parse the datetime string 58 | parsed_datetime = datetime.strptime(datetime_str, "%Y%m%d_%H%M%S") 59 | return True 60 | except 
ValueError: 61 | # If parsing fails, the format is invalid 62 | return False 63 | 64 | 65 | def get_datetime_from_timestamp(datetime_str): 66 | if is_valid_datetime_format(datetime_str): 67 | return datetime.strptime(datetime_str, "%Y%m%d_%H%M%S") 68 | else: 69 | return None 70 | 71 | 72 | def read_config_from_file(relative_path): 73 | config_path = os.path.join(CONFIGS_ROOT_DIR, relative_path) 74 | if not os.path.isfile(config_path): 75 | raise FileNotFoundError(f"Configuration file not found: {config_path}") 76 | 77 | with open(config_path, 'r') as f: 78 | config = yaml.safe_load(f) 79 | 80 | return config 81 | 82 | 83 | def update_nested_dict(original_dict, update_dict): 84 | updated_dict = {} 85 | for key, value in original_dict.items(): 86 | if key in update_dict: 87 | if isinstance(value, dict) and isinstance(update_dict[key], dict): 88 | updated_dict[key] = update_nested_dict(value, update_dict[key]) 89 | else: 90 | updated_dict[key] = update_dict[key] 91 | else: 92 | updated_dict[key] = value 93 | for key, value in update_dict.items(): 94 | if key not in original_dict: 95 | updated_dict[key] = value 96 | return updated_dict 97 | 98 | 99 | def create_nested_dict(flat_dict, parent_dict=None, separator='.'): 100 | if parent_dict is None: 101 | parent_dict = {} 102 | 103 | for key, value in flat_dict.items(): 104 | keys = key.split(separator) 105 | current_dict = parent_dict 106 | for i, k in enumerate(keys[:-1]): 107 | if k not in current_dict: 108 | current_dict[k] = {} 109 | current_dict = current_dict[k] 110 | current_dict[keys[-1]] = value 111 | 112 | return parent_dict 113 | 114 | 115 | def flatten_nested_dict(nested_dict, parent_key='', separator='.'): 116 | """ 117 | Flatten a nested dictionary structure into a single-level dictionary with keys 118 | joined by a separator. 119 | 120 | Example: 121 | Input: {'a': 1, 'b': {'c': 2, 'd': {'e': 3}}} 122 | Output: {'a': 1, 'b.c': 2, 'b.d.e': 3} 123 | 124 | Args: 125 | nested_dict (dict): The nested dictionary to flatten 126 | parent_key (str): The parent key prefix (used in recursion) 127 | separator (str): The character to use for joining keys 128 | 129 | Returns: 130 | dict: A flattened dictionary with compound keys 131 | """ 132 | flat_dict = {} 133 | 134 | for key, value in nested_dict.items(): 135 | new_key = f"{parent_key}{separator}{key}" if parent_key else key 136 | 137 | if isinstance(value, dict): 138 | # Recursively flatten any nested dictionaries 139 | flat_dict.update(flatten_nested_dict(value, new_key, separator)) 140 | else: 141 | # Add the leaf value to our flattened dictionary 142 | flat_dict[new_key] = value 143 | 144 | return flat_dict 145 | 146 | 147 | def remove_nan_values(input_dict): 148 | # Remove any NaN values from the input dictionary 149 | ret_dict = dict() 150 | for k, v in input_dict.items(): 151 | if type(v) in [float, int] and math.isnan(v): # Ignore NaN values 152 | continue 153 | else: 154 | ret_dict[k] = v 155 | 156 | return ret_dict 157 | 158 | 159 | class CommandExecutor: 160 | """ 161 | A class to execute shell commands in a subprocess with live output streaming and signal handling. 162 | 163 | This class allows: 164 | - Executing commands as a string or list of arguments 165 | - Capturing stdout and stderr 166 | - Optionally printing stdout and stderr in real-time 167 | - Handling signals to gracefully terminate the process 168 | """ 169 | 170 | def __init__(self, logger: logging.Logger, debug: bool = False): 171 | """ 172 | Initialize the CommandExecutor. 
173 | 174 | Args: 175 | debug: If True, enables debug mode with additional logging 176 | """ 177 | self.logger = logger 178 | self.debug = debug 179 | self.process = None 180 | self.terminated_by_signal = False 181 | self.signal_received = None 182 | self._original_handlers = {} 183 | self._stop_event = threading.Event() 184 | 185 | def execute(self, 186 | command: Union[str, List[str]], 187 | print_stdout: bool = False, 188 | print_stderr: bool = False, 189 | watch_signals: Optional[Set[int]] = None) -> Tuple[str, str, int]: 190 | """ 191 | Execute a command and return its stdout, stderr, and return code. 192 | 193 | Args: 194 | command: The command to execute (string or list of strings) 195 | print_stdout: If True, prints stdout in real-time 196 | print_stderr: If True, prints stderr in real-time 197 | watch_signals: Set of signals to watch for (e.g., {signal.SIGINT, signal.SIGTERM}) 198 | If any of these signals are received, the process will be terminated 199 | 200 | Returns: 201 | Tuple of (stdout_content, stderr_content, return_code) 202 | """ 203 | 204 | self.logger.debug(f"DEBUG - Executing command: {command}") 205 | 206 | # Parse command if it's a string 207 | if isinstance(command, str): 208 | cmd_args = shlex.split(command) 209 | else: 210 | cmd_args = command 211 | 212 | # Set up signal handlers if requested 213 | if watch_signals: 214 | self._setup_signal_handlers(watch_signals) 215 | 216 | # Reset state 217 | self._stop_event.clear() 218 | self.terminated_by_signal = False 219 | self.signal_received = None 220 | 221 | # Initialize output buffers 222 | stdout_buffer = io.StringIO() 223 | stderr_buffer = io.StringIO() 224 | return_code = None 225 | 226 | try: 227 | # Start the process 228 | self.process = subprocess.Popen( 229 | cmd_args, 230 | stdout=subprocess.PIPE, 231 | stderr=subprocess.PIPE, 232 | text=True, 233 | bufsize=1 # Line buffered 234 | ) 235 | 236 | # Get file descriptors for select 237 | stdout_fd = self.process.stdout.fileno() 238 | stderr_fd = self.process.stderr.fileno() 239 | 240 | # Process output until completion or signal 241 | while self.process.poll() is None and not self._stop_event.is_set(): 242 | # Wait for output with timeout to allow checking for signals 243 | readable, _, _ = select.select( 244 | [self.process.stdout, self.process.stderr], 245 | [], 246 | [], 247 | 0.1 248 | ) 249 | 250 | for stream in readable: 251 | line = stream.readline() 252 | if not line: # EOF 253 | continue 254 | 255 | if stream.fileno() == stdout_fd: 256 | stdout_buffer.write(line) 257 | if print_stdout: 258 | sys.stdout.write(line) 259 | sys.stdout.flush() 260 | elif stream.fileno() == stderr_fd: 261 | stderr_buffer.write(line) 262 | if print_stderr: 263 | sys.stderr.write(line) 264 | sys.stderr.flush() 265 | 266 | # Read any remaining output 267 | stdout_remainder = self.process.stdout.read() 268 | if stdout_remainder: 269 | stdout_buffer.write(stdout_remainder) 270 | if print_stdout: 271 | sys.stdout.write(stdout_remainder) 272 | sys.stdout.flush() 273 | 274 | stderr_remainder = self.process.stderr.read() 275 | if stderr_remainder: 276 | stderr_buffer.write(stderr_remainder) 277 | if print_stderr: 278 | sys.stderr.write(stderr_remainder) 279 | sys.stderr.flush() 280 | 281 | # Get the return code 282 | return_code = self.process.poll() 283 | 284 | # Check if we were terminated by a signal 285 | if self.terminated_by_signal: 286 | self.logger.debug(f"DEBUG - Process terminated by signal: {self.signal_received}") 287 | 288 | return stdout_buffer.getvalue(), 
stderr_buffer.getvalue(), return_code 289 | 290 | finally: 291 | # Clean up 292 | if self.process and self.process.poll() is None: 293 | self.process.terminate() 294 | try: 295 | self.process.wait(timeout=5) 296 | except subprocess.TimeoutExpired: 297 | self.process.kill() 298 | 299 | # Restore original signal handlers 300 | self._restore_signal_handlers() 301 | 302 | def _setup_signal_handlers(self, signals: Set[int]): 303 | """Set up signal handlers for the specified signals.""" 304 | self._original_handlers = {} 305 | 306 | def signal_handler(sig, frame): 307 | self.logger.debug(f"DEBUG - Received signal: {sig}") 308 | self.terminated_by_signal = True 309 | self.signal_received = sig 310 | self._stop_event.set() 311 | 312 | if self.process and self.process.poll() is None: 313 | self.process.terminate() 314 | 315 | for handler in self._original_handlers.values(): 316 | handler(sig, frame) 317 | 318 | for sig in signals: 319 | self._original_handlers[sig] = signal.getsignal(sig) 320 | signal.signal(sig, signal_handler) 321 | 322 | def _restore_signal_handlers(self): 323 | """Restore original signal handlers.""" 324 | for sig, handler in self._original_handlers.items(): 325 | signal.signal(sig, handler) 326 | self._original_handlers = {} 327 | 328 | 329 | def generate_mpi_prefix_cmd(mpi_cmd, hosts, num_processes, oversubscribe, allow_run_as_root, params, logger): 330 | # Check if we got slot definitions with the hosts: 331 | slots_configured = False 332 | for host in hosts: 333 | if ":" in host: 334 | slots_configured = True 335 | break 336 | 337 | if slots_configured: 338 | # Ensure the configured number of slots is >= num_processes 339 | num_slots = sum(int(slot) for host, slot in (host.split(":") for host in hosts)) 340 | logger.debug(f"Configured slots: {num_slots}") 341 | if num_slots < num_processes: 342 | raise ValueError(f"Configured slots ({num_slots}) are not sufficient to run {num_processes} processes") 343 | elif not slots_configured: 344 | slotted_hosts = [] 345 | # manually define how many slots per host to evenly distribute the processes across hosts. If num_processes 346 | # is not divisible by the number of hosts, distribute the remaining processes to the rest of the hosts. 
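        # Worked example with hypothetical values: num_processes=10 and hosts=['h1', 'h2', 'h3']
        #   base_slots_per_host = 10 // 3 = 3 and remaining_slots = 10 % 3 = 1,
        #   so the slotted hosts become ['h1:4', 'h2:3', 'h3:3'] (the first host absorbs the extra slot).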
347 | base_slots_per_host = num_processes // len(hosts) 348 | remaining_slots = num_processes % len(hosts) 349 | 350 | for i, host in enumerate(hosts): 351 | # Add one extra slot to hosts until we've distributed all remaining slots 352 | slots_for_this_host = base_slots_per_host + (1 if i < remaining_slots else 0) 353 | slotted_hosts.append(f"{host}:{slots_for_this_host}") 354 | 355 | # Replace the original hosts list with the slotted version 356 | hosts = slotted_hosts 357 | logger.debug(f"Configured slots for hosts: {hosts}") 358 | 359 | if mpi_cmd == MPIRUN: 360 | prefix = f"{MPI_RUN_BIN} -n {num_processes} -host {','.join(hosts)}" 361 | elif mpi_cmd == MPIEXEC: 362 | prefix = f"{MPI_EXEC_BIN} -n {num_processes} -host {','.join(hosts)}" 363 | else: 364 | raise ValueError(f"Unsupported MPI command: {mpi_cmd}") 365 | 366 | # CPU scheduling optimizations for multi-host I/O workloads 367 | unique_hosts = set() 368 | for host in hosts: 369 | host_part = host.split(':')[0] if ':' in host else host 370 | unique_hosts.add(host_part) 371 | 372 | if len(unique_hosts) > 1: 373 | # Multi-host: prioritize even distribution across nodes 374 | prefix += " --bind-to none --map-by node" 375 | else: 376 | # Single-host: optimize for NUMA domains 377 | prefix += " --bind-to none --map-by socket" 378 | 379 | if oversubscribe: 380 | prefix += " --oversubscribe" 381 | 382 | if allow_run_as_root: 383 | prefix += " --allow-run-as-root" 384 | 385 | if params: 386 | for param in params: 387 | prefix += f" {param}" 388 | 389 | return prefix -------------------------------------------------------------------------------- /mlpstorage/benchmarks/dlio.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import os 3 | import os.path 4 | import pprint 5 | import sys 6 | 7 | from mlpstorage.benchmarks.base import Benchmark 8 | from mlpstorage.config import (CONFIGS_ROOT_DIR, BENCHMARK_TYPES, EXEC_TYPE, MPIRUN, MLPSTORAGE_BIN_NAME, 9 | LLM_ALLOWED_VALUES, LLM_SUBSET_PROCS, EXIT_CODE, MODELS, HYDRA_OUTPUT_SUBDIR, 10 | LLM_SIZE_BY_RANK) 11 | from mlpstorage.rules import calculate_training_data_size, HostInfo, HostMemoryInfo, HostCPUInfo, ClusterInformation 12 | from mlpstorage.utils import (read_config_from_file, create_nested_dict, update_nested_dict, generate_mpi_prefix_cmd) 13 | 14 | 15 | class DLIOBenchmark(Benchmark, abc.ABC): 16 | 17 | DLIO_CONFIG_PATH = "dlio" 18 | BENCHMARK_TYPE = None 19 | 20 | def __init__(self, args, **kwargs): 21 | super().__init__(args, **kwargs) 22 | self._config_name = None 23 | self.base_command = "dlio_benchmark" 24 | if args.dlio_bin_path: 25 | self.base_path = args.dlio_bin_path 26 | else: 27 | self.base_path = os.path.dirname(sys.argv[0]) 28 | self.base_command_path = os.path.join(self.base_path, self.base_command) 29 | 30 | # This is the path that DLIO needs. 
The files are in this self.config_path/workload 31 | self.config_path = os.path.join(CONFIGS_ROOT_DIR, self.DLIO_CONFIG_PATH) 32 | 33 | self.per_host_mem_kB = None 34 | self.total_mem_kB = None 35 | 36 | if args.command != "datagen": 37 | self.cluster_information = self.accumulate_host_info(args) 38 | 39 | def accumulate_host_info(self, args): 40 | host_info_list = [] 41 | per_host_mem = args.client_host_memory_in_gb 42 | for host in args.hosts: 43 | host_info = HostInfo( 44 | hostname=host, 45 | cpu=None, 46 | memory=HostMemoryInfo.from_total_mem_int(per_host_mem * 1024 * 1024 * 1024) 47 | ) 48 | host_info_list.append(host_info) 49 | return ClusterInformation(host_info_list=host_info_list, logger=self.logger) 50 | 51 | @property 52 | def config_name(self): 53 | if self._config_name is None: 54 | self.logger.error("This subclass doesn't appropriately set config name. self.config_name should be set in __init__") 55 | raise ValueError("config_name not set") 56 | return self._config_name 57 | 58 | @config_name.setter 59 | def config_name(self, config_name): 60 | self._config_name = config_name 61 | 62 | def process_dlio_params(self, config_file): 63 | params_dict = dict() if not self.args.params else {k: v for k, v in (item.split("=") for item in self.args.params)} 64 | yaml_params = read_config_from_file(os.path.join(self.DLIO_CONFIG_PATH, "workload", config_file)) 65 | combined_params = update_nested_dict(yaml_params, create_nested_dict(params_dict)) 66 | 67 | self.logger.debug(f'yaml params: \n{pprint.pformat(yaml_params)}') 68 | self.logger.debug(f'combined params: \n{pprint.pformat(combined_params)}') 69 | self.logger.debug(f'Instance params: \n{pprint.pformat(self.__dict__)}') 70 | 71 | return params_dict, yaml_params, combined_params 72 | 73 | @abc.abstractmethod 74 | def _run(self): 75 | """ 76 | This method needs to call execute_command method to run the benchmark 77 | :return: 78 | """ 79 | raise NotImplementedError("Subclasses must implement this method") 80 | 81 | def execute_command(self): 82 | cmd = self.generate_dlio_command() 83 | self.logger.status(f'Running benchmark command:: {cmd}') 84 | output_file_prefix = f"{self.BENCHMARK_TYPE.value}" 85 | if hasattr(self.args, "command"): 86 | output_file_prefix += f"_{self.args.command}" 87 | 88 | self._execute_command(cmd, output_file_prefix=output_file_prefix) 89 | 90 | @abc.abstractmethod 91 | def add_workflow_to_cmd(self, cmd) -> str: 92 | raise NotImplementedError("Subclasses must implement this method") 93 | 94 | def generate_dlio_command(self): 95 | self.logger.verboser(f'Generating DLIO command for benchmark {self.BENCHMARK_TYPE.value}') 96 | cmd = "" 97 | cmd = f"{self.base_command_path}" 98 | cmd += f" workload={self.config_name}" 99 | 100 | # Run directory for Hydra to output log files 101 | cmd += f" ++hydra.run.dir={self.run_result_output}" 102 | cmd += f" ++hydra.output_subdir={HYDRA_OUTPUT_SUBDIR}" 103 | 104 | cmd = self.add_workflow_to_cmd(cmd) 105 | 106 | if self.params_dict: 107 | for key, value in self.params_dict.items(): 108 | cmd += f" ++workload.{key}={value}" 109 | 110 | cmd += f" --config-dir={self.config_path}" 111 | 112 | if self.args.exec_type == EXEC_TYPE.MPI: 113 | self.logger.debug(f'Generating MPI Command with binary "{self.args.mpi_bin}"') 114 | mpi_prefix = generate_mpi_prefix_cmd(self.args.mpi_bin, self.args.hosts, self.args.num_processes, 115 | self.args.oversubscribe, self.args.allow_run_as_root, 116 | self.args.mpi_params, self.logger) 117 | cmd = f"{mpi_prefix} {cmd}" 118 | 119 | return cmd 120 
| 121 | 122 | class TrainingBenchmark(DLIOBenchmark): 123 | 124 | BENCHMARK_TYPE = BENCHMARK_TYPES.training 125 | 126 | def __init__(self, args, **kwargs): 127 | super().__init__(args, **kwargs) 128 | 129 | # This allows each command to map to a specific wrapper method. When methods are created, replace the default 130 | # 'self.execute_command' with the command-specific method (like "self._datasize()") 131 | self.command_method_map = dict( 132 | datasize=self.datasize, 133 | datagen=self.execute_command, 134 | run=self.execute_command, 135 | configview=self.execute_command, 136 | reportgen=self.execute_command) 137 | config_suffix = "datagen" if args.command == "datagen" else args.accelerator_type 138 | under_model = args.model.replace("-", "_") 139 | self.config_file = f"{under_model}_{config_suffix}.yaml" 140 | self.config_name = f"{under_model}_{config_suffix}" 141 | 142 | self.params_dict, self.yaml_params, self.combined_params = self.process_dlio_params(self.config_file) 143 | 144 | if self.args.command not in ("datagen", "datasize"): 145 | self.verify_benchmark() 146 | 147 | if self.args.command != "datasize": 148 | # The datasize command uses --data-dir and needs to generate a command that also calls --data-dir 149 | # The add_datadir_param would convert --data-dir to --dataset.data_folder which is invalid to 150 | # mlpstorage. 151 | self.add_datadir_param() 152 | self.logger.verboser(f'Instantiated the Training Benchmark...') 153 | 154 | def add_datadir_param(self): 155 | self.params_dict['dataset.data_folder'] = self.args.data_dir 156 | if not any([self.args.data_dir.endswith(m) for m in MODELS]): 157 | # Add the model to the data dir path and make sure it exists 158 | self.params_dict['dataset.data_folder'] = os.path.join(self.args.data_dir, self.args.model) 159 | if not os.path.exists(self.params_dict['dataset.data_folder']): 160 | self.logger.info(f'Creating data directory: {self.params_dict["dataset.data_folder"]}...') 161 | os.makedirs(self.params_dict['dataset.data_folder']) 162 | 163 | # Create the train, eval, test directories 164 | for folder in ["train", "valid", "test"]: 165 | folder_path = os.path.join(self.params_dict['dataset.data_folder'], folder) 166 | if not os.path.exists(folder_path): 167 | self.logger.info(f'Creating directory: {folder_path}...') 168 | os.makedirs(folder_path) 169 | 170 | def add_workflow_to_cmd(self, cmd) -> str: 171 | # # Configure the workflow depending on command 172 | # if self.args.command == "datagen": 173 | # cmd += " ++workload.workflow.generate_data=True ++workload.workflow.train=False" 174 | # elif self.args.command == "run_benchmark": 175 | # cmd += " ++workload.workflow.generate_data=False ++workload.workflow.train=True" 176 | # 177 | # # Training doesn't do checkpoints 178 | # cmd += " ++workload.workflow.checkpoint=False" 179 | # We're now using the workflow defined in the yaml file only 180 | return cmd 181 | 182 | def generate_datagen_benchmark_command(self, num_files_train, num_subfolders_train): 183 | """ 184 | This function will generate the command to use to call this program with the training & datagen parameters. 
185 | """ 186 | kv_map = { 187 | "dataset.num_files_train": num_files_train, 188 | "dataset.num_subfolders_train": num_subfolders_train, 189 | } 190 | 191 | cmd = f"{MLPSTORAGE_BIN_NAME} training datagen" 192 | if self.args.hosts: 193 | cmd += f" --hosts={','.join(self.args.hosts)}" 194 | cmd += f" --model={self.args.model}" 195 | cmd += f" --exec-type={self.args.exec_type}" 196 | 197 | if self.params_dict: 198 | for key, value in self.params_dict.items(): 199 | if key in kv_map.keys(): 200 | continue 201 | cmd += f" --{key}={value}" 202 | 203 | for key, value in kv_map.items(): 204 | if value == 0: 205 | continue 206 | cmd += f" --param {key}={value}" 207 | 208 | # During datasize, this will be set to max_accelerators 209 | cmd += f" --num-processes={self.args.num_processes}" 210 | cmd += f" --results-dir={self.args.results_dir}" 211 | 212 | if self.args.data_dir: 213 | cmd += f" --data-dir={self.args.data_dir}" 214 | else: 215 | cmd += f" --data-dir=" 216 | 217 | return cmd 218 | 219 | 220 | def datasize(self): 221 | num_files_train, num_subfolders_train, total_disk_bytes = calculate_training_data_size( 222 | self.args, self.cluster_information, self.combined_params['dataset'], self.combined_params['reader'], self.logger 223 | ) 224 | self.logger.result(f'Number of training files: {num_files_train}') 225 | self.logger.result(f'Number of training subfolders: {num_subfolders_train}') 226 | self.logger.result(f'Total disk space required for training: {total_disk_bytes / 1024**3:.2f} GB') 227 | 228 | if num_files_train > 10000: 229 | self.logger.warning( 230 | f'The number of files required may be excessive for some filesystems. You can use the num_subfolders_train parameter to shard the dataset. To keep near 10,000 files per folder use "{int(num_files_train / 10000)}x" subfolders by adding "--param dataset.num_subfolders_train={int(num_files_train / 10000)}"') 231 | 232 | cmd = self.generate_datagen_benchmark_command(num_files_train, num_subfolders_train) 233 | self.logger.result(f'Run the following command to generate data: \n{cmd}') 234 | self.logger.warning(f'The parameter for --num-processes is the same as --max-accelerators. 
Adjust the value ' 235 | f'according to your system.') 236 | 237 | def _run(self): 238 | try: 239 | self.command_method_map[self.args.command]() 240 | except Exception as e: 241 | self.logger.error(f'Error occurred while executing command: {str(e)}') 242 | return EXIT_CODE.FAILURE 243 | return EXIT_CODE.SUCCESS 244 | 245 | 246 | class CheckpointingBenchmark(DLIOBenchmark): 247 | 248 | BENCHMARK_TYPE = BENCHMARK_TYPES.checkpointing 249 | 250 | def __init__(self, args, **kwargs): 251 | super().__init__(args, **kwargs) 252 | 253 | self.config_name = f'{args.model.replace("-", "_")}' 254 | self.config_file = f'{self.config_name}.yaml' 255 | self.params_dict, self.yaml_params, self.combined_params = self.process_dlio_params(self.config_file) 256 | self.verify_benchmark() 257 | self.add_checkpoint_params() 258 | self.logger.status(f'Instantiated the Checkpointing Benchmark...') 259 | 260 | def add_checkpoint_params(self): 261 | min_procs, zero_level, GPUpDP, ClosedGPUs = LLM_ALLOWED_VALUES.get(self.args.model) 262 | configured_data_parallelism = int(ClosedGPUs / GPUpDP) 263 | 264 | # We only need the param "model.parallelism.data" if we are not using default checkpoint_mode 265 | if self.args.num_processes < ClosedGPUs: 266 | self.params_dict['checkpoint.mode'] = "subset" 267 | self.params_dict['model.parallelism.data'] = configured_data_parallelism 268 | 269 | self.params_dict['checkpoint.num_checkpoints_read'] = self.args.num_checkpoints_read 270 | self.params_dict['checkpoint.num_checkpoints_write'] = self.args.num_checkpoints_write 271 | self.params_dict['checkpoint.checkpoint_folder'] = os.path.join(self.args.checkpoint_folder, self.args.model) 272 | 273 | 274 | def add_workflow_to_cmd(self, cmd) -> str: 275 | # cmd += " ++workload.workflow.generate_data=False ++workload.workflow.train=False" 276 | # cmd += " ++workload.workflow.checkpoint=True" 277 | # We're now using the workflow defined in the yaml file only 278 | return cmd 279 | 280 | def _run(self): 281 | try: 282 | if self.args.command == "run": 283 | self.execute_command() 284 | elif self.args.command == "datasize": 285 | self.datasize() 286 | else: 287 | self.logger.error(f'Invalid command: {self.args.command}') 288 | return EXIT_CODE.INVALID_ARGUMENTS 289 | except Exception as e: 290 | return EXIT_CODE.FAILURE 291 | return EXIT_CODE.SUCCESS 292 | 293 | def datasize(self): 294 | self.logger.verbose(f'Running datasize for {self.args.model}...') 295 | # Calculate the total writes per rank which equates to memory required per rank 296 | # If zero_level is 1, then rank 0 writes the entire model, 297 | # If zero_level is 3, then the model is sharded across all ranks 298 | min_procs, zero_level, GPUpDP, ClosedGPUs = LLM_ALLOWED_VALUES.get(self.args.model) 299 | model_gb, optimizer_gb = LLM_SIZE_BY_RANK.get(self.args.model) 300 | rank_gb = [] 301 | 302 | self.logger.verbose(f'Model & optimizer size: {model_gb:.2f} GB, {optimizer_gb:.2f} GB') 303 | for rank in range(self.args.num_processes): 304 | rank_gb.append(0) 305 | if zero_level == 1: 306 | self.logger.debug("Optimizer is written by all ranks, but only the ranks on the first DP instance write the model") 307 | rank_gb[rank] = optimizer_gb / self.args.num_processes 308 | if rank < GPUpDP: 309 | rank_gb[rank] += model_gb / GPUpDP 310 | self.logger.debug(f'First DP: rank-{rank} write model: {rank_gb[rank]:.2f} GB') 311 | elif zero_level == 3: 312 | rank_gb[rank] = (model_gb + optimizer_gb) / self.args.num_processes 313 | self.logger.debug(f'Rank {rank} writes portion of model and 
optimizer: {rank_gb[rank]:.2f} GB') 314 | else: 315 | self.logger.error(f'Invalid zero_level: {zero_level}') 316 | raise ValueError("Invalid zero_level") 317 | 318 | rank_string = "\n\t".join(f"Rank {rank}: {rank_gb[rank]:.2f} GB" for rank in range(self.args.num_processes)) 319 | 320 | self.logger.result(f'Total GB required per rank:\n\t{rank_string}') 321 | self.logger.result(f'Total GB required for all ranks: {sum(rank_gb):.2f} GB') 322 | 323 | 324 | -------------------------------------------------------------------------------- /mlpstorage/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | 5 | from mlpstorage import VERSION 6 | from mlpstorage.config import (CHECKPOINT_RANKS_STRINGS, MODELS, ACCELERATORS, DEFAULT_HOSTS, VECTORDB_DEFAULT_RUNTIME, 7 | LLM_MODELS, LLM_MODELS_STRINGS, MPI_CMDS, EXEC_TYPE, DEFAULT_RESULTS_DIR, EXIT_CODE, 8 | VECTOR_DTYPES, DISTRIBUTIONS, UNET) 9 | 10 | # TODO: Get rid of this now that I'm not repeating arguments for different subparsers? 11 | help_messages = dict( 12 | # General help messages 13 | sub_commands="Select a subcommand for the benchmark.", 14 | # Model to emulate. A specific model defines the sample size, sample container format, and data ra 15 | model="Model to emulate. A specific model defines the sample size, sample container format, and data \n" 16 | "rates for each supported accelerator.", 17 | accelerator_type="Accelerator to simulate for the benchmark. A specific accelerator defines the data access " 18 | "sizes and rates for each supported workload", 19 | num_accelerators_datasize="Max number of simulated accelerators. In multi-host configurations the accelerators " 20 | "will be initiated in a round-robin fashion to ensure equal distribution of " 21 | "simulated accelerator processes", 22 | num_accelerators_run="Number of simulated accelerators. In multi-host configurations the accelerators " 23 | "will be initiated in a round-robin fashion to ensure equal distribution of " 24 | "simulated accelerator processes", 25 | num_accelerators_datagen="Number of parallel processes to use for dataset generation. Processes will be " 26 | "initiated in a round-robin fashion across the configured client hosts", 27 | num_client_hosts="Number of participating client hosts. Simulated accelerators will be initiated on these " 28 | "hosts in a round-robin fashion", 29 | client_host_mem_GB="Memory available in the client where the benchmark is run. The dataset needs to be 5x the " 30 | "available memory for closed submissions.", 31 | client_hosts="Space-separated list of IP addresses or hostnames of the participating hosts. " 32 | "\nExample: '--hosts 192.168.1.1 192.168.1.2 192.168.1.3' or '--hosts host1 host2 host3'. Slots can " 33 | "be specified by appending ':' to a hostname like so: '--hosts host1:2 host2:2'. This " 34 | "example will run 2 accelerators on each host. If slots are not specified the number of processes " 35 | "will be equally distributed across the hosts with any remainder being distributed evenly on the " 36 | "remaining hosts in the order they are listed.", 37 | category="Benchmark category to be submitted.", 38 | results_dir="Directory where the benchmark results will be saved.", 39 | params="Additional parameters to be passed to the benchmark. These will override the config file. " 40 | "\nFor a closed submission only a subset of params are supported. 
" 41 | "\nMultiple values allowed in the form: " 42 | "\n --params key1=value1 key2=value2 key3=value3", 43 | datasize="The datasize command calculates the number of samples needed for a given workload, accelerator type," 44 | " number of accelerators, and client host memory.", 45 | training_datagen="The datagen command generates a dataset for a given workload and number of parallel generation " 46 | "processes.", 47 | run_benchmark="Run the benchmark with the specified parameters.", 48 | configview="View the final config based on the specified options.", 49 | reportgen="Generate a report from the benchmark results.", 50 | 51 | # Checkpoint foler is used for training and checkpointing 52 | checkpoint_folder="Location for checkpoint files for training or checkpointing workloads", 53 | 54 | # Checkpointing help messages 55 | checkpoint_run="The checkpoint command executes checkpoint saves and restores for a given model.", 56 | llm_model="The model & size to be emulated for checkpointing. The selection will dictate the TP, PP, & DP " 57 | "\nsizes as well as the size of the checkpoint. " 58 | "\nAvailable LLM Models: " 59 | f"\n {LLM_MODELS_STRINGS}", 60 | num_checkpoints="The number of checkpoints to be executed.", 61 | num_checkpoint_accelerators=f"The number of accelerators to emulate for the checkpoint task. Each LLM Model " 62 | f"\ncan be executed with the following accelerator counts: " 63 | f"\n {CHECKPOINT_RANKS_STRINGS}", 64 | deepspeed_zero_level="The DeepSpeed Zero level. \nSupported options: " 65 | "\n 0 = disabled, " 66 | "\n 1 = Optimizer Partitioning, " 67 | "\n 2 = Gradient partitioning, " 68 | "\n 3 = Model Parameter Partitioning", 69 | checkpoint_datasize="The datasize command calculates the total amount of writes for a given command and an estimate " 70 | "of the required memory.", 71 | checkpoint_subset="Run the checkpoint in 'Subset' mode. This mode only runs on a subset of hosts. eg, for large " 72 | "models that required hundreds of processes to do an entire checkpoint, subset mode enables " 73 | "using fewer processes and only doing part of the checkpoint. This is used in the Submissions to " 74 | "represent a single 8-GPU node writing to local storage.", 75 | 76 | # VectorDB help messages 77 | db_ip_address=f"IP address of the VectorDB instance. If not provided, a local VectorDB instance will be used.", 78 | db_port=f"Port number of the VectorDB instance.", 79 | db_collection=f"Collection name for the VectorDB instance.", 80 | dimension=f"Dimensionality of the vectors.", 81 | num_shards=f"Number of shards for the collection. Recommended is 1 for every 1 Million vectors", 82 | vector_dtype=f"Data type of the vectors. Supported options: {VECTOR_DTYPES}", 83 | num_vectors=f"Number of vectors to be inserted into the collection.", 84 | distribution=f"Distribution of the vectors. Supported options: {DISTRIBUTIONS}", 85 | vdb_datagen_batch_size=f"Batch size for data insertion.", 86 | vdb_datagen_chunk_size="Number of vectors to generate in each insertion chunk. 
Tune for memory management.", 87 | 88 | vdb_run_search="Run the VectorDB Search benchmark with the specified parameters.", 89 | vdb_datagen="Generate a dataset for the VectorDB benchmark.", 90 | vdb_report_count="Number of batches between print statements", 91 | num_query_processes=f"Number of parallel processes to use for query execution.", 92 | query_batch_size=f"Number of vectors to query in each batch (per process).", 93 | 94 | # Reports help messages 95 | output_dir=f"Directory where the benchmark report will be saved.", 96 | config_file="Path to YAML file with argument overrides that will be applied after CLI arguments", 97 | 98 | # MPI help messages 99 | mpi_bin=f"Execution type for MPI commands. Supported options: {MPI_CMDS}", 100 | exec_type=f"Execution type for benchmark commands. Supported options: {list(EXEC_TYPE)}", 101 | ) 102 | 103 | prog_descriptions = dict( 104 | training="Run the MLPerf Storage training benchmark", 105 | checkpointing="Run the MLPerf Storage checkpointing benchmark", 106 | vectordb="Run the MLPerf Storage Preview of a VectorDB benchmark (not available in closed submissions)", 107 | ) 108 | 109 | def parse_arguments(): 110 | # Many of the help messages are shared between the subparsers. This dictionary prevents rewriting the same messages 111 | # in multiple places. 112 | parser = argparse.ArgumentParser(description="Script to launch the MLPerf Storage benchmark") 113 | parser.add_argument("--version", action="version", version=f"%(prog)s {VERSION}") 114 | sub_programs = parser.add_subparsers(dest="program", required=True) 115 | sub_programs.required = True 116 | 117 | training_parsers = sub_programs.add_parser("training", description=prog_descriptions['training'], 118 | help="Training benchmark options") 119 | checkpointing_parsers = sub_programs.add_parser("checkpointing", description=prog_descriptions['checkpointing'], 120 | help="Checkpointing benchmark options", 121 | formatter_class=argparse.RawTextHelpFormatter) 122 | vectordb_parsers = sub_programs.add_parser("vectordb", description=prog_descriptions['vectordb'], 123 | help="VectorDB benchmark options") 124 | reports_parsers = sub_programs.add_parser("reports", help="Generate a report from benchmark results") 125 | history_parsers = sub_programs.add_parser("history", help="Display benchmark history") 126 | 127 | sub_programs_map = dict(training=training_parsers, 128 | checkpointing=checkpointing_parsers, 129 | vectordb=vectordb_parsers, 130 | reports=reports_parsers, 131 | history=history_parsers 132 | ) 133 | 134 | add_training_arguments(training_parsers) 135 | add_checkpointing_arguments(checkpointing_parsers) 136 | add_vectordb_arguments(vectordb_parsers) 137 | add_reports_arguments(reports_parsers) 138 | add_history_arguments(history_parsers) 139 | 140 | for _parser in sub_programs_map.values(): 141 | add_universal_arguments(_parser) 142 | 143 | if len(sys.argv) == 1: 144 | parser.print_help(sys.stderr) 145 | sys.exit(1) 146 | 147 | if len(sys.argv) == 2 and sys.argv[1] in sub_programs_map.keys(): 148 | sub_programs_map[sys.argv[1]].print_help(sys.stderr) 149 | sys.exit(1) 150 | 151 | parsed_args = parser.parse_args() 152 | 153 | # Apply YAML config file overrides if specified 154 | if hasattr(parsed_args, 'config_file') and parsed_args.config_file: 155 | parsed_args = apply_yaml_config_overrides(parsed_args) 156 | 157 | validate_args(parsed_args) 158 | return parsed_args 159 | 160 | def apply_yaml_config_overrides(args): 161 | """ 162 | Apply overrides from a YAML config file to the parsed 
arguments. 163 | 164 | Args: 165 | args (argparse.Namespace): The parsed command-line arguments 166 | 167 | Returns: 168 | argparse.Namespace: The updated arguments with YAML overrides applied 169 | """ 170 | import yaml 171 | 172 | try: 173 | with open(args.config_file, 'r') as f: 174 | yaml_config = yaml.safe_load(f) 175 | 176 | if not yaml_config: 177 | print(f"Warning: Config file {args.config_file} is empty or invalid") 178 | return args 179 | 180 | # Convert args to a dictionary for easier manipulation 181 | args_dict = vars(args) 182 | 183 | # Apply overrides from YAML 184 | for key, value in yaml_config.items(): 185 | # Skip if the key doesn't exist in args 186 | if key not in args_dict: 187 | print(f"Warning: Config file contains unknown parameter '{key}', skipping") 188 | continue 189 | 190 | # Skip if the value is None (to avoid overriding CLI args with None) 191 | if value is None: 192 | continue 193 | 194 | # Handle special cases for list arguments 195 | if isinstance(args_dict.get(key), list) and not isinstance(value, list): 196 | if key == 'hosts': 197 | # Convert string to list for hosts 198 | args_dict[key] = value.split(',') 199 | elif key == 'params': 200 | # Convert dict to list of "key=value" strings for params 201 | if isinstance(value, dict): 202 | args_dict[key] = [f"{k}={v}" for k, v in value.items()] 203 | else: 204 | print(f"Warning: Invalid format for 'params' in config file, skipping") 205 | continue 206 | else: 207 | # Regular case - just override the value 208 | args_dict[key] = value 209 | 210 | # Convert back to Namespace 211 | return argparse.Namespace(**args_dict) 212 | 213 | except FileNotFoundError: 214 | print(f"Error: Config file {args.config_file} not found") 215 | sys.exit(EXIT_CODE.INVALID_ARGUMENTS) 216 | except yaml.YAMLError as e: 217 | print(f"Error parsing YAML config file: {e}") 218 | sys.exit(EXIT_CODE.INVALID_ARGUMENTS) 219 | except Exception as e: 220 | print(f"Error applying config file overrides: {e}") 221 | sys.exit(EXIT_CODE.INVALID_ARGUMENTS) 222 | 223 | # These are used by the history tracker to know if logging needs to be updated. 
224 | logging_options = ['debug', 'verbose', 'stream_log_level'] 225 | 226 | def add_universal_arguments(parser): 227 | standard_args = parser.add_argument_group("Standard Arguments") 228 | standard_args.add_argument('--results-dir', '-rd', type=str, default=DEFAULT_RESULTS_DIR, help=help_messages['results_dir']) 229 | standard_args.add_argument('--loops', type=int, default=1, help="Number of times to run the benchmark") 230 | standard_args.add_argument('--config-file', '-c', type=str, help="Path to YAML file with argument overrides") 231 | 232 | # Create a mutually exclusive group for closed/open options 233 | submission_group = standard_args.add_mutually_exclusive_group() 234 | submission_group.add_argument("--open", action="store_false", dest="closed", default=False, 235 | help="Run as an open submission") 236 | submission_group.add_argument("--closed", action="store_true", help="Run as a closed submission") 237 | 238 | output_control = parser.add_argument_group("Output Control") 239 | output_control.add_argument("--debug", action="store_true", help="Enable debug mode") 240 | output_control.add_argument("--verbose", action="store_true", help="Enable verbose mode") 241 | output_control.add_argument("--stream-log-level", type=str, default="INFO",) 242 | 243 | output_control.add_argument("--allow-invalid-params", "-aip", action="store_true", 244 | help="Do not fail on invalid parameters.") 245 | 246 | view_only_args = parser.add_argument_group("View Only") 247 | view_only_args.add_argument("--what-if", action="store_true", help="View the configuration that would execute and " 248 | "the associated command.") 249 | 250 | 251 | def add_mpi_group(parser): 252 | mpi_options = parser.add_argument_group("MPI") 253 | mpi_options.add_argument('--mpi-bin', choices=MPI_CMDS, default="mpirun", help=help_messages['mpi_bin']) 254 | mpi_options.add_argument('--oversubscribe', action="store_true") 255 | mpi_options.add_argument('--allow-run-as-root', action="store_true") 256 | mpi_options.add_argument('--mpi-params', nargs="+", type=str, action="append", help="Other MPI parameters that will be passed to MPI") 257 | 258 | 259 | def add_training_arguments(training_parsers): 260 | training_subparsers = training_parsers.add_subparsers(dest="command", required=True) 261 | training_parsers.required = True 262 | 263 | datasize = training_subparsers.add_parser("datasize", help=help_messages['datasize']) 264 | datagen = training_subparsers.add_parser("datagen", help=help_messages['training_datagen']) 265 | run_benchmark = training_subparsers.add_parser("run", help=help_messages['run_benchmark']) 266 | configview = training_subparsers.add_parser("configview", help=help_messages['configview']) 267 | 268 | for _parser in [datasize, datagen, run_benchmark]: 269 | _parser.add_argument('--hosts', '-s', nargs="+", default=DEFAULT_HOSTS, help=help_messages['client_hosts']) 270 | _parser.add_argument('--model', '-m', choices=MODELS, required=True, help=help_messages['model']) 271 | 272 | # TODO: Add exclusive group for memory or auto-scaling 273 | # For 'datagen' this should be used to ensure enough memory exists to do the generation. 
The if statement 274 | # prevents it from being use today but change when we add the capability 275 | if _parser != datagen: 276 | _parser.add_argument('--client-host-memory-in-gb', '-cm', type=int, required=True, help=help_messages['client_host_mem_GB']) 277 | 278 | _parser.add_argument('--exec-type', '-et', type=EXEC_TYPE, choices=list(EXEC_TYPE), default=EXEC_TYPE.MPI, help=help_messages['exec_type']) 279 | 280 | add_mpi_group(_parser) 281 | 282 | datagen.add_argument('--num-processes', '-np', type=int, required=True, help=help_messages['num_accelerators_datagen']) 283 | datasize.add_argument('--max-accelerators', '-ma', type=int, required=True, help=help_messages['num_accelerators_datasize']) 284 | run_benchmark.add_argument('--num-accelerators', '-na', type=int, required=True, help=help_messages['num_accelerators_run']) 285 | configview.add_argument('--num-accelerators', '-na', type=int, required=True, help=help_messages['num_accelerators_run']) 286 | 287 | for _parser in [datasize, run_benchmark]: 288 | _parser.add_argument('--accelerator-type', '-g', choices=ACCELERATORS, required=True, help=help_messages['accelerator_type']) 289 | _parser.add_argument('--num-client-hosts', '-nc', type=int, help=help_messages['num_client_hosts']) 290 | 291 | for _parser in [datasize, datagen, run_benchmark, configview]: 292 | _parser.add_argument("--data-dir", '-dd', type=str, help="Filesystem location for data") 293 | _parser.add_argument('--params', '-p', nargs="+", type=str, action="append", help=help_messages['params']) 294 | _parser.add_argument('--dlio-bin-path', '-dp', type=str, help="Path to DLIO binary. Default is the same as mlpstorage binary path") 295 | add_universal_arguments(_parser) 296 | 297 | 298 | def add_checkpointing_arguments(checkpointing_parsers): 299 | checkpointing_subparsers = checkpointing_parsers.add_subparsers(dest="command", required=True) 300 | checkpointing_parsers.required = True 301 | 302 | datasize = checkpointing_subparsers.add_parser("datasize", help=help_messages['checkpoint_datasize']) 303 | run_benchmark = checkpointing_subparsers.add_parser("run", help=help_messages['checkpoint_run']) 304 | 305 | for _parser in [datasize, run_benchmark]: 306 | _parser.add_argument('--hosts', '-s', nargs="+", default=DEFAULT_HOSTS, help=help_messages['client_hosts']) 307 | _parser.add_argument('--client-host-memory-in-gb', '-cm', type=int, required=True, 308 | help=help_messages['client_host_mem_GB']) 309 | 310 | # We do not use "choices=LLM_MODELS" here because it makes the help really long. 
We define a string for the 311 | # help that includes the choices and do validation in the validate_args section 312 | _parser.add_argument('--model', '-m', required=True, help=help_messages['llm_model']) 313 | _parser.add_argument('--num-checkpoints-read', '-ncr', type=int, default=10, help=help_messages['num_checkpoints']) 314 | _parser.add_argument('--num-checkpoints-write', '-ncw', type=int, default=10, help=help_messages['num_checkpoints']) 315 | _parser.add_argument('--num-processes', '-np', type=int, required=True, help=help_messages['num_checkpoint_accelerators']) 316 | 317 | # We handle the subset param automatically if number of processes is less than number required for Closed 318 | # _parser.add_argument('--subset', action="store_true", help=help_messages["checkpoint_subset"]) 319 | _parser.add_argument('--params', '-p', nargs="+", type=str, action="append", help=help_messages['params']) 320 | _parser.add_argument("--checkpoint-folder", '-cf', type=str, required=True, help=help_messages['checkpoint_folder']) 321 | _parser.add_argument('--dlio-bin-path', '-dp', type=str, help="Path to DLIO binary. Default is the same as mlpstorage binary path") 322 | 323 | 324 | # Not available in open or closed for MLPS 2.0 325 | # _parser.add_argument('--deepspeed-zero-level', '-dzl', type=zero_level_type, default=0, 326 | # help=help_messages['deepspeed_zero_level']) 327 | 328 | if _parser == run_benchmark: 329 | _parser.add_argument('--exec-type', '-et', type=EXEC_TYPE, choices=list(EXEC_TYPE), default=EXEC_TYPE.MPI, help=help_messages['exec_type']) 330 | add_mpi_group(_parser) 331 | 332 | add_universal_arguments(_parser) 333 | 334 | 335 | def add_vectordb_arguments(vectordb_parsers): 336 | # VectorDB Benchmark 337 | vectordb_subparsers = vectordb_parsers.add_subparsers(dest="command", required=True, help="sub_commands") 338 | vectordb_parsers.required = True 339 | 340 | datagen = vectordb_subparsers.add_parser('datagen', help=help_messages['vdb_datagen']) 341 | run_search = vectordb_subparsers.add_parser('run-search', help=help_messages['vdb_run_search']) 342 | 343 | for _parser in [datagen, run_search]: 344 | _parser.add_argument('--host', '-s', type=str, default="127.0.0.1", help=help_messages['db_ip_address']) 345 | _parser.add_argument('--port', '-p', type=int, default=19530, help=help_messages['db_port']) 346 | _parser.add_argument('--config') 347 | _parser.add_argument('--collection', type=str, help=help_messages['db_collection']) 348 | 349 | # Datagen specific arguments 350 | datagen.add_argument('--dimension', type=int, default=1536, help=help_messages['dimension']) 351 | datagen.add_argument('--num-shards', type=int, default=1, help=help_messages['num_shards']) 352 | datagen.add_argument('--vector-dtype', choices=VECTOR_DTYPES, default="FLOAT_VECTOR", help=help_messages['vector_dtype']) 353 | datagen.add_argument('--num-vectors', type=int, default=1_000_000, help=help_messages['num_vectors']) 354 | datagen.add_argument('--distribution', choices=DISTRIBUTIONS, default="uniform", help=help_messages['distribution']) 355 | datagen.add_argument('--batch-size', type=int, default=1_000, help=help_messages['vdb_datagen_batch_size']) 356 | datagen.add_argument('--chunk-size', type=int, default=10_000, help=help_messages['vdb_datagen_chunk_size']) 357 | datagen.add_argument("--force", action="store_true", help="Force recreate collection if it exists") 358 | 359 | # Add specific VectorDB benchmark options here 360 | run_search.add_argument('--num-query-processes', type=int, default=1, 
help=help_messages['num_query_processes']) 361 | run_search.add_argument('--batch-size', type=int, default=1, help=help_messages['query_batch_size']) 362 | run_search.add_argument('--report-count', type=int, default=100, help=help_messages['vdb_report_count']) 363 | 364 | end_group = run_search.add_argument_group("Provide an end condition of runtime (in seconds) or total number of " 365 | "queries to execute. The default is to run for 60 seconds") 366 | end_condition = end_group.add_mutually_exclusive_group() 367 | end_condition.add_argument("--runtime", type=int, help="Run for a specific duration in seconds") 368 | end_condition.add_argument("--queries", type=int, help="Run for a specific number of queries") 369 | 370 | for _parser in [datagen, run_search]: 371 | add_universal_arguments(_parser) 372 | 373 | 374 | def add_reports_arguments(reports_parsers): 375 | # Reporting 376 | 377 | reports_subparsers = reports_parsers.add_subparsers(dest="command", required=True, help="Sub-commands") 378 | reports_parsers.required = True 379 | 380 | reportgen = reports_subparsers.add_parser('reportgen', help=help_messages['reportgen']) 381 | 382 | reportgen.add_argument('--output-dir', type=str, help=help_messages['output_dir']) 383 | add_universal_arguments(reportgen) 384 | 385 | 386 | def add_history_arguments(history_parsers): 387 | # History 388 | history_subparsers = history_parsers.add_subparsers(dest="command", required=True, help="Sub-commands") 389 | history_parsers.required = True 390 | 391 | history = history_subparsers.add_parser('show', help="Show command history") 392 | history.add_argument('--limit', '-n', type=int, help="Limit to the N most recent commands") 393 | history.add_argument('--id', '-i', type=int, help="Show a specific command by ID") 394 | 395 | rerun = history_subparsers.add_parser('rerun', help="Re-run a command from history") 396 | rerun.add_argument('rerun_id', type=int, help="ID of the command to re-run") 397 | 398 | for _parser in [history, rerun]: 399 | add_universal_arguments(_parser) 400 | 401 | 402 | def validate_args(args): 403 | error_messages = [] 404 | # Add generic validations here. Workload specific validation is in the Benchmark classes 405 | if args.program == "checkpointing": 406 | if args.model not in LLM_MODELS: 407 | error_messages.append("Invalid LLM model. Supported models are: {}".format(", ".join(LLM_MODELS))) 408 | if args.num_checkpoints_read < 0 or args.num_checkpoints_write < 0: 409 | error_messages.append("Number of checkpoints read and write must be non-negative") 410 | 411 | if error_messages: 412 | for msg in error_messages: 413 | print(msg) 414 | 415 | sys.exit(EXIT_CODE.INVALID_ARGUMENTS) 416 | 417 | 418 | def update_args(args): 419 | """ 420 | This method is an interface between the CLI and the benchmark class. 
421 | """
422 | if not hasattr(args, 'num_processes'):
423 | # Different commands for training use different nomenclature for the number of mpi processes to use
424 | # Training = num_accelerators
425 | # Datasize = max_accelerators
426 | # Datagen = num_processes
427 | # Checkpoint = num_processes
428 | # We want to consistently use num_processes in code but the different options for the CLI
429 | for arg in ['num_processes', 'num_accelerators', 'max_accelerators']:
430 | if hasattr(args, arg) and type(getattr(args, arg)) is int:
431 | print(f'Setting attr from {arg} to {getattr(args, arg)}')
432 | setattr(args, 'num_processes', int(getattr(args, arg)))
433 | break
434 | 
435 | if hasattr(args, 'runtime') and hasattr(args, 'queries'):
436 | # For VectorDB we need runtime or queries. If none defined use a default runtime
437 | if not args.runtime and not args.queries:
438 | args.runtime = VECTORDB_DEFAULT_RUNTIME  # Default runtime if not provided
439 | 
440 | # Check for list of lists in params and flatten them
441 | if args.params:
442 | flattened_params = [item for sublist in args.params for item in sublist]
443 | setattr(args, 'params', flattened_params)
444 | 
445 | if args.mpi_params:
446 | flattened_mpi_params = [item for sublist in args.mpi_params for item in sublist]
447 | setattr(args,'mpi_params', flattened_mpi_params)
448 | 
449 | if hasattr(args, 'hosts'):
450 | print(f'Hosts is: {args.hosts}')
451 | # hosts can be comma separated string or a list of strings. If it's a string, it is still a list of length 1
452 | if len(args.hosts) == 1 and isinstance(args.hosts[0], str):
453 | setattr(args, 'hosts', args.hosts[0].split(','))
454 | print(f'Hosts is: {args.hosts}')
455 | 
456 | if not hasattr(args, "num_client_hosts") and hasattr(args, "hosts"):
457 | setattr(args, "num_client_hosts", len(args.hosts))
458 | 
459 | 
460 | if __name__ == "__main__":
461 | args = parse_arguments()
462 | import pprint
463 | pprint.pprint(vars(args))
464 | 
465 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # MLPerf Storage Benchmark Suite
2 | MLPerf® Storage is a benchmark suite to characterize the performance of storage systems that support machine learning workloads.
3 | 
4 | - [Overview](#overview)
5 | - [Prerequisite](#prerequisite)
6 | - [Installation](#installation)
7 | - [Configuration](#configuration)
8 | - [Workloads](#workloads)
9 | - [U-Net3D](#u-net3d)
10 | - [ResNet-50](#resnet-50)
11 | - [CosmoFlow](#cosmoflow)
12 | - [Parameters](#parameters)
13 | - [CLOSED](#closed)
14 | - [OPEN](#open)
15 | - [Submission Rules](#submission-rules)
16 | 
17 | ## Overview
18 | For an overview of how this benchmark suite is used by submitters to compare the performance of storage systems supporting an AI cluster, see the MLPerf® Storage Benchmark submission rules here: [doc](https://github.com/mlcommons/storage/blob/main/Submission_guidelines.md).
19 | 
20 | ## Prerequisite
21 | 
22 | The installation and configuration steps described in this README are validated against clients running Ubuntu 24.04 server with Python 3.12.3. The benchmark script has to be run on only one participating client host (any of them); it internally calls `mpirun` to launch the distributed workloads across multiple client hosts. The launcher client host also participates in the distributed training process.
23 | 
24 | The following prerequisites must be satisfied:
25 | 
26 | 1. 
Pick one host to act as the launcher client host. Passwordless SSH must be set up from the launcher client host to all other participating client hosts. `ssh-copy-id` is a useful tool.
27 | 2. The code and data locations (discussed in further sections) must be exactly the same on every client host, including the launcher host, because the same benchmark command is automatically triggered on every participating client host during the distributed training process.
28 | 
29 | ## Installation
30 | **The following installation steps must be run on every client host that will participate in running the benchmarks.**
31 | 
32 | ### Dependencies
33 | DLIO requires an MPI package.
34 | For example, when running on Ubuntu 24.04, install the OpenMPI tools and libraries.
35 | 
36 | ```bash
37 | sudo apt install python3-pip python3-venv libopenmpi-dev openmpi-common
38 | ```
39 | 
40 | Create a virtual environment for package installations and activate it.
41 | 
42 | ```bash
43 | python3 -m venv ~/.venvs/myenv
44 | source ~/.venvs/myenv/bin/activate
45 | ```
46 | 
47 | ### Pip
48 | Please ensure you have the latest version of pip installed. This avoids an error where the package is built as "UNKNOWN". Upgrade pip like so:
49 | 
50 | ```bash
51 | python3 -m pip install --upgrade pip
52 | ```
53 | 
54 | 
55 | Clone the latest release from the [MLCommons Storage](https://github.com/mlcommons/storage) repository and install the Python dependencies.
56 | 
57 | ```bash
58 | git clone -b v2.0 https://github.com/mlcommons/storage.git
59 | cd storage
60 | pip3 install -e .
61 | ```
62 | 
63 | The working directory structure is as follows:
64 | 
65 | ```
66 | |---storage
67 | |---mlpstorage
68 | |---(folder contains benchmark src files)
69 | |---configs
70 | |---dlio
71 | |---workload
72 | |---(folder contains configs for all checkpoint and training workloads)
73 | |---vectordbbench (These configurations are PREVIEW only and not available for submission)
74 | |---(folder contains configs for all vectordb workloads)
75 | ```
76 | 
77 | The benchmark simulation is performed through the [dlio_benchmark](https://github.com/argonne-lcf/dlio_benchmark) code, a benchmark suite for emulating I/O patterns of deep learning workloads. [dlio_benchmark](https://github.com/argonne-lcf/dlio_benchmark) is listed as a prerequisite pinned to a specific git branch; a future release will update the installer to pull DLIO from PyPI. The DLIO configuration of each workload is specified through a YAML file. You can see the configs of all MLPerf Storage workloads in the `configs` folder.
78 | 
79 | ## Operation
80 | The benchmark uses nested commands to select the workload category, the workload, and the workload parameters.
81 | 
82 | ### Workload Categories
83 | The first argument is the workload category:
84 | - training
85 | - checkpointing
86 | - vectordb (PREVIEW)
87 | 
88 | ```bash
89 | [root@localhost ]# mlpstorage -h
90 | usage: mlpstorage [-h] [--version] {training,checkpointing,vectordb,reports,history} ... 
91 | 92 | Script to launch the MLPerf Storage benchmark 93 | 94 | positional arguments: 95 | {training,checkpointing,vectordb,reports,history} 96 | training Training benchmark options 97 | checkpointing Checkpointing benchmark options 98 | vectordb VectorDB benchmark options 99 | reports Generate a report from benchmark results 100 | history Display benchmark history 101 | 102 | optional arguments: 103 | -h, --help show this help message and exit 104 | --version show program's version number and exit 105 | ``` 106 | 107 | ### Training Category 108 | The training category supports 3 models (unet3d, resnet50, cosmoflow). The benchmark execution process requires these steps: 109 | 1. Datasize - Calculate required number of samples for a given client configuration 110 | 2. Datagen - Generate the required dataset 111 | 3. Run - Execute the benchmark 112 | 113 | ```bash 114 | [root@localhost ]# mlpstorage training --help 115 | usage: mlpstorage training [-h] [--results-dir RESULTS_DIR] [--loops LOOPS] [--open | --closed] [--debug] [--verbose] 116 | [--stream-log-level STREAM_LOG_LEVEL] [--allow-invalid-params] [--what-if] 117 | {datasize,datagen,run,configview} ... 118 | 119 | Run the MLPerf Storage training benchmark 120 | 121 | positional arguments: 122 | {datasize,datagen,run,configview} 123 | datasize The datasize command calculates the number of samples needed for a given workload, accelerator 124 | type, number of accelerators, and client host memory. 125 | datagen The datagen command generates a dataset for a given workload and number of parallel generation 126 | processes. 127 | run Run the benchmark with the specified parameters. 128 | configview View the final config based on the specified options. 129 | 130 | optional arguments: 131 | -h, --help show this help message and exit 132 | 133 | Standard Arguments: 134 | --results-dir RESULTS_DIR, -rd RESULTS_DIR 135 | Directory where the benchmark results will be saved. 136 | --loops LOOPS Number of times to run the benchmark 137 | --open Run as an open submission 138 | --closed Run as a closed submission 139 | 140 | Output Control: 141 | --debug Enable debug mode 142 | --verbose Enable verbose mode 143 | --stream-log-level STREAM_LOG_LEVEL 144 | --allow-invalid-params, -aip 145 | Do not fail on invalid parameters. 146 | 147 | View Only: 148 | --what-if View the configuration that would execute and the associated command. 149 | ``` 150 | 151 | Use ```mlpstorage training {command} --help``` for the full list of parameters for each command. 152 | 153 | #### Data Sizing and Generation 154 | 155 | **Note**: Steps described in this section must be run only in one client host(launcher client). 156 | 157 | The datasize command relies on the accelerator being emulated, the max number of accelerators to support, the system memory in the benchmark clients, and the number of benchmark clients. 158 | 159 | The two rules that generally dictate the datasize are: 160 | 1. The datasize on disk must be 5x the cumulative system memory of the benchmark clients 161 | 2. The benchmark must run for 500 iterations of the given batch size for all GPUs 162 | 163 | If the list of clients is passed in for this command the amount of memory is found programmatically. Otherwise, the user can provide the number of clients and the amount of memory per client for the calculation. 
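
As a rough, back-of-the-envelope illustration of how these two rules combine (the authoritative calculation is performed by `calculate_training_data_size` in `mlpstorage.rules`; the per-sample size and batch size below are illustrative assumptions, not values taken from the workload configs):

```python
# Hypothetical sketch of the datasize sizing rules; not the exact logic behind
# `mlpstorage training datasize`. All constants below are example assumptions.
NUM_CLIENT_HOSTS = 2                  # participating client hosts
MEM_PER_HOST_GB = 128                 # memory per client host, in GB
MAX_ACCELERATORS = 16                 # simulated accelerators
BATCH_SIZE = 4                        # assumed per-accelerator batch size
SAMPLE_SIZE_BYTES = 140 * 1024**2     # assumed average sample size (~140 MB)
REQUIRED_STEPS = 500                  # the benchmark must run 500 iterations

# Rule 1: the dataset on disk must be at least 5x the cumulative client memory.
min_samples_by_memory = (5 * NUM_CLIENT_HOSTS * MEM_PER_HOST_GB * 1024**3) // SAMPLE_SIZE_BYTES

# Rule 2: there must be enough samples for 500 iterations of the batch size on every accelerator.
min_samples_by_steps = REQUIRED_STEPS * BATCH_SIZE * MAX_ACCELERATORS

min_samples = max(min_samples_by_memory, min_samples_by_steps)
print(f"Minimum number of training samples: {min_samples}")
```

The `datasize` command performs this kind of calculation for the selected model, accelerator type, and client configuration, and then prints the `datagen` command to run next.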
164 | 165 | ```bash 166 | [root@localhost ]# mlpstorage training datasize --help 167 | usage: mlpstorage training datasize [-h] [--hosts HOSTS [HOSTS ...]] --model {cosmoflow,resnet50,unet3d} 168 | --client-host-memory-in-gb CLIENT_HOST_MEMORY_IN_GB [--exec-type {mpi,docker}] 169 | [--mpi-bin {mpirun,mpiexec}] [--oversubscribe] [--allow-run-as-root] 170 | --max-accelerators MAX_ACCELERATORS --accelerator-type {h100,a100} 171 | --num-client-hosts NUM_CLIENT_HOSTS [--data-dir DATA_DIR] 172 | [--params PARAMS [PARAMS ...]] 173 | [--results-dir RESULTS_DIR] [--loops LOOPS] [--open | --closed] [--debug] 174 | [--verbose] [--stream-log-level STREAM_LOG_LEVEL] [--allow-invalid-params] 175 | [--what-if] 176 | 177 | optional arguments: 178 | -h, --help show this help message and exit 179 | --hosts HOSTS [HOSTS ...], -s HOSTS [HOSTS ...] 180 | Space-separated list of IP addresses or hostnames of the participating hosts. Example: '-- 181 | hosts 192.168.1.1 192.168.1.2 192.168.1.3' or '--hosts host1 host2 host3' 182 | --model {cosmoflow,resnet50,unet3d}, -m {cosmoflow,resnet50,unet3d} 183 | Model to emulate. A specific model defines the sample size, sample container format, and data 184 | rates for each supported accelerator. 185 | --client-host-memory-in-gb CLIENT_HOST_MEMORY_IN_GB, -cm CLIENT_HOST_MEMORY_IN_GB 186 | Memory available in the client where the benchmark is run. The dataset needs to be 5x the 187 | available memory for closed submissions. 188 | --exec-type {mpi,docker}, -et {mpi,docker} 189 | Execution type for benchmark commands. Supported options: [, 190 | ] 191 | --max-accelerators MAX_ACCELERATORS, -ma MAX_ACCELERATORS 192 | Max number of simulated accelerators. In multi-host configurations the accelerators will be 193 | initiated in a round-robin fashion to ensure equal distribution of simulated accelerator 194 | processes 195 | --accelerator-type {h100,a100}, -g {h100,a100} 196 | Accelerator to simulate for the benchmark. A specific accelerator defines the data access 197 | sizes and rates for each supported workload 198 | --num-client-hosts NUM_CLIENT_HOSTS, -nc NUM_CLIENT_HOSTS 199 | Number of participating client hosts. Simulated accelerators will be initiated on these hosts 200 | in a round-robin fashion 201 | --data-dir DATA_DIR, -dd DATA_DIR 202 | Filesystem location for data 203 | --params PARAMS [PARAMS ...], -p PARAMS [PARAMS ...] 204 | Additional parameters to be passed to the benchmark. These will override the config file. For 205 | a closed submission only a subset of params are supported. Multiple values allowed in the 206 | form: --params key1=value1 key2=value2 key3=value3 207 | --dlio-bin-path DLIO_BIN_PATH, -dp DLIO_BIN_PATH 208 | Path to DLIO binary. Default is the same as mlpstorage binary path 209 | 210 | MPI: 211 | --mpi-bin {mpirun,mpiexec} 212 | Execution type for MPI commands. Supported options: ['mpirun', 'mpiexec'] 213 | --oversubscribe 214 | --allow-run-as-root 215 | 216 | Standard Arguments: 217 | --results-dir RESULTS_DIR, -rd RESULTS_DIR 218 | Directory where the benchmark results will be saved. 219 | --loops LOOPS Number of times to run the benchmark 220 | --open Run as an open submission 221 | --closed Run as a closed submission 222 | 223 | Output Control: 224 | --debug Enable debug mode 225 | --verbose Enable verbose mode 226 | --stream-log-level STREAM_LOG_LEVEL 227 | --allow-invalid-params, -aip 228 | Do not fail on invalid parameters. 229 | 230 | View Only: 231 | --what-if View the configuration that would execute and the associated command. 
232 | ```
233 | 
234 | Example:
235 | 
236 | To calculate the minimum dataset size for a `unet3d` model running on 2 client machines with 128 GB of memory each and a total of 16 simulated a100 accelerators:
237 | 
238 | ```bash
239 | mlpstorage training datasize -m unet3d --client-host-memory-in-gb 128 --max-accelerators 16 --num-client-hosts 2 --accelerator-type a100 --results-dir ~/mlps-results
240 | ```
241 | 
242 | 2. Synthetic data is generated based on the workload requested by the user.
243 | 
244 | ```bash
245 | [root@localhost ]# mlpstorage training datagen --help
246 | usage: mlpstorage training datagen [-h] [--hosts HOSTS [HOSTS ...]] --model {cosmoflow,resnet50,unet3d}
247 | [--exec-type {mpi,docker}] [--mpi-bin {mpirun,mpiexec}] [--oversubscribe]
248 | [--allow-run-as-root] --num-processes NUM_PROCESSES [--data-dir DATA_DIR]
249 | [--ssh-username SSH_USERNAME] [--params PARAMS [PARAMS ...]]
250 | [--results-dir RESULTS_DIR] [--loops LOOPS] [--open | --closed] [--debug]
251 | [--verbose] [--stream-log-level STREAM_LOG_LEVEL] [--allow-invalid-params]
252 | [--what-if]
253 | 
254 | optional arguments:
255 | -h, --help show this help message and exit
256 | --hosts HOSTS [HOSTS ...], -s HOSTS [HOSTS ...]
257 | Space-separated list of IP addresses or hostnames of the participating hosts. Example: '--
258 | hosts 192.168.1.1 192.168.1.2 192.168.1.3' or '--hosts host1 host2 host3'
259 | --model {cosmoflow,resnet50,unet3d}, -m {cosmoflow,resnet50,unet3d}
260 | Model to emulate. A specific model defines the sample size, sample container format, and data
261 | rates for each supported accelerator.
262 | --exec-type {mpi,docker}, -et {mpi,docker}
263 | Execution type for benchmark commands. Supported options: [,
264 | ]
265 | --num-processes NUM_PROCESSES, -np NUM_PROCESSES
266 | Number of parallel processes to use for dataset generation. Processes will be initiated in a
267 | round-robin fashion across the configured client hosts
268 | --data-dir DATA_DIR, -dd DATA_DIR
269 | Filesystem location for data
270 | --params PARAMS [PARAMS ...], -p PARAMS [PARAMS ...]
271 | Additional parameters to be passed to the benchmark. These will override the config file. For
272 | a closed submission only a subset of params are supported. Multiple values allowed in the
273 | form: --params key1=value1 key2=value2 key3=value3
274 | --dlio-bin-path DLIO_BIN_PATH, -dp DLIO_BIN_PATH
275 | Path to DLIO binary. Default is the same as mlpstorage binary path
276 | 
277 | MPI:
278 | --mpi-bin {mpirun,mpiexec}
279 | Execution type for MPI commands. Supported options: ['mpirun', 'mpiexec']
280 | --oversubscribe
281 | --allow-run-as-root
282 | 
283 | Standard Arguments:
284 | --results-dir RESULTS_DIR, -rd RESULTS_DIR
285 | Directory where the benchmark results will be saved.
286 | --loops LOOPS Number of times to run the benchmark
287 | --open Run as an open submission
288 | --closed Run as a closed submission
289 | 
290 | Output Control:
291 | --debug Enable debug mode
292 | --verbose Enable verbose mode
293 | --stream-log-level STREAM_LOG_LEVEL
294 | --allow-invalid-params, -aip
295 | Do not fail on invalid parameters.
296 | 
297 | View Only:
298 | --what-if View the configuration that would execute and the associated command.
299 | ```
300 | 
301 | Example:
302 | 
303 | To generate training data of 56,000 files for the `unet3d` workload into the `unet3d_data` directory using 8 parallel jobs distributed across 2 nodes: 
304 | 305 | ```bash 306 | mlpstorage training datagen --hosts 10.117.61.121,10.117.61.165 --model unet3d --num-processes 8 --data-dir /mnt/unet3d_data --param dataset.num_files_train=56000 307 | ``` 308 | 309 | #### Running a Training Benchmark 310 | 311 | ```bash 312 | [root@localhost ]# mlpstorage training run --help 313 | usage: mlpstorage training run [-h] [--hosts HOSTS [HOSTS ...]] --model {cosmoflow,resnet50,unet3d} 314 | --client-host-memory-in-gb CLIENT_HOST_MEMORY_IN_GB [--exec-type {mpi,docker}] 315 | [--mpi-bin {mpirun,mpiexec}] [--oversubscribe] [--allow-run-as-root] --num-accelerators 316 | NUM_ACCELERATORS --accelerator-type {h100,a100} --num-client-hosts NUM_CLIENT_HOSTS 317 | [--data-dir DATA_DIR] [--ssh-username SSH_USERNAME] [--params PARAMS [PARAMS ...]] 318 | [--results-dir RESULTS_DIR] [--loops LOOPS] [--open | --closed] [--debug] [--verbose] 319 | [--stream-log-level STREAM_LOG_LEVEL] [--allow-invalid-params] [--what-if] 320 | 321 | optional arguments: 322 | -h, --help show this help message and exit 323 | --hosts HOSTS [HOSTS ...], -s HOSTS [HOSTS ...] 324 | Space-separated list of IP addresses or hostnames of the participating hosts. Example: '-- 325 | hosts 192.168.1.1 192.168.1.2 192.168.1.3' or '--hosts host1 host2 host3' 326 | --model {cosmoflow,resnet50,unet3d}, -m {cosmoflow,resnet50,unet3d} 327 | Model to emulate. A specific model defines the sample size, sample container format, and data 328 | rates for each supported accelerator. 329 | --client-host-memory-in-gb CLIENT_HOST_MEMORY_IN_GB, -cm CLIENT_HOST_MEMORY_IN_GB 330 | Memory available in the client where the benchmark is run. The dataset needs to be 5x the 331 | available memory for closed submissions. 332 | --exec-type {mpi,docker}, -et {mpi,docker} 333 | Execution type for benchmark commands. Supported options: [, 334 | ] 335 | --num-accelerators NUM_ACCELERATORS, -na NUM_ACCELERATORS 336 | Number of simulated accelerators. In multi-host configurations the accelerators will be 337 | initiated in a round-robin fashion to ensure equal distribution of simulated accelerator 338 | processes 339 | --accelerator-type {h100,a100}, -g {h100,a100} 340 | Accelerator to simulate for the benchmark. A specific accelerator defines the data access 341 | sizes and rates for each supported workload 342 | --num-client-hosts NUM_CLIENT_HOSTS, -nc NUM_CLIENT_HOSTS 343 | Number of participating client hosts. Simulated accelerators will be initiated on these hosts 344 | in a round-robin fashion 345 | --data-dir DATA_DIR, -dd DATA_DIR 346 | Filesystem location for data 347 | --params PARAMS [PARAMS ...], -p PARAMS [PARAMS ...] 348 | Additional parameters to be passed to the benchmark. These will override the config file. For 349 | a closed submission only a subset of params are supported. Multiple values allowed in the 350 | form: --params key1=value1 key2=value2 key3=value3 351 | --dlio-bin-path DLIO_BIN_PATH, -dp DLIO_BIN_PATH 352 | Path to DLIO binary. Default is the same as mlpstorage binary path 353 | 354 | MPI: 355 | --mpi-bin {mpirun,mpiexec} 356 | Execution type for MPI commands. Supported options: ['mpirun', 'mpiexec'] 357 | --oversubscribe 358 | --allow-run-as-root 359 | 360 | Standard Arguments: 361 | --results-dir RESULTS_DIR, -rd RESULTS_DIR 362 | Directory where the benchmark results will be saved. 
363 | --loops LOOPS Number of times to run the benchmark 364 | --open Run as an open submission 365 | --closed Run as a closed submission 366 | 367 | Output Control: 368 | --debug Enable debug mode 369 | --verbose Enable verbose mode 370 | --stream-log-level STREAM_LOG_LEVEL 371 | --allow-invalid-params, -aip 372 | Do not fail on invalid parameters. 373 | 374 | View Only: 375 | --what-if View the configuration that would execute and the associated command. 376 | 377 | ``` 378 | 379 | Example: 380 | 381 | For running benchmark on `unet3d` workload with data located in `unet3d_data` directory using 2 h100 accelerators spread across 2 client hosts(with IPs 10.117.61.121,10.117.61.165) and results on `unet3d_results` directory, 382 | 383 | ```bash 384 | mlpstorage training run --hosts 10.117.61.121,10.117.61.165 --num-client-hosts 2 --client-host-memory-in-gb 64 --num-accelerators 2 --accelerator-type h100 --model unet3d --data-dir unet3d_data --results-dir unet3d_results --param dataset.num_files_train=400 385 | ``` 386 | 387 | 4. Benchmark submission report is generated by aggregating the individual run results. The reporting command provides the associated functions to generate a report for a given results directory 388 | 389 | ```bash 390 | # TODO: Update 391 | [root@localhost]# mlpstorage reports --help 392 | usage: mlpstorage reports [-h] [--results-dir RESULTS_DIR] [--loops LOOPS] [--open | --closed] [--debug] [--verbose] 393 | [--stream-log-level STREAM_LOG_LEVEL] [--allow-invalid-params] [--what-if] 394 | {reportgen} ... 395 | 396 | positional arguments: 397 | {reportgen} Sub-commands 398 | reportgen Generate a report from the benchmark results. 399 | 400 | optional arguments: 401 | -h, --help show this help message and exit 402 | 403 | Standard Arguments: 404 | --results-dir RESULTS_DIR, -rd RESULTS_DIR 405 | Directory where the benchmark results will be saved. 406 | --loops LOOPS Number of times to run the benchmark 407 | --open Run as an open submission 408 | --closed Run as a closed submission 409 | 410 | Output Control: 411 | --debug Enable debug mode 412 | --verbose Enable verbose mode 413 | --stream-log-level STREAM_LOG_LEVEL 414 | --allow-invalid-params, -aip 415 | Do not fail on invalid parameters. 416 | 417 | View Only: 418 | --what-if View the configuration that would execute and the associated command. 419 | ``` 420 | 421 | To generate the benchmark report, 422 | 423 | ```bash 424 | [root@localhost]# mlpstorage reports reportgen --help 425 | usage: mlpstorage reports reportgen [-h] [--output-dir OUTPUT_DIR] [--results-dir RESULTS_DIR] [--loops LOOPS] 426 | [--open | --closed] [--debug] [--verbose] [--stream-log-level STREAM_LOG_LEVEL] 427 | [--allow-invalid-params] [--what-if] 428 | 429 | optional arguments: 430 | -h, --help show this help message and exit 431 | --output-dir OUTPUT_DIR 432 | Directory where the benchmark report will be saved. 433 | 434 | Standard Arguments: 435 | --results-dir RESULTS_DIR, -rd RESULTS_DIR 436 | Directory where the benchmark results will be saved. 437 | --loops LOOPS Number of times to run the benchmark 438 | --open Run as an open submission 439 | --closed Run as a closed submission 440 | 441 | Output Control: 442 | --debug Enable debug mode 443 | --verbose Enable verbose mode 444 | --stream-log-level STREAM_LOG_LEVEL 445 | --allow-invalid-params, -aip 446 | Do not fail on invalid parameters. 447 | 448 | View Only: 449 | --what-if View the configuration that would execute and the associated command. 
453 | 
454 | ## Training Models
455 | Currently, the storage benchmark suite supports benchmarking of 3 deep learning workloads:
456 | - Image segmentation using the U-Net3D model
457 | - Image classification using the ResNet-50 model
458 | - Cosmology parameter prediction using the CosmoFlow model
459 | 
460 | ### U-Net3D
461 | 
462 | Calculate the minimum dataset size required for the benchmark run based on your client configuration:
463 | 
464 | ```bash
465 | mlpstorage training datasize --model unet3d --client-host-memory-in-gb 64 --num-client-hosts 1 --max-accelerators 4 --accelerator-type h100
466 | ```
467 | 
468 | Generate data for the benchmark run based on the minimum number of files:
469 | 
470 | ```bash
471 | mlpstorage training datagen --hosts 127.0.0.1 --num-processes 8 --model unet3d --data-dir unet3d_data --results-dir unet3d_results --param dataset.num_files_train=42000
472 | ```
473 | 
474 | Run the benchmark:
475 | 
476 | ```bash
477 | mlpstorage training run --hosts 127.0.0.1 --num-client-hosts 1 --client-host-memory-in-gb 64 --num-accelerators 4 --accelerator-type h100 --model unet3d --data-dir unet3d_data --results-dir unet3d_results --param dataset.num_files_train=42000
478 | ```
479 | 
480 | All results will be stored in the directory configured using the `--results-dir` (or `-rd`) argument. To generate the final report, run the following on the launcher client host:
481 | 
482 | ```bash
483 | mlpstorage reports reportgen --results-dir unet3d_results
484 | ```
485 | 
486 | ### ResNet-50
487 | 
488 | Calculate the minimum dataset size required for the benchmark run based on your client configuration:
489 | 
490 | ```bash
491 | mlpstorage training datasize --model resnet50 --client-host-memory-in-gb 64 --num-client-hosts 1 --max-accelerators 16 --accelerator-type h100
492 | ```
493 | 
494 | Generate data for the benchmark run:
495 | 
496 | ```bash
497 | mlpstorage training datagen --hosts 127.0.0.1 --num-processes 8 --model resnet50 --data-dir resnet50_data --results-dir resnet50_results --param dataset.num_files_train=2557
498 | ```
499 | 
500 | Run the benchmark:
501 | 
502 | ```bash
503 | mlpstorage training run --hosts 127.0.0.1 --num-client-hosts 1 --client-host-memory-in-gb 64 --num-accelerators 16 --accelerator-type h100 --model resnet50 --data-dir resnet50_data --results-dir resnet50_results --param dataset.num_files_train=2557
504 | ```
505 | 
506 | All results will be stored in the directory configured using the `--results-dir` (or `-rd`) argument. To generate the final report, run the following on the launcher client host:
507 | 
508 | ```bash
509 | mlpstorage reports reportgen --results-dir resnet50_results
510 | ```
511 | 
512 | ### CosmoFlow
513 | 
514 | Calculate the minimum dataset size required for the benchmark run based on your client configuration:
515 | 
516 | ```bash
517 | mlpstorage training datasize --model cosmoflow --client-host-memory-in-gb 64 --num-client-hosts 1 --max-accelerators 16 --accelerator-type h100
518 | ```
519 | 
520 | Generate data for the benchmark run:
521 | 
522 | ```bash
523 | mlpstorage training datagen --hosts 127.0.0.1 --num-processes 8 --model cosmoflow --data-dir cosmoflow_data --results-dir cosmoflow_results --param dataset.num_files_train=121477
524 | ```
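CosmoFlow generates a very large number of training files, so a single flat directory can become unwieldy on some filesystems. The dataset can be split across subdirectories with the `dataset.num_subfolders_train` parameter from the CLOSED parameter list below; the sketch here assumes the workload honors this setting during data generation, and the subfolder count of 100 is only an example. If you generate data this way, the same override should also be passed to the corresponding `run` command.

```bash
# Illustrative only: split the generated training files across 100 subfolders
mlpstorage training datagen --hosts 127.0.0.1 --num-processes 8 --model cosmoflow --data-dir cosmoflow_data --results-dir cosmoflow_results --param dataset.num_files_train=121477 dataset.num_subfolders_train=100
```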
525 | 
526 | Run the benchmark:
527 | 
528 | ```bash
529 | mlpstorage training run --hosts 127.0.0.1 --num-client-hosts 1 --client-host-memory-in-gb 64 --num-accelerators 16 --accelerator-type h100 --model cosmoflow --data-dir cosmoflow_data --results-dir cosmoflow_results --param dataset.num_files_train=121477
530 | ```
531 | 
532 | All results will be stored in the directory configured using the `--results-dir` (or `-rd`) argument. To generate the final report, run the following on the launcher client host:
533 | 
534 | ```bash
535 | mlpstorage reports reportgen --results-dir cosmoflow_results
536 | ```
537 | 
538 | ## Parameters
539 | 
540 | ### CLOSED
541 | The table below lists the configurable parameters for the benchmark in the CLOSED category.
542 | 
543 | | Parameter | Description | Default |
544 | | ------------------------------ | ------------------------------------------------------------ | ------- |
545 | | **Dataset params** | | |
546 | | dataset.num_files_train | Number of files for the training set | -- |
547 | | dataset.num_subfolders_train | Number of subfolders in which the training set is stored | 0 |
548 | | dataset.data_folder | The path where the dataset is stored | -- |
549 | | **Reader params** | | |
550 | | reader.read_threads | Number of threads to load the data | -- |
551 | | reader.computation_threads | Number of threads to preprocess the data (TensorFlow only) | 1 |
552 | | reader.prefetch_size | Number of batches to prefetch | 2 |
553 | | reader.transfer_size | Number of bytes in the read buffer (TensorFlow only) | |
554 | | reader.odirect | Whether to use direct I/O for the reader (currently applicable to U-Net3D) | False |
555 | | **Checkpoint params** | | |
556 | | checkpoint.checkpoint_folder | The folder to save the checkpoints | -- |
557 | | **Storage params** | | |
558 | | storage.storage_root | The storage root directory | ./ |
559 | | storage.storage_type | The storage type | local_fs |
560 | 
561 | 
562 | ### OPEN
563 | In addition to what can be changed in the CLOSED category, the following parameters can be changed in the OPEN category.
564 | 
565 | | Parameter | Description | Default |
566 | | ------------------------------ | ------------------------------------------------------------ | ------- |
567 | | framework | The machine learning framework | PyTorch for 3D U-Net |
568 | | **Dataset params** | | |
569 | | dataset.format | Format of the dataset | .npz for 3D U-Net |
570 | | dataset.num_samples_per_file | Number of samples per file (only for TensorFlow using tfrecord datasets) | 1 for 3D U-Net |
571 | | **Reader params** | | |
572 | | reader.data_loader | Data loader type (TensorFlow, PyTorch, or custom) | PyTorch for 3D U-Net |
573 | 
574 | 
575 | ## Submission Rules
576 | 
577 | MLPerf™ Storage Benchmark submission rules are described in this [doc](https://github.com/mlcommons/storage/blob/main/Submission_guidelines.md). If you have questions, please contact the [Storage WG chairs](https://mlcommons.org/en/groups/research-storage/).
578 | --------------------------------------------------------------------------------