├── .python-version
├── .github
├── CODEOWNERS
├── ISSUE_TEMPLATE
│ ├── general_question.md
│ ├── feature_request.md
│ └── bug_report.md
├── PULL_REQUEST_TEMPLATE.md
└── workflows
│ └── docs.yml
├── greptile.json
├── .coderabbit.yaml
├── tests
├── ref_data
│ ├── triton-inference-start_server_wrapper.sh
│ ├── sleep.sbatch
│ ├── slurm_container.sbatch
│ ├── nemo-launcher.sbatch
│ ├── ucc.sbatch
│ ├── osu-bench.sbatch
│ ├── ddlb.sbatch
│ ├── nccl.sbatch
│ └── megatron-run.sbatch
├── test_standalone_installer.py
└── test_job_type_handler.py
├── src
└── cloudai
│ ├── util
│ ├── general-report.jinja2
│ ├── general-slurm-report.jinja2
│ ├── nixl_report_template.jinja2
│ └── base-report.jinja2
│ ├── _core
│ ├── __init__.py
│ ├── base_job.py
│ ├── job_status_result.py
│ ├── grading_strategy.py
│ ├── report_generation_strategy.py
│ └── base_system_parser.py
│ ├── systems
│ ├── __init__.py
│ ├── lsf
│ │ ├── lsf_job.py
│ │ └── __init__.py
│ ├── slurm
│ │ ├── slurm_job.py
│ │ └── __init__.py
│ ├── standalone
│ │ ├── standalone_job.py
│ │ └── __init__.py
│ ├── kubernetes
│ │ ├── kubernetes_job.py
│ │ └── __init__.py
│ └── runai
│ │ ├── runai_job.py
│ │ └── __init__.py
│ ├── workloads
│ ├── __init__.py
│ ├── bash_cmd
│ │ └── __init__.py
│ ├── ddlb
│ │ └── __init__.py
│ ├── osu_bench
│ │ └── __init__.py
│ ├── nixl_kvbench
│ │ └── __init__.py
│ ├── slurm_container
│ │ └── __init__.py
│ ├── jax_toolbox
│ │ ├── grading_strategy.py
│ │ └── __init__.py
│ ├── triton_inference
│ │ ├── report_generation_strategy.py
│ │ └── __init__.py
│ ├── deepep
│ │ └── __init__.py
│ ├── megatron_run
│ │ ├── __init__.py
│ │ └── slurm_command_gen_strategy.py
│ ├── nixl_perftest
│ │ ├── __init__.py
│ │ └── report_generation_strategy.py
│ ├── aiconfig
│ │ └── __init__.py
│ ├── ucc_test
│ │ └── __init__.py
│ ├── nixl_bench
│ │ └── __init__.py
│ ├── sleep
│ │ ├── sleep.py
│ │ ├── standalone_command_gen_strategy.py
│ │ ├── slurm_command_gen_strategy.py
│ │ ├── __init__.py
│ │ ├── grading_strategy.py
│ │ └── lsf_command_gen_strategy.py
│ ├── chakra_replay
│ │ ├── __init__.py
│ │ ├── grading_strategy.py
│ │ └── chakra_replay.py
│ ├── nemo_launcher
│ │ └── __init__.py
│ ├── nemo_run
│ │ └── __init__.py
│ ├── ai_dynamo
│ │ └── __init__.py
│ └── nccl_test
│ │ ├── prediction_report_generation_strategy.py
│ │ ├── __init__.py
│ │ └── report_generation_strategy.py
│ ├── report_generator
│ ├── __init__.py
│ └── tool
│ │ ├── __init__.py
│ │ └── report_tool_interface.py
│ ├── cli
│ └── __init__.py
│ ├── configurator
│ └── __init__.py
│ └── __init__.py
├── .taplo.toml
├── conf
├── common
│ ├── test
│ │ ├── sleep.toml
│ │ ├── ucc_test.toml
│ │ ├── nccl_test.toml
│ │ ├── osu_test.toml
│ │ ├── nccl_test_all_gather.toml
│ │ ├── dse_nccl_all_gather.toml
│ │ └── nemo_run_llama3_8b.toml
│ ├── system
│ │ ├── standalone_system.toml
│ │ ├── kubernetes_cluster.toml
│ │ ├── example_runai_cluster.toml
│ │ └── example_slurm_cluster.toml
│ └── test_scenario
│ │ ├── osu_test.toml
│ │ ├── dse_nccl_all_gather.toml
│ │ ├── nemo_run_llama3_8b.toml
│ │ ├── slurm_container.toml
│ │ ├── ucc_generator_test.toml
│ │ ├── sleep.toml
│ │ └── ucc_test.toml
├── hook
│ ├── nccl_test.toml
│ └── test
│ │ └── nccl_test_all_gather.toml
├── release
│ ├── nemo_acceptance
│ │ ├── test_scenario
│ │ │ ├── gpt3_126m_mock.toml
│ │ │ ├── gpt3_126m_pile.toml
│ │ │ ├── nccl_test_loopback.toml
│ │ │ └── nemo_run_llama3_8b.toml
│ │ └── test
│ │ │ ├── nccl_test_all_reduce_loopback.toml
│ │ │ ├── nemo_run_llama3_8b.toml
│ │ │ └── gpt3_126m_mock.toml
│ └── spcx
│ │ └── l40s
│ │ └── test
│ │ ├── l40s-bc-nccl_test_gather.toml
│ │ ├── l40s-bc-nccl_test_reduce.toml
│ │ ├── l40s-bc-nccl_test_scatter.toml
│ │ ├── l40s-bc-nccl_test_broadcast.toml
│ │ ├── l40s-bc-nccl_test_hypercube.toml
│ │ ├── l40s-bc-nccl_test_alltoall.toml
│ │ ├── l40s-bc-nccl_test_sendrecv.toml
│ │ ├── l40s-bc-nccl_test_all_reduce.toml
│ │ ├── l40s-bc-nccl_test_bisection.toml
│ │ ├── l40s-bc-nccl_test_sendrecv_worst.toml
│ │ ├── l40s-bc-nccl_test_alltoall_worst.toml
│ │ ├── l40s-bc-nccl_test_all_reduce_worst.toml
│ │ ├── l40s-bc-nccl_test_alltoall_worst_failover.toml
│ │ ├── l40s-bc-nccl_test_reduce_scatter.toml
│ │ ├── l40s-bc-nccl_test_all_gather.toml
│ │ ├── l40s-bc-nccl_test_all_gather_worst.toml
│ │ └── l40s-bc-nccl_test_reduce_scatter_worst.toml
└── experimental
│ ├── test_scenario
│ ├── ddlb_test.toml
│ ├── nemo_launcher_nemotron_15b_bf16_2_node.toml
│ ├── nemo_launcher_nemotron_15b_fp8_2_node.toml
│ ├── nemo_launcher_nemotron_15b_fp8_4_node.toml
│ ├── nemo_launcher_nemotron_15b_fp8_8_node.toml
│ ├── nemo_launcher_nemotron_15b_fp8_16_node.toml
│ ├── nemo_launcher_nemotron_15b_fp8_32_node.toml
│ ├── nemo_launcher_nemotron_15b_fp8_64_node.toml
│ ├── nemo_launcher_nemotron_15b_bf16_128_node.toml
│ ├── nemo_launcher_nemotron_15b_bf16_256_node.toml
│ ├── nemo_launcher_nemotron_15b_fp8_128_node.toml
│ ├── nemo_launcher_nemotron_15b_fp8_256_node.toml
│ ├── deepep.toml
│ ├── nemo_launcher_nemotron_15b_bf16_16_node.toml
│ ├── nemo_launcher_nemotron_15b_bf16_4_node.toml
│ ├── nemo_launcher_nemotron_15b_bf16_8_node.toml
│ ├── nemo_launcher_nemotron_15b_bf16_32_node.toml
│ ├── nemo_launcher_nemotron_15b_bf16_64_node.toml
│ └── nemo_launcher_nemotron_15b_fp8.toml
│ ├── aiconfigurator
│ ├── test_scenario
│ │ └── aiconfigurator_disagg.toml
│ └── test
│ │ ├── aiconfigurator_disagg.toml
│ │ └── dse_aiconfigurator_disagg.toml
│ ├── ai_dynamo
│ ├── test_scenario
│ │ ├── vllm_k8s.toml
│ │ └── vllm_slurm.toml
│ └── test
│ │ └── vllm.toml
│ └── test
│ ├── ddlb_test.toml
│ ├── deepep_low_latency.toml
│ └── deepep_standard.toml
├── doc
├── Makefile
└── workloads
│ ├── sleep.rst
│ ├── ucc.rst
│ ├── nemo_run.rst
│ ├── nccl.rst
│ ├── chakra_replay.rst
│ ├── bash_cmd.rst
│ ├── slurm_container.rst
│ ├── index.rst
│ ├── nixl_kvbench.rst
│ ├── nixl_bench.rst
│ ├── ddlb.rst
│ └── osu.rst
└── .gitignore
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10
2 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @amaslenn @srivatsankrishnan @jeffnvidia @alexmanle
2 |
--------------------------------------------------------------------------------
/greptile.json:
--------------------------------------------------------------------------------
1 | {
2 | "strictness": 1,
3 | "sequenceDiagramSection": {
4 | "included": true,
5 | "collapsible": true,
6 | "defaultOpen": false
7 | }
8 | }
--------------------------------------------------------------------------------
/.coderabbit.yaml:
--------------------------------------------------------------------------------
1 | reviews:
2 | profile: assertive
3 | auto_review:
4 | enabled: true
5 | sequence_diagrams: false
6 | high_level_summary: false
7 | collapse_walkthrough: true
8 | pre_merge_checks:
9 | docstrings:
10 | mode: "off"
11 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/general_question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: General question
3 | about: Ask a question or seek clarification about the project
4 | title: ''
5 | labels: 'question'
6 | assignees: ''
7 | ---
8 |
9 | > Please provide a detailed description of your question or the information you seek.
10 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | ---
8 |
9 | ## Problem Related to the Feature
10 | > A clear and concise description of what the problem is.
11 |
12 | ## Proposed Solution
13 | > A clear and concise description of what you want to happen.
14 |
--------------------------------------------------------------------------------
/tests/ref_data/triton-inference-start_server_wrapper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export NIM_LEADER_IP_ADDRESS=${SLURM_JOB_MASTER_NODE}
4 | export NIM_NODE_RANK=${SLURM_NODEID}
5 |
6 | export NIM_MODEL_NAME='__OUTPUT_DIR__/output'
7 | export NIM_CACHE_PATH='__OUTPUT_DIR__/output'
8 |
9 | if [ "$NIM_NODE_RANK" -eq 0 ]; then
10 | export NIM_LEADER_ROLE=1
11 | else
12 | export NIM_LEADER_ROLE=0
13 | fi
14 |
15 | echo "Starting NIM server on node rank ${NIM_NODE_RANK} with leader role ${NIM_LEADER_ROLE}"
16 | exec /opt/nim/start_server.sh
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | ---
8 |
9 | ## Describe the Bug
10 | > A clear and concise description of what the bug is.
11 |
12 | ## Steps to Reproduce
13 | > Steps to reproduce the behavior.
14 | > Please include the version information where the bug was observed.
15 |
16 | ## Expected Behavior
17 | > A clear and concise description of what you expected to happen.
18 |
19 | ## Screenshots
20 | > If applicable, add screenshots to help explain your problem.
21 |
--------------------------------------------------------------------------------
/src/cloudai/util/general-report.jinja2:
--------------------------------------------------------------------------------
1 | {% extends "base-report.jinja2" %}
2 |
3 | {% block content %}
4 | <table>
5 | <tr>
6 | <th>Test</th>
7 | <th>Description</th>
8 | <th>Results</th>
9 | </tr>
10 | {% for item in report_items %}
11 | <tr>
12 | <td>{{ item.name }}</td>
13 | <td>{{ item.description }}</td>
14 | {% if item.logs_path %}
15 | <td><a href="{{ item.logs_path }}">logs</a></td>
16 | {% else %}
17 | <td>no logs</td>
18 | {% endif %}
19 | </tr>
20 | {% endfor %}
21 | </table>
22 | {% endblock %}
23 |
--------------------------------------------------------------------------------
/src/cloudai/util/general-slurm-report.jinja2:
--------------------------------------------------------------------------------
1 | {% extends "base-report.jinja2" %}
2 |
3 | {% block content %}
4 | <table>
5 | <tr>
6 | <th>Test</th>
7 | <th>Description</th>
8 | <th>Results</th>
9 | <th>Nodes</th>
10 | </tr>
11 | {% for item in report_items %}
12 | <tr>
13 | <td>{{ item.name }}</td>
14 | <td>{{ item.description }}</td>
15 | {% if item.logs_path %}
16 | <td><a href="{{ item.logs_path }}">logs</a></td>
17 | {% else %}
18 | <td>no logs</td>
19 | {% endif %}
20 | <td>{{ item.nodes }}</td>
21 | </tr>
22 | {% endfor %}
23 | </table>
24 | {% endblock %}
25 |
--------------------------------------------------------------------------------
/src/cloudai/_core/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/src/cloudai/systems/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/src/cloudai/report_generator/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/src/cloudai/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .cli import main, setup_logging
18 |
19 | __all__ = [
20 | "main",
21 | "setup_logging",
22 | ]
23 |
--------------------------------------------------------------------------------
/.taplo.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | include = ["pyproject.toml", ".taplo.toml", "conf/**/*.toml"]
18 |
19 | [formatting]
20 | indent_entries = false
21 | indent_tables = true
22 |
--------------------------------------------------------------------------------
/conf/common/test/sleep.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "sleep"
18 | description = "sleep test"
19 | test_template_name = "Sleep"
20 |
21 | [cmd_args]
22 | seconds = 1
23 |
--------------------------------------------------------------------------------
/conf/hook/nccl_test.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test"
18 |
19 | [[Tests]]
20 | id = "Tests.1"
21 | test_name = "nccl_test_all_gather"
22 | time_limit = "00:20:00"
23 |
--------------------------------------------------------------------------------
/conf/common/system/standalone_system.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "standalone_system"
18 | scheduler = "standalone"
19 |
20 | install_path = "./install"
21 | output_path = "./results"
22 |
--------------------------------------------------------------------------------
/conf/release/nemo_acceptance/test_scenario/gpt3_126m_mock.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "gpt3_126m_mock"
18 |
19 | [[Tests]]
20 | id = "Tests.1"
21 | test_name = "gpt3_126m_mock"
22 |
--------------------------------------------------------------------------------
/conf/release/nemo_acceptance/test_scenario/gpt3_126m_pile.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "gpt3_126m_pile"
18 |
19 | [[Tests]]
20 | id = "Tests.1"
21 | test_name = "gpt3_126m_pile"
22 |
--------------------------------------------------------------------------------
/src/cloudai/report_generator/tool/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .tensorboard_data_reader import TensorBoardDataReader
18 |
19 | __all__ = [
20 | "TensorBoardDataReader",
21 | ]
22 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Summary
2 | Provide a concise summary of the changes introduced by this pull request. Detail the purpose and scope of the changes, referencing any relevant issues or discussions. Explain how these changes address the problem or improve the project.
3 |
4 | ## Test Plan
5 | In this section, describe the testing you have performed to verify the changes. Include:
6 | - A clear description of the testing environment.
7 | - The steps you followed to test the new features or bug fixes.
8 | - Any specific commands used during testing, along with their outputs.
9 | - A description of the results and observations from your testing.
10 | This information is crucial for reviewers to understand how the changes have been validated.
11 |
12 | ## Additional Notes
13 | Include any other notes or comments about the pull request here. This can include challenges faced, future considerations, or context that reviewers might find helpful.
14 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/ddlb_test.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "ddlb-test"
18 |
19 | [[Tests]]
20 | id = "Tests.ddlb"
21 | test_name = "ddlb_test"
22 | num_nodes = 1
23 | time_limit = "00:30:00"
24 |
--------------------------------------------------------------------------------
/tests/ref_data/sleep.sbatch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # generated by CloudAI@__CLOUDAI_VERSION__
3 | #SBATCH --job-name=__JOB_NAME__
4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6 | #SBATCH --partition=main
7 | #SBATCH -N 1
8 | #SBATCH --gpus-per-node=8
9 | #SBATCH --gres=gpu:8
10 |
11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
12 |
13 | srun --export=ALL --mpi=pmix -N1 --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
14 |
15 | srun --export=ALL --mpi=pmix -N1 --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash __OUTPUT_DIR__/install/slurm-metadata.sh
16 |
17 | srun --export=ALL --mpi=pmix -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; sleep 5"
18 |
--------------------------------------------------------------------------------
/conf/common/test_scenario/osu_test.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "osu_test_scenario"
18 | job_status_check = true
19 |
20 | [[Tests]]
21 | id = "Tests.1"
22 | test_name = "osu_test"
23 | num_nodes = "2"
24 | time_limit = "00:20:00"
25 |
--------------------------------------------------------------------------------
/conf/common/test/ucc_test.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "ucc_base_test"
18 | description = "Base config for UCC tests"
19 | test_template_name = "UCCTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 |
--------------------------------------------------------------------------------
/conf/release/nemo_acceptance/test_scenario/nccl_test_loopback.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl-loopback-test"
18 |
19 | [[Tests]]
20 | id = "Tests.1"
21 | test_name = "nccl_test_all_reduce_loopback"
22 | num_nodes = "2"
23 | time_limit = "00:20:00"
24 |
--------------------------------------------------------------------------------
/conf/release/nemo_acceptance/test_scenario/nemo_run_llama3_8b.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_run_llama3_8b"
18 |
19 | [[Tests]]
20 | id = "nemo_run_llama3_8b"
21 | test_name = "nemo_run_llama3_8b"
22 | num_nodes = "1"
23 | time_limit = "00:30:00"
24 |
--------------------------------------------------------------------------------
/src/cloudai/systems/lsf/lsf_job.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from dataclasses import dataclass
18 |
19 | from cloudai.core import BaseJob
20 |
21 |
22 | @dataclass
23 | class LSFJob(BaseJob):
24 | """A job class for execution on an LSF system; adds no fields beyond those of BaseJob."""
25 |
26 | pass
27 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/bash_cmd/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .bash_cmd import BashCmdArgs, BashCmdCommandGenStrategy, BashCmdTestDefinition
18 |
19 | __all__ = [
20 | "BashCmdArgs",
21 | "BashCmdCommandGenStrategy",
22 | "BashCmdTestDefinition",
23 | ]
24 |
--------------------------------------------------------------------------------
/src/cloudai/systems/slurm/slurm_job.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from dataclasses import dataclass
18 |
19 | from cloudai.core import BaseJob
20 |
21 |
22 | @dataclass
23 | class SlurmJob(BaseJob):
24 | """A job class for execution on a Slurm system; adds no fields beyond those of BaseJob."""
25 |
26 | pass
27 |
--------------------------------------------------------------------------------
/conf/common/test_scenario/dse_nccl_all_gather.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "dse-nccl-test"
18 |
19 | pre_test = "nccl_test"
20 | post_test = "nccl_test"
21 |
22 | [[Tests]]
23 | id = "Tests.1"
24 | test_name = "dse_nccl_all_gather"
25 | num_nodes = "2"
26 | time_limit = "00:20:00"
27 |
--------------------------------------------------------------------------------
/src/cloudai/systems/standalone/standalone_job.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from dataclasses import dataclass
18 |
19 | from cloudai.core import BaseJob
20 |
21 |
22 | @dataclass
23 | class StandaloneJob(BaseJob):
24 | """A job class for standalone execution; adds no fields beyond those of BaseJob."""
25 |
26 | pass
27 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_2_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_bf16_2_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_bf16_2_node"
21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
22 | num_nodes = "2"
23 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_2_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_fp8_2_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_fp8_2_node"
21 | test_name = "nemo_launcher_nemotron_15b_fp8_2_node"
22 | num_nodes = "2"
23 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_4_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_fp8_4_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_fp8_4_node"
21 | test_name = "nemo_launcher_nemotron_15b_fp8_4_node"
22 | num_nodes = "4"
23 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_8_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_fp8_8_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_fp8_8_node"
21 | test_name = "nemo_launcher_nemotron_15b_fp8_8_node"
22 | num_nodes = "8"
23 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_gather.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_gather"
18 | description = "gather"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "gather_perf_mpi"
24 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_reduce"
18 | description = "reduce"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "reduce_perf_mpi"
24 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_16_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_fp8_16_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_fp8_16_node"
21 | test_name = "nemo_launcher_nemotron_15b_fp8_16_node"
22 | num_nodes = "16"
23 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_32_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_fp8_32_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_fp8_32_node"
21 | test_name = "nemo_launcher_nemotron_15b_fp8_32_node"
22 | num_nodes = "32"
23 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_64_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_fp8_64_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_fp8_64_node"
21 | test_name = "nemo_launcher_nemotron_15b_fp8_64_node"
22 | num_nodes = "64"
23 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_scatter.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_scatter"
18 | description = "scatter"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "scatter_perf_mpi"
24 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_128_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_bf16_128_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_bf16_128_node"
21 | test_name = "nemo_launcher_nemotron_15b_bf16_128_node"
22 | num_nodes = "128"
23 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_256_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_bf16_256_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_bf16_256_node"
21 | test_name = "nemo_launcher_nemotron_15b_bf16_256_node"
22 | num_nodes = "256"
23 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_128_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_fp8_128_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_fp8_128_node"
21 | test_name = "nemo_launcher_nemotron_15b_fp8_128_node"
22 | num_nodes = "128"
23 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_256_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_fp8_256_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_fp8_256_node"
21 | test_name = "nemo_launcher_nemotron_15b_fp8_256_node"
22 | num_nodes = "256"
23 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_broadcast.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_broadcast"
18 | description = "broadcast"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "broadcast_perf_mpi"
24 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_hypercube.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_hypercube"
18 | description = "hypercube"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "hypercube_perf_mpi"
24 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/ddlb/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .ddlb import DDLBCmdArgs, DDLBTestDefinition
18 | from .slurm_command_gen_strategy import DDLBTestSlurmCommandGenStrategy
19 |
20 | __all__ = [
21 | "DDLBCmdArgs",
22 | "DDLBTestDefinition",
23 | "DDLBTestSlurmCommandGenStrategy",
24 | ]
25 |
--------------------------------------------------------------------------------
/src/cloudai/systems/kubernetes/kubernetes_job.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | from dataclasses import dataclass
19 |
20 | from cloudai.core import BaseJob
21 |
22 |
23 | @dataclass
24 | class KubernetesJob(BaseJob):
25 | """A job class for execution on a Kubernetes system; adds string fields `kind` and `name`."""
26 |
27 | kind: str
28 | name: str
29 |
--------------------------------------------------------------------------------
/src/cloudai/configurator/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .base_agent import BaseAgent
18 | from .base_gym import BaseGym
19 | from .cloudai_gym import CloudAIGymEnv
20 | from .grid_search import GridSearchAgent
21 |
22 | __all__ = [
23 | "BaseAgent",
24 | "BaseGym",
25 | "CloudAIGymEnv",
26 | "GridSearchAgent",
27 | ]
28 |
--------------------------------------------------------------------------------
/src/cloudai/systems/runai/runai_job.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from dataclasses import dataclass
18 |
19 | from cloudai.core import BaseJob
20 |
21 | from .runai_training import ActualPhase
22 |
23 |
24 | @dataclass
25 | class RunAIJob(BaseJob):
26 | """A job class for execution on a RunAI system; adds a `status` field of type ActualPhase."""
27 |
28 | status: ActualPhase
29 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/osu_bench/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .osu_bench import OSUBenchCmdArgs, OSUBenchTestDefinition
18 | from .slurm_command_gen_strategy import OSUBenchSlurmCommandGenStrategy
19 |
20 | __all__ = [
21 | "OSUBenchCmdArgs",
22 | "OSUBenchSlurmCommandGenStrategy",
23 | "OSUBenchTestDefinition",
24 | ]
25 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/deepep.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "deepep-benchmark"
18 |
19 | [[Tests]]
20 | id = "Tests.1"
21 | test_name = "deepep_standard"
22 | num_nodes = 2
23 | time_limit = "00:30:00"
24 |
25 | [[Tests]]
26 | id = "Tests.2"
27 | test_name = "deepep_low_latency"
28 | num_nodes = 2
29 | time_limit = "00:30:00"
30 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | SPHINXOPTS ?=
18 | SPHINXBUILD ?= sphinx-build
19 | SOURCEDIR = .
20 | BUILDDIR = _build
21 |
22 | help:  # first target in the file; runs sphinx-build in make-mode with the "help" builder name
23 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
24 |
25 | .PHONY: help Makefile
26 |
27 | %: Makefile  # catch-all pattern rule: passes the requested target name ($@) to sphinx-build -M
28 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
29 |
--------------------------------------------------------------------------------
/conf/common/test/nccl_test.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_base_test"
18 | description = "NCCL base test configuration"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | ngpus = 1
24 | minbytes = "128"
25 | maxbytes = "4G"
26 | iters = 100
27 | warmup_iters = 50
28 | stepfactor = 2
29 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/nixl_kvbench/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .nixl_kvbench import NIXLKVBenchCmdArgs, NIXLKVBenchTestDefinition
18 | from .slurm_command_gen_strategy import NIXLKVBenchSlurmCommandGenStrategy
19 |
20 | __all__ = [
21 | "NIXLKVBenchCmdArgs",
22 | "NIXLKVBenchSlurmCommandGenStrategy",
23 | "NIXLKVBenchTestDefinition",
24 | ]
25 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_16_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_bf16_16_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_bf16_16_node"
21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
22 | num_nodes = "16"
23 |
24 | [Tests.cmd_args.training.model]
25 | global_batch_size = 512
26 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_4_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_bf16_4_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_bf16_4_node"
21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
22 | num_nodes = "4"
23 |
24 | [Tests.cmd_args.training.model]
25 | global_batch_size = 128
26 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_8_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_bf16_8_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_bf16_8_node"
21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
22 | num_nodes = "8"
23 |
24 | [Tests.cmd_args.training.model]
25 | global_batch_size = 256
26 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_32_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_bf16_32_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_bf16_32_node"
21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
22 | num_nodes = "32"
23 |
24 | [Tests.cmd_args.training.model]
25 | global_batch_size = 1024
26 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_64_node.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_bf16_64_node"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_bf16_64_node"
21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node"
22 | num_nodes = "64"
23 |
24 | [Tests.cmd_args.training.model]
25 | global_batch_size = 2048
26 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/slurm_container/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .slurm_command_gen_strategy import SlurmContainerCommandGenStrategy
18 | from .slurm_container import SlurmContainerCmdArgs, SlurmContainerTestDefinition
19 |
20 | __all__ = [
21 | "SlurmContainerCmdArgs",
22 | "SlurmContainerCommandGenStrategy",
23 | "SlurmContainerTestDefinition",
24 | ]
25 |
--------------------------------------------------------------------------------
/conf/common/test/osu_test.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "osu_test"
18 | test_template_name = "OSUBench"
19 | description = "OSU Benchmark example"
20 |
21 | [cmd_args]
22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "benchmarks_dir" = "/opt/hpcx/ompi/tests/osu-micro-benchmarks"
24 | "benchmark" = "osu_allreduce"
25 | "iterations" = 10
26 | "message_size" = "1024"
27 |
--------------------------------------------------------------------------------
/conf/common/test_scenario/nemo_run_llama3_8b.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_run_llama3_8b"
18 |
19 | [[Tests]]
20 | id = "nemo_run_llama3_8b.single-node"
21 | test_name = "nemo_run_llama3_8b"
22 | num_nodes = 1
23 | time_limit = "00:60:00"
24 |
25 | [[Tests]]
26 | id = "nemo_run_llama3_8b.2nodes"
27 | test_name = "nemo_run_llama3_8b"
28 | num_nodes = 2
29 | time_limit = "00:60:00"
30 |
--------------------------------------------------------------------------------
/tests/test_standalone_installer.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from unittest.mock import Mock
18 |
19 | from cloudai.systems.standalone.standalone_installer import StandaloneInstaller
20 | from cloudai.systems.standalone.standalone_system import StandaloneSystem
21 |
22 |
23 | def test_create():  # smoke test: StandaloneInstaller can be constructed from a mocked system
24 | installer = StandaloneInstaller(Mock(autospec=StandaloneSystem))  # NOTE(review): Mock() has no `autospec` option, so this only sets an attribute named `autospec` on an unspecced mock -- confirm whether `spec=` was intended
25 | assert installer  # only checks that the constructed object is truthy
26 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/jax_toolbox/grading_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | from pathlib import Path
17 |
18 | from cloudai.core import GradingStrategy
19 |
20 |
21 | class JaxToolboxGradingStrategy(GradingStrategy):
22 | """Performance grading strategy for JaxToolbox test templates on Slurm systems (currently a constant stub)."""
23 |
24 | def grade(self, directory_path: Path, ideal_perf: float) -> float:
25 | return 0.0  # stub: both arguments are ignored and the grade is always 0.0
26 |
--------------------------------------------------------------------------------
/src/cloudai/systems/kubernetes/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .kubernetes_installer import KubernetesInstaller
18 | from .kubernetes_job import KubernetesJob
19 | from .kubernetes_runner import KubernetesRunner
20 | from .kubernetes_system import KubernetesSystem
21 |
22 | __all__ = [
23 | "KubernetesInstaller",
24 | "KubernetesJob",
25 | "KubernetesRunner",
26 | "KubernetesSystem",
27 | ]
28 |
--------------------------------------------------------------------------------
/conf/common/test/nccl_test_all_gather.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_all_gather"
18 | description = "all_gather"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "all_gather_perf_mpi"
24 | "ngpus" = "1"
25 | "minbytes" = "128"
26 | "maxbytes" = "4G"
27 | "iters" = "100"
28 | "warmup_iters" = "50"
29 | "stepfactor" = 2
30 |
--------------------------------------------------------------------------------
/src/cloudai/_core/base_job.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from dataclasses import dataclass, field
18 | from typing import Union
19 |
20 | from .test_scenario import TestRun
21 |
22 |
23 | @dataclass
24 | class BaseJob:
25 | """Base class for representing a job created by executing a test."""
26 |
27 | test_run: TestRun  # the TestRun object associated with this job
28 | id: Union[str, int]  # job identifier; may be either a string or an integer
29 | terminated_by_dependency: bool = field(default=False, init=False)  # not an __init__ argument; always starts as False
30 |
--------------------------------------------------------------------------------
/src/cloudai/systems/standalone/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | from .standalone_installer import StandaloneInstaller
19 | from .standalone_job import StandaloneJob
20 | from .standalone_runner import StandaloneRunner
21 | from .standalone_system import StandaloneSystem
22 |
23 | __all__ = [
24 | "StandaloneInstaller",
25 | "StandaloneJob",
26 | "StandaloneRunner",
27 | "StandaloneSystem",
28 | ]
29 |
--------------------------------------------------------------------------------
/conf/experimental/aiconfigurator/test_scenario/aiconfigurator_disagg.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "aiconfigurator-disagg-scenario"
18 |
19 | [[Tests]]
20 | id = "aiconfigurator.disagg.demo"
21 | time_limit = "00:05:00"
22 | test_name = "dse_aiconfigurator_disagg_demo_Llama3.1_70B"
23 | num_nodes = 1
24 | agent_metrics = [
25 | "ttft_ms",
26 | "tpot_ms",
27 | "tokens_per_s_per_gpu",
28 | "tokens_per_s_per_user",
29 | ]
30 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/triton_inference/report_generation_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from cloudai.core import ReportGenerationStrategy
18 |
19 |
20 | class TritonInferenceReportGenerationStrategy(ReportGenerationStrategy):
21 | """Report generation strategy for TritonInference; accepts everything and generates nothing."""
22 |
23 | def can_handle_directory(self) -> bool:
24 | return True  # unconditional: nothing is inspected before answering
25 |
26 | def generate_report(self) -> None:
27 | pass  # no-op: this strategy does not produce any report output
28 |
--------------------------------------------------------------------------------
/src/cloudai/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .core import (
18 | Registry,
19 | Runner,
20 | System,
21 | TestDefinition,
22 | TestRun,
23 | TestScenario,
24 | )
25 |
26 | # Public API
27 | __all__ = [
28 | "Registry",
29 | "Runner",
30 | "System",
31 | "TestDefinition",
32 | "TestRun",
33 | "TestScenario",
34 | ]
35 |
36 | from .registration import register_all
37 |
38 | register_all()  # module-level call: runs as a side effect of importing the cloudai package
39 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/deepep/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .deepep import DeepEPCmdArgs, DeepEPTestDefinition
18 | from .report_generation_strategy import DeepEPReportGenerationStrategy
19 | from .slurm_command_gen_strategy import DeepEPSlurmCommandGenStrategy
20 |
21 | __all__ = [
22 | "DeepEPCmdArgs",
23 | "DeepEPReportGenerationStrategy",
24 | "DeepEPSlurmCommandGenStrategy",
25 | "DeepEPTestDefinition",
26 | ]
27 |
--------------------------------------------------------------------------------
/conf/experimental/ai_dynamo/test_scenario/vllm_k8s.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "vllm_k8s"
18 |
19 | [[Tests]]
20 | id = "dynamo.vllm"
21 | test_name = "vLLM-Qwen3-0.6B"
22 |
23 | [Tests.cmd_args]
24 | [Tests.cmd_args.dynamo]
25 | [Tests.cmd_args.dynamo.prefill_worker]
26 | num-nodes = 1
27 | tensor-parallel-size = 8
28 | [Tests.cmd_args.dynamo.decode_worker]
29 | num-nodes = 1
30 | tensor-parallel-size = 8
31 |
--------------------------------------------------------------------------------
/conf/common/test/dse_nccl_all_gather.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "dse_nccl_all_gather"
18 | description = "all_gather"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "all_gather_perf_mpi"
24 | "ngpus" = "1"
25 | "minbytes" = "128"
26 | "maxbytes" = "4G"
27 | "iters" = "100"
28 | "warmup_iters" = ["5", "50"]
29 |
30 | [extra_cmd_args]
31 | "--stepfactor" = "2"
32 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_alltoall"
18 | description = "alltoall"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "alltoall_perf_mpi"
24 | minbytes = "1k"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | check = "0"
28 | warmup_iters = "20"
29 |
30 | [extra_cmd_args]
31 | "--stepfactor" = "2"
32 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_sendrecv.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_sendrecv"
18 | description = "sendrecv"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "sendrecv_perf_mpi"
24 | minbytes = "1K"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | check = "0"
28 | warmup_iters = "20"
29 |
30 | [extra_cmd_args]
31 | "--stepfactor" = "2"
32 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_reduce.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_all_reduce"
18 | description = "all_reduce"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "all_reduce_perf_mpi"
24 | minbytes = "1K"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | check = "0"
28 | warmup_iters = "20"
29 |
30 | [extra_cmd_args]
31 | "--stepfactor" = "2"
32 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_bisection.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_bisection"
18 | description = "nccl_test_bisection"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "bisection_perf_mpi"
24 | ngpus = "1"
25 | minbytes = "128"
26 | maxbytes = "4G"
27 | iters = "100"
28 | warmup_iters = "50"
29 |
30 | [extra_cmd_args]
31 | "--stepfactor" = "2"
32 |
--------------------------------------------------------------------------------
/src/cloudai/systems/lsf/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | from .lsf_command_gen_strategy import LSFCommandGenStrategy
19 | from .lsf_installer import LSFInstaller
20 | from .lsf_job import LSFJob
21 | from .lsf_node import LSFNode
22 | from .lsf_runner import LSFRunner
23 | from .lsf_system import LSFSystem
24 |
25 | __all__ = [
26 | "LSFCommandGenStrategy",
27 | "LSFInstaller",
28 | "LSFJob",
29 | "LSFNode",
30 | "LSFRunner",
31 | "LSFSystem",
32 | ]
33 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_sendrecv_worst.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_sendrecv_worst"
18 | description = "sendrecv"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "sendrecv_perf_mpi"
24 | minbytes = "1K"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | iters = "60"
28 | check = "0"
29 | warmup_iters = "40"
30 |
31 | [extra_cmd_args]
32 | "--stepfactor" = "2"
33 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall_worst.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_alltoall_worst"
18 | description = "alltoall_tested_job"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "alltoall_perf_mpi"
24 | minbytes = "1k"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | iters = "60"
28 | check = "0"
29 | warmup_iters = "40"
30 |
31 | [extra_cmd_args]
32 | "--stepfactor" = "2"
33 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/megatron_run/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .megatron_run import MegatronRunCmdArgs, MegatronRunTestDefinition
18 | from .report_generation_strategy import CheckpointTimingReportGenerationStrategy
19 | from .slurm_command_gen_strategy import MegatronRunSlurmCommandGenStrategy
20 |
21 | __all__ = [
22 | "CheckpointTimingReportGenerationStrategy",
23 | "MegatronRunCmdArgs",
24 | "MegatronRunSlurmCommandGenStrategy",
25 | "MegatronRunTestDefinition",
26 | ]
27 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_reduce_worst.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_all_reduce_worst"
18 | description = "all_reduce_worst_alloc_test"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "all_reduce_perf_mpi"
24 | minbytes = "1K"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | iters = "60"
28 | check = "0"
29 | warmup_iters = "40"
30 |
31 | [extra_cmd_args]
32 | "--stepfactor" = "2"
33 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/nixl_perftest/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .nixl_perftest import MatgenCmdArgs, NixlPerftestCmdArgs, NixlPerftestTestDefinition
18 | from .report_generation_strategy import NIXLKVBenchDummyReport
19 | from .slurm_command_gen_strategy import NixlPerftestSlurmCommandGenStrategy
20 |
21 | __all__ = [
22 | "MatgenCmdArgs",
23 | "NIXLKVBenchDummyReport",
24 | "NixlPerftestCmdArgs",
25 | "NixlPerftestSlurmCommandGenStrategy",
26 | "NixlPerftestTestDefinition",
27 | ]
28 |
--------------------------------------------------------------------------------
/conf/common/system/kubernetes_cluster.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "kubernetes-cluster"
18 | scheduler = "kubernetes"
19 | kube_config_path = ""
20 |
21 | install_path = "./install"
22 | output_path = "./results"
23 | default_namespace = "default"
24 | monitor_interval = 1
25 |
26 | [global_env_vars]
27 | NCCL_IB_GID_INDEX = "3"
28 | NCCL_SOCKET_IFNAME = "ib0"
29 | NCCL_IB_HCA = "mlx5_0"
30 | UCX_NET_DEVICES = "mlx5_0:1"
31 | NCCL_P2P_LEVEL = "PIX"
32 | UCX_TLS = "rc_x,sm,cuda_copy"
33 | NCCL_IB_TC = "96"
34 |
--------------------------------------------------------------------------------
/conf/hook/test/nccl_test_all_gather.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_all_gather"
18 | description = "all_gather"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | "subtest_name" = "all_gather_perf_mpi"
24 | "ngpus" = "1"
25 | "minbytes" = "128"
26 | "maxbytes" = "4G"
27 | "iters" = "100"
28 | "warmup_iters" = "50"
29 |
30 | [extra_cmd_args]
31 | "--stepfactor" = "2"
32 |
33 | [extra_env_vars]
34 | "NCCL_TESTS_SPLIT_MASK" = "0x7"
35 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall_worst_failover.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_alltoall_worst_failover"
18 | description = "alltoall_tested_job"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "alltoall_perf_mpi"
24 | minbytes = "128M"
25 | maxbytes = "128M"
26 | stepbytes = "0"
27 | iters = "20"
28 | check = "0"
29 | warmup_iters = "40"
30 |
31 | [extra_cmd_args]
32 | "--stepfactor" = "1"
33 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/aiconfig/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .aiconfigurator import AiconfiguratorCmdArgs, AiconfiguratorTestDefinition
18 | from .report_generation_strategy import AiconfiguratorReportGenerationStrategy
19 | from .standalone_command_gen_strategy import AiconfiguratorStandaloneCommandGenStrategy
20 |
21 | __all__ = [
22 | "AiconfiguratorCmdArgs",
23 | "AiconfiguratorReportGenerationStrategy",
24 | "AiconfiguratorStandaloneCommandGenStrategy",
25 | "AiconfiguratorTestDefinition",
26 | ]
27 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/nixl_perftest/report_generation_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from typing import ClassVar
18 |
19 | from cloudai.core import ReportGenerationStrategy
20 |
21 |
class NIXLKVBenchDummyReport(ReportGenerationStrategy):
    """Placeholder report strategy whose only purpose is to expose a "default" metric, which sweeps require."""

    # The single metric name advertised by this strategy.
    metrics: ClassVar[list[str]] = ["default"]

    def can_handle_directory(self) -> bool:
        """Return True unconditionally; nothing about the directory is inspected."""
        return True

    def generate_report(self) -> None:
        """Intentionally produce no report output."""
        return None
32 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/ucc_test/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .grading_strategy import UCCTestGradingStrategy
18 | from .report_generation_strategy import UCCTestReportGenerationStrategy
19 | from .slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
20 | from .ucc import UCCCmdArgs, UCCTestDefinition
21 |
22 | __all__ = [
23 | "UCCCmdArgs",
24 | "UCCTestDefinition",
25 | "UCCTestGradingStrategy",
26 | "UCCTestReportGenerationStrategy",
27 | "UCCTestSlurmCommandGenStrategy",
28 | ]
29 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/triton_inference/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | from .report_generation_strategy import TritonInferenceReportGenerationStrategy
19 | from .slurm_command_gen_strategy import TritonInferenceSlurmCommandGenStrategy
20 | from .triton_inference import TritonInferenceCmdArgs, TritonInferenceTestDefinition
21 |
22 | __all__ = [
23 | "TritonInferenceCmdArgs",
24 | "TritonInferenceReportGenerationStrategy",
25 | "TritonInferenceSlurmCommandGenStrategy",
26 | "TritonInferenceTestDefinition",
27 | ]
28 |
--------------------------------------------------------------------------------
/conf/common/system/example_runai_cluster.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "example-runai-cluster"
18 | scheduler = "runai"
19 |
20 | install_path = "./install_dir"
21 | output_path = "./results"
22 | monitor_interval = 1
23 |
24 | base_url = "http://runai.example.com"
25 | user_email = "your_email"
26 | app_id = "your_app_id"
27 | app_secret = "your_app_secret"
28 | project_id = "your_project_id"
29 | cluster_id = "your_cluster_id"
30 |
31 | [global_env_vars]
32 | NCCL_IB_GID_INDEX = "3"
33 | NCCL_IB_TIMEOUT = "20"
34 | NCCL_IB_QPS_PER_CONNECTION = "4"
35 |
--------------------------------------------------------------------------------
/conf/release/nemo_acceptance/test/nccl_test_all_reduce_loopback.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_all_reduce_loopback"
18 | description = "all_reduce"
19 | test_template_name = "NcclTest"
20 |
21 | [extra_env_vars]
22 | NCCL_P2P_DISABLE = "1"
23 | NCCL_SHM_DISABLE = "1"
24 |
25 | [cmd_args]
26 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
27 | "subtest_name" = "all_reduce_perf_mpi"
28 | "ngpus" = "1"
29 | "minbytes" = "8M"
30 | "maxbytes" = "16G"
31 | "iters" = "5"
32 | "warmup_iters" = "3"
33 |
34 | [extra_cmd_args]
35 | "--stepfactor" = "2"
36 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/nixl_bench/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .nixl_bench import NIXLBenchCmdArgs, NIXLBenchTestDefinition
18 | from .nixl_summary_report import NIXLBenchComparisonReport
19 | from .report_generation_strategy import NIXLBenchReportGenerationStrategy
20 | from .slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy
21 |
22 | __all__ = [
23 | "NIXLBenchCmdArgs",
24 | "NIXLBenchComparisonReport",
25 | "NIXLBenchReportGenerationStrategy",
26 | "NIXLBenchSlurmCommandGenStrategy",
27 | "NIXLBenchTestDefinition",
28 | ]
29 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/sleep/sleep.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | from cloudai.core import Installable
19 | from cloudai.models.workload import CmdArgs, TestDefinition
20 |
21 |
class SleepCmdArgs(CmdArgs):
    """Command arguments accepted by the Sleep test."""

    # Container image reference; defaults to "ubuntu:22.04".
    docker_image_url: str = "ubuntu:22.04"
    # Sleep duration; defaults to 5 (unit is seconds, per the field name).
    seconds: int = 5
27 |
28 |
class SleepTestDefinition(TestDefinition):
    """Test definition for the Sleep workload; its arguments are a ``SleepCmdArgs``."""

    cmd_args: SleepCmdArgs

    @property
    def installables(self) -> list[Installable]:
        """Return a new empty list: this test declares nothing to install."""
        nothing_to_install: list[Installable] = []
        return nothing_to_install
37 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce_scatter.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_reduce_scatter"
18 | description = "reduce_scatter"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "reduce_scatter_perf_mpi"
24 | minbytes = "1K"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | check = "0"
28 | warmup_iters = "20"
29 |
30 | [extra_cmd_args]
31 | "--stepfactor" = "2"
32 |
33 | [extra_env_vars]
34 | "NCCL_TESTS_SPLIT_MASK" = "0x7"
35 | "NCCL_MIN_NCHANNELS" = "12"
36 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_gather.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_all_gather"
18 | description = "all_gather"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "all_gather_perf_mpi"
24 | minbytes = "1K"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | check = "0"
28 | warmup_iters = "20"
29 |
30 | [extra_cmd_args]
31 | "--stepfactor" = "2"
32 |
33 | [extra_env_vars]
34 | "NCCL_TESTS_SPLIT_MASK" = "0x7"
35 | "NCCL_MIN_NCHANNELS" = "12" # only for low number of servers
36 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/chakra_replay/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .chakra_replay import ChakraReplayCmdArgs, ChakraReplayTestDefinition
18 | from .grading_strategy import ChakraReplayGradingStrategy
19 | from .report_generation_strategy import ChakraReplayReportGenerationStrategy
20 | from .slurm_command_gen_strategy import ChakraReplaySlurmCommandGenStrategy
21 |
22 | __all__ = [
23 | "ChakraReplayCmdArgs",
24 | "ChakraReplayGradingStrategy",
25 | "ChakraReplayReportGenerationStrategy",
26 | "ChakraReplaySlurmCommandGenStrategy",
27 | "ChakraReplayTestDefinition",
28 | ]
29 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/nemo_launcher/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .grading_strategy import NeMoLauncherGradingStrategy
18 | from .nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
19 | from .report_generation_strategy import NeMoLauncherReportGenerationStrategy
20 | from .slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
21 |
22 | __all__ = [
23 | "NeMoLauncherCmdArgs",
24 | "NeMoLauncherGradingStrategy",
25 | "NeMoLauncherReportGenerationStrategy",
26 | "NeMoLauncherSlurmCommandGenStrategy",
27 | "NeMoLauncherTestDefinition",
28 | ]
29 |
--------------------------------------------------------------------------------
/src/cloudai/report_generator/tool/report_tool_interface.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from abc import ABC, abstractmethod
18 | from pathlib import Path
19 |
20 |
class ReportToolInterface(ABC):
    """Abstract contract for report tools; a concrete tool must implement how a report is finalized."""

    @abstractmethod
    def finalize_report(self, output_filename: Path) -> None:
        """
        Complete the report and store it under the given filename.

        Args:
            output_filename (Path): Location the finished report is written to.
        """
33 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_gather_worst.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_all_gather_worst"
18 | description = "all_gather"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "all_gather_perf_mpi"
24 | minbytes = "1K"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | iters = "60"
28 | check = "0"
29 | warmup_iters = "40"
30 |
31 | [extra_cmd_args]
32 | "--stepfactor" = "2"
33 |
34 | [extra_env_vars]
35 | "NCCL_TESTS_SPLIT_MASK" = "0x7"
36 | "NCCL_MIN_NCHANNELS" = "12" # only for low number of servers
37 |
--------------------------------------------------------------------------------
/conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce_scatter_worst.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nccl_test_reduce_scatter_worst"
18 | description = "reduce_scatter"
19 | test_template_name = "NcclTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
23 | subtest_name = "reduce_scatter_perf_mpi"
24 | minbytes = "1K"
25 | maxbytes = "16G"
26 | stepbytes = "0"
27 | iters = "60"
28 | check = "0"
29 | warmup_iters = "40"
30 |
31 | [extra_cmd_args]
32 | "--stepfactor" = "2"
33 |
34 | [extra_env_vars]
35 | "NCCL_TESTS_SPLIT_MASK" = "0x7"
36 | "NCCL_MIN_NCHANNELS" = "12" # only for low number of servers
37 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/sleep/standalone_command_gen_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from typing import cast
18 |
19 | from cloudai.core import CommandGenStrategy
20 |
21 | from .sleep import SleepCmdArgs, SleepTestDefinition
22 |
23 |
class SleepStandaloneCommandGenStrategy(CommandGenStrategy):
    """Builds the command that runs the Sleep test on a standalone system."""

    def gen_exec_command(self) -> str:
        """Return ``sleep <seconds>``, with the duration read from the test's command arguments."""
        test_definition = cast(SleepTestDefinition, self.test_run.test)
        cmd_args: SleepCmdArgs = test_definition.cmd_args
        return f"sleep {cmd_args.seconds}"
32 |
--------------------------------------------------------------------------------
/conf/common/test_scenario/slurm_container.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "slurm-container"
18 |
19 | [[Tests]]
20 | id = "nccl.alltoall"
21 | num_nodes = 2
22 | time_limit = "00:20:00"
23 |
24 | name = "nccl-alltoall"
25 | description = "NCCL alltoall via SlurmContainer"
26 | test_template_name = "SlurmContainer"
27 |
28 | [Tests.cmd_args]
29 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
30 | cmd = "alltoall_perf_mpi --nthreads 1 --ngpus 1 --minbytes 128 --maxbytes 4G --stepbytes 1M --op sum --datatype float --root 0 --iters 100 --warmup_iters 50 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 --stepfactor 2"
31 |
--------------------------------------------------------------------------------
/conf/experimental/aiconfigurator/test/aiconfigurator_disagg.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "aiconfigurator_disagg_demo"
18 | description = "Aiconfigurator disaggregated predictor demo"
19 | test_template_name = "Aiconfigurator"
20 |
21 | [cmd_args]
22 | model_name = "LLAMA3.1_70B"
23 | system = "h100_sxm"
24 | # backend and version use defaults
25 | isl = 3000
26 | osl = 150
27 |
28 | [cmd_args.disagg]
29 | p_tp = 4
30 | p_pp = 1
31 | p_dp = 1
32 | p_bs = 1
33 | p_workers = 1
34 |
35 | d_tp = 4
36 | d_pp = 1
37 | d_dp = 1
38 | d_bs = 256
39 | d_workers = 1
40 |
41 | prefill_correction_scale = 1.0
42 | decode_correction_scale = 1.0
43 |
--------------------------------------------------------------------------------
/conf/experimental/test/ddlb_test.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "ddlb_test"
18 | description = "DDLB test configuration"
19 | test_template_name = "DDLBTest"
20 |
21 | [cmd_args]
22 | docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest"
23 | primitive = "tp_columnwise"
24 | m = [1024, 8192]
25 | n = 128
26 | k = 1024
27 | dtype = "float16"
28 | num_iterations = 50
29 | num_warmups = 10
30 | # Specify exactly one configuration per --impl entry; for example, do not put "order=AG_before,AG_after" in a single impl string.
31 | impl = [
32 | "pytorch;backend=nccl;order=AG_before",
33 | "fuser;algorithm=p2p_pipeline;backend=cuda;order=AG_before",
34 | ]
35 |
--------------------------------------------------------------------------------
/src/cloudai/_core/job_status_result.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
class JobStatusResult:
    """
    Encapsulates the result of a job status retrieval.

    Attributes:
        is_successful (bool): Indicates if the job was successful.
        error_message (str): Error message if the job was not successful; defaults to an empty string.
    """

    def __init__(self, is_successful: bool, error_message: str = "") -> None:
        self.is_successful = is_successful
        self.error_message = error_message

    def __repr__(self) -> str:
        # Quote the fields with !r so an empty or whitespace-only message stays visible
        # when the object is logged, shown in a debugger, or printed inside a container.
        return f"{type(self).__name__}(is_successful={self.is_successful!r}, error_message={self.error_message!r})"

    def __str__(self) -> str:
        # Human-readable form; the message is intentionally left unquoted, matching the existing output format.
        return f"JobStatusResult(is_successful={self.is_successful}, error_message={self.error_message})"
33 |
--------------------------------------------------------------------------------
/conf/common/test_scenario/ucc_generator_test.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | name = "ucc_generator_perftest"
19 |
20 | [[Tests]]
21 | id = "Tests.alltoallv"
22 | num_nodes = 2
23 | time_limit = "00:02:00"
24 |
25 | name = "ucc_generator_perftest"
26 | description = "UCC alltoallv"
27 | test_template_name = "UCCTest"
28 | extra_container_mounts = [
29 | "$PWD/conf/common/test_scenario/ucc_generator_file.txt:/opt/hpcx/ucc/tools/perf/generator/input_matrices.txt",
30 | ]
31 |
32 | [Tests.cmd_args]
33 | docker_image_url = "nvcr.io/nvidia/pytorch:25.09-py3"
34 | collective = "alltoallv"
35 | gen = "file:name=/opt/hpcx/ucc/tools/perf/generator/input_matrices.txt"
36 |
--------------------------------------------------------------------------------
/tests/ref_data/slurm_container.sbatch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # generated by CloudAI@__CLOUDAI_VERSION__
3 | #SBATCH --job-name=__JOB_NAME__
4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6 | #SBATCH --partition=main
7 | #SBATCH -N 1
8 | #SBATCH --gpus-per-node=8
9 | #SBATCH --gres=gpu:8
10 |
11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
12 |
13 | srun --export=ALL --mpi=pmix -N1 --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
14 |
15 | srun --export=ALL --mpi=pmix -N1 --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
16 |
17 | srun --export=ALL --mpi=pmix -N1 --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; pwd ; ls"
18 |
--------------------------------------------------------------------------------
/doc/workloads/sleep.rst:
--------------------------------------------------------------------------------
1 | Sleep
2 | =====
3 |
4 | This workload (``test_template_name`` is ``Sleep``) executes a simple sleep command for testing and timing purposes. It is useful for testing schedulers and system behavior.
5 |
6 | Usage Examples
7 | --------------
8 |
9 | Test TOML example:
10 |
11 | .. code-block:: toml
12 |
13 | name = "my_sleep_test"
14 | description = "Example Sleep test"
15 | test_template_name = "Sleep"
16 |
17 | [cmd_args]
18 | seconds = 30
19 |
20 | Test Scenario example:
21 |
22 | .. code-block:: toml
23 |
24 | name = "sleep-test"
25 |
26 | [[Tests]]
27 | id = "sleep.1"
28 | num_nodes = 1
29 | time_limit = "00:02:00"
30 |
31 | test_name = "my_sleep_test"
32 |
33 | Test-in-Scenario example:
34 |
35 | .. code-block:: toml
36 |
37 | name = "sleep-test"
38 |
39 | [[Tests]]
40 | id = "sleep.1"
41 | num_nodes = 1
42 | time_limit = "00:02:00"
43 |
44 | name = "my_sleep_test"
45 | description = "Example Sleep test"
46 | test_template_name = "Sleep"
47 |
48 | [Tests.cmd_args]
49 | seconds = 30
50 |
51 | API Documentation
52 | -----------------
53 |
54 | Command Arguments
55 | ~~~~~~~~~~~~~~~~~
56 |
57 | .. autoclass:: cloudai.workloads.sleep.sleep.SleepCmdArgs
58 | :members:
59 | :show-inheritance:
60 |
61 | Test Definition
62 | ~~~~~~~~~~~~~~~
63 |
64 | .. autoclass:: cloudai.workloads.sleep.sleep.SleepTestDefinition
65 | :members:
66 | :show-inheritance:
67 |
--------------------------------------------------------------------------------
/doc/workloads/ucc.rst:
--------------------------------------------------------------------------------
1 | UCC
2 | ===
3 |
4 | This workload (``test_template_name`` is ``UCCTest``) allows users to execute UCC benchmarks within the CloudAI framework.
5 |
6 | Usage Examples
7 | --------------
8 |
9 | Test TOML example:
10 |
11 | .. code-block:: toml
12 |
13 | name = "ucc"
14 | description = "Example UCC test"
15 | test_template_name = "UCCTest"
16 |
17 | [cmd_args]
18 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
19 |
20 | Test Scenario example:
21 |
22 | .. code-block:: toml
23 |
24 | name = "ucc-test"
25 |
26 | [[Tests]]
27 | id = "ucc.1"
28 | num_nodes = 1
29 | time_limit = "00:02:00"
30 |
31 | test_name = "ucc"
32 |
33 | Test-in-Scenario example:
34 |
35 | .. code-block:: toml
36 |
37 | name = "ucc-test"
38 |
39 | [[Tests]]
40 | id = "ucc.1"
41 | num_nodes = 1
42 | time_limit = "00:02:00"
43 |
44 | name = "ucc"
45 | description = "Example UCC test"
46 | test_template_name = "UCCTest"
47 |
48 | [Tests.cmd_args]
49 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
50 |
51 | API Documentation
52 | -----------------
53 |
54 | Command Arguments
55 | ~~~~~~~~~~~~~~~~~
56 |
57 | .. autoclass:: cloudai.workloads.ucc_test.ucc.UCCCmdArgs
58 | :members:
59 | :show-inheritance:
60 |
61 | Test Definition
62 | ~~~~~~~~~~~~~~~
63 |
64 | .. autoclass:: cloudai.workloads.ucc_test.ucc.UCCTestDefinition
65 | :members:
66 | :show-inheritance:
67 |
--------------------------------------------------------------------------------
/src/cloudai/_core/grading_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from abc import ABC, abstractmethod
18 | from pathlib import Path
19 |
20 |
class GradingStrategy(ABC):
    """Abstract base for strategies that grade a test's performance."""

    @abstractmethod
    def grade(self, directory_path: Path, ideal_perf: float) -> float:
        """
        Compute a grade for a test's performance.

        Args:
            directory_path (Path): Directory that holds the test's output.
            ideal_perf (float): Ideal performance value used for comparison.

        Returns:
            float: Grade calculated from the performance.
        """
37 |
--------------------------------------------------------------------------------
/tests/ref_data/nemo-launcher.sbatch:
--------------------------------------------------------------------------------
1 | VAR="$(scontrol show hostname \"${SLURM_STEP_NODELIST}\" | head -n1)" \
2 | __OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022-venv/bin/python \
3 | __OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts/main.py \
4 | cluster.gpus_per_node=8 \
5 | numa_mapping.enable=True \
6 | stages=["training"] \
7 | training.exp_manager.create_checkpoint_callback=False \
8 | training.model.data.data_impl=mock \
9 | training.model.data.data_prefix=[] \
10 | training.model.global_batch_size=128 \
11 | training.model.micro_batch_size=2 \
12 | training.model.pipeline_model_parallel_size=4 \
13 | training.model.tensor_model_parallel_size=4 \
14 | training.run.name=run \
15 | training.run.time_limit=3:00:00 \
16 | training.trainer.enable_checkpointing=False \
17 | training.trainer.log_every_n_steps=1 \
18 | training.trainer.max_steps=20 \
19 | training.trainer.val_check_interval=10 \
20 | training=gpt3/40b_improved \
21 | cluster.partition=main \
22 | training.trainer.num_nodes=1 \
23 | container=nvcr.io/nvidia/nemo:24.12.01 \
24 | cluster.job_name_prefix=test_account-cloudai.nemo: \
25 | base_results_dir=__OUTPUT_DIR__/output \
26 | launcher_scripts_path=__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts \
27 | +env_vars.VAR="$(scontrol show hostname \"${SLURM_STEP_NODELIST}\" | head -n1)"
28 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/sleep/slurm_command_gen_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from typing import List, cast
18 |
19 | from cloudai.systems.slurm import SlurmCommandGenStrategy
20 |
21 | from .sleep import SleepCmdArgs, SleepTestDefinition
22 |
23 |
class SleepSlurmCommandGenStrategy(SlurmCommandGenStrategy):
    """Produces the Slurm test command for the Sleep workload."""

    def _container_mounts(self) -> list[str]:
        """Return an empty list: this strategy contributes no container mounts of its own."""
        no_mounts: list[str] = []
        return no_mounts

    def generate_test_command(self) -> List[str]:
        """Return a one-element command list of the form ``sleep <seconds>``."""
        sleep_test = cast(SleepTestDefinition, self.test_run.test)
        args: SleepCmdArgs = sleep_test.cmd_args
        duration = args.seconds
        return [f"sleep {duration}"]
34 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/nemo_run/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .data_store_report_generation_strategy import NeMoRunDataStoreReportGenerationStrategy
18 | from .nemo_run import Data, Log, LogCkpt, NeMoRunCmdArgs, NeMoRunTestDefinition, Trainer, TrainerStrategy
19 | from .report_generation_strategy import NeMoRunReportGenerationStrategy
20 | from .slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
21 |
22 | __all__ = [
23 | "Data",
24 | "Log",
25 | "LogCkpt",
26 | "NeMoRunCmdArgs",
27 | "NeMoRunDataStoreReportGenerationStrategy",
28 | "NeMoRunReportGenerationStrategy",
29 | "NeMoRunSlurmCommandGenStrategy",
30 | "NeMoRunTestDefinition",
31 | "Trainer",
32 | "TrainerStrategy",
33 | ]
34 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/sleep/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .grading_strategy import SleepGradingStrategy
18 | from .kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
19 | from .lsf_command_gen_strategy import SleepLSFCommandGenStrategy
20 | from .sleep import SleepCmdArgs, SleepTestDefinition
21 | from .slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
22 | from .standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy
23 |
24 | __all__ = [
25 | "SleepCmdArgs",
26 | "SleepGradingStrategy",
27 | "SleepKubernetesJsonGenStrategy",
28 | "SleepLSFCommandGenStrategy",
29 | "SleepSlurmCommandGenStrategy",
30 | "SleepStandaloneCommandGenStrategy",
31 | "SleepTestDefinition",
32 | ]
33 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # Distribution / packaging
7 | .Python
8 | build/
9 | develop-eggs/
10 | dist/
11 | downloads/
12 | eggs/
13 | .eggs/
14 | lib/
15 | lib64/
16 | parts/
17 | sdist/
18 | var/
19 | wheels/
20 | share/python-wheels/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | doc/_build
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Jupyter Notebook
37 | .ipynb_checkpoints
38 |
39 | # IPython
40 | profile_default/
41 | ipython_config.py
42 |
43 | # pyenv
44 | .python-version
45 |
46 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
47 | __pypackages__/
48 |
49 | # Environments
50 | .env
51 | .venv
52 | env/
53 | venv/
54 | ENV/
55 | env.bak/
56 | venv.bak/
57 |
58 | # mypy
59 | .mypy_cache/
60 | .dmypy.json
61 | dmypy.json
62 |
63 | # pytype static type analyzer
64 | .pytype/
65 |
66 | # pycharm
67 | .idea/
68 |
69 | # VSCode
70 | .vscode/
71 |
72 | # Editors and IDEs
73 | *.swp
74 | *.bak
75 | *.tmp
76 | *~
77 | *.sublime-project
78 | *.sublime-workspace
79 |
80 | # OS generated files
81 | .DS_Store
82 | .DS_Store?
83 | ._*
84 | .Spotlight-V100
85 | .Trashes
86 | ehthumbs.db
87 | Thumbs.db
88 |
89 | *.log
90 | install/
91 | results/
92 | .*
93 | .cloudai.toml
94 |
--------------------------------------------------------------------------------
/doc/workloads/nemo_run.rst:
--------------------------------------------------------------------------------
1 | Nemo Run
2 | ========
3 |
4 | This workload (``test_template_name`` is ``NemoRun``) executes NeMo training and fine-tuning tasks using the NeMo Run framework.
5 |
6 | Usage Examples
7 | --------------
8 |
9 | Test TOML example:
10 |
11 | .. code-block:: toml
12 |
13 | name = "my_nemo_test"
14 | description = "Example NeMo Run test"
15 | test_template_name = "NemoRun"
16 |
17 | [cmd_args]
18 | recipe = "llama3_8b"
19 | task = "pretrain"
20 |
21 | Test Scenario example:
22 |
23 | .. code-block:: toml
24 |
25 | name = "nemo-run-test"
26 |
27 | [[Tests]]
28 | id = "nemo.1"
29 | num_nodes = 4
30 | time_limit = "02:00:00"
31 |
32 | test_name = "my_nemo_test"
33 |
34 | Test-in-Scenario example:
35 |
36 | .. code-block:: toml
37 |
38 | name = "nemo-run-test"
39 |
40 | [[Tests]]
41 | id = "nemo.1"
42 | num_nodes = 4
43 | time_limit = "02:00:00"
44 |
45 | name = "my_nemo_test"
46 | description = "Example NeMo Run test"
47 | test_template_name = "NemoRun"
48 |
49 | [Tests.cmd_args]
50 | recipe = "llama3_8b"
51 | task = "pretrain"
52 |
53 | API Documentation
54 | -----------------
55 |
56 | Command Arguments
57 | ~~~~~~~~~~~~~~~~~
58 |
59 | .. autoclass:: cloudai.workloads.nemo_run.nemo_run.NeMoRunCmdArgs
60 | :members:
61 | :show-inheritance:
62 |
63 | Test Definition
64 | ~~~~~~~~~~~~~~~
65 |
66 | .. autoclass:: cloudai.workloads.nemo_run.nemo_run.NeMoRunTestDefinition
67 | :members:
68 | :show-inheritance:
69 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/sleep/grading_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from pathlib import Path
18 |
19 | from cloudai.core import GradingStrategy
20 |
21 |
class SleepGradingStrategy(GradingStrategy):
    """Grading strategy for the Sleep workload; every run receives the same fixed grade."""

    def grade(self, directory_path: Path, ideal_perf: float) -> float:
        """
        Return the grade for a Sleep test run.

        Neither argument is inspected by this implementation; the result is always ``100.0``.

        Args:
            directory_path (Path): Path to the directory containing the test's output (unused).
            ideal_perf (float): The ideal performance value for comparison (unused).

        Returns:
            float: The constant grade ``100.0``.
        """
        fixed_grade = 100.0
        return fixed_grade
37 |
--------------------------------------------------------------------------------
/tests/ref_data/ucc.sbatch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # generated by CloudAI@__CLOUDAI_VERSION__
3 | #SBATCH --job-name=__JOB_NAME__
4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6 | #SBATCH --partition=main
7 | #SBATCH -N 1
8 | #SBATCH --gpus-per-node=8
9 | #SBATCH --gres=gpu:8
10 |
11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
12 |
13 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
14 |
15 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
16 |
17 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; /opt/hpcx/ucc/bin/ucc_perftest -c alltoall -b 1 -e 8M -m cuda -F"
18 |
--------------------------------------------------------------------------------
/src/cloudai/_core/report_generation_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from abc import ABC, abstractmethod
18 | from typing import ClassVar
19 |
20 | from .system import System
21 | from .test_scenario import TestRun
22 |
23 |
class ReportGenerationStrategy(ABC):
    """
    Base interface for strategies that generate a report from a TestRun.

    Concrete strategies must implement ``can_handle_directory`` and ``generate_report``.

    Attributes:
        metrics: Metric names declared at class level; the base class declares only ``"default"``.
        system: The system object given to the constructor.
        test_run: The test run given to the constructor.
    """

    metrics: ClassVar[list[str]] = ["default"]

    def __init__(self, system: System, tr: TestRun) -> None:
        """
        Keep references to the system and the test run for later use.

        Args:
            system (System): The system stored as ``self.system``.
            tr (TestRun): The test run stored as ``self.test_run``.
        """
        self.test_run = tr
        self.system = system

    def get_metric(self, metric: str) -> float:
        """
        Return the value of the requested metric.

        The base implementation ignores ``metric`` and always returns ``0.0``;
        subclasses are presumably expected to override it (confirm against implementations).

        Args:
            metric (str): Name of the metric to look up.

        Returns:
            float: ``0.0`` in the base class.
        """
        return 0.0

    @abstractmethod
    def can_handle_directory(self) -> bool:
        """Return whether this strategy applies; the exact criteria are defined by each subclass."""
        ...

    @abstractmethod
    def generate_report(self) -> None:
        """Generate the report; how and where it is produced is defined by each subclass."""
        ...
41 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/chakra_replay/grading_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from pathlib import Path
18 |
19 | from cloudai.core import GradingStrategy
20 |
21 |
class ChakraReplayGradingStrategy(GradingStrategy):
    """Grading strategy for the ChakraReplay workload; every run receives the same fixed grade."""

    def grade(self, directory_path: Path, ideal_perf: float) -> float:
        """
        Return the grade for a ChakraReplay test run.

        Neither argument is inspected by this implementation; the result is always ``100.0``.

        Args:
            directory_path (Path): Path to the directory containing the test's output (unused).
            ideal_perf (float): The ideal performance value for comparison (unused).

        Returns:
            float: The constant grade ``100.0``.
        """
        fixed_grade = 100.0
        return fixed_grade
37 |
--------------------------------------------------------------------------------
/tests/ref_data/osu-bench.sbatch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # generated by CloudAI@__CLOUDAI_VERSION__
3 | #SBATCH --job-name=__JOB_NAME__
4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6 | #SBATCH --partition=main
7 | #SBATCH -N 1
8 | #SBATCH --gpus-per-node=8
9 | #SBATCH --gres=gpu:8
10 |
11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
12 |
13 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
14 |
15 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
16 |
17 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; /opt/hpcx/ompi/tests/osu-micro-benchmarks/osu_allreduce --message-size 1024 --iterations 10 --full"
18 |
--------------------------------------------------------------------------------
/src/cloudai/systems/runai/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .runai_cluster import RunAICluster
18 | from .runai_event import RunAIEvent
19 | from .runai_installer import RunAIInstaller
20 | from .runai_job import RunAIJob
21 | from .runai_node import RunAINode
22 | from .runai_project import RunAIProject
23 | from .runai_pvc import RunAIPVC
24 | from .runai_rest_client import RunAIRestClient
25 | from .runai_runner import RunAIRunner
26 | from .runai_system import RunAISystem
27 | from .runai_training import RunAITraining
28 |
29 | __all__ = [
30 | "RunAICluster",
31 | "RunAIEvent",
32 | "RunAIInstaller",
33 | "RunAIJob",
34 | "RunAINode",
35 | "RunAIPVC",
36 | "RunAIProject",
37 | "RunAIRestClient",
38 | "RunAIRunner",
39 | "RunAISystem",
40 | "RunAITraining",
41 | ]
42 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/ai_dynamo/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .ai_dynamo import (
18 | AIDynamoArgs,
19 | AIDynamoCmdArgs,
20 | AIDynamoTestDefinition,
21 | DecodeWorkerArgs,
22 | GenAIPerfArgs,
23 | PrefillWorkerArgs,
24 | )
25 | from .kubernetes_json_gen_strategy import AIDynamoKubernetesJsonGenStrategy
26 | from .report_generation_strategy import AIDynamoReportGenerationStrategy
27 | from .slurm_command_gen_strategy import AIDynamoSlurmCommandGenStrategy
28 |
29 | __all__ = [
30 | "AIDynamoArgs",
31 | "AIDynamoCmdArgs",
32 | "AIDynamoKubernetesJsonGenStrategy",
33 | "AIDynamoReportGenerationStrategy",
34 | "AIDynamoSlurmCommandGenStrategy",
35 | "AIDynamoTestDefinition",
36 | "DecodeWorkerArgs",
37 | "GenAIPerfArgs",
38 | "PrefillWorkerArgs",
39 | ]
40 |
--------------------------------------------------------------------------------
/doc/workloads/nccl.rst:
--------------------------------------------------------------------------------
1 | NCCL
2 | ====
3 |
4 | This workload (``test_template_name`` is ``NcclTest``) allows users to execute NCCL benchmarks within the CloudAI framework.
5 |
6 | Usage Example
7 | -------------
8 |
9 | Test TOML example:
10 |
11 | .. code-block:: toml
12 |
13 | name = "my_nccl_test"
14 | description = "Example NCCL test"
15 | test_template_name = "NcclTest"
16 |
17 | [cmd_args]
18 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
19 |
20 | Test Scenario example:
21 |
22 | .. code-block:: toml
23 |
24 | name = "nccl-test"
25 |
26 | [[Tests]]
27 | id = "nccl.1"
28 | num_nodes = 1
29 | time_limit = "00:05:00"
30 |
31 | test_name = "my_nccl_test"
32 |
33 | Test-in-Scenario example:
34 |
35 | .. code-block:: toml
36 |
37 | name = "nccl-test"
38 |
39 | [[Tests]]
40 | id = "nccl.1"
41 | num_nodes = 1
42 | time_limit = "00:05:00"
43 |
44 | name = "my_nccl_test"
45 | description = "Example NCCL test"
46 | test_template_name = "NcclTest"
47 |
48 | [Tests.cmd_args]
49 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3"
50 | subtest_name = "all_reduce_perf_mpi"
51 | iters = 100
52 |
53 | API Documentation
54 | ---------------------------------
55 |
56 | Command Arguments
57 | ~~~~~~~~~~~~~~~~~
58 |
59 | .. autoclass:: cloudai.workloads.nccl_test.nccl.NCCLCmdArgs
60 | :members:
61 | :show-inheritance:
62 |
63 | Test Definition
64 | ~~~~~~~~~~~~~~~
65 |
66 | .. autoclass:: cloudai.workloads.nccl_test.nccl.NCCLTestDefinition
67 | :members:
68 | :show-inheritance:
69 |
--------------------------------------------------------------------------------
/doc/workloads/chakra_replay.rst:
--------------------------------------------------------------------------------
1 | Chakra Replay
2 | =============
3 |
4 | This workload (``test_template_name`` is ``ChakraReplay``) replays execution traces from the Chakra execution trace format for performance analysis and debugging.
5 |
6 | Usage Examples
7 | --------------
8 |
9 | Test TOML example:
10 |
11 | .. code-block:: toml
12 |
13 | name = "my_chakra_test"
14 | description = "Example Chakra replay test"
15 | test_template_name = "ChakraReplay"
16 |
17 | [cmd_args]
18 | trace_path = "/path/to/trace.et"
19 |
20 | Test Scenario example:
21 |
22 | .. code-block:: toml
23 |
24 | name = "chakra-replay-test"
25 |
26 | [[Tests]]
27 | id = "chakra.1"
28 | num_nodes = 1
29 | time_limit = "00:10:00"
30 |
31 | test_name = "my_chakra_test"
32 |
33 | Test-in-Scenario example:
34 |
35 | .. code-block:: toml
36 |
37 | name = "chakra-replay-test"
38 |
39 | [[Tests]]
40 | id = "chakra.1"
41 | num_nodes = 1
42 | time_limit = "00:10:00"
43 |
44 | name = "my_chakra_test"
45 | description = "Example Chakra replay test"
46 | test_template_name = "ChakraReplay"
47 |
48 | [Tests.cmd_args]
49 | trace_path = "/path/to/trace.et"
50 |
51 | API Documentation
52 | -----------------
53 |
54 | Command Arguments
55 | ~~~~~~~~~~~~~~~~~
56 |
57 | .. autoclass:: cloudai.workloads.chakra_replay.chakra_replay.ChakraReplayCmdArgs
58 | :members:
59 | :show-inheritance:
60 |
61 | Test Definition
62 | ~~~~~~~~~~~~~~~
63 |
64 | .. autoclass:: cloudai.workloads.chakra_replay.chakra_replay.ChakraReplayTestDefinition
65 | :members:
66 | :show-inheritance:
67 |
--------------------------------------------------------------------------------
/src/cloudai/_core/base_system_parser.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from abc import ABC, abstractmethod
18 | from typing import Any, Dict
19 |
20 | from .system import System
21 |
22 |
class BaseSystemParser(ABC):
    """
    Interface for parsers that turn system configuration data into System objects.

    Subclasses must implement ``parse``; the class cannot be instantiated directly.
    """

    @abstractmethod
    def parse(self, data: Dict[str, Any]) -> System:
        """
        Parse configuration data and return a System object.

        The base implementation has no body beyond this docstring, so a subclass
        that delegates to it via ``super()`` receives ``None``.

        Args:
            data (Dict[str, Any]): The configuration data.

        Returns:
            System: A System object created from the configuration data.
        """
45 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/sleep/lsf_command_gen_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from typing import Dict, List, Union, cast
18 |
19 | from cloudai.core import TestRun
20 | from cloudai.systems.lsf import LSFCommandGenStrategy
21 |
22 | from .sleep import SleepCmdArgs, SleepTestDefinition
23 |
24 |
class SleepLSFCommandGenStrategy(LSFCommandGenStrategy):
    """LSF command generation strategy for the Sleep workload."""

    def _container_mounts(self, tr: TestRun) -> list[str]:
        """Return the container mounts for the test run; Sleep declares none, so the list is empty."""
        mounts: list[str] = []
        return mounts

    def generate_test_command(
        self, env_vars: Dict[str, Union[str, List[str]]], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
    ) -> List[str]:
        """
        Build the ``sleep`` command for the given test run.

        Args:
            env_vars (Dict[str, Union[str, List[str]]]): Environment variables (not used here).
            cmd_args (Dict[str, Union[str, List[str]]]): Command-line arguments (not used here; the
                duration is read from the test definition instead).
            tr (TestRun): Test run whose ``test`` is treated as a ``SleepTestDefinition``.

        Returns:
            List[str]: A single-element list holding ``"sleep <seconds>"``.
        """
        # cast() only informs the type checker; nothing is validated at runtime.
        sleep_args: SleepCmdArgs = cast(SleepTestDefinition, tr.test).cmd_args
        return [f"sleep {sleep_args.seconds}"]
37 |
--------------------------------------------------------------------------------
/conf/common/test_scenario/sleep.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "sleep-scenario"
18 |
19 | [[Tests]]
20 | id = "Tests.sleep1"
21 | time_limit = "00:01:00"
22 | test_name = "sleep"
23 | [Tests.cmd_args]
24 | seconds = 10
25 |
26 | [[Tests]]
27 | id = "Tests.sleep5"
28 | time_limit = "00:01:00"
29 | test_name = "sleep"
30 | [Tests.cmd_args]
31 | seconds = 5
32 | [[Tests.dependencies]]
33 | type = "start_post_init"
34 | id = "Tests.sleep1"
35 |
36 | [[Tests]]
37 | id = "Tests.sleep5_2"
38 | time_limit = "00:01:00"
39 | test_name = "sleep"
40 | [Tests.cmd_args]
41 | seconds = 5
42 | [[Tests.dependencies]]
43 | type = "start_post_comp"
44 | id = "Tests.sleep1"
45 |
46 | [[Tests]]
47 | id = "Tests.sleep20"
48 | time_limit = "00:01:00"
49 | test_name = "sleep"
50 | [Tests.cmd_args]
51 | seconds = 20
52 | [[Tests.dependencies]]
53 | type = "end_post_comp"
54 | id = "Tests.sleep1"
55 |
--------------------------------------------------------------------------------
/conf/experimental/test/deepep_low_latency.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "deepep_low_latency"
18 | description = "DeepEP MoE Benchmark - Low Latency Mode"
19 | test_template_name = "DeepEP"
20 |
21 | [cmd_args]
22 | # Local .sqsh file:
23 | # docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh"
24 | # Container registry:
25 | docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
26 |
27 | mode = "low_latency"
28 |
29 | tokens = 128
30 | num_experts = 256
31 | num_topk = 1
32 | hidden_size = 7168
33 | data_type = "bfloat16"
34 | allow_nvlink_for_low_latency = false
35 | allow_mnnvl = false
36 | round_scale = false
37 | use_ue8m0 = false
38 | num_warmups = 20
39 | num_iterations = 50
40 | shuffle_columns = false
41 | use_kineto_profiler = false
42 | config_file_path = "/tmp/config.yaml"
43 | results_dir = "/workspace/dp-benchmark/results"
44 |
--------------------------------------------------------------------------------
/tests/ref_data/ddlb.sbatch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # generated by CloudAI@__CLOUDAI_VERSION__
3 | #SBATCH --job-name=__JOB_NAME__
4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6 | #SBATCH --partition=main
7 | #SBATCH -N 1
8 | #SBATCH --gpus-per-node=8
9 | #SBATCH --gres=gpu:8
10 |
11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
12 |
13 | srun --export=ALL --mpi=pmix -N1 --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
14 |
15 | srun --export=ALL --mpi=pmix -N1 --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
16 |
17 | srun --export=ALL --mpi=pmix -N1 --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python ddlb/cli/benchmark.py --primitive tp_columnwise -m 1024 -n 128 -k 1024 --dtype float16 --num-iterations 50 --num-warmups 5 --impl pytorch;backend=nccl;order=AG_before"
18 |
--------------------------------------------------------------------------------
/conf/experimental/test/deepep_standard.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "deepep_standard"
18 | description = "DeepEP MoE Benchmark - Standard Mode"
19 | test_template_name = "DeepEP"
20 |
21 | [cmd_args]
22 | # Local .sqsh file:
23 | # docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh"
24 | # Container registry (uses your Docker credentials):
25 | docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
26 |
27 | mode = "standard"
28 |
29 | tokens = 1024
30 | num_experts = 256
31 | num_topk = 8
32 | hidden_size = 7168
33 | data_type = "bfloat16"
34 | allow_nvlink_for_low_latency = false
35 | allow_mnnvl = false
36 | round_scale = false
37 | use_ue8m0 = false
38 | num_warmups = 20
39 | num_iterations = 50
40 | shuffle_columns = false
41 | use_kineto_profiler = false
42 | config_file_path = "/tmp/config.yaml"
43 | results_dir = "/workspace/dp-benchmark/results"
44 |
--------------------------------------------------------------------------------
/tests/ref_data/nccl.sbatch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # generated by CloudAI@__CLOUDAI_VERSION__
3 | #SBATCH --job-name=__JOB_NAME__
4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6 | #SBATCH --partition=main
7 | #SBATCH -N 1
8 | #SBATCH --gpus-per-node=8
9 | #SBATCH --gres=gpu:8
10 |
11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
12 |
13 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
14 |
15 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
16 |
17 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0"
18 |
--------------------------------------------------------------------------------
/src/cloudai/systems/slurm/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .single_sbatch_runner import SingleSbatchRunner
18 | from .slurm_command_gen_strategy import SlurmCommandGenStrategy
19 | from .slurm_installer import SlurmInstaller
20 | from .slurm_job import SlurmJob
21 | from .slurm_metadata import SlurmJobMetadata, SlurmStepMetadata, SlurmSystemMetadata
22 | from .slurm_node import SlurmNode, SlurmNodeState
23 | from .slurm_runner import SlurmRunner
24 | from .slurm_system import SlurmGroup, SlurmPartition, SlurmSystem, parse_node_list
25 |
26 | __all__ = [
27 | "SingleSbatchRunner",
28 | "SlurmCommandGenStrategy",
29 | "SlurmGroup",
30 | "SlurmInstaller",
31 | "SlurmJob",
32 | "SlurmJobMetadata",
33 | "SlurmNode",
34 | "SlurmNodeState",
35 | "SlurmPartition",
36 | "SlurmRunner",
37 | "SlurmStepMetadata",
38 | "SlurmSystem",
39 | "SlurmSystemMetadata",
40 | "parse_node_list",
41 | ]
42 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/jax_toolbox/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .gpt import GPTCmdArgs, GPTTestDefinition
18 | from .grading_strategy import JaxToolboxGradingStrategy
19 | from .grok import GrokCmdArgs, GrokTestDefinition
20 | from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition
21 | from .nemotron import NemotronCmdArgs, NemotronTestDefinition
22 | from .report_generation_strategy import JaxToolboxReportGenerationStrategy
23 | from .slurm_command_gen_strategy import JaxToolboxSlurmCommandGenStrategy
24 |
25 | __all__ = [
26 | "GPTCmdArgs",
27 | "GPTTestDefinition",
28 | "GrokCmdArgs",
29 | "GrokTestDefinition",
30 | "JaxFdl",
31 | "JaxToolboxCmdArgs",
32 | "JaxToolboxGradingStrategy",
33 | "JaxToolboxReportGenerationStrategy",
34 | "JaxToolboxSlurmCommandGenStrategy",
35 | "JaxToolboxTestDefinition",
36 | "NemotronCmdArgs",
37 | "NemotronTestDefinition",
38 | ]
39 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: Build and Deploy Documentation
2 |
3 | on:
4 | push:
5 | branches: [ main ]
6 | workflow_dispatch: # Allow manual triggering
7 |
8 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
9 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
10 | concurrency:
11 | group: "pages"
12 | cancel-in-progress: false
13 |
14 | jobs:
15 | build:
16 | runs-on: ubuntu-latest
17 | steps:
18 | - name: Checkout
19 | uses: actions/checkout@v4
20 |
21 | - name: Install uv
22 | uses: astral-sh/setup-uv@v5
23 | with:
24 | enable-cache: true
25 |
26 | - name: Set up environment
27 | run: |
28 | set -eE
29 | set -o pipefail
30 |
31 | uv sync --extra docs
32 |
33 | - name: Build documentation
34 | run: |
35 | set -eE
36 | set -o pipefail
37 |
38 | source .venv/bin/activate
39 | cd doc
40 | make html
41 |
42 | - name: Add .nojekyll file
43 | run: touch doc/_build/html/.nojekyll
44 |
45 | - name: Upload artifact
46 | uses: actions/upload-pages-artifact@v3
47 | with:
48 | path: ./doc/_build/html
49 |
50 | deploy:
51 | name: Deploy to GitHub Pages
52 | needs: build
53 |
54 | permissions:
55 | pages: write
56 | id-token: write
57 |
58 | environment:
59 | name: github-pages
60 | url: ${{ steps.deployment.outputs.page_url }}
61 |
62 | runs-on: ubuntu-latest
63 | steps:
64 | - name: Deploy to GitHub Pages
65 | id: deployment
66 | uses: actions/deploy-pages@v4
--------------------------------------------------------------------------------
/doc/workloads/bash_cmd.rst:
--------------------------------------------------------------------------------
1 | Bash Command
2 | ============
3 |
4 | This workload (`test_template_name` is ``BashCmd``) allows users to execute arbitrary bash commands within the CloudAI framework. This is useful for simple scripts, custom testing commands, or integrating external tools.
5 |
6 | ``cmd`` specified in the ``cmd_args`` section will be added as-is into generated sbatch script.
7 |
8 | Usage Examples
9 | -------------
10 |
11 | Test TOML example:
12 |
13 | .. code-block:: toml
14 |
15 | name = "my_bash_test"
16 | description = "Example bash command test"
17 | test_template_name = "BashCmd"
18 |
19 | [cmd_args]
20 | cmd = "echo 'Hello from CloudAI!'"
21 |
22 | Test Scenario example:
23 |
24 | .. code-block:: toml
25 |
26 | name = "bash-test"
27 |
28 | [[Tests]]
29 | id = "bash.1"
30 | num_nodes = 1
31 | time_limit = "00:05:00"
32 |
33 | test_name = "my_bash_test"
34 |
35 | Test-in-Scenario example:
36 |
37 | .. code-block:: toml
38 |
39 | name = "bash-test"
40 |
41 | [[Tests]]
42 | id = "bash.1"
43 | num_nodes = 1
44 | time_limit = "00:05:00"
45 |
46 | name = "my_bash_test"
47 | description = "Example bash command test"
48 | test_template_name = "BashCmd"
49 |
50 | [Tests.cmd_args]
51 | cmd = "echo 'Hello from CloudAI!'"
52 |
53 | API Documentation
54 | ---------------------------------
55 |
56 | Command Arguments
57 | ~~~~~~~~~~~~~~~~~
58 |
59 | .. autoclass:: cloudai.workloads.bash_cmd.bash_cmd.BashCmdArgs
60 | :members:
61 | :show-inheritance:
62 |
63 | Test Definition
64 | ~~~~~~~~~~~~~~~
65 |
66 | .. autoclass:: cloudai.workloads.bash_cmd.bash_cmd.BashCmdTestDefinition
67 | :members:
68 | :show-inheritance:
69 |
--------------------------------------------------------------------------------
/doc/workloads/slurm_container.rst:
--------------------------------------------------------------------------------
1 | Slurm Container
2 | ===============
3 |
4 | This workload (`test_template_name` is ``SlurmContainer``) executes containerized workloads using Slurm with custom container configurations.
5 |
6 | Usage Examples
7 | -------------
8 |
9 | Test TOML example:
10 |
11 | .. code-block:: toml
12 |
13 | name = "my_container_test"
14 | description = "Example Slurm container test"
15 | test_template_name = "SlurmContainer"
16 |
17 | [cmd_args]
18 | image_path = "/path/to/container.sqsh"
19 | cmd = "python train.py"
20 |
21 | Test Scenario example:
22 |
23 | .. code-block:: toml
24 |
25 | name = "slurm-container-test"
26 |
27 | [[Tests]]
28 | id = "container.1"
29 | num_nodes = 2
30 | time_limit = "01:00:00"
31 |
32 | test_name = "my_container_test"
33 |
34 | Test-in-Scenario example:
35 |
36 | .. code-block:: toml
37 |
38 | name = "slurm-container-test"
39 |
40 | [[Tests]]
41 | id = "container.1"
42 | num_nodes = 2
43 | time_limit = "01:00:00"
44 |
45 | name = "my_container_test"
46 | description = "Example Slurm container test"
47 | test_template_name = "SlurmContainer"
48 |
49 | [Tests.cmd_args]
50 | image_path = "/path/to/container.sqsh"
51 | cmd = "python train.py"
52 |
53 | API Documentation
54 | -----------------
55 |
56 | Command Arguments
57 | ~~~~~~~~~~~~~~~~~
58 |
59 | .. autoclass:: cloudai.workloads.slurm_container.slurm_container.SlurmContainerCmdArgs
60 | :members:
61 | :show-inheritance:
62 |
63 | Test Definition
64 | ~~~~~~~~~~~~~~~
65 |
66 | .. autoclass:: cloudai.workloads.slurm_container.slurm_container.SlurmContainerTestDefinition
67 | :members:
68 | :show-inheritance:
69 |
--------------------------------------------------------------------------------
/doc/workloads/index.rst:
--------------------------------------------------------------------------------
1 | Workloads Documentation
2 | =======================
3 |
4 | This section contains automatically generated documentation for all CloudAI workloads. Each workload provides specific functionality for running different types of tests and benchmarks.
5 |
6 | Available Workloads
7 | -------------------
8 |
9 | .. csv-table::
10 | :header: "Test", "Slurm", "Kubernetes", "RunAI", "Standalone"
11 | :widths: 40, 15, 15, 15, 15
12 |
13 | ":doc:`aiconfigurator`", "❌", "❌", "❌", "✅"
14 | ":doc:`ai_dynamo`", "✅", "✅", "❌", "❌"
15 | ":doc:`bash_cmd`", "✅", "❌", "❌", "❌"
16 | ":doc:`chakra_replay`", "✅", "❌", "❌", "❌"
17 | ":doc:`ddlb`", "✅", "❌", "❌", "❌"
18 | ":doc:`deepep`", "✅", "❌", "❌", "❌"
19 | ":doc:`jax_toolbox`", "✅", "❌", "❌", "❌"
20 | "MegatronRun", "✅", "❌", "❌", "❌"
21 | ":doc:`nccl`", "✅", "✅", "✅", "❌"
22 | ":doc:`nemo_launcher`", "✅", "❌", "❌", "❌"
23 | ":doc:`nemo_run`", "✅", "❌", "❌", "❌"
24 | ":doc:`nixl_bench`", "✅", "❌", "❌", "❌"
25 | ":doc:`nixl_kvbench`", "✅", "❌", "❌", "❌"
26 | ":doc:`nixl_perftest`", "✅", "❌", "❌", "❌"
27 | ":doc:`sleep`", "✅", "✅", "❌", "✅"
28 | ":doc:`slurm_container`", "✅", "❌", "❌", "❌"
29 | "Triton Inference", "✅", "❌", "❌", "❌"
30 | ":doc:`ucc`", "✅", "❌", "❌", "❌"
31 |
32 | .. toctree::
33 | :hidden:
34 | :glob:
35 |
36 | *
37 |
38 | Adding New Workloads
39 | ---------------------
40 |
41 | To add documentation for a new workload:
42 |
43 | 1. Add docstrings to your Python classes and methods.
44 | 2. Create a reStructuredText file in ``doc/workloads/`` (e.g., ``my_workload.rst``).
45 | 3. Add it to the table above.
46 |
47 | The documentation will be automatically generated during the build process.
48 |
--------------------------------------------------------------------------------
/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_launcher_nemotron_15b_fp8"
18 |
19 | [[Tests]]
20 | id = "nemo_launcher_nemotron_15b_fp8_2_node"
21 | test_name = "nemo_launcher_nemotron_15b_fp8_2_node"
22 | num_nodes = "2"
23 |
24 | [[Tests]]
25 | id = "nemo_launcher_nemotron_15b_fp8_4_node"
26 | test_name = "nemo_launcher_nemotron_15b_fp8_4_node"
27 | num_nodes = "4"
28 |
29 | [[Tests]]
30 | id = "nemo_launcher_nemotron_15b_fp8_8_node"
31 | test_name = "nemo_launcher_nemotron_15b_fp8_8_node"
32 | num_nodes = "8"
33 |
34 | [[Tests]]
35 | id = "nemo_launcher_nemotron_15b_fp8_16_node"
36 | test_name = "nemo_launcher_nemotron_15b_fp8_16_node"
37 | num_nodes = "16"
38 |
39 | [[Tests]]
40 | id = "nemo_launcher_nemotron_15b_fp8_32_node"
41 | test_name = "nemo_launcher_nemotron_15b_fp8_32_node"
42 | num_nodes = "32"
43 |
44 | [[Tests]]
45 | id = "nemo_launcher_nemotron_15b_fp8_64_node"
46 | test_name = "nemo_launcher_nemotron_15b_fp8_64_node"
47 | num_nodes = "64"
48 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/nccl_test/prediction_report_generation_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | from cloudai.core import System, TestRun
19 |
20 | from .prediction_report_generator import NcclTestPredictionReportGenerator
21 | from .report_generation_strategy import NcclTestReportGenerationStrategy
22 |
23 |
class NcclTestPredictionReportGenerationStrategy(NcclTestReportGenerationStrategy):
    """Report strategy that delegates report creation to an NCCL prediction report generator."""

    def __init__(self, system: System, tr: TestRun) -> None:
        """
        Initialize the base strategy and build the prediction report generator.

        Args:
            system: System passed through to the parent strategy.
            tr: Test run whose ``test.cmd_args.subtest_name`` selects the collective type.
        """
        super().__init__(system, tr)

        subtest = tr.test.cmd_args.subtest_name
        self.prediction_report = NcclTestPredictionReportGenerator(
            self._normalize_collective_type(subtest),
            tr,
        )

    def _normalize_collective_type(self, subtest_name: str) -> str:
        """Return *subtest_name* with every "_perf" and then every "_mpi" occurrence removed."""
        normalized = subtest_name
        # Order matters only for exotic names; keep "_perf" first to mirror the chained replace.
        for marker in ("_perf", "_mpi"):
            normalized = normalized.replace(marker, "")
        return normalized

    def generate_report(self) -> None:
        """Generate the prediction report via the wrapped generator."""
        self.prediction_report.generate()
38 |
--------------------------------------------------------------------------------
/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "dynamo-vllm-slurm"
18 | job_status_check = false
19 |
20 | [[Tests]]
21 | id = "test.disagg.single-node"
22 | test_name = "vLLM-Qwen3-0.6B"
23 | num_nodes = 2 # 1 prefill node + 1 decode node
24 | time_limit = "00:10:00"
25 |
26 | [Tests.cmd_args.dynamo.prefill_worker]
27 | num-nodes = 1
28 | tensor-parallel-size = 4
29 | pipeline-parallel-size = 1
30 |
31 | [Tests.cmd_args.dynamo.decode_worker]
32 | num-nodes = 1
33 | tensor-parallel-size = 4
34 | pipeline-parallel-size = 1
35 |
36 | [[Tests]]
37 | id = "test.disagg.multinode"
38 | test_name = "vLLM-Qwen3-0.6B"
39 | num_nodes = 4 # 2 prefill nodes + 2 decode nodes
40 | time_limit = "00:10:00"
41 |
42 | [Tests.cmd_args.dynamo.prefill_worker]
43 | num-nodes = 2
44 | tensor-parallel-size = 4
45 | pipeline-parallel-size = 1
46 |
47 | [Tests.cmd_args.dynamo.decode_worker]
48 | num-nodes = 2
49 | tensor-parallel-size = 4
50 | pipeline-parallel-size = 1
51 |
--------------------------------------------------------------------------------
/conf/experimental/aiconfigurator/test/dse_aiconfigurator_disagg.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "dse_aiconfigurator_disagg_demo_Llama3.1_70B"
18 | description = "Aiconfigurator disaggregated predictor DSE sweeps"
19 | test_template_name = "Aiconfigurator"
20 | agent_metrics = [
21 | "ttft_ms",
22 | "tpot_ms",
23 | "tokens_per_s_per_gpu",
24 | "tokens_per_s_per_user",
25 | ]
26 | agent_reward_function = "ai_dynamo_log_scale"
27 |
28 |
29 | [cmd_args]
30 | model_name = "LLAMA3.1_70B"
31 | system = "h200_sxm"
32 | # backend and version use defaults
33 | isl = 4000
34 | osl = 500
35 |
36 | [cmd_args.disagg]
37 | p_tp = [1]
38 | p_pp = [1]
39 | p_dp = [1]
40 | p_bs = 1
41 | p_workers = [1, 2]
42 |
43 | d_tp = [1]
44 | d_pp = [1]
45 | d_dp = [1]
46 | d_bs = [8]
47 | d_workers = [1, 2]
48 |
49 | gemm_quant_mode = "fp8_block"
50 | moe_quant_mode = "fp8"
51 | kvcache_quant_mode = "fp8"
52 | fmha_quant_mode = "fp8"
53 | comm_quant_mode = "half"
54 | prefill_correction_scale = 1.0
55 | decode_correction_scale = 1.0
56 |
--------------------------------------------------------------------------------
/tests/ref_data/megatron-run.sbatch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # generated by CloudAI@__CLOUDAI_VERSION__
3 | #SBATCH --job-name=__JOB_NAME__
4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6 | #SBATCH --partition=main
7 | #SBATCH -N 1
8 | #SBATCH --gpus-per-node=8
9 | #SBATCH --gres=gpu:8
10 |
11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
12 |
13 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,$PWD --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
14 |
15 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,$PWD --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
16 |
17 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,$PWD bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python __CLOUDAI_DIR__/run.py --global-batch-size 16 --hidden-size 4096 --max-position-embeddings 4096 --num-attention-heads 32 --num-layers 32 --pipeline-model-parallel-size 1 --recompute-activations --seq-length 4096 --tensor-model-parallel-size 2 --save __CLOUDAI_DIR__ --load __CLOUDAI_DIR__ --tokenizer-model __CLOUDAI_DIR__/model.m"
18 |
--------------------------------------------------------------------------------
/src/cloudai/util/nixl_report_template.jinja2:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
9 |
10 |
45 | {{ bokeh_script | safe }}
46 |
47 |
48 |
49 | {{ title }}
50 |
51 |
52 |
Interactive Charts
53 |
Use the interactive tools to zoom, pan, and hover over data points. Click on legend items
54 | to show/hide lines.
55 | {{ bokeh_div | safe }}
56 |
57 |
58 |
59 | {{ rich_html | safe }}
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/nccl_test/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .grading_strategy import NcclTestGradingStrategy
18 | from .kubernetes_json_gen_strategy import NcclTestKubernetesJsonGenStrategy
19 | from .nccl import NCCLCmdArgs, NCCLTestDefinition
20 | from .nccl_comparison_report import ComparisonReportConfig, NcclComparisonReport
21 | from .performance_report_generation_strategy import NcclTestPerformanceReportGenerationStrategy
22 | from .prediction_report_generation_strategy import NcclTestPredictionReportGenerationStrategy
23 | from .runai_json_gen_strategy import NcclTestRunAIJsonGenStrategy
24 | from .slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy
25 |
26 | __all__ = [
27 | "ComparisonReportConfig",
28 | "NCCLCmdArgs",
29 | "NCCLTestDefinition",
30 | "NcclComparisonReport",
31 | "NcclTestGradingStrategy",
32 | "NcclTestKubernetesJsonGenStrategy",
33 | "NcclTestPerformanceReportGenerationStrategy",
34 | "NcclTestPredictionReportGenerationStrategy",
35 | "NcclTestRunAIJsonGenStrategy",
36 | "NcclTestSlurmCommandGenStrategy",
37 | ]
38 |
--------------------------------------------------------------------------------
/src/cloudai/util/base-report.jinja2:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | {{ name }}
5 |
51 | {% block extra_head %}{% endblock %}
52 |
53 |
54 | {{ name }}
55 | {% block content %}{% endblock %}
56 |
57 |
--------------------------------------------------------------------------------
/conf/common/test/nemo_run_llama3_8b.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_run_llama3_8b"
18 | description = "nemo_run_llama3_8b"
19 | test_template_name = "NeMoRun"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/nemo:25.09.00"
23 | task = "pretrain"
24 | recipe_name = "cloudai_llama3_8b_recipe"
25 |
26 | [cmd_args.data]
27 | seq_length = 8192
28 | micro_batch_size = 1
29 | global_batch_size = 128
30 |
31 | [cmd_args.trainer]
32 | max_steps = 10
33 | val_check_interval = 100
34 | num_nodes = 1
35 |
36 | [cmd_args.trainer.strategy]
37 | tensor_model_parallel_size = 4
38 | pipeline_model_parallel_size = 1
39 | context_parallel_size = 1
40 |
41 | [extra_env_vars]
42 | NCCL_P2P_NET_CHUNKSIZE = "2097152"
43 | TORCHX_MAX_RETRIES = "0"
44 | TRANSFORMERS_OFFLINE = "0"
45 | NCCL_NVLS_ENABLE = "0"
46 | NVTE_DP_AMAX_REDUCE_INTERVAL = "0"
47 | NVTE_ASYNC_AMAX_REDUCTION = "1"
48 | NVTE_FUSED_ATTN = "1"
49 | NVTE_FLASH_ATTN = "1"
50 | NEMO_LOG_MEMORY_USAGE = "1"
51 | CUDA_DEVICE_MAX_CONNECTIONS = "1"
52 | NVTE_FWD_LAYERNORM_SM_MARGIN = "16"
53 | NVTE_BWD_LAYERNORM_SM_MARGIN = "16"
54 |
--------------------------------------------------------------------------------
/conf/release/nemo_acceptance/test/nemo_run_llama3_8b.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "nemo_run_llama3_8b"
18 | description = "dse_nemo_run_llama3_8b"
19 | test_template_name = "NeMoRun"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io#nvidia/nemo:25.07"
23 | task = "pretrain"
24 | recipe_name = "cloudai_llama3_8b_recipe"
25 |
26 | [cmd_args.data]
27 | seq_length = 8192
28 | micro_batch_size = 1
29 | global_batch_size = 128
30 |
31 | [cmd_args.trainer]
32 | max_steps = 30
33 | val_check_interval = 30
34 | num_nodes = 1
35 |
36 | [cmd_args.trainer.strategy]
37 | tensor_model_parallel_size = 4
38 | pipeline_model_parallel_size = 1
39 | context_parallel_size = 2
40 |
41 | [extra_env_vars]
42 | NCCL_P2P_NET_CHUNKSIZE = "2097152"
43 | TORCHX_MAX_RETRIES = "0"
44 | TRANSFORMERS_OFFLINE = "0"
45 | NCCL_NVLS_ENABLE = "0"
46 | NVTE_DP_AMAX_REDUCE_INTERVAL = "0"
47 | NVTE_ASYNC_AMAX_REDUCTION = "1"
48 | NVTE_FUSED_ATTN = "1"
49 | NVTE_FLASH_ATTN = "1"
50 | NEMO_LOG_MEMORY_USAGE = "1"
51 | CUDA_DEVICE_MAX_CONNECTIONS = "1"
52 | NVTE_FWD_LAYERNORM_SM_MARGIN = "16"
53 | NVTE_BWD_LAYERNORM_SM_MARGIN = "16"
54 |
--------------------------------------------------------------------------------
/conf/common/system/example_slurm_cluster.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "example-cluster"
18 | scheduler = "slurm"
19 |
20 | install_path = "./install_dir"
21 | output_path = "./results"
22 | default_partition = "partition_1"
23 |
24 | mpi = "pmix"
25 | gpus_per_node = 8
26 | ntasks_per_node = 8
27 |
28 | [[partitions]]
29 | name = "partition_1"
30 |
31 | [[partitions.groups]]
32 | name = "group_1"
33 | nodes = ["node-[001-025]"]
34 |
35 | [[partitions.groups]]
36 | name = "group_2"
37 | nodes = ["node-[026-050]"]
38 |
39 | [[partitions.groups]]
40 | name = "group_3"
41 | nodes = ["node-[051-075]"]
42 |
43 | [[partitions.groups]]
44 | name = "group_4"
45 | nodes = ["node-[076-100]"]
46 |
47 | [[partitions]]
48 | name = "partition_2"
49 |
50 | [data_repository]
51 | endpoint = "MY_ENDPOINT"
52 | verify_certs = false
53 |
54 | [global_env_vars]
55 | # NCCL Specific Configurations
56 | NCCL_IB_GID_INDEX = "3"
57 | NCCL_IB_TIMEOUT = "20"
58 | NCCL_IB_QPS_PER_CONNECTION = "4"
59 |
60 | # Device Visibility Configuration
61 | MELLANOX_VISIBLE_DEVICES = "0,3,4,5,6,9,10,11"
62 | CUDA_VISIBLE_DEVICES = "0,1,2,3,4,5,6,7"
63 |
--------------------------------------------------------------------------------
/doc/workloads/nixl_kvbench.rst:
--------------------------------------------------------------------------------
1 | NIXL KVBench
2 | ============
3 |
4 | This workload (``test_template_name`` is ``NIXLKVBench``) runs NIXL KV-cache benchmarking for key-value store performance testing.
5 |
6 | Usage Examples
7 | --------------
8 |
9 | Test TOML example:
10 |
11 | .. code-block:: toml
12 |
13 | name = "my_nixl_kvbench_test"
14 | description = "Example NIXL KVBench test"
15 | test_template_name = "NIXLKVBench"
16 |
17 | [cmd_args]
18 | docker_image_url = "..."
19 | model = "./examples/model_deepseek_r1.yaml"
20 | model_config = "./examples/block-tp1-pp16.yaml"
21 | backend = "POSIX"
22 | num_requests = 1
23 | source = "file"
24 | num_iter = 16
25 | page_size = 256
26 | filepath = "/data"
27 |
28 | Test Scenario example:
29 |
30 | .. code-block:: toml
31 |
32 | name = "nixl-kvbench-test"
33 |
34 | [[Tests]]
35 | id = "kvbench.1"
36 | num_nodes = 1
37 | time_limit = "00:10:00"
38 |
39 | test_name = "my_nixl_kvbench_test"
40 |
41 | Test-in-Scenario example:
42 |
43 | .. code-block:: toml
44 |
45 | name = "nixl-kvbench-test"
46 |
47 | [[Tests]]
48 | id = "kvbench.1"
49 | num_nodes = 1
50 | time_limit = "00:10:00"
51 |
52 | name = "my_nixl_kvbench_test"
53 | description = "Example NIXL KVBench test"
54 | test_template_name = "NIXLKVBench"
55 |
56 | [Tests.cmd_args]
57 | docker_image_url = "..."
58 | backend = "UCX"
59 | source = "memory"
60 | op_type = "READ"
61 |
62 | API Documentation
63 | -----------------
64 |
65 | Command Arguments
66 | ~~~~~~~~~~~~~~~~~
67 |
68 | .. autoclass:: cloudai.workloads.nixl_kvbench.nixl_kvbench.NIXLKVBenchCmdArgs
69 | :members:
70 | :show-inheritance:
71 |
72 | Test Definition
73 | ~~~~~~~~~~~~~~~
74 |
75 | .. autoclass:: cloudai.workloads.nixl_kvbench.nixl_kvbench.NIXLKVBenchTestDefinition
76 | :members:
77 | :show-inheritance:
78 |
--------------------------------------------------------------------------------
/doc/workloads/nixl_bench.rst:
--------------------------------------------------------------------------------
1 | NIXL Bench
2 | ==========
3 |
4 | This workload (``test_template_name`` is ``NIXLBench``) runs the NIXL benchmarking suite for network and interconnect performance testing.
5 |
6 | Usage Examples
7 | --------------
8 |
9 | Test TOML example:
10 |
11 | .. code-block:: toml
12 |
13 | name = "my_nixl_bench_test"
14 | description = "Example NIXL Bench test"
15 | test_template_name = "NIXLBench"
16 |
17 | [cmd_args]
18 | docker_image_url = "..."
19 | path_to_benchmark = "/workspace/nixlbench/build/nixlbench"
20 | backend = "UCX"
21 | initiator_seg_type = "VRAM"
22 | target_seg_type = "VRAM"
23 | op_type = "READ"
24 |
25 | Test Scenario example:
26 |
27 | .. code-block:: toml
28 |
29 | name = "nixl-bench-test"
30 |
31 | [[Tests]]
32 | id = "bench.1"
33 | num_nodes = 1
34 | time_limit = "00:10:00"
35 |
36 | test_name = "my_nixl_bench_test"
37 |
38 | Test-in-Scenario example:
39 |
40 | .. code-block:: toml
41 |
42 | name = "nixl-bench-test"
43 |
44 | [[Tests]]
45 | id = "bench.1"
46 | num_nodes = 1
47 | time_limit = "00:10:00"
48 |
49 | name = "my_nixl_bench_test"
50 | description = "Example NIXL Bench test"
51 | test_template_name = "NIXLBench"
52 |
53 | [Tests.cmd_args]
54 | docker_image_url = "..."
55 | path_to_benchmark = "/workspace/nixlbench/build/nixlbench"
56 | backend = "UCX"
57 | initiator_seg_type = "DRAM"
58 | target_seg_type = "DRAM"
59 | op_type = "WRITE"
60 |
61 | API Documentation
62 | -----------------
63 |
64 | Command Arguments
65 | ~~~~~~~~~~~~~~~~~
66 |
67 | .. autoclass:: cloudai.workloads.nixl_bench.nixl_bench.NIXLBenchCmdArgs
68 | :members:
69 | :show-inheritance:
70 |
71 | Test Definition
72 | ~~~~~~~~~~~~~~~
73 |
74 | .. autoclass:: cloudai.workloads.nixl_bench.nixl_bench.NIXLBenchTestDefinition
75 | :members:
76 | :show-inheritance:
77 |
--------------------------------------------------------------------------------
/doc/workloads/ddlb.rst:
--------------------------------------------------------------------------------
1 | DDLB
2 | ====
3 |
4 | This workload (``test_template_name`` is ``DDLB``) allows users to execute DDLB (Distributed Deep Learning Benchmarks) within the CloudAI framework. Please find the DDLB README at https://github.com/samnordmann/ddlb.
5 |
6 | Usage Examples
7 | --------------
8 |
9 | Test TOML example:
10 |
11 | .. code-block:: toml
12 |
13 | name = "my_ddlb_test"
14 | description = "Example DDLB test"
15 | test_template_name = "DDLB"
16 |
17 | [cmd_args]
18 | docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest"
19 | primitive = "tp_columnwise"
20 | dtype = "float16"
21 |
22 | Test Scenario example:
23 |
24 | .. code-block:: toml
25 |
26 | name = "ddlb-test"
27 |
28 | [[Tests]]
29 | id = "ddlb.1"
30 | num_nodes = 1
31 | time_limit = "00:10:00"
32 |
33 | test_name = "my_ddlb_test"
34 |
35 | Test-in-Scenario example:
36 |
37 | .. code-block:: toml
38 |
39 | name = "ddlb-test"
40 |
41 | [[Tests]]
42 | id = "ddlb.1"
43 | num_nodes = 1
44 | time_limit = "00:10:00"
45 |
46 | name = "my_ddlb_test"
47 | description = "Example DDLB test"
48 | test_template_name = "DDLB"
49 |
50 | [Tests.cmd_args]
51 | docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest"
52 | primitive = "tp_columnwise"
53 | m = 1024
54 | n = 128
55 | k = 1024
56 | dtype = "float16"
57 | num_iterations = 50
58 | num_warmups = 5
59 | impl = "pytorch;backend=nccl;order=AG_before"
60 |
61 | API Documentation
62 | ---------------------------------
63 |
64 | Command Arguments
65 | ~~~~~~~~~~~~~~~~~
66 |
67 | .. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBCmdArgs
68 | :members:
69 | :show-inheritance:
70 |
71 | Test Definition
72 | ~~~~~~~~~~~~~~~
73 |
74 | .. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBTestDefinition
75 | :members:
76 | :show-inheritance:
77 |
78 |
--------------------------------------------------------------------------------
/doc/workloads/osu.rst:
--------------------------------------------------------------------------------
1 | OSU
2 | ===
3 |
4 | This workload (``test_template_name`` is ``OSUBench``) allows you to execute OSU Micro Benchmarks
5 | within the CloudAI framework.
6 |
7 | Usage example
8 | -------------
9 |
10 | Test example:
11 |
12 | .. code-block:: toml
13 |
14 | name = "osu_example"
15 | test_template_name = "OSUBench"
16 | description = "OSU Benchmark example"
17 |
18 | [cmd_args]
19 | "docker_image_url" = "docker-image-with-osu-benchmark:latest"
20 | "benchmarks_dir" = "/directory/with/osu/binaries/in/container"
21 | "benchmark" = ["osu_allreduce", "osu_allgather"]
22 | "iterations" = 10
23 | "message_size" = "1024"
24 |
25 | Test Scenario example:
26 |
27 | .. code-block:: toml
28 |
29 | name = "osu_example"
30 |
31 | [[Tests]]
32 | id = "Tests.1"
33 | test_name = "osu_example"
34 | num_nodes = "2"
35 | time_limit = "00:20:00"
36 |
37 | Test-in-Scenario example:
38 |
39 | .. code-block:: toml
40 |
41 | name = "osu-test"
42 |
43 | [[Tests]]
44 | id = "Tests.osu_allreduce"
45 | num_nodes = 2
46 | time_limit = "00:05:00"
47 |
48 | name = "osu_example"
49 | description = "OSU allreduce 1KB"
50 | test_template_name = "OSUBench"
51 |
52 | [Tests.cmd_args]
53 | docker_image_url = "docker-image-with-osu-benchmark:latest"
54 | benchmarks_dir = "/directory/with/osu/binaries/in/container"
55 | benchmark = "osu_allreduce"
56 | iterations = 10
57 | message_size = "1024"
58 |
59 | API Documentation
60 | -----------------
61 |
62 | Command Arguments
63 | ~~~~~~~~~~~~~~~~~
64 |
65 | .. autoclass:: cloudai.workloads.osu_bench.osu_bench.OSUBenchCmdArgs
66 | :members:
67 | :show-inheritance:
68 |
69 | Test Definition
70 | ~~~~~~~~~~~~~~~
71 |
72 | .. autoclass:: cloudai.workloads.osu_bench.osu_bench.OSUBenchTestDefinition
73 | :members:
74 | :show-inheritance:
75 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/nccl_test/report_generation_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import re
18 |
19 | from cloudai.core import ReportGenerationStrategy, System, TestRun
20 |
21 |
class NcclTestReportGenerationStrategy(ReportGenerationStrategy):
    """Shared base for report generation strategies that consume NCCL test output."""

    def __init__(self, system: System, tr: TestRun) -> None:
        super().__init__(system, tr)

    def can_handle_directory(self) -> bool:
        """
        Tell whether the test run's ``stdout.txt`` looks like NCCL test output.

        Returns:
            bool: True when ``stdout.txt`` exists in the run's output directory and contains both an
                ``out-of-place``/``in-place`` marker and the results-table column header; False otherwise.
        """
        stdout_path = self.test_run.output_path / "stdout.txt"
        if not stdout_path.exists():
            return False

        with stdout_path.open("r") as stream:
            text = stream.read()

        # Cheap marker check first; the column-header check only runs when it succeeds.
        if re.search(r"out-of-place|in-place", text) is None:
            return False

        header_pattern = (
            r"\b(size\s+count\s+type\s+redop\s+root\s+"
            r"time\s+algbw\s+busbw\s+#wrong\s+time\s+"
            r"algbw\s+busbw\s+#wrong)\b"
        )
        return re.search(header_pattern, text, re.IGNORECASE) is not None
44 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/megatron_run/slurm_command_gen_strategy.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | from typing import cast
19 |
20 | from cloudai.systems.slurm import SlurmCommandGenStrategy
21 |
22 | from .megatron_run import MegatronRunTestDefinition
23 |
24 |
class MegatronRunSlurmCommandGenStrategy(SlurmCommandGenStrategy):
    """Slurm command generation strategy for the MegatronRun workload."""

    def image_path(self) -> str | None:
        """Return the installed path of the test definition's Docker image as a string."""
        tdef = cast(MegatronRunTestDefinition, self.test_run.test)
        installed_path = tdef.docker_image.installed_path
        return str(installed_path)

    def _container_mounts(self) -> list[str]:
        """Return the workload-specific container mounts; this strategy adds none."""
        return []

    def generate_test_command(self) -> list[str]:
        """
        Build the command that launches the Megatron run script.

        Returns:
            list[str]: ``python``, the absolute run-script path, one ``"<key> <value>"`` entry per item of
                ``cmd_args_dict`` and, when extra command arguments are set, the extra-arguments string.
        """
        tdef = cast(MegatronRunTestDefinition, self.test_run.test)

        script = str(tdef.cmd_args.run_script.absolute())
        cmd = ["python", script]
        cmd.extend(f"{name} {value}" for name, value in tdef.cmd_args_dict.items())

        # ``tdef`` is the same object as ``self.test_run.test``; ``cast`` only narrows the static type.
        if tdef.extra_cmd_args:
            cmd.append(tdef.extra_args_str)

        return cmd
48 |
--------------------------------------------------------------------------------
/conf/common/test_scenario/ucc_test.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "ucc_test"
18 |
19 | [[Tests]]
20 | id = "Tests.alltoall"
21 | test_name = "ucc_base_test"
22 | description = "UCC alltoall"
23 | time_limit = "00:20:00"
24 | num_nodes = 2
25 | [Tests.cmd_args]
26 | collective = "alltoall"
27 |
28 | [[Tests]]
29 | id = "Tests.allgather"
30 | test_name = "ucc_base_test"
31 | description = "UCC allgather"
32 | time_limit = "00:20:00"
33 | num_nodes = 2
34 | [Tests.cmd_args]
35 | collective = "allgather"
36 | [[Tests.dependencies]]
37 | type = "start_post_comp"
38 | id = "Tests.alltoall"
39 |
40 | [[Tests]]
41 | id = "Tests.allreduce"
42 | test_name = "ucc_base_test"
43 | description = "UCC allreduce"
44 | time_limit = "00:20:00"
45 | num_nodes = 2
46 | [Tests.cmd_args]
47 | collective = "allreduce"
48 | e = "4G"
49 | [[Tests.dependencies]]
50 | type = "start_post_comp"
51 | id = "Tests.allgather"
52 |
53 | [[Tests]]
54 | id = "Tests.reduce_scatter"
55 | test_name = "ucc_base_test"
56 | description = "UCC reduce_scatter"
57 | time_limit = "00:20:00"
58 | num_nodes = 2
59 | [Tests.cmd_args]
60 | collective = "reduce_scatter"
61 | [[Tests.dependencies]]
62 | type = "start_post_comp"
63 | id = "Tests.allreduce"
64 |
--------------------------------------------------------------------------------
/conf/experimental/ai_dynamo/test/vllm.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "vLLM-Qwen3-0.6B"
18 | description = "vLLM backend with Qwen3-0.6B model"
19 | test_template_name = "AIDynamo"
20 |
21 | [cmd_args]
22 | docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.0"
23 |
24 | [cmd_args.dynamo]
25 | backend = "vllm"
26 | model = "Qwen/Qwen3-0.6B"
27 | workspace-path = "/workspace/examples/backends/vllm"
28 | prefill-cmd = 'python3 -m dynamo.vllm --is-prefill-worker'
29 | decode-cmd = 'python3 -m dynamo.vllm'
30 |
31 | [cmd_args.dynamo.decode_worker]
32 | pipeline-parallel-size = 1
33 |
34 | [cmd_args.genai_perf]
35 | model = "Qwen/Qwen3-0.6B"
36 | endpoint = "v1/chat/completions"
37 | endpoint-type = "chat"
38 | extra-inputs = 'min_tokens:10'
39 | output-tokens-mean = 500
40 | output-tokens-stddev = 0
41 | random-seed = 123
42 | request-count = 50
43 | synthetic-input-tokens-mean = 300
44 | synthetic-input-tokens-stddev = 0
45 | warmup-request-count = 5
46 | concurrency = 2
47 | extra-args = "--streaming -- -v --async"
48 |
49 | [extra_env_vars]
50 | UCX_LOG_LEVEL = "warn"
51 | UCX_TLS = "cuda_copy,rc_x"
52 | DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
53 |
--------------------------------------------------------------------------------
/src/cloudai/workloads/chakra_replay/chakra_replay.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from typing import Optional
18 |
19 | from cloudai.core import CmdArgs, DockerImage, Installable, TestDefinition
20 |
21 |
class ChakraReplayCmdArgs(CmdArgs):
    """
    ChakraReplay test command arguments.

    Only ``docker_image_url`` is required; every other field has a default value.
    """

    # URL of the container image; ChakraReplayTestDefinition wraps it in a DockerImage.
    docker_image_url: str
    # NOTE(review): presumably the MPI plugin handed to the launcher - confirm against the command generator.
    mpi: str = "pmix"
    # NOTE(review): presumably the trace format identifier - confirm which values the replay tool accepts.
    trace_type: str = "et"
    # Optional trace location; None when not provided.
    trace_path: Optional[str] = None
    # NOTE(review): presumably how many times the trace is replayed - confirm against the command generator.
    num_replays: int = 1
30 |
31 |
class ChakraReplayTestDefinition(TestDefinition):
    """Test definition for the ChakraReplay workload."""

    cmd_args: ChakraReplayCmdArgs
    _docker_image: Optional[DockerImage] = None

    @property
    def docker_image(self) -> DockerImage:
        """Return the Docker image for ``cmd_args.docker_image_url``, creating it on first access."""
        image = self._docker_image
        if image:
            return image

        image = DockerImage(url=self.cmd_args.docker_image_url)
        self._docker_image = image
        return image

    @property
    def installables(self) -> list[Installable]:
        """Return the items to install for this test: only the Docker image."""
        return [self.docker_image]

    @property
    def extra_args_str(self) -> str:
        """Return the extra command arguments joined by spaces; a falsy value emits the bare key."""
        return " ".join(f"{name} {value}" if value else name for name, value in self.extra_cmd_args.items())
54 |
--------------------------------------------------------------------------------
/tests/test_job_type_handler.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import pytest
18 |
19 | from cloudai.core import TestRun
20 | from cloudai.systems.slurm.slurm_system import SlurmSystem
21 | from cloudai.workloads.nccl_test import NCCLCmdArgs, NCCLTestDefinition
22 |
23 |
@pytest.fixture
def tr(slurm_system: SlurmSystem) -> TestRun:
    """Provide a single-node TestRun wrapping a minimal NCCL test definition."""
    cmd_args = NCCLCmdArgs(docker_image_url="fake://url/nccl")
    test_definition = NCCLTestDefinition(
        name="nccl",
        description="NCCL Test",
        test_template_name="NcclTest",
        cmd_args=cmd_args,
    )
    return TestRun(
        name="test_run",
        test=test_definition,
        num_nodes=1,
        nodes=[],
        output_path=slurm_system.output_path,
    )
33 |
34 |
def test_is_dse_job_non_dse(tr: TestRun):
    """A test run with only scalar settings is not reported as a DSE job."""
    is_dse = tr.is_dse_job
    assert is_dse is False
37 |
38 |
def test_is_dse_job_dse_args(tr: TestRun):
    """A list-valued command argument makes the run a DSE job even when env vars are scalar."""
    test_definition = tr.test
    test_definition.cmd_args.nthreads = [1, 2]
    test_definition.extra_env_vars = {"VAR1": "singular"}

    is_dse = tr.is_dse_job
    assert is_dse is True
43 |
44 |
def test_is_dse_job_dse_env_vars(tr: TestRun):
    """A list-valued environment variable makes the run a DSE job."""
    env_vars = {"VAR1": ["list-item1", "list-item2"], "VAR2": "singular3"}
    tr.test.extra_env_vars = env_vars

    is_dse = tr.is_dse_job
    assert is_dse is True
48 |
49 |
def test_is_dse_job_num_nodes(tr: TestRun):
    """A list-valued ``num_nodes`` makes the run a DSE job."""
    node_counts = [1, 2]
    tr.num_nodes = node_counts

    is_dse = tr.is_dse_job
    assert is_dse is True
53 |
--------------------------------------------------------------------------------
/conf/release/nemo_acceptance/test/gpt3_126m_mock.toml:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | name = "gpt3_126m_mock"
18 | description = "gpt3_126m_mock"
19 | test_template_name = "NeMoLauncher"
20 |
21 | [cmd_args]
22 | [cmd_args.training]
23 | values = "gpt3/126m"
24 |
25 | [cmd_args.training.trainer]
26 | max_steps = "100"
27 | val_check_interval = "20"
28 |
29 | [cmd_args.training.model]
30 | pipeline_model_parallel_size = "1"
31 |
32 | [cmd_args.training.model.data]
33 | data_impl = "mock"
34 |
35 | [cmd_args.training.run]
36 | name = "run"
37 | time_limit = "20:00"
38 |
39 | [extra_cmd_args]
40 | "training.model.activations_checkpoint_num_layers" = "null"
41 | "training.model.fsdp" = "true"
42 | "training.model.fsdp_grad_reduce_dtype" = "bf16"
43 | "training.model.fsdp_sharding_strategy" = "full"
44 | "training.model.mcore_gpt" = "true"
45 | "training.model.optim.name" = "fused_adam"
46 | "training.trainer.limit_val_batches" = "5"
47 | "training.trainer.log_every_n_steps" = "1"
48 | "training.trainer.num_nodes" = "1"
49 | "~training.model.optim.bucket_cap_mb" = "null"
50 | "~training.model.optim.contiguous_grad_buffer" = "null"
51 | "~training.model.optim.overlap_grad_sync" = "null"
52 | "~training.model.optim.overlap_param_sync" = "null"
53 |
--------------------------------------------------------------------------------