├── .python-version ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── general_question.md │ ├── feature_request.md │ └── bug_report.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── docs.yml ├── greptile.json ├── .coderabbit.yaml ├── tests ├── ref_data │ ├── triton-inference-start_server_wrapper.sh │ ├── sleep.sbatch │ ├── slurm_container.sbatch │ ├── nemo-launcher.sbatch │ ├── ucc.sbatch │ ├── osu-bench.sbatch │ ├── ddlb.sbatch │ ├── nccl.sbatch │ └── megatron-run.sbatch ├── test_standalone_installer.py └── test_job_type_handler.py ├── src └── cloudai │ ├── util │ ├── general-report.jinja2 │ ├── general-slurm-report.jinja2 │ ├── nixl_report_template.jinja2 │ └── base-report.jinja2 │ ├── _core │ ├── __init__.py │ ├── base_job.py │ ├── job_status_result.py │ ├── grading_strategy.py │ ├── report_generation_strategy.py │ └── base_system_parser.py │ ├── systems │ ├── __init__.py │ ├── lsf │ │ ├── lsf_job.py │ │ └── __init__.py │ ├── slurm │ │ ├── slurm_job.py │ │ └── __init__.py │ ├── standalone │ │ ├── standalone_job.py │ │ └── __init__.py │ ├── kubernetes │ │ ├── kubernetes_job.py │ │ └── __init__.py │ └── runai │ │ ├── runai_job.py │ │ └── __init__.py │ ├── workloads │ ├── __init__.py │ ├── bash_cmd │ │ └── __init__.py │ ├── ddlb │ │ └── __init__.py │ ├── osu_bench │ │ └── __init__.py │ ├── nixl_kvbench │ │ └── __init__.py │ ├── slurm_container │ │ └── __init__.py │ ├── jax_toolbox │ │ ├── grading_strategy.py │ │ └── __init__.py │ ├── triton_inference │ │ ├── report_generation_strategy.py │ │ └── __init__.py │ ├── deepep │ │ └── __init__.py │ ├── megatron_run │ │ ├── __init__.py │ │ └── slurm_command_gen_strategy.py │ ├── nixl_perftest │ │ ├── __init__.py │ │ └── report_generation_strategy.py │ ├── aiconfig │ │ └── __init__.py │ ├── ucc_test │ │ └── __init__.py │ ├── nixl_bench │ │ └── __init__.py │ ├── sleep │ │ ├── sleep.py │ │ ├── standalone_command_gen_strategy.py │ │ ├── slurm_command_gen_strategy.py │ │ ├── __init__.py │ │ ├── grading_strategy.py │ │ └── 
lsf_command_gen_strategy.py │ ├── chakra_replay │ │ ├── __init__.py │ │ ├── grading_strategy.py │ │ └── chakra_replay.py │ ├── nemo_launcher │ │ └── __init__.py │ ├── nemo_run │ │ └── __init__.py │ ├── ai_dynamo │ │ └── __init__.py │ └── nccl_test │ │ ├── prediction_report_generation_strategy.py │ │ ├── __init__.py │ │ └── report_generation_strategy.py │ ├── report_generator │ ├── __init__.py │ └── tool │ │ ├── __init__.py │ │ └── report_tool_interface.py │ ├── cli │ └── __init__.py │ ├── configurator │ └── __init__.py │ └── __init__.py ├── .taplo.toml ├── conf ├── common │ ├── test │ │ ├── sleep.toml │ │ ├── ucc_test.toml │ │ ├── nccl_test.toml │ │ ├── osu_test.toml │ │ ├── nccl_test_all_gather.toml │ │ ├── dse_nccl_all_gather.toml │ │ └── nemo_run_llama3_8b.toml │ ├── system │ │ ├── standalone_system.toml │ │ ├── kubernetes_cluster.toml │ │ ├── example_runai_cluster.toml │ │ └── example_slurm_cluster.toml │ └── test_scenario │ │ ├── osu_test.toml │ │ ├── dse_nccl_all_gather.toml │ │ ├── nemo_run_llama3_8b.toml │ │ ├── slurm_container.toml │ │ ├── ucc_generator_test.toml │ │ ├── sleep.toml │ │ └── ucc_test.toml ├── hook │ ├── nccl_test.toml │ └── test │ │ └── nccl_test_all_gather.toml ├── release │ ├── nemo_acceptance │ │ ├── test_scenario │ │ │ ├── gpt3_126m_mock.toml │ │ │ ├── gpt3_126m_pile.toml │ │ │ ├── nccl_test_loopback.toml │ │ │ └── nemo_run_llama3_8b.toml │ │ └── test │ │ │ ├── nccl_test_all_reduce_loopback.toml │ │ │ ├── nemo_run_llama3_8b.toml │ │ │ └── gpt3_126m_mock.toml │ └── spcx │ │ └── l40s │ │ └── test │ │ ├── l40s-bc-nccl_test_gather.toml │ │ ├── l40s-bc-nccl_test_reduce.toml │ │ ├── l40s-bc-nccl_test_scatter.toml │ │ ├── l40s-bc-nccl_test_broadcast.toml │ │ ├── l40s-bc-nccl_test_hypercube.toml │ │ ├── l40s-bc-nccl_test_alltoall.toml │ │ ├── l40s-bc-nccl_test_sendrecv.toml │ │ ├── l40s-bc-nccl_test_all_reduce.toml │ │ ├── l40s-bc-nccl_test_bisection.toml │ │ ├── l40s-bc-nccl_test_sendrecv_worst.toml │ │ ├── l40s-bc-nccl_test_alltoall_worst.toml 
│ │ ├── l40s-bc-nccl_test_all_reduce_worst.toml │ │ ├── l40s-bc-nccl_test_alltoall_worst_failover.toml │ │ ├── l40s-bc-nccl_test_reduce_scatter.toml │ │ ├── l40s-bc-nccl_test_all_gather.toml │ │ ├── l40s-bc-nccl_test_all_gather_worst.toml │ │ └── l40s-bc-nccl_test_reduce_scatter_worst.toml └── experimental │ ├── test_scenario │ ├── ddlb_test.toml │ ├── nemo_launcher_nemotron_15b_bf16_2_node.toml │ ├── nemo_launcher_nemotron_15b_fp8_2_node.toml │ ├── nemo_launcher_nemotron_15b_fp8_4_node.toml │ ├── nemo_launcher_nemotron_15b_fp8_8_node.toml │ ├── nemo_launcher_nemotron_15b_fp8_16_node.toml │ ├── nemo_launcher_nemotron_15b_fp8_32_node.toml │ ├── nemo_launcher_nemotron_15b_fp8_64_node.toml │ ├── nemo_launcher_nemotron_15b_bf16_128_node.toml │ ├── nemo_launcher_nemotron_15b_bf16_256_node.toml │ ├── nemo_launcher_nemotron_15b_fp8_128_node.toml │ ├── nemo_launcher_nemotron_15b_fp8_256_node.toml │ ├── deepep.toml │ ├── nemo_launcher_nemotron_15b_bf16_16_node.toml │ ├── nemo_launcher_nemotron_15b_bf16_4_node.toml │ ├── nemo_launcher_nemotron_15b_bf16_8_node.toml │ ├── nemo_launcher_nemotron_15b_bf16_32_node.toml │ ├── nemo_launcher_nemotron_15b_bf16_64_node.toml │ └── nemo_launcher_nemotron_15b_fp8.toml │ ├── aiconfigurator │ ├── test_scenario │ │ └── aiconfigurator_disagg.toml │ └── test │ │ ├── aiconfigurator_disagg.toml │ │ └── dse_aiconfigurator_disagg.toml │ ├── ai_dynamo │ ├── test_scenario │ │ ├── vllm_k8s.toml │ │ └── vllm_slurm.toml │ └── test │ │ └── vllm.toml │ └── test │ ├── ddlb_test.toml │ ├── deepep_low_latency.toml │ └── deepep_standard.toml ├── doc ├── Makefile └── workloads │ ├── sleep.rst │ ├── ucc.rst │ ├── nemo_run.rst │ ├── nccl.rst │ ├── chakra_replay.rst │ ├── bash_cmd.rst │ ├── slurm_container.rst │ ├── index.rst │ ├── nixl_kvbench.rst │ ├── nixl_bench.rst │ ├── ddlb.rst │ └── osu.rst └── .gitignore /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | 
-------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @amaslenn @srivatsankrishnan @jeffnvidia @alexmanle 2 | -------------------------------------------------------------------------------- /greptile.json: -------------------------------------------------------------------------------- 1 | { 2 | "strictness": 1, 3 | "sequenceDiagramSection": { 4 | "included": true, 5 | "collapsible": true, 6 | "defaultOpen": false 7 | } 8 | } -------------------------------------------------------------------------------- /.coderabbit.yaml: -------------------------------------------------------------------------------- 1 | reviews: 2 | profile: assertive 3 | auto_review: 4 | enabled: true 5 | sequence_diagrams: false 6 | high_level_summary: false 7 | collapse_walkthrough: true 8 | pre_merge_checks: 9 | docstrings: 10 | mode: "off" 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general_question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General question 3 | about: Ask a question or seek clarification about the project 4 | title: '' 5 | labels: 'question' 6 | assignees: '' 7 | --- 8 | 9 | > Please provide a detailed description of your question or the information you seek. 10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | --- 8 | 9 | ## Problem Related to the Feature 10 | > A clear and concise description of what the problem is. 11 | 12 | ## Proposed Solution 13 | > A clear and concise description of what you want to happen. 
14 | -------------------------------------------------------------------------------- /tests/ref_data/triton-inference-start_server_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export NIM_LEADER_IP_ADDRESS=${SLURM_JOB_MASTER_NODE} 4 | export NIM_NODE_RANK=${SLURM_NODEID} 5 | 6 | export NIM_MODEL_NAME='__OUTPUT_DIR__/output' 7 | export NIM_CACHE_PATH='__OUTPUT_DIR__/output' 8 | 9 | if [ "$NIM_NODE_RANK" -eq 0 ]; then 10 | export NIM_LEADER_ROLE=1 11 | else 12 | export NIM_LEADER_ROLE=0 13 | fi 14 | 15 | echo "Starting NIM server on node rank ${NIM_NODE_RANK} with leader role ${NIM_LEADER_ROLE}" 16 | exec /opt/nim/start_server.sh -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | --- 8 | 9 | ## Describe the Bug 10 | > A clear and concise description of what the bug is. 11 | 12 | ## Steps to Reproduce 13 | > Steps to reproduce the behavior. 14 | > Please include the version information where the bug was observed. 15 | 16 | ## Expected Behavior 17 | > A clear and concise description of what you expected to happen. 18 | 19 | ## Screenshots 20 | > If applicable, add screenshots to help explain your problem. 21 | -------------------------------------------------------------------------------- /src/cloudai/util/general-report.jinja2: -------------------------------------------------------------------------------- 1 | {% extends "base-report.jinja2" %} 2 | 3 | {% block content %} 4 | <table> 5 | <tr> 6 | <th>Test</th> 7 | <th>Description</th> 8 | <th>Results</th> 9 | </tr> 10 | {% for item in report_items %} 11 | <tr> 12 | <td>{{ item.name }}</td> 13 | <td>{{ item.description }}</td> 14 | {% if item.logs_path %} 15 | <td><a href="{{ item.logs_path }}">logs</a></td> 16 | {% else %} 17 | <td>no logs</td> 18 | {% endif %} 19 | </tr> 20 | {% endfor %} 21 | </table>
22 | {% endblock %} 23 | -------------------------------------------------------------------------------- /src/cloudai/util/general-slurm-report.jinja2: -------------------------------------------------------------------------------- 1 | {% extends "base-report.jinja2" %} 2 | 3 | {% block content %} 4 | <table> 5 | <tr> 6 | <th>Test</th> 7 | <th>Description</th> 8 | <th>Results</th> 9 | <th>Nodes</th> 10 | </tr> 11 | {% for item in report_items %} 12 | <tr> 13 | <td>{{ item.name }}</td> 14 | <td>{{ item.description }}</td> 15 | {% if item.logs_path %} 16 | <td><a href="{{ item.logs_path }}">logs</a></td> 17 | {% else %} 18 | <td>no logs</td> 19 | {% endif %} 20 | <td>{{ item.nodes }}</td> 21 | </tr> 22 | {% endfor %} 23 | </table>
24 | {% endblock %} 25 | -------------------------------------------------------------------------------- /src/cloudai/_core/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /src/cloudai/systems/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | -------------------------------------------------------------------------------- /src/cloudai/workloads/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /src/cloudai/report_generator/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | -------------------------------------------------------------------------------- /src/cloudai/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .cli import main, setup_logging 18 | 19 | __all__ = [ 20 | "main", 21 | "setup_logging", 22 | ] 23 | -------------------------------------------------------------------------------- /.taplo.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | include = ["pyproject.toml", ".taplo.toml", "conf/**/*.toml"] 18 | 19 | [formatting] 20 | indent_entries = false 21 | indent_tables = true 22 | -------------------------------------------------------------------------------- /conf/common/test/sleep.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "sleep" 18 | description = "sleep test" 19 | test_template_name = "Sleep" 20 | 21 | [cmd_args] 22 | seconds = 1 23 | -------------------------------------------------------------------------------- /conf/hook/nccl_test.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nccl_test" 18 | 19 | [[Tests]] 20 | id = "Tests.1" 21 | test_name = "nccl_test_all_gather" 22 | time_limit = "00:20:00" 23 | -------------------------------------------------------------------------------- /conf/common/system/standalone_system.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "standalone_system" 18 | scheduler = "standalone" 19 | 20 | install_path = "./install" 21 | output_path = "./results" 22 | -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test_scenario/gpt3_126m_mock.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "gpt3_126m_mock" 18 | 19 | [[Tests]] 20 | id = "Tests.1" 21 | test_name = "gpt3_126m_mock" 22 | -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test_scenario/gpt3_126m_pile.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "gpt3_126m_pile" 18 | 19 | [[Tests]] 20 | id = "Tests.1" 21 | test_name = "gpt3_126m_pile" 22 | -------------------------------------------------------------------------------- /src/cloudai/report_generator/tool/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .tensorboard_data_reader import TensorBoardDataReader 18 | 19 | __all__ = [ 20 | "TensorBoardDataReader", 21 | ] 22 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Summary 2 | Provide a concise summary of the changes introduced by this pull request. 
Detail the purpose and scope of the changes, referencing any relevant issues or discussions. Explain how these changes address the problem or improve the project. 3 | 4 | ## Test Plan 5 | In this section, describe the testing you have performed to verify the changes. Include: 6 | - A clear description of the testing environment. 7 | - The steps you followed to test the new features or bug fixes. 8 | - Any specific commands used during testing, along with their outputs. 9 | - A description of the results and observations from your testing. 10 | This information is crucial for reviewers to understand how the changes have been validated. 11 | 12 | ## Additional Notes 13 | Include any other notes or comments about the pull request here. This can include challenges faced, future considerations, or context that reviewers might find helpful. 14 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/ddlb_test.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "ddlb-test" 18 | 19 | [[Tests]] 20 | id = "Tests.ddlb" 21 | test_name = "ddlb_test" 22 | num_nodes = 1 23 | time_limit = "00:30:00" 24 | -------------------------------------------------------------------------------- /tests/ref_data/sleep.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # generated by CloudAI@__CLOUDAI_VERSION__ 3 | #SBATCH --job-name=__JOB_NAME__ 4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt 5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt 6 | #SBATCH --partition=main 7 | #SBATCH -N 1 8 | #SBATCH --gpus-per-node=8 9 | #SBATCH --gres=gpu:8 10 | 11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) 12 | 13 | srun --export=ALL --mpi=pmix -N1 --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." 14 | 15 | srun --export=ALL --mpi=pmix -N1 --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash __OUTPUT_DIR__/install/slurm-metadata.sh 16 | 17 | srun --export=ALL --mpi=pmix -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; sleep 5" 18 | -------------------------------------------------------------------------------- /conf/common/test_scenario/osu_test.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "osu_test_scenario" 18 | job_status_check = true 19 | 20 | [[Tests]] 21 | id = "Tests.1" 22 | test_name = "osu_test" 23 | num_nodes = "2" 24 | time_limit = "00:20:00" 25 | -------------------------------------------------------------------------------- /conf/common/test/ucc_test.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "ucc_base_test" 18 | description = "Base config for UCC tests" 19 | test_template_name = "UCCTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test_scenario/nccl_test_loopback.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nccl-loopback-test" 18 | 19 | [[Tests]] 20 | id = "Tests.1" 21 | test_name = "nccl_test_all_reduce_loopback" 22 | num_nodes = "2" 23 | time_limit = "00:20:00" 24 | -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test_scenario/nemo_run_llama3_8b.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_run_llama3_8b" 18 | 19 | [[Tests]] 20 | id = "nemo_run_llama3_8b" 21 | test_name = "nemo_run_llama3_8b" 22 | num_nodes = "1" 23 | time_limit = "00:30:00" 24 | -------------------------------------------------------------------------------- /src/cloudai/systems/lsf/lsf_job.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from dataclasses import dataclass 18 | 19 | from cloudai.core import BaseJob 20 | 21 | 22 | @dataclass 23 | class LSFJob(BaseJob): 24 | """A job class for execution on an LSF system.""" 25 | 26 | pass 27 | -------------------------------------------------------------------------------- /src/cloudai/workloads/bash_cmd/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .bash_cmd import BashCmdArgs, BashCmdCommandGenStrategy, BashCmdTestDefinition 18 | 19 | __all__ = [ 20 | "BashCmdArgs", 21 | "BashCmdCommandGenStrategy", 22 | "BashCmdTestDefinition", 23 | ] 24 | -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/slurm_job.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from dataclasses import dataclass 18 | 19 | from cloudai.core import BaseJob 20 | 21 | 22 | @dataclass 23 | class SlurmJob(BaseJob): 24 | """A job class for execution on a Slurm system.""" 25 | 26 | pass 27 | -------------------------------------------------------------------------------- /conf/common/test_scenario/dse_nccl_all_gather.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "dse-nccl-test" 18 | 19 | pre_test = "nccl_test" 20 | post_test = "nccl_test" 21 | 22 | [[Tests]] 23 | id = "Tests.1" 24 | test_name = "dse_nccl_all_gather" 25 | num_nodes = "2" 26 | time_limit = "00:20:00" 27 | -------------------------------------------------------------------------------- /src/cloudai/systems/standalone/standalone_job.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from dataclasses import dataclass 18 | 19 | from cloudai.core import BaseJob 20 | 21 | 22 | @dataclass 23 | class StandaloneJob(BaseJob): 24 | """A job class for standalone execution.""" 25 | 26 | pass 27 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_2_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_bf16_2_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_bf16_2_node" 21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node" 22 | num_nodes = "2" 23 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_2_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nemo_launcher_nemotron_15b_fp8_2_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_fp8_2_node" 21 | test_name = "nemo_launcher_nemotron_15b_fp8_2_node" 22 | num_nodes = "2" 23 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_4_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_fp8_4_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_fp8_4_node" 21 | test_name = "nemo_launcher_nemotron_15b_fp8_4_node" 22 | num_nodes = "4" 23 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_8_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_fp8_8_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_fp8_8_node" 21 | test_name = "nemo_launcher_nemotron_15b_fp8_8_node" 22 | num_nodes = "8" 23 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_gather.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_gather" 18 | description = "gather" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "gather_perf_mpi" 24 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nccl_test_reduce" 18 | description = "reduce" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "reduce_perf_mpi" 24 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_16_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_fp8_16_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_fp8_16_node" 21 | test_name = "nemo_launcher_nemotron_15b_fp8_16_node" 22 | num_nodes = "16" 23 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_32_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nemo_launcher_nemotron_15b_fp8_32_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_fp8_32_node" 21 | test_name = "nemo_launcher_nemotron_15b_fp8_32_node" 22 | num_nodes = "32" 23 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_64_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_fp8_64_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_fp8_64_node" 21 | test_name = "nemo_launcher_nemotron_15b_fp8_64_node" 22 | num_nodes = "64" 23 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_scatter.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nccl_test_scatter" 18 | description = "scatter" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "scatter_perf_mpi" 24 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_128_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nemo_launcher_nemotron_15b_bf16_128_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_bf16_128_node" 21 | test_name = "nemo_launcher_nemotron_15b_bf16_128_node" 22 | num_nodes = "128" 23 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_256_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_bf16_256_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_bf16_256_node" 21 | test_name = "nemo_launcher_nemotron_15b_bf16_256_node" 22 | num_nodes = "256" 23 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_128_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_fp8_128_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_fp8_128_node" 21 | test_name = "nemo_launcher_nemotron_15b_fp8_128_node" 22 | num_nodes = "128" 23 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_256_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nemo_launcher_nemotron_15b_fp8_256_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_fp8_256_node" 21 | test_name = "nemo_launcher_nemotron_15b_fp8_256_node" 22 | num_nodes = "256" 23 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_broadcast.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nccl_test_broadcast" 18 | description = "broadcast" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "broadcast_perf_mpi" 24 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_hypercube.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nccl_test_hypercube" 18 | description = "hypercube" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "hypercube_perf_mpi" 24 | -------------------------------------------------------------------------------- /src/cloudai/workloads/ddlb/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .ddlb import DDLBCmdArgs, DDLBTestDefinition 18 | from .slurm_command_gen_strategy import DDLBTestSlurmCommandGenStrategy 19 | 20 | __all__ = [ 21 | "DDLBCmdArgs", 22 | "DDLBTestDefinition", 23 | "DDLBTestSlurmCommandGenStrategy", 24 | ] 25 | -------------------------------------------------------------------------------- /src/cloudai/systems/kubernetes/kubernetes_job.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | from dataclasses import dataclass 19 | 20 | from cloudai.core import BaseJob 21 | 22 | 23 | @dataclass 24 | class KubernetesJob(BaseJob): 25 | """A job class for execution on a Kubernetes system.""" 26 | 27 | kind: str 28 | name: str 29 | -------------------------------------------------------------------------------- /src/cloudai/configurator/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .base_agent import BaseAgent 18 | from .base_gym import BaseGym 19 | from .cloudai_gym import CloudAIGymEnv 20 | from .grid_search import GridSearchAgent 21 | 22 | __all__ = [ 23 | "BaseAgent", 24 | "BaseGym", 25 | "CloudAIGymEnv", 26 | "GridSearchAgent", 27 | ] 28 | -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_job.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from dataclasses import dataclass 18 | 19 | from cloudai.core import BaseJob 20 | 21 | from .runai_training import ActualPhase 22 | 23 | 24 | @dataclass 25 | class RunAIJob(BaseJob): 26 | """A job class for execution on a RunAI system.""" 27 | 28 | status: ActualPhase 29 | -------------------------------------------------------------------------------- /src/cloudai/workloads/osu_bench/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .osu_bench import OSUBenchCmdArgs, OSUBenchTestDefinition 18 | from .slurm_command_gen_strategy import OSUBenchSlurmCommandGenStrategy 19 | 20 | __all__ = [ 21 | "OSUBenchCmdArgs", 22 | "OSUBenchSlurmCommandGenStrategy", 23 | "OSUBenchTestDefinition", 24 | ] 25 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/deepep.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "deepep-benchmark" 18 | 19 | [[Tests]] 20 | id = "Tests.1" 21 | test_name = "deepep_standard" 22 | num_nodes = 2 23 | time_limit = "00:30:00" 24 | 25 | [[Tests]] 26 | id = "Tests.2" 27 | test_name = "deepep_low_latency" 28 | num_nodes = 2 29 | time_limit = "00:30:00" 30 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | SPHINXOPTS ?= 18 | SPHINXBUILD ?= sphinx-build 19 | SOURCEDIR = . 
20 | BUILDDIR = _build 21 | 22 | help: 23 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | 25 | .PHONY: help Makefile 26 | 27 | %: Makefile 28 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 29 | -------------------------------------------------------------------------------- /conf/common/test/nccl_test.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nccl_base_test" 18 | description = "NCCL base test configuration" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | ngpus = 1 24 | minbytes = "128" 25 | maxbytes = "4G" 26 | iters = 100 27 | warmup_iters = 50 28 | stepfactor = 2 29 | -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_kvbench/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .nixl_kvbench import NIXLKVBenchCmdArgs, NIXLKVBenchTestDefinition 18 | from .slurm_command_gen_strategy import NIXLKVBenchSlurmCommandGenStrategy 19 | 20 | __all__ = [ 21 | "NIXLKVBenchCmdArgs", 22 | "NIXLKVBenchSlurmCommandGenStrategy", 23 | "NIXLKVBenchTestDefinition", 24 | ] 25 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_16_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nemo_launcher_nemotron_15b_bf16_16_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_bf16_16_node" 21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node" 22 | num_nodes = "16" 23 | 24 | [Tests.cmd_args.training.model] 25 | global_batch_size = 512 26 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_4_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_bf16_4_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_bf16_4_node" 21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node" 22 | num_nodes = "4" 23 | 24 | [Tests.cmd_args.training.model] 25 | global_batch_size = 128 26 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_8_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_bf16_8_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_bf16_8_node" 21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node" 22 | num_nodes = "8" 23 | 24 | [Tests.cmd_args.training.model] 25 | global_batch_size = 256 26 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_32_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nemo_launcher_nemotron_15b_bf16_32_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_bf16_32_node" 21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node" 22 | num_nodes = "32" 23 | 24 | [Tests.cmd_args.training.model] 25 | global_batch_size = 1024 26 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_64_node.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_bf16_64_node" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_bf16_64_node" 21 | test_name = "nemo_launcher_nemotron_15b_bf16_2_node" 22 | num_nodes = "64" 23 | 24 | [Tests.cmd_args.training.model] 25 | global_batch_size = 2048 26 | -------------------------------------------------------------------------------- /src/cloudai/workloads/slurm_container/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .slurm_command_gen_strategy import SlurmContainerCommandGenStrategy 18 | from .slurm_container import SlurmContainerCmdArgs, SlurmContainerTestDefinition 19 | 20 | __all__ = [ 21 | "SlurmContainerCmdArgs", 22 | "SlurmContainerCommandGenStrategy", 23 | "SlurmContainerTestDefinition", 24 | ] 25 | -------------------------------------------------------------------------------- /conf/common/test/osu_test.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "osu_test" 18 | test_template_name = "OSUBench" 19 | description = "OSU Benchmark example" 20 | 21 | [cmd_args] 22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "benchmarks_dir" = "/opt/hpcx/ompi/tests/osu-micro-benchmarks" 24 | "benchmark" = "osu_allreduce" 25 | "iterations" = 10 26 | "message_size" = "1024" 27 | -------------------------------------------------------------------------------- /conf/common/test_scenario/nemo_run_llama3_8b.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nemo_run_llama3_8b" 18 | 19 | [[Tests]] 20 | id = "nemo_run_llama3_8b.single-node" 21 | test_name = "nemo_run_llama3_8b" 22 | num_nodes = 1 23 | time_limit = "00:60:00" 24 | 25 | [[Tests]] 26 | id = "nemo_run_llama3_8b.2nodes" 27 | test_name = "nemo_run_llama3_8b" 28 | num_nodes = 2 29 | time_limit = "00:60:00" 30 | -------------------------------------------------------------------------------- /tests/test_standalone_installer.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
def test_create():
    """Check that a ``StandaloneInstaller`` can be constructed from a mocked system."""
    # NOTE(review): ``Mock`` has no ``autospec`` parameter (that keyword belongs to ``patch``);
    # here it only sets an ``autospec`` attribute on an otherwise unconstrained mock.
    # Confirm whether ``Mock(spec=StandaloneSystem)`` or ``create_autospec`` was intended.
    system_stub = Mock(autospec=StandaloneSystem)
    standalone_installer = StandaloneInstaller(system_stub)
    assert standalone_installer
class JaxToolboxGradingStrategy(GradingStrategy):
    """Grading strategy for the performance of JaxToolbox test templates on Slurm systems."""

    def grade(self, directory_path: Path, ideal_perf: float) -> float:
        """
        Return the grade for a JaxToolbox run.

        No grading logic is implemented here: neither argument is read and the result is constant.

        Args:
            directory_path: Not read by this implementation.
            ideal_perf: Not read by this implementation.

        Returns:
            Always ``0.0``.
        """
        constant_grade = 0.0
        return constant_grade
16 | 17 | from .kubernetes_installer import KubernetesInstaller 18 | from .kubernetes_job import KubernetesJob 19 | from .kubernetes_runner import KubernetesRunner 20 | from .kubernetes_system import KubernetesSystem 21 | 22 | __all__ = [ 23 | "KubernetesInstaller", 24 | "KubernetesJob", 25 | "KubernetesRunner", 26 | "KubernetesSystem", 27 | ] 28 | -------------------------------------------------------------------------------- /conf/common/test/nccl_test_all_gather.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_all_gather" 18 | description = "all_gather" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "all_gather_perf_mpi" 24 | "ngpus" = "1" 25 | "minbytes" = "128" 26 | "maxbytes" = "4G" 27 | "iters" = "100" 28 | "warmup_iters" = "50" 29 | "stepfactor" = 2 30 | -------------------------------------------------------------------------------- /src/cloudai/_core/base_job.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
@dataclass
class BaseJob:
    """
    Common base for job records produced by executing a test.

    ``test_run`` and ``id`` are constructor arguments; ``terminated_by_dependency`` is not.
    """

    # The test run associated with this job.
    test_run: TestRun
    # Job identifier; may be either a string or an integer.
    id: Union[str, int]
    # Excluded from ``__init__``: every new instance starts with ``False``.
    # NOTE(review): presumably flipped by a runner when a dependency ends the job - confirm.
    terminated_by_dependency: bool = field(init=False, default=False)
16 | 17 | 18 | from .standalone_installer import StandaloneInstaller 19 | from .standalone_job import StandaloneJob 20 | from .standalone_runner import StandaloneRunner 21 | from .standalone_system import StandaloneSystem 22 | 23 | __all__ = [ 24 | "StandaloneInstaller", 25 | "StandaloneJob", 26 | "StandaloneRunner", 27 | "StandaloneSystem", 28 | ] 29 | -------------------------------------------------------------------------------- /conf/experimental/aiconfigurator/test_scenario/aiconfigurator_disagg.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "aiconfigurator-disagg-scenario" 18 | 19 | [[Tests]] 20 | id = "aiconfigurator.disagg.demo" 21 | time_limit = "00:05:00" 22 | test_name = "dse_aiconfigurator_disagg_demo_Llama3.1_70B" 23 | num_nodes = 1 24 | agent_metrics = [ 25 | "ttft_ms", 26 | "tpot_ms", 27 | "tokens_per_s_per_gpu", 28 | "tokens_per_s_per_user", 29 | ] 30 | -------------------------------------------------------------------------------- /src/cloudai/workloads/triton_inference/report_generation_strategy.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
class TritonInferenceReportGenerationStrategy(ReportGenerationStrategy):
    """Report generation strategy used for TritonInference workloads."""

    def can_handle_directory(self) -> bool:
        """Return ``True`` unconditionally; nothing is inspected before answering."""
        always_supported = True
        return always_supported

    def generate_report(self) -> None:
        """Do nothing; this implementation produces no report output."""
        return None
# Re-export the core names that make up the package's public API.
from .core import (
    Registry,
    Runner,
    System,
    TestDefinition,
    TestRun,
    TestScenario,
)

# Public API
__all__ = [
    "Registry",
    "Runner",
    "System",
    "TestDefinition",
    "TestRun",
    "TestScenario",
]

# Importing this package calls register_all() once, as a module-level side effect.
# NOTE(review): this import sits below ``__all__`` instead of with the imports above,
# presumably on purpose (import ordering) - confirm before moving it.
from .registration import register_all

register_all()
16 | 17 | from .deepep import DeepEPCmdArgs, DeepEPTestDefinition 18 | from .report_generation_strategy import DeepEPReportGenerationStrategy 19 | from .slurm_command_gen_strategy import DeepEPSlurmCommandGenStrategy 20 | 21 | __all__ = [ 22 | "DeepEPCmdArgs", 23 | "DeepEPReportGenerationStrategy", 24 | "DeepEPSlurmCommandGenStrategy", 25 | "DeepEPTestDefinition", 26 | ] 27 | -------------------------------------------------------------------------------- /conf/experimental/ai_dynamo/test_scenario/vllm_k8s.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "vllm_k8s" 18 | 19 | [[Tests]] 20 | id = "dynamo.vllm" 21 | test_name = "vLLM-Qwen3-0.6B" 22 | 23 | [Tests.cmd_args] 24 | [Tests.cmd_args.dynamo] 25 | [Tests.cmd_args.dynamo.prefill_worker] 26 | num-nodes = 1 27 | tensor-parallel-size = 8 28 | [Tests.cmd_args.dynamo.decode_worker] 29 | num-nodes = 1 30 | tensor-parallel-size = 8 31 | -------------------------------------------------------------------------------- /conf/common/test/dse_nccl_all_gather.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "dse_nccl_all_gather" 18 | description = "all_gather" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "all_gather_perf_mpi" 24 | "ngpus" = "1" 25 | "minbytes" = "128" 26 | "maxbytes" = "4G" 27 | "iters" = "100" 28 | "warmup_iters" = ["5", "50"] 29 | 30 | [extra_cmd_args] 31 | "--stepfactor" = "2" 32 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_alltoall" 18 | description = "alltoall" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "alltoall_perf_mpi" 24 | minbytes = "1k" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | check = "0" 28 | warmup_iters = "20" 29 | 30 | [extra_cmd_args] 31 | "--stepfactor" = "2" 32 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_sendrecv.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_sendrecv" 18 | description = "sendrecv" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "sendrecv_perf_mpi" 24 | minbytes = "1K" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | check = "0" 28 | warmup_iters = "20" 29 | 30 | [extra_cmd_args] 31 | "--stepfactor" = "2" 32 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_reduce.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_all_reduce" 18 | description = "all_reduce" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "all_reduce_perf_mpi" 24 | minbytes = "1K" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | check = "0" 28 | warmup_iters = "20" 29 | 30 | [extra_cmd_args] 31 | "--stepfactor" = "2" 32 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_bisection.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_bisection" 18 | description = "nccl_test_bisection" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "bisection_perf_mpi" 24 | ngpus = "1" 25 | minbytes = "128" 26 | maxbytes = "4G" 27 | iters = "100" 28 | warmup_iters = "50" 29 | 30 | [extra_cmd_args] 31 | "--stepfactor" = "2" 32 | -------------------------------------------------------------------------------- /src/cloudai/systems/lsf/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | from .lsf_command_gen_strategy import LSFCommandGenStrategy 19 | from .lsf_installer import LSFInstaller 20 | from .lsf_job import LSFJob 21 | from .lsf_node import LSFNode 22 | from .lsf_runner import LSFRunner 23 | from .lsf_system import LSFSystem 24 | 25 | __all__ = [ 26 | "LSFCommandGenStrategy", 27 | "LSFInstaller", 28 | "LSFJob", 29 | "LSFNode", 30 | "LSFRunner", 31 | "LSFSystem", 32 | ] 33 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_sendrecv_worst.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_sendrecv_worst" 18 | description = "sendrecv" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "sendrecv_perf_mpi" 24 | minbytes = "1K" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | iters = "60" 28 | check = "0" 29 | warmup_iters = "40" 30 | 31 | [extra_cmd_args] 32 | "--stepfactor" = "2" 33 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall_worst.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_alltoall_worst" 18 | description = "alltoall_tested_job" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "alltoall_perf_mpi" 24 | minbytes = "1k" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | iters = "60" 28 | check = "0" 29 | warmup_iters = "40" 30 | 31 | [extra_cmd_args] 32 | "--stepfactor" = "2" 33 | -------------------------------------------------------------------------------- /src/cloudai/workloads/megatron_run/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .megatron_run import MegatronRunCmdArgs, MegatronRunTestDefinition 18 | from .report_generation_strategy import CheckpointTimingReportGenerationStrategy 19 | from .slurm_command_gen_strategy import MegatronRunSlurmCommandGenStrategy 20 | 21 | __all__ = [ 22 | "CheckpointTimingReportGenerationStrategy", 23 | "MegatronRunCmdArgs", 24 | "MegatronRunSlurmCommandGenStrategy", 25 | "MegatronRunTestDefinition", 26 | ] 27 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_reduce_worst.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_all_reduce_worst" 18 | description = "all_reduce_worst_alloc_test" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "all_reduce_perf_mpi" 24 | minbytes = "1K" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | iters = "60" 28 | check = "0" 29 | warmup_iters = "40" 30 | 31 | [extra_cmd_args] 32 | "--stepfactor" = "2" 33 | -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_perftest/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .nixl_perftest import MatgenCmdArgs, NixlPerftestCmdArgs, NixlPerftestTestDefinition 18 | from .report_generation_strategy import NIXLKVBenchDummyReport 19 | from .slurm_command_gen_strategy import NixlPerftestSlurmCommandGenStrategy 20 | 21 | __all__ = [ 22 | "MatgenCmdArgs", 23 | "NIXLKVBenchDummyReport", 24 | "NixlPerftestCmdArgs", 25 | "NixlPerftestSlurmCommandGenStrategy", 26 | "NixlPerftestTestDefinition", 27 | ] 28 | -------------------------------------------------------------------------------- /conf/common/system/kubernetes_cluster.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "kubernetes-cluster" 18 | scheduler = "kubernetes" 19 | kube_config_path = "" 20 | 21 | install_path = "./install" 22 | output_path = "./results" 23 | default_namespace = "default" 24 | monitor_interval = 1 25 | 26 | [global_env_vars] 27 | NCCL_IB_GID_INDEX = "3" 28 | NCCL_SOCKET_IFNAME = "ib0" 29 | NCCL_IB_HCA = "mlx5_0" 30 | UCX_NET_DEVICES = "mlx5_0:1" 31 | NCCL_P2P_LEVEL = "PIX" 32 | UCX_TLS = "rc_x,sm,cuda_copy" 33 | NCCL_IB_TC = "96" 34 | -------------------------------------------------------------------------------- /conf/hook/test/nccl_test_all_gather.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_all_gather" 18 | description = "all_gather" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | "subtest_name" = "all_gather_perf_mpi" 24 | "ngpus" = "1" 25 | "minbytes" = "128" 26 | "maxbytes" = "4G" 27 | "iters" = "100" 28 | "warmup_iters" = "50" 29 | 30 | [extra_cmd_args] 31 | "--stepfactor" = "2" 32 | 33 | [extra_env_vars] 34 | "NCCL_TESTS_SPLIT_MASK" = "0x7" 35 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall_worst_failover.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_alltoall_worst_failover" 18 | description = "alltoall_tested_job" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "alltoall_perf_mpi" 24 | minbytes = "128M" 25 | maxbytes = "128M" 26 | stepbytes = "0" 27 | iters = "20" 28 | check = "0" 29 | warmup_iters = "40" 30 | 31 | [extra_cmd_args] 32 | "--stepfactor" = "1" 33 | -------------------------------------------------------------------------------- /src/cloudai/workloads/aiconfig/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .aiconfigurator import AiconfiguratorCmdArgs, AiconfiguratorTestDefinition 18 | from .report_generation_strategy import AiconfiguratorReportGenerationStrategy 19 | from .standalone_command_gen_strategy import AiconfiguratorStandaloneCommandGenStrategy 20 | 21 | __all__ = [ 22 | "AiconfiguratorCmdArgs", 23 | "AiconfiguratorReportGenerationStrategy", 24 | "AiconfiguratorStandaloneCommandGenStrategy", 25 | "AiconfiguratorTestDefinition", 26 | ] 27 | -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_perftest/report_generation_strategy.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
from typing import ClassVar

from cloudai.core import ReportGenerationStrategy


class NIXLKVBenchDummyReport(ReportGenerationStrategy):
    """
    Placeholder report strategy that declares a single "default" metric.

    It exists so that sweeps, which require a "default" metric, are supported.
    It accepts every directory and writes no report output.
    """

    # The only metric this strategy declares.
    metrics: ClassVar[list[str]] = ["default"]

    def can_handle_directory(self) -> bool:
        """Return True unconditionally; no directory is ever rejected."""
        directory_is_supported = True
        return directory_is_supported

    def generate_report(self) -> None:
        """Do nothing: this strategy intentionally produces no report."""
        return None


# --------------------------------------------------------------------------------
# /src/cloudai/workloads/ucc_test/__init__.py:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
16 | 17 | from .grading_strategy import UCCTestGradingStrategy 18 | from .report_generation_strategy import UCCTestReportGenerationStrategy 19 | from .slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy 20 | from .ucc import UCCCmdArgs, UCCTestDefinition 21 | 22 | __all__ = [ 23 | "UCCCmdArgs", 24 | "UCCTestDefinition", 25 | "UCCTestGradingStrategy", 26 | "UCCTestReportGenerationStrategy", 27 | "UCCTestSlurmCommandGenStrategy", 28 | ] 29 | -------------------------------------------------------------------------------- /src/cloudai/workloads/triton_inference/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | from .report_generation_strategy import TritonInferenceReportGenerationStrategy 19 | from .slurm_command_gen_strategy import TritonInferenceSlurmCommandGenStrategy 20 | from .triton_inference import TritonInferenceCmdArgs, TritonInferenceTestDefinition 21 | 22 | __all__ = [ 23 | "TritonInferenceCmdArgs", 24 | "TritonInferenceReportGenerationStrategy", 25 | "TritonInferenceSlurmCommandGenStrategy", 26 | "TritonInferenceTestDefinition", 27 | ] 28 | -------------------------------------------------------------------------------- /conf/common/system/example_runai_cluster.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "example-runai-cluster" 18 | scheduler = "runai" 19 | 20 | install_path = "./install_dir" 21 | output_path = "./results" 22 | monitor_interval = 1 23 | 24 | base_url = "http://runai.example.com" 25 | user_email = "your_email" 26 | app_id = "your_app_id" 27 | app_secret = "your_app_secret" 28 | project_id = "your_project_id" 29 | cluster_id = "your_cluster_id" 30 | 31 | [global_env_vars] 32 | NCCL_IB_GID_INDEX = "3" 33 | NCCL_IB_TIMEOUT = "20" 34 | NCCL_IB_QPS_PER_CONNECTION = "4" 35 | -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test/nccl_test_all_reduce_loopback.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_all_reduce_loopback" 18 | description = "all_reduce" 19 | test_template_name = "NcclTest" 20 | 21 | [extra_env_vars] 22 | NCCL_P2P_DISABLE = "1" 23 | NCCL_SHM_DISABLE = "1" 24 | 25 | [cmd_args] 26 | "docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3" 27 | "subtest_name" = "all_reduce_perf_mpi" 28 | "ngpus" = "1" 29 | "minbytes" = "8M" 30 | "maxbytes" = "16G" 31 | "iters" = "5" 32 | "warmup_iters" = "3" 33 | 34 | [extra_cmd_args] 35 | "--stepfactor" = "2" 36 | -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_bench/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .nixl_bench import NIXLBenchCmdArgs, NIXLBenchTestDefinition 18 | from .nixl_summary_report import NIXLBenchComparisonReport 19 | from .report_generation_strategy import NIXLBenchReportGenerationStrategy 20 | from .slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy 21 | 22 | __all__ = [ 23 | "NIXLBenchCmdArgs", 24 | "NIXLBenchComparisonReport", 25 | "NIXLBenchReportGenerationStrategy", 26 | "NIXLBenchSlurmCommandGenStrategy", 27 | "NIXLBenchTestDefinition", 28 | ] 29 | -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/sleep.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
from cloudai.core import Installable
from cloudai.models.workload import CmdArgs, TestDefinition


class SleepCmdArgs(CmdArgs):
    """Command-line arguments accepted by the Sleep test."""

    # Container image reference; defaults to "ubuntu:22.04".
    docker_image_url: str = "ubuntu:22.04"
    # Defaults to 5; presumably the sleep duration in seconds (confirm in the
    # command generation strategies, which are not visible here).
    seconds: int = 5


class SleepTestDefinition(TestDefinition):
    """Test definition for the Sleep workload, configured by SleepCmdArgs."""

    cmd_args: SleepCmdArgs

    @property
    def installables(self) -> list[Installable]:
        """Return an empty list: this definition declares nothing to install."""
        nothing_to_install: list[Installable] = []
        return nothing_to_install


# --------------------------------------------------------------------------------
# /conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce_scatter.toml:
# --------------------------------------------------------------------------------
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
16 | 17 | name = "nccl_test_reduce_scatter" 18 | description = "reduce_scatter" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "reduce_scatter_perf_mpi" 24 | minbytes = "1K" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | check = "0" 28 | warmup_iters = "20" 29 | 30 | [extra_cmd_args] 31 | "--stepfactor" = "2" 32 | 33 | [extra_env_vars] 34 | "NCCL_TESTS_SPLIT_MASK" = "0x7" 35 | "NCCL_MIN_NCHANNELS" = "12" 36 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_gather.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nccl_test_all_gather" 18 | description = "all_gather" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "all_gather_perf_mpi" 24 | minbytes = "1K" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | check = "0" 28 | warmup_iters = "20" 29 | 30 | [extra_cmd_args] 31 | "--stepfactor" = "2" 32 | 33 | [extra_env_vars] 34 | "NCCL_TESTS_SPLIT_MASK" = "0x7" 35 | "NCCL_MIN_NCHANNELS" = "12" # only for low number of servers 36 | -------------------------------------------------------------------------------- /src/cloudai/workloads/chakra_replay/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .chakra_replay import ChakraReplayCmdArgs, ChakraReplayTestDefinition 18 | from .grading_strategy import ChakraReplayGradingStrategy 19 | from .report_generation_strategy import ChakraReplayReportGenerationStrategy 20 | from .slurm_command_gen_strategy import ChakraReplaySlurmCommandGenStrategy 21 | 22 | __all__ = [ 23 | "ChakraReplayCmdArgs", 24 | "ChakraReplayGradingStrategy", 25 | "ChakraReplayReportGenerationStrategy", 26 | "ChakraReplaySlurmCommandGenStrategy", 27 | "ChakraReplayTestDefinition", 28 | ] 29 | -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_launcher/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .grading_strategy import NeMoLauncherGradingStrategy 18 | from .nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition 19 | from .report_generation_strategy import NeMoLauncherReportGenerationStrategy 20 | from .slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy 21 | 22 | __all__ = [ 23 | "NeMoLauncherCmdArgs", 24 | "NeMoLauncherGradingStrategy", 25 | "NeMoLauncherReportGenerationStrategy", 26 | "NeMoLauncherSlurmCommandGenStrategy", 27 | "NeMoLauncherTestDefinition", 28 | ] 29 | -------------------------------------------------------------------------------- /src/cloudai/report_generator/tool/report_tool_interface.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from abc import ABC, abstractmethod 18 | from pathlib import Path 19 | 20 | 21 | class ReportToolInterface(ABC): 22 | """Interface for report tools, defining methods to add and finalize reports.""" 23 | 24 | @abstractmethod 25 | def finalize_report(self, output_filename: Path) -> None: 26 | """ 27 | Finalize the report and save it to the specified filename. 28 | 29 | Args: 30 | output_filename (Path): The filename where the report will be saved. 
31 | """ 32 | pass 33 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_gather_worst.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nccl_test_all_gather_worst" 18 | description = "all_gather" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "all_gather_perf_mpi" 24 | minbytes = "1K" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | iters = "60" 28 | check = "0" 29 | warmup_iters = "40" 30 | 31 | [extra_cmd_args] 32 | "--stepfactor" = "2" 33 | 34 | [extra_env_vars] 35 | "NCCL_TESTS_SPLIT_MASK" = "0x7" 36 | "NCCL_MIN_NCHANNELS" = "12" # only for low number of servers 37 | -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce_scatter_worst.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nccl_test_reduce_scatter_worst" 18 | description = "reduce_scatter" 19 | test_template_name = "NcclTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 23 | subtest_name = "reduce_scatter_perf_mpi" 24 | minbytes = "1K" 25 | maxbytes = "16G" 26 | stepbytes = "0" 27 | iters = "60" 28 | check = "0" 29 | warmup_iters = "40" 30 | 31 | [extra_cmd_args] 32 | "--stepfactor" = "2" 33 | 34 | [extra_env_vars] 35 | "NCCL_TESTS_SPLIT_MASK" = "0x7" 36 | "NCCL_MIN_NCHANNELS" = "12" # only for low number of servers 37 | -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/standalone_command_gen_strategy.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from typing import cast 18 | 19 | from cloudai.core import CommandGenStrategy 20 | 21 | from .sleep import SleepCmdArgs, SleepTestDefinition 22 | 23 | 24 | class SleepStandaloneCommandGenStrategy(CommandGenStrategy): 25 | """Command generation strategy for the Sleep test on standalone systems.""" 26 | 27 | def gen_exec_command(self) -> str: 28 | tdef: SleepTestDefinition = cast(SleepTestDefinition, self.test_run.test) 29 | tdef_cmd_args: SleepCmdArgs = tdef.cmd_args 30 | sec = tdef_cmd_args.seconds 31 | return f"sleep {sec}" 32 | -------------------------------------------------------------------------------- /conf/common/test_scenario/slurm_container.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "slurm-container" 18 | 19 | [[Tests]] 20 | id = "nccl.alltoall" 21 | num_nodes = 2 22 | time_limit = "00:20:00" 23 | 24 | name = "nccl-alltoall" 25 | description = "NCCL alltoall via SlurmContainer" 26 | test_template_name = "SlurmContainer" 27 | 28 | [Tests.cmd_args] 29 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 30 | cmd = "alltoall_perf_mpi --nthreads 1 --ngpus 1 --minbytes 128 --maxbytes 4G --stepbytes 1M --op sum --datatype float --root 0 --iters 100 --warmup_iters 50 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 --stepfactor 2" 31 | -------------------------------------------------------------------------------- /conf/experimental/aiconfigurator/test/aiconfigurator_disagg.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "aiconfigurator_disagg_demo" 18 | description = "Aiconfigurator disaggregated predictor demo" 19 | test_template_name = "Aiconfigurator" 20 | 21 | [cmd_args] 22 | model_name = "LLAMA3.1_70B" 23 | system = "h100_sxm" 24 | # backend and version use defaults 25 | isl = 3000 26 | osl = 150 27 | 28 | [cmd_args.disagg] 29 | p_tp = 4 30 | p_pp = 1 31 | p_dp = 1 32 | p_bs = 1 33 | p_workers = 1 34 | 35 | d_tp = 4 36 | d_pp = 1 37 | d_dp = 1 38 | d_bs = 256 39 | d_workers = 1 40 | 41 | prefill_correction_scale = 1.0 42 | decode_correction_scale = 1.0 43 | -------------------------------------------------------------------------------- /conf/experimental/test/ddlb_test.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "ddlb_test" 18 | description = "DDLB test configuration" 19 | test_template_name = "DDLBTest" 20 | 21 | [cmd_args] 22 | docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" 23 | primitive = "tp_columnwise" 24 | m = [1024, 8192] 25 | n = 128 26 | k = 1024 27 | dtype = "float16" 28 | num_iterations = 50 29 | num_warmups = 10 30 | # Make sure to specify only one configuration per --impl argument. 
i.e., do not write in one impl "order=AG_before,AG_after" 31 | impl = [ 32 | "pytorch;backend=nccl;order=AG_before", 33 | "fuser;algorithm=p2p_pipeline;backend=cuda;order=AG_before", 34 | ] 35 | -------------------------------------------------------------------------------- /src/cloudai/_core/job_status_result.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | class JobStatusResult: 19 | """ 20 | Encapsulates the result of a job status retrieval. 21 | 22 | Attributes 23 | is_successful (bool): Indicates if the job was successful. 24 | error_message (str): Error message if the job was not successful. 
25 | """ 26 | 27 | def __init__(self, is_successful: bool, error_message: str = ""): 28 | self.is_successful = is_successful 29 | self.error_message = error_message 30 | 31 | def __str__(self): 32 | return f"JobStatusResult(is_successful={self.is_successful}, error_message={self.error_message})" 33 | -------------------------------------------------------------------------------- /conf/common/test_scenario/ucc_generator_test.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | name = "ucc_generator_perftest" 19 | 20 | [[Tests]] 21 | id = "Tests.alltoallv" 22 | num_nodes = 2 23 | time_limit = "00:02:00" 24 | 25 | name = "ucc_generator_perftest" 26 | description = "UCC alltoallv" 27 | test_template_name = "UCCTest" 28 | extra_container_mounts = [ 29 | "$PWD/conf/common/test_scenario/ucc_generator_file.txt:/opt/hpcx/ucc/tools/perf/generator/input_matrices.txt", 30 | ] 31 | 32 | [Tests.cmd_args] 33 | docker_image_url = "nvcr.io/nvidia/pytorch:25.09-py3" 34 | collective = "alltoallv" 35 | gen = "file:name=/opt/hpcx/ucc/tools/perf/generator/input_matrices.txt" 36 | -------------------------------------------------------------------------------- /tests/ref_data/slurm_container.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # generated by CloudAI@__CLOUDAI_VERSION__ 3 | #SBATCH --job-name=__JOB_NAME__ 4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt 5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt 6 | #SBATCH --partition=main 7 | #SBATCH -N 1 8 | #SBATCH --gpus-per-node=8 9 | #SBATCH --gres=gpu:8 10 | 11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) 12 | 13 | srun --export=ALL --mpi=pmix -N1 --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." 
14 | 15 | srun --export=ALL --mpi=pmix -N1 --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh 16 | 17 | srun --export=ALL --mpi=pmix -N1 --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; pwd ; ls" 18 | -------------------------------------------------------------------------------- /doc/workloads/sleep.rst: -------------------------------------------------------------------------------- 1 | Sleep 2 | ===== 3 | 4 | This workload (`test_template_name` is ``Sleep``) executes a simple sleep command for testing and timing purposes. Useful for testing schedulers and system behavior. 5 | 6 | Usage Examples 7 | -------------- 8 | 9 | Test TOML example: 10 | 11 | .. code-block:: toml 12 | 13 | name = "my_sleep_test" 14 | description = "Example Sleep test" 15 | test_template_name = "Sleep" 16 | 17 | [cmd_args] 18 | seconds = 30 19 | 20 | Test Scenario example: 21 | 22 | .. code-block:: toml 23 | 24 | name = "sleep-test" 25 | 26 | [[Tests]] 27 | id = "sleep.1" 28 | num_nodes = 1 29 | time_limit = "00:02:00" 30 | 31 | test_name = "my_sleep_test" 32 | 33 | Test-in-Scenario example: 34 | 35 | .. code-block:: toml 36 | 37 | name = "sleep-test" 38 | 39 | [[Tests]] 40 | id = "sleep.1" 41 | num_nodes = 1 42 | time_limit = "00:02:00" 43 | 44 | name = "my_sleep_test" 45 | description = "Example Sleep test" 46 | test_template_name = "Sleep" 47 | 48 | [Tests.cmd_args] 49 | seconds = 30 50 | 51 | API Documentation 52 | ----------------- 53 | 54 | Command Arguments 55 | ~~~~~~~~~~~~~~~~~ 56 | 57 | .. 
autoclass:: cloudai.workloads.sleep.sleep.SleepCmdArgs 58 | :members: 59 | :show-inheritance: 60 | 61 | Test Definition 62 | ~~~~~~~~~~~~~~~ 63 | 64 | .. autoclass:: cloudai.workloads.sleep.sleep.SleepTestDefinition 65 | :members: 66 | :show-inheritance: 67 | -------------------------------------------------------------------------------- /doc/workloads/ucc.rst: -------------------------------------------------------------------------------- 1 | UCC 2 | === 3 | 4 | This workload (`test_template_name` is ``UCCTest``) allows users to execute UCC benchmarks within the CloudAI framework. 5 | 6 | Usage Examples 7 | -------------- 8 | 9 | Test TOML example: 10 | 11 | .. code-block:: toml 12 | 13 | name = "ucc" 14 | description = "Example UCC test" 15 | test_template_name = "UCCTest" 16 | 17 | [cmd_args] 18 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 19 | 20 | Test Scenario example: 21 | 22 | .. code-block:: toml 23 | 24 | name = "ucc-test" 25 | 26 | [[Tests]] 27 | id = "ucc.1" 28 | num_nodes = 1 29 | time_limit = "00:02:00" 30 | 31 | test_name = "ucc" 32 | 33 | Test-in-Scenario example: 34 | 35 | .. code-block:: toml 36 | 37 | name = "ucc-test" 38 | 39 | [[Tests]] 40 | id = "ucc.1" 41 | num_nodes = 1 42 | time_limit = "00:02:00" 43 | 44 | name = "ucc" 45 | description = "Example UCC test" 46 | test_template_name = "UCCTest" 47 | 48 | [Tests.cmd_args] 49 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 50 | 51 | API Documentation 52 | ----------------- 53 | 54 | Command Arguments 55 | ~~~~~~~~~~~~~~~~~ 56 | 57 | .. autoclass:: cloudai.workloads.ucc_test.ucc.UCCCmdArgs 58 | :members: 59 | :show-inheritance: 60 | 61 | Test Definition 62 | ~~~~~~~~~~~~~~~ 63 | 64 | .. 
autoclass:: cloudai.workloads.ucc_test.ucc.UCCTestDefinition 65 | :members: 66 | :show-inheritance: 67 | -------------------------------------------------------------------------------- /src/cloudai/_core/grading_strategy.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from abc import ABC, abstractmethod 18 | from pathlib import Path 19 | 20 | 21 | class GradingStrategy(ABC): 22 | """Abstract class for grading test performance.""" 23 | 24 | @abstractmethod 25 | def grade(self, directory_path: Path, ideal_perf: float) -> float: 26 | """ 27 | Grades the performance of a test. 28 | 29 | Args: 30 | directory_path (Path): Path to the directory containing the test's output. 31 | ideal_perf (float): The ideal performance value for comparison. 32 | 33 | Returns: 34 | float: Calculated grade based on the performance. 
35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /tests/ref_data/nemo-launcher.sbatch: -------------------------------------------------------------------------------- 1 | VAR="$(scontrol show hostname \"${SLURM_STEP_NODELIST}\" | head -n1)" \ 2 | __OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022-venv/bin/python \ 3 | __OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts/main.py \ 4 | cluster.gpus_per_node=8 \ 5 | numa_mapping.enable=True \ 6 | stages=["training"] \ 7 | training.exp_manager.create_checkpoint_callback=False \ 8 | training.model.data.data_impl=mock \ 9 | training.model.data.data_prefix=[] \ 10 | training.model.global_batch_size=128 \ 11 | training.model.micro_batch_size=2 \ 12 | training.model.pipeline_model_parallel_size=4 \ 13 | training.model.tensor_model_parallel_size=4 \ 14 | training.run.name=run \ 15 | training.run.time_limit=3:00:00 \ 16 | training.trainer.enable_checkpointing=False \ 17 | training.trainer.log_every_n_steps=1 \ 18 | training.trainer.max_steps=20 \ 19 | training.trainer.val_check_interval=10 \ 20 | training=gpt3/40b_improved \ 21 | cluster.partition=main \ 22 | training.trainer.num_nodes=1 \ 23 | container=nvcr.io/nvidia/nemo:24.12.01 \ 24 | cluster.job_name_prefix=test_account-cloudai.nemo: \ 25 | base_results_dir=__OUTPUT_DIR__/output \ 26 | launcher_scripts_path=__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts \ 27 | +env_vars.VAR="$(scontrol show hostname \"${SLURM_STEP_NODELIST}\" | head -n1)" 28 | -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from typing import List, cast 18 | 19 | from cloudai.systems.slurm import SlurmCommandGenStrategy 20 | 21 | from .sleep import SleepCmdArgs, SleepTestDefinition 22 | 23 | 24 | class SleepSlurmCommandGenStrategy(SlurmCommandGenStrategy): 25 | """Command generation strategy for Sleep on Slurm systems.""" 26 | 27 | def _container_mounts(self) -> list[str]: 28 | return [] 29 | 30 | def generate_test_command(self) -> List[str]: 31 | tdef: SleepTestDefinition = cast(SleepTestDefinition, self.test_run.test) 32 | tdef_cmd_args: SleepCmdArgs = tdef.cmd_args 33 | return [f"sleep {tdef_cmd_args.seconds}"] 34 | -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_run/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .data_store_report_generation_strategy import NeMoRunDataStoreReportGenerationStrategy 18 | from .nemo_run import Data, Log, LogCkpt, NeMoRunCmdArgs, NeMoRunTestDefinition, Trainer, TrainerStrategy 19 | from .report_generation_strategy import NeMoRunReportGenerationStrategy 20 | from .slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy 21 | 22 | __all__ = [ 23 | "Data", 24 | "Log", 25 | "LogCkpt", 26 | "NeMoRunCmdArgs", 27 | "NeMoRunDataStoreReportGenerationStrategy", 28 | "NeMoRunReportGenerationStrategy", 29 | "NeMoRunSlurmCommandGenStrategy", 30 | "NeMoRunTestDefinition", 31 | "Trainer", 32 | "TrainerStrategy", 33 | ] 34 | -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .grading_strategy import SleepGradingStrategy 18 | from .kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy 19 | from .lsf_command_gen_strategy import SleepLSFCommandGenStrategy 20 | from .sleep import SleepCmdArgs, SleepTestDefinition 21 | from .slurm_command_gen_strategy import SleepSlurmCommandGenStrategy 22 | from .standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy 23 | 24 | __all__ = [ 25 | "SleepCmdArgs", 26 | "SleepGradingStrategy", 27 | "SleepKubernetesJsonGenStrategy", 28 | "SleepLSFCommandGenStrategy", 29 | "SleepSlurmCommandGenStrategy", 30 | "SleepStandaloneCommandGenStrategy", 31 | "SleepTestDefinition", 32 | ] 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | share/python-wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | doc/_build 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Jupyter Notebook 37 | .ipynb_checkpoints 38 | 39 | # IPython 40 | profile_default/ 41 | ipython_config.py 42 | 43 | # pyenv 44 | .python-version 45 | 46 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 47 | __pypackages__/ 48 | 49 | # Environments 50 | .env 51 | .venv 52 | env/ 53 | venv/ 54 | ENV/ 55 | env.bak/ 56 | venv.bak/ 57 | 58 | # mypy 59 | .mypy_cache/ 60 | .dmypy.json 61 | dmypy.json 62 | 63 | # pytype static type analyzer 64 | .pytype/ 65 | 66 | # pycharm 67 | .idea/ 68 | 69 | # VSCode 70 | .vscode/ 71 | 72 | # Editors and IDEs 73 | *.swp 74 | *.bak 75 | *.tmp 76 | *~ 77 | *.sublime-project 78 | *.sublime-workspace 79 | 80 | # OS generated files 81 | .DS_Store 82 | .DS_Store? 83 | ._* 84 | .Spotlight-V100 85 | .Trashes 86 | ehthumbs.db 87 | Thumbs.db 88 | 89 | *.log 90 | install/ 91 | results/ 92 | .* 93 | .cloudai.toml 94 | -------------------------------------------------------------------------------- /doc/workloads/nemo_run.rst: -------------------------------------------------------------------------------- 1 | Nemo Run 2 | ======== 3 | 4 | This workload (`test_template_name` is ``NemoRun``) executes NeMo training and fine-tuning tasks using the NeMo Run framework. 5 | 6 | Usage Examples 7 | -------------- 8 | 9 | Test TOML example: 10 | 11 | .. code-block:: toml 12 | 13 | name = "my_nemo_test" 14 | description = "Example NeMo Run test" 15 | test_template_name = "NemoRun" 16 | 17 | [cmd_args] 18 | recipe = "llama3_8b" 19 | task = "pretrain" 20 | 21 | Test Scenario example: 22 | 23 | .. code-block:: toml 24 | 25 | name = "nemo-run-test" 26 | 27 | [[Tests]] 28 | id = "nemo.1" 29 | num_nodes = 4 30 | time_limit = "02:00:00" 31 | 32 | test_name = "my_nemo_test" 33 | 34 | Test-in-Scenario example: 35 | 36 | .. 
code-block:: toml 37 | 38 | name = "nemo-run-test" 39 | 40 | [[Tests]] 41 | id = "nemo.1" 42 | num_nodes = 4 43 | time_limit = "02:00:00" 44 | 45 | name = "my_nemo_test" 46 | description = "Example NeMo Run test" 47 | test_template_name = "NemoRun" 48 | 49 | [Tests.cmd_args] 50 | recipe = "llama3_8b" 51 | task = "pretrain" 52 | 53 | API Documentation 54 | ----------------- 55 | 56 | Command Arguments 57 | ~~~~~~~~~~~~~~~~~ 58 | 59 | .. autoclass:: cloudai.workloads.nemo_run.nemo_run.NeMoRunCmdArgs 60 | :members: 61 | :show-inheritance: 62 | 63 | Test Definition 64 | ~~~~~~~~~~~~~~~ 65 | 66 | .. autoclass:: cloudai.workloads.nemo_run.nemo_run.NeMoRunTestDefinition 67 | :members: 68 | :show-inheritance: 69 | -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/grading_strategy.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
class SleepGradingStrategy(GradingStrategy):
    """Grading strategy for the Sleep workload that always awards the top grade."""

    def grade(self, directory_path: Path, ideal_perf: float) -> float:
        """
        Return the grade for a Sleep test run.

        Neither argument is inspected; the result is a fixed value.

        Args:
            directory_path (Path): Path to the directory containing the test's output (unused).
            ideal_perf (float): The ideal performance value for comparison (unused).

        Returns:
            float: Always 100.0.
        """
        full_marks = 100.0
        return full_marks
14 | 15 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh 16 | 17 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; /opt/hpcx/ucc/bin/ucc_perftest -c alltoall -b 1 -e 8M -m cuda -F" 18 | -------------------------------------------------------------------------------- /src/cloudai/_core/report_generation_strategy.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
class ReportGenerationStrategy(ABC):
    """
    Base class for strategies that generate reports from TestRun objects.

    Concrete strategies must implement ``can_handle_directory`` and
    ``generate_report``.
    """

    # Metric names associated with the strategy; the base class declares only "default".
    metrics: ClassVar[list[str]] = ["default"]

    def __init__(self, system: System, tr: TestRun) -> None:
        """
        Remember the system and test run this strategy works with.

        Args:
            system (System): Stored as ``self.system``.
            tr (TestRun): Stored as ``self.test_run``.
        """
        self.system, self.test_run = system, tr

    def get_metric(self, metric: str) -> float:
        """Return the value for ``metric``; this base implementation always returns 0.0."""
        return 0.0

    @abstractmethod
    def can_handle_directory(self) -> bool:
        """Return a bool decided by the concrete strategy (presumably whether it can process the run's output)."""

    @abstractmethod
    def generate_report(self) -> None:
        """Generate the report; the concrete strategy defines what is produced."""
class ChakraReplayGradingStrategy(GradingStrategy):
    """Grading strategy for the ChakraReplay workload that always awards the top grade."""

    def grade(self, directory_path: Path, ideal_perf: float) -> float:
        """
        Return the grade for a ChakraReplay test run.

        The output directory and the ideal performance value are accepted
        but not consulted; the result is a fixed value.

        Args:
            directory_path (Path): Path to the directory containing the test's output (unused).
            ideal_perf (float): The ideal performance value for comparison (unused).

        Returns:
            float: Always 100.0.
        """
        top_grade = 100.0
        return top_grade
14 | 15 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh 16 | 17 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; /opt/hpcx/ompi/tests/osu-micro-benchmarks/osu_allreduce --message-size 1024 --iterations 10 --full" 18 | -------------------------------------------------------------------------------- /src/cloudai/systems/runai/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .runai_cluster import RunAICluster 18 | from .runai_event import RunAIEvent 19 | from .runai_installer import RunAIInstaller 20 | from .runai_job import RunAIJob 21 | from .runai_node import RunAINode 22 | from .runai_project import RunAIProject 23 | from .runai_pvc import RunAIPVC 24 | from .runai_rest_client import RunAIRestClient 25 | from .runai_runner import RunAIRunner 26 | from .runai_system import RunAISystem 27 | from .runai_training import RunAITraining 28 | 29 | __all__ = [ 30 | "RunAICluster", 31 | "RunAIEvent", 32 | "RunAIInstaller", 33 | "RunAIJob", 34 | "RunAINode", 35 | "RunAIPVC", 36 | "RunAIProject", 37 | "RunAIRestClient", 38 | "RunAIRunner", 39 | "RunAISystem", 40 | "RunAITraining", 41 | ] 42 | -------------------------------------------------------------------------------- /src/cloudai/workloads/ai_dynamo/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .ai_dynamo import ( 18 | AIDynamoArgs, 19 | AIDynamoCmdArgs, 20 | AIDynamoTestDefinition, 21 | DecodeWorkerArgs, 22 | GenAIPerfArgs, 23 | PrefillWorkerArgs, 24 | ) 25 | from .kubernetes_json_gen_strategy import AIDynamoKubernetesJsonGenStrategy 26 | from .report_generation_strategy import AIDynamoReportGenerationStrategy 27 | from .slurm_command_gen_strategy import AIDynamoSlurmCommandGenStrategy 28 | 29 | __all__ = [ 30 | "AIDynamoArgs", 31 | "AIDynamoCmdArgs", 32 | "AIDynamoKubernetesJsonGenStrategy", 33 | "AIDynamoReportGenerationStrategy", 34 | "AIDynamoSlurmCommandGenStrategy", 35 | "AIDynamoTestDefinition", 36 | "DecodeWorkerArgs", 37 | "GenAIPerfArgs", 38 | "PrefillWorkerArgs", 39 | ] 40 | -------------------------------------------------------------------------------- /doc/workloads/nccl.rst: -------------------------------------------------------------------------------- 1 | NCCL 2 | ==== 3 | 4 | This workload (``test_template_name`` is ``NcclTest``) allows users to execute NCCL benchmarks within the CloudAI framework. 5 | 6 | Usage Example 7 | ------------- 8 | 9 | Test TOML example: 10 | 11 | .. code-block:: toml 12 | 13 | name = "my_nccl_test" 14 | description = "Example NCCL test" 15 | test_template_name = "NcclTest" 16 | 17 | [cmd_args] 18 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 19 | 20 | Test Scenario example: 21 | 22 | .. code-block:: toml 23 | 24 | name = "nccl-test" 25 | 26 | [[Tests]] 27 | id = "nccl.1" 28 | num_nodes = 1 29 | time_limit = "00:05:00" 30 | 31 | test_name = "my_nccl_test" 32 | 33 | Test-in-Scenario example: 34 | 35 | .. 
code-block:: toml 36 | 37 | name = "nccl-test" 38 | 39 | [[Tests]] 40 | id = "nccl.1" 41 | num_nodes = 1 42 | time_limit = "00:05:00" 43 | 44 | name = "my_nccl_test" 45 | description = "Example NCCL test" 46 | test_template_name = "NcclTest" 47 | 48 | [Tests.cmd_args] 49 | docker_image_url = "nvcr.io#nvidia/pytorch:25.06-py3" 50 | subtest_name = "all_reduce_perf_mpi" 51 | iters = 100 52 | 53 | API Documentation 54 | --------------------------------- 55 | 56 | Command Arguments 57 | ~~~~~~~~~~~~~~~~~ 58 | 59 | .. autoclass:: cloudai.workloads.nccl_test.nccl.NCCLCmdArgs 60 | :members: 61 | :show-inheritance: 62 | 63 | Test Definition 64 | ~~~~~~~~~~~~~~~ 65 | 66 | .. autoclass:: cloudai.workloads.nccl_test.nccl.NCCLTestDefinition 67 | :members: 68 | :show-inheritance: 69 | -------------------------------------------------------------------------------- /doc/workloads/chakra_replay.rst: -------------------------------------------------------------------------------- 1 | Chakra Replay 2 | ============= 3 | 4 | This workload (``test_template_name`` is ``ChakraReplay``) replays execution traces stored in the Chakra execution trace format for performance analysis and debugging. 5 | 6 | Usage Examples 7 | -------------- 8 | 9 | Test TOML example: 10 | 11 | .. code-block:: toml 12 | 13 | name = "my_chakra_test" 14 | description = "Example Chakra replay test" 15 | test_template_name = "ChakraReplay" 16 | 17 | [cmd_args] 18 | trace_path = "/path/to/trace.et" 19 | 20 | Test Scenario example: 21 | 22 | .. code-block:: toml 23 | 24 | name = "chakra-replay-test" 25 | 26 | [[Tests]] 27 | id = "chakra.1" 28 | num_nodes = 1 29 | time_limit = "00:10:00" 30 | 31 | test_name = "my_chakra_test" 32 | 33 | Test-in-Scenario example: 34 | 35 | .. 
class BaseSystemParser(ABC):
    """
    Interface for parsers that turn system configuration data into System objects.

    Methods
        parse: Abstract method that concrete parsers implement to build a System.
    """

    @abstractmethod
    def parse(self, data: Dict[str, Any]) -> System:
        """
        Parse configuration data and return a System object.

        Args:
            data (Dict[str, Any]): The configuration data.

        Returns:
            System: A System object created from the configuration data.
        """
class SleepLSFCommandGenStrategy(LSFCommandGenStrategy):
    """LSF command generation strategy for the Sleep workload."""

    def _container_mounts(self, tr: TestRun) -> list[str]:
        """Return the container mounts for the test run; this strategy supplies none."""
        no_mounts: list[str] = []
        return no_mounts

    def generate_test_command(
        self, env_vars: Dict[str, Union[str, List[str]]], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
    ) -> List[str]:
        """
        Build the test command for a Sleep test run.

        Args:
            env_vars: Environment variables (not used by this strategy).
            cmd_args: Command-line arguments (not used; the duration is read from the test definition).
            tr: Test run whose ``test`` is treated as a SleepTestDefinition.

        Returns:
            List[str]: A single-element list holding ``sleep <seconds>``.
        """
        sleep_args: SleepCmdArgs = cast(SleepTestDefinition, tr.test).cmd_args
        duration = sleep_args.seconds
        return [f"sleep {duration}"]
16 | 17 | name = "sleep-scenario" 18 | 19 | [[Tests]] 20 | id = "Tests.sleep1" 21 | time_limit = "00:01:00" 22 | test_name = "sleep" 23 | [Tests.cmd_args] 24 | seconds = 10 25 | 26 | [[Tests]] 27 | id = "Tests.sleep5" 28 | time_limit = "00:01:00" 29 | test_name = "sleep" 30 | [Tests.cmd_args] 31 | seconds = 5 32 | [[Tests.dependencies]] 33 | type = "start_post_init" 34 | id = "Tests.sleep1" 35 | 36 | [[Tests]] 37 | id = "Tests.sleep5_2" 38 | time_limit = "00:01:00" 39 | test_name = "sleep" 40 | [Tests.cmd_args] 41 | seconds = 5 42 | [[Tests.dependencies]] 43 | type = "start_post_comp" 44 | id = "Tests.sleep1" 45 | 46 | [[Tests]] 47 | id = "Tests.sleep20" 48 | time_limit = "00:01:00" 49 | test_name = "sleep" 50 | [Tests.cmd_args] 51 | seconds = 20 52 | [[Tests.dependencies]] 53 | type = "end_post_comp" 54 | id = "Tests.sleep1" 55 | -------------------------------------------------------------------------------- /conf/experimental/test/deepep_low_latency.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "deepep_low_latency" 18 | description = "DeepEP MoE Benchmark - Low Latency Mode" 19 | test_template_name = "DeepEP" 20 | 21 | [cmd_args] 22 | # Local .sqsh file: 23 | # docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh" 24 | # Container registry: 25 | docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark" 26 | 27 | mode = "low_latency" 28 | 29 | tokens = 128 30 | num_experts = 256 31 | num_topk = 1 32 | hidden_size = 7168 33 | data_type = "bfloat16" 34 | allow_nvlink_for_low_latency = false 35 | allow_mnnvl = false 36 | round_scale = false 37 | use_ue8m0 = false 38 | num_warmups = 20 39 | num_iterations = 50 40 | shuffle_columns = false 41 | use_kineto_profiler = false 42 | config_file_path = "/tmp/config.yaml" 43 | results_dir = "/workspace/dp-benchmark/results" 44 | -------------------------------------------------------------------------------- /tests/ref_data/ddlb.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # generated by CloudAI@__CLOUDAI_VERSION__ 3 | #SBATCH --job-name=__JOB_NAME__ 4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt 5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt 6 | #SBATCH --partition=main 7 | #SBATCH -N 1 8 | #SBATCH --gpus-per-node=8 9 | #SBATCH --gres=gpu:8 10 | 11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) 12 | 13 | srun --export=ALL --mpi=pmix -N1 --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." 
14 | 15 | srun --export=ALL --mpi=pmix -N1 --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh 16 | 17 | srun --export=ALL --mpi=pmix -N1 --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python ddlb/cli/benchmark.py --primitive tp_columnwise -m 1024 -n 128 -k 1024 --dtype float16 --num-iterations 50 --num-warmups 5 --impl pytorch;backend=nccl;order=AG_before" 18 | -------------------------------------------------------------------------------- /conf/experimental/test/deepep_standard.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "deepep_standard" 18 | description = "DeepEP MoE Benchmark - Standard Mode" 19 | test_template_name = "DeepEP" 20 | 21 | [cmd_args] 22 | # Local .sqsh file: 23 | # docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh" 24 | # Container registry (uses your Docker credentials): 25 | docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark" 26 | 27 | mode = "standard" 28 | 29 | tokens = 1024 30 | num_experts = 256 31 | num_topk = 8 32 | hidden_size = 7168 33 | data_type = "bfloat16" 34 | allow_nvlink_for_low_latency = false 35 | allow_mnnvl = false 36 | round_scale = false 37 | use_ue8m0 = false 38 | num_warmups = 20 39 | num_iterations = 50 40 | shuffle_columns = false 41 | use_kineto_profiler = false 42 | config_file_path = "/tmp/config.yaml" 43 | results_dir = "/workspace/dp-benchmark/results" 44 | -------------------------------------------------------------------------------- /tests/ref_data/nccl.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # generated by CloudAI@__CLOUDAI_VERSION__ 3 | #SBATCH --job-name=__JOB_NAME__ 4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt 5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt 6 | #SBATCH --partition=main 7 | #SBATCH -N 1 8 | #SBATCH --gpus-per-node=8 9 | #SBATCH --gres=gpu:8 10 | 11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) 12 | 13 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." 
14 | 15 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh 16 | 17 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0" 18 | -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .single_sbatch_runner import SingleSbatchRunner 18 | from .slurm_command_gen_strategy import SlurmCommandGenStrategy 19 | from .slurm_installer import SlurmInstaller 20 | from .slurm_job import SlurmJob 21 | from .slurm_metadata import SlurmJobMetadata, SlurmStepMetadata, SlurmSystemMetadata 22 | from .slurm_node import SlurmNode, SlurmNodeState 23 | from .slurm_runner import SlurmRunner 24 | from .slurm_system import SlurmGroup, SlurmPartition, SlurmSystem, parse_node_list 25 | 26 | __all__ = [ 27 | "SingleSbatchRunner", 28 | "SlurmCommandGenStrategy", 29 | "SlurmGroup", 30 | "SlurmInstaller", 31 | "SlurmJob", 32 | "SlurmJobMetadata", 33 | "SlurmNode", 34 | "SlurmNodeState", 35 | "SlurmPartition", 36 | "SlurmRunner", 37 | "SlurmStepMetadata", 38 | "SlurmSystem", 39 | "SlurmSystemMetadata", 40 | "parse_node_list", 41 | ] 42 | -------------------------------------------------------------------------------- /src/cloudai/workloads/jax_toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .gpt import GPTCmdArgs, GPTTestDefinition 18 | from .grading_strategy import JaxToolboxGradingStrategy 19 | from .grok import GrokCmdArgs, GrokTestDefinition 20 | from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition 21 | from .nemotron import NemotronCmdArgs, NemotronTestDefinition 22 | from .report_generation_strategy import JaxToolboxReportGenerationStrategy 23 | from .slurm_command_gen_strategy import JaxToolboxSlurmCommandGenStrategy 24 | 25 | __all__ = [ 26 | "GPTCmdArgs", 27 | "GPTTestDefinition", 28 | "GrokCmdArgs", 29 | "GrokTestDefinition", 30 | "JaxFdl", 31 | "JaxToolboxCmdArgs", 32 | "JaxToolboxGradingStrategy", 33 | "JaxToolboxReportGenerationStrategy", 34 | "JaxToolboxSlurmCommandGenStrategy", 35 | "JaxToolboxTestDefinition", 36 | "NemotronCmdArgs", 37 | "NemotronTestDefinition", 38 | ] 39 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy Documentation 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | workflow_dispatch: # Allow manual triggering 7 | 8 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 9 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
10 | concurrency: 11 | group: "pages" 12 | cancel-in-progress: false 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v4 20 | 21 | - name: Install uv 22 | uses: astral-sh/setup-uv@v5 23 | with: 24 | enable-cache: true 25 | 26 | - name: Set up environment 27 | run: | 28 | set -eE 29 | set -o pipefail 30 | 31 | uv sync --extra docs 32 | 33 | - name: Build documentation 34 | run: | 35 | set -eE 36 | set -o pipefail 37 | 38 | source .venv/bin/activate 39 | cd doc 40 | make html 41 | 42 | - name: Add .nojekyll file 43 | run: touch doc/_build/html/.nojekyll 44 | 45 | - name: Upload artifact 46 | uses: actions/upload-pages-artifact@v3 47 | with: 48 | path: ./doc/_build/html 49 | 50 | deploy: 51 | name: Deploy to GitHub Pages 52 | needs: build 53 | 54 | permissions: 55 | pages: write 56 | id-token: write 57 | 58 | environment: 59 | name: github-pages 60 | url: ${{ steps.deployment.outputs.page_url }} 61 | 62 | runs-on: ubuntu-latest 63 | steps: 64 | - name: Deploy to GitHub Pages 65 | id: deployment 66 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /doc/workloads/bash_cmd.rst: -------------------------------------------------------------------------------- 1 | Bash Command 2 | ============ 3 | 4 | This workload (``test_template_name`` is ``BashCmd``) allows users to execute arbitrary bash commands within the CloudAI framework. This is useful for simple scripts, custom testing commands, or integrating external tools. 5 | 6 | The ``cmd`` value specified in the ``cmd_args`` section is inserted as-is into the generated sbatch script. 7 | 8 | Usage Examples 9 | -------------- 10 | 11 | Test TOML example: 12 | 13 | .. code-block:: toml 14 | 15 | name = "my_bash_test" 16 | description = "Example bash command test" 17 | test_template_name = "BashCmd" 18 | 19 | [cmd_args] 20 | cmd = "echo 'Hello from CloudAI!'" 21 | 22 | Test Scenario example: 23 | 24 | .. 
code-block:: toml 25 | 26 | name = "bash-test" 27 | 28 | [[Tests]] 29 | id = "bash.1" 30 | num_nodes = 1 31 | time_limit = "00:05:00" 32 | 33 | test_name = "my_bash_test" 34 | 35 | Test-in-Scenario example: 36 | 37 | .. code-block:: toml 38 | 39 | name = "bash-test" 40 | 41 | [[Tests]] 42 | id = "bash.1" 43 | num_nodes = 1 44 | time_limit = "00:05:00" 45 | 46 | name = "my_bash_test" 47 | description = "Example bash command test" 48 | test_template_name = "BashCmd" 49 | 50 | [Tests.cmd_args] 51 | cmd = "echo 'Hello from CloudAI!'" 52 | 53 | API Documentation 54 | --------------------------------- 55 | 56 | Command Arguments 57 | ~~~~~~~~~~~~~~~~~ 58 | 59 | .. autoclass:: cloudai.workloads.bash_cmd.bash_cmd.BashCmdArgs 60 | :members: 61 | :show-inheritance: 62 | 63 | Test Definition 64 | ~~~~~~~~~~~~~~~ 65 | 66 | .. autoclass:: cloudai.workloads.bash_cmd.bash_cmd.BashCmdTestDefinition 67 | :members: 68 | :show-inheritance: 69 | -------------------------------------------------------------------------------- /doc/workloads/slurm_container.rst: -------------------------------------------------------------------------------- 1 | Slurm Container 2 | =============== 3 | 4 | This workload (`test_template_name` is ``SlurmContainer``) executes containerized workloads using Slurm with custom container configurations. 5 | 6 | Usage Examples 7 | -------------- 8 | 9 | Test TOML example: 10 | 11 | .. code-block:: toml 12 | 13 | name = "my_container_test" 14 | description = "Example Slurm container test" 15 | test_template_name = "SlurmContainer" 16 | 17 | [cmd_args] 18 | image_path = "/path/to/container.sqsh" 19 | cmd = "python train.py" 20 | 21 | Test Scenario example: 22 | 23 | .. code-block:: toml 24 | 25 | name = "slurm-container-test" 26 | 27 | [[Tests]] 28 | id = "container.1" 29 | num_nodes = 2 30 | time_limit = "01:00:00" 31 | 32 | test_name = "my_container_test" 33 | 34 | Test-in-Scenario example: 35 | 36 | .. 
code-block:: toml 37 | 38 | name = "slurm-container-test" 39 | 40 | [[Tests]] 41 | id = "container.1" 42 | num_nodes = 2 43 | time_limit = "01:00:00" 44 | 45 | name = "my_container_test" 46 | description = "Example Slurm container test" 47 | test_template_name = "SlurmContainer" 48 | 49 | [Tests.cmd_args] 50 | image_path = "/path/to/container.sqsh" 51 | cmd = "python train.py" 52 | 53 | API Documentation 54 | ----------------- 55 | 56 | Command Arguments 57 | ~~~~~~~~~~~~~~~~~ 58 | 59 | .. autoclass:: cloudai.workloads.slurm_container.slurm_container.SlurmContainerCmdArgs 60 | :members: 61 | :show-inheritance: 62 | 63 | Test Definition 64 | ~~~~~~~~~~~~~~~ 65 | 66 | .. autoclass:: cloudai.workloads.slurm_container.slurm_container.SlurmContainerTestDefinition 67 | :members: 68 | :show-inheritance: 69 | -------------------------------------------------------------------------------- /doc/workloads/index.rst: -------------------------------------------------------------------------------- 1 | Workloads Documentation 2 | ======================= 3 | 4 | This section contains automatically generated documentation for all CloudAI workloads. Each workload provides specific functionality for running different types of tests and benchmarks. 5 | 6 | Available Workloads 7 | ------------------- 8 | 9 | .. 
csv-table:: 10 | :header: "Test", "Slurm", "Kubernetes", "RunAI", "Standalone" 11 | :widths: 40, 15, 15, 15, 15 12 | 13 | ":doc:`aiconfigurator`", "❌", "❌", "❌", "✅" 14 | ":doc:`ai_dynamo`", "✅", "✅", "❌", "❌" 15 | ":doc:`bash_cmd`", "✅", "❌", "❌", "❌" 16 | ":doc:`chakra_replay`", "✅", "❌", "❌", "❌" 17 | ":doc:`ddlb`", "✅", "❌", "❌", "❌" 18 | ":doc:`deepep`", "✅", "❌", "❌", "❌" 19 | ":doc:`jax_toolbox`", "✅", "❌", "❌", "❌" 20 | "MegatronRun", "✅", "❌", "❌", "❌" 21 | ":doc:`nccl`", "✅", "✅", "✅", "❌" 22 | ":doc:`nemo_launcher`", "✅", "❌", "❌", "❌" 23 | ":doc:`nemo_run`", "✅", "❌", "❌", "❌" 24 | ":doc:`nixl_bench`", "✅", "❌", "❌", "❌" 25 | ":doc:`nixl_kvbench`", "✅", "❌", "❌", "❌" 26 | ":doc:`nixl_perftest`", "✅", "❌", "❌", "❌" 27 | ":doc:`sleep`", "✅", "✅", "❌", "✅" 28 | ":doc:`slurm_container`", "✅", "❌", "❌", "❌" 29 | "Triton Inference", "✅", "❌", "❌", "❌" 30 | ":doc:`ucc`", "✅", "❌", "❌", "❌" 31 | 32 | .. toctree:: 33 | :hidden: 34 | :glob: 35 | 36 | * 37 | 38 | Adding New Workloads 39 | --------------------- 40 | 41 | To add documentation for a new workload: 42 | 43 | 1. Add docstrings to your Python classes and methods. 44 | 2. Create a markdown file in ``doc/workloads/`` (e.g., ``my_workload.md``). 45 | 3. Add it to the table above. 46 | 47 | The documentation will be automatically generated during the build process. 48 | -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_launcher_nemotron_15b_fp8" 18 | 19 | [[Tests]] 20 | id = "nemo_launcher_nemotron_15b_fp8_2_node" 21 | test_name = "nemo_launcher_nemotron_15b_fp8_2_node" 22 | num_nodes = "2" 23 | 24 | [[Tests]] 25 | id = "nemo_launcher_nemotron_15b_fp8_4_node" 26 | test_name = "nemo_launcher_nemotron_15b_fp8_4_node" 27 | num_nodes = "4" 28 | 29 | [[Tests]] 30 | id = "nemo_launcher_nemotron_15b_fp8_8_node" 31 | test_name = "nemo_launcher_nemotron_15b_fp8_8_node" 32 | num_nodes = "8" 33 | 34 | [[Tests]] 35 | id = "nemo_launcher_nemotron_15b_fp8_16_node" 36 | test_name = "nemo_launcher_nemotron_15b_fp8_16_node" 37 | num_nodes = "16" 38 | 39 | [[Tests]] 40 | id = "nemo_launcher_nemotron_15b_fp8_32_node" 41 | test_name = "nemo_launcher_nemotron_15b_fp8_32_node" 42 | num_nodes = "32" 43 | 44 | [[Tests]] 45 | id = "nemo_launcher_nemotron_15b_fp8_64_node" 46 | test_name = "nemo_launcher_nemotron_15b_fp8_64_node" 47 | num_nodes = "64" 48 | -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/prediction_report_generation_strategy.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
class NcclTestPredictionReportGenerationStrategy(NcclTestReportGenerationStrategy):
    """Strategy for generating prediction reports from NCCL test outputs."""

    def __init__(self, system: System, tr: TestRun) -> None:
        """Create the prediction report generator for this test run.

        Args:
            system: Forwarded unchanged to the base strategy.
            tr: Test run; ``tr.test.cmd_args.subtest_name`` determines the
                collective type handed to the generator.
        """
        super().__init__(system, tr)

        subtest = tr.test.cmd_args.subtest_name
        self.prediction_report = NcclTestPredictionReportGenerator(
            self._normalize_collective_type(subtest),
            tr,
        )

    def _normalize_collective_type(self, subtest_name: str) -> str:
        """Return *subtest_name* with every ``_perf`` and ``_mpi`` occurrence removed."""
        normalized = subtest_name
        # Order matters only for readability here: each marker is stripped wherever it occurs.
        for marker in ("_perf", "_mpi"):
            normalized = normalized.replace(marker, "")
        return normalized

    def generate_report(self) -> None:
        """Delegate report generation to the prediction report generator."""
        self.prediction_report.generate()
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "dynamo-vllm-slurm" 18 | job_status_check = false 19 | 20 | [[Tests]] 21 | id = "test.disagg.single-node" 22 | test_name = "vLLM-Qwen3-0.6B" 23 | num_nodes = 2 # 1 prefill node + 1 decode node 24 | time_limit = "00:10:00" 25 | 26 | [Tests.cmd_args.dynamo.prefill_worker] 27 | num-nodes = 1 28 | tensor-parallel-size = 4 29 | pipeline-parallel-size = 1 30 | 31 | [Tests.cmd_args.dynamo.decode_worker] 32 | num-nodes = 1 33 | tensor-parallel-size = 4 34 | pipeline-parallel-size = 1 35 | 36 | [[Tests]] 37 | id = "test.disagg.multinode" 38 | test_name = "vLLM-Qwen3-0.6B" 39 | num_nodes = 4 # 2 prefill nodes + 2 decode nodes 40 | time_limit = "00:10:00" 41 | 42 | [Tests.cmd_args.dynamo.prefill_worker] 43 | num-nodes = 2 44 | tensor-parallel-size = 4 45 | pipeline-parallel-size = 1 46 | 47 | [Tests.cmd_args.dynamo.decode_worker] 48 | num-nodes = 2 49 | tensor-parallel-size = 4 50 | pipeline-parallel-size = 1 51 | -------------------------------------------------------------------------------- /conf/experimental/aiconfigurator/test/dse_aiconfigurator_disagg.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "dse_aiconfigurator_disagg_demo_Llama3.1_70B" 18 | description = "Aiconfigurator disaggregated predictor DSE sweeps" 19 | test_template_name = "Aiconfigurator" 20 | agent_metrics = [ 21 | "ttft_ms", 22 | "tpot_ms", 23 | "tokens_per_s_per_gpu", 24 | "tokens_per_s_per_user", 25 | ] 26 | agent_reward_function = "ai_dynamo_log_scale" 27 | 28 | 29 | [cmd_args] 30 | model_name = "LLAMA3.1_70B" 31 | system = "h200_sxm" 32 | # backend and version use defaults 33 | isl = 4000 34 | osl = 500 35 | 36 | [cmd_args.disagg] 37 | p_tp = [1] 38 | p_pp = [1] 39 | p_dp = [1] 40 | p_bs = 1 41 | p_workers = [1, 2] 42 | 43 | d_tp = [1] 44 | d_pp = [1] 45 | d_dp = [1] 46 | d_bs = [8] 47 | d_workers = [1, 2] 48 | 49 | gemm_quant_mode = "fp8_block" 50 | moe_quant_mode = "fp8" 51 | kvcache_quant_mode = "fp8" 52 | fmha_quant_mode = "fp8" 53 | comm_quant_mode = "half" 54 | prefill_correction_scale = 1.0 55 | decode_correction_scale = 1.0 56 | -------------------------------------------------------------------------------- /tests/ref_data/megatron-run.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # generated by CloudAI@__CLOUDAI_VERSION__ 3 | #SBATCH --job-name=__JOB_NAME__ 4 | #SBATCH --output=__OUTPUT_DIR__/output/stdout.txt 5 | #SBATCH --error=__OUTPUT_DIR__/output/stderr.txt 6 | #SBATCH --partition=main 7 | #SBATCH -N 1 8 | #SBATCH --gpus-per-node=8 9 | #SBATCH --gres=gpu:8 10 | 11 | export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) 
12 | 13 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,$PWD --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." 14 | 15 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,$PWD --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh 16 | 17 | srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/megatron:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,$PWD bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python __CLOUDAI_DIR__/run.py --global-batch-size 16 --hidden-size 4096 --max-position-embeddings 4096 --num-attention-heads 32 --num-layers 32 --pipeline-model-parallel-size 1 --recompute-activations --seq-length 4096 --tensor-model-parallel-size 2 --save __CLOUDAI_DIR__ --load __CLOUDAI_DIR__ --tokenizer-model __CLOUDAI_DIR__/model.m" 18 | -------------------------------------------------------------------------------- /src/cloudai/util/nixl_report_template.jinja2: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 7 | 8 | 9 | 10 | 45 | {{ bokeh_script | safe }} 46 | 47 | 48 | 49 |

{{ title }}

50 | 51 |
52 |

Interactive Charts

53 |

Use the interactive tools to zoom, pan, and hover over data points. Click on legend items 54 | to show/hide lines.

55 | {{ bokeh_div | safe }} 56 |
57 | 58 |
59 | {{ rich_html | safe }} 60 |
61 | 62 | 63 | -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .grading_strategy import NcclTestGradingStrategy 18 | from .kubernetes_json_gen_strategy import NcclTestKubernetesJsonGenStrategy 19 | from .nccl import NCCLCmdArgs, NCCLTestDefinition 20 | from .nccl_comparison_report import ComparisonReportConfig, NcclComparisonReport 21 | from .performance_report_generation_strategy import NcclTestPerformanceReportGenerationStrategy 22 | from .prediction_report_generation_strategy import NcclTestPredictionReportGenerationStrategy 23 | from .runai_json_gen_strategy import NcclTestRunAIJsonGenStrategy 24 | from .slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy 25 | 26 | __all__ = [ 27 | "ComparisonReportConfig", 28 | "NCCLCmdArgs", 29 | "NCCLTestDefinition", 30 | "NcclComparisonReport", 31 | "NcclTestGradingStrategy", 32 | "NcclTestKubernetesJsonGenStrategy", 33 | "NcclTestPerformanceReportGenerationStrategy", 34 | "NcclTestPredictionReportGenerationStrategy", 35 | "NcclTestRunAIJsonGenStrategy", 36 | "NcclTestSlurmCommandGenStrategy", 37 | ] 38 | -------------------------------------------------------------------------------- /src/cloudai/util/base-report.jinja2: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{ name }} 5 | 51 | {% block extra_head %}{% endblock %} 52 | 53 | 54 |

{{ name }}

55 | {% block content %}{% endblock %} 56 | 57 | -------------------------------------------------------------------------------- /conf/common/test/nemo_run_llama3_8b.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | name = "nemo_run_llama3_8b" 18 | description = "nemo_run_llama3_8b" 19 | test_template_name = "NeMoRun" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/nemo:25.09.00" 23 | task = "pretrain" 24 | recipe_name = "cloudai_llama3_8b_recipe" 25 | 26 | [cmd_args.data] 27 | seq_length = 8192 28 | micro_batch_size = 1 29 | global_batch_size = 128 30 | 31 | [cmd_args.trainer] 32 | max_steps = 10 33 | val_check_interval = 100 34 | num_nodes = 1 35 | 36 | [cmd_args.trainer.strategy] 37 | tensor_model_parallel_size = 4 38 | pipeline_model_parallel_size = 1 39 | context_parallel_size = 1 40 | 41 | [extra_env_vars] 42 | NCCL_P2P_NET_CHUNKSIZE = "2097152" 43 | TORCHX_MAX_RETRIES = "0" 44 | TRANSFORMERS_OFFLINE = "0" 45 | NCCL_NVLS_ENABLE = "0" 46 | NVTE_DP_AMAX_REDUCE_INTERVAL = "0" 47 | NVTE_ASYNC_AMAX_REDUCTION = "1" 48 | NVTE_FUSED_ATTN = "1" 49 | NVTE_FLASH_ATTN = "1" 50 | NEMO_LOG_MEMORY_USAGE = "1" 51 | CUDA_DEVICE_MAX_CONNECTIONS = "1" 52 | NVTE_FWD_LAYERNORM_SM_MARGIN = "16" 53 | NVTE_BWD_LAYERNORM_SM_MARGIN = "16" 54 | -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test/nemo_run_llama3_8b.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "nemo_run_llama3_8b" 18 | description = "dse_nemo_run_llama3_8b" 19 | test_template_name = "NeMoRun" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io#nvidia/nemo:25.07" 23 | task = "pretrain" 24 | recipe_name = "cloudai_llama3_8b_recipe" 25 | 26 | [cmd_args.data] 27 | seq_length = 8192 28 | micro_batch_size = 1 29 | global_batch_size = 128 30 | 31 | [cmd_args.trainer] 32 | max_steps = 30 33 | val_check_interval = 30 34 | num_nodes = 1 35 | 36 | [cmd_args.trainer.strategy] 37 | tensor_model_parallel_size = 4 38 | pipeline_model_parallel_size = 1 39 | context_parallel_size = 2 40 | 41 | [extra_env_vars] 42 | NCCL_P2P_NET_CHUNKSIZE = "2097152" 43 | TORCHX_MAX_RETRIES = "0" 44 | TRANSFORMERS_OFFLINE = "0" 45 | NCCL_NVLS_ENABLE = "0" 46 | NVTE_DP_AMAX_REDUCE_INTERVAL = "0" 47 | NVTE_ASYNC_AMAX_REDUCTION = "1" 48 | NVTE_FUSED_ATTN = "1" 49 | NVTE_FLASH_ATTN = "1" 50 | NEMO_LOG_MEMORY_USAGE = "1" 51 | CUDA_DEVICE_MAX_CONNECTIONS = "1" 52 | NVTE_FWD_LAYERNORM_SM_MARGIN = "16" 53 | NVTE_BWD_LAYERNORM_SM_MARGIN = "16" 54 | -------------------------------------------------------------------------------- /conf/common/system/example_slurm_cluster.toml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "example-cluster" 18 | scheduler = "slurm" 19 | 20 | install_path = "./install_dir" 21 | output_path = "./results" 22 | default_partition = "partition_1" 23 | 24 | mpi = "pmix" 25 | gpus_per_node = 8 26 | ntasks_per_node = 8 27 | 28 | [[partitions]] 29 | name = "partition_1" 30 | 31 | [[partitions.groups]] 32 | name = "group_1" 33 | nodes = ["node-[001-025]"] 34 | 35 | [[partitions.groups]] 36 | name = "group_2" 37 | nodes = ["node-[026-050]"] 38 | 39 | [[partitions.groups]] 40 | name = "group_3" 41 | nodes = ["node-[051-075]"] 42 | 43 | [[partitions.groups]] 44 | name = "group_4" 45 | nodes = ["node-[076-100]"] 46 | 47 | [[partitions]] 48 | name = "partition_2" 49 | 50 | [data_repository] 51 | endpoint = "MY_ENDPOINT" 52 | verify_certs = false 53 | 54 | [global_env_vars] 55 | # NCCL Specific Configurations 56 | NCCL_IB_GID_INDEX = "3" 57 | NCCL_IB_TIMEOUT = "20" 58 | NCCL_IB_QPS_PER_CONNECTION = "4" 59 | 60 | # Device Visibility Configuration 61 | MELLANOX_VISIBLE_DEVICES = "0,3,4,5,6,9,10,11" 62 | CUDA_VISIBLE_DEVICES = "0,1,2,3,4,5,6,7" 63 | -------------------------------------------------------------------------------- /doc/workloads/nixl_kvbench.rst: -------------------------------------------------------------------------------- 1 | NIXL KVBench 2 | ============ 3 | 4 | This workload (`test_template_name` is ``NIXLKVBench``) runs NIXL KV-cache benchmarking for key-value store performance testing. 5 | 6 | Usage Examples 7 | -------------- 8 | 9 | Test TOML example: 10 | 11 | .. 
code-block:: toml 12 | 13 | name = "my_nixl_kvbench_test" 14 | description = "Example NIXL KVBench test" 15 | test_template_name = "NIXLKVBench" 16 | 17 | [cmd_args] 18 | docker_image_url = "..." 19 | model = "./examples/model_deepseek_r1.yaml" 20 | model_config = "./examples/block-tp1-pp16.yaml" 21 | backend = "POSIX" 22 | num_requests = 1 23 | source = "file" 24 | num_iter = 16 25 | page_size = 256 26 | filepath = "/data" 27 | 28 | Test Scenario example: 29 | 30 | .. code-block:: toml 31 | 32 | name = "nixl-kvbench-test" 33 | 34 | [[Tests]] 35 | id = "kvbench.1" 36 | num_nodes = 1 37 | time_limit = "00:10:00" 38 | 39 | test_name = "my_nixl_kvbench_test" 40 | 41 | Test-in-Scenario example: 42 | 43 | .. code-block:: toml 44 | 45 | name = "nixl-kvbench-test" 46 | 47 | [[Tests]] 48 | id = "kvbench.1" 49 | num_nodes = 1 50 | time_limit = "00:10:00" 51 | 52 | name = "my_nixl_kvbench_test" 53 | description = "Example NIXL KVBench test" 54 | test_template_name = "NIXLKVBench" 55 | 56 | [Tests.cmd_args] 57 | docker_image_url = "..." 58 | backend = "UCX" 59 | source = "memory" 60 | op_type = "READ" 61 | 62 | API Documentation 63 | ----------------- 64 | 65 | Command Arguments 66 | ~~~~~~~~~~~~~~~~~ 67 | 68 | .. autoclass:: cloudai.workloads.nixl_kvbench.nixl_kvbench.NIXLKVBenchCmdArgs 69 | :members: 70 | :show-inheritance: 71 | 72 | Test Definition 73 | ~~~~~~~~~~~~~~~ 74 | 75 | .. autoclass:: cloudai.workloads.nixl_kvbench.nixl_kvbench.NIXLKVBenchTestDefinition 76 | :members: 77 | :show-inheritance: 78 | -------------------------------------------------------------------------------- /doc/workloads/nixl_bench.rst: -------------------------------------------------------------------------------- 1 | NIXL Bench 2 | ========== 3 | 4 | This workload (`test_template_name` is ``NIXLBench``) runs NIXL benchmarking suite for network and interconnect performance testing. 5 | 6 | Usage Examples 7 | -------------- 8 | 9 | Test TOML example: 10 | 11 | .. 
code-block:: toml 12 | 13 | name = "my_nixl_bench_test" 14 | description = "Example NIXL Bench test" 15 | test_template_name = "NIXLBench" 16 | 17 | [cmd_args] 18 | docker_image_url = "..." 19 | path_to_benchmark = "/workspace/nixlbench/build/nixlbench" 20 | backend = "UCX" 21 | initiator_seg_type = "VRAM" 22 | target_seg_type = "VRAM" 23 | op_type = "READ" 24 | 25 | Test Scenario example: 26 | 27 | .. code-block:: toml 28 | 29 | name = "nixl-bench-test" 30 | 31 | [[Tests]] 32 | id = "bench.1" 33 | num_nodes = 1 34 | time_limit = "00:10:00" 35 | 36 | test_name = "my_nixl_bench_test" 37 | 38 | Test-in-Scenario example: 39 | 40 | .. code-block:: toml 41 | 42 | name = "nixl-bench-test" 43 | 44 | [[Tests]] 45 | id = "bench.1" 46 | num_nodes = 1 47 | time_limit = "00:10:00" 48 | 49 | name = "my_nixl_bench_test" 50 | description = "Example NIXL Bench test" 51 | test_template_name = "NIXLBench" 52 | 53 | [Tests.cmd_args] 54 | docker_image_url = "..." 55 | path_to_benchmark = "/workspace/nixlbench/build/nixlbench" 56 | backend = "UCX" 57 | initiator_seg_type = "DRAM" 58 | target_seg_type = "DRAM" 59 | op_type = "WRITE" 60 | 61 | API Documentation 62 | ----------------- 63 | 64 | Command Arguments 65 | ~~~~~~~~~~~~~~~~~ 66 | 67 | .. autoclass:: cloudai.workloads.nixl_bench.nixl_bench.NIXLBenchCmdArgs 68 | :members: 69 | :show-inheritance: 70 | 71 | Test Definition 72 | ~~~~~~~~~~~~~~~ 73 | 74 | .. autoclass:: cloudai.workloads.nixl_bench.nixl_bench.NIXLBenchTestDefinition 75 | :members: 76 | :show-inheritance: 77 | -------------------------------------------------------------------------------- /doc/workloads/ddlb.rst: -------------------------------------------------------------------------------- 1 | DDLB 2 | ==== 3 | 4 | This workload (`test_template_name` is ``DDLB``) allows users to execute DDLB (Distributed Deep Learning Benchmarks) within the CloudAI framework. Please find the DDLB README at https://github.com/samnordmann/ddlb. 
5 | 6 | Usage Examples 7 | -------------- 8 | 9 | Test TOML example: 10 | 11 | .. code-block:: toml 12 | 13 | name = "my_ddlb_test" 14 | description = "Example DDLB test" 15 | test_template_name = "DDLB" 16 | 17 | [cmd_args] 18 | docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" 19 | primitive = "tp_columnwise" 20 | dtype = "float16" 21 | 22 | Test Scenario example: 23 | 24 | .. code-block:: toml 25 | 26 | name = "ddlb-test" 27 | 28 | [[Tests]] 29 | id = "ddlb.1" 30 | num_nodes = 1 31 | time_limit = "00:10:00" 32 | 33 | test_name = "my_ddlb_test" 34 | 35 | Test-in-Scenario example: 36 | 37 | .. code-block:: toml 38 | 39 | name = "ddlb-test" 40 | 41 | [[Tests]] 42 | id = "ddlb.1" 43 | num_nodes = 1 44 | time_limit = "00:10:00" 45 | 46 | name = "my_ddlb_test" 47 | description = "Example DDLB test" 48 | test_template_name = "DDLB" 49 | 50 | [Tests.cmd_args] 51 | docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" 52 | primitive = "tp_columnwise" 53 | m = 1024 54 | n = 128 55 | k = 1024 56 | dtype = "float16" 57 | num_iterations = 50 58 | num_warmups = 5 59 | impl = "pytorch;backend=nccl;order=AG_before" 60 | 61 | API Documentation 62 | --------------------------------- 63 | 64 | Command Arguments 65 | ~~~~~~~~~~~~~~~~~ 66 | 67 | .. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBCmdArgs 68 | :members: 69 | :show-inheritance: 70 | 71 | Test Definition 72 | ~~~~~~~~~~~~~~~ 73 | 74 | .. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBTestDefinition 75 | :members: 76 | :show-inheritance: 77 | 78 | -------------------------------------------------------------------------------- /doc/workloads/osu.rst: -------------------------------------------------------------------------------- 1 | OSU 2 | === 3 | 4 | This workload (``test_template_name`` is ``OSUBench``) allows you to execute OSU Micro Benchmarks 5 | within the CloudAI framework. 6 | 7 | Usage example 8 | ------------- 9 | 10 | Test example: 11 | 12 | .. 
code-block:: toml 13 | 14 | name = "osu_example" 15 | test_template_name = "OSUBench" 16 | description = "OSU Benchmark example" 17 | 18 | [cmd_args] 19 | "docker_image_url" = "docker-image-with-osu-benchmark:latest" 20 | "benchmarks_dir" = "/directory/with/osu/binaries/in/container" 21 | "benchmark" = ["osu_allreduce", "osu_allgather"] 22 | "iterations" = 10 23 | "message_size" = "1024" 24 | 25 | Test Scenario example: 26 | 27 | .. code-block:: toml 28 | 29 | name = "osu_example" 30 | 31 | [[Tests]] 32 | id = "Tests.1" 33 | test_name = "osu_example" 34 | num_nodes = "2" 35 | time_limit = "00:20:00" 36 | 37 | Test-in-Scenario example: 38 | 39 | .. code-block:: toml 40 | 41 | name = "osu-test" 42 | 43 | [[Tests]] 44 | id = "Tests.osu_allreduce" 45 | num_nodes = 2 46 | time_limit = "00:05:00" 47 | 48 | name = "osu_example" 49 | description = "OSU allreduce 1KB" 50 | test_template_name = "OSUBench" 51 | 52 | [Tests.cmd_args] 53 | docker_image_url = "docker-image-with-osu-benchmark:latest" 54 | benchmarks_dir = "/directory/with/osu/binaries/in/container" 55 | benchmark = "osu_allreduce" 56 | iterations = 10 57 | message_size = "1024" 58 | 59 | API Documentation 60 | ----------------- 61 | 62 | Command Arguments 63 | ~~~~~~~~~~~~~~~~~ 64 | 65 | .. autoclass:: cloudai.workloads.osu_bench.osu_bench.OSUBenchCmdArgs 66 | :members: 67 | :show-inheritance: 68 | 69 | Test Definition 70 | ~~~~~~~~~~~~~~~ 71 | 72 | .. autoclass:: cloudai.workloads.osu_bench.osu_bench.OSUBenchTestDefinition 73 | :members: 74 | :show-inheritance: 75 | -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/report_generation_strategy.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
class NcclTestReportGenerationStrategy(ReportGenerationStrategy):
    """Base strategy for generating reports from NCCL test outputs."""

    def __init__(self, system: System, tr: TestRun) -> None:
        super().__init__(system, tr)

    def can_handle_directory(self) -> bool:
        """Report whether the run's ``stdout.txt`` looks like NCCL test output.

        Returns:
            ``True`` when ``stdout.txt`` exists under the test run's output path
            and contains both an ``out-of-place``/``in-place`` marker and the
            NCCL results-table column header; ``False`` otherwise.
        """
        log_file = self.test_run.output_path / "stdout.txt"
        if not log_file.exists():
            return False

        text = log_file.read_text()

        # Cheap marker check first; the header regex only runs when it succeeds.
        if not re.search(r"out-of-place|in-place", text):
            return False

        table_header = re.search(
            r"\b(size\s+count\s+type\s+redop\s+root\s+"
            r"time\s+algbw\s+busbw\s+#wrong\s+time\s+"
            r"algbw\s+busbw\s+#wrong)\b",
            text,
            re.IGNORECASE,
        )
        return table_header is not None
class MegatronRunSlurmCommandGenStrategy(SlurmCommandGenStrategy):
    """Builds the Slurm-side command line for a MegatronRun test."""

    def image_path(self) -> str | None:
        """Return the installed path of the test's Docker image as a string."""
        test_def = cast(MegatronRunTestDefinition, self.test_run.test)
        return str(test_def.docker_image.installed_path)

    def _container_mounts(self) -> list[str]:
        """Return an empty list: this strategy contributes no container mounts."""
        return []

    def generate_test_command(self) -> list[str]:
        """Assemble the command that runs the test's script.

        Returns:
            ``["python", <absolute run_script path>]`` followed by one
            ``"<key> <value>"`` entry per item in ``cmd_args_dict``, plus the
            extra-arguments string when ``extra_cmd_args`` is truthy.
        """
        test_def = cast(MegatronRunTestDefinition, self.test_run.test)

        script = str((test_def.cmd_args.run_script).absolute())
        command = ["python", script]
        command.extend(f"{name} {value}" for name, value in test_def.cmd_args_dict.items())

        # The extra-args string is appended as one element, not split into tokens.
        if test_def.extra_cmd_args:
            command.append(test_def.extra_args_str)

        return command
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "ucc_test" 18 | 19 | [[Tests]] 20 | id = "Tests.alltoall" 21 | test_name = "ucc_base_test" 22 | description = "UCC alltoall" 23 | time_limit = "00:20:00" 24 | num_nodes = 2 25 | [Tests.cmd_args] 26 | collective = "alltoall" 27 | 28 | [[Tests]] 29 | id = "Tests.allgather" 30 | test_name = "ucc_base_test" 31 | description = "UCC allgather" 32 | time_limit = "00:20:00" 33 | num_nodes = 2 34 | [Tests.cmd_args] 35 | collective = "allgather" 36 | [[Tests.dependencies]] 37 | type = "start_post_comp" 38 | id = "Tests.alltoall" 39 | 40 | [[Tests]] 41 | id = "Tests.allreduce" 42 | test_name = "ucc_base_test" 43 | description = "UCC allreduce" 44 | time_limit = "00:20:00" 45 | num_nodes = 2 46 | [Tests.cmd_args] 47 | collective = "allreduce" 48 | e = "4G" 49 | [[Tests.dependencies]] 50 | type = "start_post_comp" 51 | id = "Tests.allgather" 52 | 53 | [[Tests]] 54 | id = "Tests.reduce_scatter" 55 | test_name = "ucc_base_test" 56 | description = "UCC reduce_scatter" 57 | time_limit = "00:20:00" 58 | num_nodes = 2 59 | [Tests.cmd_args] 60 | collective = "reduce_scatter" 61 | [[Tests.dependencies]] 62 | type = "start_post_comp" 63 | id = "Tests.allreduce" 64 | -------------------------------------------------------------------------------- /conf/experimental/ai_dynamo/test/vllm.toml: 
-------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "vLLM-Qwen3-0.6B" 18 | description = "vLLM backend with Qwen3-0.6B model" 19 | test_template_name = "AIDynamo" 20 | 21 | [cmd_args] 22 | docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.0" 23 | 24 | [cmd_args.dynamo] 25 | backend = "vllm" 26 | model = "Qwen/Qwen3-0.6B" 27 | workspace-path = "/workspace/examples/backends/vllm" 28 | prefill-cmd = 'python3 -m dynamo.vllm --is-prefill-worker' 29 | decode-cmd = 'python3 -m dynamo.vllm' 30 | 31 | [cmd_args.dynamo.decode_worker] 32 | pipeline-parallel-size = 1 33 | 34 | [cmd_args.genai_perf] 35 | model = "Qwen/Qwen3-0.6B" 36 | endpoint = "v1/chat/completions" 37 | endpoint-type = "chat" 38 | extra-inputs = 'min_tokens:10' 39 | output-tokens-mean = 500 40 | output-tokens-stddev = 0 41 | random-seed = 123 42 | request-count = 50 43 | synthetic-input-tokens-mean = 300 44 | synthetic-input-tokens-stddev = 0 45 | warmup-request-count = 5 46 | concurrency = 2 47 | extra-args = "--streaming -- -v --async" 48 | 49 | [extra_env_vars] 50 | UCX_LOG_LEVEL = "warn" 51 | UCX_TLS = "cuda_copy,rc_x" 52 | DYNAMO_NODELIST = "$(scontrol show hostname 
$SLURM_JOB_NODELIST | tr -s '\\n' ',')" 53 | -------------------------------------------------------------------------------- /src/cloudai/workloads/chakra_replay/chakra_replay.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES 2 | # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
class ChakraReplayCmdArgs(CmdArgs):
    """Command arguments accepted by a ChakraReplay test."""

    # Required; the test definition builds its DockerImage from this URL.
    docker_image_url: str
    mpi: str = "pmix"
    trace_type: str = "et"
    trace_path: Optional[str] = None
    num_replays: int = 1


class ChakraReplayTestDefinition(TestDefinition):
    """Test definition for ChakraReplay."""

    cmd_args: ChakraReplayCmdArgs
    _docker_image: Optional[DockerImage] = None

    @property
    def docker_image(self) -> DockerImage:
        """Return the Docker image for this test, creating it on first access."""
        existing = self._docker_image
        if existing:
            return existing
        self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
        return self._docker_image

    @property
    def installables(self) -> list[Installable]:
        """Return the installable items for this test: only the Docker image."""
        return [self.docker_image]

    @property
    def extra_args_str(self) -> str:
        """Render ``extra_cmd_args`` as one space-separated string.

        An entry with a truthy value becomes ``"<key> <value>"``; an entry with
        a falsy value contributes the key alone.
        """
        return " ".join(
            f"{name} {value}" if value else name for name, value in self.extra_cmd_args.items()
        )
@pytest.fixture
def tr(slurm_system: SlurmSystem) -> TestRun:
    """Provide a single-node NCCL test run that writes to the Slurm system's output path."""
    nccl_args = NCCLCmdArgs(docker_image_url="fake://url/nccl")
    nccl_def = NCCLTestDefinition(
        name="nccl",
        description="NCCL Test",
        test_template_name="NcclTest",
        cmd_args=nccl_args,
    )
    return TestRun(
        name="test_run",
        test=nccl_def,
        num_nodes=1,
        nodes=[],
        output_path=slurm_system.output_path,
    )


def test_is_dse_job_non_dse(tr: TestRun):
    """The unmodified fixture run is not reported as a DSE job."""
    assert tr.is_dse_job is False


def test_is_dse_job_dse_args(tr: TestRun):
    """A list-valued command argument makes the run a DSE job, even with scalar env vars."""
    tr.test.cmd_args.nthreads = [1, 2]
    tr.test.extra_env_vars = {"VAR1": "singular"}
    assert tr.is_dse_job is True


def test_is_dse_job_dse_env_vars(tr: TestRun):
    """A list-valued environment variable makes the run a DSE job."""
    tr.test.extra_env_vars = {"VAR1": ["list-item1", "list-item2"], "VAR2": "singular3"}
    assert tr.is_dse_job is True


def test_is_dse_job_num_nodes(tr: TestRun):
    """A list-valued ``num_nodes`` makes the run a DSE job."""
    tr.num_nodes = [1, 2]
    assert tr.is_dse_job is True
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | name = "gpt3_126m_mock" 18 | description = "gpt3_126m_mock" 19 | test_template_name = "NeMoLauncher" 20 | 21 | [cmd_args] 22 | [cmd_args.training] 23 | values = "gpt3/126m" 24 | 25 | [cmd_args.training.trainer] 26 | max_steps = "100" 27 | val_check_interval = "20" 28 | 29 | [cmd_args.training.model] 30 | pipeline_model_parallel_size = "1" 31 | 32 | [cmd_args.training.model.data] 33 | data_impl = "mock" 34 | 35 | [cmd_args.training.run] 36 | name = "run" 37 | time_limit = "20:00" 38 | 39 | [extra_cmd_args] 40 | "training.model.activations_checkpoint_num_layers" = "null" 41 | "training.model.fsdp" = "true" 42 | "training.model.fsdp_grad_reduce_dtype" = "bf16" 43 | "training.model.fsdp_sharding_strategy" = "full" 44 | "training.model.mcore_gpt" = "true" 45 | "training.model.optim.name" = "fused_adam" 46 | "training.trainer.limit_val_batches" = "5" 47 | "training.trainer.log_every_n_steps" = "1" 48 | "training.trainer.num_nodes" = "1" 49 | "~training.model.optim.bucket_cap_mb" = "null" 50 | "~training.model.optim.contiguous_grad_buffer" = "null" 51 | "~training.model.optim.overlap_grad_sync" = "null" 52 | "~training.model.optim.overlap_param_sync" = "null" 53 | --------------------------------------------------------------------------------