├── .coderabbit.yaml ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── general_question.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── ci.yml │ └── docs.yml ├── .gitignore ├── .python-version ├── .taplo.toml ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── conf ├── common │ ├── system │ │ ├── example_runai_cluster.toml │ │ ├── example_slurm_cluster.toml │ │ ├── kubernetes_cluster.toml │ │ └── standalone_system.toml │ ├── test │ │ ├── dse_nccl_all_gather.toml │ │ ├── nccl_test.toml │ │ ├── nccl_test_all_gather.toml │ │ ├── nemo_run_llama3_8b.toml │ │ ├── sleep.toml │ │ └── ucc_test.toml │ └── test_scenario │ │ ├── dse_nccl_all_gather.toml │ │ ├── dse_nemo_run_llama3_8b.toml │ │ ├── nccl_test.toml │ │ ├── nemo_run_llama3_8b.toml │ │ ├── sleep.toml │ │ ├── slurm_container.toml │ │ └── ucc_test.toml ├── experimental │ ├── ai_dynamo │ │ ├── test │ │ │ ├── agg.yaml │ │ │ └── vllm.toml │ │ └── test_scenario │ │ │ └── vllm_k8s.toml │ ├── test │ │ ├── ddlb_test.toml │ │ ├── deepep_low_latency.toml │ │ ├── deepep_standard.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_128_node.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_256_node.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_2_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_128_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_16_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_256_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_2_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_32_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_4_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_64_node.toml │ │ └── nemo_launcher_nemotron_15b_fp8_8_node.toml │ └── test_scenario │ │ ├── ddlb_test.toml │ │ ├── deepep.toml │ │ ├── nemo_launcher_nemotron_15b_bf16.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_128_node.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_16_node.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_256_node.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_2_node.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_32_node.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_4_node.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_64_node.toml │ │ ├── nemo_launcher_nemotron_15b_bf16_8_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_128_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_16_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_256_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_2_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_32_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_4_node.toml │ │ ├── nemo_launcher_nemotron_15b_fp8_64_node.toml │ │ └── nemo_launcher_nemotron_15b_fp8_8_node.toml ├── hook │ ├── nccl_test.toml │ └── test │ │ └── nccl_test_all_gather.toml └── release │ ├── nemo_acceptance │ ├── test │ │ ├── gpt3_126m_mock.toml │ │ ├── gpt3_126m_pile.toml │ │ ├── nccl_test_all_reduce_loopback.toml │ │ └── nemo_run_llama3_8b.toml │ └── test_scenario │ │ ├── gpt3_126m_mock.toml │ │ ├── gpt3_126m_pile.toml │ │ ├── nccl_test_loopback.toml │ │ └── nemo_run_llama3_8b.toml │ └── spcx │ └── l40s │ ├── test │ ├── l40s-bc-nccl_test_all_gather.toml │ ├── l40s-bc-nccl_test_all_gather_worst.toml │ ├── l40s-bc-nccl_test_all_reduce.toml │ ├── l40s-bc-nccl_test_all_reduce_worst.toml │ ├── l40s-bc-nccl_test_alltoall.toml │ ├── l40s-bc-nccl_test_alltoall_worst.toml │ ├── l40s-bc-nccl_test_alltoall_worst_failover.toml │ ├── l40s-bc-nccl_test_bisection.toml │ ├── l40s-bc-nccl_test_broadcast.toml │ ├── l40s-bc-nccl_test_gather.toml │ ├── l40s-bc-nccl_test_hypercube.toml │ ├── l40s-bc-nccl_test_reduce.toml │ ├── l40s-bc-nccl_test_reduce_scatter.toml │ ├── l40s-bc-nccl_test_reduce_scatter_worst.toml │ ├── l40s-bc-nccl_test_scatter.toml │ ├── l40s-bc-nccl_test_sendrecv.toml │ └── l40s-bc-nccl_test_sendrecv_worst.toml │ └── test_scenario │ └── l40s_bc_nccl_test.toml ├── doc ├── DEV.md ├── Makefile ├── USER_GUIDE.md ├── ai_dynamo.md ├── conf.py ├── index.md ├── reporting.md └── workloads │ ├── ai_dynamo.rst │ ├── bash_cmd.rst │ ├── chakra_replay.rst │ ├── ddlb.rst │ ├── deepep.rst │ ├── index.md │ ├── nccl.rst │ ├── nemo_run.rst │ ├── nixl_bench.rst │ ├── nixl_kvbench.rst │ ├── nixl_perftest.rst │ ├── sleep.rst │ ├── slurm_container.rst │ └── ucc.rst ├── greptile.json ├── pyproject.toml ├── src └── cloudai │ ├── __init__.py │ ├── _core │ ├── __init__.py │ ├── base_installer.py │ ├── base_job.py │ ├── base_reporter.py │ ├── base_runner.py │ ├── base_system_parser.py │ ├── command_gen_strategy.py │ ├── exceptions.py │ ├── grader.py │ ├── grading_strategy.py │ ├── install_status_result.py │ ├── installables.py │ ├── job_status_result.py │ ├── json_gen_strategy.py │ ├── registry.py │ ├── report_generation_strategy.py │ ├── runner.py │ ├── system.py │ └── test_scenario.py │ ├── cli │ ├── __init__.py │ ├── cli.py │ └── handlers.py │ ├── configurator │ ├── __init__.py │ ├── base_agent.py │ ├── base_gym.py │ ├── cloudai_gym.py │ ├── grid_search.py │ └── reward_functions.py │ ├── core.py │ ├── models │ ├── scenario.py │ └── workload.py │ ├── parser.py │ ├── registration.py │ ├── report_generator │ ├── __init__.py │ ├── comparison_report.py │ ├── groups.py │ ├── tool │ │ ├── __init__.py │ │ ├── bokeh_report_tool.py │ │ ├── csv_report_tool.py │ │ ├── report_tool_interface.py │ │ └── tensorboard_data_reader.py │ └── util.py │ ├── reporter.py │ ├── systems │ ├── __init__.py │ ├── kubernetes │ │ ├── __init__.py │ │ ├── kubernetes_installer.py │ │ ├── kubernetes_job.py │ │ ├── kubernetes_runner.py │ │ └── kubernetes_system.py │ ├── lsf │ │ ├── __init__.py │ │ ├── lsf_command_gen_strategy.py │ │ ├── lsf_installer.py │ │ ├── lsf_job.py │ │ ├── lsf_node.py │ │ ├── lsf_runner.py │ │ └── lsf_system.py │ ├── runai │ │ ├── __init__.py │ │ ├── runai_cluster.py │ │ ├── runai_event.py │ │ ├── runai_installer.py │ │ ├── runai_job.py │ │ ├── runai_node.py │ │ ├── runai_project.py │ │ ├── runai_pvc.py │ │ ├── runai_rest_client.py │ │ ├── runai_runner.py │ │ ├── runai_system.py │ │ └── runai_training.py │ ├── slurm │ │ ├── __init__.py │ │ ├── docker_image_cache_manager.py │ │ ├── single_sbatch_runner.py │ │ ├── slurm-metadata.sh │ │ ├── slurm_command_gen_strategy.py │ │ ├── slurm_installer.py │ │ ├── slurm_job.py │ │ ├── slurm_metadata.py │ │ ├── slurm_node.py │ │ ├── slurm_runner.py │ │ └── slurm_system.py │ └── standalone │ │ ├── __init__.py │ │ ├── standalone_installer.py │ │ ├── standalone_job.py │ │ ├── standalone_runner.py │ │ └── standalone_system.py │ ├── test_parser.py │ ├── test_scenario_parser.py │ ├── util │ ├── __init__.py │ ├── base-report.jinja2 │ ├── command_shell.py │ ├── general-report.jinja2 │ ├── general-slurm-report.jinja2 │ ├── lazy_imports.py │ ├── nixl_report_template.jinja2 │ └── utils.py │ └── workloads │ ├── __init__.py │ ├── ai_dynamo │ ├── __init__.py │ ├── ai_dynamo.py │ ├── ai_dynamo.sh │ ├── kubernetes_json_gen_strategy.py │ ├── report_generation_strategy.py │ └── slurm_command_gen_strategy.py │ ├── bash_cmd │ ├── __init__.py │ └── bash_cmd.py │ ├── chakra_replay │ ├── __init__.py │ ├── chakra_replay.py │ ├── grading_strategy.py │ ├── report_generation_strategy.py │ └── slurm_command_gen_strategy.py │ ├── common │ └── nixl.py │ ├── ddlb │ ├── __init__.py │ ├── ddlb.py │ └── slurm_command_gen_strategy.py │ ├── deepep │ ├── __init__.py │ ├── deepep.py │ ├── report_generation_strategy.py │ └── slurm_command_gen_strategy.py │ ├── jax_toolbox │ ├── __init__.py │ ├── gpt.py │ ├── grading_strategy.py │ ├── grok.py │ ├── jax_toolbox.py │ ├── nemotron.py │ ├── report_generation_strategy.py │ └── slurm_command_gen_strategy.py │ ├── megatron_run │ ├── __init__.py │ ├── megatron_run.py │ ├── report_generation_strategy.py │ └── slurm_command_gen_strategy.py │ ├── nccl_test │ ├── __init__.py │ ├── grading_strategy.py │ ├── kubernetes_json_gen_strategy.py │ ├── nccl.py │ ├── nccl_comparison_report.py │ ├── performance_report_generation_strategy.py │ ├── prediction_report_generation_strategy.py │ ├── prediction_report_generator.py │ ├── report_generation_strategy.py │ ├── runai_json_gen_strategy.py │ └── slurm_command_gen_strategy.py │ ├── nemo_launcher │ ├── __init__.py │ ├── grading_strategy.py │ ├── nemo_launcher.py │ ├── report_generation_strategy.py │ └── slurm_command_gen_strategy.py │ ├── nemo_run │ ├── __init__.py │ ├── cloudai_nemorun.py │ ├── data_store_report_generation_strategy.py │ ├── http_data_repository.py │ ├── nemo_run.py │ ├── report_generation_strategy.py │ └── slurm_command_gen_strategy.py │ ├── nixl_bench │ ├── __init__.py │ ├── nixl_bench.py │ ├── nixl_summary_report.py │ ├── report_generation_strategy.py │ └── slurm_command_gen_strategy.py │ ├── nixl_kvbench │ ├── __init__.py │ ├── nixl_kvbench.py │ └── slurm_command_gen_strategy.py │ ├── nixl_perftest │ ├── __init__.py │ ├── nixl_perftest.py │ ├── report_generation_strategy.py │ └── slurm_command_gen_strategy.py │ ├── sleep │ ├── __init__.py │ ├── grading_strategy.py │ ├── kubernetes_json_gen_strategy.py │ ├── lsf_command_gen_strategy.py │ ├── sleep.py │ ├── slurm_command_gen_strategy.py │ └── standalone_command_gen_strategy.py │ ├── slurm_container │ ├── __init__.py │ ├── slurm_command_gen_strategy.py │ └── slurm_container.py │ ├── triton_inference │ ├── __init__.py │ ├── report_generation_strategy.py │ ├── slurm_command_gen_strategy.py │ └── triton_inference.py │ └── ucc_test │ ├── __init__.py │ ├── grading_strategy.py │ ├── report_generation_strategy.py │ ├── slurm_command_gen_strategy.py │ └── ucc.py ├── tests ├── conftest.py ├── job_status_retrieval_strategy │ ├── test_jax_toolbox_job_status_retrieval_strategy.py │ ├── test_nccl_job_status_retrieval_strategy.py │ └── test_nemo_run_job_status_retrieval_strategy.py ├── json_gen_strategy │ ├── test_nccl_kubernetes_json_gen_strategy.py │ └── test_nccl_runai_json_gen_strategy.py ├── ref_data │ ├── ai-dynamo.sbatch │ ├── ddlb.sbatch │ ├── deepep-benchmark.sbatch │ ├── gpt-no-hook.sbatch │ ├── gpt-pre-test.sbatch │ ├── gpt.run │ ├── grok-no-hook.sbatch │ ├── grok-pre-test.sbatch │ ├── grok.run │ ├── megatron-run.sbatch │ ├── nccl.sbatch │ ├── nemo-launcher.sbatch │ ├── nemo-run-no-hook.sbatch │ ├── nemo-run-pre-test.sbatch │ ├── nemo-run-vboost.sbatch │ ├── nixl-kvbench.sbatch │ ├── nixl-perftest.sbatch │ ├── nixl_bench.sbatch │ ├── sleep.sbatch │ ├── slurm_container.sbatch │ ├── triton-inference-start_server_wrapper.sh │ ├── triton-inference.sbatch │ └── ucc.sbatch ├── report_generation_strategy │ ├── conftest.py │ ├── test_ai_dynamo_report_generation_strategy.py │ ├── test_checkpoint_timings.py │ ├── test_comparison_report.py │ ├── test_data_store_report_generation_strategy.py │ ├── test_jax_toolbox_report_generation_strategy.py │ ├── test_nccl_performance_report_generation_strategy.py │ ├── test_nccl_prediction_report_generator.py │ ├── test_nemo_launcher_report_generation_strategy.py │ ├── test_nemo_run_report_generation_strategy.py │ ├── test_nixl_bench_report.py │ ├── test_report_groups.py │ └── test_ucc_report_generation_strategy.py ├── runai │ ├── test_runai_cluster.py │ ├── test_runai_event.py │ ├── test_runai_node.py │ ├── test_runai_project.py │ ├── test_runai_pvc.py │ ├── test_runai_rest_client.py │ └── test_runai_training.py ├── slurm_command_gen_strategy │ ├── test_ai_dynamo_slurm_command_gen_strategy.py │ ├── test_bash_cmd_slurm_command_gen_strategy.py │ ├── test_chakra_replay_slurm_command_gen_strategy.py │ ├── test_common_slurm_command_gen_strategy.py │ ├── test_jax_toolbox_slurm_command_gen_strategy.py │ ├── test_nccl_slurm_command_gen_strategy.py │ ├── test_nemo_launcher_slurm_command_gen_strategy.py │ ├── test_nemo_run_slurm_command_gen_strategy.py │ ├── test_nixl_bench_slurm_command_gen_strategy.py │ ├── test_nixl_kvbench_command_gen.py │ ├── test_nixl_perftest_slurm_command_gen_strategy.py │ ├── test_sleep_slurm_command_gen_strategy.py │ ├── test_slurm_container_slurm_command_gen_strategy.py │ ├── test_triton_inference_slurm_command_gen_strategy.py │ └── test_ucc_slurm_command_gen_strategy.py ├── test_acceptance.py ├── test_agents.py ├── test_base_installer.py ├── test_base_runner.py ├── test_check_copyright_headers.py ├── test_cli.py ├── test_cloudaigym.py ├── test_command_gen_strategy.py ├── test_csv_report_tool.py ├── test_docker_image_cache_manager.py ├── test_get_job_id.py ├── test_git_repo_installer.py ├── test_handlers.py ├── test_init.py ├── test_job_type_handler.py ├── test_kubernetes_system.py ├── test_lsf_system.py ├── test_parser.py ├── test_registry.py ├── test_reporter.py ├── test_single_sbatch_runner.py ├── test_slurm_allocation.py ├── test_slurm_installer.py ├── test_slurm_system.py ├── test_standalone_installer.py ├── test_standalone_system.py ├── test_test_definitions.py ├── test_test_run.py ├── test_test_scenario.py ├── test_test_scenario_parser.py └── test_toml_files.py └── uv.lock /.coderabbit.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.coderabbit.yaml -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.github/CODEOWNERS -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.github/ISSUE_TEMPLATE/bug_report.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.github/ISSUE_TEMPLATE/feature_request.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general_question.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.github/ISSUE_TEMPLATE/general_question.md -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.github/PULL_REQUEST_TEMPLATE.md -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.github/workflows/ci.yml -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.github/workflows/docs.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.gitignore -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /.taplo.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/.taplo.toml -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/LICENSE.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/README.md -------------------------------------------------------------------------------- /conf/common/system/example_runai_cluster.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/system/example_runai_cluster.toml -------------------------------------------------------------------------------- /conf/common/system/example_slurm_cluster.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/system/example_slurm_cluster.toml -------------------------------------------------------------------------------- /conf/common/system/kubernetes_cluster.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/system/kubernetes_cluster.toml -------------------------------------------------------------------------------- /conf/common/system/standalone_system.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/system/standalone_system.toml -------------------------------------------------------------------------------- /conf/common/test/dse_nccl_all_gather.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test/dse_nccl_all_gather.toml -------------------------------------------------------------------------------- /conf/common/test/nccl_test.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test/nccl_test.toml -------------------------------------------------------------------------------- /conf/common/test/nccl_test_all_gather.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test/nccl_test_all_gather.toml -------------------------------------------------------------------------------- /conf/common/test/nemo_run_llama3_8b.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test/nemo_run_llama3_8b.toml -------------------------------------------------------------------------------- /conf/common/test/sleep.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test/sleep.toml -------------------------------------------------------------------------------- /conf/common/test/ucc_test.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test/ucc_test.toml -------------------------------------------------------------------------------- /conf/common/test_scenario/dse_nccl_all_gather.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test_scenario/dse_nccl_all_gather.toml -------------------------------------------------------------------------------- /conf/common/test_scenario/dse_nemo_run_llama3_8b.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test_scenario/dse_nemo_run_llama3_8b.toml -------------------------------------------------------------------------------- /conf/common/test_scenario/nccl_test.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test_scenario/nccl_test.toml -------------------------------------------------------------------------------- /conf/common/test_scenario/nemo_run_llama3_8b.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test_scenario/nemo_run_llama3_8b.toml -------------------------------------------------------------------------------- /conf/common/test_scenario/sleep.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test_scenario/sleep.toml -------------------------------------------------------------------------------- /conf/common/test_scenario/slurm_container.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test_scenario/slurm_container.toml -------------------------------------------------------------------------------- /conf/common/test_scenario/ucc_test.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/common/test_scenario/ucc_test.toml -------------------------------------------------------------------------------- /conf/experimental/ai_dynamo/test/agg.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/ai_dynamo/test/agg.yaml -------------------------------------------------------------------------------- /conf/experimental/ai_dynamo/test/vllm.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/ai_dynamo/test/vllm.toml -------------------------------------------------------------------------------- /conf/experimental/ai_dynamo/test_scenario/vllm_k8s.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/ai_dynamo/test_scenario/vllm_k8s.toml -------------------------------------------------------------------------------- /conf/experimental/test/ddlb_test.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/ddlb_test.toml -------------------------------------------------------------------------------- /conf/experimental/test/deepep_low_latency.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/deepep_low_latency.toml -------------------------------------------------------------------------------- /conf/experimental/test/deepep_standard.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/deepep_standard.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_bf16_128_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_bf16_128_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_bf16_256_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_bf16_256_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_bf16_2_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_bf16_2_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_fp8_128_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_fp8_128_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_fp8_16_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_fp8_16_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_fp8_256_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_fp8_256_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_fp8_2_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_fp8_2_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_fp8_32_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_fp8_32_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_fp8_4_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_fp8_4_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_fp8_64_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_fp8_64_node.toml -------------------------------------------------------------------------------- /conf/experimental/test/nemo_launcher_nemotron_15b_fp8_8_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test/nemo_launcher_nemotron_15b_fp8_8_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/ddlb_test.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/ddlb_test.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/deepep.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/deepep.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_128_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_128_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_16_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_16_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_256_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_256_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_2_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_2_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_32_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_32_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_4_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_4_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_64_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_64_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_8_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_bf16_8_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_128_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_128_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_16_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_16_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_256_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_256_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_2_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_2_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_32_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_32_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_4_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_4_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_64_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_64_node.toml -------------------------------------------------------------------------------- /conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_8_node.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/experimental/test_scenario/nemo_launcher_nemotron_15b_fp8_8_node.toml -------------------------------------------------------------------------------- /conf/hook/nccl_test.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/hook/nccl_test.toml -------------------------------------------------------------------------------- /conf/hook/test/nccl_test_all_gather.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/hook/test/nccl_test_all_gather.toml -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test/gpt3_126m_mock.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/nemo_acceptance/test/gpt3_126m_mock.toml -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test/gpt3_126m_pile.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/nemo_acceptance/test/gpt3_126m_pile.toml -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test/nccl_test_all_reduce_loopback.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/nemo_acceptance/test/nccl_test_all_reduce_loopback.toml -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test/nemo_run_llama3_8b.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/nemo_acceptance/test/nemo_run_llama3_8b.toml -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test_scenario/gpt3_126m_mock.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/nemo_acceptance/test_scenario/gpt3_126m_mock.toml -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test_scenario/gpt3_126m_pile.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/nemo_acceptance/test_scenario/gpt3_126m_pile.toml -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test_scenario/nccl_test_loopback.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/nemo_acceptance/test_scenario/nccl_test_loopback.toml -------------------------------------------------------------------------------- /conf/release/nemo_acceptance/test_scenario/nemo_run_llama3_8b.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/nemo_acceptance/test_scenario/nemo_run_llama3_8b.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_gather.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_gather.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_gather_worst.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_gather_worst.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_reduce.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_reduce.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_reduce_worst.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_all_reduce_worst.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall_worst.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall_worst.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall_worst_failover.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_alltoall_worst_failover.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_bisection.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_bisection.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_broadcast.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_broadcast.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_gather.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_gather.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_hypercube.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_hypercube.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce_scatter.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce_scatter.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce_scatter_worst.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_reduce_scatter_worst.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_scatter.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_scatter.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_sendrecv.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_sendrecv.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test/l40s-bc-nccl_test_sendrecv_worst.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test/l40s-bc-nccl_test_sendrecv_worst.toml -------------------------------------------------------------------------------- /conf/release/spcx/l40s/test_scenario/l40s_bc_nccl_test.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/conf/release/spcx/l40s/test_scenario/l40s_bc_nccl_test.toml -------------------------------------------------------------------------------- /doc/DEV.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/DEV.md -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/Makefile -------------------------------------------------------------------------------- /doc/USER_GUIDE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/USER_GUIDE.md -------------------------------------------------------------------------------- /doc/ai_dynamo.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/ai_dynamo.md -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/conf.py -------------------------------------------------------------------------------- /doc/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/index.md -------------------------------------------------------------------------------- /doc/reporting.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/reporting.md -------------------------------------------------------------------------------- /doc/workloads/ai_dynamo.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/ai_dynamo.rst -------------------------------------------------------------------------------- /doc/workloads/bash_cmd.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/bash_cmd.rst -------------------------------------------------------------------------------- /doc/workloads/chakra_replay.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/chakra_replay.rst -------------------------------------------------------------------------------- /doc/workloads/ddlb.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/ddlb.rst -------------------------------------------------------------------------------- /doc/workloads/deepep.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/deepep.rst -------------------------------------------------------------------------------- /doc/workloads/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/index.md -------------------------------------------------------------------------------- /doc/workloads/nccl.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/nccl.rst -------------------------------------------------------------------------------- /doc/workloads/nemo_run.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/nemo_run.rst -------------------------------------------------------------------------------- /doc/workloads/nixl_bench.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/nixl_bench.rst -------------------------------------------------------------------------------- /doc/workloads/nixl_kvbench.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/nixl_kvbench.rst -------------------------------------------------------------------------------- /doc/workloads/nixl_perftest.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/nixl_perftest.rst -------------------------------------------------------------------------------- /doc/workloads/sleep.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/sleep.rst -------------------------------------------------------------------------------- /doc/workloads/slurm_container.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/slurm_container.rst -------------------------------------------------------------------------------- /doc/workloads/ucc.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/doc/workloads/ucc.rst -------------------------------------------------------------------------------- /greptile.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/greptile.json -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/pyproject.toml -------------------------------------------------------------------------------- /src/cloudai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/__init__.py -------------------------------------------------------------------------------- /src/cloudai/_core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/__init__.py -------------------------------------------------------------------------------- /src/cloudai/_core/base_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/base_installer.py -------------------------------------------------------------------------------- /src/cloudai/_core/base_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/base_job.py -------------------------------------------------------------------------------- /src/cloudai/_core/base_reporter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/base_reporter.py -------------------------------------------------------------------------------- /src/cloudai/_core/base_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/base_runner.py -------------------------------------------------------------------------------- /src/cloudai/_core/base_system_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/base_system_parser.py -------------------------------------------------------------------------------- /src/cloudai/_core/command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/_core/exceptions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/exceptions.py -------------------------------------------------------------------------------- /src/cloudai/_core/grader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/grader.py -------------------------------------------------------------------------------- /src/cloudai/_core/grading_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/grading_strategy.py -------------------------------------------------------------------------------- /src/cloudai/_core/install_status_result.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/install_status_result.py -------------------------------------------------------------------------------- /src/cloudai/_core/installables.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/installables.py -------------------------------------------------------------------------------- /src/cloudai/_core/job_status_result.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/job_status_result.py -------------------------------------------------------------------------------- /src/cloudai/_core/json_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/json_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/_core/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/registry.py -------------------------------------------------------------------------------- /src/cloudai/_core/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/_core/runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/runner.py -------------------------------------------------------------------------------- /src/cloudai/_core/system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/system.py -------------------------------------------------------------------------------- /src/cloudai/_core/test_scenario.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/_core/test_scenario.py -------------------------------------------------------------------------------- /src/cloudai/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/cli/__init__.py -------------------------------------------------------------------------------- /src/cloudai/cli/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/cli/cli.py -------------------------------------------------------------------------------- /src/cloudai/cli/handlers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/cli/handlers.py -------------------------------------------------------------------------------- /src/cloudai/configurator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/configurator/__init__.py -------------------------------------------------------------------------------- /src/cloudai/configurator/base_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/configurator/base_agent.py -------------------------------------------------------------------------------- /src/cloudai/configurator/base_gym.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/configurator/base_gym.py -------------------------------------------------------------------------------- /src/cloudai/configurator/cloudai_gym.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/configurator/cloudai_gym.py -------------------------------------------------------------------------------- /src/cloudai/configurator/grid_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/configurator/grid_search.py -------------------------------------------------------------------------------- /src/cloudai/configurator/reward_functions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/configurator/reward_functions.py -------------------------------------------------------------------------------- /src/cloudai/core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/core.py -------------------------------------------------------------------------------- /src/cloudai/models/scenario.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/models/scenario.py -------------------------------------------------------------------------------- /src/cloudai/models/workload.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/models/workload.py -------------------------------------------------------------------------------- /src/cloudai/parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/parser.py -------------------------------------------------------------------------------- /src/cloudai/registration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/registration.py -------------------------------------------------------------------------------- /src/cloudai/report_generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/report_generator/__init__.py -------------------------------------------------------------------------------- /src/cloudai/report_generator/comparison_report.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/report_generator/comparison_report.py -------------------------------------------------------------------------------- /src/cloudai/report_generator/groups.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/report_generator/groups.py -------------------------------------------------------------------------------- /src/cloudai/report_generator/tool/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/report_generator/tool/__init__.py -------------------------------------------------------------------------------- /src/cloudai/report_generator/tool/bokeh_report_tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/report_generator/tool/bokeh_report_tool.py -------------------------------------------------------------------------------- /src/cloudai/report_generator/tool/csv_report_tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/report_generator/tool/csv_report_tool.py -------------------------------------------------------------------------------- /src/cloudai/report_generator/tool/report_tool_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/report_generator/tool/report_tool_interface.py -------------------------------------------------------------------------------- /src/cloudai/report_generator/tool/tensorboard_data_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/report_generator/tool/tensorboard_data_reader.py -------------------------------------------------------------------------------- /src/cloudai/report_generator/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/report_generator/util.py -------------------------------------------------------------------------------- /src/cloudai/reporter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/reporter.py -------------------------------------------------------------------------------- /src/cloudai/systems/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/__init__.py -------------------------------------------------------------------------------- /src/cloudai/systems/kubernetes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/kubernetes/__init__.py -------------------------------------------------------------------------------- /src/cloudai/systems/kubernetes/kubernetes_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/kubernetes/kubernetes_installer.py -------------------------------------------------------------------------------- /src/cloudai/systems/kubernetes/kubernetes_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/kubernetes/kubernetes_job.py -------------------------------------------------------------------------------- /src/cloudai/systems/kubernetes/kubernetes_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/kubernetes/kubernetes_runner.py -------------------------------------------------------------------------------- /src/cloudai/systems/kubernetes/kubernetes_system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/kubernetes/kubernetes_system.py -------------------------------------------------------------------------------- /src/cloudai/systems/lsf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/lsf/__init__.py -------------------------------------------------------------------------------- /src/cloudai/systems/lsf/lsf_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/lsf/lsf_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/systems/lsf/lsf_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/lsf/lsf_installer.py -------------------------------------------------------------------------------- /src/cloudai/systems/lsf/lsf_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/lsf/lsf_job.py -------------------------------------------------------------------------------- /src/cloudai/systems/lsf/lsf_node.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/lsf/lsf_node.py -------------------------------------------------------------------------------- /src/cloudai/systems/lsf/lsf_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/lsf/lsf_runner.py -------------------------------------------------------------------------------- /src/cloudai/systems/lsf/lsf_system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/lsf/lsf_system.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/__init__.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_cluster.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_cluster.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_event.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_event.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_installer.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_job.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_node.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_node.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_project.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_project.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_pvc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_pvc.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_rest_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_rest_client.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_runner.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_system.py -------------------------------------------------------------------------------- /src/cloudai/systems/runai/runai_training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/runai/runai_training.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/__init__.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/docker_image_cache_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/docker_image_cache_manager.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/single_sbatch_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/single_sbatch_runner.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/slurm-metadata.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/slurm-metadata.sh -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/slurm_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/slurm_installer.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/slurm_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/slurm_job.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/slurm_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/slurm_metadata.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/slurm_node.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/slurm_node.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/slurm_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/slurm_runner.py -------------------------------------------------------------------------------- /src/cloudai/systems/slurm/slurm_system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/slurm/slurm_system.py -------------------------------------------------------------------------------- /src/cloudai/systems/standalone/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/standalone/__init__.py -------------------------------------------------------------------------------- /src/cloudai/systems/standalone/standalone_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/standalone/standalone_installer.py -------------------------------------------------------------------------------- /src/cloudai/systems/standalone/standalone_job.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/standalone/standalone_job.py -------------------------------------------------------------------------------- /src/cloudai/systems/standalone/standalone_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/standalone/standalone_runner.py -------------------------------------------------------------------------------- /src/cloudai/systems/standalone/standalone_system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/systems/standalone/standalone_system.py -------------------------------------------------------------------------------- /src/cloudai/test_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/test_parser.py -------------------------------------------------------------------------------- /src/cloudai/test_scenario_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/test_scenario_parser.py -------------------------------------------------------------------------------- /src/cloudai/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/util/__init__.py -------------------------------------------------------------------------------- /src/cloudai/util/base-report.jinja2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/util/base-report.jinja2 -------------------------------------------------------------------------------- /src/cloudai/util/command_shell.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/util/command_shell.py -------------------------------------------------------------------------------- /src/cloudai/util/general-report.jinja2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/util/general-report.jinja2 -------------------------------------------------------------------------------- /src/cloudai/util/general-slurm-report.jinja2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/util/general-slurm-report.jinja2 -------------------------------------------------------------------------------- /src/cloudai/util/lazy_imports.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/util/lazy_imports.py -------------------------------------------------------------------------------- /src/cloudai/util/nixl_report_template.jinja2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/util/nixl_report_template.jinja2 -------------------------------------------------------------------------------- /src/cloudai/util/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/util/utils.py -------------------------------------------------------------------------------- /src/cloudai/workloads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ai_dynamo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ai_dynamo/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ai_dynamo/ai_dynamo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ai_dynamo/ai_dynamo.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ai_dynamo/ai_dynamo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh -------------------------------------------------------------------------------- /src/cloudai/workloads/ai_dynamo/kubernetes_json_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ai_dynamo/kubernetes_json_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ai_dynamo/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/bash_cmd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/bash_cmd/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/bash_cmd/bash_cmd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/bash_cmd/bash_cmd.py -------------------------------------------------------------------------------- /src/cloudai/workloads/chakra_replay/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/chakra_replay/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/chakra_replay/chakra_replay.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/chakra_replay/chakra_replay.py -------------------------------------------------------------------------------- /src/cloudai/workloads/chakra_replay/grading_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/chakra_replay/grading_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/chakra_replay/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/chakra_replay/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/chakra_replay/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/chakra_replay/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/common/nixl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/common/nixl.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ddlb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ddlb/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ddlb/ddlb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ddlb/ddlb.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/deepep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/deepep/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/deepep/deepep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/deepep/deepep.py -------------------------------------------------------------------------------- /src/cloudai/workloads/deepep/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/deepep/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/deepep/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/deepep/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/jax_toolbox/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/jax_toolbox/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/jax_toolbox/gpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/jax_toolbox/gpt.py -------------------------------------------------------------------------------- /src/cloudai/workloads/jax_toolbox/grading_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/jax_toolbox/grading_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/jax_toolbox/grok.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/jax_toolbox/grok.py -------------------------------------------------------------------------------- /src/cloudai/workloads/jax_toolbox/jax_toolbox.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/jax_toolbox/jax_toolbox.py -------------------------------------------------------------------------------- /src/cloudai/workloads/jax_toolbox/nemotron.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/jax_toolbox/nemotron.py -------------------------------------------------------------------------------- /src/cloudai/workloads/jax_toolbox/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/jax_toolbox/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/jax_toolbox/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/jax_toolbox/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/megatron_run/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/megatron_run/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/megatron_run/megatron_run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/megatron_run/megatron_run.py -------------------------------------------------------------------------------- /src/cloudai/workloads/megatron_run/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/megatron_run/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/megatron_run/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/megatron_run/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/grading_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/grading_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/kubernetes_json_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/kubernetes_json_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/nccl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/nccl.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/nccl_comparison_report.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/nccl_comparison_report.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/performance_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/performance_report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/prediction_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/prediction_report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/prediction_report_generator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/prediction_report_generator.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/runai_json_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/runai_json_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nccl_test/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nccl_test/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_launcher/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_launcher/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_launcher/grading_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_launcher/grading_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_launcher/nemo_launcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_launcher/nemo_launcher.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_launcher/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_launcher/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_launcher/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_launcher/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_run/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_run/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_run/cloudai_nemorun.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_run/cloudai_nemorun.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_run/data_store_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_run/data_store_report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_run/http_data_repository.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_run/http_data_repository.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_run/nemo_run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_run/nemo_run.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_run/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_run/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nemo_run/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nemo_run/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_bench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_bench/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_bench/nixl_bench.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_bench/nixl_bench.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_bench/nixl_summary_report.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_bench/nixl_summary_report.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_bench/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_bench/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_kvbench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_kvbench/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_perftest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_perftest/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_perftest/nixl_perftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_perftest/nixl_perftest.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_perftest/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_perftest/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/sleep/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/grading_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/sleep/grading_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/kubernetes_json_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/sleep/kubernetes_json_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/lsf_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/sleep/lsf_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/sleep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/sleep/sleep.py -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/sleep/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/sleep/standalone_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/sleep/standalone_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/slurm_container/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/slurm_container/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/slurm_container/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/slurm_container/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/slurm_container/slurm_container.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/slurm_container/slurm_container.py -------------------------------------------------------------------------------- /src/cloudai/workloads/triton_inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/triton_inference/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/triton_inference/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/triton_inference/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/triton_inference/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/triton_inference/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/triton_inference/triton_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/triton_inference/triton_inference.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ucc_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ucc_test/__init__.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ucc_test/grading_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ucc_test/grading_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ucc_test/report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ucc_test/report_generation_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ucc_test/slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ucc_test/slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /src/cloudai/workloads/ucc_test/ucc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/src/cloudai/workloads/ucc_test/ucc.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/conftest.py -------------------------------------------------------------------------------- /tests/job_status_retrieval_strategy/test_jax_toolbox_job_status_retrieval_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/job_status_retrieval_strategy/test_jax_toolbox_job_status_retrieval_strategy.py -------------------------------------------------------------------------------- /tests/job_status_retrieval_strategy/test_nccl_job_status_retrieval_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/job_status_retrieval_strategy/test_nccl_job_status_retrieval_strategy.py -------------------------------------------------------------------------------- /tests/job_status_retrieval_strategy/test_nemo_run_job_status_retrieval_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/job_status_retrieval_strategy/test_nemo_run_job_status_retrieval_strategy.py -------------------------------------------------------------------------------- /tests/json_gen_strategy/test_nccl_kubernetes_json_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/json_gen_strategy/test_nccl_kubernetes_json_gen_strategy.py -------------------------------------------------------------------------------- /tests/json_gen_strategy/test_nccl_runai_json_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/json_gen_strategy/test_nccl_runai_json_gen_strategy.py -------------------------------------------------------------------------------- /tests/ref_data/ai-dynamo.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/ai-dynamo.sbatch -------------------------------------------------------------------------------- /tests/ref_data/ddlb.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/ddlb.sbatch -------------------------------------------------------------------------------- /tests/ref_data/deepep-benchmark.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/deepep-benchmark.sbatch -------------------------------------------------------------------------------- /tests/ref_data/gpt-no-hook.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/gpt-no-hook.sbatch -------------------------------------------------------------------------------- /tests/ref_data/gpt-pre-test.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/gpt-pre-test.sbatch -------------------------------------------------------------------------------- /tests/ref_data/gpt.run: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/gpt.run -------------------------------------------------------------------------------- /tests/ref_data/grok-no-hook.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/grok-no-hook.sbatch -------------------------------------------------------------------------------- /tests/ref_data/grok-pre-test.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/grok-pre-test.sbatch -------------------------------------------------------------------------------- /tests/ref_data/grok.run: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/grok.run -------------------------------------------------------------------------------- /tests/ref_data/megatron-run.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/megatron-run.sbatch -------------------------------------------------------------------------------- /tests/ref_data/nccl.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/nccl.sbatch -------------------------------------------------------------------------------- /tests/ref_data/nemo-launcher.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/nemo-launcher.sbatch -------------------------------------------------------------------------------- /tests/ref_data/nemo-run-no-hook.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/nemo-run-no-hook.sbatch -------------------------------------------------------------------------------- /tests/ref_data/nemo-run-pre-test.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/nemo-run-pre-test.sbatch -------------------------------------------------------------------------------- /tests/ref_data/nemo-run-vboost.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/nemo-run-vboost.sbatch -------------------------------------------------------------------------------- /tests/ref_data/nixl-kvbench.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/nixl-kvbench.sbatch -------------------------------------------------------------------------------- /tests/ref_data/nixl-perftest.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/nixl-perftest.sbatch -------------------------------------------------------------------------------- /tests/ref_data/nixl_bench.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/nixl_bench.sbatch -------------------------------------------------------------------------------- /tests/ref_data/sleep.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/sleep.sbatch -------------------------------------------------------------------------------- /tests/ref_data/slurm_container.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/slurm_container.sbatch -------------------------------------------------------------------------------- /tests/ref_data/triton-inference-start_server_wrapper.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/triton-inference-start_server_wrapper.sh -------------------------------------------------------------------------------- /tests/ref_data/triton-inference.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/triton-inference.sbatch -------------------------------------------------------------------------------- /tests/ref_data/ucc.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/ref_data/ucc.sbatch -------------------------------------------------------------------------------- /tests/report_generation_strategy/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/conftest.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_ai_dynamo_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_ai_dynamo_report_generation_strategy.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_checkpoint_timings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_checkpoint_timings.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_comparison_report.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_comparison_report.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_data_store_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_data_store_report_generation_strategy.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_jax_toolbox_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_jax_toolbox_report_generation_strategy.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_nccl_performance_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_nccl_performance_report_generation_strategy.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_nccl_prediction_report_generator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_nccl_prediction_report_generator.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_nemo_launcher_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_nemo_launcher_report_generation_strategy.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_nemo_run_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_nemo_run_report_generation_strategy.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_nixl_bench_report.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_nixl_bench_report.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_report_groups.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_report_groups.py -------------------------------------------------------------------------------- /tests/report_generation_strategy/test_ucc_report_generation_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/report_generation_strategy/test_ucc_report_generation_strategy.py -------------------------------------------------------------------------------- /tests/runai/test_runai_cluster.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/runai/test_runai_cluster.py -------------------------------------------------------------------------------- /tests/runai/test_runai_event.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/runai/test_runai_event.py -------------------------------------------------------------------------------- /tests/runai/test_runai_node.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/runai/test_runai_node.py -------------------------------------------------------------------------------- /tests/runai/test_runai_project.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/runai/test_runai_project.py -------------------------------------------------------------------------------- /tests/runai/test_runai_pvc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/runai/test_runai_pvc.py -------------------------------------------------------------------------------- /tests/runai/test_runai_rest_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/runai/test_runai_rest_client.py -------------------------------------------------------------------------------- /tests/runai/test_runai_training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/runai/test_runai_training.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_ai_dynamo_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_ai_dynamo_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_bash_cmd_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_bash_cmd_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_chakra_replay_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_chakra_replay_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_nemo_launcher_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_nemo_launcher_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_nixl_bench_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_nixl_bench_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_nixl_kvbench_command_gen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_nixl_kvbench_command_gen.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_nixl_perftest_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_nixl_perftest_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_sleep_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_sleep_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_slurm_container_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_slurm_container_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_triton_inference_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_triton_inference_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/slurm_command_gen_strategy/test_ucc_slurm_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/slurm_command_gen_strategy/test_ucc_slurm_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/test_acceptance.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_acceptance.py -------------------------------------------------------------------------------- /tests/test_agents.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_agents.py -------------------------------------------------------------------------------- /tests/test_base_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_base_installer.py -------------------------------------------------------------------------------- /tests/test_base_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_base_runner.py -------------------------------------------------------------------------------- /tests/test_check_copyright_headers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_check_copyright_headers.py -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_cli.py -------------------------------------------------------------------------------- /tests/test_cloudaigym.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_cloudaigym.py -------------------------------------------------------------------------------- /tests/test_command_gen_strategy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_command_gen_strategy.py -------------------------------------------------------------------------------- /tests/test_csv_report_tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_csv_report_tool.py -------------------------------------------------------------------------------- /tests/test_docker_image_cache_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_docker_image_cache_manager.py -------------------------------------------------------------------------------- /tests/test_get_job_id.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_get_job_id.py -------------------------------------------------------------------------------- /tests/test_git_repo_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_git_repo_installer.py -------------------------------------------------------------------------------- /tests/test_handlers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_handlers.py -------------------------------------------------------------------------------- /tests/test_init.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_init.py -------------------------------------------------------------------------------- /tests/test_job_type_handler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_job_type_handler.py -------------------------------------------------------------------------------- /tests/test_kubernetes_system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_kubernetes_system.py -------------------------------------------------------------------------------- /tests/test_lsf_system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_lsf_system.py -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_parser.py -------------------------------------------------------------------------------- /tests/test_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_registry.py -------------------------------------------------------------------------------- /tests/test_reporter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_reporter.py -------------------------------------------------------------------------------- /tests/test_single_sbatch_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_single_sbatch_runner.py -------------------------------------------------------------------------------- /tests/test_slurm_allocation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_slurm_allocation.py -------------------------------------------------------------------------------- /tests/test_slurm_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_slurm_installer.py -------------------------------------------------------------------------------- /tests/test_slurm_system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_slurm_system.py -------------------------------------------------------------------------------- /tests/test_standalone_installer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_standalone_installer.py -------------------------------------------------------------------------------- /tests/test_standalone_system.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_standalone_system.py -------------------------------------------------------------------------------- /tests/test_test_definitions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_test_definitions.py -------------------------------------------------------------------------------- /tests/test_test_run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_test_run.py -------------------------------------------------------------------------------- /tests/test_test_scenario.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_test_scenario.py -------------------------------------------------------------------------------- /tests/test_test_scenario_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_test_scenario_parser.py -------------------------------------------------------------------------------- /tests/test_toml_files.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/tests/test_toml_files.py -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/cloudai/HEAD/uv.lock --------------------------------------------------------------------------------