├── .github └── workflows │ ├── build_docs.yml │ ├── cherry_pick_release.yml │ ├── lint_code.yml │ └── unit_test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── SECURITY.md ├── cupti_build.py ├── docs ├── Makefile ├── make.bat └── source │ ├── checkpointing │ ├── async │ │ ├── api.rst │ │ ├── api │ │ │ ├── core.rst │ │ │ ├── filesystem_async.rst │ │ │ ├── state_dict_saver.rst │ │ │ └── torch_ckpt.rst │ │ ├── examples.rst │ │ ├── examples │ │ │ ├── basic_example.rst │ │ │ └── writer_example.rst │ │ ├── index.rst │ │ └── usage_guide.rst │ └── local │ │ ├── api.rst │ │ ├── api │ │ ├── base_ckpt_manager.rst │ │ ├── base_state_dict.rst │ │ ├── basic_state_dict.rst │ │ ├── callback.rst │ │ ├── local_ckpt_manager.rst │ │ └── replication.rst │ │ ├── examples.rst │ │ ├── examples │ │ └── basic_example.rst │ │ ├── index.rst │ │ └── usage_guide.rst │ ├── conf.py │ ├── fault_tolerance │ ├── README-pci-topo-file.md │ ├── api.rst │ ├── api │ │ ├── callback.rst │ │ ├── client.rst │ │ ├── config.rst │ │ └── server.rst │ ├── examples.rst │ ├── examples │ │ ├── basic_example.rst │ │ ├── in_job_and_in_process_example.rst │ │ ├── train_ddp_heartbeats.rst │ │ └── train_ddp_sections.rst │ ├── index.rst │ ├── integration.rst │ ├── integration │ │ ├── heartbeats.rst │ │ ├── inprocess.rst │ │ ├── ptl.rst │ │ └── sections.rst │ └── usage_guide.rst │ ├── index.rst │ ├── inprocess │ ├── api.rst │ ├── api │ │ ├── abort.rst │ │ ├── compose.rst │ │ ├── exception.rst │ │ ├── finalize.rst │ │ ├── health_check.rst │ │ ├── initialize.rst │ │ ├── rank_assignment.rst │ │ ├── rank_filter.rst │ │ ├── state.rst │ │ └── wrap.rst │ ├── examples.rst │ ├── examples │ │ ├── basic_example.rst │ │ └── optimal_example.rst │ ├── index.rst │ └── usage_guide.rst │ ├── media │ ├── nvrx_core_features.png │ └── nvrx_docs_source.png │ ├── release-notes.md │ ├── shared_utils │ ├── api.rst │ ├── config_reference.rst │ ├── index.rst │ └── logger_guide.rst │ └── straggler_det │ ├── api.rst │ ├── api │ ├── callback.rst │ ├── reporting.rst │ ├── statistics.rst │ └── straggler.rst │ ├── examples.rst │ ├── examples │ └── basic_example.rst │ ├── index.rst │ └── usage_guide.rst ├── examples ├── attribution │ └── single_server_example.py ├── checkpointing │ ├── async_ckpt.py │ ├── async_writer.py │ └── local_ckpt.py ├── fault_tolerance │ ├── basic_ft_example.py │ ├── dist_utils.py │ ├── fault_tol_cfg_heartbeats.yaml │ ├── fault_tol_cfg_sections.yaml │ ├── in_job_and_in_process_example.py │ ├── log_utils.py │ ├── run_inprocess_injob_example.sh │ ├── train_ddp_heartbeats_api.py │ └── train_ddp_sections_api.py ├── inprocess │ ├── advanced_example.py │ └── basic_example.py └── straggler │ └── example.py ├── pyproject.toml ├── src └── nvidia_resiliency_ext │ ├── __init__.py │ ├── attribution │ ├── __init__.py │ ├── base.py │ ├── mcp_integration │ │ ├── README.md │ │ ├── __init__.py │ │ ├── mcp_client.py │ │ ├── mcp_server.py │ │ ├── module_definitions.py │ │ ├── registry.py │ │ └── server_launcher.py │ ├── straggler │ │ ├── __init__.py │ │ ├── cupti.py │ │ ├── cupti_src │ │ │ ├── BufferPool.cpp │ │ │ ├── BufferPool.h │ │ │ ├── CircularBuffer.h │ │ │ ├── CuptiProfiler.cpp │ │ │ ├── CuptiProfiler.h │ │ │ └── cupti_module_py.cpp │ │ ├── dist_utils.py │ │ ├── interval_tracker.py │ │ ├── name_mapper.py │ │ ├── reporting.py │ │ ├── statistics.py │ │ └── straggler.py │ ├── trace_analyzer │ │ ├── __init__.py │ │ ├── fr_attribution.py │ │ └── trace_collector.py │ └── utils.py │ ├── checkpointing │ ├── __init__.py │ ├── async_ckpt │ │ ├── cached_metadata_filesystem_reader.py │ │ ├── core.py │ │ ├── filesystem_async.py │ │ ├── state_dict_saver.py │ │ └── torch_ckpt.py │ ├── local │ │ ├── __init__.py │ │ ├── base_state_dict.py │ │ ├── basic_state_dict.py │ │ ├── ckpt_managers │ │ │ ├── base_manager.py │ │ │ └── local_manager.py │ │ └── replication │ │ │ ├── __init__.py │ │ │ ├── _torch_future.py │ │ │ ├── group_utils.py │ │ │ ├── strategies.py │ │ │ ├── torch_device_utils.py │ │ │ └── utils.py │ └── utils.py │ ├── fault_tolerance │ ├── __init__.py │ ├── _ft_rendezvous.py │ ├── c10d_monkey_patch.py │ ├── config.py │ ├── data.py │ ├── dict_utils.py │ ├── ft_rendezvous_barrier.py │ ├── ipc_connector.py │ ├── launcher.py │ ├── rank_monitor_client.py │ ├── rank_monitor_server.py │ ├── rank_monitor_state_machine.py │ ├── timeouts_calc.py │ └── utils.py │ ├── inprocess │ ├── __init__.py │ ├── abort.py │ ├── attribution.py │ ├── completion.py │ ├── compose.py │ ├── exception.py │ ├── finalize.py │ ├── health_check.py │ ├── initialize.py │ ├── monitor_process.py │ ├── monitor_thread.py │ ├── nested_restarter.py │ ├── param_utils.py │ ├── progress_watchdog.py │ ├── rank_assignment.py │ ├── sibling_monitor.py │ ├── state.py │ ├── store.py │ ├── terminate.py │ ├── tools │ │ ├── __init__.py │ │ └── inject_fault.py │ ├── utils.py │ └── wrap.py │ ├── ptl_resiliency │ ├── __init__.py │ ├── _utils.py │ ├── fault_tolerance_callback.py │ ├── fault_tolerance_sections_callback.py │ ├── local_checkpoint_callback.py │ └── straggler_det_callback.py │ └── shared_utils │ ├── __init__.py │ ├── health_check.py │ ├── log_aggregator.py │ ├── log_manager.py │ ├── log_node_local_tmp.py │ ├── os_utils.py │ ├── profiling.py │ └── wait_daemon.py └── tests ├── attribution └── unit │ ├── REFERENCE_VALIDATION_SUMMARY.md │ ├── fr_attribution_test_utils.py │ ├── fr_traces │ ├── gpu_error_1st │ │ ├── _dump_0 │ │ ├── _dump_1 │ │ ├── _dump_10 │ │ ├── _dump_11 │ │ ├── _dump_12 │ │ ├── _dump_13 │ │ ├── _dump_14 │ │ ├── _dump_15 │ │ ├── _dump_2 │ │ ├── _dump_3 │ │ ├── _dump_4 │ │ ├── _dump_5 │ │ ├── _dump_6 │ │ ├── _dump_7 │ │ ├── _dump_8 │ │ ├── _dump_9 │ │ └── fault_injection.log │ ├── gpu_error_2nd │ │ ├── _dump_0 │ │ ├── _dump_1 │ │ ├── _dump_10 │ │ ├── _dump_11 │ │ ├── _dump_12 │ │ ├── _dump_13 │ │ ├── _dump_14 │ │ ├── _dump_15 │ │ ├── _dump_2 │ │ ├── _dump_3 │ │ ├── _dump_4 │ │ ├── _dump_5 │ │ ├── _dump_6 │ │ ├── _dump_7 │ │ ├── _dump_8 │ │ ├── _dump_9 │ │ └── fault_injection.log │ ├── lock_gil_1st │ │ ├── _dump_0 │ │ ├── _dump_1 │ │ ├── _dump_10 │ │ ├── _dump_11 │ │ ├── _dump_12 │ │ ├── _dump_13 │ │ ├── _dump_14 │ │ ├── _dump_15 │ │ ├── _dump_2 │ │ ├── _dump_3 │ │ ├── _dump_4 │ │ ├── _dump_5 │ │ ├── _dump_6 │ │ ├── _dump_7 │ │ ├── _dump_8 │ │ ├── _dump_9 │ │ └── fault_injection.log │ └── lock_gil_2nd │ │ ├── _dump_0 │ │ ├── _dump_1 │ │ ├── _dump_10 │ │ ├── _dump_11 │ │ ├── _dump_12 │ │ ├── _dump_13 │ │ ├── _dump_14 │ │ ├── _dump_15 │ │ ├── _dump_2 │ │ ├── _dump_3 │ │ ├── _dump_4 │ │ ├── _dump_5 │ │ ├── _dump_6 │ │ ├── _dump_7 │ │ ├── _dump_8 │ │ ├── _dump_9 │ │ └── fault_injection.log │ ├── reference_outputs │ ├── gpu_error_1st_reference.txt │ ├── gpu_error_2nd_reference.txt │ ├── lock_gil_1st_reference.txt │ └── lock_gil_2nd_reference.txt │ ├── test_base.py │ └── test_fr.py ├── checkpointing └── unit │ ├── __init__.py │ ├── conftest.py │ ├── test_async_save.py │ ├── test_async_writer.py │ ├── test_async_writer_msc.py │ ├── test_basic_local.py │ ├── test_cleanup.py │ └── test_utilities.py ├── fault_tolerance ├── func │ ├── _launcher_mode_test_worker.py │ ├── _workload_ctrl_test_worker.py │ ├── run_launcher_any_failed_mode_test.sh │ ├── run_launcher_min_healthy_mode_test.sh │ ├── run_local_ddp_test_heartbeats.sh │ ├── run_local_ddp_test_sections.sh │ ├── run_workload_ctrl_test_excl_node.sh │ └── run_workload_ctrl_test_shutdown.sh └── unit │ ├── __init__.py │ ├── _launcher_test_util.py │ ├── conftest.py │ ├── test_barrier_rendezvous.py │ ├── test_config.py │ ├── test_dynamic_rendezvous.py │ ├── test_init.py │ ├── test_ipc_connector.py │ ├── test_launcher.py │ ├── test_layered_restart_v1.py │ ├── test_process_utils.py │ ├── test_rank_monitor_server.py │ ├── test_reconnect.py │ ├── test_shutdown.py │ ├── test_shutdown_sections.py │ ├── test_timeouts.py │ ├── test_timeouts_calc.py │ ├── test_timeouts_sections.py │ └── utils.py ├── inprocess ├── __init__.py ├── app.py ├── common.py ├── test_abort.py ├── test_app.py ├── test_compose.py ├── test_hang_protection_disabler.py ├── test_health_check.py ├── test_monitor_thread.py ├── test_nested_restarter.py ├── test_progress_watchdog.py ├── test_rank_assignment.py ├── test_timeout.py ├── test_torch.py └── test_wrap.py ├── ptl_resiliency ├── func │ └── nemo20 │ │ ├── Dockerfile.ft_test │ │ ├── check_straggler_log.py │ │ ├── ft_test_asserts.sh │ │ ├── ft_test_launchers.sh │ │ ├── ft_test_llama3.py │ │ ├── local_ckpt_test.sh │ │ ├── straggler_test_llama3.py │ │ └── test_local_ckpt_llama3.py └── unit │ ├── __init__.py │ ├── test_ft_callback_hb.py │ ├── test_ft_callback_sections.py │ ├── test_ft_state_machine.py │ ├── test_local_ckpt_callback.py │ └── test_straggler_det_callback.py ├── shared_utils ├── test_health_check.py └── test_logger.py └── straggler ├── README.md ├── func ├── check_log.py └── ddp_test.py └── unit ├── __init__.py ├── _utils.py ├── test_cupti_ext.py ├── test_cupti_manager.py ├── test_data_shared.py ├── test_det_section_api.py ├── test_individual_gpu_scores.py ├── test_interval_tracker.py ├── test_name_mapper.py ├── test_relative_gpu_scores.py ├── test_reporting.py ├── test_reporting_elapsed.py ├── test_sections.py └── test_wrap_callables.py /.github/workflows/build_docs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/.github/workflows/build_docs.yml -------------------------------------------------------------------------------- /.github/workflows/cherry_pick_release.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/.github/workflows/cherry_pick_release.yml -------------------------------------------------------------------------------- /.github/workflows/lint_code.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/.github/workflows/lint_code.yml -------------------------------------------------------------------------------- /.github/workflows/unit_test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/.github/workflows/unit_test.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/LICENSE.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/README.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/SECURITY.md -------------------------------------------------------------------------------- /cupti_build.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/cupti_build.py -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/Makefile -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/make.bat -------------------------------------------------------------------------------- /docs/source/checkpointing/async/api.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/api.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/async/api/core.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/api/core.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/async/api/filesystem_async.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/api/filesystem_async.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/async/api/state_dict_saver.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/api/state_dict_saver.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/async/api/torch_ckpt.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/api/torch_ckpt.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/async/examples.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/examples.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/async/examples/basic_example.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/examples/basic_example.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/async/examples/writer_example.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/examples/writer_example.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/async/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/index.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/async/usage_guide.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/async/usage_guide.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/api.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/api.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/api/base_ckpt_manager.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/api/base_ckpt_manager.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/api/base_state_dict.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/api/base_state_dict.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/api/basic_state_dict.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/api/basic_state_dict.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/api/callback.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/api/callback.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/api/local_ckpt_manager.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/api/local_ckpt_manager.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/api/replication.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/api/replication.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/examples.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/examples.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/examples/basic_example.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/examples/basic_example.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/index.rst -------------------------------------------------------------------------------- /docs/source/checkpointing/local/usage_guide.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/checkpointing/local/usage_guide.rst -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/conf.py -------------------------------------------------------------------------------- /docs/source/fault_tolerance/README-pci-topo-file.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/README-pci-topo-file.md -------------------------------------------------------------------------------- /docs/source/fault_tolerance/api.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/api.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/api/callback.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/api/callback.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/api/client.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/api/client.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/api/config.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/api/config.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/api/server.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/api/server.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/examples.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/examples.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/examples/basic_example.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/examples/basic_example.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/examples/in_job_and_in_process_example.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/examples/in_job_and_in_process_example.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/examples/train_ddp_heartbeats.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/examples/train_ddp_heartbeats.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/examples/train_ddp_sections.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/examples/train_ddp_sections.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/index.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/integration.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/integration.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/integration/heartbeats.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/integration/heartbeats.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/integration/inprocess.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/integration/inprocess.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/integration/ptl.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/integration/ptl.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/integration/sections.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/integration/sections.rst -------------------------------------------------------------------------------- /docs/source/fault_tolerance/usage_guide.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/fault_tolerance/usage_guide.rst -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/index.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/abort.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/abort.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/compose.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/compose.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/exception.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/exception.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/finalize.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/finalize.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/health_check.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/health_check.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/initialize.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/initialize.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/rank_assignment.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/rank_assignment.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/rank_filter.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/rank_filter.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/state.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/state.rst -------------------------------------------------------------------------------- /docs/source/inprocess/api/wrap.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/api/wrap.rst -------------------------------------------------------------------------------- /docs/source/inprocess/examples.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/examples.rst -------------------------------------------------------------------------------- /docs/source/inprocess/examples/basic_example.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/examples/basic_example.rst -------------------------------------------------------------------------------- /docs/source/inprocess/examples/optimal_example.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/examples/optimal_example.rst -------------------------------------------------------------------------------- /docs/source/inprocess/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/index.rst -------------------------------------------------------------------------------- /docs/source/inprocess/usage_guide.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/inprocess/usage_guide.rst -------------------------------------------------------------------------------- /docs/source/media/nvrx_core_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/media/nvrx_core_features.png -------------------------------------------------------------------------------- /docs/source/media/nvrx_docs_source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/media/nvrx_docs_source.png -------------------------------------------------------------------------------- /docs/source/release-notes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/release-notes.md -------------------------------------------------------------------------------- /docs/source/shared_utils/api.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/shared_utils/api.rst -------------------------------------------------------------------------------- /docs/source/shared_utils/config_reference.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/shared_utils/config_reference.rst -------------------------------------------------------------------------------- /docs/source/shared_utils/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/shared_utils/index.rst -------------------------------------------------------------------------------- /docs/source/shared_utils/logger_guide.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/shared_utils/logger_guide.rst -------------------------------------------------------------------------------- /docs/source/straggler_det/api.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/straggler_det/api.rst -------------------------------------------------------------------------------- /docs/source/straggler_det/api/callback.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/straggler_det/api/callback.rst -------------------------------------------------------------------------------- /docs/source/straggler_det/api/reporting.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/straggler_det/api/reporting.rst -------------------------------------------------------------------------------- /docs/source/straggler_det/api/statistics.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/straggler_det/api/statistics.rst -------------------------------------------------------------------------------- /docs/source/straggler_det/api/straggler.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/straggler_det/api/straggler.rst -------------------------------------------------------------------------------- /docs/source/straggler_det/examples.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/straggler_det/examples.rst -------------------------------------------------------------------------------- /docs/source/straggler_det/examples/basic_example.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/straggler_det/examples/basic_example.rst -------------------------------------------------------------------------------- /docs/source/straggler_det/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/straggler_det/index.rst -------------------------------------------------------------------------------- /docs/source/straggler_det/usage_guide.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/docs/source/straggler_det/usage_guide.rst -------------------------------------------------------------------------------- /examples/attribution/single_server_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/attribution/single_server_example.py -------------------------------------------------------------------------------- /examples/checkpointing/async_ckpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/checkpointing/async_ckpt.py -------------------------------------------------------------------------------- /examples/checkpointing/async_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/checkpointing/async_writer.py -------------------------------------------------------------------------------- /examples/checkpointing/local_ckpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/checkpointing/local_ckpt.py -------------------------------------------------------------------------------- /examples/fault_tolerance/basic_ft_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/fault_tolerance/basic_ft_example.py -------------------------------------------------------------------------------- /examples/fault_tolerance/dist_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/fault_tolerance/dist_utils.py -------------------------------------------------------------------------------- /examples/fault_tolerance/fault_tol_cfg_heartbeats.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/fault_tolerance/fault_tol_cfg_heartbeats.yaml -------------------------------------------------------------------------------- /examples/fault_tolerance/fault_tol_cfg_sections.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/fault_tolerance/fault_tol_cfg_sections.yaml -------------------------------------------------------------------------------- /examples/fault_tolerance/in_job_and_in_process_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/fault_tolerance/in_job_and_in_process_example.py -------------------------------------------------------------------------------- /examples/fault_tolerance/log_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/fault_tolerance/log_utils.py -------------------------------------------------------------------------------- /examples/fault_tolerance/run_inprocess_injob_example.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/fault_tolerance/run_inprocess_injob_example.sh -------------------------------------------------------------------------------- /examples/fault_tolerance/train_ddp_heartbeats_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/fault_tolerance/train_ddp_heartbeats_api.py -------------------------------------------------------------------------------- /examples/fault_tolerance/train_ddp_sections_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/fault_tolerance/train_ddp_sections_api.py -------------------------------------------------------------------------------- /examples/inprocess/advanced_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/inprocess/advanced_example.py -------------------------------------------------------------------------------- /examples/inprocess/basic_example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/inprocess/basic_example.py -------------------------------------------------------------------------------- /examples/straggler/example.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/examples/straggler/example.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/pyproject.toml -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/base.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/mcp_integration/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/mcp_integration/README.md -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/mcp_integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/mcp_integration/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/mcp_integration/mcp_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/mcp_integration/mcp_client.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/mcp_integration/mcp_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/mcp_integration/mcp_server.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/mcp_integration/module_definitions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/mcp_integration/module_definitions.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/mcp_integration/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/mcp_integration/registry.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/mcp_integration/server_launcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/mcp_integration/server_launcher.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/cupti.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/cupti.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/cupti_src/BufferPool.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/cupti_src/BufferPool.cpp -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/cupti_src/BufferPool.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/cupti_src/BufferPool.h -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/cupti_src/CircularBuffer.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/cupti_src/CircularBuffer.h -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/cupti_src/CuptiProfiler.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/cupti_src/CuptiProfiler.cpp -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/cupti_src/CuptiProfiler.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/cupti_src/CuptiProfiler.h -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/cupti_src/cupti_module_py.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/cupti_src/cupti_module_py.cpp -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/dist_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/dist_utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/interval_tracker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/interval_tracker.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/name_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/name_mapper.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/reporting.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/reporting.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/statistics.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/straggler/straggler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/straggler/straggler.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/trace_analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/trace_analyzer/fr_attribution.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/trace_analyzer/trace_collector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/trace_analyzer/trace_collector.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/attribution/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/attribution/utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/async_ckpt/cached_metadata_filesystem_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/async_ckpt/cached_metadata_filesystem_reader.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/async_ckpt/core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/async_ckpt/core.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/async_ckpt/filesystem_async.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/async_ckpt/filesystem_async.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/async_ckpt/state_dict_saver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/async_ckpt/state_dict_saver.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/async_ckpt/torch_ckpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/async_ckpt/torch_ckpt.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/base_state_dict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/base_state_dict.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/basic_state_dict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/basic_state_dict.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/ckpt_managers/base_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/ckpt_managers/base_manager.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/ckpt_managers/local_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/ckpt_managers/local_manager.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/replication/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/replication/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/replication/_torch_future.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/replication/_torch_future.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/replication/group_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/replication/group_utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/replication/strategies.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/replication/strategies.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/replication/torch_device_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/replication/torch_device_utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/local/replication/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/local/replication/utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/checkpointing/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/checkpointing/utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/_ft_rendezvous.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/_ft_rendezvous.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/c10d_monkey_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/c10d_monkey_patch.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/config.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/data.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/dict_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/dict_utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/ft_rendezvous_barrier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/ft_rendezvous_barrier.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/ipc_connector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/ipc_connector.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/launcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/launcher.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/rank_monitor_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/rank_monitor_client.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/rank_monitor_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/rank_monitor_server.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/rank_monitor_state_machine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/rank_monitor_state_machine.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/timeouts_calc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/timeouts_calc.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/fault_tolerance/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/fault_tolerance/utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/abort.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/abort.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/attribution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/attribution.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/completion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/completion.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/compose.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/compose.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/exception.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/exception.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/finalize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/finalize.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/health_check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/health_check.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/initialize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/initialize.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/monitor_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/monitor_process.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/monitor_thread.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/monitor_thread.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/nested_restarter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/nested_restarter.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/param_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/param_utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/progress_watchdog.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/progress_watchdog.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/rank_assignment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/rank_assignment.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/sibling_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/sibling_monitor.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/state.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/state.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/store.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/store.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/terminate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/terminate.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/tools/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/tools/inject_fault.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/tools/inject_fault.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/inprocess/wrap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/inprocess/wrap.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/ptl_resiliency/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/ptl_resiliency/__init__.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/ptl_resiliency/_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/ptl_resiliency/_utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/ptl_resiliency/fault_tolerance_callback.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/ptl_resiliency/fault_tolerance_callback.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/ptl_resiliency/fault_tolerance_sections_callback.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/ptl_resiliency/fault_tolerance_sections_callback.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/ptl_resiliency/local_checkpoint_callback.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/ptl_resiliency/local_checkpoint_callback.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/ptl_resiliency/straggler_det_callback.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/ptl_resiliency/straggler_det_callback.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/shared_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/shared_utils/health_check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/shared_utils/health_check.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/shared_utils/log_aggregator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/shared_utils/log_aggregator.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/shared_utils/log_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/shared_utils/log_manager.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/shared_utils/log_node_local_tmp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/shared_utils/log_node_local_tmp.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/shared_utils/os_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/shared_utils/os_utils.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/shared_utils/profiling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/shared_utils/profiling.py -------------------------------------------------------------------------------- /src/nvidia_resiliency_ext/shared_utils/wait_daemon.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/src/nvidia_resiliency_ext/shared_utils/wait_daemon.py -------------------------------------------------------------------------------- /tests/attribution/unit/REFERENCE_VALIDATION_SUMMARY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/REFERENCE_VALIDATION_SUMMARY.md -------------------------------------------------------------------------------- /tests/attribution/unit/fr_attribution_test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_attribution_test_utils.py -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_0 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_1 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_10: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_10 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_11: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_11 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_12: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_12 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_13: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_13 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_14: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_14 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_15: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_15 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_2 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_3 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_4 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_5 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_6 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_7 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_8 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/_dump_9: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/_dump_9 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_1st/fault_injection.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_1st/fault_injection.log -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_0 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_1 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_10: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_10 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_11: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_11 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_12: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_12 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_13: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_13 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_14: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_14 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_15: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_15 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_2 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_3 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_4 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_5 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_6 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_7 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_8 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_9: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/_dump_9 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/gpu_error_2nd/fault_injection.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/gpu_error_2nd/fault_injection.log -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_0 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_1 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_10: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_10 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_11: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_11 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_12: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_12 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_13: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_13 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_14: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_14 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_15: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_15 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_2 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_3 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_4 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_5 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_6 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_7 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_8 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/_dump_9: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/_dump_9 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_1st/fault_injection.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_1st/fault_injection.log -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_0 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_1 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_10: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_10 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_11: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_11 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_12: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_12 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_13: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_13 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_14: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_14 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_15: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_15 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_2 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_3 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_4 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_5 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_6 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_7 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_8 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_9: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/_dump_9 -------------------------------------------------------------------------------- /tests/attribution/unit/fr_traces/lock_gil_2nd/fault_injection.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/fr_traces/lock_gil_2nd/fault_injection.log -------------------------------------------------------------------------------- /tests/attribution/unit/reference_outputs/gpu_error_1st_reference.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/reference_outputs/gpu_error_1st_reference.txt -------------------------------------------------------------------------------- /tests/attribution/unit/reference_outputs/gpu_error_2nd_reference.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/reference_outputs/gpu_error_2nd_reference.txt -------------------------------------------------------------------------------- /tests/attribution/unit/reference_outputs/lock_gil_1st_reference.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/reference_outputs/lock_gil_1st_reference.txt -------------------------------------------------------------------------------- /tests/attribution/unit/reference_outputs/lock_gil_2nd_reference.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/reference_outputs/lock_gil_2nd_reference.txt -------------------------------------------------------------------------------- /tests/attribution/unit/test_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/test_base.py -------------------------------------------------------------------------------- /tests/attribution/unit/test_fr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/attribution/unit/test_fr.py -------------------------------------------------------------------------------- /tests/checkpointing/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/checkpointing/unit/__init__.py -------------------------------------------------------------------------------- /tests/checkpointing/unit/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/checkpointing/unit/conftest.py -------------------------------------------------------------------------------- /tests/checkpointing/unit/test_async_save.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/checkpointing/unit/test_async_save.py -------------------------------------------------------------------------------- /tests/checkpointing/unit/test_async_writer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/checkpointing/unit/test_async_writer.py -------------------------------------------------------------------------------- /tests/checkpointing/unit/test_async_writer_msc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/checkpointing/unit/test_async_writer_msc.py -------------------------------------------------------------------------------- /tests/checkpointing/unit/test_basic_local.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/checkpointing/unit/test_basic_local.py -------------------------------------------------------------------------------- /tests/checkpointing/unit/test_cleanup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/checkpointing/unit/test_cleanup.py -------------------------------------------------------------------------------- /tests/checkpointing/unit/test_utilities.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/checkpointing/unit/test_utilities.py -------------------------------------------------------------------------------- /tests/fault_tolerance/func/_launcher_mode_test_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/func/_launcher_mode_test_worker.py -------------------------------------------------------------------------------- /tests/fault_tolerance/func/_workload_ctrl_test_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/func/_workload_ctrl_test_worker.py -------------------------------------------------------------------------------- /tests/fault_tolerance/func/run_launcher_any_failed_mode_test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/func/run_launcher_any_failed_mode_test.sh -------------------------------------------------------------------------------- /tests/fault_tolerance/func/run_launcher_min_healthy_mode_test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/func/run_launcher_min_healthy_mode_test.sh -------------------------------------------------------------------------------- /tests/fault_tolerance/func/run_local_ddp_test_heartbeats.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/func/run_local_ddp_test_heartbeats.sh -------------------------------------------------------------------------------- /tests/fault_tolerance/func/run_local_ddp_test_sections.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/func/run_local_ddp_test_sections.sh -------------------------------------------------------------------------------- /tests/fault_tolerance/func/run_workload_ctrl_test_excl_node.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/func/run_workload_ctrl_test_excl_node.sh -------------------------------------------------------------------------------- /tests/fault_tolerance/func/run_workload_ctrl_test_shutdown.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/func/run_workload_ctrl_test_shutdown.sh -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/__init__.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/_launcher_test_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/_launcher_test_util.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/conftest.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_barrier_rendezvous.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_barrier_rendezvous.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_config.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_dynamic_rendezvous.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_dynamic_rendezvous.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_init.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_init.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_ipc_connector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_ipc_connector.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_launcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_launcher.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_layered_restart_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_layered_restart_v1.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_process_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_process_utils.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_rank_monitor_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_rank_monitor_server.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_reconnect.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_reconnect.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_shutdown.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_shutdown.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_shutdown_sections.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_shutdown_sections.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_timeouts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_timeouts.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_timeouts_calc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_timeouts_calc.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/test_timeouts_sections.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/test_timeouts_sections.py -------------------------------------------------------------------------------- /tests/fault_tolerance/unit/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/fault_tolerance/unit/utils.py -------------------------------------------------------------------------------- /tests/inprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/__init__.py -------------------------------------------------------------------------------- /tests/inprocess/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/app.py -------------------------------------------------------------------------------- /tests/inprocess/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/common.py -------------------------------------------------------------------------------- /tests/inprocess/test_abort.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_abort.py -------------------------------------------------------------------------------- /tests/inprocess/test_app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_app.py -------------------------------------------------------------------------------- /tests/inprocess/test_compose.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_compose.py -------------------------------------------------------------------------------- /tests/inprocess/test_hang_protection_disabler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_hang_protection_disabler.py -------------------------------------------------------------------------------- /tests/inprocess/test_health_check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_health_check.py -------------------------------------------------------------------------------- /tests/inprocess/test_monitor_thread.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_monitor_thread.py -------------------------------------------------------------------------------- /tests/inprocess/test_nested_restarter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_nested_restarter.py -------------------------------------------------------------------------------- /tests/inprocess/test_progress_watchdog.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_progress_watchdog.py -------------------------------------------------------------------------------- /tests/inprocess/test_rank_assignment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_rank_assignment.py -------------------------------------------------------------------------------- /tests/inprocess/test_timeout.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_timeout.py -------------------------------------------------------------------------------- /tests/inprocess/test_torch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_torch.py -------------------------------------------------------------------------------- /tests/inprocess/test_wrap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/inprocess/test_wrap.py -------------------------------------------------------------------------------- /tests/ptl_resiliency/func/nemo20/Dockerfile.ft_test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/func/nemo20/Dockerfile.ft_test -------------------------------------------------------------------------------- /tests/ptl_resiliency/func/nemo20/check_straggler_log.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/func/nemo20/check_straggler_log.py -------------------------------------------------------------------------------- /tests/ptl_resiliency/func/nemo20/ft_test_asserts.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/func/nemo20/ft_test_asserts.sh -------------------------------------------------------------------------------- /tests/ptl_resiliency/func/nemo20/ft_test_launchers.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/func/nemo20/ft_test_launchers.sh -------------------------------------------------------------------------------- /tests/ptl_resiliency/func/nemo20/ft_test_llama3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/func/nemo20/ft_test_llama3.py -------------------------------------------------------------------------------- /tests/ptl_resiliency/func/nemo20/local_ckpt_test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/func/nemo20/local_ckpt_test.sh -------------------------------------------------------------------------------- /tests/ptl_resiliency/func/nemo20/straggler_test_llama3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/func/nemo20/straggler_test_llama3.py -------------------------------------------------------------------------------- /tests/ptl_resiliency/func/nemo20/test_local_ckpt_llama3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/func/nemo20/test_local_ckpt_llama3.py -------------------------------------------------------------------------------- /tests/ptl_resiliency/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ptl_resiliency/unit/test_ft_callback_hb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/unit/test_ft_callback_hb.py -------------------------------------------------------------------------------- /tests/ptl_resiliency/unit/test_ft_callback_sections.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/unit/test_ft_callback_sections.py -------------------------------------------------------------------------------- /tests/ptl_resiliency/unit/test_ft_state_machine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/unit/test_ft_state_machine.py -------------------------------------------------------------------------------- /tests/ptl_resiliency/unit/test_local_ckpt_callback.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/unit/test_local_ckpt_callback.py -------------------------------------------------------------------------------- /tests/ptl_resiliency/unit/test_straggler_det_callback.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/ptl_resiliency/unit/test_straggler_det_callback.py -------------------------------------------------------------------------------- /tests/shared_utils/test_health_check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/shared_utils/test_health_check.py -------------------------------------------------------------------------------- /tests/shared_utils/test_logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/shared_utils/test_logger.py -------------------------------------------------------------------------------- /tests/straggler/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/README.md -------------------------------------------------------------------------------- /tests/straggler/func/check_log.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/func/check_log.py -------------------------------------------------------------------------------- /tests/straggler/func/ddp_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/func/ddp_test.py -------------------------------------------------------------------------------- /tests/straggler/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/__init__.py -------------------------------------------------------------------------------- /tests/straggler/unit/_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/_utils.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_cupti_ext.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_cupti_ext.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_cupti_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_cupti_manager.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_data_shared.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_data_shared.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_det_section_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_det_section_api.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_individual_gpu_scores.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_individual_gpu_scores.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_interval_tracker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_interval_tracker.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_name_mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_name_mapper.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_relative_gpu_scores.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_relative_gpu_scores.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_reporting.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_reporting.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_reporting_elapsed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_reporting_elapsed.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_sections.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_sections.py -------------------------------------------------------------------------------- /tests/straggler/unit/test_wrap_callables.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/nvidia-resiliency-ext/HEAD/tests/straggler/unit/test_wrap_callables.py --------------------------------------------------------------------------------