├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── artifact_evaluation ├── README.md ├── evaluation │ ├── sensitivity_analysis │ │ ├── run_microbenchmarks.py │ │ └── run_pccheck_async.py │ └── throughput │ │ ├── get_goodput.sh │ │ ├── get_throughput_single_node.sh │ │ ├── gpus_trace.csv │ │ ├── loading.py │ │ ├── run_goodput_model.py │ │ └── run_throughput_model.py └── test_simple.sh ├── checkpoint_eval ├── DISTRIBUTED.md ├── checkfreq │ ├── chk_manager.py │ └── utils.py ├── deepspeed │ └── __init__.py ├── gemini │ └── chk_manager.py ├── get_throughput_multi_node.py ├── gpm │ ├── Makefile │ ├── change-ddio.h │ ├── checkp_func.cu │ ├── checkp_func.h │ ├── cuda_wrapper.cpp │ ├── gpm-annotations.cuh │ ├── gpm-helper.cuh │ ├── gpm_manager.py │ ├── lenet.cu │ ├── libgpm.cuh │ ├── libgpmcp.cuh │ └── simulate.py ├── models │ ├── bert │ │ ├── bertPrep.py │ │ ├── create_datasets_from_start.sh │ │ ├── data_download.sh │ │ ├── modeling.py │ │ ├── optimization.py │ │ ├── run_cfreq.py │ │ ├── run_gpm.py │ │ ├── run_pccheck.py │ │ ├── run_squad.py │ │ ├── run_squad_chfreq.py │ │ ├── run_squad_default.py │ │ ├── run_squad_gpm.py │ │ └── run_squad_pccheck.py │ ├── llm_distr │ │ ├── bloom_ds.py │ │ ├── convert_to_ds.py │ │ ├── deepspeed.py │ │ ├── ds_config.json │ │ ├── llama_ds.py │ │ ├── opt_ds.py │ │ ├── run_cfreq.py │ │ ├── run_clm_pp.py │ │ ├── run_clm_pp_cfreq.py │ │ ├── run_clm_pp_gemini.py │ │ ├── run_clm_pp_gpm.py │ │ ├── run_clm_pp_pccheck.py │ │ ├── run_gpm.py │ │ ├── run_pccheck.py │ │ └── trainer_pp.py │ ├── microbenchmarks │ │ ├── test_cfreq.py │ │ ├── test_gemini.py │ │ ├── test_gpm.py │ │ └── test_pccheck.py │ ├── opt │ │ ├── __init__.py │ │ ├── ds_config.json │ │ ├── run_cfreq.py │ │ ├── run_clm_cfreq.py │ │ ├── run_clm_default.py │ │ ├── run_clm_gpm.py │ │ ├── run_clm_pccheck.py │ │ ├── run_gpm.py │ │ ├── run_pccheck.py │ │ ├── trainer_checkfreq.py │ │ ├── trainer_default.py │ │ ├── trainer_gpm.py │ │ └── trainer_pccheck.py │ ├── transformer │ │ ├── getdata.sh │ │ ├── run_cfreq.py │ │ ├── run_gpm.py │ │ ├── run_pccheck.py │ │ ├── train_checkfreq.py │ │ ├── train_gpm.py │ │ └── train_pccheck.py │ └── vision │ │ ├── run_cfreq.py │ │ ├── run_gpm.py │ │ ├── run_pccheck.py │ │ ├── train_checkfreq.py │ │ ├── train_gpm.py │ │ └── train_pccheck.py ├── pccheck │ ├── DRAMAlloc.h │ ├── FAAQueue.h │ ├── FAAQueueAdd.h │ ├── HazardPointers.hpp │ ├── MSQueue.h │ ├── Makefile │ ├── chk_checkpoint_pipeline.py │ ├── chk_monitor.py │ ├── main_ssd_memory.cpp │ ├── socket_work.cpp │ ├── socket_work.h │ └── test_queue.cpp └── pccheck_utils.py ├── install.sh ├── install_preq_at_vm.sh ├── requirements.txt ├── setup.py └── setup_models_and_datasets.sh /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/.gitignore -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/Dockerfile -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/README.md -------------------------------------------------------------------------------- /artifact_evaluation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/artifact_evaluation/README.md -------------------------------------------------------------------------------- /artifact_evaluation/evaluation/sensitivity_analysis/run_microbenchmarks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/artifact_evaluation/evaluation/sensitivity_analysis/run_microbenchmarks.py -------------------------------------------------------------------------------- /artifact_evaluation/evaluation/sensitivity_analysis/run_pccheck_async.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/artifact_evaluation/evaluation/sensitivity_analysis/run_pccheck_async.py -------------------------------------------------------------------------------- /artifact_evaluation/evaluation/throughput/get_goodput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python3.9 run_goodput_model.py transformer -------------------------------------------------------------------------------- /artifact_evaluation/evaluation/throughput/get_throughput_single_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python3.9 run_throughput_model.py transformer -------------------------------------------------------------------------------- /artifact_evaluation/evaluation/throughput/gpus_trace.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/artifact_evaluation/evaluation/throughput/gpus_trace.csv -------------------------------------------------------------------------------- /artifact_evaluation/evaluation/throughput/loading.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/artifact_evaluation/evaluation/throughput/loading.py -------------------------------------------------------------------------------- /artifact_evaluation/evaluation/throughput/run_goodput_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/artifact_evaluation/evaluation/throughput/run_goodput_model.py -------------------------------------------------------------------------------- /artifact_evaluation/evaluation/throughput/run_throughput_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/artifact_evaluation/evaluation/throughput/run_throughput_model.py -------------------------------------------------------------------------------- /artifact_evaluation/test_simple.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/artifact_evaluation/test_simple.sh -------------------------------------------------------------------------------- /checkpoint_eval/DISTRIBUTED.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/DISTRIBUTED.md -------------------------------------------------------------------------------- /checkpoint_eval/checkfreq/chk_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/checkfreq/chk_manager.py -------------------------------------------------------------------------------- /checkpoint_eval/checkfreq/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/checkfreq/utils.py -------------------------------------------------------------------------------- /checkpoint_eval/deepspeed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/deepspeed/__init__.py -------------------------------------------------------------------------------- /checkpoint_eval/gemini/chk_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gemini/chk_manager.py -------------------------------------------------------------------------------- /checkpoint_eval/get_throughput_multi_node.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/get_throughput_multi_node.py -------------------------------------------------------------------------------- /checkpoint_eval/gpm/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/Makefile -------------------------------------------------------------------------------- /checkpoint_eval/gpm/change-ddio.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/change-ddio.h -------------------------------------------------------------------------------- /checkpoint_eval/gpm/checkp_func.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/checkp_func.cu -------------------------------------------------------------------------------- /checkpoint_eval/gpm/checkp_func.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/checkp_func.h -------------------------------------------------------------------------------- /checkpoint_eval/gpm/cuda_wrapper.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/cuda_wrapper.cpp -------------------------------------------------------------------------------- /checkpoint_eval/gpm/gpm-annotations.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/gpm-annotations.cuh -------------------------------------------------------------------------------- /checkpoint_eval/gpm/gpm-helper.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/gpm-helper.cuh -------------------------------------------------------------------------------- /checkpoint_eval/gpm/gpm_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/gpm_manager.py -------------------------------------------------------------------------------- /checkpoint_eval/gpm/lenet.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/lenet.cu -------------------------------------------------------------------------------- /checkpoint_eval/gpm/libgpm.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/libgpm.cuh -------------------------------------------------------------------------------- /checkpoint_eval/gpm/libgpmcp.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/libgpmcp.cuh -------------------------------------------------------------------------------- /checkpoint_eval/gpm/simulate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/gpm/simulate.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/bertPrep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/bertPrep.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/create_datasets_from_start.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/create_datasets_from_start.sh -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/data_download.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/data_download.sh -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/modeling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/modeling.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/optimization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/optimization.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/run_cfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/run_cfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/run_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/run_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/run_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/run_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/run_squad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/run_squad.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/run_squad_chfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/run_squad_chfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/run_squad_default.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/run_squad_default.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/run_squad_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/run_squad_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/bert/run_squad_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/bert/run_squad_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/bloom_ds.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/bloom_ds.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/convert_to_ds.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/convert_to_ds.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/deepspeed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/deepspeed.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/ds_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/ds_config.json -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/llama_ds.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/llama_ds.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/opt_ds.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/opt_ds.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/run_cfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/run_cfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/run_clm_pp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/run_clm_pp.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/run_clm_pp_cfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/run_clm_pp_cfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/run_clm_pp_gemini.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/run_clm_pp_gemini.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/run_clm_pp_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/run_clm_pp_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/run_clm_pp_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/run_clm_pp_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/run_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/run_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/run_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/run_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/llm_distr/trainer_pp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/llm_distr/trainer_pp.py -------------------------------------------------------------------------------- /checkpoint_eval/models/microbenchmarks/test_cfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/microbenchmarks/test_cfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/microbenchmarks/test_gemini.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/microbenchmarks/test_gemini.py -------------------------------------------------------------------------------- /checkpoint_eval/models/microbenchmarks/test_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/microbenchmarks/test_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/microbenchmarks/test_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/microbenchmarks/test_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/__init__.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/ds_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/ds_config.json -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/run_cfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/run_cfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/run_clm_cfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/run_clm_cfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/run_clm_default.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/run_clm_default.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/run_clm_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/run_clm_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/run_clm_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/run_clm_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/run_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/run_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/run_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/run_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/trainer_checkfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/trainer_checkfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/trainer_default.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/trainer_default.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/trainer_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/trainer_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/opt/trainer_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/opt/trainer_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/transformer/getdata.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/transformer/getdata.sh -------------------------------------------------------------------------------- /checkpoint_eval/models/transformer/run_cfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/transformer/run_cfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/transformer/run_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/transformer/run_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/transformer/run_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/transformer/run_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/transformer/train_checkfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/transformer/train_checkfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/transformer/train_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/transformer/train_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/transformer/train_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/transformer/train_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/vision/run_cfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/vision/run_cfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/vision/run_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/vision/run_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/vision/run_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/vision/run_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/models/vision/train_checkfreq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/vision/train_checkfreq.py -------------------------------------------------------------------------------- /checkpoint_eval/models/vision/train_gpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/vision/train_gpm.py -------------------------------------------------------------------------------- /checkpoint_eval/models/vision/train_pccheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/models/vision/train_pccheck.py -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/DRAMAlloc.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/DRAMAlloc.h -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/FAAQueue.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/FAAQueue.h -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/FAAQueueAdd.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/FAAQueueAdd.h -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/HazardPointers.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/HazardPointers.hpp -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/MSQueue.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/MSQueue.h -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/Makefile -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/chk_checkpoint_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/chk_checkpoint_pipeline.py -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/chk_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/chk_monitor.py -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/main_ssd_memory.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/main_ssd_memory.cpp -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/socket_work.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/socket_work.cpp -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/socket_work.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/socket_work.h -------------------------------------------------------------------------------- /checkpoint_eval/pccheck/test_queue.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck/test_queue.cpp -------------------------------------------------------------------------------- /checkpoint_eval/pccheck_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/checkpoint_eval/pccheck_utils.py -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/install.sh -------------------------------------------------------------------------------- /install_preq_at_vm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/install_preq_at_vm.sh -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/setup.py -------------------------------------------------------------------------------- /setup_models_and_datasets.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eth-easl/pccheck/HEAD/setup_models_and_datasets.sh --------------------------------------------------------------------------------