├── .gitignore ├── LICENSE ├── README.md ├── cluster_exp ├── Makefile ├── README.md ├── __init__.py ├── calc.py ├── cluster.py ├── cluster_specs │ ├── n1g8.csv │ ├── n2g8.csv │ ├── n4g8.csv │ └── n8g8.csv ├── controller.py ├── draw.py ├── flags.py ├── jobs.py ├── log.py ├── matching.py ├── models.py ├── node.py ├── prepare_env.sh ├── results │ ├── Muri-L │ │ ├── cluster.csv │ │ ├── job.csv │ │ └── result.out │ ├── Muri-S │ │ ├── cluster.csv │ │ ├── job.csv │ │ └── result.out │ ├── SRSF │ │ ├── cluster.csv │ │ ├── job.csv │ │ └── result.out │ ├── SRTF │ │ ├── cluster.csv │ │ ├── job.csv │ │ └── result.out │ ├── themis │ │ ├── cluster.csv │ │ ├── job.csv │ │ └── result.out │ └── tiresias │ │ ├── cluster.csv │ │ ├── job.csv │ │ └── result.out ├── run.py ├── run.sh ├── runtime │ ├── __init__.py │ ├── proto │ │ ├── master_to_worker.proto │ │ ├── scheduler_to_trainer.proto │ │ ├── trainer_to_scheduler.proto │ │ └── worker_to_master.proto │ ├── rpc │ │ ├── master_client.py │ │ ├── master_server.py │ │ ├── scheduler_client.py │ │ ├── scheduler_server.py │ │ ├── trainer_client.py │ │ ├── trainer_server.py │ │ ├── worker_client.py │ │ └── worker_server.py │ ├── rpc_stubs │ │ ├── master_to_worker_pb2.py │ │ ├── master_to_worker_pb2_grpc.py │ │ ├── scheduler_to_trainer_pb2.py │ │ ├── scheduler_to_trainer_pb2_grpc.py │ │ ├── trainer_to_scheduler_pb2.py │ │ ├── trainer_to_scheduler_pb2_grpc.py │ │ ├── worker_to_master_pb2.py │ │ └── worker_to_master_pb2_grpc.py │ └── tests │ │ ├── localhost_tests.py │ │ └── worker_server_tests.py ├── scheduler.py ├── switch.py ├── task.py ├── trace-data │ └── cluster_trace.csv ├── trainer.py ├── utils.py ├── worker.py └── workloads │ ├── main_real_preenv.py │ ├── main_real_util.py │ ├── models │ ├── __init__.py │ ├── a2c_model.py │ ├── cv_model.py │ ├── deep_rl │ │ ├── __init__.py │ │ ├── agent │ │ │ ├── A2C_agent.py │ │ │ ├── BaseAgent.py │ │ │ ├── CategoricalDQN_agent.py │ │ │ ├── DDPG_agent.py │ │ │ ├── DQN_agent.py │ │ │ ├── 
NStepDQN_agent.py │ │ │ ├── OptionCritic_agent.py │ │ │ ├── PPO_agent.py │ │ │ ├── QuantileRegressionDQN_agent.py │ │ │ ├── TD3_agent.py │ │ │ └── __init__.py │ │ ├── component │ │ │ ├── __init__.py │ │ │ ├── envs.py │ │ │ ├── random_process.py │ │ │ └── replay.py │ │ ├── network │ │ │ ├── __init__.py │ │ │ ├── network_bodies.py │ │ │ ├── network_heads.py │ │ │ └── network_utils.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── logger.py │ │ │ ├── misc.py │ │ │ ├── normalizer.py │ │ │ ├── plot.py │ │ │ ├── schedule.py │ │ │ ├── sum_tree.py │ │ │ └── torch_utils.py │ ├── dqn_model.py │ └── nlp_model.py │ ├── options.py │ ├── requirements.txt │ ├── run.sh │ └── run_preenv.sh └── simulator ├── .gitignore ├── README.md ├── calc.py ├── cluster.py ├── cluster_spec └── n8g8.csv ├── draw_fig11-13.py ├── draw_fig9-10.py ├── flags.py ├── jobs.py ├── log.py ├── matching.py ├── models.py ├── node.py ├── run_sim.py ├── sim_fig10.sh ├── sim_fig11.sh ├── sim_fig12.sh ├── sim_fig13.sh ├── sim_fig9.sh ├── switch.py ├── tf_job.csv ├── trace-data ├── job_type_1.csv ├── job_type_2.csv ├── job_type_3.csv ├── job_type_4.csv ├── trace1.csv ├── trace1_pr.csv ├── trace2.csv ├── trace2_pr.csv ├── trace3.csv ├── trace3_pr.csv ├── trace4.csv └── trace4_pr.csv └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. 
Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | !cluster_exp/results/*/*.out 35 | *.app 36 | *.i*86 37 | *.x86_64 38 | *.hex 39 | 40 | # Debug files 41 | *.dSYM/ 42 | *.su 43 | *.idb 44 | *.pdb 45 | 46 | # Kernel Module Compile Results 47 | *.mod* 48 | *.cmd 49 | .tmp_versions/ 50 | modules.order 51 | Module.symvers 52 | Mkfile.old 53 | dkms.conf 54 | 55 | cluster_exp/workloads/datasets/ 56 | cluster_exp/workloads/test*.txt 57 | cluster_exp/workloads/output/ 58 | cluster_exp/workloads/hostfiles/ 59 | cluster_exp/workloads/tf_log/ 60 | cluster_exp/workloads/log/ 61 | cluster_exp/job_logs/ 62 | cluster_exp/hostfiles/ 63 | cluster_exp/tf_log/ 64 | cluster_exp/log/ 65 | *.xml 66 | 67 | .DS_Store 68 | cluster_exp/.DS_Store 69 | simulator/.DS_Store 70 | 71 | __pycache__ 72 | cluster_exp/__pycache__/ 73 | simulator/__pypache__/ 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 0. Introduction 2 | This repository contains the source code for our SIGCOMM'22 paper "Multi-Resource Interleaving for Deep Learning Training". 3 | 4 | 5 | # 1. Content 6 | - **simulator/** contains code for simulation and is adapted from [Tiresias](https://github.com/SymbioticLab/Tiresias). Please refer to ```/simulator/README.md``` for detailed information. 7 | - **cluster_exp/** contains code for real-cluster experiment. Please refer to ```/cluster_exp/README.md``` for detailed information. 8 | 9 | 10 | # 2. Reproduce results (for SIGCOMM'22 artifact evaluation) 11 | Please refer to ```/simulator/README.md``` and ```/cluster_exp/README.md``` for details. 12 | 13 | Note: Due to the execution scripts of testbed experiments are highly related to intracompany platform, we only demonstrate the functionality and show the pseudocode of the related scripts (e.g., run.sh, prepare_env.sh). 
Please adjust to your platform if you would like to execute the testbed experiment. 14 | 15 | 16 | # 3. Contact 17 | For any question, please contact ```zhaoyh98 at pku dot edu dot cn``` -------------------------------------------------------------------------------- /cluster_exp/Makefile: -------------------------------------------------------------------------------- 1 | rpc: 2 | python3 -m grpc_tools.protoc -Iruntime/proto --python_out=runtime/rpc_stubs --grpc_python_out=runtime/rpc_stubs runtime/proto/worker_to_master.proto 3 | python3 -m grpc_tools.protoc -Iruntime/proto --python_out=runtime/rpc_stubs --grpc_python_out=runtime/rpc_stubs runtime/proto/master_to_worker.proto 4 | python3 -m grpc_tools.protoc -Iruntime/proto --python_out=runtime/rpc_stubs --grpc_python_out=runtime/rpc_stubs runtime/proto/trainer_to_scheduler.proto 5 | python3 -m grpc_tools.protoc -Iruntime/proto --python_out=runtime/rpc_stubs --grpc_python_out=runtime/rpc_stubs runtime/proto/scheduler_to_trainer.proto 6 | 7 | clean: 8 | rm -rf runtime/rpc_stubs/*_pb2.py runtime/rpc_stubs/*_pb2_grpc.py -------------------------------------------------------------------------------- /cluster_exp/README.md: -------------------------------------------------------------------------------- 1 | # Testbed experiments 2 | Note: 3 | - Due to the execution scripts are highly related to intracompany platform, we only demonstrate the functionality and show the pseudocode of the related scripts (e.g., run.sh, prepare_env.sh). Please adjust the scripts to your platform if you would like to execute the testbed experiment. 4 | - Our testbed experiments were performed on 8 nodes with 8 V100 GPUs per node. For other cluster settings, please change ```setups``` in ```run.sh```. 5 | 6 | # 0. Content 7 | - **cluster_exp/** contains code for real-cluster experiment. 8 | - **cluster_spec/** contains configuration files for cluster, e.g., the number of nodes, the number of GPU per node. 
9 | - **runtime/** contains gRPC runtime of scheduler, trainer, master, and worker. 10 | - **trace-data/** contains traces for testbed evaluation. 11 | - **workloads/** contains the implementations of DL workloads used in our evaluation. 12 | - **calc.py** computes metrics, e.g., avg. JCT, Makespan, and 99th JCT. 13 | - **cluster.py**, **switch.py**, and **node.py** contain implementations of the cluster. 14 | - **jobs.py** and **model.py** contain information of the jobs. 15 | - **flags.py** contains the argument definition method. 16 | - **log.py** and **utils.py** contain auxiliary functions. 17 | - **matching.py** contains the implementation of the matching algorithm for Muri. 18 | - **run.py** contains the implementation of different scheduling policies. 19 | - **controller.py**, **scheduler.py**, **trainer.py**, **worker.py**, and **task.py** contain the implementation of scheduler components and scheduling tasks. 20 | - **Makefile** prepares gRPC 21 | 22 | # 1. Environment config 23 | ### Step 1: interconnect each node 24 | 25 | ### Step 2: create conda environment 26 | ``` 27 | # create conda env 28 | conda create -n muri python=3.8 29 | conda activate muri 30 | ``` 31 | 32 | ### Step 3: install Open MPI 33 | [Install Open MPI](https://www.open-mpi.org/faq/?category=building#easy-build) or other MPI implementation. 
34 | 35 | ### Step 4: install python dependencies 36 | ``` 37 | # gRPC 38 | python -m pip install grpcio 39 | python -m pip install grpcio-tools 40 | 41 | # prepare rpc 42 | cd /cluster_exp 43 | make rpc 44 | 45 | # other dependencies 46 | conda install numpy 47 | conda install -c conda-forge cvxpy 48 | conda install pytorch torchvision torchaudio cudatoolkit -c pytorch 49 | HOROVOD_GPU_OPERATIONS=NCCL python -m pip install horovod 50 | 51 | # dependencies for workloads 52 | # NLP 53 | conda install -c huggingface transformers 54 | # RL 55 | python -m pip install -r /cluster_exp/workloads/requirements.txt 56 | ``` 57 | 58 | ### Step 5: prepare datasets (for testbed experiment) 59 | - [Imagenet-1k](https://academictorrents.com/details/a306397ccf9c2ead27155983c254227c0fd938e2) for CV models. 60 | - [Wikitext](https://huggingface.co/datasets/wikitext) for NLP models. 61 | Store these datsets in ```/cluster_exp/datasets/``` 62 | 63 | # 2. Reproduce testbed results (for SIGCOMM'22 artifact evaluation) 64 | - ```cd /cluster_exp``` 65 | - Table 3&4, Figure 8: ```bash run.sh ```, `````` can be set to 66 | - ```shortest```: SRTF 67 | - ```shortest-gpu```: SRSF 68 | - ```multi-resource-blossom-same-gpu```: Muri-S 69 | - ```dlas-gpu```: Tiresias 70 | - ```themis```: Themis 71 | - ```multi-resource-blossom-same-gpu-unaware```: Muri-L 72 | - Each test takes about 1 day. 73 | 74 | Note: We list the detailed log (```/cluster.csv``` and ```/job.csv```) and evaluation results (```/result.out```) in ```/cluster_exp/results```. You can use ```python3 draw.py``` to get the figures shown in our paper. 
75 | -------------------------------------------------------------------------------- /cluster_exp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rivendile/Muri/4e69a78ce75f629a5a00abd48150a1860b341ce6/cluster_exp/__init__.py -------------------------------------------------------------------------------- /cluster_exp/calc.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | 4 | log_path = sys.argv[1] 5 | 6 | csv_reader = csv.reader(open(log_path+"/job.csv")) 7 | jct_sum = 0 8 | makespan = 0 9 | cnt = 0 10 | jct_list = [] 11 | for line_id,line in enumerate(csv_reader): 12 | if line_id > 0: 13 | jct_sum += float(line[-5]) 14 | makespan = max(makespan, float(line[5])) 15 | cnt += 1 16 | jct_list.append(float(line[-5])) 17 | 18 | jct_list.sort() 19 | 20 | print("Total jobs: %d, avg JCT: %.6f, makespan: %.6f, 99th JCT: %.6f" % (cnt, jct_sum/cnt, makespan, jct_list[int(cnt*0.99)])) -------------------------------------------------------------------------------- /cluster_exp/cluster_specs/n1g8.csv: -------------------------------------------------------------------------------- 1 | num_switch,num_node_p_switch,num_gpu_p_node,num_cpu_p_node,mem_p_node 2 | 1,1,8,92,350 3 | -------------------------------------------------------------------------------- /cluster_exp/cluster_specs/n2g8.csv: -------------------------------------------------------------------------------- 1 | num_switch,num_node_p_switch,num_gpu_p_node,num_cpu_p_node,mem_p_node 2 | 1,2,8,92,350 3 | -------------------------------------------------------------------------------- /cluster_exp/cluster_specs/n4g8.csv: -------------------------------------------------------------------------------- 1 | num_switch,num_node_p_switch,num_gpu_p_node,num_cpu_p_node,mem_p_node 2 | 1,2,8,92,350 -------------------------------------------------------------------------------- 
import argparse
import time
import threading
import utils
import queue

from runtime.rpc import master_server, master_client
import log


class Controller(object):
    """Cluster master: accepts worker registrations over gRPC and forwards
    execute/kill/util commands from the scheduler to the workers."""

    def __init__(self, port: int, num_workers: int) -> None:
        super().__init__()

        self._logger = utils.make_logger(__name__)

        self._num_workers = num_workers
        # MasterClientForWorker stubs, indexed by worker_id.
        self._workers = []

        # (finish_time, job_id, worker_id, gpus, returncode) tuples pushed
        # by _done_impl; the scheduler drains this queue.
        self.done_queue = queue.Queue()

        self._server_for_worker = self.make_server_for_worker(port)

        # Block until every expected worker has registered.
        self.wait_for_workers()
        self._jump_time = 0
        self._start_time = time.time()

    def set_start_time(self):
        """Reset the experiment clock to 'now'."""
        self._start_time = time.time()

    def get_time(self):
        """Seconds elapsed since start, plus any fast-forward jump."""
        return time.time() + self._jump_time - self._start_time

    def make_server_for_worker(self, port: int):
        """Start the gRPC server thread that workers register/report to.

        Returns the started Thread object.
        """
        callbacks = {
            'RegisterWorker' : self._register_worker_impl,
            'Done' : self._done_impl,
        }

        server_thread = threading.Thread(
            target=master_server.serve,
            args=(port, self._logger, callbacks))
        # daemon=True (setDaemon() is deprecated since Python 3.10) so the
        # server thread does not keep the process alive at exit.
        server_thread.daemon = True
        server_thread.start()

        return server_thread

    def execute(self, job_info):
        """Forward an execute request to the worker on the job's first node."""
        self._logger.info(f'controller execute job {list(job_info.job_id)}, use node: {list(job_info.node_id)}')
        self._workers[min(list(job_info.node_id))].execute(job_info)

    def kill(self, job_info):
        """Forward a kill request to the worker on the job's first node."""
        self._workers[min(list(job_info.node_id))].kill(job_info)

    def get_util(self, secs=20):
        """Average GPU util, CPU util and IO read rate over all workers.

        Args:
            secs: sampling window in seconds, passed through to each worker.

        Returns:
            (avg_gpu_util, avg_cpu_util, avg_io_read) averaged across workers.
        """
        num_workers = len(self._workers)
        self._logger.info(f'controller get util of {num_workers} worker(s): {secs}s')
        if num_workers == 0:
            # No registered workers yet; the original code divided by zero.
            return 0, 0, 0
        avg_gpu_util_all, avg_cpu_util_all, avg_io_read_all = 0, 0, 0
        for worker in self._workers:
            avg_gpu_util, avg_cpu_util, avg_io_read = worker.get_util(secs)
            avg_gpu_util_all += avg_gpu_util
            avg_cpu_util_all += avg_cpu_util
            avg_io_read_all += avg_io_read
        avg_gpu_util_all /= num_workers
        avg_cpu_util_all /= num_workers
        avg_io_read_all /= num_workers
        return avg_gpu_util_all, avg_cpu_util_all, avg_io_read_all

    def _register_worker_impl(self, worker_ip, worker_port, num_gpus):
        """RPC callback: record a new worker and return (success, worker_id)."""
        success = True
        worker_id = len(self._workers)
        self._workers.append(master_client.MasterClientForWorker(self._logger, worker_id, worker_ip, worker_port))

        self._logger.info(f'controller, register, {worker_id}, {worker_ip}:{worker_port}')

        return success, worker_id

    def _done_impl(self, job_id, job_counter, worker_id, gpus, returncode) -> bool:
        """RPC callback: a job finished on a worker; queue the completion event."""
        success = True

        self.done_queue.put((self.get_time(), job_id, worker_id, gpus, returncode))
        self._logger.info(f'controller, done, {worker_id}, {job_id} - {job_counter} @ {worker_id}, {gpus}, return code: {returncode}')

        return success

    def wait_for_workers(self):
        """Poll until the expected number of workers has registered."""
        while len(self._workers) < self._num_workers:
            time.sleep(5)

    def kill_workers(self):
        """Tell every registered worker process to exit."""
        for worker in self._workers:
            worker.exit_command()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--port', type=int, default=9012)
    parser.add_argument('--num_workers', type=int, default=8)
    args = parser.parse_args()

    controller = Controller(args.port, args.num_workers)
def read_csv(file:str, result:dict):
    """Load every 50th row of a cluster.csv log into *result*.

    Args:
        file: path to a cluster.csv log with columns
            time, queue_length, blocking_index, gpu_util, cpu_util,
            io_read_speed (header row required).
        result: dict of lists (keys matching the columns above) that is
            appended to in place.  Times are converted to hours, cpu_util
            is rescaled to a per-cluster percentage, and io_read_speed is
            converted to MB/s at the end.
    """
    with open(file) as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader)
        count = 0
        for row in reader:
            count += 1
            # Downsample: keep one row out of every 50.
            if count % 50 != 0:
                continue
            result["time"].append(float(row[0]) / 3600)
            result["queue_length"].append(int(row[1]))
            result["blocking_index"].append(float(row[2]))
            result["gpu_util"].append(float(row[3]))
            # Rescale: 8 GPUs/node, 96 CPUs, 6 CPUs per GPU worker, 8 nodes
            # -- assumes the n8g8 testbed layout; TODO confirm for others.
            result["cpu_util"].append(float(row[4])/8*96/6/8)
            io_read = float(row[5])
            if io_read > 1000000:
                # Outlier guard: replace implausible spikes with the previous
                # sample.  Fall back to 0.0 when this is the first sample
                # (the original code raised IndexError here).
                io_read = result["io_read_speed"][-1] if result["io_read_speed"] else 0.0
            result["io_read_speed"].append(io_read)
        # Convert KB/s -> MB/s in one pass after reading.
        for i in range(len(result["io_read_speed"])):
            result["io_read_speed"][i] /= 1024
== "blocking_index": 45 | plt.ylabel("Blocking Index") 46 | elif x == "gpu_util": 47 | plt.ylabel("GPU Util (%)") 48 | elif x == "cpu_util": 49 | plt.ylabel("CPU Util (%)") 50 | elif x == "io_read_speed": 51 | plt.ylabel("IO speed (MB/s)") 52 | 53 | def unaware(x:str, ax=None): 54 | plt.plot(Tiresias_L["time"], Tiresias_L[x], '', label="Tiresias") 55 | plt.plot(Themis["time"], Themis[x], '', label="Themis") 56 | plt.plot(Muri_L["time"], Muri_L[x], '', color = 'green',label="Muri-L") 57 | 58 | if x == "queue_length": 59 | plt.ylabel("Queue Length") 60 | elif x == "blocking_index": 61 | plt.ylabel("Blocking Index") 62 | elif x == "gpu_util": 63 | plt.ylabel("GPU Util (%)") 64 | elif x == "cpu_util": 65 | plt.ylabel("CPU Util (%)") 66 | elif x == "io_read_speed": 67 | plt.ylabel("IO speed (MB/s)") 68 | # plt.yaxis.set_ticks_position('left') 69 | 70 | # ax.spines['top'].set_color('none') 71 | # ax.spines['right'].set_color('none') 72 | 73 | # plt.savefig("unaware_" + x + ".pdf", bbox_inches='tight') 74 | 75 | def draw_all(x:list): 76 | plt.clf() 77 | plt.rc('font',**{'size': 36, 'family': 'Arial' }) 78 | plt.rc('pdf',fonttype = 42) 79 | fig=plt.figure(figsize=(8, 25)) 80 | plt.subplot(5,1,1) 81 | aware(x[0]) 82 | plt.legend(loc='upper right', fontsize=24) 83 | ax1 = plt.subplot(5,1,2) 84 | aware(x[1], ax1) 85 | plt.subplot(5,1,3) 86 | aware(x[2]) 87 | plt.subplot(5,1,4) 88 | aware(x[3]) 89 | plt.subplot(5,1,5) 90 | aware(x[4]) 91 | plt.xlabel('Time (h)') 92 | fig.align_labels() 93 | plt.savefig("Figure8a.pdf", bbox_inches='tight') 94 | 95 | plt.clf() 96 | plt.rc('font',**{'size': 36, 'family': 'Arial' }) 97 | plt.rc('pdf',fonttype = 42) 98 | fig=plt.figure(figsize=(8, 25)) 99 | plt.subplot(5,1,1) 100 | unaware(x[0]) 101 | plt.legend(loc='upper right', fontsize=24) 102 | ax1 = plt.subplot(5,1,2) 103 | unaware(x[1], ax1) 104 | plt.subplot(5,1,3) 105 | unaware(x[2]) 106 | plt.subplot(5,1,4) 107 | unaware(x[3]) 108 | plt.subplot(5,1,5) 109 | unaware(x[4]) 110 | 
class _FlagValues(object):
    """Global container and accessor for flags and their values.

    Parsed flag values live in self.__dict__['__flags']; attribute access
    is intercepted so that FLAGS.foo returns the parsed value of --foo.
    Writes go through self.__dict__ directly to avoid recursing through
    the overridden __setattr__/__getattr__.
    """

    def __init__(self):
        # Parsed flag values, keyed by flag name.
        self.__dict__['__flags'] = {}
        # Whether the command line has been parsed yet (parsing is lazy:
        # it happens on first attribute access or assignment).
        self.__dict__['__parsed'] = False

    def _parse_flags(self, args=None):
        # Parse known flags from *args* (or sys.argv when None) via the
        # module-level argparse parser; unknown arguments are returned
        # rather than treated as an error.
        result, unparsed = _global_parser.parse_known_args(args=args)
        for flag_name, val in vars(result).items():
            self.__dict__['__flags'][flag_name] = val
        self.__dict__['__parsed'] = True
        return unparsed

    def __getattr__(self, name):
        """Retrieves the 'value' attribute of the flag --name."""
        try:
            parsed = self.__dict__['__parsed']
        except KeyError:
            # May happen during pickle.load or copy.copy
            raise AttributeError(name)
        if not parsed:
            # First access triggers command-line parsing.
            self._parse_flags()
        if name not in self.__dict__['__flags']:
            raise AttributeError(name)
        return self.__dict__['__flags'][name]

    def __setattr__(self, name, value):
        """Sets the 'value' attribute of the flag --name."""
        if not self.__dict__['__parsed']:
            # Parse first so an explicit assignment overrides the CLI value.
            self._parse_flags()
        self.__dict__['__flags'][name] = value
def DEFINE_boolean(flag_name, default_value, docstring):
    """Defines a flag of type 'boolean'.

    argparse has no native bool parsing, so a small converter maps the
    usual textual spellings ("true"/"t"/"1", case-insensitive) to True and
    everything else to False; a matching --no<flag_name> switch forces
    the value to False.

    Args:
      flag_name: The name of the flag as a string.
      default_value: The default value the flag should take as a boolean.
      docstring: A helpful message explaining the use of the flag.
    """
    def _parse_bool(text):
        # Custom 'bool' conversion so --flag=True works on the CLI.
        return text.lower() in ('true', 't', '1')

    _global_parser.add_argument(
        '--' + flag_name,
        type=_parse_bool,
        nargs='?',
        const=True,            # a bare "--flag" with no value means True
        default=default_value,
        help=docstring)

    # Negated companion switch; argparse turns dashes in flag names into
    # underscores for destinations, so mirror that mapping explicitly.
    _global_parser.add_argument(
        '--no' + flag_name,
        action='store_false',
        dest=flag_name.replace('-', '_'))
109 | DEFINE_bool = DEFINE_boolean # pylint: disable=invalid-name 110 | 111 | 112 | def DEFINE_float(flag_name, default_value, docstring): 113 | """Defines a flag of type 'float'. 114 | 115 | Args: 116 | flag_name: The name of the flag as a string. 117 | default_value: The default value the flag should take as a float. 118 | docstring: A helpful message explaining the use of the flag. 119 | """ 120 | _define_helper(flag_name, default_value, docstring, float) 121 | 122 | 123 | def DEFINE_version(v_string): 124 | _global_parser.add_argument("-v", "--version", action='version', version='%(prog)s ' + v_string, dest='version', 125 | help="display version information") 126 | _allowed_symbols = [ 127 | # We rely on gflags documentation. 128 | 'DEFINE_bool', 129 | 'DEFINE_boolean', 130 | 'DEFINE_float', 131 | 'DEFINE_integer', 132 | 'DEFINE_string', 133 | 'DEFINE_version', 134 | 'FLAGS', 135 | ] 136 | # remove_undocumented(__name__, _allowed_symbols) -------------------------------------------------------------------------------- /cluster_exp/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import random 6 | import utils 7 | 8 | m_tensors = [[1.1,2.3,2.3,2.3,4.5,9.0,9.0,9.0,9.0,9.0,9.0,9.0,392.0,64.0,15.6], 9 | [1.1,2.3,2.3,4.5,9.0,9.0,9.0,9.0,9.0,392.0,64.0,15.6], 10 | [1.1,2.3,4.5,9.0,9.0,9.0,392.0,64.0,15.6], 11 | [1.2,2.5,5.1,3.4,144.0,64.0,15.6], 12 | [2.0,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,8.0,2.0,9.0,4.0,4.0,9.0,4.0,4.0,9.0,4.0,7.8], 13 | [2.0,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,8.0,2.0,9.0,4.0,4.0,9.0,4.0,4.0,9.0,4.0,7.8], 14 | [2.0,2.3,2.3,2.3,2.3,2.3,2.3,8.0,2.0,9.0,4.0,4.0,9.0,4.0,4.0,9.0,4.0,7.8], 15 | 
def get_model(model_name):
    '''
    get model tensor information by model_name
    return a dict{name, tensors(list)}
    '''
    # m_names is ordered to match m_tensors/m_mem, so a positional lookup
    # is equivalent to the original if/elif ladder over model names.
    try:
        m_idx = m_names.index(model_name)
    except ValueError:
        # Unknown model: fall back to inception3 (index 8), as before.
        # m_idx = random.randint(0,8)
        m_idx = 8
        utils.print_fn('No model match, pick %s' % m_names[m_idx])

    return {
        'name': m_names[m_idx],
        'ind': m_idx,
        'tensors': m_tensors[m_idx],
        'mem_util': m_mem[m_idx],
    }
round(total_size, 1) #float x.x 72 | return ret 73 | 74 | 75 | 76 | # if __name__ == '__main__': 77 | # # print('Hello world %d' % 2) 78 | # print(get_model_with_scale('vgg11', 2)) -------------------------------------------------------------------------------- /cluster_exp/prepare_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Note: Due to the scripts are highly related to intracompany platform, 3 | #we only demonstrate the functionality and show the pseudocode of the 4 | #related scripts (e.g., run.sh, prepare_env.sh). Please adjust to your 5 | #platform if you would like to execute the testbed experiment. 6 | 7 | FA_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 8 | THIS_DIR=$FA_DIR/workloads 9 | 10 | # Prepare datasets and make sure that the used nodes can connect with 11 | # each other. 12 | 13 | 14 | # set worker ip and port 15 | SCHEDULER_IP=$1 16 | shift 17 | WORKER_PORT=$1 18 | shift 19 | TRAINER_PORT=$1 20 | shift 21 | WORKER_ID=$1 22 | shift 23 | 24 | mkdir $THIS_DIR/hostfiles 25 | hostfile=$THIS_DIR/hostfiles/hostfile-[0-0-0-0]-[0-0-0-0] 26 | rm -f $hostfile 27 | echo "worker-${WORKER_ID}" >>${hostfile} 28 | 29 | CUDA_VISIBLE_DEVICES=0 bash $THIS_DIR/run_preenv.sh gpt2 4 0 2 -1 10 0 0 gpt2 4 0 2 -1 0 0 0 gpt2 4 0 2 -1 0 0 0 gpt2 4 0 2 -1 0 0 0 1 --scheduler-ip $SCHEDULER_IP --trainer-port $TRAINER_PORT 30 | CUDA_VISIBLE_DEVICES=0 bash $THIS_DIR/run_preenv.sh bert 4 0 2 -1 10 0 0 gpt2 4 0 2 -1 0 0 0 gpt2 4 0 2 -1 0 0 0 gpt2 4 0 2 -1 0 0 0 1 --scheduler-ip $SCHEDULER_IP --trainer-port $TRAINER_PORT 31 | -------------------------------------------------------------------------------- /cluster_exp/results/Muri-L/result.out: -------------------------------------------------------------------------------- 1 | Total jobs: 400, avg JCT: 129744.660102, makespan: 2666872.437762, 99th JCT: 1106155.678542 2 | 
#!/bin/bash
# Note: these scripts are tightly coupled to an intracompany platform, so we
# only demonstrate the functionality and show the pseudocode of the
# related scripts (e.g., run.sh, prepare_env.sh). Please adjust to your
# platform if you would like to execute the testbed experiment.

THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# set the worker ip and port
SCHEDULER_IP=$1
shift
WORKER_PORT=$1
shift
TRAINER_PORT=$1
shift
WORKER_ID=$1
shift

# prepare environment before start, takes several minutes
cd $THIS_DIR
bash $THIS_DIR/prepare_env.sh $SCHEDULER_IP $WORKER_PORT $TRAINER_PORT $WORKER_ID

# set the scheduling policy and related parameters
placement=('yarn')
export schedules_all=$1
shift
# BUG FIX: bash assignments must not have spaces around '='. The original
# "jobs = ('cluster_trace')" tried to run a command named 'jobs', failed,
# and left the array empty, so every loop below silently ran zero times.
jobs=('cluster_trace')
setups=("n8g8")
packing_nums=("4")
schedule_intervals=("360")
fast_forwards=("60")

# schedules_all is a comma-separated list of policies; split into an array.
IFS=','
read -ra schedule <<<"$schedules_all"

# -p: tolerate re-runs where the directories already exist.
mkdir -p $THIS_DIR/results
for setup in ${setups[@]};do
    cluster_spec="cluster_specs/${setup}.csv"
    for job in ${jobs[@]};do
        job_file="trace-data/${job}.csv"
        for packing_num in ${packing_nums[@]};do
            for schedule_interval in ${schedule_intervals[@]};do
                for fast_forward in ${fast_forwards[@]};do
                    trace_name="${setup}j${job}p${packing_num}si${schedule_interval}ff${fast_forward}"
                    log_folder="results/${trace_name}"
                    mkdir -p $THIS_DIR/${log_folder}
                    for p in ${placement[@]};do
                        for s in ${schedule[@]};do
                            log_name="${log_folder}/${s}-${p}-${packing_num}"
                            mkdir -p $THIS_DIR/$log_name
                            job_log="$THIS_DIR/job_logs/${trace_name}/${s}-${p}-${packing_num}"
                            rm -rf $job_log
                            echo "running..." $setup $job $s
                            if [ $WORKER_ID -eq 1 ]; then
                                # start scheduler for the main node
                                python $THIS_DIR/run.py --cluster_spec=$THIS_DIR/${cluster_spec} --print --scheme=${p} --trace_file=$THIS_DIR/${job_file} --schedule=${s} --log_path=$THIS_DIR/${log_name} --packing_num ${packing_num} --schedule_interval ${schedule_interval} --fast_forwarding ${fast_forward} >$THIS_DIR/${log_name}/scheduler.out &
                                sleep 10s
                            else
                                # non-main nodes wait for the scheduler to come up
                                sleep 6m
                            fi

                            # start worker for all nodes
                            # NOTE(review): $arg is never set in this script —
                            # presumably injected via the environment; confirm.
                            python $THIS_DIR/worker.py --master_ip $SCHEDULER_IP --worker_port $WORKER_PORT --trace_name ${job_log} --this-dir ${THIS_DIR} $arg >$THIS_DIR/${log_name}/worker.out &

                            wait

                            # get the results after execution
                            echo "calcing..." $setup $job $s
                            if [ $WORKER_ID -eq 1 ]; then
                                python $THIS_DIR/calc.py $THIS_DIR/${log_name} >$THIS_DIR/${log_name}/result.out
                            else
                                sleep 2m
                            fi
                        done
                    done
                done
            done
        done
    done
done
// RPC surface the scheduler uses to poll a running trainer for progress.
syntax = "proto3";

service SchedulerToTrainer {
    // Ask the trainer how many training iterations it has completed.
    rpc QueryStats (QueryStatsRequest) returns (QueryStatsResponse) {};
}

// Intentionally empty: the progress query carries no arguments.
message QueryStatsRequest {

}

message QueryStatsResponse {
    // Count of iterations finished since the trainer started.
    uint32 finished_iterations = 1;
}
class MasterClientForWorker(object):
    """gRPC client the controller (master) uses to drive one worker node.

    Wraps an insecure channel to the worker's MasterToWorker service and
    exposes execute / kill / exit / utilization-query helpers.
    """

    def __init__(self, logger: Logger, worker_id, worker_ip, worker_port) -> None:
        super().__init__()

        self._worker_id = worker_id
        self._worker_ip = worker_ip
        self._worker_port = worker_port
        # Dial the worker eagerly; grpc channels connect lazily on first RPC.
        channel = grpc.insecure_channel(self.addr)
        self._stub = m2w_rpc.MasterToWorkerStub(channel)
        self._logger = logger


    @property
    def addr(self):
        """'ip:port' endpoint of the target worker."""
        return f'{self._worker_ip}:{self._worker_port}'


    @staticmethod
    def _fill_job_info(request, job_info):
        """Copy `job_info` fields into an Execute/Kill request protobuf.

        Extracted helper: `execute` and `kill` previously duplicated these
        nine assignments verbatim.
        """
        request.job_info.num = int(job_info.num)
        request.job_info.node_id.extend(list(job_info.node_id))
        request.job_info.job_id.extend(list(job_info.job_id))
        request.job_info.job_name.extend(list(job_info.job_name))
        request.job_info.batch_size.extend(list(job_info.batch_size))
        request.job_info.iterations.extend(list(job_info.iterations))
        request.job_info.gpus = job_info.gpus
        request.job_info.job_counter.extend(list(job_info.job_counter))
        request.job_info.num_gpu = int(job_info.num_gpu)


    def execute(self, job_info):
        """Ask the worker to launch the packed job(s) described by `job_info`."""
        self._logger.info(f'controller, execute, {job_info.job_id} - {job_info.job_counter} @ {self._worker_id}-{job_info.node_id}, {job_info.gpus}')
        request = ExecuteRequest()
        self._fill_job_info(request, job_info)
        response = self._stub.Execute(request)
        assert response.success == True


    def kill(self, job_info):
        """Ask the worker to kill the packed job(s) described by `job_info`."""
        self._logger.info(f'controller, kill, {job_info.job_id} - {job_info.job_counter} @ {self._worker_id}-{job_info.node_id}, {job_info.gpus}')
        request = KillRequest()
        self._fill_job_info(request, job_info)
        response = self._stub.Kill(request)
        assert response.success == True

    def exit_command(self):
        """Tell the worker process to shut itself down."""
        self._logger.info(f'controller ask worker {self._worker_id} to exit')
        request = ExitCommandRequest()
        response = self._stub.ExitCommand(request)
        assert response.success == True

    def get_util(self, secs):
        """Return (gpu_util, cpu_util, io_read) sampled over `secs` seconds."""
        request = GetUtilRequest(secs=secs)
        response = self._stub.GetUtil(request)
        return response.gpu_util, response.cpu_util, response.io_read
class SchedulerClientForTrainer(object):
    """gRPC client the scheduler uses to poll a single trainer.

    Holds an insecure channel to the trainer's SchedulerToTrainer service and
    exposes a single progress query.
    """

    def __init__(self, logger : Logger, job_id, trainer_ip, trainer_port) -> None:
        super().__init__()

        self._logger = logger
        self._job_id = job_id
        self._trainer_ip = trainer_ip
        self._trainer_port = trainer_port
        # Dial the trainer at construction time.
        self._stub = s2t_rpc.SchedulerToTrainerStub(grpc.insecure_channel(self.addr))


    @property
    def addr(self):
        """'ip:port' endpoint of the target trainer."""
        return f'{self._trainer_ip}:{self._trainer_port}'


    def query_stats(self):
        """Return the number of iterations the trainer has finished so far."""
        self._logger.info(f'scheduler, query, job {self._job_id}')
        reply = self._stub.QueryStats(QueryStatsRequest())
        return reply.finished_iterations
class TrainerClientForScheduler(object):
    """gRPC client a trainer process uses to talk to the central scheduler."""

    def __init__(self, logger, scheduler_ip, scheduler_port) -> None:
        super().__init__()

        self._logger = logger

        self._scheduler_ip = scheduler_ip
        self._scheduler_port = scheduler_port
        self._logger.info(f'{self.addr}')
        channel = grpc.insecure_channel(self.addr)
        self._stub = t2s_rpc.TrainerToSchedulerStub(channel)


    @property
    def addr(self):
        """'ip:port' endpoint of the scheduler."""
        return f'{self._scheduler_ip}:{self._scheduler_port}'


    def register_trainer(self, trainer_ip, trainer_port, job_id):
        """Register this trainer (its callback endpoint and job ids) with the
        scheduler. Returns a bool: True on success, False on failure."""
        request = RegisterTrainerRequest(trainer_ip=trainer_ip, trainer_port=trainer_port, job_id=job_id)
        # self._logger.info(f'job {job_id} {request}')
        try:
            response = self._stub.RegisterTrainer(request)
            self._logger.info(f'job {job_id}, register, {response.success}')
            return response.success
        except Exception as e:
            self._logger.info(f'job {job_id}, register, fail, {e}')
            # BUG FIX: previously returned the tuple (False, None), which is
            # truthy, so any caller doing `if success:` treated a failed
            # registration as a success. Return a plain bool like the
            # success path does.
            return False

    def report_itertime(self, job_id, iter_time, src_utils):
        """Report per-job iteration times and resource utilizations; returns
        the scheduler's (asserted-true) acknowledgement."""
        request = ReportIterTimeRequest()
        request.job_id.extend(job_id)
        request.iter_time.extend(iter_time)
        request.src_utils.extend(src_utils)
        response = self._stub.ReportIterTime(request)
        assert response.success == True
        return response.success
class TrainerServerForScheduler(SchedulerToTrainerServicer):
    """Serves SchedulerToTrainer RPCs on behalf of a trainer process.

    All real work is delegated to the callback table supplied by the trainer;
    this class only does request/response plumbing.
    """

    def __init__(self, logger, callbacks) -> None:
        super().__init__()
        self._logger = logger
        self._callbacks = callbacks


    def QueryStats(self, request, context):
        """Report how many iterations this trainer has completed."""
        assert 'QueryStats' in self._callbacks
        handler = self._callbacks['QueryStats']
        return QueryStatsResponse(finished_iterations=handler())


def serve(port, logger, callbacks):
    """Run the trainer-side gRPC server on `port`; blocks until terminated."""
    grpc_server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    servicer = TrainerServerForScheduler(logger, callbacks)
    s2t_rpc.add_SchedulerToTrainerServicer_to_server(servicer, grpc_server)
    grpc_server.add_insecure_port(f'[::]:{port}')
    grpc_server.start()

    logger.info(f'trainer, rpc, start, server @ {port}')

    grpc_server.wait_for_termination()
class WorkerServerForMaster(MasterToWorkerServicer):
    """Serves MasterToWorker RPCs on a worker node.

    Every RPC is forwarded to a callback registered by the worker process;
    this class only handles request/response plumbing.
    """

    def __init__(self, logger, callbacks) -> None:
        super().__init__()
        self._logger = logger
        self._callbacks = callbacks


    def Execute(self, request, context):
        """Launch the packed job(s) described in request.job_info."""
        assert 'Execute' in self._callbacks
        handler = self._callbacks['Execute']
        return ExecuteResponse(success=handler(request.job_info))


    def Kill(self, request, context):
        """Kill the packed job(s) described in request.job_info."""
        assert 'Kill' in self._callbacks
        handler = self._callbacks['Kill']
        return KillResponse(success=handler(request.job_info))

    def ExitCommand(self, request, context):
        """Tell this worker process to shut down."""
        assert 'ExitCommand' in self._callbacks
        handler = self._callbacks['ExitCommand']
        return ExitCommandResponse(success=handler())

    def GetUtil(self, request, context):
        """Sample GPU/CPU/disk-read utilization over request.secs seconds."""
        assert 'GetUtil' in self._callbacks
        handler = self._callbacks['GetUtil']
        gpu, cpu, io_read = handler(request.secs)
        return GetUtilResponse(gpu_util=gpu, cpu_util=cpu, io_read=io_read)


def serve(port, logger, callbacks):
    """Run the worker-side gRPC server on `port`; blocks until terminated."""
    grpc_server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    m2w_rpc.add_MasterToWorkerServicer_to_server(WorkerServerForMaster(logger, callbacks), grpc_server)
    grpc_server.add_insecure_port(f'[::]:{port}')
    grpc_server.start()

    logger.info(f'worker, rpc, start, server @ {port}')

    grpc_server.wait_for_termination()
3 | # source: scheduler_to_trainer.proto 4 | """Generated protocol buffer code.""" 5 | from google.protobuf import descriptor as _descriptor 6 | from google.protobuf import message as _message 7 | from google.protobuf import reflection as _reflection 8 | from google.protobuf import symbol_database as _symbol_database 9 | # @@protoc_insertion_point(imports) 10 | 11 | _sym_db = _symbol_database.Default() 12 | 13 | 14 | 15 | 16 | DESCRIPTOR = _descriptor.FileDescriptor( 17 | name='scheduler_to_trainer.proto', 18 | package='', 19 | syntax='proto3', 20 | serialized_options=None, 21 | create_key=_descriptor._internal_create_key, 22 | serialized_pb=b'\n\x1ascheduler_to_trainer.proto\"\x13\n\x11QueryStatsRequest\"1\n\x12QueryStatsResponse\x12\x1b\n\x13\x66inished_iterations\x18\x01 \x01(\r2M\n\x12SchedulerToTrainer\x12\x37\n\nQueryStats\x12\x12.QueryStatsRequest\x1a\x13.QueryStatsResponse\"\x00\x62\x06proto3' 23 | ) 24 | 25 | 26 | 27 | 28 | _QUERYSTATSREQUEST = _descriptor.Descriptor( 29 | name='QueryStatsRequest', 30 | full_name='QueryStatsRequest', 31 | filename=None, 32 | file=DESCRIPTOR, 33 | containing_type=None, 34 | create_key=_descriptor._internal_create_key, 35 | fields=[ 36 | ], 37 | extensions=[ 38 | ], 39 | nested_types=[], 40 | enum_types=[ 41 | ], 42 | serialized_options=None, 43 | is_extendable=False, 44 | syntax='proto3', 45 | extension_ranges=[], 46 | oneofs=[ 47 | ], 48 | serialized_start=30, 49 | serialized_end=49, 50 | ) 51 | 52 | 53 | _QUERYSTATSRESPONSE = _descriptor.Descriptor( 54 | name='QueryStatsResponse', 55 | full_name='QueryStatsResponse', 56 | filename=None, 57 | file=DESCRIPTOR, 58 | containing_type=None, 59 | create_key=_descriptor._internal_create_key, 60 | fields=[ 61 | _descriptor.FieldDescriptor( 62 | name='finished_iterations', full_name='QueryStatsResponse.finished_iterations', index=0, 63 | number=1, type=13, cpp_type=3, label=1, 64 | has_default_value=False, default_value=0, 65 | message_type=None, enum_type=None, 
containing_type=None, 66 | is_extension=False, extension_scope=None, 67 | serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), 68 | ], 69 | extensions=[ 70 | ], 71 | nested_types=[], 72 | enum_types=[ 73 | ], 74 | serialized_options=None, 75 | is_extendable=False, 76 | syntax='proto3', 77 | extension_ranges=[], 78 | oneofs=[ 79 | ], 80 | serialized_start=51, 81 | serialized_end=100, 82 | ) 83 | 84 | DESCRIPTOR.message_types_by_name['QueryStatsRequest'] = _QUERYSTATSREQUEST 85 | DESCRIPTOR.message_types_by_name['QueryStatsResponse'] = _QUERYSTATSRESPONSE 86 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 87 | 88 | QueryStatsRequest = _reflection.GeneratedProtocolMessageType('QueryStatsRequest', (_message.Message,), { 89 | 'DESCRIPTOR' : _QUERYSTATSREQUEST, 90 | '__module__' : 'scheduler_to_trainer_pb2' 91 | # @@protoc_insertion_point(class_scope:QueryStatsRequest) 92 | }) 93 | _sym_db.RegisterMessage(QueryStatsRequest) 94 | 95 | QueryStatsResponse = _reflection.GeneratedProtocolMessageType('QueryStatsResponse', (_message.Message,), { 96 | 'DESCRIPTOR' : _QUERYSTATSRESPONSE, 97 | '__module__' : 'scheduler_to_trainer_pb2' 98 | # @@protoc_insertion_point(class_scope:QueryStatsResponse) 99 | }) 100 | _sym_db.RegisterMessage(QueryStatsResponse) 101 | 102 | 103 | 104 | _SCHEDULERTOTRAINER = _descriptor.ServiceDescriptor( 105 | name='SchedulerToTrainer', 106 | full_name='SchedulerToTrainer', 107 | file=DESCRIPTOR, 108 | index=0, 109 | serialized_options=None, 110 | create_key=_descriptor._internal_create_key, 111 | serialized_start=102, 112 | serialized_end=179, 113 | methods=[ 114 | _descriptor.MethodDescriptor( 115 | name='QueryStats', 116 | full_name='SchedulerToTrainer.QueryStats', 117 | index=0, 118 | containing_service=None, 119 | input_type=_QUERYSTATSREQUEST, 120 | output_type=_QUERYSTATSRESPONSE, 121 | serialized_options=None, 122 | create_key=_descriptor._internal_create_key, 123 | ), 124 | ]) 125 | 
_sym_db.RegisterServiceDescriptor(_SCHEDULERTOTRAINER) 126 | 127 | DESCRIPTOR.services_by_name['SchedulerToTrainer'] = _SCHEDULERTOTRAINER 128 | 129 | # @@protoc_insertion_point(module_scope) 130 | -------------------------------------------------------------------------------- /cluster_exp/runtime/rpc_stubs/scheduler_to_trainer_pb2_grpc.py: -------------------------------------------------------------------------------- 1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 2 | """Client and server classes corresponding to protobuf-defined services.""" 3 | import grpc 4 | 5 | import scheduler_to_trainer_pb2 as scheduler__to__trainer__pb2 6 | 7 | 8 | class SchedulerToTrainerStub(object): 9 | """Missing associated documentation comment in .proto file.""" 10 | 11 | def __init__(self, channel): 12 | """Constructor. 13 | 14 | Args: 15 | channel: A grpc.Channel. 16 | """ 17 | self.QueryStats = channel.unary_unary( 18 | '/SchedulerToTrainer/QueryStats', 19 | request_serializer=scheduler__to__trainer__pb2.QueryStatsRequest.SerializeToString, 20 | response_deserializer=scheduler__to__trainer__pb2.QueryStatsResponse.FromString, 21 | ) 22 | 23 | 24 | class SchedulerToTrainerServicer(object): 25 | """Missing associated documentation comment in .proto file.""" 26 | 27 | def QueryStats(self, request, context): 28 | """Missing associated documentation comment in .proto file.""" 29 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 30 | context.set_details('Method not implemented!') 31 | raise NotImplementedError('Method not implemented!') 32 | 33 | 34 | def add_SchedulerToTrainerServicer_to_server(servicer, server): 35 | rpc_method_handlers = { 36 | 'QueryStats': grpc.unary_unary_rpc_method_handler( 37 | servicer.QueryStats, 38 | request_deserializer=scheduler__to__trainer__pb2.QueryStatsRequest.FromString, 39 | response_serializer=scheduler__to__trainer__pb2.QueryStatsResponse.SerializeToString, 40 | ), 41 | } 42 | generic_handler = 
grpc.method_handlers_generic_handler( 43 | 'SchedulerToTrainer', rpc_method_handlers) 44 | server.add_generic_rpc_handlers((generic_handler,)) 45 | 46 | 47 | # This class is part of an EXPERIMENTAL API. 48 | class SchedulerToTrainer(object): 49 | """Missing associated documentation comment in .proto file.""" 50 | 51 | @staticmethod 52 | def QueryStats(request, 53 | target, 54 | options=(), 55 | channel_credentials=None, 56 | call_credentials=None, 57 | insecure=False, 58 | compression=None, 59 | wait_for_ready=None, 60 | timeout=None, 61 | metadata=None): 62 | return grpc.experimental.unary_unary(request, target, '/SchedulerToTrainer/QueryStats', 63 | scheduler__to__trainer__pb2.QueryStatsRequest.SerializeToString, 64 | scheduler__to__trainer__pb2.QueryStatsResponse.FromString, 65 | options, channel_credentials, 66 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata) 67 | -------------------------------------------------------------------------------- /cluster_exp/runtime/rpc_stubs/trainer_to_scheduler_pb2_grpc.py: -------------------------------------------------------------------------------- 1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 2 | """Client and server classes corresponding to protobuf-defined services.""" 3 | import grpc 4 | 5 | import trainer_to_scheduler_pb2 as trainer__to__scheduler__pb2 6 | 7 | 8 | class TrainerToSchedulerStub(object): 9 | """Missing associated documentation comment in .proto file.""" 10 | 11 | def __init__(self, channel): 12 | """Constructor. 13 | 14 | Args: 15 | channel: A grpc.Channel. 
16 | """ 17 | self.RegisterTrainer = channel.unary_unary( 18 | '/TrainerToScheduler/RegisterTrainer', 19 | request_serializer=trainer__to__scheduler__pb2.RegisterTrainerRequest.SerializeToString, 20 | response_deserializer=trainer__to__scheduler__pb2.RegisterTrainerResponse.FromString, 21 | ) 22 | self.ReportIterTime = channel.unary_unary( 23 | '/TrainerToScheduler/ReportIterTime', 24 | request_serializer=trainer__to__scheduler__pb2.ReportIterTimeRequest.SerializeToString, 25 | response_deserializer=trainer__to__scheduler__pb2.ReportIterTimeResponse.FromString, 26 | ) 27 | 28 | 29 | class TrainerToSchedulerServicer(object): 30 | """Missing associated documentation comment in .proto file.""" 31 | 32 | def RegisterTrainer(self, request, context): 33 | """Missing associated documentation comment in .proto file.""" 34 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 35 | context.set_details('Method not implemented!') 36 | raise NotImplementedError('Method not implemented!') 37 | 38 | def ReportIterTime(self, request, context): 39 | """Missing associated documentation comment in .proto file.""" 40 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 41 | context.set_details('Method not implemented!') 42 | raise NotImplementedError('Method not implemented!') 43 | 44 | 45 | def add_TrainerToSchedulerServicer_to_server(servicer, server): 46 | rpc_method_handlers = { 47 | 'RegisterTrainer': grpc.unary_unary_rpc_method_handler( 48 | servicer.RegisterTrainer, 49 | request_deserializer=trainer__to__scheduler__pb2.RegisterTrainerRequest.FromString, 50 | response_serializer=trainer__to__scheduler__pb2.RegisterTrainerResponse.SerializeToString, 51 | ), 52 | 'ReportIterTime': grpc.unary_unary_rpc_method_handler( 53 | servicer.ReportIterTime, 54 | request_deserializer=trainer__to__scheduler__pb2.ReportIterTimeRequest.FromString, 55 | response_serializer=trainer__to__scheduler__pb2.ReportIterTimeResponse.SerializeToString, 56 | ), 57 | } 58 | generic_handler = 
grpc.method_handlers_generic_handler( 59 | 'TrainerToScheduler', rpc_method_handlers) 60 | server.add_generic_rpc_handlers((generic_handler,)) 61 | 62 | 63 | # This class is part of an EXPERIMENTAL API. 64 | class TrainerToScheduler(object): 65 | """Missing associated documentation comment in .proto file.""" 66 | 67 | @staticmethod 68 | def RegisterTrainer(request, 69 | target, 70 | options=(), 71 | channel_credentials=None, 72 | call_credentials=None, 73 | insecure=False, 74 | compression=None, 75 | wait_for_ready=None, 76 | timeout=None, 77 | metadata=None): 78 | return grpc.experimental.unary_unary(request, target, '/TrainerToScheduler/RegisterTrainer', 79 | trainer__to__scheduler__pb2.RegisterTrainerRequest.SerializeToString, 80 | trainer__to__scheduler__pb2.RegisterTrainerResponse.FromString, 81 | options, channel_credentials, 82 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata) 83 | 84 | @staticmethod 85 | def ReportIterTime(request, 86 | target, 87 | options=(), 88 | channel_credentials=None, 89 | call_credentials=None, 90 | insecure=False, 91 | compression=None, 92 | wait_for_ready=None, 93 | timeout=None, 94 | metadata=None): 95 | return grpc.experimental.unary_unary(request, target, '/TrainerToScheduler/ReportIterTime', 96 | trainer__to__scheduler__pb2.ReportIterTimeRequest.SerializeToString, 97 | trainer__to__scheduler__pb2.ReportIterTimeResponse.FromString, 98 | options, channel_credentials, 99 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata) 100 | -------------------------------------------------------------------------------- /cluster_exp/runtime/rpc_stubs/worker_to_master_pb2_grpc.py: -------------------------------------------------------------------------------- 1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 
2 | """Client and server classes corresponding to protobuf-defined services.""" 3 | import grpc 4 | 5 | import worker_to_master_pb2 as worker__to__master__pb2 6 | 7 | 8 | class WorkerToMasterStub(object): 9 | """Missing associated documentation comment in .proto file.""" 10 | 11 | def __init__(self, channel): 12 | """Constructor. 13 | 14 | Args: 15 | channel: A grpc.Channel. 16 | """ 17 | self.RegisterWorker = channel.unary_unary( 18 | '/WorkerToMaster/RegisterWorker', 19 | request_serializer=worker__to__master__pb2.RegisterWorkerRequest.SerializeToString, 20 | response_deserializer=worker__to__master__pb2.RegisterWorkerResponse.FromString, 21 | ) 22 | self.Done = channel.unary_unary( 23 | '/WorkerToMaster/Done', 24 | request_serializer=worker__to__master__pb2.DoneRequest.SerializeToString, 25 | response_deserializer=worker__to__master__pb2.DoneResponse.FromString, 26 | ) 27 | 28 | 29 | class WorkerToMasterServicer(object): 30 | """Missing associated documentation comment in .proto file.""" 31 | 32 | def RegisterWorker(self, request, context): 33 | """Missing associated documentation comment in .proto file.""" 34 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 35 | context.set_details('Method not implemented!') 36 | raise NotImplementedError('Method not implemented!') 37 | 38 | def Done(self, request, context): 39 | """Missing associated documentation comment in .proto file.""" 40 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 41 | context.set_details('Method not implemented!') 42 | raise NotImplementedError('Method not implemented!') 43 | 44 | 45 | def add_WorkerToMasterServicer_to_server(servicer, server): 46 | rpc_method_handlers = { 47 | 'RegisterWorker': grpc.unary_unary_rpc_method_handler( 48 | servicer.RegisterWorker, 49 | request_deserializer=worker__to__master__pb2.RegisterWorkerRequest.FromString, 50 | response_serializer=worker__to__master__pb2.RegisterWorkerResponse.SerializeToString, 51 | ), 52 | 'Done': grpc.unary_unary_rpc_method_handler( 53 | 
servicer.Done, 54 | request_deserializer=worker__to__master__pb2.DoneRequest.FromString, 55 | response_serializer=worker__to__master__pb2.DoneResponse.SerializeToString, 56 | ), 57 | } 58 | generic_handler = grpc.method_handlers_generic_handler( 59 | 'WorkerToMaster', rpc_method_handlers) 60 | server.add_generic_rpc_handlers((generic_handler,)) 61 | 62 | 63 | # This class is part of an EXPERIMENTAL API. 64 | class WorkerToMaster(object): 65 | """Missing associated documentation comment in .proto file.""" 66 | 67 | @staticmethod 68 | def RegisterWorker(request, 69 | target, 70 | options=(), 71 | channel_credentials=None, 72 | call_credentials=None, 73 | insecure=False, 74 | compression=None, 75 | wait_for_ready=None, 76 | timeout=None, 77 | metadata=None): 78 | return grpc.experimental.unary_unary(request, target, '/WorkerToMaster/RegisterWorker', 79 | worker__to__master__pb2.RegisterWorkerRequest.SerializeToString, 80 | worker__to__master__pb2.RegisterWorkerResponse.FromString, 81 | options, channel_credentials, 82 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata) 83 | 84 | @staticmethod 85 | def Done(request, 86 | target, 87 | options=(), 88 | channel_credentials=None, 89 | call_credentials=None, 90 | insecure=False, 91 | compression=None, 92 | wait_for_ready=None, 93 | timeout=None, 94 | metadata=None): 95 | return grpc.experimental.unary_unary(request, target, '/WorkerToMaster/Done', 96 | worker__to__master__pb2.DoneRequest.SerializeToString, 97 | worker__to__master__pb2.DoneResponse.FromString, 98 | options, channel_credentials, 99 | insecure, call_credentials, compression, wait_for_ready, timeout, metadata) 100 | -------------------------------------------------------------------------------- /cluster_exp/runtime/tests/localhost_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import socket 3 | import sys 4 | import os 5 | import time 6 | import threading 7 | 
sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) 8 | import worker 9 | import master 10 | 11 | LOCALHOST = "127.0.0.1" 12 | MASTER_PORT = 6888 13 | WORKER_PORT1 = 9000 14 | WORKER_PORT2 = 9001 15 | WORKER_PORT3 = 9002 16 | TRACE_FILE = "../../traces/fake.txt" 17 | 18 | class Worker2MasterTests(unittest.TestCase): 19 | # worker_client -> master_server 20 | def test_register_and_done(self): 21 | # start master server in the background at localhost:6888 22 | m = master.Master(MASTER_PORT) 23 | time.sleep(2) 24 | 25 | # initialize two worker client 26 | client0 = worker.Worker(LOCALHOST, MASTER_PORT, LOCALHOST, WORKER_PORT1, 2) 27 | client1 = worker.Worker(LOCALHOST, MASTER_PORT, LOCALHOST, WORKER_PORT2, 2) 28 | 29 | # register a worker with 2 GPUs 30 | ret = client0.register() 31 | # register successfully with worker_id 0 32 | self.assertEqual(0, ret) 33 | 34 | #register a worker with 5 GPUs 35 | ret = client1.register() 36 | # worker_id 1 37 | self.assertEqual(1, ret) 38 | 39 | #clinet 0 job 0 request for fast forward 40 | client0._worker_rpc_client.report_stable(0) 41 | m.fast_forward(1234) 42 | client1._worker_rpc_client.report_stable(0) 43 | 44 | # client0 has done job 7 45 | client0._worker_rpc_client.done(7) 46 | # client1 has done job 12 47 | client1._worker_rpc_client.done(12) 48 | # client0 has done job 999 49 | client0._worker_rpc_client.done(999) 50 | # client1 has done job 777 51 | client1._worker_rpc_client.done(777) 52 | 53 | class Master2WorkerTests(unittest.TestCase): 54 | def setUp(self): 55 | self.client0 = worker.Worker(LOCALHOST, MASTER_PORT, LOCALHOST, WORKER_PORT1, 2) 56 | self.client1 = worker.Worker(LOCALHOST, MASTER_PORT, LOCALHOST, WORKER_PORT2, 2) 57 | self.master = master.Master(MASTER_PORT) 58 | self.client0.register() 59 | self.client1.register() 60 | 61 | # master_client => worker_server 62 | def test_run_and_kill_and_update(self): 63 | job_description1 = ["fake", 0, LOCALHOST, 3] # start fake.py with job_id = 0 on 
localhost 64 | job_description2 = ["fake", 1, LOCALHOST, 1] # start fake.py with job_id = 1 on localhost 65 | job_description3 = ["fake", 2, LOCALHOST, 1] # start fake.py with job_id = 2 on localhost 66 | 67 | self.master._workers[0].caller.run_job(job_description1) 68 | self.master._workers[1].caller.run_job(job_description2) 69 | self.master._workers[1].caller.run_job(job_description3) 70 | time.sleep(5) 71 | job_description1[-1] = 8 72 | self.master._workers[0].caller.update_job(job_description1) 73 | time.sleep(5) 74 | self.master._workers[0].caller.kill_job(0) # kill job 0 75 | time.sleep(10) 76 | self.master._workers[1].caller.kill_job(1) # kill job 1 77 | 78 | def tearDown(self): 79 | # this will kill job 2 80 | self.master.shutdown() 81 | 82 | if __name__ == "__main__": 83 | unittest.main() -------------------------------------------------------------------------------- /cluster_exp/runtime/tests/worker_server_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import socket 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(__file__), '../rpc')) 6 | from worker_server import WorkerRpcServer 7 | from worker_client import WorkerRpcClient 8 | 9 | class TestWorkerRpcServer(unittest.TestCase): 10 | def test_fetch_gpu_list(self): 11 | workerSever = WorkerRpcServer(None) 12 | tests = [0b1, 0, 0b101, 0b1111] 13 | tests_sol = [[0], [], [0, 2], [0, 1, 2, 3]] 14 | for test, sol in zip(tests, tests_sol): 15 | self.assertEqual(workerSever._fetch_GPU_list(test), sol) 16 | 17 | 18 | if __name__ == "__main__": 19 | unittest.main() -------------------------------------------------------------------------------- /cluster_exp/scheduler.py: -------------------------------------------------------------------------------- 1 | from runtime.rpc import scheduler_server, scheduler_client 2 | from controller import Controller 3 | from cluster import CLUSTER 4 | 5 | import argparse 6 | import 
threading 7 | import utils 8 | import copy 9 | from jobs import JOBS 10 | 11 | 12 | class Scheduler(object): 13 | def __init__(self, scheduler_port: int, controller_port: int) -> None: 14 | super().__init__() 15 | 16 | self._logger = utils.make_logger(__name__) 17 | 18 | self._trainers = dict() 19 | self._server_for_trainer = self.make_server_for_trainer(scheduler_port) 20 | 21 | self._num_workers = CLUSTER.num_node_p_switch 22 | self._controller = Controller(controller_port, self._num_workers) 23 | self._src_num = 3 24 | self._src_utils = [0 for _ in range(self._src_num)] 25 | 26 | # self._start_time = self._controller.get_time() 27 | 28 | def get_time(self): 29 | return self._controller.get_time() 30 | 31 | def make_server_for_trainer(self, port): 32 | callbacks = { 33 | 'RegisterTrainer': self._register_trainer_impl, 34 | 'ReportIterTime': self._report_itertime_impl, 35 | } 36 | 37 | server_thread = threading.Thread( 38 | target=scheduler_server.serve, 39 | args=(port, self._logger, callbacks)) 40 | server_thread.setDaemon(True) 41 | server_thread.start() 42 | 43 | return server_thread 44 | 45 | 46 | def _register_trainer_impl(self, trainer_ip, trainer_port, job_id_list): 47 | success = True 48 | # self._logger.info(f'scheduler, before register, {job_id} {trainer_ip}:{trainer_port} {self._trainers.keys()}') 49 | job_id = max(job_id_list) 50 | # assert job_id not in self._trainers 51 | tmp_client = scheduler_client.SchedulerClientForTrainer(self._logger, job_id_list, trainer_ip, trainer_port) 52 | self._trainers[job_id] = tmp_client 53 | self._logger.info(f'scheduler, register, {job_id}-{job_id_list}, {trainer_ip}:{trainer_port}') 54 | 55 | return success 56 | 57 | def _report_itertime_impl(self, job_id, iter_time, src_utils): 58 | success = True 59 | num_gpu = 0 60 | for rjob_id in job_id: 61 | if rjob_id>=0: 62 | rjob = JOBS.find_runnable_job(rjob_id) 63 | rjob['real_itertime'] = copy.deepcopy(list(iter_time)) 64 | num_gpu = rjob['num_gpu'] 65 | for i in 
range(self._src_num): # cpu util is approximate 66 | self._src_utils[i] += src_utils[i]*num_gpu 67 | self._logger.info(f'scheduler, update job {job_id} iter_time {list(iter_time)}; src_utils {src_utils} -> {self._src_utils}') 68 | return success 69 | 70 | def query_stats(self, job_id_list): 71 | job_id = max(job_id_list) 72 | assert job_id in self._trainers 73 | finished_iterations = self._trainers[job_id].query_stats() 74 | return finished_iterations 75 | 76 | def has_ready_jobs(self, tmp_time): 77 | if len(JOBS.job_events)>0 and JOBS.job_events[0]['time']<=tmp_time: 78 | return True 79 | else: 80 | return False 81 | 82 | def has_running_trainers(self, running_jobs): 83 | if running_jobs>self._controller.done_queue.qsize(): 84 | return True 85 | else: 86 | return False 87 | 88 | def clear_src_utils(self): 89 | self._src_utils = [0 for _ in range(self._src_num)] 90 | 91 | 92 | if __name__ == '__main__': 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('--scheduler_port', type=int, default=9011) 95 | parser.add_argument('--controller_port', type=int, default=9012) 96 | args = parser.parse_args() 97 | 98 | scheduler = Scheduler(args.scheduler_port, args.controller_port) -------------------------------------------------------------------------------- /cluster_exp/task.py: -------------------------------------------------------------------------------- 1 | from runtime.rpc_stubs.master_to_worker_pb2 import JobInfo 2 | 3 | import subprocess 4 | import os 5 | import utils 6 | 7 | 8 | class Task(object): 9 | def __init__(self, job_info: JobInfo, scheduler_ip, trace_name, this_dir) -> None: 10 | super().__init__() 11 | 12 | self._job_num = job_info.num 13 | self._node_id = list(job_info.node_id) 14 | self._job_id = job_info.job_id 15 | self._job_name = job_info.job_name 16 | self._batch_size = job_info.batch_size 17 | self._iterations = job_info.iterations 18 | self._gpus = job_info.gpus 19 | self._scheduler_ip = scheduler_ip 20 | self._num_gpu = 
job_info.num_gpu 21 | self._this_dir = this_dir 22 | self._job_counter = job_info.job_counter 23 | self._trace_name = trace_name 24 | 25 | 26 | def get_idle_port(self): 27 | return 9013 + 8*min(self._node_id) + int(self._gpus.split(',')[0]) 28 | # return utils.find_free_port() 29 | 30 | 31 | @staticmethod 32 | def test_kill_restart(): 33 | bash_cmd = 'nvidia-smi; sleep 2m; date' 34 | return bash_cmd 35 | 36 | 37 | def real_job(self): 38 | bash_cmd = f'bash {self._this_dir}/workloads/run.sh' 39 | for i in range(self._job_num): 40 | bash_cmd += f' {self._job_name[i]} {self._batch_size[i]} 0 2 -1 {self._iterations[i]} {self._job_id[i]} {self._job_counter[i]}' 41 | bash_cmd += f' {self._num_gpu}' 42 | bash_cmd += f' --scheduler-ip {self._scheduler_ip}' 43 | bash_cmd += f' --trainer-port {self.get_idle_port()} --this-dir {self._this_dir}/workloads' 44 | return bash_cmd 45 | 46 | def run(self): 47 | bash_cmd = '' 48 | # if self._job_name == 'test_kill_restart': 49 | # bash_cmd = self.test_kill_restart() 50 | # else: 51 | bash_cmd = self.real_job() 52 | 53 | cmd = bash_cmd.split() 54 | 55 | hostfile_dir = self._this_dir+'/workloads/hostfiles' 56 | assert os.path.exists(hostfile_dir) 57 | hostfile_list = [f'worker-{node_id}\n' for node_id in self._node_id] 58 | ch = '-' 59 | job_id_str = ch.join([str(x) for x in list(self._job_id)]) 60 | job_counter_str = ch.join([str(x) for x in list(self._job_counter)]) 61 | # print(self._iterations) 62 | with open(hostfile_dir+f'/hostfile-[{job_id_str}]-[{job_counter_str}]', 'w') as f: 63 | f.writelines(hostfile_list) 64 | 65 | environ_dict = dict(os.environ) 66 | environ_dict['CUDA_VISIBLE_DEVICES'] = self._gpus 67 | with open(self.log_path, 'w+') as f: 68 | self._handler = subprocess.Popen( 69 | cmd, 70 | stdout=f, 71 | stderr=f, 72 | env=environ_dict, 73 | ) 74 | 75 | return cmd 76 | 77 | 78 | def terminate(self): 79 | self._handler.terminate() 80 | 81 | def wait(self): 82 | self._handler.wait() 83 | 84 | 85 | @property 86 | def 
return_code(self): 87 | return self._handler.poll() 88 | 89 | @property 90 | def pid(self): 91 | return self._handler.pid 92 | 93 | 94 | @property 95 | def log_path(self): 96 | if not os.path.exists(f'{self._trace_name}/'): 97 | os.makedirs(f'{self._trace_name}/') 98 | path = '' 99 | for i in range(self._job_num): 100 | if i==0: 101 | path = f'{self._trace_name}/{self._job_id[i]}-{self._job_counter[i]}-{self._job_name[i]}' 102 | else: 103 | path += f'_{self._job_id[i]}-{self._job_counter[i]}-{self._job_name[i]}' 104 | return path + '.txt' 105 | -------------------------------------------------------------------------------- /cluster_exp/trainer.py: -------------------------------------------------------------------------------- 1 | from runtime.rpc import trainer_client, trainer_server 2 | 3 | import argparse 4 | import utils 5 | import time 6 | import threading 7 | 8 | 9 | class Trainer(object): 10 | def __init__(self, scheduler_ip, scheduler_port, trainer_ip, trainer_port, job_id) -> None: 11 | super().__init__() 12 | 13 | self._trainer_ip = trainer_ip 14 | self._trainer_port = trainer_port 15 | self._job_id = job_id 16 | # self._batch_size = batch_size 17 | # self._demotion_threshold = demotion_threshold 18 | 19 | self._logger = utils.make_logger(__name__) 20 | self._start_time = time.time() 21 | self._finished_iteraions = 0 22 | 23 | self._client_for_scheduler = trainer_client.TrainerClientForScheduler(self._logger, scheduler_ip, scheduler_port) 24 | self.init_stats() 25 | 26 | self._server_for_scheduler = self.make_server_for_scheduler(self._trainer_port) 27 | 28 | self.register() 29 | 30 | self._logger.info(f'job {self._job_id}, trainer, start, {self._start_time}') 31 | 32 | def register(self): 33 | success = False 34 | while success == False: 35 | success = self._client_for_scheduler.register_trainer(self._trainer_ip, self._trainer_port, self._job_id) 36 | 37 | def report_itertime(self, iter_time, src_utils): 38 | success = 
self._client_for_scheduler.report_itertime(self._job_id, iter_time, src_utils) 39 | self._logger.info(f'job {self._job_id} reported iteration time {iter_time} and resource utils {src_utils}') 40 | 41 | def init_stats(self): 42 | pass 43 | 44 | 45 | def update_stats(self, iteration_time): 46 | self._finished_iteraions += 1 47 | self._logger.info(f'trainer update_stats: {self._finished_iteraions}, {iteration_time}') 48 | 49 | 50 | def record(self, iteration_time): 51 | self.update_stats(iteration_time) 52 | 53 | # if self.demotion() == True: 54 | # self._client_for_scheduler.report_stats(self._job_id, self._finished_iteraions, True) 55 | 56 | 57 | 58 | def make_server_for_scheduler(self, port: int): 59 | callbacks = { 60 | 'QueryStats' : self._query_stats_impl, 61 | } 62 | 63 | server_thread = threading.Thread( 64 | target=trainer_server.serve, 65 | args=(port, self._logger, callbacks)) 66 | server_thread.setDaemon(True) 67 | server_thread.start() 68 | 69 | return server_thread 70 | 71 | 72 | def _query_stats_impl(self): 73 | self._logger.info(f'trainer query stats, {self._finished_iteraions}') 74 | return self._finished_iteraions 75 | 76 | 77 | # def demotion(self) -> bool: 78 | # if self._demotion_threshold == None: 79 | # return False 80 | 81 | # return (time.time() - self._start_time >= self._demotion_threshold) 82 | 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument('--scheduler_ip', type=str, required=True) 87 | parser.add_argument('--scheduler_port', type=int, default=9011) 88 | parser.add_argument('--trainer_port', type=int) 89 | parser.add_argument('--job_id', type=int, default=-1) 90 | # parser.add_argument('--batch_size', type=int, default=8) 91 | # parser.add_argument('--demotion_threshold', type=float, default=None) 92 | args = parser.parse_args() 93 | 94 | trainer = Trainer(args.scheduler_ip, args.scheduler_port, utils.get_host_ip(), args.trainer_port, args.job_id) 
-------------------------------------------------------------------------------- /cluster_exp/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import socket 3 | import sys 4 | import subprocess 5 | import flags 6 | import logging 7 | import math 8 | import os 9 | import xml.etree.ElementTree as ET 10 | from contextlib import closing 11 | import cvxpy as cp 12 | import numpy as np 13 | 14 | FLAGS = flags.FLAGS 15 | 16 | def make_logger(name): 17 | LOG_FORMAT = '{name}:{levelname} [{asctime}] {message}' 18 | 19 | logger = logging.getLogger(name) 20 | logger.setLevel(logging.DEBUG) 21 | logger.propagate = False 22 | ch = logging.StreamHandler() 23 | ch.setFormatter(logging.Formatter(LOG_FORMAT, style='{')) 24 | logger.addHandler(ch) 25 | 26 | return logger 27 | 28 | def get_host_ip(): 29 | """get the host ip elegantly 30 | https://www.chenyudong.com/archives/python-get-local-ip-graceful.html 31 | """ 32 | try: 33 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 34 | s.connect(('8.8.8.8', 80)) 35 | ip = s.getsockname()[0] 36 | finally: 37 | s.close() 38 | return ip 39 | 40 | def print_fn(log): 41 | if FLAGS.print: 42 | print(log) 43 | if FLAGS.flush_stdout: 44 | sys.stdout.flush() 45 | 46 | 47 | def mkdir(folder_path): 48 | cmd = 'mkdir -p ' + folder_path 49 | ret = subprocess.check_call(cmd, shell=True) 50 | print_fn(ret) 51 | 52 | 53 | def search_dict_list(dict_list, key, value): 54 | ''' 55 | Search the targeted in the dict_list 56 | Return: 57 | list entry, or just None 58 | ''' 59 | for e in dict_list: 60 | # if e.has_key(key) == True: 61 | if key in e: 62 | if math.isclose(e[key], value, rel_tol=1e-9): 63 | return e 64 | 65 | return None 66 | 67 | def parse_xml(filename:str): 68 | fb_memory_usage = [] 69 | utilization = [] 70 | file_content = open(filename, mode='r').read() 71 | xmls = file_content.split('\n') 72 | for i in range(len(xmls) - 1): 73 | root = 
ET.fromstring(xmls[i] + '\n') 74 | for child in root[4]: 75 | if child.tag == 'fb_memory_usage': 76 | fb_memory_usage.append(child[1].text) 77 | if child.tag == "utilization": 78 | utilization.append(child[0].text) 79 | return fb_memory_usage, utilization 80 | 81 | def find_free_port(): 82 | """ 83 | https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number 84 | """ 85 | with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: 86 | s.bind(('', 0)) 87 | s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 88 | return s.getsockname()[1] 89 | 90 | if __name__ == '__main__': 91 | print(get_host_ip()) -------------------------------------------------------------------------------- /cluster_exp/workloads/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .deep_rl import * 2 | from .cv_model import * 3 | from .a2c_model import * 4 | from .dqn_model import * 5 | from .nlp_model import * -------------------------------------------------------------------------------- /cluster_exp/workloads/models/a2c_model.py: -------------------------------------------------------------------------------- 1 | # add for RL 2 | from .deep_rl import * 3 | 4 | 5 | class A2CModel: 6 | def __init__(self, idx, args, sargs): 7 | self.idx = idx 8 | self.args = args 9 | self.sargs = sargs # specific args for this model 10 | 11 | def prepare(self, hvd): 12 | ''' 13 | prepare dataloader, model, optimizer for training 14 | ''' 15 | if hvd.local_rank()==0: 16 | mkdir('log') 17 | mkdir('tf_log') 18 | self.device = torch.device("cuda") 19 | Config.DEVICE = self.device 20 | kwargs = dict() 21 | kwargs['log_level'] = 0 22 | kwargs['game'] = 'BreakoutNoFrameskip-v4' 23 | config = Config() 24 | config.merge(kwargs) 25 | 26 | config.num_workers = self.sargs["batch_size"] 27 | config.task_fn = lambda: Task(config.game, num_envs=config.num_workers) 28 | config.eval_env = Task(config.game) 29 | 
config.optimizer_fn = lambda params: torch.optim.RMSprop(params, lr=1e-4, alpha=0.99, eps=1e-5) 30 | config.network_fn = lambda: CategoricalActorCriticNet(config.state_dim, config.action_dim, NatureConvBody()) 31 | config.state_normalizer = ImageNormalizer() 32 | config.reward_normalizer = SignNormalizer() 33 | config.discount = 0.99 34 | config.use_gae = True 35 | config.gae_tau = 1.0 36 | config.entropy_weight = 0.01 37 | config.rollout_length = self.args.rollout_length 38 | config.gradient_clip = 5 39 | config.max_steps = int(2e7) 40 | 41 | self.model = A2CAgent(config) 42 | self.config = self.model.config 43 | 44 | self.optimizer = hvd.DistributedOptimizer(self.model.optimizer, named_parameters=self.model.network.named_parameters(prefix='model'+str(self.idx))) 45 | 46 | hvd.broadcast_parameters(self.model.network.state_dict(), root_rank=0) 47 | hvd.broadcast_optimizer_state(self.optimizer, root_rank=0) 48 | 49 | def get_data(self): 50 | ''' 51 | get data 52 | ''' 53 | self.storage = Storage(self.config.rollout_length) 54 | states = self.model.states 55 | for _ in range(self.model.config.rollout_length): 56 | prediction = self.model.network(self.model.config.state_normalizer(states)) 57 | next_states, rewards, terminals, info = self.model.task.step(to_np(prediction['action'])) 58 | self.model.record_online_return(info) 59 | rewards = self.model.config.reward_normalizer(rewards) 60 | self.storage.feed(prediction) 61 | self.storage.feed({'reward': tensor(rewards).unsqueeze(-1), 62 | 'mask': tensor(1 - terminals).unsqueeze(-1)}) 63 | 64 | states = next_states 65 | self.model.total_steps += self.model.config.num_workers 66 | return states 67 | 68 | 69 | def forward_backward(self, thread): 70 | ''' 71 | forward, calculate loss and backward 72 | ''' 73 | thread.join() 74 | states = thread.get_result() 75 | self.model.states = states 76 | 77 | prediction = self.model.network(self.model.config.state_normalizer(states)) 78 | self.storage.feed(prediction) 79 | 
self.storage.placeholder() 80 | 81 | advantages = tensor(np.zeros((self.model.config.num_workers, 1))) 82 | returns = prediction['v'].detach() 83 | for i in reversed(range(self.model.config.rollout_length)): 84 | returns = self.storage.reward[i] + self.model.config.discount * self.storage.mask[i] * returns 85 | if not self.model.config.use_gae: 86 | advantages = returns - self.storage.v[i].detach() 87 | else: 88 | td_error = self.storage.reward[i] + self.model.config.discount * self.storage.mask[i] * self.storage.v[i + 1] - self.storage.v[i] 89 | advantages = advantages * self.model.config.gae_tau * self.model.config.discount * self.storage.mask[i] + td_error 90 | self.storage.advantage[i] = advantages.detach() 91 | self.storage.ret[i] = returns.detach() 92 | 93 | entries = self.storage.extract(['log_pi_a', 'v', 'ret', 'advantage', 'entropy']) 94 | policy_loss = -(entries.log_pi_a * entries.advantage).mean() 95 | value_loss = 0.5 * (entries.ret - entries.v).pow(2).mean() 96 | entropy_loss = entries.entropy.mean() 97 | 98 | self.model.optimizer.zero_grad() 99 | (policy_loss - self.model.config.entropy_weight * entropy_loss + 100 | self.model.config.value_loss_weight * value_loss).backward() 101 | 102 | def comm(self): 103 | # self.optimizer.synchronize() 104 | # nn.utils.clip_grad_norm_(self.model.network.parameters(), self.model.config.gradient_clip) 105 | # with self.optimizer.skip_synchronize(): 106 | self.optimizer.step() 107 | 108 | def print_info(self): 109 | print("Model ", self.idx, ": ", self.sargs["model_name"], "; batch size: ", self.sargs["batch_size"], "; rollout length: ", self.args.rollout_length) 110 | 111 | def data_size(self): 112 | return 0 -------------------------------------------------------------------------------- /cluster_exp/workloads/models/cv_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.multiprocessing as mp 3 | import torch.nn.functional as F 4 | import 
torch.optim as optim 5 | import torch.utils.data.distributed 6 | # import torch.profiler 7 | # from torch.utils.tensorboard import SummaryWriter 8 | from torchvision import datasets, transforms, models 9 | 10 | 11 | class CVModel: 12 | def __init__(self, idx, args, sargs): 13 | self.idx = idx 14 | self.args = args 15 | self.sargs = sargs # specific args for this model 16 | 17 | def prepare(self, hvd): 18 | ''' 19 | prepare dataloader, model, optimizer for training 20 | ''' 21 | self.device = torch.device("cuda") 22 | 23 | train_dataset = \ 24 | datasets.ImageFolder(self.sargs["train_dir"], 25 | transform=transforms.Compose([ 26 | transforms.RandomResizedCrop(224), 27 | transforms.RandomHorizontalFlip(), 28 | transforms.ToTensor(), 29 | transforms.Normalize(mean=[0.485, 0.456, 0.406], 30 | std=[0.229, 0.224, 0.225]) 31 | ])) 32 | 33 | self.train_sampler = torch.utils.data.distributed.DistributedSampler( 34 | train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) 35 | self.train_loader = torch.utils.data.DataLoader( 36 | train_dataset, batch_size=self.sargs["batch_size"], 37 | sampler=self.train_sampler, num_workers=self.sargs["num_workers"], 38 | prefetch_factor=self.sargs["prefetch_factor"]) 39 | 40 | 41 | self.model = getattr(models, self.sargs["model_name"])(num_classes=self.args.num_classes) 42 | 43 | if self.args.cuda: 44 | self.model.cuda() 45 | 46 | optimizer = optim.SGD(self.model.parameters(), lr=(self.args.base_lr), 47 | momentum=self.args.momentum, weight_decay=self.args.wd) 48 | compression = hvd.Compression.fp16 if self.args.fp16_allreduce else hvd.Compression.none 49 | self.optimizer = hvd.DistributedOptimizer( 50 | optimizer, named_parameters=self.model.named_parameters(prefix='model'+str(self.idx)), 51 | compression=compression, 52 | op=hvd.Adasum if self.args.use_adasum else hvd.Average) 53 | 54 | hvd.broadcast_parameters(self.model.state_dict(), root_rank=0) 55 | hvd.broadcast_optimizer_state(self.optimizer, root_rank=0) 56 | 57 | 
self.dataloader_iter = iter(self.train_loader) 58 | 59 | self.cur_epoch = 0 60 | self.batch_idx = -1 61 | 62 | self.model.train() 63 | 64 | def get_data(self): 65 | ''' 66 | get data 67 | ''' 68 | try: 69 | data,target = next(self.dataloader_iter) 70 | except StopIteration: 71 | self.cur_epoch += 1 72 | self.train_sampler.set_epoch(self.cur_epoch) 73 | self.dataloader_iter = iter(self.train_loader) 74 | data,target = next(self.dataloader_iter) 75 | self.batch_idx = -1 76 | self.batch_idx +=1 77 | 78 | return data,target 79 | 80 | def forward_backward(self, thread): 81 | ''' 82 | forward, calculate loss and backward 83 | ''' 84 | thread.join() 85 | data, target = thread.get_result() 86 | if self.args.cuda: 87 | data = data.to(self.device, non_blocking=True) 88 | target = target.to(self.device, non_blocking=True) 89 | 90 | self.optimizer.zero_grad() 91 | output = self.model(data) 92 | loss = F.cross_entropy(output, target) 93 | loss.backward() 94 | 95 | def comm(self): 96 | ''' 97 | sync for communication 98 | ''' 99 | self.optimizer.step() 100 | 101 | def print_info(self): 102 | print("Model ", self.idx, ": ", self.sargs["model_name"], "; batch size: ", self.sargs["batch_size"]) 103 | 104 | def data_size(self): 105 | # each image is 108.6kb on average 106 | return self.sargs["batch_size"] * 108.6 -------------------------------------------------------------------------------- /cluster_exp/workloads/models/deep_rl/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import * 2 | from .component import * 3 | from .network import * 4 | from .utils import * -------------------------------------------------------------------------------- /cluster_exp/workloads/models/deep_rl/agent/A2C_agent.py: -------------------------------------------------------------------------------- 1 | ####################################################################### 2 | # Copyright (C) 2017 Shangtong 
#######################################################################
# Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com)    #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

from ..network import *
from ..component import *
from .BaseAgent import *
import time


class A2CAgent(BaseAgent):
    """Synchronous Advantage Actor-Critic (A2C).

    Collects `config.rollout_length` steps from a vectorized task, then
    performs one combined policy/value/entropy gradient update.
    """

    def __init__(self, config):
        BaseAgent.__init__(self, config)
        self.config = config
        self.task = config.task_fn()        # vectorized environment
        self.network = config.network_fn()  # shared actor-critic network
        self.optimizer = config.optimizer_fn(self.network.parameters())
        self.total_steps = 0
        self.states = self.task.reset()

    def step(self):
        """Run one rollout and one optimization step."""
        config = self.config
        storage = Storage(config.rollout_length)
        states = self.states
        for _ in range(config.rollout_length):
            prediction = self.network(config.state_normalizer(states))
            next_states, rewards, terminals, info = self.task.step(to_np(prediction['action']))
            self.record_online_return(info)
            rewards = config.reward_normalizer(rewards)
            storage.feed(prediction)
            # mask is 0 where the episode terminated, cutting the return bootstrap
            storage.feed({'reward': tensor(rewards).unsqueeze(-1),
                          'mask': tensor(1 - terminals).unsqueeze(-1)})

            states = next_states
            self.total_steps += config.num_workers

        self.states = states
        # Extra forward pass to bootstrap the value of the final state.
        prediction = self.network(config.state_normalizer(states))
        storage.feed(prediction)
        storage.placeholder()

        advantages = tensor(np.zeros((config.num_workers, 1)))
        returns = prediction['v'].detach()
        # Backward sweep over the rollout: n-step returns, and either plain
        # advantage (returns - V) or GAE accumulation.
        for i in reversed(range(config.rollout_length)):
            returns = storage.reward[i] + config.discount * storage.mask[i] * returns
            if not config.use_gae:
                advantages = returns - storage.v[i].detach()
            else:
                td_error = storage.reward[i] + config.discount * storage.mask[i] * storage.v[i + 1] - storage.v[i]
                advantages = advantages * config.gae_tau * config.discount * storage.mask[i] + td_error
            storage.advantage[i] = advantages.detach()
            storage.ret[i] = returns.detach()

        entries = storage.extract(['log_pi_a', 'v', 'ret', 'advantage', 'entropy'])
        policy_loss = -(entries.log_pi_a * entries.advantage).mean()
        value_loss = 0.5 * (entries.ret - entries.v).pow(2).mean()
        entropy_loss = entries.entropy.mean()

        self.optimizer.zero_grad()
        (policy_loss - config.entropy_weight * entropy_loss +
         config.value_loss_weight * value_loss).backward()
        nn.utils.clip_grad_norm_(self.network.parameters(), config.gradient_clip)
        self.optimizer.step()


# --- cluster_exp/workloads/models/deep_rl/agent/BaseAgent.py ---
#######################################################################
# Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com)    #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import torch
import numpy as np
from ..utils import *
import torch.multiprocessing as mp
from collections import deque
from skimage.io import imsave
import time


class BaseAgent:
    """Shared agent functionality: logging, checkpointing, evaluation,
    online-return recording, and curriculum task switching."""

    def __init__(self, config):
        self.config = config
        self.logger = get_logger(tag=config.tag, log_level=config.log_level)
        self.task_ind = 0

    def close(self):
        close_obj(self.task)

    def save(self, filename):
        # Saves network weights and the state-normalizer statistics side by side.
        torch.save(self.network.state_dict(), '%s.model' % (filename))
        with open('%s.stats' % (filename), 'wb') as f:
            # NOTE(review): `pickle` is not imported in this file — presumably
            # re-exported by `from ..utils import *`; confirm.
            pickle.dump(self.config.state_normalizer.state_dict(), f)

    def load(self, filename):
        # map_location keeps CPU-saved checkpoints loadable without a GPU.
        state_dict = torch.load('%s.model' % filename, map_location=lambda storage, loc: storage)
        self.network.load_state_dict(state_dict)
        with open('%s.stats' % (filename), 'rb') as f:
            self.config.state_normalizer.load_state_dict(pickle.load(f))
    def eval_step(self, state):
        """Select an action for evaluation; implemented by subclasses."""
        raise NotImplementedError

    def eval_episode(self):
        # Run one evaluation episode until the env reports an episodic return.
        env = self.config.eval_env
        state = env.reset()
        while True:
            action = self.eval_step(state)
            state, reward, done, info = env.step(action)
            ret = info[0]['episodic_return']
            if ret is not None:
                break
        return ret

    def eval_episodes(self):
        """Run `config.eval_episodes` episodes; log mean and standard error."""
        episodic_returns = []
        for ep in range(self.config.eval_episodes):
            total_rewards = self.eval_episode()
            episodic_returns.append(np.sum(total_rewards))
        self.logger.info('steps %d, episodic_return_test %.2f(%.2f)' % (
            self.total_steps, np.mean(episodic_returns), np.std(episodic_returns) / np.sqrt(len(episodic_returns))
        ))
        self.logger.add_scalar('episodic_return_test', np.mean(episodic_returns), self.total_steps)
        return {
            'episodic_return_test': np.mean(episodic_returns),
        }

    def record_online_return(self, info, offset=0):
        # `info` is a dict for one worker or a tuple of per-worker dicts;
        # recurse over tuples, using the worker index as the step offset.
        if isinstance(info, dict):
            ret = info['episodic_return']
            if ret is not None:
                self.logger.add_scalar('episodic_return_train', ret, self.total_steps + offset)
                self.logger.info('steps %d, episodic_return_train %s' % (self.total_steps + offset, ret))
        elif isinstance(info, tuple):
            for i, info_ in enumerate(info):
                self.record_online_return(info_, i)
        else:
            raise NotImplementedError

    def switch_task(self):
        # Curriculum support: divide max_steps evenly among config.tasks and
        # advance to the next task once the current segment is exhausted.
        config = self.config
        if not config.tasks:
            return
        segs = np.linspace(0, config.max_steps, len(config.tasks) + 1)
        if self.total_steps > segs[self.task_ind + 1]:
            self.task_ind += 1
            self.task = config.tasks[self.task_ind]
            self.states = self.task.reset()
            self.states = config.state_normalizer(self.states)

    def record_episode(self, dir, env):
        """Roll out one episode, dumping one rendered frame per step into `dir`."""
        mkdir(dir)
        steps = 0
        state = env.reset()
        while True:
            self.record_obs(env, dir, steps)
            action = self.record_step(state)
            state, reward, done, info = env.step(action)
            ret = info[0]['episodic_return']
            steps += 1
            if ret is not None:
                break

    def record_step(self, state):
        raise NotImplementedError

    # For DMControl
    def record_obs(self, env, dir, steps):
        env = env.env.envs[0]
        obs = env.render(mode='rgb_array')
        imsave('%s/%04d.png' % (dir, steps), obs)


class BaseActor(mp.Process):
    """Transition producer, optionally running on a separate process.

    When `config.async_actor` is False the actor runs synchronously inside the
    caller's process; otherwise it runs as an mp.Process and serves requests
    over a Pipe using the opcodes below.
    """

    STEP = 0
    RESET = 1
    EXIT = 2
    SPECS = 3
    NETWORK = 4
    CACHE = 5

    def __init__(self, config):
        mp.Process.__init__(self)
        self.config = config
        self.__pipe, self.__worker_pipe = mp.Pipe()

        self._state = None
        self._task = None
        self._network = None
        self._total_steps = 0
        self.__cache_len = 2

        if not config.async_actor:
            # Synchronous mode: short-circuit process start/stop, sample inline.
            self.start = lambda: None
            self.step = self._sample
            self.close = lambda: None
            self._set_up()
            self._task = config.task_fn()

    def _sample(self):
        # Collect one batch of transitions (None entries are skipped).
        transitions = []
        for _ in range(self.config.sgd_update_frequency):
            transition = self._transition()
            if transition is not None:
                transitions.append(transition)
        return transitions

    def run(self):
        # Worker-process loop: serve STEP/EXIT/NETWORK requests over the pipe.
        self._set_up()
        config = self.config
        self._task = config.task_fn()

        cache = deque([], maxlen=2)
        while True:
            op, data = self.__worker_pipe.recv()
            if op == self.STEP:
                # Keep the cache one batch ahead so the learner never waits.
                if not len(cache):
                    cache.append(self._sample())
                    cache.append(self._sample())
                self.__worker_pipe.send(cache.popleft())
                cache.append(self._sample())
            elif op == self.EXIT:
                self.__worker_pipe.close()
                return
            elif op == self.NETWORK:
                self._network = data
            else:
                raise NotImplementedError

    def _transition(self):
        raise NotImplementedError
    def _set_up(self):
        # Hook for subclass initialization inside the worker process.
        pass

    def step(self):
        # Learner-side request: ask the worker for the next transition batch.
        self.__pipe.send([self.STEP, None])
        return self.__pipe.recv()

    def close(self):
        self.__pipe.send([self.EXIT, None])
        self.__pipe.close()

    def set_network(self, net):
        # In synchronous mode share the object directly; otherwise ship it
        # through the pipe to the worker process.
        if not self.config.async_actor:
            self._network = net
        else:
            self.__pipe.send([self.NETWORK, net])


# --- cluster_exp/workloads/models/deep_rl/agent/CategoricalDQN_agent.py ---
#######################################################################
# Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com)    #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

from ..network import *
from ..component import *
from ..utils import *
import time
from .BaseAgent import *
from .DQN_agent import *


class CategoricalDQNActor(DQNActor):
    """DQN actor whose network outputs a categorical value distribution."""

    def __init__(self, config):
        super().__init__(config)

    def _set_up(self):
        # The atom support must become a tensor inside the worker process.
        self.config.atoms = tensor(self.config.atoms)

    def compute_q(self, prediction):
        # Expected value: atom values weighted by their probabilities.
        q_values = (prediction['prob'] * self.config.atoms).sum(-1)
        return to_np(q_values)


class CategoricalDQNAgent(DQNAgent):
    """C51: distributional DQN over a fixed support of value atoms."""

    def __init__(self, config):
        BaseAgent.__init__(self, config)
        self.config = config
        config.lock = mp.Lock()
        # Evenly spaced support of the value distribution.
        config.atoms = np.linspace(config.categorical_v_min,
                                   config.categorical_v_max, config.categorical_n_atoms)

        self.replay = config.replay_fn()
        self.actor = CategoricalDQNActor(config)

        self.network = config.network_fn()
        self.network.share_memory()
        self.target_network = config.network_fn()
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = config.optimizer_fn(self.network.parameters())

        self.actor.set_network(self.network)

        self.total_steps = 0
        self.batch_indices = range_tensor(config.batch_size)
        self.atoms = tensor(config.atoms)
        # Spacing between neighbouring atoms, used in the projection step.
        self.delta_atom = (config.categorical_v_max - config.categorical_v_min) / float(config.categorical_n_atoms - 1)
    def eval_step(self, state):
        # Greedy action w.r.t. the expected value of the predicted distribution.
        self.config.state_normalizer.set_read_only()
        state = self.config.state_normalizer(state)
        prediction = self.network(state)
        q = (prediction['prob'] * self.atoms).sum(-1)
        action = to_np(q.argmax(-1))
        self.config.state_normalizer.unset_read_only()
        return action

    def compute_loss(self, transitions):
        """C51 loss: project the Bellman-shifted target distribution onto the
        fixed support, return per-sample KL to the predicted distribution."""
        config = self.config
        states = self.config.state_normalizer(transitions.state)
        next_states = self.config.state_normalizer(transitions.next_state)
        with torch.no_grad():
            prob_next = self.target_network(next_states)['prob']
            q_next = (prob_next * self.atoms).sum(-1)
            if config.double_q:
                # Double DQN: pick a* with the online net, evaluate with target.
                a_next = torch.argmax((self.network(next_states)['prob'] * self.atoms).sum(-1), dim=-1)
            else:
                a_next = torch.argmax(q_next, dim=-1)
            prob_next = prob_next[self.batch_indices, a_next, :]

        rewards = tensor(transitions.reward).unsqueeze(-1)
        masks = tensor(transitions.mask).unsqueeze(-1)
        # Bellman-shifted atoms, clamped into the support range.
        atoms_target = rewards + self.config.discount ** config.n_step * masks * self.atoms.view(1, -1)
        atoms_target.clamp_(self.config.categorical_v_min, self.config.categorical_v_max)
        atoms_target = atoms_target.unsqueeze(1)
        # Linear projection of shifted atoms onto the fixed support.
        target_prob = (1 - (atoms_target - self.atoms.view(1, -1, 1)).abs() / self.delta_atom).clamp(0, 1) * \
                      prob_next.unsqueeze(1)
        target_prob = target_prob.sum(-1)

        log_prob = self.network(states)['log_prob']
        actions = tensor(transitions.action).long()
        log_prob = log_prob[self.batch_indices, actions, :]
        # KL(target || predicted); 1e-5 avoids log(0) on empty bins.
        KL = (target_prob * target_prob.add(1e-5).log() - target_prob * log_prob).sum(-1)
        return KL

    def reduce_loss(self, loss):
        return loss.mean()


# --- cluster_exp/workloads/models/deep_rl/agent/DDPG_agent.py ---
#######################################################################
# Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com)    #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

from ..network import *
from ..component import *
from .BaseAgent import *
import torchvision


class DDPGAgent(BaseAgent):
    """Deep Deterministic Policy Gradient with target-network soft updates."""

    def __init__(self, config):
        BaseAgent.__init__(self, config)
        self.config = config
        self.task = config.task_fn()
        self.network = config.network_fn()
        self.target_network = config.network_fn()
        self.target_network.load_state_dict(self.network.state_dict())
        self.replay = config.replay_fn()
        self.random_process = config.random_process_fn()  # exploration noise
        self.total_steps = 0
        self.state = None

    def soft_update(self, target, src):
        # Polyak averaging: target <- (1 - mix) * target + mix * src.
        for target_param, param in zip(target.parameters(), src.parameters()):
            target_param.detach_()
            target_param.copy_(target_param * (1.0 - self.config.target_network_mix) +
                               param * self.config.target_network_mix)

    def eval_step(self, state):
        self.config.state_normalizer.set_read_only()
        state = self.config.state_normalizer(state)
        action = self.network(state)
        self.config.state_normalizer.unset_read_only()
        return to_np(action)
    def step(self):
        """Interact for one environment step; past warm-up, run one critic
        update followed by one actor update and a target soft update."""
        config = self.config
        if self.state is None:
            self.random_process.reset_states()
            self.state = self.task.reset()
            self.state = config.state_normalizer(self.state)

        if self.total_steps < config.warm_up:
            # Uniform random exploration before learning starts.
            action = [self.task.action_space.sample()]
        else:
            action = self.network(self.state)
            action = to_np(action)
            action += self.random_process.sample()
        action = np.clip(action, self.task.action_space.low, self.task.action_space.high)
        next_state, reward, done, info = self.task.step(action)
        next_state = self.config.state_normalizer(next_state)
        self.record_online_return(info)
        reward = self.config.reward_normalizer(reward)

        self.replay.feed(dict(
            state=self.state,
            action=action,
            reward=reward,
            next_state=next_state,
            mask=1-np.asarray(done, dtype=np.int32),
        ))

        if done[0]:
            self.random_process.reset_states()
        self.state = next_state
        self.total_steps += 1

        if self.replay.size() >= config.warm_up:
            transitions = self.replay.sample()
            states = tensor(transitions.state)
            actions = tensor(transitions.action)
            rewards = tensor(transitions.reward).unsqueeze(-1)
            next_states = tensor(transitions.next_state)
            mask = tensor(transitions.mask).unsqueeze(-1)

            # Critic target: r + gamma * mask * Q_target(s', pi_target(s')).
            phi_next = self.target_network.feature(next_states)
            a_next = self.target_network.actor(phi_next)
            q_next = self.target_network.critic(phi_next, a_next)
            q_next = config.discount * mask * q_next
            q_next.add_(rewards)
            q_next = q_next.detach()
            phi = self.network.feature(states)
            q = self.network.critic(phi, actions)
            critic_loss = (q - q_next).pow(2).mul(0.5).sum(-1).mean()

            self.network.zero_grad()
            critic_loss.backward()
            self.network.critic_opt.step()

            # Actor update: maximize Q(s, pi(s)); features detached so the
            # policy gradient does not flow through the critic's trunk.
            phi = self.network.feature(states)
            action = self.network.actor(phi)
            policy_loss = -self.network.critic(phi.detach(), action).mean()

            self.network.zero_grad()
            policy_loss.backward()
            self.network.actor_opt.step()

            self.soft_update(self.target_network, self.network)


# --- cluster_exp/workloads/models/deep_rl/agent/DQN_agent.py ---
#######################################################################
# Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com)    #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

from ..network import *
from ..component import *
from ..utils import *
import time
from .BaseAgent import *


class DQNActor(BaseActor):
    """Epsilon-greedy actor for the (possibly asynchronous) DQN family."""

    def __init__(self, config):
        BaseActor.__init__(self, config)
        self.config = config
        self.start()

    def compute_q(self, prediction):
        q_values = to_np(prediction['q'])
        return q_values

    def _transition(self):
        if self._state is None:
            self._state = self._task.reset()
        config = self.config
        if config.noisy_linear:
            self._network.reset_noise()
        with config.lock:
            prediction = self._network(config.state_normalizer(self._state))
            q_values = self.compute_q(prediction)

        # Exploration schedule: noisy nets disable epsilon; otherwise act
        # fully randomly until exploration_steps, then use the schedule.
        if config.noisy_linear:
            epsilon = 0
        elif self._total_steps < config.exploration_steps:
            epsilon = 1
        else:
            epsilon = config.random_action_prob()
        action = epsilon_greedy(epsilon, q_values)
        next_state, reward, done, info = self._task.step(action)
        entry = [self._state, action, reward, next_state, done, info]
        self._total_steps += 1
        self._state = next_state
        return entry


class DQNAgent(BaseAgent):
    """DQN learner with (optionally prioritized) replay and a target network."""

    def __init__(self, config):
        BaseAgent.__init__(self, config)
        self.config = config
        config.lock = mp.Lock()

        self.replay = config.replay_fn()
        self.actor = DQNActor(config)

        self.network = config.network_fn()
        self.network.share_memory()
        self.target_network = config.network_fn()
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = config.optimizer_fn(self.network.parameters())

        self.actor.set_network(self.network)
        self.total_steps = 0
56 | 57 | self.network = config.network_fn() 58 | self.network.share_memory() 59 | self.target_network = config.network_fn() 60 | self.target_network.load_state_dict(self.network.state_dict()) 61 | self.optimizer = config.optimizer_fn(self.network.parameters()) 62 | 63 | self.actor.set_network(self.network) 64 | self.total_steps = 0 65 | 66 | def close(self): 67 | close_obj(self.replay) 68 | close_obj(self.actor) 69 | 70 | def eval_step(self, state): 71 | self.config.state_normalizer.set_read_only() 72 | state = self.config.state_normalizer(state) 73 | q = self.network(state)['q'] 74 | action = to_np(q.argmax(-1)) 75 | self.config.state_normalizer.unset_read_only() 76 | return action 77 | 78 | def reduce_loss(self, loss): 79 | return loss.pow(2).mul(0.5).mean() 80 | 81 | def compute_loss(self, transitions): 82 | config = self.config 83 | states = self.config.state_normalizer(transitions.state) 84 | next_states = self.config.state_normalizer(transitions.next_state) 85 | with torch.no_grad(): 86 | q_next = self.target_network(next_states)['q'].detach() 87 | if self.config.double_q: 88 | best_actions = torch.argmax(self.network(next_states)['q'], dim=-1) 89 | q_next = q_next.gather(1, best_actions.unsqueeze(-1)).squeeze(1) 90 | else: 91 | q_next = q_next.max(1)[0] 92 | masks = tensor(transitions.mask) 93 | rewards = tensor(transitions.reward) 94 | q_target = rewards + self.config.discount ** config.n_step * q_next * masks 95 | actions = tensor(transitions.action).long() 96 | q = self.network(states)['q'] 97 | q = q.gather(1, actions.unsqueeze(-1)).squeeze(-1) 98 | loss = q_target - q 99 | return loss 100 | 101 | def step(self): 102 | config = self.config 103 | transitions = self.actor.step() 104 | for states, actions, rewards, next_states, dones, info in transitions: 105 | self.record_online_return(info) 106 | self.total_steps += 1 107 | self.replay.feed(dict( 108 | state=np.array([s[-1] if isinstance(s, LazyFrames) else s for s in states]), 109 | action=actions, 110 
| reward=[config.reward_normalizer(r) for r in rewards], 111 | mask=1 - np.asarray(dones, dtype=np.int32), 112 | )) 113 | 114 | if self.total_steps > self.config.exploration_steps: 115 | transitions = self.replay.sample() 116 | if config.noisy_linear: 117 | self.target_network.reset_noise() 118 | self.network.reset_noise() 119 | loss = self.compute_loss(transitions) 120 | if isinstance(transitions, PrioritizedTransition): 121 | priorities = loss.abs().add(config.replay_eps).pow(config.replay_alpha) 122 | idxs = tensor(transitions.idx).long() 123 | self.replay.update_priorities(zip(to_np(idxs), to_np(priorities))) 124 | sampling_probs = tensor(transitions.sampling_prob) 125 | weights = sampling_probs.mul(sampling_probs.size(0)).add(1e-6).pow(-config.replay_beta()) 126 | weights = weights / weights.max() 127 | loss = loss.mul(weights) 128 | 129 | loss = self.reduce_loss(loss) 130 | self.optimizer.zero_grad() 131 | loss.backward() 132 | nn.utils.clip_grad_norm_(self.network.parameters(), self.config.gradient_clip) 133 | with config.lock: 134 | self.optimizer.step() 135 | 136 | if self.total_steps / self.config.sgd_update_frequency % \ 137 | self.config.target_network_update_freq == 0: 138 | self.target_network.load_state_dict(self.network.state_dict()) 139 | -------------------------------------------------------------------------------- /cluster_exp/workloads/models/deep_rl/agent/NStepDQN_agent.py: -------------------------------------------------------------------------------- 1 | ####################################################################### 2 | # Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com) # 3 | # Permission given to modify the code as long as you keep this # 4 | # declaration at the top # 5 | ####################################################################### 6 | 7 | from ..network import * 8 | from ..component import * 9 | from ..utils import * 10 | from .BaseAgent import * 11 | 12 | 13 | class NStepDQNAgent(BaseAgent): 14 | def 
#######################################################################
# Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com)    #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

from ..network import *
from ..component import *
from ..utils import *
from .BaseAgent import *


class NStepDQNAgent(BaseAgent):
    """n-step DQN: synchronous rollouts of `config.rollout_length` steps,
    each followed by one TD update toward n-step bootstrapped targets."""

    def __init__(self, config):
        BaseAgent.__init__(self, config)
        self.config = config
        self.task = config.task_fn()
        self.network = config.network_fn()
        self.target_network = config.network_fn()
        self.optimizer = config.optimizer_fn(self.network.parameters())
        self.target_network.load_state_dict(self.network.state_dict())

        self.total_steps = 0
        self.states = self.task.reset()

    def step(self):
        """Collect one rollout and apply one n-step Q-learning update."""
        config = self.config
        rollout = Storage(config.rollout_length)

        states = self.states
        for _ in range(config.rollout_length):
            q = self.network(self.config.state_normalizer(states))['q']

            eps = config.random_action_prob(config.num_workers)
            actions = epsilon_greedy(eps, to_np(q))

            next_states, rewards, terminals, info = self.task.step(actions)
            self.record_online_return(info)
            rewards = config.reward_normalizer(rewards)

            rollout.feed({'q': q,
                          'action': tensor(actions).unsqueeze(-1).long(),
                          'reward': tensor(rewards).unsqueeze(-1),
                          'mask': tensor(1 - terminals).unsqueeze(-1)})

            states = next_states

            self.total_steps += config.num_workers
            # Refresh the target network every target_network_update_freq
            # rollout steps (total_steps advances num_workers per step).
            if self.total_steps // config.num_workers % config.target_network_update_freq == 0:
                self.target_network.load_state_dict(self.network.state_dict())

        self.states = states

        rollout.placeholder()

        # Bootstrap from the target network's greedy value of the final state,
        # then accumulate discounted returns backwards through the rollout.
        ret = self.target_network(config.state_normalizer(states))['q'].detach()
        ret = torch.max(ret, dim=1, keepdim=True)[0]
        for i in reversed(range(config.rollout_length)):
            ret = rollout.reward[i] + config.discount * rollout.mask[i] * ret
            rollout.ret[i] = ret

        batch = rollout.extract(['q', 'action', 'ret'])
        td_loss = 0.5 * (batch.q.gather(1, batch.action) - batch.ret).pow(2).mean()
        self.optimizer.zero_grad()
        td_loss.backward()
        nn.utils.clip_grad_norm_(self.network.parameters(), config.gradient_clip)
        self.optimizer.step()
#######################################################################
# Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com)    #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

from ..network import *
from ..component import *
from .BaseAgent import *


class OptionCriticAgent(BaseAgent):
    """Option-Critic: jointly learns intra-option policies, option values,
    and termination functions."""

    def __init__(self, config):
        BaseAgent.__init__(self, config)
        self.config = config
        self.task = config.task_fn()
        self.network = config.network_fn()
        self.target_network = config.network_fn()
        self.optimizer = config.optimizer_fn(self.network.parameters())
        self.target_network.load_state_dict(self.network.state_dict())

        self.total_steps = 0
        self.worker_index = tensor(np.arange(config.num_workers)).long()

        self.states = self.config.state_normalizer(self.task.reset())
        # At episode starts the option is re-sampled unconditionally.
        self.is_initial_states = tensor(np.ones((config.num_workers))).byte()
        self.prev_options = self.is_initial_states.clone().long()

    def sample_option(self, prediction, epsilon, prev_option, is_intial_states):
        # NOTE(review): parameter name `is_intial_states` is a typo for
        # "is_initial_states"; kept as-is to preserve the call signature.
        with torch.no_grad():
            q_option = prediction['q']
            # Epsilon-greedy distribution over options.
            pi_option = torch.zeros_like(q_option).add(epsilon / q_option.size(1))
            greedy_option = q_option.argmax(dim=-1, keepdim=True)
            prob = 1 - epsilon + epsilon / q_option.size(1)
            prob = torch.zeros_like(pi_option).add(prob)
            pi_option.scatter_(1, greedy_option, prob)

            # pi_hat keeps the previous option with prob (1 - beta) and
            # re-samples from the epsilon-greedy distribution otherwise.
            mask = torch.zeros_like(q_option)
            mask[self.worker_index, prev_option] = 1
            beta = prediction['beta']
            pi_hat_option = (1 - beta) * mask + beta * pi_option

            dist = torch.distributions.Categorical(probs=pi_option)
            options = dist.sample()
            dist = torch.distributions.Categorical(probs=pi_hat_option)
            options_hat = dist.sample()

            # Initial states draw a fresh option; others may keep the old one.
            options = torch.where(is_intial_states, options, options_hat)
        return options

    def step(self):
        """One rollout plus one joint update of q / policy / termination heads."""
        config = self.config
        storage = Storage(config.rollout_length, ['beta', 'option', 'beta_advantage', 'prev_option', 'init_state', 'eps'])

        for _ in range(config.rollout_length):
            prediction = self.network(self.states)
            epsilon = config.random_option_prob(config.num_workers)
            options = self.sample_option(prediction, epsilon, self.prev_options, self.is_initial_states)
            # Select the intra-option policy of the sampled option per worker.
            prediction['pi'] = prediction['pi'][self.worker_index, options]
            prediction['log_pi'] = prediction['log_pi'][self.worker_index, options]
            dist = torch.distributions.Categorical(probs=prediction['pi'])
            actions = dist.sample()
            entropy = dist.entropy()

            next_states, rewards, terminals, info = self.task.step(to_np(actions))
            self.record_online_return(info)
            next_states = config.state_normalizer(next_states)
            rewards = config.reward_normalizer(rewards)
            storage.feed(prediction)
            storage.feed({'reward': tensor(rewards).unsqueeze(-1),
                          'mask': tensor(1 - terminals).unsqueeze(-1),
                          'option': options.unsqueeze(-1),
                          'prev_option': self.prev_options.unsqueeze(-1),
                          'entropy': entropy.unsqueeze(-1),
                          'action': actions.unsqueeze(-1),
                          'init_state': self.is_initial_states.unsqueeze(-1).float(),
                          'eps': epsilon})

            self.is_initial_states = tensor(terminals).byte()
            self.prev_options = options
            self.states = next_states

            self.total_steps += config.num_workers
            if self.total_steps // config.num_workers % config.target_network_update_freq == 0:
                self.target_network.load_state_dict(self.network.state_dict())

        with torch.no_grad():
            prediction = self.target_network(self.states)
            storage.placeholder()
            # Bootstrapped value of the final state under the current option:
            # continue with prob (1 - beta), else switch to the best option.
            betas = prediction['beta'][self.worker_index, self.prev_options]
            ret = (1 - betas) * prediction['q'][self.worker_index, self.prev_options] + \
                  betas * torch.max(prediction['q'], dim=-1)[0]
            ret = ret.unsqueeze(-1)

        for i in reversed(range(config.rollout_length)):
            ret = storage.reward[i] + config.discount * storage.mask[i] * ret
            adv = ret - storage.q[i].gather(1, storage.option[i])
            storage.ret[i] = ret
            storage.advantage[i] = adv

            # Value of the epsilon-greedy option policy, used in the
            # termination (beta) advantage below.
            v = storage.q[i].max(dim=-1, keepdim=True)[0] * (1 - storage.eps[i]) + storage.q[i].mean(-1).unsqueeze(-1) * \
                storage.eps[i]
            q = storage.q[i].gather(1, storage.prev_option[i])
            storage.beta_advantage[i] = q - v + config.termination_regularizer

        entries = storage.extract(
            ['q', 'beta', 'log_pi', 'ret', 'advantage', 'beta_advantage', 'entropy', 'option', 'action', 'init_state', 'prev_option'])

        q_loss = (entries.q.gather(1, entries.option) - entries.ret.detach()).pow(2).mul(0.5).mean()
        pi_loss = -(entries.log_pi.gather(1,
                                          entries.action) * entries.advantage.detach()) - config.entropy_weight * entries.entropy
        pi_loss = pi_loss.mean()
        # Termination loss is masked out on initial states (no previous option).
        beta_loss = (entries.beta.gather(1, entries.prev_option) * entries.beta_advantage.detach() * (1 - entries.init_state)).mean()

        self.optimizer.zero_grad()
        (pi_loss + q_loss + beta_loss).backward()
        nn.utils.clip_grad_norm_(self.network.parameters(), config.gradient_clip)
        self.optimizer.step()
#######################################################################
# Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com)    #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

from ..network import *
from ..component import *
from .BaseAgent import *


class PPOAgent(BaseAgent):
    """Proximal Policy Optimization with the clipped surrogate objective.

    Supports either a shared actor-critic trunk (`config.shared_repr`) with a
    single optimizer and linear LR decay, or separate actor/critic optimizers
    with an approximate-KL early stop on the policy update.
    """

    def __init__(self, config):
        BaseAgent.__init__(self, config)
        self.config = config
        self.task = config.task_fn()
        self.network = config.network_fn()
        if config.shared_repr:
            self.opt = config.optimizer_fn(self.network.parameters())
        else:
            self.actor_opt = config.actor_opt_fn(self.network.actor_params)
            self.critic_opt = config.critic_opt_fn(self.network.critic_params)
        self.total_steps = 0
        self.states = self.task.reset()
        self.states = config.state_normalizer(self.states)
        if config.shared_repr:
            # Linear learning-rate decay over the whole training run.
            self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(self.opt, lambda step: 1 - step / config.max_steps)

    def step(self):
        """Collect a rollout, compute GAE advantages, then run several epochs
        of minibatch updates on the clipped surrogate objective."""
        config = self.config
        storage = Storage(config.rollout_length)
        states = self.states
        for _ in range(config.rollout_length):
            prediction = self.network(states)
            next_states, rewards, terminals, info = self.task.step(to_np(prediction['action']))
            self.record_online_return(info)
            rewards = config.reward_normalizer(rewards)
            next_states = config.state_normalizer(next_states)
            storage.feed(prediction)
            storage.feed({'reward': tensor(rewards).unsqueeze(-1),
                          'mask': tensor(1 - terminals).unsqueeze(-1),
                          'state': tensor(states)})
            states = next_states
            self.total_steps += config.num_workers

        self.states = states
        # Bootstrap value for the final state.
        prediction = self.network(states)
        storage.feed(prediction)
        storage.placeholder()

        advantages = tensor(np.zeros((config.num_workers, 1)))
        returns = prediction['v'].detach()
        # Backward sweep: n-step returns plus plain or GAE advantages.
        for i in reversed(range(config.rollout_length)):
            returns = storage.reward[i] + config.discount * storage.mask[i] * returns
            if not config.use_gae:
                advantages = returns - storage.v[i].detach()
            else:
                td_error = storage.reward[i] + config.discount * storage.mask[i] * storage.v[i + 1] - storage.v[i]
                advantages = advantages * config.gae_tau * config.discount * storage.mask[i] + td_error
            storage.advantage[i] = advantages.detach()
            storage.ret[i] = returns.detach()

        entries = storage.extract(['state', 'action', 'log_pi_a', 'ret', 'advantage'])
        EntryCLS = entries.__class__
        entries = EntryCLS(*list(map(lambda x: x.detach(), entries)))
        # Normalize advantages in place across the whole rollout.
        entries.advantage.copy_((entries.advantage - entries.advantage.mean()) / entries.advantage.std())

        if config.shared_repr:
            # NOTE(review): passing a step count to scheduler.step() is a
            # deprecated PyTorch API; kept for behavioral compatibility.
            self.lr_scheduler.step(self.total_steps)

        for _ in range(config.optimization_epochs):
            sampler = random_sample(np.arange(entries.state.size(0)), config.mini_batch_size)
            for batch_indices in sampler:
                batch_indices = tensor(batch_indices).long()
                entry = EntryCLS(*list(map(lambda x: x[batch_indices], entries)))

                prediction = self.network(entry.state, entry.action)
                # Clipped importance-ratio surrogate (PPO objective).
                ratio = (prediction['log_pi_a'] - entry.log_pi_a).exp()
                obj = ratio * entry.advantage
                obj_clipped = ratio.clamp(1.0 - self.config.ppo_ratio_clip,
                                          1.0 + self.config.ppo_ratio_clip) * entry.advantage
                policy_loss = -torch.min(obj, obj_clipped).mean() - config.entropy_weight * prediction['entropy'].mean()

                value_loss = 0.5 * (entry.ret - prediction['v']).pow(2).mean()

                approx_kl = (entry.log_pi_a - prediction['log_pi_a']).mean()
                if config.shared_repr:
                    self.opt.zero_grad()
                    (policy_loss + value_loss).backward()
                    nn.utils.clip_grad_norm_(self.network.parameters(), config.gradient_clip)
                    self.opt.step()
                else:
                    # Skip the policy update when the KL to the behavior policy
                    # is already too large; the critic is always updated.
                    if approx_kl <= 1.5 * config.target_kl:
                        self.actor_opt.zero_grad()
                        policy_loss.backward()
                        self.actor_opt.step()
                    self.critic_opt.zero_grad()
                    value_loss.backward()
                    self.critic_opt.step()
-------------------------------------------------------------------------------- /cluster_exp/workloads/models/deep_rl/agent/QuantileRegressionDQN_agent.py: -------------------------------------------------------------------------------- 1 | ####################################################################### 2 | # Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com) # 3 | # Permission given to modify the code as long as you keep this # 4 | # declaration at the top # 5 | ####################################################################### 6 | 7 | from ..network import * 8 | from ..component import * 9 | from ..utils import * 10 | from .BaseAgent import * 11 | from .DQN_agent import * 12 | 13 | 14 | class QuantileRegressionDQNActor(DQNActor): 15 | def __init__(self, config): 16 | super().__init__(config) 17 | 18 | def compute_q(self, prediction): 19 | q_values = prediction['quantile'].mean(-1) 20 | return to_np(q_values) 21 | 22 | 23 | class QuantileRegressionDQNAgent(DQNAgent): 24 | def __init__(self, config): 25 | BaseAgent.__init__(self, config) 26 | self.config = config 27 | config.lock = mp.Lock() 28 | 29 | self.replay = config.replay_fn() 30 | self.actor = QuantileRegressionDQNActor(config) 31 | 32 | self.network = config.network_fn() 33 | self.network.share_memory() 34 | self.target_network = config.network_fn() 35 | self.target_network.load_state_dict(self.network.state_dict()) 36 | self.optimizer = config.optimizer_fn(self.network.parameters()) 37 | 38 | self.actor.set_network(self.network) 39 | 40 | self.total_steps = 0 41 | self.batch_indices = range_tensor(config.batch_size) 42 | 43 | self.quantile_weight = 1.0 / self.config.num_quantiles 44 | self.cumulative_density = tensor( 45 | (2 * np.arange(self.config.num_quantiles) + 1) / (2.0 * self.config.num_quantiles)).view(1, -1) 46 | 47 | def eval_step(self, state): 48 | self.config.state_normalizer.set_read_only() 49 | state = self.config.state_normalizer(state) 50 | q = 
    def compute_loss(self, transitions):
        """Quantile-regression Huber loss for one sampled batch.

        Returns a per-sample loss vector (reduced later by reduce_loss).
        """
        states = self.config.state_normalizer(transitions.state)
        next_states = self.config.state_normalizer(transitions.next_state)

        # Target quantiles for the greedy next action; argmax over the sum of
        # quantiles is equivalent to argmax over their mean.
        quantiles_next = self.target_network(next_states)['quantile'].detach()
        a_next = torch.argmax(quantiles_next.sum(-1), dim=-1)
        quantiles_next = quantiles_next[self.batch_indices, a_next, :]

        # n-step Bellman backup applied element-wise to each target quantile;
        # mask zeroes out the bootstrap term at episode boundaries.
        rewards = tensor(transitions.reward).unsqueeze(-1)
        masks = tensor(transitions.mask).unsqueeze(-1)
        quantiles_next = rewards + self.config.discount ** self.config.n_step * masks * quantiles_next

        # Online-network quantiles for the actions actually taken.
        quantiles = self.network(states)['quantile']
        actions = tensor(transitions.action).long()
        quantiles = quantiles[self.batch_indices, actions, :]

        # Broadcast to all (target-quantile, predicted-quantile) pairs:
        # after t().unsqueeze(-1) the diff appears to be
        # (n_quantiles, batch, n_quantiles) — TODO confirm shapes.
        quantiles_next = quantiles_next.t().unsqueeze(-1)
        diff = quantiles_next - quantiles
        # Quantile-regression weight |tau - 1{diff < 0}| with tau taken at the
        # quantile midpoints (cumulative_density, built in __init__); `huber`
        # comes from the package utils.
        loss = huber(diff) * (self.cumulative_density - (diff.detach() < 0).float()).abs()
        return loss.sum(-1).mean(1)

    def reduce_loss(self, loss):
        # Scalar loss for the optimizer: mean over the batch.
        return loss.mean()
self.config = config 17 | self.task = config.task_fn() 18 | self.network = config.network_fn() 19 | self.target_network = config.network_fn() 20 | self.target_network.load_state_dict(self.network.state_dict()) 21 | self.replay = config.replay_fn() 22 | self.random_process = config.random_process_fn() 23 | self.total_steps = 0 24 | self.state = None 25 | 26 | def soft_update(self, target, src): 27 | for target_param, param in zip(target.parameters(), src.parameters()): 28 | target_param.detach_() 29 | target_param.copy_(target_param * (1.0 - self.config.target_network_mix) + 30 | param * self.config.target_network_mix) 31 | 32 | def eval_step(self, state): 33 | self.config.state_normalizer.set_read_only() 34 | state = self.config.state_normalizer(state) 35 | action = self.network(state) 36 | self.config.state_normalizer.unset_read_only() 37 | return to_np(action) 38 | 39 | def step(self): 40 | config = self.config 41 | if self.state is None: 42 | self.random_process.reset_states() 43 | self.state = self.task.reset() 44 | self.state = config.state_normalizer(self.state) 45 | 46 | if self.total_steps < config.warm_up: 47 | action = [self.task.action_space.sample()] 48 | else: 49 | action = self.network(self.state) 50 | action = to_np(action) 51 | action += self.random_process.sample() 52 | action = np.clip(action, self.task.action_space.low, self.task.action_space.high) 53 | next_state, reward, done, info = self.task.step(action) 54 | next_state = self.config.state_normalizer(next_state) 55 | self.record_online_return(info) 56 | reward = self.config.reward_normalizer(reward) 57 | 58 | self.replay.feed(dict( 59 | state=self.state, 60 | action=action, 61 | reward=reward, 62 | next_state=next_state, 63 | mask=1-np.asarray(done, dtype=np.int32), 64 | )) 65 | 66 | if done[0]: 67 | self.random_process.reset_states() 68 | self.state = next_state 69 | self.total_steps += 1 70 | 71 | if self.total_steps >= config.warm_up: 72 | transitions = self.replay.sample() 73 | states = 
tensor(transitions.state) 74 | actions = tensor(transitions.action) 75 | rewards = tensor(transitions.reward).unsqueeze(-1) 76 | next_states = tensor(transitions.next_state) 77 | mask = tensor(transitions.mask).unsqueeze(-1) 78 | 79 | a_next = self.target_network(next_states) 80 | noise = torch.randn_like(a_next).mul(config.td3_noise) 81 | noise = noise.clamp(-config.td3_noise_clip, config.td3_noise_clip) 82 | 83 | min_a = float(self.task.action_space.low[0]) 84 | max_a = float(self.task.action_space.high[0]) 85 | a_next = (a_next + noise).clamp(min_a, max_a) 86 | 87 | q_1, q_2 = self.target_network.q(next_states, a_next) 88 | target = rewards + config.discount * mask * torch.min(q_1, q_2) 89 | target = target.detach() 90 | 91 | q_1, q_2 = self.network.q(states, actions) 92 | critic_loss = F.mse_loss(q_1, target) + F.mse_loss(q_2, target) 93 | 94 | self.network.zero_grad() 95 | critic_loss.backward() 96 | self.network.critic_opt.step() 97 | 98 | if self.total_steps % config.td3_delay: 99 | action = self.network(states) 100 | policy_loss = -self.network.q(states, action)[0].mean() 101 | 102 | self.network.zero_grad() 103 | policy_loss.backward() 104 | self.network.actor_opt.step() 105 | 106 | self.soft_update(self.target_network, self.network) 107 | -------------------------------------------------------------------------------- /cluster_exp/workloads/models/deep_rl/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .DQN_agent import * 2 | from .DDPG_agent import * 3 | from .A2C_agent import * 4 | from .CategoricalDQN_agent import * 5 | from .NStepDQN_agent import * 6 | from .QuantileRegressionDQN_agent import * 7 | from .PPO_agent import * 8 | from .OptionCritic_agent import * 9 | from .TD3_agent import * 10 | -------------------------------------------------------------------------------- /cluster_exp/workloads/models/deep_rl/component/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .replay import * 2 | from .random_process import * 3 | from .envs import Task 4 | from .envs import LazyFrames 5 | -------------------------------------------------------------------------------- /cluster_exp/workloads/models/deep_rl/component/envs.py: -------------------------------------------------------------------------------- 1 | ####################################################################### 2 | # Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com) # 3 | # Permission given to modify the code as long as you keep this # 4 | # declaration at the top # 5 | ####################################################################### 6 | 7 | import os 8 | import gym 9 | import numpy as np 10 | import torch 11 | from gym.spaces.box import Box 12 | from gym.spaces.discrete import Discrete 13 | 14 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 15 | from baselines.common.atari_wrappers import FrameStack as FrameStack_ 16 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv, VecEnv 17 | 18 | from ..utils import * 19 | 20 | try: 21 | import roboschool 22 | except ImportError: 23 | pass 24 | 25 | 26 | # adapted from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/envs.py 27 | def make_env(env_id, seed, rank, episode_life=True): 28 | def _thunk(): 29 | random_seed(seed) 30 | if env_id.startswith("dm"): 31 | import dm_control2gym 32 | _, domain, task = env_id.split('-') 33 | env = dm_control2gym.make(domain_name=domain, task_name=task) 34 | else: 35 | env = gym.make(env_id) 36 | is_atari = hasattr(gym.envs, 'atari') and isinstance( 37 | env.unwrapped, gym.envs.atari.atari_env.AtariEnv) 38 | if is_atari: 39 | env = make_atari(env_id) 40 | env.seed(seed + rank) 41 | env = OriginalReturnWrapper(env) 42 | if is_atari: 43 | env = wrap_deepmind(env, 44 | episode_life=episode_life, 45 | clip_rewards=False, 46 | 
class LazyFrames(object):
    """Memory-saving wrapper over a list of observation frames.

    Frames shared between successive observations are stored only once,
    which matters for DQN-style replay buffers holding ~1M observations.
    The stacked array is materialized on demand; convert to a numpy array
    before passing to the model.
    """

    def __init__(self, frames):
        self._frames = frames

    def __array__(self, dtype=None):
        stacked = np.concatenate(self._frames, axis=0)
        return stacked if dtype is None else stacked.astype(dtype)

    def __len__(self):
        return self.__array__().shape[0]

    def __getitem__(self, i):
        return self.__array__()[i]
class RandomProcess(object):
    """Base class for exploration-noise processes."""

    def reset_states(self):
        pass


class GaussianProcess(RandomProcess):
    """I.i.d. Gaussian noise scaled by a (schedule) callable `std`."""

    def __init__(self, size, std):
        self.size = size
        self.std = std

    def sample(self):
        scale = self.std()
        return np.random.randn(*self.size) * scale


class OrnsteinUhlenbeckProcess(RandomProcess):
    """Temporally correlated (OU) noise, mean-reverting toward mu = 0.

    dx = theta * (mu - x) * dt + std() * sqrt(dt) * N(0, 1)
    """

    def __init__(self, size, std, theta=.15, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = 0
        self.std = std
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.std() * np.sqrt(self.dt) * np.random.randn(*self.size)
        self.x_prev = self.x_prev + drift + diffusion
        return self.x_prev

    def reset_states(self):
        # Restart from x0 if given, otherwise from the origin.
        self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size)
class FCBody(nn.Module):
    """Fully-connected feature extractor.

    A stack of Linear (or NoisyLinear, when noisy_linear=True) layers,
    each followed by the `gate` nonlinearity (ReLU by default).
    """

    def __init__(self, state_dim, hidden_units=(64, 64), gate=F.relu, noisy_linear=False):
        super(FCBody, self).__init__()
        # Layer widths: state_dim -> hidden_units[0] -> ... -> hidden_units[-1].
        dims = (state_dim,) + hidden_units
        if noisy_linear:
            self.layers = nn.ModuleList(
                [NoisyLinear(dim_in, dim_out) for dim_in, dim_out in zip(dims[:-1], dims[1:])])
        else:
            self.layers = nn.ModuleList(
                [layer_init(nn.Linear(dim_in, dim_out)) for dim_in, dim_out in zip(dims[:-1], dims[1:])])

        self.gate = gate
        # Downstream network heads read feature_dim to size their input layer.
        self.feature_dim = dims[-1]
        self.noisy_linear = noisy_linear

    def reset_noise(self):
        # Only NoisyLinear layers carry noise; for plain Linear this is a no-op.
        if self.noisy_linear:
            for layer in self.layers:
                layer.reset_noise()

    def forward(self, x):
        for layer in self.layers:
            x = self.gate(layer(x))
        return x
def layer_init(layer, w_scale=1.0):
    """Orthogonally initialize *layer*'s weight (scaled by w_scale),
    zero its bias, and return the layer for inline use."""
    weight = layer.weight.data
    nn.init.orthogonal_(weight)
    weight.mul_(w_scale)
    nn.init.constant_(layer.bias.data, 0)
    return layer
self.weight_mu + self.weight_sigma.mul(self.weight_epsilon) 57 | bias = self.bias_mu + self.bias_sigma.mul(self.bias_epsilon) 58 | else: 59 | weight = self.weight_mu 60 | bias = self.bias_mu 61 | 62 | return F.linear(x, weight, bias) 63 | 64 | def reset_parameters(self): 65 | mu_range = 1 / math.sqrt(self.weight_mu.size(1)) 66 | 67 | self.weight_mu.data.uniform_(-mu_range, mu_range) 68 | self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1))) 69 | 70 | self.bias_mu.data.uniform_(-mu_range, mu_range) 71 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0))) 72 | 73 | def reset_noise(self): 74 | self.noise_in.normal_(std=Config.NOISY_LAYER_STD) 75 | self.noise_out_weight.normal_(std=Config.NOISY_LAYER_STD) 76 | self.noise_out_bias.normal_(std=Config.NOISY_LAYER_STD) 77 | 78 | self.weight_epsilon.copy_(self.transform_noise(self.noise_out_weight).ger( 79 | self.transform_noise(self.noise_in))) 80 | self.bias_epsilon.copy_(self.transform_noise(self.noise_out_bias)) 81 | 82 | def transform_noise(self, x): 83 | return x.sign().mul(x.abs().sqrt()) 84 | -------------------------------------------------------------------------------- /cluster_exp/workloads/models/deep_rl/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import * 2 | from .normalizer import * 3 | from .misc import * 4 | from .logger import * 5 | from .plot import Plotter 6 | from .schedule import * 7 | from .torch_utils import * 8 | from .sum_tree import * 9 | -------------------------------------------------------------------------------- /cluster_exp/workloads/models/deep_rl/utils/config.py: -------------------------------------------------------------------------------- 1 | ####################################################################### 2 | # Copyright (C) 2017 Shangtong Zhang(zhangshangtong.cpp@gmail.com) # 3 | # Permission given to modify the code as long as you keep this # 4 | 
# declaration at the top # 5 | ####################################################################### 6 | from .normalizer import * 7 | import argparse 8 | import torch 9 | 10 | 11 | class Config: 12 | DEVICE = torch.device('cpu') 13 | NOISY_LAYER_STD = 0.1 14 | DEFAULT_REPLAY = 'replay' 15 | PRIORITIZED_REPLAY = 'prioritized_replay' 16 | 17 | def __init__(self): 18 | self.parser = argparse.ArgumentParser() 19 | self.task_fn = None 20 | self.optimizer_fn = None 21 | self.actor_optimizer_fn = None 22 | self.critic_optimizer_fn = None 23 | self.network_fn = None 24 | self.actor_network_fn = None 25 | self.critic_network_fn = None 26 | self.replay_fn = None 27 | self.random_process_fn = None 28 | self.discount = None 29 | self.target_network_update_freq = None 30 | self.exploration_steps = None 31 | self.log_level = 0 32 | self.history_length = None 33 | self.double_q = False 34 | self.tag = 'vanilla' 35 | self.num_workers = 1 36 | self.gradient_clip = None 37 | self.entropy_weight = 0 38 | self.use_gae = False 39 | self.gae_tau = 1.0 40 | self.target_network_mix = 0.001 41 | self.state_normalizer = RescaleNormalizer() 42 | self.reward_normalizer = RescaleNormalizer() 43 | self.min_memory_size = None 44 | self.max_steps = 0 45 | self.rollout_length = None 46 | self.value_loss_weight = 1.0 47 | self.iteration_log_interval = 30 48 | self.categorical_v_min = None 49 | self.categorical_v_max = None 50 | self.categorical_n_atoms = 51 51 | self.num_quantiles = None 52 | self.optimization_epochs = 4 53 | self.mini_batch_size = 64 54 | self.termination_regularizer = 0 55 | self.sgd_update_frequency = None 56 | self.random_action_prob = None 57 | self.__eval_env = None 58 | self.log_interval = int(1e3) 59 | self.save_interval = 0 60 | self.eval_interval = 0 61 | self.eval_episodes = 10 62 | self.async_actor = True 63 | self.tasks = False 64 | self.replay_type = Config.DEFAULT_REPLAY 65 | self.decaying_lr = False 66 | self.shared_repr = False 67 | self.noisy_linear = False 68 
def get_logger(tag='default', log_level=0):
    """Build the project Logger: root console logging plus an optional
    per-tag file handler under ./log/."""
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if tag is not None:
        fh = logging.FileHandler('./log/%s-%s.txt' % (tag, get_time_str()))
        fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s'))
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)
    return Logger(logger, './tf_log/logger-%s-%s' % (tag, get_time_str()), log_level)


class Logger(object):
    """Thin wrapper combining a stdlib logger with a lazily-created
    TensorBoard SummaryWriter."""

    def __init__(self, vanilla_logger, log_dir, log_level=0):
        self.log_level = log_level
        self.writer = None  # created on first add_* call
        if vanilla_logger is not None:
            # Borrow the plain-text logging methods directly.
            self.info = vanilla_logger.info
            self.debug = vanilla_logger.debug
            self.warning = vanilla_logger.warning
        self.all_steps = {}  # per-tag auto-incrementing step counters
        self.log_dir = log_dir

    def lazy_init_writer(self):
        # Defer SummaryWriter construction until something is actually logged.
        if self.writer is None:
            self.writer = SummaryWriter(self.log_dir)

    def to_numpy(self, v):
        return v.cpu().detach().numpy() if isinstance(v, torch.Tensor) else v

    def get_step(self, tag):
        # Return the current step for `tag`, then advance it.
        step = self.all_steps.setdefault(tag, 0)
        self.all_steps[tag] = step + 1
        return step

    def add_scalar(self, tag, value, step=None, log_level=0):
        self.lazy_init_writer()
        if log_level > self.log_level:
            return
        value = self.to_numpy(value)
        if step is None:
            step = self.get_step(tag)
        if np.isscalar(value):
            value = np.asarray([value])
        self.writer.add_scalar(tag, value, step)

    def add_histogram(self, tag, values, step=None, log_level=0):
        self.lazy_init_writer()
        if log_level > self.log_level:
            return
        values = self.to_numpy(values)
        if step is None:
            step = self.get_step(tag)
        self.writer.add_histogram(tag, values, step)
def random_sample(indices, batch_size):
    """Yield random mini-batches of `batch_size` items from `indices`;
    a smaller remainder batch, if any, is yielded last."""
    shuffled = np.asarray(np.random.permutation(indices))
    n_full = len(shuffled) // batch_size
    for batch in shuffled[:n_full * batch_size].reshape(-1, batch_size):
        yield batch
    remainder = len(shuffled) - n_full * batch_size
    if remainder:
        yield shuffled[-remainder:]
def translate(pattern):
    """Escape literal dots in *pattern* for use in a regular expression.

    Only '.' is escaped; other regex metacharacters are left untouched
    (use re.escape if full escaping is ever needed).
    """
    # Fix: the original used the string '\.' which is an invalid escape
    # sequence (DeprecationWarning; SyntaxWarning on Python 3.12+). The raw
    # string r'\.' produces the same backslash-dot replacement explicitly.
    return pattern.replace('.', r'\.')
class MeanStdNormalizer(BaseNormalizer):
    """Online z-score normalizer backed by a running mean/variance estimate.

    Statistics are updated on every call unless read-only; output is
    clipped to [-clip, clip].
    """

    def __init__(self, read_only=False, clip=10.0, epsilon=1e-8):
        BaseNormalizer.__init__(self, read_only)
        self.read_only = read_only
        self.rms = None  # RunningMeanStd; created lazily on first call
        self.clip = clip
        self.epsilon = epsilon  # numerical floor under the variance

    def __call__(self, x):
        x = np.asarray(x)
        if self.rms is None:
            # Size the statistics from the first batch (leading dim is batch).
            self.rms = RunningMeanStd(shape=(1,) + x.shape[1:])
        if not self.read_only:
            self.rms.update(x)
        return np.clip((x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
                       -self.clip, self.clip)

    def state_dict(self):
        # NOTE(review): raises AttributeError if called before the first
        # __call__ (self.rms is still None) — confirm callers guard this.
        return {'mean': self.rms.mean,
                'var': self.rms.var}

    def load_state_dict(self, saved):
        self.rms.mean = saved['mean']
        self.rms.var = saved['var']

class RescaleNormalizer(BaseNormalizer):
    """Multiply the input by a fixed coefficient (identity when coef == 1)."""

    def __init__(self, coef=1.0):
        BaseNormalizer.__init__(self)
        self.coef = coef

    def __call__(self, x):
        # Torch tensors pass through unchanged in type; everything else is
        # coerced to a numpy array first.
        if not isinstance(x, torch.Tensor):
            x = np.asarray(x)
        return self.coef * x


class ImageNormalizer(RescaleNormalizer):
    """Scale uint8 pixel values into [0, 1]."""

    def __init__(self):
        RescaleNormalizer.__init__(self, 1.0 / 255)


class SignNormalizer(BaseNormalizer):
    """Reward clipping to {-1, 0, +1} via sign."""

    def __call__(self, x):
        return np.sign(x)
class ConstantSchedule:
    """Schedule that returns the same value on every call."""

    def __init__(self, val):
        self.val = val

    def __call__(self, steps=1):
        return self.val


class LinearSchedule:
    """Linearly anneal from ``start`` to ``end`` over ``steps`` increments.

    Each call returns the current value, then advances it by
    ``inc * steps``, clamped at ``end`` (works for both increasing and
    decreasing schedules). With only ``start`` given the schedule is
    constant.
    """

    def __init__(self, start, end=None, steps=None):
        if end is None:
            end = start
        if steps is None:
            # Bug fix: the original only defaulted steps when end was None,
            # so LinearSchedule(a, b) crashed with float(None).
            steps = 1
        self.inc = (end - start) / float(steps)
        self.current = start
        self.end = end
        # min clamps an increasing schedule from above, max a decreasing
        # one from below.
        self.bound = min if end > start else max

    def __call__(self, steps=1):
        val = self.current
        self.current = self.bound(self.current + self.inc * steps, self.end)
        return val
def to_np(t):
    """Detach *t* from the autograd graph and return it as a NumPy array on the CPU."""
    return t.detach().cpu().numpy()
def epsilon_greedy(epsilon, x):
    """Select action(s) epsilon-greedily from value estimates ``x``.

    Args:
        epsilon: exploration probability in [0, 1].
        x: 1-D array of action values for a single state, or a 2-D array
           of shape (batch, actions) for a batch of states.

    Returns:
        An action index (1-D input) or an array of action indices
        (2-D input).

    Raises:
        NotImplementedError: if ``x`` has more than 2 dimensions.
    """
    if len(x.shape) == 1:
        # Single state: explore with probability epsilon, else greedy.
        if np.random.rand() < epsilon:
            return np.random.randint(len(x))
        return np.argmax(x)
    if len(x.shape) == 2:
        # Batched: draw one exploration coin per row.
        random_actions = np.random.randint(x.shape[1], size=x.shape[0])
        greedy_actions = np.argmax(x, axis=-1)
        dice = np.random.rand(x.shape[0])
        return np.where(dice < epsilon, random_actions, greedy_actions)
    # Bug fix: previously fell through and silently returned None for
    # higher-rank inputs; fail loudly instead (consistent with Grads.mul).
    raise NotImplementedError
class BatchCategorical:
    """Categorical distribution over the last dimension of batched logits.

    Logits of shape (*batch, actions) are flattened into one Categorical;
    results are reshaped back to the caller's batch layout with a trailing
    singleton dimension.
    """

    def __init__(self, logits):
        # Remember the leading (batch) dimensions so outputs can be
        # restored to the caller's layout.
        self.pre_shape = logits.size()[:-1]
        flat_logits = logits.view(-1, logits.size(-1))
        self.dist = torch.distributions.Categorical(logits=flat_logits)

    def log_prob(self, action):
        flat = self.dist.log_prob(action.view(-1))
        return flat.view(action.size()[:-1] + (-1,))

    def entropy(self):
        return self.dist.entropy().view(self.pre_shape + (-1,))

    def sample(self, sample_shape=torch.Size([])):
        drawn = self.dist.sample(sample_shape)
        return drawn.view(sample_shape + self.pre_shape + (-1,))
def escape_float(x):
    """Render *x* as a string with '.' escaped for use in a regex pattern.

    e.g. 1.5 -> '1\\.5'.
    """
    # r'\.' avoids the invalid-escape-sequence warning ('\.' is a
    # SyntaxWarning on modern Python); the produced text is unchanged.
    return ('%s' % x).replace('.', r'\.')
19 | Config.DEVICE = self.device 20 | kwargs = dict() 21 | kwargs['log_level'] = 0 22 | kwargs['n_step'] = 1 23 | kwargs['replay_cls'] = UniformReplay 24 | kwargs['async_replay'] = False 25 | kwargs['game'] = 'BreakoutNoFrameskip-v4' 26 | kwargs['run'] = 0 27 | config = Config() 28 | config.merge(kwargs) 29 | config.task_fn = lambda: Task(config.game) 30 | config.eval_env = config.task_fn() 31 | config.optimizer_fn = lambda params: torch.optim.RMSprop( 32 | params, lr=0.00025, alpha=0.95, eps=0.01, centered=True) 33 | config.network_fn = lambda: VanillaNet(config.action_dim, NatureConvBody(in_channels=config.history_length)) 34 | config.random_action_prob = LinearSchedule(1.0, 0.01, 1e6) 35 | config.batch_size = self.sargs['batch_size'] 36 | config.discount = 0.99 37 | config.history_length = 4 38 | config.max_steps = int(2e7) 39 | replay_kwargs = dict( 40 | memory_size=int(1e6), 41 | batch_size=config.batch_size, 42 | n_step=config.n_step, 43 | discount=config.discount, 44 | history_length=config.history_length, 45 | ) 46 | config.replay_fn = lambda: ReplayWrapper(config.replay_cls, replay_kwargs, config.async_replay) 47 | config.replay_eps = 0.01 48 | config.replay_alpha = 0.5 49 | config.replay_beta = LinearSchedule(0.4, 1.0, config.max_steps) 50 | 51 | config.state_normalizer = ImageNormalizer() 52 | config.reward_normalizer = SignNormalizer() 53 | config.target_network_update_freq = 10000 54 | config.exploration_steps = config.batch_size 55 | # config.exploration_steps = 100 56 | config.sgd_update_frequency = 4 57 | config.gradient_clip = 5 58 | config.double_q = False 59 | config.async_actor = False 60 | # if hvd.rank()==0: 61 | # print(config) 62 | self.model = DQNAgent(config) 63 | # print("build model!!!!!") 64 | self.optimizer = hvd.DistributedOptimizer(self.model.optimizer, named_parameters=self.model.network.named_parameters(prefix='model'+str(self.idx))) 65 | hvd.broadcast_parameters(self.model.network.state_dict(), root_rank=0) 66 | 
    def forward_backward(self, thread):
        '''
        forward, calculate loss and backward

        Waits for the data-fetching thread started elsewhere, computes the
        DQN loss on the sampled transitions, and accumulates gradients.
        The actual parameter update happens later in comm().
        '''
        # Block until the asynchronous get_data() call finishes and take
        # its sampled transitions.
        thread.join()
        transitions = thread.get_result()

        # Warm-up phase: keep collecting experience, no learning yet.
        if self.model.total_steps < self.config.exploration_steps:
            return

        if self.config.noisy_linear:
            # NoisyNet exploration: resample noise for both networks each
            # update.
            self.model.target_network.reset_noise()
            self.model.network.reset_noise()
        loss = self.model.compute_loss(transitions)
        if isinstance(transitions, PrioritizedTransition):
            # Prioritized replay: refresh priorities from the new TD
            # errors, then apply importance-sampling weights (normalized
            # by the max so weights stay in (0, 1]).
            priorities = loss.abs().add(self.config.replay_eps).pow(self.config.replay_alpha)
            idxs = tensor(transitions.idx).long()
            self.model.replay.update_priorities(zip(to_np(idxs), to_np(priorities)))
            sampling_probs = tensor(transitions.sampling_prob)
            weights = sampling_probs.mul(sampling_probs.size(0)).add(1e-6).pow(-self.config.replay_beta())
            weights = weights / weights.max()
            loss = loss.mul(weights)

        # Reduce the per-sample losses to a scalar and accumulate
        # gradients; optimizer.step() is deferred to comm().
        loss = self.model.reduce_loss(loss)
        self.optimizer.zero_grad()
        loss.backward()
# Note: Because these scripts are tightly coupled to an intracompany platform,
6 | 7 | if [ $# -lt 33 ]; then 8 | echo "usage: model_0, batch-size_0, num_workers0, prefetch_factor0, train_dir0, iter0, job_id0, job_counter0, model_1, batch-size_1, num_workers1, prefetch_factor1, train_dir1, iter1, job_id1, job_counter1, model_2, batch-size_2, num_workers2, prefetch_factor2, train_dir2, iter2, job_id2, job_counter2, model_3, batch-size_3, num_workers3, prefetch_factor3, train_dir3, iter3, job_id3, job_counter3, num_gpu, other_params" 9 | exit -1; 10 | fi 11 | 12 | export MODEL0=$1 13 | shift 14 | export BS0=$1 15 | shift 16 | export NUM_WORKERS0=$1 17 | shift 18 | export PREFETCH_FACTOR0=$1 19 | shift 20 | export TRAIN_DIR0=$1 21 | shift 22 | export ITER0=$1 23 | shift 24 | export JOB_ID0=$1 25 | shift 26 | export JOB_COUNTER0=$1 27 | shift 28 | export MODEL1=$1 29 | shift 30 | export BS1=$1 31 | shift 32 | export NUM_WORKERS1=$1 33 | shift 34 | export PREFETCH_FACTOR1=$1 35 | shift 36 | export TRAIN_DIR1=$1 37 | shift 38 | export ITER1=$1 39 | shift 40 | export JOB_ID1=$1 41 | shift 42 | export JOB_COUNTER1=$1 43 | shift 44 | export MODEL2=$1 45 | shift 46 | export BS2=$1 47 | shift 48 | export NUM_WORKERS2=$1 49 | shift 50 | export PREFETCH_FACTOR2=$1 51 | shift 52 | export TRAIN_DIR2=$1 53 | shift 54 | export ITER2=$1 55 | shift 56 | export JOB_ID2=$1 57 | shift 58 | export JOB_COUNTER2=$1 59 | shift 60 | export MODEL3=$1 61 | shift 62 | export BS3=$1 63 | shift 64 | export NUM_WORKERS3=$1 65 | shift 66 | export PREFETCH_FACTOR3=$1 67 | shift 68 | export TRAIN_DIR3=$1 69 | shift 70 | export ITER3=$1 71 | shift 72 | export JOB_ID3=$1 73 | shift 74 | export JOB_COUNTER3=$1 75 | shift 76 | export NUM_GPU=$1 77 | shift 78 | 79 | THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 80 | echo $THIS_DIR 81 | 82 | #get real datasets -- imagenet-1k 83 | judge_path="$THIS_DIR/datasets/imagenet" 84 | 85 | #get nlp datasets - wikitext 86 | TRAIN_FILE=$THIS_DIR/datasets/wikitext-2-raw/wiki.train.raw 87 | 88 | arg="$@" 89 | echo 
$arg 90 | 91 | if [[ "$MODEL1" == "-1" ]]; then 92 | MODEL1=$MODEL0 93 | fi 94 | if [ $NUM_WORKERS1 -eq -1 ]; then 95 | NUM_WORKERS1=$NUM_WORKERS0 96 | fi 97 | if [ $PREFETCH_FACTOR1 -eq -1 ]; then 98 | PREFETCH_FACTOR1=$PREFETCH_FACTOR0 99 | fi 100 | if [ $BS1 -eq -1 ]; then 101 | BS1=$BS0 102 | fi 103 | 104 | if [[ "$MODEL2" == "-1" ]]; then 105 | MODEL2=$MODEL0 106 | fi 107 | if [ $NUM_WORKERS2 -eq -1 ]; then 108 | NUM_WORKERS2=$NUM_WORKERS0 109 | fi 110 | if [ $PREFETCH_FACTOR2 -eq -1 ]; then 111 | PREFETCH_FACTOR2=$PREFETCH_FACTOR0 112 | fi 113 | if [ $BS2 -eq -1 ]; then 114 | BS2=$BS0 115 | fi 116 | 117 | if [[ "$MODEL3" == "-1" ]]; then 118 | MODEL3=$MODEL0 119 | fi 120 | if [ $NUM_WORKERS3 -eq -1 ]; then 121 | NUM_WORKERS3=$NUM_WORKERS0 122 | fi 123 | if [ $PREFETCH_FACTOR3 -eq -1 ]; then 124 | PREFETCH_FACTOR3=$PREFETCH_FACTOR0 125 | fi 126 | if [ $BS3 -eq -1 ]; then 127 | BS3=$BS0 128 | fi 129 | 130 | # train data path 131 | if [[ "$TRAIN_DIR0" == "-1" ]]; then 132 | if [[ "$MODEL0" == "dqn" ]] || [[ "$MODEL0" == "a2c" ]]; then 133 | TRAIN_DIR0="./" 134 | elif [[ "$MODEL0" == "bert" ]] || [[ "$MODEL0" == "gpt2" ]]; then 135 | TRAIN_DIR0=$TRAIN_FILE 136 | else 137 | TRAIN_DIR0=$judge_path 138 | fi 139 | fi 140 | if [[ "$TRAIN_DIR1" == "-1" ]]; then 141 | if [[ "$MODEL1" == "dqn" ]] || [[ "$MODEL1" == "a2c" ]]; then 142 | TRAIN_DIR1="./" 143 | elif [[ "$MODEL1" == "bert" ]] || [[ "$MODEL1" == "gpt2" ]]; then 144 | TRAIN_DIR1=$TRAIN_FILE 145 | else 146 | TRAIN_DIR1=$judge_path 147 | fi 148 | fi 149 | if [[ "$TRAIN_DIR2" == "-1" ]]; then 150 | if [[ "$MODEL2" == "dqn" ]] || [[ "$MODEL2" == "a2c" ]]; then 151 | TRAIN_DIR2="./" 152 | elif [[ "$MODEL2" == "bert" ]] || [[ "$MODEL2" == "gpt2" ]]; then 153 | TRAIN_DIR2=$TRAIN_FILE 154 | else 155 | TRAIN_DIR2=$judge_path 156 | fi 157 | fi 158 | if [[ "$TRAIN_DIR3" == "-1" ]]; then 159 | if [[ "$MODEL3" == "dqn" ]] || [[ "$MODEL3" == "a2c" ]]; then 160 | TRAIN_DIR3="./" 161 | elif [[ "$MODEL3" == "bert" ]] || [[ 
"$MODEL3" == "gpt2" ]]; then 162 | TRAIN_DIR3=$TRAIN_FILE 163 | else 164 | TRAIN_DIR3=$judge_path 165 | fi 166 | fi 167 | 168 | hostfile=$THIS_DIR/hostfiles/hostfile-[${JOB_ID0}-${JOB_ID1}-${JOB_ID2}-${JOB_ID3}]-[${JOB_COUNTER0}-${JOB_COUNTER1}-${JOB_COUNTER2}-${JOB_COUNTER3}] 169 | echo $hostfile 170 | 171 | # set common command for mpirun 172 | COMMON_CMD="" 173 | 174 | if [ $NUM_GPU -ge 8 ]; then 175 | GPU_PERNODE=8 176 | else 177 | GPU_PERNODE=$NUM_GPU 178 | fi 179 | 180 | echo $NUM_GPU $GPU_PERNODE $COMMON_CMD 181 | 182 | echo "-------------------------------" 183 | echo $MODEL0 $BS0 $MODEL1 $BS1 $MODEL2 $BS2 $MODEL3 $BS3 184 | 185 | ID_MAX=$JOB_ID0 186 | if [ $JOB_ID1 -gt $ID_MAX ]; then 187 | ID_MAX=$JOB_ID1 188 | fi 189 | if [ $JOB_ID2 -gt $ID_MAX ]; then 190 | ID_MAX=$JOB_ID2 191 | fi 192 | if [ $JOB_ID3 -gt $ID_MAX ]; then 193 | ID_MAX=$JOB_ID3 194 | fi 195 | 196 | exec mpirun -n $NUM_GPU --npernode $GPU_PERNODE ${COMMON_CMD} \ 197 | python3 $THIS_DIR/main_real_util.py --model0 $MODEL0 --batch-size0 $BS0 --train-dir0 $TRAIN_DIR0 --num-workers0 ${NUM_WORKERS0} --prefetch-factor0 ${PREFETCH_FACTOR0} --iters0 $ITER0 --job-id0 $JOB_ID0 --model1 $MODEL1 --batch-size1 $BS1 --train-dir1 $TRAIN_DIR1 --num-workers1 ${NUM_WORKERS1} --prefetch-factor1 ${PREFETCH_FACTOR1} --iters1 $ITER1 --job-id1 $JOB_ID1 --model2 $MODEL2 --batch-size2 $BS2 --train-dir2 $TRAIN_DIR2 --num-workers2 ${NUM_WORKERS2} --prefetch-factor2 ${PREFETCH_FACTOR2} --iters2 $ITER2 --job-id2 $JOB_ID2 --model3 $MODEL3 --batch-size3 $BS3 --train-dir3 $TRAIN_DIR3 --num-workers3 ${NUM_WORKERS3} --prefetch-factor3 ${PREFETCH_FACTOR3} --iters3 $ITER3 --job-id3 $JOB_ID3 --this-dir $THIS_DIR $arg >$THIS_DIR/test_${ID_MAX}.txt 198 | -------------------------------------------------------------------------------- /cluster_exp/workloads/run_preenv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Note: Due to the scripts are highly related to 
intracompany platform, 3 | #we only demonstrate the functionality and show the pseudocode of the 4 | #related scripts (e.g., run.sh, prepare_env.sh). Please adjust to your 5 | #platform if you would like to execute the testbed experiment. 6 | 7 | if [ $# -lt 33 ]; then 8 | echo "usage: model_0, batch-size_0, num_workers0, prefetch_factor0, train_dir0, iter0, job_id0, job_counter0, model_1, batch-size_1, num_workers1, prefetch_factor1, train_dir1, iter1, job_id1, job_counter1, model_2, batch-size_2, num_workers2, prefetch_factor2, train_dir2, iter2, job_id2, job_counter2, model_3, batch-size_3, num_workers3, prefetch_factor3, train_dir3, iter3, job_id3, job_counter3, num_gpu, other_params" 9 | exit -1; 10 | fi 11 | 12 | export MODEL0=$1 13 | shift 14 | export BS0=$1 15 | shift 16 | export NUM_WORKERS0=$1 17 | shift 18 | export PREFETCH_FACTOR0=$1 19 | shift 20 | export TRAIN_DIR0=$1 21 | shift 22 | export ITER0=$1 23 | shift 24 | export JOB_ID0=$1 25 | shift 26 | export JOB_COUNTER0=$1 27 | shift 28 | export MODEL1=$1 29 | shift 30 | export BS1=$1 31 | shift 32 | export NUM_WORKERS1=$1 33 | shift 34 | export PREFETCH_FACTOR1=$1 35 | shift 36 | export TRAIN_DIR1=$1 37 | shift 38 | export ITER1=$1 39 | shift 40 | export JOB_ID1=$1 41 | shift 42 | export JOB_COUNTER1=$1 43 | shift 44 | export MODEL2=$1 45 | shift 46 | export BS2=$1 47 | shift 48 | export NUM_WORKERS2=$1 49 | shift 50 | export PREFETCH_FACTOR2=$1 51 | shift 52 | export TRAIN_DIR2=$1 53 | shift 54 | export ITER2=$1 55 | shift 56 | export JOB_ID2=$1 57 | shift 58 | export JOB_COUNTER2=$1 59 | shift 60 | export MODEL3=$1 61 | shift 62 | export BS3=$1 63 | shift 64 | export NUM_WORKERS3=$1 65 | shift 66 | export PREFETCH_FACTOR3=$1 67 | shift 68 | export TRAIN_DIR3=$1 69 | shift 70 | export ITER3=$1 71 | shift 72 | export JOB_ID3=$1 73 | shift 74 | export JOB_COUNTER3=$1 75 | shift 76 | export NUM_GPU=$1 77 | shift 78 | 79 | THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 80 | 
echo $THIS_DIR 81 | 82 | #get real datasets -- imagenet-1k 83 | judge_path="$THIS_DIR/datasets/imagenet" 84 | 85 | #get nlp datasets - wikitext 86 | TRAIN_FILE=$THIS_DIR/datasets/wikitext-2-raw/wiki.train.raw 87 | 88 | arg="$@" 89 | echo $arg 90 | 91 | if [[ "$MODEL1" == "-1" ]]; then 92 | MODEL1=$MODEL0 93 | fi 94 | if [ $NUM_WORKERS1 -eq -1 ]; then 95 | NUM_WORKERS1=$NUM_WORKERS0 96 | fi 97 | if [ $PREFETCH_FACTOR1 -eq -1 ]; then 98 | PREFETCH_FACTOR1=$PREFETCH_FACTOR0 99 | fi 100 | if [ $BS1 -eq -1 ]; then 101 | BS1=$BS0 102 | fi 103 | 104 | if [[ "$MODEL2" == "-1" ]]; then 105 | MODEL2=$MODEL0 106 | fi 107 | if [ $NUM_WORKERS2 -eq -1 ]; then 108 | NUM_WORKERS2=$NUM_WORKERS0 109 | fi 110 | if [ $PREFETCH_FACTOR2 -eq -1 ]; then 111 | PREFETCH_FACTOR2=$PREFETCH_FACTOR0 112 | fi 113 | if [ $BS2 -eq -1 ]; then 114 | BS2=$BS0 115 | fi 116 | 117 | if [[ "$MODEL3" == "-1" ]]; then 118 | MODEL3=$MODEL0 119 | fi 120 | if [ $NUM_WORKERS3 -eq -1 ]; then 121 | NUM_WORKERS3=$NUM_WORKERS0 122 | fi 123 | if [ $PREFETCH_FACTOR3 -eq -1 ]; then 124 | PREFETCH_FACTOR3=$PREFETCH_FACTOR0 125 | fi 126 | if [ $BS3 -eq -1 ]; then 127 | BS3=$BS0 128 | fi 129 | 130 | # train data path 131 | if [[ "$TRAIN_DIR0" == "-1" ]]; then 132 | if [[ "$MODEL0" == "dqn" ]] || [[ "$MODEL0" == "a2c" ]]; then 133 | TRAIN_DIR0="./" 134 | elif [[ "$MODEL0" == "bert" ]] || [[ "$MODEL0" == "gpt2" ]]; then 135 | TRAIN_DIR0=$TRAIN_FILE 136 | else 137 | TRAIN_DIR0=$judge_path 138 | fi 139 | fi 140 | if [[ "$TRAIN_DIR1" == "-1" ]]; then 141 | if [[ "$MODEL1" == "dqn" ]] || [[ "$MODEL1" == "a2c" ]]; then 142 | TRAIN_DIR1="./" 143 | elif [[ "$MODEL1" == "bert" ]] || [[ "$MODEL1" == "gpt2" ]]; then 144 | TRAIN_DIR1=$TRAIN_FILE 145 | else 146 | TRAIN_DIR1=$judge_path 147 | fi 148 | fi 149 | if [[ "$TRAIN_DIR2" == "-1" ]]; then 150 | if [[ "$MODEL2" == "dqn" ]] || [[ "$MODEL2" == "a2c" ]]; then 151 | TRAIN_DIR2="./" 152 | elif [[ "$MODEL2" == "bert" ]] || [[ "$MODEL2" == "gpt2" ]]; then 153 | TRAIN_DIR2=$TRAIN_FILE 
154 | else 155 | TRAIN_DIR2=$judge_path 156 | fi 157 | fi 158 | if [[ "$TRAIN_DIR3" == "-1" ]]; then 159 | if [[ "$MODEL3" == "dqn" ]] || [[ "$MODEL3" == "a2c" ]]; then 160 | TRAIN_DIR3="./" 161 | elif [[ "$MODEL3" == "bert" ]] || [[ "$MODEL3" == "gpt2" ]]; then 162 | TRAIN_DIR3=$TRAIN_FILE 163 | else 164 | TRAIN_DIR3=$judge_path 165 | fi 166 | fi 167 | 168 | hostfile=$THIS_DIR/hostfiles/hostfile-[${JOB_ID0}-${JOB_ID1}-${JOB_ID2}-${JOB_ID3}]-[${JOB_COUNTER0}-${JOB_COUNTER1}-${JOB_COUNTER2}-${JOB_COUNTER3}] 169 | echo $hostfile 170 | 171 | # set common command for mpirun 172 | COMMON_CMD="" 173 | 174 | if [ $NUM_GPU -ge 8 ]; then 175 | GPU_PERNODE=8 176 | else 177 | GPU_PERNODE=$NUM_GPU 178 | fi 179 | 180 | echo $NUM_GPU $GPU_PERNODE $COMMON_CMD 181 | 182 | echo "-------------------------------" 183 | echo $MODEL0 $BS0 $MODEL1 $BS1 $MODEL2 $BS2 $MODEL3 $BS3 184 | 185 | exec mpirun -n $NUM_GPU --npernode $GPU_PERNODE ${COMMON_CMD} \ 186 | python3 $THIS_DIR/main_real_preenv.py --model0 $MODEL0 --batch-size0 $BS0 --train-dir0 $TRAIN_DIR0 --num-workers0 ${NUM_WORKERS0} --prefetch-factor0 ${PREFETCH_FACTOR0} --iters0 $ITER0 --job-id0 $JOB_ID0 --model1 $MODEL1 --batch-size1 $BS1 --train-dir1 $TRAIN_DIR1 --num-workers1 ${NUM_WORKERS1} --prefetch-factor1 ${PREFETCH_FACTOR1} --iters1 $ITER1 --job-id1 $JOB_ID1 --model2 $MODEL2 --batch-size2 $BS2 --train-dir2 $TRAIN_DIR2 --num-workers2 ${NUM_WORKERS2} --prefetch-factor2 ${PREFETCH_FACTOR2} --iters2 $ITER2 --job-id2 $JOB_ID2 --model3 $MODEL3 --batch-size3 $BS3 --train-dir3 $TRAIN_DIR3 --num-workers3 ${NUM_WORKERS3} --prefetch-factor3 ${PREFETCH_FACTOR3} --iters3 $ITER3 --job-id3 $JOB_ID3 --this-dir $THIS_DIR $arg >$THIS_DIR/test.txt 187 | 188 | -------------------------------------------------------------------------------- /simulator/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | result* 3 | __pycache__ 4 | test* 5 | plot* 6 | trace-data/cluster* 7 | 
- Figure 13: ```bash sim_fig13.sh``` (takes about 1 day)
import csv
import sys

# Usage: python calc.py <result_dir>
# Reads <result_dir>/job.csv and reports the job count, average JCT,
# makespan, and 99th-percentile JCT.
log_path = sys.argv[1]

jct_sum = 0.0
makespan = 0.0
cnt = 0
jct_list = []
# Close the file deterministically (the original left the handle to GC).
with open(log_path + "/job.csv") as f:
    for line_id, line in enumerate(csv.reader(f)):
        if line_id == 0:
            continue  # skip the header row
        # NOTE(review): column -5 is assumed to hold the JCT and column 5
        # the job end time -- confirm against the job.csv writer in log.py.
        jct = float(line[-5])
        jct_sum += jct
        makespan = max(makespan, float(line[5]))
        cnt += 1
        jct_list.append(jct)

if cnt == 0:
    # Bug fix: previously crashed with ZeroDivisionError on an empty log.
    print("No finished jobs found in %s/job.csv" % log_path)
else:
    jct_list.sort()
    print("Total jobs: %d, avg JCT: %.6f, makespan: %.6f, 99th JCT: %.6f"
          % (cnt, jct_sum / cnt, makespan, jct_list[int(cnt * 0.99)]))
| 21 | def _parse_flags(self, args=None): 22 | result, unparsed = _global_parser.parse_known_args(args=args) 23 | for flag_name, val in vars(result).items(): 24 | self.__dict__['__flags'][flag_name] = val 25 | self.__dict__['__parsed'] = True 26 | return unparsed 27 | 28 | def __getattr__(self, name): 29 | """Retrieves the 'value' attribute of the flag --name.""" 30 | try: 31 | parsed = self.__dict__['__parsed'] 32 | except KeyError: 33 | # May happen during pickle.load or copy.copy 34 | raise AttributeError(name) 35 | if not parsed: 36 | self._parse_flags() 37 | if name not in self.__dict__['__flags']: 38 | raise AttributeError(name) 39 | return self.__dict__['__flags'][name] 40 | 41 | def __setattr__(self, name, value): 42 | """Sets the 'value' attribute of the flag --name.""" 43 | if not self.__dict__['__parsed']: 44 | self._parse_flags() 45 | self.__dict__['__flags'][name] = value 46 | 47 | 48 | def _define_helper(flag_name, default_value, docstring, flagtype): 49 | """Registers 'flag_name' with 'default_value' and 'docstring'.""" 50 | _global_parser.add_argument('--' + flag_name, 51 | default=default_value, 52 | help=docstring, 53 | type=flagtype) 54 | 55 | 56 | # Provides the global object that can be used to access flags. 57 | FLAGS = _FlagValues() 58 | 59 | 60 | def DEFINE_string(flag_name, default_value, docstring): 61 | """Defines a flag of type 'string'. 62 | 63 | Args: 64 | flag_name: The name of the flag as a string. 65 | default_value: The default value the flag should take as a string. 66 | docstring: A helpful message explaining the use of the flag. 67 | """ 68 | _define_helper(flag_name, default_value, docstring, str) 69 | 70 | 71 | def DEFINE_integer(flag_name, default_value, docstring): 72 | """Defines a flag of type 'int'. 73 | 74 | Args: 75 | flag_name: The name of the flag as a string. 76 | default_value: The default value the flag should take as an int. 77 | docstring: A helpful message explaining the use of the flag. 
78 | """ 79 | _define_helper(flag_name, default_value, docstring, int) 80 | 81 | 82 | def DEFINE_boolean(flag_name, default_value, docstring): 83 | """Defines a flag of type 'boolean'. 84 | 85 | Args: 86 | flag_name: The name of the flag as a string. 87 | default_value: The default value the flag should take as a boolean. 88 | docstring: A helpful message explaining the use of the flag. 89 | """ 90 | # Register a custom function for 'bool' so --flag=True works. 91 | def str2bool(v): 92 | return v.lower() in ('true', 't', '1') 93 | _global_parser.add_argument('--' + flag_name, 94 | nargs='?', 95 | const=True, 96 | help=docstring, 97 | default=default_value, 98 | type=str2bool) 99 | 100 | # Add negated version, stay consistent with argparse with regard to 101 | # dashes in flag names. 102 | _global_parser.add_argument('--no' + flag_name, 103 | action='store_false', 104 | dest=flag_name.replace('-', '_')) 105 | 106 | 107 | # The internal google library defines the following alias, so we match 108 | # the API for consistency. 109 | DEFINE_bool = DEFINE_boolean # pylint: disable=invalid-name 110 | 111 | 112 | def DEFINE_float(flag_name, default_value, docstring): 113 | """Defines a flag of type 'float'. 114 | 115 | Args: 116 | flag_name: The name of the flag as a string. 117 | default_value: The default value the flag should take as a float. 118 | docstring: A helpful message explaining the use of the flag. 119 | """ 120 | _define_helper(flag_name, default_value, docstring, float) 121 | 122 | 123 | def DEFINE_version(v_string): 124 | _global_parser.add_argument("-v", "--version", action='version', version='%(prog)s ' + v_string, dest='version', 125 | help="display version information") 126 | _allowed_symbols = [ 127 | # We rely on gflags documentation. 
128 | 'DEFINE_bool', 129 | 'DEFINE_boolean', 130 | 'DEFINE_float', 131 | 'DEFINE_integer', 132 | 'DEFINE_string', 133 | 'DEFINE_version', 134 | 'FLAGS', 135 | ] 136 | # remove_undocumented(__name__, _allowed_symbols) -------------------------------------------------------------------------------- /simulator/models.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import random 6 | import util 7 | 8 | m_tensors = [[1.1,2.3,2.3,2.3,4.5,9.0,9.0,9.0,9.0,9.0,9.0,9.0,392.0,64.0,15.6], 9 | [1.1,2.3,2.3,4.5,9.0,9.0,9.0,9.0,9.0,392.0,64.0,15.6], 10 | [1.1,2.3,4.5,9.0,9.0,9.0,392.0,64.0,15.6], 11 | [1.2,2.5,5.1,3.4,144.0,64.0,15.6], 12 | [2.0,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,8.0,2.0,9.0,4.0,4.0,9.0,4.0,4.0,9.0,4.0,7.8], 13 | [2.0,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,2.3,8.0,2.0,9.0,4.0,4.0,9.0,4.0,4.0,9.0,4.0,7.8], 14 | [2.0,2.3,2.3,2.3,2.3,2.3,2.3,8.0,2.0,9.0,4.0,4.0,9.0,4.0,4.0,9.0,4.0,7.8], 15 | [1.3,5.1,1.5,2.0,1.5,1.1,1.5,1.1,1.3,1.5,1.5,1.1,1.5,1.1,1.3,1.5,1.5,1.1,1.5,1.1,1.3,1.5,1.5,1.1,1.5,1.1,1.3,1.5,1.5,1.1,1.5,1.1,1.3,1.5,1.5,1.1,1.5,1.1,1.3,1.5,1.5,1.1,1.5,1.1,1.3,1.5,1.3,1.8,2.2,3.5,1.5,1.5,2.3,1.1,1.1,2.3,2.0,2.6,1.5,1.5,1.5,1.5,2.3,1.1,1.1,2.3,2.0,2.6,1.5,1.5,1.5,1.5,2.3,1.1,1.1,2.3,2.0,2.6,1.5,1.5,5.9], 16 | [3.8,2.1,1.3,1.6,1.9,1.7,1.7,2.2,5.9,1.7,1.7,2.5,3.0,1.7,1.7,3.5,5.9,1.7,1.7,1.5,7.8]] 17 | 18 | 19 | 20 | m_names = ['vgg19', 'vgg16', 'vgg11', 'alexnet', 'resnet152', 'resnet101', 'resnet50', 'inception4', 'inception3'] 21 | # m_mem = [0.60, 0.55, 0.45, 0.13, 0.85, 0.70, 0.50, 0.85, 0.80] 22 | m_mem = [1, 1, 1, 1, 1, 1, 1, 1, 1] 23 | 24 | worker_mem = 5 25 | ps_mem = 8 26 | per_worker_mem = 0.2 27 | 28 | 29 | def 
get_model(model_name): 30 | ''' 31 | get model tensor information by model_name 32 | return a dict{name, tensors(list)} 33 | ''' 34 | if model_name == 'vgg19': 35 | m_idx = 0 36 | elif model_name == 'vgg16': 37 | m_idx = 1 38 | elif model_name == 'vgg11': 39 | m_idx = 2 40 | elif model_name == 'alexnet': 41 | m_idx = 3 42 | elif model_name == 'resnet152': 43 | m_idx = 4 44 | elif model_name == 'resnet101': 45 | m_idx = 5 46 | elif model_name == 'resnet50': 47 | m_idx = 6 48 | elif model_name == 'inception4': 49 | m_idx = 7 50 | elif model_name == 'inception3': 51 | m_idx = 8 52 | else: 53 | # m_idx = random.randint(0,8) 54 | m_idx = 8 55 | util.print_fn('No model match, pick %s' % m_names[m_idx]) 56 | 57 | ret = {'name':m_names[m_idx], 'ind':m_idx, 'tensors':m_tensors[m_idx], 'mem_util':m_mem[m_idx]} 58 | return ret 59 | 60 | def get_model_with_scale(model_name, model_scale): 61 | ''' 62 | get model tensor information by model_name 63 | and extend the number of tensors with model_scale 64 | return a dict{name, tensors(list)} 65 | ''' 66 | ret = get_model(model_name) 67 | ret['tensors'] = ret['tensors'] * int(model_scale) 68 | total_size = 0.0 69 | for i in ret['tensors']: 70 | total_size += i 71 | ret['total_size'] = round(total_size, 1) #float x.x 72 | return ret 73 | 74 | 75 | 76 | # if __name__ == '__main__': 77 | # # print('Hello world %d' % 2) 78 | # print(get_model_with_scale('vgg11', 2)) -------------------------------------------------------------------------------- /simulator/sim_fig10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | placement=("yarn") 3 | # dlas-gpu -- Tiresias; antman -- AntMan; themis -- Themis; multi-resource-blossom-same-gpu-unaware -- Muri-L 4 | schedule=("dlas-gpu" "antman" "themis" "multi-resource-blossom-same-gpu-unaware" ) 5 | # schedule=("themis") 6 | # philly trace 7 | jobs=("trace1" "trace1_pr" "trace2" "trace2_pr" "trace3" "trace3_pr" "trace4" "trace4_pr") 8 | 9 | 
# --- simulator/sim_fig10.sh, lines 9-50 (arrays defined on the previous dump line) ---
setups=("n8g8")
multi_resource=4

# -p: don't fail noisily when re-running into an existing results tree.
mkdir -p results
echo "running..."
for setup in "${setups[@]}"; do
    cluster_spec="cluster_spec/${setup}.csv"
    for job in "${jobs[@]}"; do
        job_file="trace-data/${job}.csv"
        log_folder="results/${setup}j${job}"
        mkdir -p "${log_folder}"
        # echo ${job}
        for p in "${placement[@]}"; do
            for s in "${schedule[@]}"; do
                log_name="${log_folder}/${s}-${p}"
                mkdir -p "${log_name}"
                # Fixed: this debug string previously said "python3" while the
                # command actually executed below uses "python"; keep in sync.
                cmd="python run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >tmp.out"
                # echo ${cmd}
                python run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >"${log_name}/tmp.out" &
            done
        done
    done
done

wait
echo "calc..."
for setup in "${setups[@]}"; do
    cluster_spec="cluster_spec/${setup}.csv"
    for job in "${jobs[@]}"; do
        job_file="trace-data/${job}.csv"
        log_folder="results/${setup}j${job}"
        echo "${job}"
        for p in "${placement[@]}"; do
            for s in "${schedule[@]}"; do
                echo "$s"
                log_name="${log_folder}/${s}-${p}"
                python calc.py "${log_name}"
            done
        done
    done
done

# --- simulator/sim_fig11.sh, lines 1-13 (loop body continues on the next dump line) ---
#!/bin/bash
placement=("yarn")
# multi-resource-blossom-same-gpu-unaware -- Muri-L;
# multi-resource-gpu-unaware -- Muri-L w/o Blossom
# Fixed comment: the next line previously repeated "multi-resource-gpu-unaware";
# it actually describes the -worstordering variant in the schedule array below.
# multi-resource-blossom-same-gpu-unaware-worstordering -- Muri-L w/ worst ordering
schedule=("multi-resource-blossom-same-gpu-unaware" "multi-resource-blossom-same-gpu-unaware-worstordering" "multi-resource-gpu-unaware")

# philly trace
jobs=("trace1" "trace2" "trace3" "trace4")

setups=("n8g8")
multi_resource=4
mkdir results 15 | echo "running..." 16 | for setup in ${setups[@]};do 17 | cluster_spec="cluster_spec/${setup}.csv" 18 | for job in ${jobs[@]};do 19 | job_file="trace-data/${job}.csv" 20 | log_folder="results/${setup}j${job}" 21 | mkdir ${log_folder} 22 | # echo ${job} 23 | for p in ${placement[@]};do 24 | for s in ${schedule[@]};do 25 | log_name="${log_folder}/${s}-${p}" 26 | mkdir $log_name 27 | cmd="python3 run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >tmp.out" 28 | # echo ${cmd} 29 | python run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >${log_name}/tmp.out & 30 | done 31 | done 32 | done 33 | done 34 | 35 | wait 36 | echo "calc..." 37 | for setup in ${setups[@]};do 38 | cluster_spec="cluster_spec/${setup}.csv" 39 | for job in ${jobs[@]};do 40 | job_file="trace-data/${job}.csv" 41 | log_folder="results/${setup}j${job}" 42 | echo ${job} 43 | for p in ${placement[@]};do 44 | for s in ${schedule[@]};do 45 | echo $s 46 | log_name="${log_folder}/${s}-${p}" 47 | echo ${s}-${p} 48 | python calc.py ${log_name} 49 | done 50 | done 51 | done 52 | done 53 | -------------------------------------------------------------------------------- /simulator/sim_fig12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | placement=("yarn") 3 | 4 | schedule=("antman" "multi-resource-blossom-same-gpu-unaware") 5 | 6 | # philly trace 7 | jobs=("trace1_pr" "trace2_pr" "trace3_pr" "trace4_pr") 8 | 9 | setups=("n8g8") 10 | multi_resource=4 11 | packing_num=(2 3 4) 12 | 13 | mkdir results 14 | echo "running..." 
15 | for setup in ${setups[@]};do 16 | cluster_spec="cluster_spec/${setup}.csv" 17 | for job in ${jobs[@]};do 18 | job_file="trace-data/${job}.csv" 19 | log_folder="results/${setup}j${job}" 20 | mkdir ${log_folder} 21 | # echo ${job} 22 | for p in ${placement[@]};do 23 | for s in ${schedule[@]};do 24 | if [ "$s" == "multi-resource-blossom-same-gpu-unaware" ]; then 25 | for pn in ${packing_num[@]};do 26 | log_name="${log_folder}/${s}-${p}-${pn}" 27 | mkdir $log_name 28 | cmd="python3 run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} --packing_num ${pn} >tmp.out" 29 | # echo ${cmd} 30 | python run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} --packing_num ${pn} >${log_name}/tmp.out & 31 | done 32 | else 33 | log_name="${log_folder}/${s}-${p}" 34 | mkdir $log_name 35 | cmd="python3 run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >tmp.out" 36 | # echo ${cmd} 37 | python run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >${log_name}/tmp.out & 38 | fi 39 | done 40 | done 41 | done 42 | done 43 | 44 | wait 45 | echo "calc..." 
46 | for setup in ${setups[@]};do 47 | cluster_spec="cluster_spec/${setup}.csv" 48 | for job in ${jobs[@]};do 49 | job_file="trace-data/${job}.csv" 50 | log_folder="results/${setup}j${job}" 51 | echo ${job} 52 | for p in ${placement[@]};do 53 | for s in ${schedule[@]};do 54 | echo $s 55 | if [ "$s" == "multi-resource-blossom-same-gpu-unaware" ]; then 56 | for pn in ${packing_num[@]};do 57 | log_name="${log_folder}/${s}-${p}-${pn}" 58 | echo ${s}-${p}-${pn} 59 | python calc.py ${log_name} 60 | done 61 | else 62 | log_name="${log_folder}/${s}-${p}" 63 | echo ${s}-${p} 64 | python calc.py ${log_name} 65 | fi 66 | done 67 | done 68 | done 69 | done 70 | -------------------------------------------------------------------------------- /simulator/sim_fig13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | placement=("yarn") 3 | # shortest -- SRTF; multi-resource-blossom-same-gpu -- Muri-S; 4 | # dlas-gpu -- Tiresias; multi-resource-blossom-same-gpu-unaware -- Muri-L 5 | schedule=("shortest" "multi-resource-blossom-same-gpu" "dlas-gpu" "multi-resource-blossom-same-gpu-unaware") 6 | 7 | # philly trace 8 | jobs=("job_type_1" "job_type_2" "job_type_3" "job_type_4") 9 | 10 | setups=("n8g8") 11 | multi_resource=4 12 | 13 | mkdir results 14 | echo "running..." 
15 | for setup in ${setups[@]};do 16 | cluster_spec="cluster_spec/${setup}.csv" 17 | for job in ${jobs[@]};do 18 | job_file="trace-data/${job}.csv" 19 | log_folder="results/${setup}j${job}" 20 | mkdir ${log_folder} 21 | # echo ${job} 22 | for p in ${placement[@]};do 23 | for s in ${schedule[@]};do 24 | log_name="${log_folder}/${s}-${p}" 25 | mkdir $log_name 26 | cmd="python3 run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >tmp.out" 27 | # echo ${cmd} 28 | python run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >${log_name}/tmp.out & 29 | done 30 | done 31 | done 32 | done 33 | 34 | wait 35 | echo "calc..." 36 | for setup in ${setups[@]};do 37 | cluster_spec="cluster_spec/${setup}.csv" 38 | for job in ${jobs[@]};do 39 | job_file="trace-data/${job}.csv" 40 | log_folder="results/${setup}j${job}" 41 | echo ${job} 42 | for p in ${placement[@]};do 43 | for s in ${schedule[@]};do 44 | echo $s 45 | log_name="${log_folder}/${s}-${p}" 46 | python calc.py ${log_name} 47 | done 48 | done 49 | done 50 | done 51 | -------------------------------------------------------------------------------- /simulator/sim_fig9.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | placement=("yarn") 3 | # shortest -- SRTF; shortest-gpu -- SRSF; multi-resource-blossom-same-gpu -- Muri-S 4 | schedule=("shortest" "shortest-gpu" "multi-resource-blossom-same-gpu") 5 | 6 | # philly trace 7 | jobs=("trace1" "trace1_pr" "trace2" "trace2_pr" "trace3" "trace3_pr" "trace4" "trace4_pr") 8 | 9 | setups=("n8g8") 10 | multi_resource=4 11 | 12 | mkdir results 13 | echo "running..." 
14 | for setup in ${setups[@]};do 15 | cluster_spec="cluster_spec/${setup}.csv" 16 | for job in ${jobs[@]};do 17 | job_file="trace-data/${job}.csv" 18 | log_folder="results/${setup}j${job}" 19 | mkdir ${log_folder} 20 | # echo ${job} 21 | for p in ${placement[@]};do 22 | for s in ${schedule[@]};do 23 | log_name="${log_folder}/${s}-${p}" 24 | mkdir $log_name 25 | cmd="python3 run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >tmp.out" 26 | # echo ${cmd} 27 | python run_sim.py --cluster_spec=${cluster_spec} --print --scheme=${p} --trace_file=${job_file} --schedule=${s} --log_path=${log_name} --multi_resource ${multi_resource} >${log_name}/tmp.out & 28 | done 29 | done 30 | done 31 | done 32 | 33 | wait 34 | echo "calc..." 35 | for setup in ${setups[@]};do 36 | cluster_spec="cluster_spec/${setup}.csv" 37 | for job in ${jobs[@]};do 38 | job_file="trace-data/${job}.csv" 39 | log_folder="results/${setup}j${job}" 40 | echo ${job} 41 | for p in ${placement[@]};do 42 | for s in ${schedule[@]};do 43 | echo $s 44 | log_name="${log_folder}/${s}-${p}" 45 | python calc.py ${log_name} 46 | done 47 | done 48 | done 49 | done 50 | -------------------------------------------------------------------------------- /simulator/tf_job.csv: -------------------------------------------------------------------------------- 1 | job_id,num_gpu,submit_time,start_time,duration,model_size,aggr_interval,model_name,model_scale 2 | 10,12,11,14,20,400,0,vgg19,1 3 | 10,12,11,14,20,400,0,vgg19,1 -------------------------------------------------------------------------------- /simulator/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import subprocess 4 | import flags 5 | import logging 6 | import math 7 | FLAGS = flags.FLAGS 8 | 9 | def make_logger(name): 10 | LOG_FORMAT = 
# simulator/util.py -- logging / printing / shell helpers for the simulator.
# (The "def make_logger(name): LOG_FORMAT =" fragment dangling on the
# previous dump line is reconstructed here as the complete definition.)
def make_logger(name):
    """Create a DEBUG-level stderr logger using '{'-style formatting."""
    LOG_FORMAT = '{name}:{levelname} [{asctime}] {message}'

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.propagate = False  # don't duplicate records through the root logger
    ch = logging.StreamHandler()
    ch.setFormatter(logging.Formatter(LOG_FORMAT, style='{'))
    logger.addHandler(ch)

    return logger


def print_fn(log):
    """Print `log` when --print is set; flush stdout if --flush_stdout is set.

    NOTE(review): the flush check is nested under FLAGS.print (flush only
    happens when something was printed) -- behaviour kept as-is.
    """
    if FLAGS.print:
        print(log)
        if FLAGS.flush_stdout:
            sys.stdout.flush()


def mkdir(folder_path):
    """Create folder_path (and any missing parents), like `mkdir -p`.

    Fixed: the original built a shell string ('mkdir -p ' + folder_path) and
    ran it with shell=True, which breaks on paths containing spaces or shell
    metacharacters (and is injection-prone). Passing an argument list avoids
    the shell entirely; check_call still raises on a non-zero exit status.
    """
    ret = subprocess.check_call(['mkdir', '-p', folder_path])
    print_fn(ret)


def search_dict_list(dict_list, key, value):
    '''
    Search the targeted entry in the dict_list.

    Returns the first dict whose entry for `key` is numerically close to
    `value` (math.isclose, rel_tol=1e-9), or None if there is no match.

    NOTE(review): math.isclose requires numeric operands; a non-numeric
    e[key] would raise TypeError -- callers appear to pass numbers only
    (confirm at call sites).
    '''
    for e in dict_list:
        # if e.has_key(key) == True:
        if key in e:
            if math.isclose(e[key], value, rel_tol=1e-9):
                return e

    return None