├── .gitattributes
├── Comparison-methods
│   ├── LICENSE
│   ├── README.md
│   ├── install-requirement
│   ├── kill_tensorflow
│   ├── local_start_slave
│   ├── local_task_executor
│   ├── remote_start_slave
│   ├── remote_task_executor
│   └── start_pytorch
├── Conext-ClusterML-camera-ready-version.pdf
├── MLFS
│   ├── RLmodel.py
│   ├── cluster.py
│   ├── completion_task.py
│   ├── computation.py
│   ├── cpu_task.py
│   ├── epoch_test.py
│   ├── gpu_task.py
│   ├── scheduler.py
│   ├── server.py
│   ├── statistics.py
│   ├── task.py
│   ├── task_executor.py
│   └── train_RL_model.py
├── README.md
├── Simu
│   ├── RLmodel.py
│   ├── cluster.py
│   ├── completion_task.py
│   ├── computation.py
│   ├── cpu_task.py
│   ├── epoch_test.py
│   ├── evaluator.py
│   ├── gpu_task.py
│   ├── scheduler.py
│   ├── server.py
│   ├── start_pytorch
│   ├── statistics.py
│   ├── task.py
│   ├── task_executor.py
│   ├── test.py
│   ├── train_RL_model.py
│   └── workload.py
├── Test
│   ├── cluster.py
│   ├── evaluator.py
│   ├── real-test.py
│   ├── server.py
│   └── test.py
├── Workload
│   └── workload.py
├── other
│   ├── RLmodel.py
│   ├── cluster.py
│   ├── completion_task.py
│   ├── computation.py
│   ├── cpu_task.py
│   ├── epoch_test.py
│   ├── evaluator.py
│   ├── gpu_task.py
│   ├── scheduler.py
│   ├── server.py
│   ├── start_pytorch
│   ├── statistics.py
│   ├── task.py
│   ├── task_executor.py
│   ├── test.py
│   ├── train_RL_model.py
│   └── workload.py
└── requirements.txt

/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
--------------------------------------------------------------------------------
/Comparison-methods/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 hiddenlayer2020
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Comparison-methods/README.md:
--------------------------------------------------------------------------------
1 | Please read the paper for more detail of the comparison methods. 
-------------------------------------------------------------------------------- /Comparison-methods/install-requirement: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo apt-get update 3 | sudo apt-get install python-pip 4 | pip install -r requirements.txt 5 | sudo apt-get update 6 | sudo apt-get install -y libsm6 libxext6 libxrender-dev 7 | pip install opencv-python 8 | pip install scipy 9 | pip install torch 10 | #pip uninstall numpy 11 | #pip uninstall numpy 12 | #pip uninstall numpy 13 | #pip install numpy==1.16.2 14 | sudo apt-get install python-tk 15 | pip install psutil -------------------------------------------------------------------------------- /Comparison-methods/kill_tensorflow: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | kill -9 $(ps -A | grep python) 3 | rm -rf graph* 4 | rm -rf model* 5 | rm -rf event* 6 | rm -rf checkpoint* 7 | ps -A | grep python 8 | -------------------------------------------------------------------------------- /Comparison-methods/local_start_slave: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python /home/ubuntu/TF-scheduler/server.py slave -------------------------------------------------------------------------------- /Comparison-methods/local_task_executor: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /home/ubuntu/TF-scheduler 4 | if [ ! -d "/home/ubuntu/TF-scheduler/"$4 ] 5 | then 6 | mkdir /home/ubuntu/TF-scheduler/$4 7 | cd /home/ubuntu/TF-scheduler/$4 8 | cp ../{task.py,cluster.py,completion_task.py,computation.py,evaluator.py,RLmodel.py,scheduler.py,server.py,statistics.py,task_executor.py,train_RL_model.py,workload.py} ./ 9 | fi 10 | cd /home/ubuntu/TF-scheduler/$4 11 | 12 | python ./task.py --ps_hosts=$1 --worker_hosts=$2 --job_name=$3 --task_index=0 --train_folder=/home/ubuntu/TF-scheduler/$4 --job $4 --task $5 &>$4_log_task$5 -------------------------------------------------------------------------------- /Comparison-methods/remote_start_slave: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if(($#!=1)) 3 | then 4 | echo 'usage: ./remote_start_slave ip' 5 | fi 6 | ssh -i ./tensorflow-zetian.pem ubuntu@$1 "/home/ubuntu/TF-scheduler/local_start_slave &" & -------------------------------------------------------------------------------- /Comparison-methods/remote_task_executor: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if(($#!=6)) 3 | then 4 | echo 'usage: ./remote_task_executor ps_host_ip port worker_host_ip port job_name task_index' 5 | fi 6 | 7 | ps_host_ip=$1 8 | ps_host_port=$2 9 | ps_host=$1:$2 10 | 11 | worker_host_ip=$3 12 | worker_host_port=$4 13 | worker_host=$3:$4 14 | 15 | #echo 'ps host ip: '$ps_host_ip 16 | #echo 'ps host port: '$ps_host_port 17 | #echo 'ps host: '$ps_host 18 | 19 | #echo 'worker host ip: '$worker_host_ip 20 | #echo 'worker host port: '$worker_host_port 21 | #echo 'worker host: '$worker_host 22 | 23 | #python trainer.py --ps_hosts=($ps_host_ip):$(ps_host_port) --worker_hosts=$(worker_host_ip):$(worker_host_port) --job_name=ps --task_index=0 --train_folder=/home/ubuntu/TF-scheduler 24 | 25 | #python trainer.py --ps_hosts=($ps_host_ip):$(ps_host_port) --worker_hosts=$(worker_host_ip):$(worker_host_port) --job_name=worker --task_index=0 
--train_folder=/home/ubuntu/TF-scheduler 26 | 27 | ssh -i ./tensorflow-zetian.pem ubuntu@$ps_host_ip "/home/ubuntu/TF-scheduler/local_task_executor $ps_host $worker_host ps $5 $6 &" & 28 | 29 | ssh -i ./tensorflow-zetian.pem ubuntu@$worker_host_ip "/home/ubuntu/TF-scheduler/local_task_executor $ps_host $worker_host worker $5 $6 &" & 30 | 31 | -------------------------------------------------------------------------------- /Comparison-methods/start_pytorch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python trainer.py \ 3 | --ps_hosts=172.31.6.117:2222\ 4 | --worker_hosts=172.31.15.67:2222\ 5 | --job_name=worker --task_index=0 6 | -------------------------------------------------------------------------------- /Conext-ClusterML-camera-ready-version.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hiddenlayer2020/ML-Job-Scheduler-MLFS/38728f360fa2090abaa5aedbcc72e22dc84fc76f/Conext-ClusterML-camera-ready-version.pdf -------------------------------------------------------------------------------- /MLFS/RLmodel.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | import math 5 | class RLnetwork(object): 6 | def __init__(self, numInputs, numOutputs): 7 | self.input_dim = numInputs 8 | self.output_dim = numOutputs 9 | self.weight = torch.Tensor(numInputs, numOutputs).normal_(0, 0.01) 10 | self.weight_grads = torch.Tensor(numInputs, numOutputs) 11 | self.bias = torch.Tensor(numOutputs).zero_() 12 | self.bias_grads = torch.Tensor(numOutputs) 13 | 14 | def forward(self, state_vector): #input: 1 x n. weight: m x n. output: weight x input.t() = m x 1 15 | 16 | return torch.matmul(state_vector, self.weight) + self.bias 17 | 18 | def backward(self, state_vector, vt, learning_rate): 19 | log_grad = 1/(torch.matmul(state_vector, self.weight) + self.bias) 20 | sv = torch.Tensor(self.input_dim,1) 21 | lg = torch.Tensor(1,self.output_dim) 22 | sv = state_vector.unsqueeze_(-1) 23 | lg[0][:] = log_grad 24 | weight_grads = torch.matmul(sv,lg) 25 | self.weight_grads += learning_rate * weight_grads * vt 26 | self.bias_grads += learning_rate * log_grad * vt 27 | return self.weight_grads, self.bias_grads 28 | 29 | def zero_grad(self): 30 | self.weight_grads = 0 31 | self.bias_grads = 0 32 | 33 | def update_grads(self): 34 | self.weight -= self.weight_grads 35 | self.bias -= self.bias_grads -------------------------------------------------------------------------------- /MLFS/cluster.py: -------------------------------------------------------------------------------- 1 | import random 2 | #!/usr/bin/env python 3 | import psutil 4 | from server import * 5 | import threading 6 | import socket 7 | import server 8 | from time import sleep 9 | def get_cpu_memory(): 10 | # gives a single float value 11 | cpu_usage = psutil.cpu_percent() 12 | #print('cpu percentage: '+str(cpu_usage)+'%') 13 | # gives an object with many fields 14 | psutil.virtual_memory() 15 | # you can convert that object to a dictionary 16 | vm_dic = dict(psutil.virtual_memory()._asdict()) 17 | memory_usage = vm_dic['percent'] 18 | #print('memory percentage: '+str(memory_usage)+'%') 19 | 20 | return cpu_usage, memory_usage 21 | 22 | class Cluster: 23 | node_list = [] 24 | topology = {} 25 | completed_task = {} #job_name:[task_name, task_name, ...] 
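    # Annotation (not part of the original source): node_list, topology,
    # completed_task and task_distribution are class-level attributes, so every
    # Cluster instance in a process shares the same underlying list/dicts. The
    # scripts in this repository only ever create a single Cluster, so this is
    # safe here, but keep it in mind before instantiating several clusters.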
26 | task_distribution = {} #job_name:{node_name: num_task, node_name: num_task, ...} 27 | def __init__(self, num_rack, max_rack_size): 28 | self.num_rack = num_rack 29 | self.max_rack_size = max_rack_size 30 | 31 | for i in range(num_rack): 32 | self.topology[i] = [] 33 | 34 | def process(self, cur_timestep): 35 | for node in self.node_list: 36 | pop_index = [] 37 | if(len(node.queue) != 0): 38 | for i in range(len(node.queue)): 39 | task = node.queue[i] 40 | if(task.job_name not in node.job_list): 41 | node.add_task(task,cur_timestep) 42 | pop_index.append(i) 43 | for index in sorted(pop_index, reverse = True): 44 | node.queue.pop(index) 45 | 46 | def find_master(self): 47 | for node in self.node_list: 48 | if(node.master): 49 | return node 50 | 51 | def add_node(self,node,rack): 52 | self.node_list.append(node) 53 | self.topology[rack].append(node) 54 | 55 | def has_av_res(self): 56 | for node in self.node_list: 57 | num_rtype = len(node.av_resource) 58 | node_available = True 59 | for i in range(num_rtype): 60 | if(node.av_resource[i] > 0.2): 61 | continue 62 | else: 63 | node_available = False 64 | break 65 | if(node_available): 66 | return True 67 | 68 | return False 69 | 70 | ''' 71 | def update_process_rate(self): 72 | new_message_distribution = {} # node_name: num_new_message 73 | for i in range(len(self.node_list)):#initialization 74 | node_name = self.node_list[i].name 75 | new_message_distribution[node_name] = 0 76 | 77 | for job_name in self.task_distribution: 78 | task_map = self.task_distribution[job_name] 79 | num_task = 0 80 | for exe_node_name,num_node_task in task_map.items(): 81 | num_task += num_node_task 82 | for exe_node_name,num_node_task in task_map.items(): 83 | num_other_node_task = num_task - num_node_task 84 | num_new_message = num_node_task * num_other_node_task 85 | new_message_distribution[exe_node_name] += num_new_message 86 | 87 | for node in self.node_list: 88 | node_new_message = new_message_distribution[node.name] 89 | node.process_rate = 1 - 0.005 * node_new_message 90 | if(node.process_rate < 0.2): 91 | node.process_rate = 0.2 92 | node.num_total_message += node_new_message 93 | ''' 94 | 95 | def step(self,time): 96 | #self.complete_task #job_name:[task_name, task_name, ... 
] 97 | 98 | #for job_name, task_list in self.completed_task.items(): 99 | # self.task_distribution[job_name][node.name] -= len(task_list) 100 | 101 | return self.completed_task 102 | 103 | def complete_task(self,task,node): 104 | task_map = self.task_distribution[task.job_name] 105 | task_map[node.name] -= 1 106 | if(task_map[node.name] == 0): 107 | del task_map[node.name] 108 | if(len(task_map) == 0): 109 | del self.task_distribution[task.job_name] 110 | 111 | def find_task_on_node(self, worker_ip): 112 | for node in self.node_list: 113 | if(worker_ip == node.ip): 114 | return node 115 | 116 | def find_node(self, task): 117 | demand = task.demand 118 | min_affinity = len(demand) + 1 119 | selected_node = None 120 | available = True 121 | for node in self.node_list: 122 | cur_affinity = 0 123 | available = True 124 | for i in range(len(demand)): 125 | if(demand[i] < node.av_resource[i]): 126 | cur_affinity += node.av_resource[i] - demand[i] 127 | else: 128 | available = False 129 | if(available and cur_affinity < min_affinity): 130 | min_affinity = cur_affinity 131 | selected_node = node 132 | 133 | return selected_node 134 | 135 | def find_minload_node(self): 136 | num_rtype = len(self.node_list[0].av_resource) 137 | minload = 0 138 | first_node = self.node_list[0] 139 | selected_node = first_node 140 | for i in range(num_rtype): 141 | minload += 1 - first_node.av_resource[i] 142 | 143 | for node in self.node_list: 144 | cur_load = 0 145 | for i in range(num_rtype): 146 | cur_load += node.av_resource[i] 147 | if(cur_load < minload): 148 | minload = cur_load 149 | selected_node = node 150 | return selected_node 151 | 152 | 153 | class Node: 154 | 155 | def __init__(self, name, num_rtype, ip): 156 | self.localIP = socket.gethostbyname(socket.gethostname()) 157 | self.ip = ip 158 | self.master = (self.localIP == self.ip) 159 | self.job_list = [] 160 | self.queue = [] 161 | self.workload = None 162 | self.cluster = None 163 | self.scheduler = None 164 | 165 | 166 | self.name = name 167 | self.server_port = 9999 168 | self.port_availability = {} 169 | for port in range(2222,8888+1): 170 | self.port_availability[port] = True 171 | 172 | self.av_resource = [] 173 | for i in range(num_rtype): 174 | self.av_resource.append(100) 175 | 176 | self.num_total_message = 0 177 | self.num_exe_task = 0 178 | self.num_total_task = 0 179 | self.task_list = [] 180 | if(not self.master): 181 | start_server_cmd = '/home/ubuntu/TF-scheduler/remote_start_slave '+ip 182 | thread = threading.Thread(target = execute_command, args = (start_server_cmd,), name = 'Slave-Thread'+self.ip) 183 | thread.start() 184 | 185 | def start_master(self, workload, cluster, scheduler): 186 | 187 | localIP = socket.gethostbyname(socket.gethostname()) 188 | Master(localIP, 9999, workload, cluster, scheduler).start() 189 | 190 | 191 | 192 | ''' 193 | def process(self): 194 | completed_task = {} #job_name:[task_name, task_name, ... 
] 195 | len_task_list = len(self.task_list) 196 | pop_index = [] 197 | max_resource = 0 198 | num_rtype = len(self.av_resource) 199 | for i in range(num_rtype): 200 | if(max_resource < 1 - self.av_resource[i]): 201 | max_resource = 1 - self.av_resource[i] 202 | 203 | true_process_rate = self.process_rate 204 | 205 | if(max_resource > 1): 206 | true_process_rate *= 1 / max_resource 207 | 208 | for i in range(len_task_list): 209 | task = self.task_list[i] 210 | if(task.duration - true_process_rate <= 0): 211 | if(task.job_name not in completed_task): 212 | completed_task[task.job_name] = [] 213 | self.num_exe_task -= 1 214 | task.duration = 0 215 | task.executing = False 216 | task.complete = True 217 | 218 | completed_task[task.job_name].append(task.task_name) 219 | 220 | num_rtype = len(self.av_resource) 221 | for j in range(num_rtype): 222 | self.av_resource[j] += task.demand[j] 223 | if(self.av_resource[j] > 1): 224 | self.av_resource[j] = 1 225 | pop_index.append(i) 226 | 227 | 228 | else: 229 | task.duration -= true_process_rate 230 | 231 | for ind in sorted(pop_index, reverse = True): 232 | self.task_list.pop(ind) 233 | 234 | return completed_task 235 | ''' 236 | 237 | def get_available_port(self): 238 | for port in range(2222,8888+1): 239 | if(self.port_availability[port]): 240 | self.port_availability[port] = False 241 | return str(port) 242 | return -1 243 | 244 | def execute_task(self, task): 245 | 246 | worker_ip = task.worker_ip 247 | ps_port = task.ps_port 248 | ps_ip = task.ps_ip 249 | 250 | request = 'execute,'+ps_ip+':'+str(ps_port)+','+worker_ip+':'+str(task.occupied_worker_port)+','+task.job_name+','+str(task.index) 251 | 252 | response = TCP_request(ps_ip, 9999, request) 253 | print('execute task of '+task.job_name+' on worker: '+worker_ip+':'+str(task.occupied_worker_port)) 254 | 255 | def add_task(self, task, cur_timestep): 256 | task.in_queue = False 257 | task.executing = True 258 | self.job_list.append(task.job_name) 259 | task.waiting_time = cur_timestep - task.arriving_time 260 | self.num_exe_task += 1 261 | self.num_total_task += 1 262 | #num_rtype = len(self.av_resource) 263 | #for i in range(num_rtype): 264 | # self.av_resource[i] -= task.demand[i] 265 | self.task_list.append(task) 266 | self.execute_task(task) 267 | 268 | def is_started(self): 269 | request = 'is_started' 270 | response = 'not start' 271 | try: 272 | response = TCP_request(self.ip, 9999, request) 273 | finally: 274 | return (response == 'start') 275 | def clear_removed_task(self): 276 | if(len(self.queue) == 0): 277 | return 278 | pop_index = [] 279 | for i in range(len(self.queue)): 280 | task = self.queue[i] 281 | if(task.complete): 282 | pop_index.append(i) 283 | 284 | for index in sorted(pop_index, reverse = True): 285 | self.queue.pop(index) 286 | 287 | 288 | def update_av_resource(self,cur_timestep): 289 | 290 | while(not self.is_started()): 291 | print(self.ip+' does not start, waiting...') 292 | sleep(5) 293 | 294 | request = 'cpu' 295 | response = TCP_request(self.ip, 9999, request) 296 | CPU = float(response) 297 | request = 'memory' 298 | response = TCP_request(self.ip, 9999, request) 299 | Memory = float(response) 300 | self.av_resource[0] = 100 - CPU 301 | self.av_resource[1] = 100 - Memory 302 | 303 | for task in self.queue: 304 | if(task.job_name not in self.job_list): 305 | self.add_task(task,cur_timestep) 306 | 307 | 308 | 309 | def create_node(node_name, num_rtype, ip): 310 | node = Node(node_name, num_rtype, ip) 311 | return node 312 | 313 | def create_cluster(max_rack, 
max_rack_size, max_num_nodes, num_rtype, ip_list): 314 | cur_num_node = 0 315 | cluster = Cluster(max_rack, max_rack_size) 316 | 317 | for i in range(max_num_nodes): 318 | cur_num_node += 1 319 | node_name = 'node'+str(cur_num_node) 320 | node = create_node(node_name,num_rtype, ip_list[i]) 321 | rack = random.randint(0,max_rack-1) 322 | cluster.add_node(node,rack) 323 | 324 | return cluster 325 | 326 | def display_available_port(port_availability): 327 | for port in range(2222,8888+1): 328 | if(not port_availability[port]): 329 | continue 330 | else: 331 | print('available ports: '+str(port)+'-8888') 332 | break 333 | 334 | def display_node(node): 335 | print('node name: '+node.name) 336 | print('node ip: '+node.ip) 337 | display_available_port(node.port_availability) 338 | print('available resource percent: ') 339 | num_rtype = len(node.av_resource) 340 | for i in range(num_rtype): 341 | print(str(node.av_resource[i])) 342 | if(len(node.task_list) != 0): 343 | print('task list') 344 | for task in node.task_list: 345 | print('task '+str(task.task_name)+'\tjob '+task.job_name) 346 | else: 347 | print('task list empty') 348 | 349 | if(len(node.queue) != 0): 350 | print('node queue') 351 | for task in node.queue: 352 | print('task '+str(task.task_name)+'\tjob '+task.job_name) 353 | else: 354 | print('node queue empty') 355 | 356 | if(len(node.job_list) != 0): 357 | print('node job list') 358 | for job_name in node.job_list: 359 | print(job_name) 360 | else: 361 | print('node job list empty') 362 | print('\n') 363 | 364 | 365 | def display_cluster(cluster, num_rtype): 366 | print('\nnumber of nodes: '+str(len(cluster.node_list))) 367 | for node in cluster.node_list: 368 | display_node(node) 369 | 370 | def find_node(cluster, target_ip): 371 | for node in cluster.node_list: 372 | if(node.ip == target_ip): 373 | return node 374 | 375 | if __name__ == "__main__": 376 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 377 | 378 | ip_list = [] 379 | for ip in servers.values(): 380 | ip_list.append(ip) 381 | 382 | max_num_rack = 3 383 | max_rack_size = 5 384 | max_num_nodes = len(servers) 385 | num_rtype = 2 386 | 387 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 388 | display_cluster(cluster,2) -------------------------------------------------------------------------------- /MLFS/completion_task.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import threading 3 | import server 4 | #TCP_request(ip,port,request): 5 | 6 | shared_resource_lock = threading.Lock() 7 | 8 | def stop_ps(ps_host): 9 | cmd = "kill -9 $(ps -ef | grep " + str(ps_host) + " | grep 'job_name=ps' | awk '{print $2}')" 10 | server.execute_command(cmd) 11 | 12 | 13 | def completion_task(job_name, task_index, worker_ip, worker_port, update_loss, job_dir, cur_serial): 14 | ip = '172.31.6.117' 15 | port = '9999' 16 | request = 'task_completion,'+job_name+','+task_index+','+worker_ip+','+worker_port+','+update_loss+','+job_dir+','+str(cur_serial) 17 | response = server.TCP_request(ip,port,request) 18 | print(response) 19 | -------------------------------------------------------------------------------- /MLFS/computation.py: -------------------------------------------------------------------------------- 1 | import math 2 | def average(a): 3 | total = 0 4 | num_element = len(a) 5 | for element in a: 6 | total += element 7 | 8 | return total / num_element 9 | 10 | def std_dev(a): 11 | aver = average(a) 12 | 
total = 0 13 | for element in a: 14 | total += (element - aver) * (element - aver) 15 | 16 | return math.sqrt(total) 17 | 18 | def get_num_message_list(cluster): 19 | num_message_list = [] 20 | for node in cluster.node_list: 21 | num_message_list.append(node.num_total_message) 22 | return num_message_list 23 | 24 | def get_num_task_list(cluster): 25 | num_task_list = [] 26 | for node in cluster.node_list: 27 | num_task_list.append(node.num_total_task) 28 | return num_task_list 29 | 30 | def get_remaining_time_list(queue,timestep): 31 | remaining_time_list = [] 32 | for task in queue: 33 | remaining_time_list.append(task.arriving_time + task.deadline - timestep) 34 | return remaining_time_list 35 | 36 | def get_input_size_list(queue): 37 | input_size_list = [] 38 | for task in queue: 39 | input_size_list.append(task.input_size) 40 | return input_size_list 41 | 42 | def node_reward(W_c_m, W_c_t, t,cluster): 43 | ''' 44 | node.num_total_message 45 | node.num_exe_task 46 | node.num_total_task 47 | ''' 48 | reward_t = 0 49 | num_task_list = get_num_task_list(cluster) 50 | num_message_list = get_num_message_list(cluster) 51 | 52 | aver_num_task = average(num_task_list) 53 | std_num_task = std_dev(num_task_list) 54 | 55 | aver_num_message = average(num_message_list) 56 | std_num_message = std_dev(num_message_list) 57 | 58 | for node in cluster.node_list: 59 | r1 = 0 60 | r2 = 0 61 | 62 | if(std_num_message != 0): 63 | r1 = (node.num_total_message - aver_num_message) / std_num_message 64 | 65 | if(std_num_task != 0): 66 | r2 = (aver_num_task - node.num_total_task) / std_num_task 67 | 68 | reward_t += W_c_m * r1 + W_c_t * r2 69 | 70 | return reward_t 71 | 72 | 73 | 74 | def queue_reward(W_q_d, W_q_s, W_q_r, t, aver_S, scheduler, beta, queue, workload): 75 | 76 | reward_t = 0 77 | input_size_list = get_input_size_list(queue) 78 | remaining_time_list = get_remaining_time_list(queue,t) 79 | 80 | 81 | aver_remaining_time = average(remaining_time_list) 82 | std_remaining_time = std_dev(remaining_time_list) 83 | 84 | aver_input_size = average(input_size_list) 85 | std_input_size = std_dev(input_size_list) 86 | 87 | 88 | for task in queue: 89 | job = workload[int(task.job_name[3])] 90 | num_scheduled_task = scheduler.get_num_scheduled_task(job) 91 | remaining_time = task.arriving_time + task.deadline - t 92 | r1 = 0 93 | r2 =0 94 | r3 = 0 95 | 96 | if(std_remaining_time != 0): 97 | r1 = W_q_d * (remaining_time - aver_remaining_time) / std_remaining_time 98 | 99 | if(std_input_size != 0): 100 | r2 = W_q_s * (task.input_size - aver_input_size / std_input_size) 101 | 102 | r3 = W_q_r * (100 - num_scheduled_task*beta)/beta 103 | 104 | reward_t += r1 + r2 + r3 105 | 106 | return reward_t 107 | 108 | 109 | def discounted_reward(gama,t,reward_traj): 110 | acc_reward = 0 111 | for ti in range(t): 112 | acc_reward += pow(gama,ti) * reward_traj[ti] 113 | return acc_reward -------------------------------------------------------------------------------- /MLFS/epoch_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | import time 5 | 6 | from statistics import * 7 | from workload import * 8 | from cluster import * 9 | from scheduler import * 10 | #from task import * 11 | from RLmodel import * 12 | from computation import * 13 | import threading 14 | import completion_task 15 | 16 | 17 | max_res_demand = 0.3 18 | min_res_demand = 0.1 19 | min_in_size = 100 20 | max_in_size = 1000 21 | min_task_dur = 10 22 | max_task_dur = 50 23 | 
max_length_q = 10000 24 | max_num_nodes = 3 25 | min_task = 3600 26 | max_task = 3600 27 | 28 | max_rack_size = 5 29 | max_num_rack = 3 30 | num_rtype = 2 31 | 32 | max_q_length = 10000 33 | max_num_task = 200 34 | min_num_task = 20 35 | min_ddl = max_task_dur * min_task 36 | max_ddl = max_task_dur * max_task 37 | add_task_perc = 0.1 38 | beta = 10 #percentage of tasks one time 39 | 40 | W_q_d = 0.5 41 | W_q_s = 0.5 42 | W_q_r = 0.5 43 | 44 | W_c_m = 0.5 45 | W_c_t = 0.5 46 | 47 | gama_q = 0.9 48 | gama_n = 0.9 49 | learning_rate = 0.1 50 | 51 | remove_threshold = 0.1 52 | 53 | num_coming_job = 5 54 | 55 | remove_option = True 56 | order_net_option = True 57 | node_net_option = True 58 | 59 | node_net_in_dim = 2 * max_num_nodes 60 | node_net_out_dim = max_num_nodes 61 | 62 | order_net_in_dim = 3 * max_q_length 63 | order_net_out_dim = max_q_length 64 | 65 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 66 | 67 | ip_list = [] 68 | for ip in servers.values(): 69 | ip_list.append(ip) 70 | 71 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 72 | 73 | new_job = True 74 | 75 | scheduler = Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 76 | 77 | num_episodes = 1 78 | num_iteration = 1 79 | 80 | wl_len = 1 81 | 82 | Gandiva = False 83 | 84 | ps_ip = socket.gethostbyname(socket.gethostname()) 85 | ps_node = find_node(cluster, ps_ip) 86 | master = None 87 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 88 | for node in cluster.node_list: 89 | if(node.master): 90 | master = node 91 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 92 | thread.start() 93 | break 94 | 95 | timer = [] 96 | #workloads = [] 97 | #for i in range(num_episodes): 98 | # random.shuffle(workload) 99 | # workload_copy = copy_workload(workload) 100 | # workloads.append(workload_copy) 101 | 102 | cluster.node_list[0].update_av_resource() 103 | node1 = None 104 | node2 = None 105 | 106 | for node in cluster.node_list: 107 | if(node.ip == '172.31.4.135'): 108 | node1 = node 109 | elif(node.ip == '172.31.3.225'): 110 | node2 = node 111 | 112 | num_task = 1 113 | for task in workload[0].task_list: 114 | if(num_task % 2 == 1): 115 | task.worker_ip = node1.ip 116 | task.ps_port = master.get_available_port() 117 | task.occupied_worker_port = node1.get_available_port() 118 | node1.queue.append(task) 119 | else: 120 | task.worker_ip = node2.ip 121 | task.ps_port = master.get_available_port() 122 | task.occupied_worker_port = node2.get_available_port() 123 | node2.queue.append(task) 124 | num_task += 1 125 | 126 | while(True): 127 | cluster.process(0) 128 | time.sleep(8) 129 | 130 | -------------------------------------------------------------------------------- /MLFS/server.py: -------------------------------------------------------------------------------- 1 | 2 | import socket 3 | import threading 4 | import sys 5 | import random 6 | #!/usr/bin/env python 7 | import psutil 8 | import os 9 | import completion_task 10 | from workload import * 11 | from cluster import * 12 | 13 | def get_files(job_dir, file_keyword): 14 | files = [] 15 | for file in os.listdir(job_dir): 16 | if(os.path.isfile(os.path.join(job_dir,file)) and file.startswith(file_keyword)): 17 | files.append(file) 18 | return files 
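# Annotation (not part of the original source): del_old_model() below prunes old
# TensorFlow checkpoint files. From its parsing logic it appears to expect names
# like "model.ckpt-<serial>.<suffix>" (the serial is read from the "ckpt-<serial>"
# token) and "latest_model_<job>_epoch<N>.<suffix>" (the epoch is the fourth
# '_'-separated token). Checkpoints whose serial is 50 or more behind cur_serial,
# and latest_model files whose epoch differs from task_index, are removed.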
19 | 20 | 21 | def del_old_model(job_dir, cur_serial, task_index): 22 | files1 = get_files(job_dir, "model.ckpt") 23 | files2 = get_files(job_dir, "latest_model") 24 | for file in files1: 25 | partition = file.split('.') 26 | part2 = partition[1] 27 | serial_number = int(part2[5:]) 28 | if(cur_serial - serial_number >=50): 29 | os.remove(job_dir+"/"+file) 30 | for file in files2: 31 | partition = file.split('.') 32 | part1 = partition[0] 33 | epoch = int(part1.split('_')[3][5:]) 34 | if(epoch != task_index): 35 | os.remove(job_dir+"/"+file) 36 | 37 | def getbytes(string): 38 | return string.encode() 39 | 40 | def getstring(byte): 41 | return byte.decode() 42 | 43 | def TCP_request(ip,port,request): 44 | c_s = socket.socket() 45 | response = 'request failed' 46 | try: 47 | c_s.connect((ip,int(port))) 48 | byte_request = getbytes(request) 49 | c_s.send(byte_request) 50 | byte_response = c_s.recv(1024) 51 | response = getstring(byte_response) 52 | 53 | except Exception, e: 54 | print('ip: '+ip+"\tport: "+str(port)+'\t request: '+request) 55 | print(e) 56 | 57 | finally: 58 | return response 59 | 60 | def execute_command(cmd): 61 | os.system(cmd) 62 | 63 | 64 | class Master: 65 | def __init__(self, ip, port, workload, cluster, scheduler): 66 | self.workload = workload 67 | self.sock = socket.socket() 68 | self.ip = ip 69 | self.port = port 70 | self.sock.bind((ip, int(port))) 71 | self.sock.listen(5) 72 | self.cluster = cluster 73 | self.scheduler = scheduler 74 | self.should_stop = False 75 | 76 | def TCP_reply(self, sock, addr): 77 | 78 | #print('new request from ' + str(addr) + ' is accepted') 79 | while (not self.should_stop): 80 | response = '' 81 | b_data=sock.recv(1024) 82 | data = getstring(b_data) 83 | if (data =='exit' or not data): 84 | break; 85 | 86 | elif(data =='ip'): 87 | response = str(self.ip) 88 | 89 | elif(data =='port'): 90 | response = str(self.port) 91 | 92 | elif(data == 'is_started'): 93 | response = 'start' 94 | 95 | elif(data == 'cpu'): 96 | cpu_usage = psutil.cpu_percent() 97 | response = str(cpu_usage) 98 | 99 | elif(data == 'memory'): 100 | vm_dic = dict(psutil.virtual_memory()._asdict()) 101 | memory_usage = vm_dic['percent'] 102 | response = str(memory_usage) 103 | 104 | elif(data.startswith('loss')): 105 | with open('/home/ubuntu/TF-scheduler/loss_folder/'+data,'r') as f:#data looks like: loss_jobname 106 | response = f.readline() 107 | 108 | elif(data.startswith('accuracy')): 109 | with open('/home/ubuntu/TF-scheduler/accuracy_folder/'+data,'r') as f:#data looks like: accuracy_jobname 110 | response = f.readline() 111 | elif(data.startswith('cluster')): 112 | response = 'nodes infomation' 113 | for node in self.cluster.node_list: 114 | response += 'node name: '+node.name+'\n' 115 | response += 'node ip: '+node.ip+'\n' 116 | 117 | if(len(node.task_list) != 0): 118 | response += 'task list\n' 119 | for task in node.task_list: 120 | response += 'task '+str(task.task_name)+'\tjob '+task.job_name+'\n' 121 | else: 122 | response += 'task list empty' 123 | 124 | if(len(node.queue) != 0): 125 | response += 'node queue\n' 126 | for task in node.queue: 127 | response += 'task '+str(task.task_name)+'\tjob '+task.job_name+'\n' 128 | else: 129 | response += 'node queue empty\n' 130 | 131 | 132 | elif(data.startswith('execute')): 133 | info = data.split(',') 134 | ps_addr = info[1] 135 | ps_ip = ps_addr.split(':')[0] 136 | ps_port = ps_addr.split(':')[1] 137 | worker_addr = info[2] 138 | worker_ip = worker_addr.split(':')[0] 139 | worker_port = worker_addr.split(':')[1] 140 
| job_name = info[3] 141 | task_index = info[4] 142 | cmd = '/home/ubuntu/TF-scheduler/remote_task_executor '+ps_ip+' '+ps_port+' '+worker_ip+' '+worker_port+' '+job_name+' '+task_index 143 | 144 | thread = threading.Thread(target=execute_command, args=(cmd,), name = 'Task-Execution-Thread-'+worker_ip+'-'+job_name+'-'+task_index) 145 | thread.start() 146 | response = 'executing the task of '+job_name 147 | 148 | elif(data.startswith('checkpoint')): 149 | job_dir = data.split(',')[1] 150 | if(not os.path.exists(job_dir+'/checkpoint')): 151 | response = 'none' 152 | else: 153 | with open(job_dir+'/checkpoint', 'r') as f: 154 | response = f.read() 155 | 156 | elif(data.startswith('update_checkpoint')): 157 | partition = data.split(',') 158 | checkpoint_info = partition[1] 159 | job_dir = partition[2] 160 | with open(job_dir+'/checkpoint', 'w') as f: 161 | f.write(checkpoint_info) 162 | if(os.path.exists(job_dir+'/checkpoint')): 163 | response = 'update checkpoint file success!' 164 | else: 165 | response = 'update failed' 166 | 167 | elif(data.startswith('task_completion')): 168 | info = data.split(',') #task_completion job_name task_index worker_ip worker_port 169 | job_name = info[1] 170 | job_index = int(job_name[3]) 171 | task_index = int(info[2]) 172 | worker_ip = info[3] 173 | worker_port = info[4] 174 | update_loss = info[5] 175 | job_dir = info[6] 176 | cur_serial = int(info[7]) 177 | 178 | job = self.workload[job_index] 179 | task = job.task_list[task_index] 180 | 181 | if(update_loss == 'YES'): 182 | request = 'loss_'+job_name 183 | response = TCP_request(worker_ip, 9999, request) 184 | cur_loss = float(response) 185 | job.loss_traj.append(cur_loss) 186 | 187 | del_old_model(job_dir, cur_serial, task_index) 188 | 189 | task.executing = False 190 | task.complete = True 191 | 192 | completion_task.shared_resource_lock.acquire() 193 | if(not self.cluster.completed_task.has_key(job_name)): 194 | self.cluster.completed_task[job_name] = [] 195 | 196 | self.cluster.completed_task[job_name].append(task.task_name) 197 | 198 | node = self.cluster.find_task_on_node(worker_ip) 199 | for i in range(len(node.task_list)): 200 | cur_task = node.task_list[i] 201 | if(cur_task.job_name == task.job_name and 202 | cur_task.task_name == task.task_name): 203 | node.task_list.pop(i) 204 | break 205 | for i in range(len(node.job_list)): 206 | cur_job_name = node.job_list[i] 207 | if(cur_job_name == task.job_name): 208 | node.job_list.pop(i) 209 | break 210 | #display_cluster(self.cluster, 2) 211 | completion_task.shared_resource_lock.release() 212 | print('port number is '+str(task.ps_port)) 213 | response = 'processed task completion!' 
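                # Annotation (not part of the original source): stop_ps() in
                # completion_task.py runs
                #   kill -9 $(ps -ef | grep <arg> | grep 'job_name=ps' | awk '{print $2}')
                # so passing task.ps_port here kills the parameter-server process
                # launched for this task (its command line contains that port).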
214 | completion_task.stop_ps(task.ps_port) 215 | 216 | b_response = getbytes(response) 217 | sock.send(b_response) 218 | sock.close() 219 | #print('Connection from %s:%s closed' % addr) 220 | 221 | def start(self): 222 | while (True): 223 | c_sock, addr = self.sock.accept() 224 | thread = threading.Thread(target=self.TCP_reply, args=(c_sock,addr), name = 'Reuqest-Listening-Thread-'+str(addr)) 225 | thread.start() 226 | 227 | class Slave: 228 | def __init__(self, ip, port): 229 | self.sock = socket.socket() 230 | self.ip = ip 231 | self.port = str(port) 232 | self.sock.bind((ip, int(port))) 233 | self.sock.listen(5) 234 | self.should_stop = False 235 | 236 | 237 | def TCP_reply(self, sock, addr): 238 | 239 | #print('new request from ' + str(addr) + ' is accepted') 240 | while (not self.should_stop): 241 | response = '' 242 | b_data=sock.recv(1024) 243 | data = getstring(b_data) 244 | if (data =='exit' or not data): 245 | break; 246 | 247 | elif(data =='ip'): 248 | response = str(self.ip) 249 | 250 | elif(data =='port'): 251 | response = str(self.port) 252 | 253 | elif(data == 'is_started'): 254 | response = 'start' 255 | 256 | elif(data == 'cpu'): 257 | cpu_usage = psutil.cpu_percent() 258 | response = str(cpu_usage) 259 | 260 | elif(data == 'memory'): 261 | vm_dic = dict(psutil.virtual_memory()._asdict()) 262 | memory_usage = vm_dic['percent'] 263 | response = str(memory_usage) 264 | 265 | elif(data.startswith('loss')): 266 | with open('/home/ubuntu/TF-scheduler/loss_folder/'+data,'r') as f:#data looks like: loss_jobname 267 | response = f.readline() 268 | 269 | elif(data.startswith('accuracy')): 270 | with open('/home/ubuntu/TF-scheduler/accuracy_folder/'+data,'r') as f:#data looks like: accuracy_jobname 271 | response = f.readline() 272 | 273 | elif(data.startswith('execute')): 274 | info = data.split(',') 275 | ps_addr = info[1] 276 | ps_ip = ps_addr.split(':')[0] 277 | ps_port = ps_addr.split(':')[1] 278 | worker_addr = info[2] 279 | worker_ip = worker_addr.split(':')[0] 280 | worker_port = worker_addr.split(':')[1] 281 | job_name = info[3] 282 | task_index = info[4] 283 | cmd = '/home/ubuntu/TF-scheduler/remote_task_executor '+ps_ip+' '+ps_port+' '+worker_ip+' '+worker_port+' '+job_name+' '+task_index 284 | 285 | thread = threading.Thread(target=execute_command, args=(cmd,), name = 'Task-Execution-Thread-'+worker_ip+'-'+job_name+'-'+task_index) 286 | thread.start() 287 | response = 'executing the task of '+job_name 288 | 289 | b_response = getbytes(response) 290 | sock.send(b_response) 291 | sock.close() 292 | #print('Connection from %s:%s closed' % addr) 293 | 294 | def start(self): 295 | while (True): 296 | c_sock, addr = self.sock.accept() 297 | thread = threading.Thread(target=self.TCP_reply, args=(c_sock,addr), name = 'Reuqest-Listening-Thread-'+str(addr)) 298 | thread.start() 299 | 300 | 301 | if (__name__ == '__main__'): 302 | #print(len(sys.argv)) 303 | if(len(sys.argv) != 2): 304 | print('usage: python server master/slave') 305 | print('exit') 306 | sys.exit(0) 307 | role = sys.argv[1] 308 | localIP = socket.gethostbyname(socket.gethostname()) 309 | node = None 310 | 311 | if(role == 'master'): 312 | print('master') 313 | node = Master(localIP,9999) 314 | print('register') 315 | node.start() 316 | print('listening') 317 | 318 | elif(role == 'slave'): 319 | print('slave') 320 | node = Slave(localIP,9999) 321 | print('register') 322 | node.start() 323 | print('listening') 324 | 325 | else: 326 | print('request') 327 | response = TCP_request(localIP,9999,sys.argv[1]) 328 | 
print("response is: "+response) 329 | 330 | 331 | 332 | 333 | -------------------------------------------------------------------------------- /MLFS/statistics.py: -------------------------------------------------------------------------------- 1 | def show_statistics(workload,cluster,timer): 2 | 3 | total_complete_time = 0 4 | num_deadline_satisfied = 0 5 | total_num_message = 0 6 | 7 | total_waiting_time = [] #0:size 100-300, 1: size 300-500, 2: size 500-700, 3: size 700-900 8 | num_job = [] #0:size 100-300, 1: size 300-500, 2: size 500-700, 3: size 700-900 9 | for i in range(4): 10 | total_waiting_time.append(0) 11 | num_job.append(0) 12 | 13 | acc_reduction = [] #0:0-0.15, 1:0.15-0.3, 2:0.3-0.45, 4:0.45-0.6, 5:0.6-0.75, 6:0.75-1 14 | aver_jct_removed_ratio = [] #0:0-0.15, 1:0.15-0.3, 2:0.3-0.45, 3:0.45-0.6, 4:0.6-0.75, 5:0.75-1 15 | num_job_removed_ratio = [] #0:0-0.15, 1:0.15-0.3, 2:0.3-0.45, 3:0.45-0.6, 4:0.6-0.75, 5:0.75-1 16 | 17 | for i in range(6): 18 | acc_reduction.append(0) 19 | aver_jct_removed_ratio.append(0) 20 | num_job_removed_ratio.append(0) 21 | 22 | 23 | #total_task = 0 24 | JCTs = [] 25 | for job in workload: 26 | arriving_time = job.task_list[0].arriving_time 27 | complete_time = job.complete_time 28 | JCTs.append(complete_time - arriving_time) 29 | 30 | num_task = len(job.task_list) 31 | job_waiting_time = 0 32 | 33 | for task in job.task_list: 34 | #total_task += 1 35 | job_waiting_time += task.waiting_time 36 | 37 | if(job.input_size<325):#100-325 38 | total_waiting_time[0] += job_waiting_time / num_task 39 | num_job[0] += 1 40 | elif(job.input_size<550):#325-550 41 | total_waiting_time[1] += job_waiting_time / num_task 42 | num_job[1] += 1 43 | elif(job.input_size<775):#550-775 44 | total_waiting_time[2] += job_waiting_time / num_task 45 | num_job[2] += 1 46 | elif(job.input_size<1000):#775-1000 47 | total_waiting_time[3] += job_waiting_time / num_task 48 | num_job[3] += 1 49 | 50 | removed_ratio = job.num_removed_task / len(job.task_list) 51 | total_num_task = len(job.task_list) 52 | num_removed_task = job.num_removed_task 53 | 54 | if(removed_ratio < 0.15): 55 | acc_reduction[0] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 56 | aver_jct_removed_ratio[0] += complete_time - arriving_time 57 | num_job_removed_ratio[0] += 1 58 | elif(removed_ratio < 0.3): 59 | acc_reduction[1] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 60 | aver_jct_removed_ratio[1] += complete_time - arriving_time 61 | num_job_removed_ratio[1] += 1 62 | elif(removed_ratio < 0.45): 63 | acc_reduction[2] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 64 | aver_jct_removed_ratio[2] += complete_time - arriving_time 65 | num_job_removed_ratio[2] += 1 66 | elif(removed_ratio < 0.6): 67 | acc_reduction[3] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 68 | aver_jct_removed_ratio[3] += complete_time - arriving_time 69 | num_job_removed_ratio[3] += 1 70 | elif(removed_ratio < 0.75): 71 | acc_reduction[4] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 72 | aver_jct_removed_ratio[4] += complete_time - arriving_time 73 | num_job_removed_ratio[4] += 1 74 | elif(removed_ratio < 1): 75 | acc_reduction[5] += 1 - (job.initial_loss - 0.35246) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 76 | aver_jct_removed_ratio[5] += complete_time - arriving_time 77 | num_job_removed_ratio[5] += 1 78 | 79 | jct = 
complete_time - arriving_time 80 | total_complete_time += jct 81 | 82 | if(jct < job.deadline): 83 | num_deadline_satisfied += 1 84 | 85 | sorted_JCTs = sorted(JCTs) 86 | num_job = len(workload) 87 | for i in range(num_job): 88 | jct = sorted_JCTs[i] 89 | perc = ((i+1)/num_job)*100 90 | print('CDF point( '+str(jct)+', '+str(perc)+'%)') 91 | 92 | avg_waiting_time = 0 93 | 94 | deadline_gurantee = num_deadline_satisfied / len(workload) 95 | print('job deadline gurantee = '+str(deadline_gurantee)) 96 | 97 | print('remove ratio: 0-15%, 15-30%, 30-45%, 45-60%, 60-75%') 98 | 99 | aver_job_acc_red = 0 100 | 101 | for i in range(6): 102 | if(num_job_removed_ratio[i] == 0): 103 | acc_reduction[i] = 0 104 | aver_jct_removed_ratio[i] = 0 105 | else: 106 | acc_reduction[i] /= num_job_removed_ratio[i] 107 | aver_jct_removed_ratio[i] /= num_job_removed_ratio[i] 108 | aver_job_acc_red += acc_reduction[i] 109 | 110 | print('percentage of this part: '+str(num_job_removed_ratio[i] * 100 / len(workload))+'%') 111 | print('job accuracy reduction = '+str(acc_reduction[i] / len(workload))) 112 | print('average JCT over removed ratio = '+str(aver_jct_removed_ratio[i])) 113 | 114 | print('average job accuracy reduction: '+str(aver_job_acc_red / len(workload))) 115 | 116 | for i in range(4): 117 | if(num_job[i] != 0): 118 | avg_waiting_time += total_waiting_time[i] 119 | total_waiting_time[i] /= num_job[i] 120 | print('job waiting time = '+str(total_waiting_time[i])) 121 | print('average job waiting time = '+str(avg_waiting_time/len(workload))) 122 | average_jct = total_complete_time / len(workload) 123 | print('average jct = '+str(average_jct)) 124 | 125 | total_num_message = 0 126 | 127 | for node in cluster.node_list: 128 | total_num_message += node.num_total_message 129 | 130 | avg_message = total_num_message/len(cluster.node_list) 131 | avg_bandwidth = avg_message * 0.0035 132 | print('avg_bandwidth = '+str(avg_bandwidth)) 133 | 134 | total_time = 0 135 | 136 | for time in timer: 137 | total_time += time 138 | aver_time = total_time / len(timer) 139 | print('average latency = '+str(aver_time)) 140 | 141 | 142 | def is_schedule_all_task(job_list): 143 | for job in job_list: 144 | for task in job.task_list: 145 | if(not task.in_queue and not task.executing and not task.complete): 146 | return False 147 | return True 148 | 149 | def schedule_policy(Gandiva, job_list, cluster): 150 | can_schedule = True 151 | if(not Gandiva): 152 | can_schedule = cluster.has_av_res() 153 | else: 154 | can_schedule = not is_schedule_all_task(job_list) 155 | return can_schedule -------------------------------------------------------------------------------- /MLFS/task_executor.py: -------------------------------------------------------------------------------- 1 | #from task import * 2 | #from workload import * 3 | import os 4 | 5 | cmd = 'ls' 6 | os.system(cmd) 7 | 8 | -------------------------------------------------------------------------------- /MLFS/train_RL_model.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | from statistics import * 5 | from workload import * 6 | from cluster import * 7 | from scheduler import * 8 | #from task import * 9 | from RLmodel import * 10 | from computation import * 11 | import threading 12 | import completion_task 13 | 14 | 15 | max_res_demand = 0.3 16 | min_res_demand = 0.1 17 | min_in_size = 100 18 | max_in_size = 1000 19 | min_task_dur = 10 20 | max_task_dur = 50 21 | max_length_q = 100 22 | max_num_nodes = 4 23 | 
min_task = 30 24 | max_task = 50 25 | 26 | max_rack_size = 5 27 | max_num_rack = 3 28 | num_rtype = 2 29 | 30 | max_q_length = 50 31 | max_num_task = 200 32 | min_num_task = 20 33 | min_ddl = max_task_dur * min_task 34 | max_ddl = max_task_dur * max_task 35 | add_task_perc = 0.1 36 | beta = 10 #percentage of tasks one time 37 | 38 | W_q_d = 0.5 39 | W_q_s = 0.5 40 | W_q_r = 0.5 41 | 42 | W_c_m = 0.5 43 | W_c_t = 0.5 44 | 45 | gama_q = 0.9 46 | gama_n = 0.9 47 | learning_rate = 0.1 48 | 49 | remove_threshold = 0.0 50 | 51 | num_coming_job = 5 52 | 53 | remove_option = False 54 | order_net_option = True 55 | node_net_option = True 56 | 57 | node_net_in_dim = 2 * max_num_nodes 58 | node_net_out_dim = max_num_nodes 59 | 60 | order_net_in_dim = 3 * max_q_length 61 | order_net_out_dim = max_q_length 62 | 63 | #servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 64 | servers = {'node1':'172.31.6.117', 'node2':'172.31.44.243', 'node3':'172.31.35.92', 'node4':'172.31.35.219'} 65 | 66 | ip_list = [] 67 | for ip in servers.values(): 68 | ip_list.append(ip) 69 | 70 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 71 | 72 | new_job = True 73 | 74 | scheduler = Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 75 | 76 | num_episodes = 1 77 | num_iteration = 1 78 | 79 | wl_len = 10 80 | 81 | Gandiva = False 82 | 83 | ps_ip = socket.gethostbyname(socket.gethostname()) 84 | ps_node = find_node(cluster, ps_ip) 85 | 86 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 87 | for node in cluster.node_list: 88 | if(node.master): 89 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 90 | thread.start() 91 | break 92 | 93 | timer = [] 94 | #workloads = [] 95 | #for i in range(num_episodes): 96 | # random.shuffle(workload) 97 | # workload_copy = copy_workload(workload) 98 | # workloads.append(workload_copy) 99 | 100 | for iteration in range(num_iteration): 101 | print('iteration: '+str(iteration)) 102 | #workload = generate_workload(wl_len,min_task,max_task,min_in_size,max_in_size,min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype) 103 | unfinished_job_list = workload 104 | queue_reward_traj = [] 105 | node_reward_traj = [] 106 | 107 | queue_sv_traj = [] 108 | node_sv_traj = [] 109 | 110 | queue_discounted_reward_traj = [] 111 | node_discounted_reward_traj = [] 112 | 113 | scheduler.node_network.zero_grad() 114 | scheduler.order_network.zero_grad() 115 | for episode in range(num_episodes): 116 | print('episode: '+str(episode)) 117 | #workload = workloads[episode] 118 | queue_reward_traj.append([]) 119 | node_reward_traj.append([]) 120 | 121 | queue_sv_traj.append([]) 122 | node_sv_traj.append([]) 123 | 124 | queue_discounted_reward_traj.append([]) 125 | node_discounted_reward_traj.append([]) 126 | 127 | cur_timestep = time.clock() 128 | cur_job_num = 0 129 | num_schedule = 0 130 | schedule_interval = 5 131 | 132 | while(len(unfinished_job_list) != 0): 133 | 134 | cur_timestep = time.clock() + num_schedule * schedule_interval 135 | if(cur_job_num < wl_len): 136 | workload[cur_job_num].update_arr_time(cur_timestep) 137 | scheduler.job_arriving(workload[cur_job_num]) 138 | cur_job_num += num_coming_job 139 | if(cur_job_num >= wl_len): 140 | cur_job_num = 
wl_len 141 | 142 | unfinished_job_list = find_unfinished_jobs(workload, cur_job_num, cur_timestep) 143 | 144 | if(len(unfinished_job_list) == 0): 145 | print('all jobs finish') 146 | break 147 | last_add_job = int(unfinished_job_list[0].job_name[3]) 148 | 149 | valid = True 150 | num_decision = 0 151 | 152 | while(valid): 153 | 154 | dead_loop = 0 155 | 156 | can_schedule = schedule_policy(Gandiva, unfinished_job_list, cluster) 157 | 158 | while(can_schedule and len(scheduler.queue) < scheduler.max_q_length): 159 | 160 | dead_loop += 1 161 | 162 | last_add_job = scheduler.fill_queue(unfinished_job_list, last_add_job, dead_loop, remove_threshold, remove_option) 163 | if(last_add_job < 0): 164 | break 165 | 166 | can_schedule = schedule_policy(Gandiva, unfinished_job_list, cluster) 167 | 168 | start = time.clock() 169 | if(not Gandiva): 170 | cluster.process(cur_timestep) 171 | valid, q_sv, n_sv, t_ind, n_ind = scheduler.schedule_one_task(cur_timestep, cluster, order_net_option, node_net_option) 172 | else: 173 | valid = scheduler.Gandiva_schedule_one_task(cur_timestep, cluster) 174 | elapsed = (time.clock() - start) 175 | 176 | 177 | num_decision += 1 178 | 179 | if(len(scheduler.queue) == 0): 180 | #print('schedule all tasks') 181 | break 182 | 183 | if(not valid): 184 | #print('invalid action') 185 | num_decision -= 1 186 | break 187 | 188 | timer.append(elapsed) 189 | if(not Gandiva): 190 | queue_sv_traj[episode].append(q_sv) 191 | node_sv_traj[episode].append(n_sv) 192 | 193 | aver_size = scheduler.average_size() 194 | 195 | beta = scheduler.add_task_perc 196 | queue = scheduler.queue 197 | 198 | q_rt = queue_reward(W_q_d, W_q_s, W_q_r, cur_timestep, aver_size, scheduler, beta, queue, workload) 199 | queue_reward_traj[episode].append(q_rt) 200 | 201 | n_rt = node_reward(W_c_m, W_c_t, cur_timestep, cluster) 202 | node_reward_traj[episode].append(n_rt) 203 | 204 | print('\n\ncurrent time: '+str(cur_timestep)) 205 | 206 | completion_task.shared_resource_lock.acquire() 207 | for job in workload: 208 | job.update_priority() 209 | for node in cluster.node_list: 210 | node.clear_removed_task() 211 | 212 | scheduler.clear_removed_task() 213 | display_scheduler(scheduler) 214 | display_cluster(cluster,2) 215 | for job in workload: 216 | display_job(job) 217 | 218 | print('scheduler now is sleeping... 
itertaion '+str(num_schedule)) 219 | time.sleep(schedule_interval) 220 | 221 | completion_task.shared_resource_lock.release() 222 | num_schedule += 1 223 | 224 | print('finish episode '+str(episode)+', makespan = '+str(cur_timestep)) 225 | 226 | if(not Gandiva): 227 | num_action = len(queue_sv_traj[episode]) 228 | for j in range(num_action): 229 | n_vt = discounted_reward(gama_n, j, node_reward_traj[episode]) 230 | q_vt = discounted_reward(gama_q, j, queue_reward_traj[episode]) 231 | node_discounted_reward_traj[episode].append(n_vt) 232 | queue_discounted_reward_traj[episode].append(q_vt) 233 | 234 | show_statistics(workload,cluster,timer) 235 | timer = [] 236 | 237 | unfinished_job_list = workload 238 | 239 | if(not Gandiva): 240 | num_action = 100000000 241 | 242 | for episode in range(num_episodes): 243 | if(num_action > len(queue_sv_traj[episode])): 244 | num_action = len(queue_sv_traj[episode]) 245 | 246 | q_r_episode_baseline = [] 247 | n_r_episode_baseline = [] 248 | for j in range(num_action): 249 | total_qr_b = 0 250 | total_nr_b = 0 251 | for episode in range(num_episodes): 252 | total_qr_b += queue_discounted_reward_traj[episode][j] 253 | total_nr_b += node_discounted_reward_traj[episode][j] 254 | 255 | q_r_episode_baseline.append(total_qr_b / num_episodes) 256 | n_r_episode_baseline.append(total_nr_b / num_episodes) 257 | 258 | for episode in range(num_episodes): 259 | for j in range(num_action): 260 | q_sv = queue_sv_traj[episode][j] 261 | n_sv = node_sv_traj[episode][j] 262 | q_vt = queue_discounted_reward_traj[episode][j] 263 | n_vt = node_discounted_reward_traj[episode][j] 264 | qr_b = q_r_episode_baseline[j] 265 | nr_b = n_r_episode_baseline[j] 266 | scheduler.order_network.backward(q_sv, q_vt - qr_b, learning_rate) 267 | scheduler.node_network.backward(n_sv, n_vt - nr_b, learning_rate) 268 | 269 | scheduler.node_network.update_grads() 270 | scheduler.order_network.update_grads() 271 | 272 | torch.save(scheduler.node_network.weight, 'node_weight_ep'+str(iteration)+'.para') 273 | torch.save(scheduler.node_network.bias, 'node_bias_ep'+str(iteration)+'.para') 274 | torch.save(scheduler.order_network.weight, 'order_weight_ep'+str(iteration)+'.para') 275 | torch.save(scheduler.order_network.bias, 'order_bias_ep'+str(iteration)+'.para') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Job Scheduling for Large-Scale Machine Learning Clusters 2 | 3 | 4 | MLFS is a machine learning job feature based job scheduler for machine learning clusters running both data parallelism and model parallelism machine learning jobs. The corresponding research work is published in CoNEXT 2020. Please click [here](https://dl.acm.org/doi/pdf/10.1145/3386367.3432588) for the detail of the paper. 5 | 6 | ### Prerequisites 7 | - Install prerequisites (tested with Ubuntu 16.04, Tensorflow v1.8.0) 8 | ``` 9 | pip3 install -r (all the required softwares) 10 | ``` 11 | 12 | Required software: tensorflow==1.8.0, opencv-python==3.4.0.12, tqdm==4.19.6, pandas==0.22.0, matplotlib==2.2.0 13 | 14 | numpy==1.16.2, scikit-learn==0.19.1, Python3==python 3.7.3 15 | 16 | ### Training 17 | - To train a new model, put training data in `MLFS`, then in `sim/` run `python RLmodel.py` and then run 18 | ``` 19 | python train_RL_model.py 20 | ``` 21 | 22 | The reward signal and meta-setting of video can be modified in `RLmodel.py`. 
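The steps above, in one place (a sketch based on this repository's layout, where the simulator code lives in `Simu/`; adjust the paths to your checkout):

```
# training data goes under MLFS/ first
cd Simu
python RLmodel.py
python train_RL_model.py
```

In the `MLFS` copy of `train_RL_model.py`, the learned policy weights are written out with `torch.save` as `*.para` files at the end of each training iteration; these are presumably the model files that the testing steps below expect you to copy over.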
23 | 24 | ### Testing 25 | - To test the trained model in simulated environment, first copy over the model to `test/models` and modify the `NN_MODEL` field of `test/train_RL_model.py` , and then in `test/` run `python evaluator.py` and then run 26 | ``` 27 | python test.py 28 | ``` 29 | 30 | ### Real-world experiments 31 | - To run real-world experiments, distribute the whole folder to each physical machine within the cluster. Then, copy the trained RL model to `MLFS` and modify the `NN_MODEL` filed of `test/train_RL_model.py`. Next, update the IP addresses in `cluster.py` and `server.py`. Finally, in `test` run 32 | ``` 33 | python real-test.py 34 | ``` 35 | 36 | The results will be saved to `test/results` folder. 37 | 38 | 39 | ### ACKNOWLEDGEMENTS 40 | We thank the valuable suggestions from the anonymous reviewers and the shepherd. This research was supported in part by U.S. NSF grants, Microsoft Research Faculty Fellowship, and AWS Machine Learning Research Awards. 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /Simu/RLmodel.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | import math 5 | class RLnetwork(object): 6 | def __init__(self, numInputs, numOutputs): 7 | self.input_dim = numInputs 8 | self.output_dim = numOutputs 9 | self.weight = torch.Tensor(numInputs, numOutputs).normal_(0, 0.01) 10 | self.weight_grads = torch.Tensor(numInputs, numOutputs) 11 | self.bias = torch.Tensor(numOutputs).zero_() 12 | self.bias_grads = torch.Tensor(numOutputs) 13 | 14 | def forward(self, state_vector): #input: 1 x n. weight: m x n. output: weight x input.t() = m x 1 15 | 16 | return torch.matmul(state_vector, self.weight) + self.bias 17 | 18 | def backward(self, state_vector, vt, learning_rate): 19 | log_grad = 1/(torch.matmul(state_vector, self.weight) + self.bias) 20 | sv = torch.Tensor(self.input_dim,1) 21 | lg = torch.Tensor(1,self.output_dim) 22 | sv = state_vector.unsqueeze_(-1) 23 | lg[0][:] = log_grad 24 | weight_grads = torch.matmul(sv,lg) 25 | self.weight_grads += learning_rate * weight_grads * vt 26 | self.bias_grads += learning_rate * log_grad * vt 27 | return self.weight_grads, self.bias_grads 28 | 29 | def zero_grad(self): 30 | self.weight_grads = 0 31 | self.bias_grads = 0 32 | 33 | def update_grads(self): 34 | self.weight -= self.weight_grads 35 | self.bias -= self.bias_grads -------------------------------------------------------------------------------- /Simu/cluster.py: -------------------------------------------------------------------------------- 1 | import random 2 | #!/usr/bin/env python 3 | import psutil 4 | from server import * 5 | import threading 6 | import socket 7 | import server 8 | from time import sleep 9 | def get_cpu_memory(): 10 | # gives a single float value 11 | cpu_usage = psutil.cpu_percent() 12 | #print('cpu percentage: '+str(cpu_usage)+'%') 13 | # gives an object with many fields 14 | psutil.virtual_memory() 15 | # you can convert that object to a dictionary 16 | vm_dic = dict(psutil.virtual_memory()._asdict()) 17 | memory_usage = vm_dic['percent'] 18 | #print('memory percentage: '+str(memory_usage)+'%') 19 | 20 | return cpu_usage, memory_usage 21 | 22 | class Cluster: 23 | node_list = [] 24 | topology = {} 25 | completed_task = {} #job_name:[task_name, task_name, ...] 
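    # Annotation (not part of the original source): as in MLFS/cluster.py, these
    # are class-level attributes shared by every Cluster instance in the process.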
26 | task_distribution = {} #job_name:{node_name: num_task, node_name: num_task, ...} 27 | def __init__(self, num_rack, max_rack_size): 28 | self.num_rack = num_rack 29 | self.max_rack_size = max_rack_size 30 | 31 | for i in range(num_rack): 32 | self.topology[i] = [] 33 | 34 | def process(self, cur_timestep): 35 | for node in self.node_list: 36 | pop_index = [] 37 | if(len(node.queue) != 0): 38 | for i in range(len(node.queue)): 39 | task = node.queue[i] 40 | if(task.job_name not in node.job_list): 41 | node.add_task(task,cur_timestep) 42 | pop_index.append(i) 43 | for index in sorted(pop_index, reverse = True): 44 | node.queue.pop(index) 45 | 46 | def find_master(self): 47 | for node in self.node_list: 48 | if(node.master): 49 | return node 50 | 51 | def add_node(self,node,rack): 52 | self.node_list.append(node) 53 | self.topology[rack].append(node) 54 | 55 | def has_av_res(self): 56 | for node in self.node_list: 57 | num_rtype = len(node.av_resource) 58 | node_available = True 59 | for i in range(num_rtype): 60 | if(node.av_resource[i] > 0.2): 61 | continue 62 | else: 63 | node_available = False 64 | break 65 | if(node_available): 66 | return True 67 | 68 | return False 69 | 70 | ''' 71 | def update_process_rate(self): 72 | new_message_distribution = {} # node_name: num_new_message 73 | for i in range(len(self.node_list)):#initialization 74 | node_name = self.node_list[i].name 75 | new_message_distribution[node_name] = 0 76 | 77 | for job_name in self.task_distribution: 78 | task_map = self.task_distribution[job_name] 79 | num_task = 0 80 | for exe_node_name,num_node_task in task_map.items(): 81 | num_task += num_node_task 82 | for exe_node_name,num_node_task in task_map.items(): 83 | num_other_node_task = num_task - num_node_task 84 | num_new_message = num_node_task * num_other_node_task 85 | new_message_distribution[exe_node_name] += num_new_message 86 | 87 | for node in self.node_list: 88 | node_new_message = new_message_distribution[node.name] 89 | node.process_rate = 1 - 0.005 * node_new_message 90 | if(node.process_rate < 0.2): 91 | node.process_rate = 0.2 92 | node.num_total_message += node_new_message 93 | ''' 94 | 95 | def step(self,time): 96 | #self.complete_task #job_name:[task_name, task_name, ... 
] 97 | 98 | #for job_name, task_list in self.completed_task.items(): 99 | # self.task_distribution[job_name][node.name] -= len(task_list) 100 | 101 | return self.completed_task 102 | 103 | def complete_task(self,task,node): 104 | task_map = self.task_distribution[task.job_name] 105 | task_map[node.name] -= 1 106 | if(task_map[node.name] == 0): 107 | del task_map[node.name] 108 | if(len(task_map) == 0): 109 | del self.task_distribution[task.job_name] 110 | 111 | def find_task_on_node(self, worker_ip): 112 | for node in self.node_list: 113 | if(worker_ip == node.ip): 114 | return node 115 | 116 | def find_node(self, task): 117 | demand = task.demand 118 | min_affinity = len(demand) + 1 119 | selected_node = None 120 | available = True 121 | for node in self.node_list: 122 | cur_affinity = 0 123 | available = True 124 | for i in range(len(demand)): 125 | if(demand[i] < node.av_resource[i]): 126 | cur_affinity += node.av_resource[i] - demand[i] 127 | else: 128 | available = False 129 | if(available and cur_affinity < min_affinity): 130 | min_affinity = cur_affinity 131 | selected_node = node 132 | 133 | return selected_node 134 | 135 | def find_minload_node(self): 136 | num_rtype = len(self.node_list[0].av_resource) 137 | minload = 0 138 | first_node = self.node_list[0] 139 | selected_node = first_node 140 | for i in range(num_rtype): 141 | minload += 1 - first_node.av_resource[i] 142 | 143 | for node in self.node_list: 144 | cur_load = 0 145 | for i in range(num_rtype): 146 | cur_load += node.av_resource[i] 147 | if(cur_load < minload): 148 | minload = cur_load 149 | selected_node = node 150 | return selected_node 151 | 152 | 153 | class Node: 154 | 155 | def __init__(self, name, num_rtype, ip): 156 | self.localIP = socket.gethostbyname(socket.gethostname()) 157 | self.ip = ip 158 | self.master = (self.localIP == self.ip) 159 | self.job_list = [] 160 | self.queue = [] 161 | self.workload = None 162 | self.cluster = None 163 | self.scheduler = None 164 | 165 | 166 | self.name = name 167 | self.server_port = 9999 168 | self.port_availability = {} 169 | for port in range(2222,8888+1): 170 | self.port_availability[port] = True 171 | 172 | self.av_resource = [] 173 | for i in range(num_rtype): 174 | self.av_resource.append(100) 175 | 176 | self.num_total_message = 0 177 | self.num_exe_task = 0 178 | self.num_total_task = 0 179 | self.task_list = [] 180 | if(not self.master): 181 | start_server_cmd = '/home/ubuntu/TF-scheduler/remote_start_slave '+ip 182 | thread = threading.Thread(target = execute_command, args = (start_server_cmd,), name = 'Slave-Thread'+self.ip) 183 | thread.start() 184 | 185 | def start_master(self, workload, cluster, scheduler): 186 | 187 | localIP = socket.gethostbyname(socket.gethostname()) 188 | Master(localIP, 9999, workload, cluster, scheduler).start() 189 | 190 | 191 | 192 | ''' 193 | def process(self): 194 | completed_task = {} #job_name:[task_name, task_name, ... 
] 195 | len_task_list = len(self.task_list) 196 | pop_index = [] 197 | max_resource = 0 198 | num_rtype = len(self.av_resource) 199 | for i in range(num_rtype): 200 | if(max_resource < 1 - self.av_resource[i]): 201 | max_resource = 1 - self.av_resource[i] 202 | 203 | true_process_rate = self.process_rate 204 | 205 | if(max_resource > 1): 206 | true_process_rate *= 1 / max_resource 207 | 208 | for i in range(len_task_list): 209 | task = self.task_list[i] 210 | if(task.duration - true_process_rate <= 0): 211 | if(task.job_name not in completed_task): 212 | completed_task[task.job_name] = [] 213 | self.num_exe_task -= 1 214 | task.duration = 0 215 | task.executing = False 216 | task.complete = True 217 | 218 | completed_task[task.job_name].append(task.task_name) 219 | 220 | num_rtype = len(self.av_resource) 221 | for j in range(num_rtype): 222 | self.av_resource[j] += task.demand[j] 223 | if(self.av_resource[j] > 1): 224 | self.av_resource[j] = 1 225 | pop_index.append(i) 226 | 227 | 228 | else: 229 | task.duration -= true_process_rate 230 | 231 | for ind in sorted(pop_index, reverse = True): 232 | self.task_list.pop(ind) 233 | 234 | return completed_task 235 | ''' 236 | 237 | def get_available_port(self): 238 | for port in range(2222,8888+1): 239 | if(self.port_availability[port]): 240 | self.port_availability[port] = False 241 | return str(port) 242 | return -1 243 | 244 | def execute_task(self, task): 245 | 246 | worker_ip = task.worker_ip 247 | ps_port = task.ps_port 248 | ps_ip = task.ps_ip 249 | 250 | request = 'execute,'+ps_ip+':'+str(ps_port)+','+worker_ip+':'+str(task.occupied_worker_port)+','+task.job_name+','+str(task.index) 251 | 252 | response = TCP_request(ps_ip, 9999, request) 253 | print('execute task of '+task.job_name+' on worker: '+worker_ip+':'+str(task.occupied_worker_port)) 254 | 255 | def add_task(self, task, cur_timestep): 256 | task.in_queue = False 257 | task.executing = True 258 | self.job_list.append(task.job_name) 259 | task.waiting_time = cur_timestep - task.arriving_time 260 | self.num_exe_task += 1 261 | self.num_total_task += 1 262 | #num_rtype = len(self.av_resource) 263 | #for i in range(num_rtype): 264 | # self.av_resource[i] -= task.demand[i] 265 | self.task_list.append(task) 266 | self.execute_task(task) 267 | 268 | def is_started(self): 269 | request = 'is_started' 270 | response = 'not start' 271 | try: 272 | response = TCP_request(self.ip, 9999, request) 273 | finally: 274 | return (response == 'start') 275 | def clear_removed_task(self): 276 | if(len(self.queue) == 0): 277 | return 278 | pop_index = [] 279 | for i in range(len(self.queue)): 280 | task = self.queue[i] 281 | if(task.complete): 282 | pop_index.append(i) 283 | 284 | for index in sorted(pop_index, reverse = True): 285 | self.queue.pop(index) 286 | 287 | 288 | def update_av_resource(self,cur_timestep): 289 | 290 | while(not self.is_started()): 291 | print(self.ip+' does not start, waiting...') 292 | sleep(5) 293 | 294 | request = 'cpu' 295 | response = TCP_request(self.ip, 9999, request) 296 | CPU = float(response) 297 | request = 'memory' 298 | response = TCP_request(self.ip, 9999, request) 299 | Memory = float(response) 300 | self.av_resource[0] = 100 - CPU 301 | self.av_resource[1] = 100 - Memory 302 | 303 | for task in self.queue: 304 | if(task.job_name not in self.job_list): 305 | self.add_task(task,cur_timestep) 306 | 307 | 308 | 309 | def create_node(node_name, num_rtype, ip): 310 | node = Node(node_name, num_rtype, ip) 311 | return node 312 | 313 | def create_cluster(max_rack, 
max_rack_size, max_num_nodes, num_rtype, ip_list): 314 | cur_num_node = 0 315 | cluster = Cluster(max_rack, max_rack_size) 316 | 317 | for i in range(max_num_nodes): 318 | cur_num_node += 1 319 | node_name = 'node'+str(cur_num_node) 320 | node = create_node(node_name,num_rtype, ip_list[i]) 321 | rack = random.randint(0,max_rack-1) 322 | cluster.add_node(node,rack) 323 | 324 | return cluster 325 | 326 | def display_available_port(port_availability): 327 | for port in range(2222,8888+1): 328 | if(not port_availability[port]): 329 | continue 330 | else: 331 | print('available ports: '+str(port)+'-8888') 332 | break 333 | 334 | def display_node(node): 335 | print('node name: '+node.name) 336 | print('node ip: '+node.ip) 337 | display_available_port(node.port_availability) 338 | print('available resource percent: ') 339 | num_rtype = len(node.av_resource) 340 | for i in range(num_rtype): 341 | print(str(node.av_resource[i])) 342 | if(len(node.task_list) != 0): 343 | print('task list') 344 | for task in node.task_list: 345 | print('task '+str(task.task_name)+'\tjob '+task.job_name) 346 | else: 347 | print('task list empty') 348 | 349 | if(len(node.queue) != 0): 350 | print('node queue') 351 | for task in node.queue: 352 | print('task '+str(task.task_name)+'\tjob '+task.job_name) 353 | else: 354 | print('node queue empty') 355 | 356 | if(len(node.job_list) != 0): 357 | print('node job list') 358 | for job_name in node.job_list: 359 | print(job_name) 360 | else: 361 | print('node job list empty') 362 | print('\n') 363 | 364 | 365 | def display_cluster(cluster, num_rtype): 366 | print('\nnumber of nodes: '+str(len(cluster.node_list))) 367 | for node in cluster.node_list: 368 | display_node(node) 369 | 370 | def find_node(cluster, target_ip): 371 | for node in cluster.node_list: 372 | if(node.ip == target_ip): 373 | return node 374 | 375 | if __name__ == "__main__": 376 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 377 | 378 | ip_list = [] 379 | for ip in servers.values(): 380 | ip_list.append(ip) 381 | 382 | max_num_rack = 3 383 | max_rack_size = 5 384 | max_num_nodes = len(servers) 385 | num_rtype = 2 386 | 387 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 388 | display_cluster(cluster,2) -------------------------------------------------------------------------------- /Simu/completion_task.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import threading 3 | import server 4 | #TCP_request(ip,port,request): 5 | 6 | shared_resource_lock = threading.Lock() 7 | 8 | def stop_ps(ps_host): 9 | cmd = "kill -9 $(ps -ef | grep " + str(ps_host) + " | grep 'job_name=ps' | awk '{print $2}')" 10 | server.execute_command(cmd) 11 | 12 | 13 | def completion_task(job_name, task_index, worker_ip, worker_port, update_loss, job_dir, cur_serial): 14 | ip = '172.31.6.117' 15 | port = '9999' 16 | request = 'task_completion,'+job_name+','+task_index+','+worker_ip+','+worker_port+','+update_loss+','+job_dir+','+str(cur_serial) 17 | response = server.TCP_request(ip,port,request) 18 | print(response) 19 | -------------------------------------------------------------------------------- /Simu/computation.py: -------------------------------------------------------------------------------- 1 | import math 2 | def average(a): 3 | total = 0 4 | num_element = len(a) 5 | for element in a: 6 | total += element 7 | 8 | return total / num_element 9 | 10 | def std_dev(a): 11 | aver = average(a) 12 | 
total = 0 13 | for element in a: 14 | total += (element - aver) * (element - aver) 15 | 16 | return math.sqrt(total) 17 | 18 | def get_num_message_list(cluster): 19 | num_message_list = [] 20 | for node in cluster.node_list: 21 | num_message_list.append(node.num_total_message) 22 | return num_message_list 23 | 24 | def get_num_task_list(cluster): 25 | num_task_list = [] 26 | for node in cluster.node_list: 27 | num_task_list.append(node.num_total_task) 28 | return num_task_list 29 | 30 | def get_remaining_time_list(queue,timestep): 31 | remaining_time_list = [] 32 | for task in queue: 33 | remaining_time_list.append(task.arriving_time + task.deadline - timestep) 34 | return remaining_time_list 35 | 36 | def get_input_size_list(queue): 37 | input_size_list = [] 38 | for task in queue: 39 | input_size_list.append(task.input_size) 40 | return input_size_list 41 | 42 | def node_reward(W_c_m, W_c_t, t,cluster): 43 | ''' 44 | node.num_total_message 45 | node.num_exe_task 46 | node.num_total_task 47 | ''' 48 | reward_t = 0 49 | num_task_list = get_num_task_list(cluster) 50 | num_message_list = get_num_message_list(cluster) 51 | 52 | aver_num_task = average(num_task_list) 53 | std_num_task = std_dev(num_task_list) 54 | 55 | aver_num_message = average(num_message_list) 56 | std_num_message = std_dev(num_message_list) 57 | 58 | for node in cluster.node_list: 59 | r1 = 0 60 | r2 = 0 61 | 62 | if(std_num_message != 0): 63 | r1 = (node.num_total_message - aver_num_message) / std_num_message 64 | 65 | if(std_num_task != 0): 66 | r2 = (aver_num_task - node.num_total_task) / std_num_task 67 | 68 | reward_t += W_c_m * r1 + W_c_t * r2 69 | 70 | return reward_t 71 | 72 | 73 | 74 | def queue_reward(W_q_d, W_q_s, W_q_r, t, aver_S, scheduler, beta, queue, workload): 75 | 76 | reward_t = 0 77 | input_size_list = get_input_size_list(queue) 78 | remaining_time_list = get_remaining_time_list(queue,t) 79 | 80 | 81 | aver_remaining_time = average(remaining_time_list) 82 | std_remaining_time = std_dev(remaining_time_list) 83 | 84 | aver_input_size = average(input_size_list) 85 | std_input_size = std_dev(input_size_list) 86 | 87 | 88 | for task in queue: 89 | job = workload[int(task.job_name[3])] 90 | num_scheduled_task = scheduler.get_num_scheduled_task(job) 91 | remaining_time = task.arriving_time + task.deadline - t 92 | r1 = 0 93 | r2 =0 94 | r3 = 0 95 | 96 | if(std_remaining_time != 0): 97 | r1 = W_q_d * (remaining_time - aver_remaining_time) / std_remaining_time 98 | 99 | if(std_input_size != 0): 100 | r2 = W_q_s * (task.input_size - aver_input_size / std_input_size) 101 | 102 | r3 = W_q_r * (100 - num_scheduled_task*beta)/beta 103 | 104 | reward_t += r1 + r2 + r3 105 | 106 | return reward_t 107 | 108 | 109 | def discounted_reward(gama,t,reward_traj): 110 | acc_reward = 0 111 | for ti in range(t): 112 | acc_reward += pow(gama,ti) * reward_traj[ti] 113 | return acc_reward -------------------------------------------------------------------------------- /Simu/epoch_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | import time 5 | 6 | from statistics import * 7 | from workload import * 8 | from cluster import * 9 | from scheduler import * 10 | #from task import * 11 | from RLmodel import * 12 | from computation import * 13 | import threading 14 | import completion_task 15 | 16 | 17 | max_res_demand = 0.3 18 | min_res_demand = 0.1 19 | min_in_size = 100 20 | max_in_size = 1000 21 | min_task_dur = 10 22 | max_task_dur = 50 23 | 
max_length_q = 10000 24 | max_num_nodes = 3 25 | min_task = 3600 26 | max_task = 3600 27 | 28 | max_rack_size = 5 29 | max_num_rack = 3 30 | num_rtype = 2 31 | 32 | max_q_length = 10000 33 | max_num_task = 200 34 | min_num_task = 20 35 | min_ddl = max_task_dur * min_task 36 | max_ddl = max_task_dur * max_task 37 | add_task_perc = 0.1 38 | beta = 10 #percentage of tasks one time 39 | 40 | W_q_d = 0.5 41 | W_q_s = 0.5 42 | W_q_r = 0.5 43 | 44 | W_c_m = 0.5 45 | W_c_t = 0.5 46 | 47 | gama_q = 0.9 48 | gama_n = 0.9 49 | learning_rate = 0.1 50 | 51 | remove_threshold = 0.1 52 | 53 | num_coming_job = 5 54 | 55 | remove_option = True 56 | order_net_option = True 57 | node_net_option = True 58 | 59 | node_net_in_dim = 2 * max_num_nodes 60 | node_net_out_dim = max_num_nodes 61 | 62 | order_net_in_dim = 3 * max_q_length 63 | order_net_out_dim = max_q_length 64 | 65 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 66 | 67 | ip_list = [] 68 | for ip in servers.values(): 69 | ip_list.append(ip) 70 | 71 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 72 | 73 | new_job = True 74 | 75 | scheduler = Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 76 | 77 | num_episodes = 1 78 | num_iteration = 1 79 | 80 | wl_len = 1 81 | 82 | Gandiva = False 83 | 84 | ps_ip = socket.gethostbyname(socket.gethostname()) 85 | ps_node = find_node(cluster, ps_ip) 86 | master = None 87 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 88 | for node in cluster.node_list: 89 | if(node.master): 90 | master = node 91 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 92 | thread.start() 93 | break 94 | 95 | timer = [] 96 | #workloads = [] 97 | #for i in range(num_episodes): 98 | # random.shuffle(workload) 99 | # workload_copy = copy_workload(workload) 100 | # workloads.append(workload_copy) 101 | 102 | cluster.node_list[0].update_av_resource() 103 | node1 = None 104 | node2 = None 105 | 106 | for node in cluster.node_list: 107 | if(node.ip == '172.31.4.135'): 108 | node1 = node 109 | elif(node.ip == '172.31.3.225'): 110 | node2 = node 111 | 112 | num_task = 1 113 | for task in workload[0].task_list: 114 | if(num_task % 2 == 1): 115 | task.worker_ip = node1.ip 116 | task.ps_port = master.get_available_port() 117 | task.occupied_worker_port = node1.get_available_port() 118 | node1.queue.append(task) 119 | else: 120 | task.worker_ip = node2.ip 121 | task.ps_port = master.get_available_port() 122 | task.occupied_worker_port = node2.get_available_port() 123 | node2.queue.append(task) 124 | num_task += 1 125 | 126 | while(True): 127 | cluster.process(0) 128 | time.sleep(8) 129 | 130 | -------------------------------------------------------------------------------- /Simu/server.py: -------------------------------------------------------------------------------- 1 | 2 | import socket 3 | import threading 4 | import sys 5 | import random 6 | #!/usr/bin/env python 7 | import psutil 8 | import os 9 | import completion_task 10 | from workload import * 11 | from cluster import * 12 | 13 | def get_files(job_dir, file_keyword): 14 | files = [] 15 | for file in os.listdir(job_dir): 16 | if(os.path.isfile(os.path.join(job_dir,file)) and file.startswith(file_keyword)): 17 | files.append(file) 18 | return files 
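# --- Illustrative sketch (added for clarity; not part of the original file). ---
# The master and the slaves talk over a simple line-oriented text protocol on
# port 9999: a client sends a plain-text command such as 'cpu', 'memory',
# 'is_started', or a comma-separated request like
# 'execute,<ps_ip>:<ps_port>,<worker_ip>:<worker_port>,<job_name>,<task_index>',
# and receives a short string reply (see TCP_request() and the Master/Slave
# classes below). A hypothetical helper that probes a node's load with this
# protocol could look like the following:
def probe_node_load(ip, port=9999):
    # Returns (cpu_percent, memory_percent) as reported by the node's server.
    cpu = float(TCP_request(ip, port, 'cpu'))
    mem = float(TCP_request(ip, port, 'memory'))
    return cpu, mem
# --- End of illustrative sketch. ---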
19 |
20 |
21 | def del_old_model(job_dir, cur_serial, task_index):
22 | files1 = get_files(job_dir, "model.ckpt")
23 | files2 = get_files(job_dir, "latest_model")
24 | for file in files1:
25 | partition = file.split('.')
26 | part2 = partition[1]
27 | serial_number = int(part2[5:])
28 | if(cur_serial - serial_number >=50):
29 | os.remove(job_dir+"/"+file)
30 | for file in files2:
31 | partition = file.split('.')
32 | part1 = partition[0]
33 | epoch = int(part1.split('_')[3][5:])
34 | if(epoch != task_index):
35 | os.remove(job_dir+"/"+file)
36 |
37 | def getbytes(string):
38 | return string.encode()
39 |
40 | def getstring(byte):
41 | return byte.decode()
42 |
43 | def TCP_request(ip,port,request):
44 | c_s = socket.socket()
45 | response = 'request failed'
46 | try:
47 | c_s.connect((ip,int(port)))
48 | byte_request = getbytes(request)
49 | c_s.send(byte_request)
50 | byte_response = c_s.recv(1024)
51 | response = getstring(byte_response)
52 |
53 | except Exception as e:
54 | print('ip: '+ip+"\tport: "+str(port)+'\t request: '+request)
55 | print(e)
56 |
57 | finally:
58 | return response
59 |
60 | def execute_command(cmd):
61 | os.system(cmd)
62 |
63 |
64 | class Master:
65 | def __init__(self, ip, port, workload, cluster, scheduler):
66 | self.workload = workload
67 | self.sock = socket.socket()
68 | self.ip = ip
69 | self.port = port
70 | self.sock.bind((ip, int(port)))
71 | self.sock.listen(5)
72 | self.cluster = cluster
73 | self.scheduler = scheduler
74 | self.should_stop = False
75 |
76 | def TCP_reply(self, sock, addr):
77 |
78 | #print('new request from ' + str(addr) + ' is accepted')
79 | while (not self.should_stop):
80 | response = ''
81 | b_data=sock.recv(1024)
82 | data = getstring(b_data)
83 | if (data =='exit' or not data):
84 | break;
85 |
86 | elif(data =='ip'):
87 | response = str(self.ip)
88 |
89 | elif(data =='port'):
90 | response = str(self.port)
91 |
92 | elif(data == 'is_started'):
93 | response = 'start'
94 |
95 | elif(data == 'cpu'):
96 | cpu_usage = psutil.cpu_percent()
97 | response = str(cpu_usage)
98 |
99 | elif(data == 'memory'):
100 | vm_dic = dict(psutil.virtual_memory()._asdict())
101 | memory_usage = vm_dic['percent']
102 | response = str(memory_usage)
103 |
104 | elif(data.startswith('loss')):
105 | with open('/home/ubuntu/TF-scheduler/loss_folder/'+data,'r') as f:#data looks like: loss_jobname
106 | response = f.readline()
107 |
108 | elif(data.startswith('accuracy')):
109 | with open('/home/ubuntu/TF-scheduler/accuracy_folder/'+data,'r') as f:#data looks like: accuracy_jobname
110 | response = f.readline()
111 | elif(data.startswith('cluster')):
112 | response = 'nodes information'
113 | for node in self.cluster.node_list:
114 | response += 'node name: '+node.name+'\n'
115 | response += 'node ip: '+node.ip+'\n'
116 |
117 | if(len(node.task_list) != 0):
118 | response += 'task list\n'
119 | for task in node.task_list:
120 | response += 'task '+str(task.task_name)+'\tjob '+task.job_name+'\n'
121 | else:
122 | response += 'task list empty'
123 |
124 | if(len(node.queue) != 0):
125 | response += 'node queue\n'
126 | for task in node.queue:
127 | response += 'task '+str(task.task_name)+'\tjob '+task.job_name+'\n'
128 | else:
129 | response += 'node queue empty\n'
130 |
131 |
132 | elif(data.startswith('execute')):
133 | info = data.split(',')
134 | ps_addr = info[1]
135 | ps_ip = ps_addr.split(':')[0]
136 | ps_port = ps_addr.split(':')[1]
137 | worker_addr = info[2]
138 | worker_ip = worker_addr.split(':')[0]
139 | worker_port = worker_addr.split(':')[1]
140
| job_name = info[3]
141 | task_index = info[4]
142 | cmd = '/home/ubuntu/TF-scheduler/remote_task_executor '+ps_ip+' '+ps_port+' '+worker_ip+' '+worker_port+' '+job_name+' '+task_index
143 |
144 | thread = threading.Thread(target=execute_command, args=(cmd,), name = 'Task-Execution-Thread-'+worker_ip+'-'+job_name+'-'+task_index)
145 | thread.start()
146 | response = 'executing the task of '+job_name
147 |
148 | elif(data.startswith('checkpoint')):
149 | job_dir = data.split(',')[1]
150 | if(not os.path.exists(job_dir+'/checkpoint')):
151 | response = 'none'
152 | else:
153 | with open(job_dir+'/checkpoint', 'r') as f:
154 | response = f.read()
155 |
156 | elif(data.startswith('update_checkpoint')):
157 | partition = data.split(',')
158 | checkpoint_info = partition[1]
159 | job_dir = partition[2]
160 | with open(job_dir+'/checkpoint', 'w') as f:
161 | f.write(checkpoint_info)
162 | if(os.path.exists(job_dir+'/checkpoint')):
163 | response = 'update checkpoint file success!'
164 | else:
165 | response = 'update failed'
166 |
167 | elif(data.startswith('task_completion')):
168 | info = data.split(',') #task_completion,job_name,task_index,worker_ip,worker_port,update_loss,job_dir,cur_serial
169 | job_name = info[1]
170 | job_index = int(job_name[3])
171 | task_index = int(info[2])
172 | worker_ip = info[3]
173 | worker_port = info[4]
174 | update_loss = info[5]
175 | job_dir = info[6]
176 | cur_serial = int(info[7])
177 |
178 | job = self.workload[job_index]
179 | task = job.task_list[task_index]
180 |
181 | if(update_loss == 'YES'):
182 | request = 'loss_'+job_name
183 | response = TCP_request(worker_ip, 9999, request)
184 | cur_loss = float(response)
185 | job.loss_traj.append(cur_loss)
186 |
187 | del_old_model(job_dir, cur_serial, task_index)
188 |
189 | task.executing = False
190 | task.complete = True
191 |
192 | completion_task.shared_resource_lock.acquire()
193 | if(job_name not in self.cluster.completed_task):
194 | self.cluster.completed_task[job_name] = []
195 |
196 | self.cluster.completed_task[job_name].append(task.task_name)
197 |
198 | node = self.cluster.find_task_on_node(worker_ip)
199 | for i in range(len(node.task_list)):
200 | cur_task = node.task_list[i]
201 | if(cur_task.job_name == task.job_name and
202 | cur_task.task_name == task.task_name):
203 | node.task_list.pop(i)
204 | break
205 | for i in range(len(node.job_list)):
206 | cur_job_name = node.job_list[i]
207 | if(cur_job_name == task.job_name):
208 | node.job_list.pop(i)
209 | break
210 | #display_cluster(self.cluster, 2)
211 | completion_task.shared_resource_lock.release()
212 | print('port number is '+str(task.ps_port))
213 | response = 'processed task completion!'
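# NOTE (clarifying comment added for this listing; not in the original source):
# this branch is triggered by completion_task.completion_task(), which a worker
# calls when one of its tasks finishes. The request has the form
# 'task_completion,<job_name>,<task_index>,<worker_ip>,<worker_port>,<update_loss>,<job_dir>,<cur_serial>'.
# The master optionally refreshes the job's loss trajectory, prunes old model
# checkpoints, records the task under cluster.completed_task and removes it
# from the executing node's bookkeeping while holding shared_resource_lock,
# and finally uses stop_ps() below to kill the parameter-server process that
# was started for this task.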
214 | completion_task.stop_ps(task.ps_port) 215 | 216 | b_response = getbytes(response) 217 | sock.send(b_response) 218 | sock.close() 219 | #print('Connection from %s:%s closed' % addr) 220 | 221 | def start(self): 222 | while (True): 223 | c_sock, addr = self.sock.accept() 224 | thread = threading.Thread(target=self.TCP_reply, args=(c_sock,addr), name = 'Reuqest-Listening-Thread-'+str(addr)) 225 | thread.start() 226 | 227 | class Slave: 228 | def __init__(self, ip, port): 229 | self.sock = socket.socket() 230 | self.ip = ip 231 | self.port = str(port) 232 | self.sock.bind((ip, int(port))) 233 | self.sock.listen(5) 234 | self.should_stop = False 235 | 236 | 237 | def TCP_reply(self, sock, addr): 238 | 239 | #print('new request from ' + str(addr) + ' is accepted') 240 | while (not self.should_stop): 241 | response = '' 242 | b_data=sock.recv(1024) 243 | data = getstring(b_data) 244 | if (data =='exit' or not data): 245 | break; 246 | 247 | elif(data =='ip'): 248 | response = str(self.ip) 249 | 250 | elif(data =='port'): 251 | response = str(self.port) 252 | 253 | elif(data == 'is_started'): 254 | response = 'start' 255 | 256 | elif(data == 'cpu'): 257 | cpu_usage = psutil.cpu_percent() 258 | response = str(cpu_usage) 259 | 260 | elif(data == 'memory'): 261 | vm_dic = dict(psutil.virtual_memory()._asdict()) 262 | memory_usage = vm_dic['percent'] 263 | response = str(memory_usage) 264 | 265 | elif(data.startswith('loss')): 266 | with open('/home/ubuntu/TF-scheduler/loss_folder/'+data,'r') as f:#data looks like: loss_jobname 267 | response = f.readline() 268 | 269 | elif(data.startswith('accuracy')): 270 | with open('/home/ubuntu/TF-scheduler/accuracy_folder/'+data,'r') as f:#data looks like: accuracy_jobname 271 | response = f.readline() 272 | 273 | elif(data.startswith('execute')): 274 | info = data.split(',') 275 | ps_addr = info[1] 276 | ps_ip = ps_addr.split(':')[0] 277 | ps_port = ps_addr.split(':')[1] 278 | worker_addr = info[2] 279 | worker_ip = worker_addr.split(':')[0] 280 | worker_port = worker_addr.split(':')[1] 281 | job_name = info[3] 282 | task_index = info[4] 283 | cmd = '/home/ubuntu/TF-scheduler/remote_task_executor '+ps_ip+' '+ps_port+' '+worker_ip+' '+worker_port+' '+job_name+' '+task_index 284 | 285 | thread = threading.Thread(target=execute_command, args=(cmd,), name = 'Task-Execution-Thread-'+worker_ip+'-'+job_name+'-'+task_index) 286 | thread.start() 287 | response = 'executing the task of '+job_name 288 | 289 | b_response = getbytes(response) 290 | sock.send(b_response) 291 | sock.close() 292 | #print('Connection from %s:%s closed' % addr) 293 | 294 | def start(self): 295 | while (True): 296 | c_sock, addr = self.sock.accept() 297 | thread = threading.Thread(target=self.TCP_reply, args=(c_sock,addr), name = 'Reuqest-Listening-Thread-'+str(addr)) 298 | thread.start() 299 | 300 | 301 | if (__name__ == '__main__'): 302 | #print(len(sys.argv)) 303 | if(len(sys.argv) != 2): 304 | print('usage: python server master/slave') 305 | print('exit') 306 | sys.exit(0) 307 | role = sys.argv[1] 308 | localIP = socket.gethostbyname(socket.gethostname()) 309 | node = None 310 | 311 | if(role == 'master'): 312 | print('master') 313 | node = Master(localIP,9999) 314 | print('register') 315 | node.start() 316 | print('listening') 317 | 318 | elif(role == 'slave'): 319 | print('slave') 320 | node = Slave(localIP,9999) 321 | print('register') 322 | node.start() 323 | print('listening') 324 | 325 | else: 326 | print('request') 327 | response = TCP_request(localIP,9999,sys.argv[1]) 328 | 
print("response is: "+response) 329 | 330 | 331 | 332 | 333 | -------------------------------------------------------------------------------- /Simu/start_pytorch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python trainer.py \ 3 | --ps_hosts=172.31.6.117:2222\ 4 | --worker_hosts=172.31.15.67:2222\ 5 | --job_name=worker --task_index=0 6 | -------------------------------------------------------------------------------- /Simu/statistics.py: -------------------------------------------------------------------------------- 1 | def show_statistics(workload,cluster,timer): 2 | 3 | total_complete_time = 0 4 | num_deadline_satisfied = 0 5 | total_num_message = 0 6 | 7 | total_waiting_time = [] #0:size 100-300, 1: size 300-500, 2: size 500-700, 3: size 700-900 8 | num_job = [] #0:size 100-300, 1: size 300-500, 2: size 500-700, 3: size 700-900 9 | for i in range(4): 10 | total_waiting_time.append(0) 11 | num_job.append(0) 12 | 13 | acc_reduction = [] #0:0-0.15, 1:0.15-0.3, 2:0.3-0.45, 4:0.45-0.6, 5:0.6-0.75, 6:0.75-1 14 | aver_jct_removed_ratio = [] #0:0-0.15, 1:0.15-0.3, 2:0.3-0.45, 3:0.45-0.6, 4:0.6-0.75, 5:0.75-1 15 | num_job_removed_ratio = [] #0:0-0.15, 1:0.15-0.3, 2:0.3-0.45, 3:0.45-0.6, 4:0.6-0.75, 5:0.75-1 16 | 17 | for i in range(6): 18 | acc_reduction.append(0) 19 | aver_jct_removed_ratio.append(0) 20 | num_job_removed_ratio.append(0) 21 | 22 | 23 | #total_task = 0 24 | JCTs = [] 25 | for job in workload: 26 | arriving_time = job.task_list[0].arriving_time 27 | complete_time = job.complete_time 28 | JCTs.append(complete_time - arriving_time) 29 | 30 | num_task = len(job.task_list) 31 | job_waiting_time = 0 32 | 33 | for task in job.task_list: 34 | #total_task += 1 35 | job_waiting_time += task.waiting_time 36 | 37 | if(job.input_size<325):#100-325 38 | total_waiting_time[0] += job_waiting_time / num_task 39 | num_job[0] += 1 40 | elif(job.input_size<550):#325-550 41 | total_waiting_time[1] += job_waiting_time / num_task 42 | num_job[1] += 1 43 | elif(job.input_size<775):#550-775 44 | total_waiting_time[2] += job_waiting_time / num_task 45 | num_job[2] += 1 46 | elif(job.input_size<1000):#775-1000 47 | total_waiting_time[3] += job_waiting_time / num_task 48 | num_job[3] += 1 49 | 50 | removed_ratio = job.num_removed_task / len(job.task_list) 51 | total_num_task = len(job.task_list) 52 | num_removed_task = job.num_removed_task 53 | 54 | if(removed_ratio < 0.15): 55 | acc_reduction[0] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 56 | aver_jct_removed_ratio[0] += complete_time - arriving_time 57 | num_job_removed_ratio[0] += 1 58 | elif(removed_ratio < 0.3): 59 | acc_reduction[1] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 60 | aver_jct_removed_ratio[1] += complete_time - arriving_time 61 | num_job_removed_ratio[1] += 1 62 | elif(removed_ratio < 0.45): 63 | acc_reduction[2] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 64 | aver_jct_removed_ratio[2] += complete_time - arriving_time 65 | num_job_removed_ratio[2] += 1 66 | elif(removed_ratio < 0.6): 67 | acc_reduction[3] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 68 | aver_jct_removed_ratio[3] += complete_time - arriving_time 69 | num_job_removed_ratio[3] += 1 70 | elif(removed_ratio < 0.75): 71 | acc_reduction[4] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 72 | 
aver_jct_removed_ratio[4] += complete_time - arriving_time 73 | num_job_removed_ratio[4] += 1 74 | elif(removed_ratio < 1): 75 | acc_reduction[5] += 1 - (job.initial_loss - 0.35246) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 76 | aver_jct_removed_ratio[5] += complete_time - arriving_time 77 | num_job_removed_ratio[5] += 1 78 | 79 | jct = complete_time - arriving_time 80 | total_complete_time += jct 81 | 82 | if(jct < job.deadline): 83 | num_deadline_satisfied += 1 84 | 85 | sorted_JCTs = sorted(JCTs) 86 | num_job = len(workload) 87 | for i in range(num_job): 88 | jct = sorted_JCTs[i] 89 | perc = ((i+1)/num_job)*100 90 | print('CDF point( '+str(jct)+', '+str(perc)+'%)') 91 | 92 | avg_waiting_time = 0 93 | 94 | deadline_gurantee = num_deadline_satisfied / len(workload) 95 | print('job deadline gurantee = '+str(deadline_gurantee)) 96 | 97 | print('remove ratio: 0-15%, 15-30%, 30-45%, 45-60%, 60-75%') 98 | 99 | aver_job_acc_red = 0 100 | 101 | for i in range(6): 102 | if(num_job_removed_ratio[i] == 0): 103 | acc_reduction[i] = 0 104 | aver_jct_removed_ratio[i] = 0 105 | else: 106 | acc_reduction[i] /= num_job_removed_ratio[i] 107 | aver_jct_removed_ratio[i] /= num_job_removed_ratio[i] 108 | aver_job_acc_red += acc_reduction[i] 109 | 110 | print('percentage of this part: '+str(num_job_removed_ratio[i] * 100 / len(workload))+'%') 111 | print('job accuracy reduction = '+str(acc_reduction[i] / len(workload))) 112 | print('average JCT over removed ratio = '+str(aver_jct_removed_ratio[i])) 113 | 114 | print('average job accuracy reduction: '+str(aver_job_acc_red / len(workload))) 115 | 116 | for i in range(4): 117 | if(num_job[i] != 0): 118 | avg_waiting_time += total_waiting_time[i] 119 | total_waiting_time[i] /= num_job[i] 120 | print('job waiting time = '+str(total_waiting_time[i])) 121 | print('average job waiting time = '+str(avg_waiting_time/len(workload))) 122 | average_jct = total_complete_time / len(workload) 123 | print('average jct = '+str(average_jct)) 124 | 125 | total_num_message = 0 126 | 127 | for node in cluster.node_list: 128 | total_num_message += node.num_total_message 129 | 130 | avg_message = total_num_message/len(cluster.node_list) 131 | avg_bandwidth = avg_message * 0.0035 132 | print('avg_bandwidth = '+str(avg_bandwidth)) 133 | 134 | total_time = 0 135 | 136 | for time in timer: 137 | total_time += time 138 | aver_time = total_time / len(timer) 139 | print('average latency = '+str(aver_time)) 140 | 141 | 142 | def is_schedule_all_task(job_list): 143 | for job in job_list: 144 | for task in job.task_list: 145 | if(not task.in_queue and not task.executing and not task.complete): 146 | return False 147 | return True 148 | 149 | def schedule_policy(Gandiva, job_list, cluster): 150 | can_schedule = True 151 | if(not Gandiva): 152 | can_schedule = cluster.has_av_res() 153 | else: 154 | can_schedule = not is_schedule_all_task(job_list) 155 | return can_schedule -------------------------------------------------------------------------------- /Simu/task_executor.py: -------------------------------------------------------------------------------- 1 | #from task import * 2 | #from workload import * 3 | import os 4 | 5 | cmd = 'ls' 6 | os.system(cmd) 7 | 8 | -------------------------------------------------------------------------------- /Simu/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | import time 5 | 6 | from statistics import * 7 | from workload import * 8 | from cluster import 
* 9 | from scheduler import * 10 | #from task import * 11 | from RLmodel import * 12 | from computation import * 13 | import threading 14 | import completion_task 15 | 16 | 17 | max_res_demand = 0.3 18 | min_res_demand = 0.1 19 | min_in_size = 100 20 | max_in_size = 1000 21 | min_task_dur = 10 22 | max_task_dur = 50 23 | max_length_q = 100 24 | max_num_nodes = 3 25 | min_task = 10 26 | max_task = 10 27 | 28 | max_rack_size = 5 29 | max_num_rack = 3 30 | num_rtype = 2 31 | 32 | max_q_length = 50 33 | max_num_task = 200 34 | min_num_task = 20 35 | min_ddl = max_task_dur * min_task 36 | max_ddl = max_task_dur * max_task 37 | add_task_perc = 0.1 38 | beta = 10 #percentage of tasks one time 39 | 40 | W_q_d = 0.5 41 | W_q_s = 0.5 42 | W_q_r = 0.5 43 | 44 | W_c_m = 0.5 45 | W_c_t = 0.5 46 | 47 | gama_q = 0.9 48 | gama_n = 0.9 49 | learning_rate = 0.1 50 | 51 | remove_threshold = 0.1 52 | 53 | num_coming_job = 5 54 | 55 | remove_option = True 56 | order_net_option = True 57 | node_net_option = True 58 | 59 | node_net_in_dim = 2 * max_num_nodes 60 | node_net_out_dim = max_num_nodes 61 | 62 | order_net_in_dim = 3 * max_q_length 63 | order_net_out_dim = max_q_length 64 | 65 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 66 | 67 | ip_list = [] 68 | for ip in servers.values(): 69 | ip_list.append(ip) 70 | 71 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 72 | 73 | new_job = True 74 | 75 | scheduler = Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 76 | 77 | num_episodes = 1 78 | num_iteration = 1 79 | 80 | wl_len = 10 81 | 82 | Gandiva = False 83 | 84 | ps_ip = socket.gethostbyname(socket.gethostname()) 85 | ps_node = find_node(cluster, ps_ip) 86 | master = None 87 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 88 | for node in cluster.node_list: 89 | if(node.master): 90 | master = node 91 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 92 | thread.start() 93 | break 94 | 95 | timer = [] 96 | #workloads = [] 97 | #for i in range(num_episodes): 98 | # random.shuffle(workload) 99 | # workload_copy = copy_workload(workload) 100 | # workloads.append(workload_copy) 101 | 102 | cluster.node_list[0].update_av_resource() 103 | 104 | task = workload[0].task_list[0] 105 | task.ps_port = master.get_available_port() 106 | task.occupied_worker_port = 2222 107 | 108 | for node in cluster.node_list: 109 | if(node.ip == '172.31.4.135'): 110 | task.worker_ip = node.ip 111 | node.add_task(task,0) 112 | -------------------------------------------------------------------------------- /Simu/train_RL_model.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | from statistics import * 5 | from workload import * 6 | from cluster import * 7 | from scheduler import * 8 | #from task import * 9 | from RLmodel import * 10 | from computation import * 11 | import threading 12 | import completion_task 13 | 14 | 15 | max_res_demand = 0.3 16 | min_res_demand = 0.1 17 | min_in_size = 100 18 | max_in_size = 1000 19 | min_task_dur = 10 20 | max_task_dur = 50 21 | max_length_q = 100 22 | max_num_nodes = 4 23 | min_task = 30 24 | max_task = 50 25 | 26 | max_rack_size = 5 27 | max_num_rack = 3 28 | num_rtype = 2 29 | 30 | 
max_q_length = 50 31 | max_num_task = 200 32 | min_num_task = 20 33 | min_ddl = max_task_dur * min_task 34 | max_ddl = max_task_dur * max_task 35 | add_task_perc = 0.1 36 | beta = 10 #percentage of tasks one time 37 | 38 | W_q_d = 0.5 39 | W_q_s = 0.5 40 | W_q_r = 0.5 41 | 42 | W_c_m = 0.5 43 | W_c_t = 0.5 44 | 45 | gama_q = 0.9 46 | gama_n = 0.9 47 | learning_rate = 0.1 48 | 49 | remove_threshold = 0.0 50 | 51 | num_coming_job = 5 52 | 53 | remove_option = False 54 | order_net_option = True 55 | node_net_option = True 56 | 57 | node_net_in_dim = 2 * max_num_nodes 58 | node_net_out_dim = max_num_nodes 59 | 60 | order_net_in_dim = 3 * max_q_length 61 | order_net_out_dim = max_q_length 62 | 63 | #servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 64 | servers = {'node1':'172.31.6.117', 'node2':'172.31.44.243', 'node3':'172.31.35.92', 'node4':'172.31.35.219'} 65 | 66 | ip_list = [] 67 | for ip in servers.values(): 68 | ip_list.append(ip) 69 | 70 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 71 | 72 | new_job = True 73 | 74 | scheduler = Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 75 | 76 | num_episodes = 1 77 | num_iteration = 1 78 | 79 | wl_len = 10 80 | 81 | Gandiva = False 82 | 83 | ps_ip = socket.gethostbyname(socket.gethostname()) 84 | ps_node = find_node(cluster, ps_ip) 85 | 86 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 87 | for node in cluster.node_list: 88 | if(node.master): 89 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 90 | thread.start() 91 | break 92 | 93 | timer = [] 94 | #workloads = [] 95 | #for i in range(num_episodes): 96 | # random.shuffle(workload) 97 | # workload_copy = copy_workload(workload) 98 | # workloads.append(workload_copy) 99 | 100 | for iteration in range(num_iteration): 101 | print('iteration: '+str(iteration)) 102 | #workload = generate_workload(wl_len,min_task,max_task,min_in_size,max_in_size,min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype) 103 | unfinished_job_list = workload 104 | queue_reward_traj = [] 105 | node_reward_traj = [] 106 | 107 | queue_sv_traj = [] 108 | node_sv_traj = [] 109 | 110 | queue_discounted_reward_traj = [] 111 | node_discounted_reward_traj = [] 112 | 113 | scheduler.node_network.zero_grad() 114 | scheduler.order_network.zero_grad() 115 | for episode in range(num_episodes): 116 | print('episode: '+str(episode)) 117 | #workload = workloads[episode] 118 | queue_reward_traj.append([]) 119 | node_reward_traj.append([]) 120 | 121 | queue_sv_traj.append([]) 122 | node_sv_traj.append([]) 123 | 124 | queue_discounted_reward_traj.append([]) 125 | node_discounted_reward_traj.append([]) 126 | 127 | cur_timestep = time.clock() 128 | cur_job_num = 0 129 | num_schedule = 0 130 | schedule_interval = 5 131 | 132 | while(len(unfinished_job_list) != 0): 133 | 134 | cur_timestep = time.clock() + num_schedule * schedule_interval 135 | if(cur_job_num < wl_len): 136 | workload[cur_job_num].update_arr_time(cur_timestep) 137 | scheduler.job_arriving(workload[cur_job_num]) 138 | cur_job_num += num_coming_job 139 | if(cur_job_num >= wl_len): 140 | cur_job_num = wl_len 141 | 142 | unfinished_job_list = find_unfinished_jobs(workload, cur_job_num, cur_timestep) 143 | 144 | 
if(len(unfinished_job_list) == 0): 145 | print('all jobs finish') 146 | break 147 | last_add_job = int(unfinished_job_list[0].job_name[3]) 148 | 149 | valid = True 150 | num_decision = 0 151 | 152 | while(valid): 153 | 154 | dead_loop = 0 155 | 156 | can_schedule = schedule_policy(Gandiva, unfinished_job_list, cluster) 157 | 158 | while(can_schedule and len(scheduler.queue) < scheduler.max_q_length): 159 | 160 | dead_loop += 1 161 | 162 | last_add_job = scheduler.fill_queue(unfinished_job_list, last_add_job, dead_loop, remove_threshold, remove_option) 163 | if(last_add_job < 0): 164 | break 165 | 166 | can_schedule = schedule_policy(Gandiva, unfinished_job_list, cluster) 167 | 168 | start = time.clock() 169 | if(not Gandiva): 170 | cluster.process(cur_timestep) 171 | valid, q_sv, n_sv, t_ind, n_ind = scheduler.schedule_one_task(cur_timestep, cluster, order_net_option, node_net_option) 172 | else: 173 | valid = scheduler.Gandiva_schedule_one_task(cur_timestep, cluster) 174 | elapsed = (time.clock() - start) 175 | 176 | 177 | num_decision += 1 178 | 179 | if(len(scheduler.queue) == 0): 180 | #print('schedule all tasks') 181 | break 182 | 183 | if(not valid): 184 | #print('invalid action') 185 | num_decision -= 1 186 | break 187 | 188 | timer.append(elapsed) 189 | if(not Gandiva): 190 | queue_sv_traj[episode].append(q_sv) 191 | node_sv_traj[episode].append(n_sv) 192 | 193 | aver_size = scheduler.average_size() 194 | 195 | beta = scheduler.add_task_perc 196 | queue = scheduler.queue 197 | 198 | q_rt = queue_reward(W_q_d, W_q_s, W_q_r, cur_timestep, aver_size, scheduler, beta, queue, workload) 199 | queue_reward_traj[episode].append(q_rt) 200 | 201 | n_rt = node_reward(W_c_m, W_c_t, cur_timestep, cluster) 202 | node_reward_traj[episode].append(n_rt) 203 | 204 | print('\n\ncurrent time: '+str(cur_timestep)) 205 | 206 | completion_task.shared_resource_lock.acquire() 207 | for job in workload: 208 | job.update_priority() 209 | for node in cluster.node_list: 210 | node.clear_removed_task() 211 | 212 | scheduler.clear_removed_task() 213 | display_scheduler(scheduler) 214 | display_cluster(cluster,2) 215 | for job in workload: 216 | display_job(job) 217 | 218 | print('scheduler now is sleeping... 
itertaion '+str(num_schedule)) 219 | time.sleep(schedule_interval) 220 | 221 | completion_task.shared_resource_lock.release() 222 | num_schedule += 1 223 | 224 | print('finish episode '+str(episode)+', makespan = '+str(cur_timestep)) 225 | 226 | if(not Gandiva): 227 | num_action = len(queue_sv_traj[episode]) 228 | for j in range(num_action): 229 | n_vt = discounted_reward(gama_n, j, node_reward_traj[episode]) 230 | q_vt = discounted_reward(gama_q, j, queue_reward_traj[episode]) 231 | node_discounted_reward_traj[episode].append(n_vt) 232 | queue_discounted_reward_traj[episode].append(q_vt) 233 | 234 | show_statistics(workload,cluster,timer) 235 | timer = [] 236 | 237 | unfinished_job_list = workload 238 | 239 | if(not Gandiva): 240 | num_action = 100000000 241 | 242 | for episode in range(num_episodes): 243 | if(num_action > len(queue_sv_traj[episode])): 244 | num_action = len(queue_sv_traj[episode]) 245 | 246 | q_r_episode_baseline = [] 247 | n_r_episode_baseline = [] 248 | for j in range(num_action): 249 | total_qr_b = 0 250 | total_nr_b = 0 251 | for episode in range(num_episodes): 252 | total_qr_b += queue_discounted_reward_traj[episode][j] 253 | total_nr_b += node_discounted_reward_traj[episode][j] 254 | 255 | q_r_episode_baseline.append(total_qr_b / num_episodes) 256 | n_r_episode_baseline.append(total_nr_b / num_episodes) 257 | 258 | for episode in range(num_episodes): 259 | for j in range(num_action): 260 | q_sv = queue_sv_traj[episode][j] 261 | n_sv = node_sv_traj[episode][j] 262 | q_vt = queue_discounted_reward_traj[episode][j] 263 | n_vt = node_discounted_reward_traj[episode][j] 264 | qr_b = q_r_episode_baseline[j] 265 | nr_b = n_r_episode_baseline[j] 266 | scheduler.order_network.backward(q_sv, q_vt - qr_b, learning_rate) 267 | scheduler.node_network.backward(n_sv, n_vt - nr_b, learning_rate) 268 | 269 | scheduler.node_network.update_grads() 270 | scheduler.order_network.update_grads() 271 | 272 | torch.save(scheduler.node_network.weight, 'node_weight_ep'+str(iteration)+'.para') 273 | torch.save(scheduler.node_network.bias, 'node_bias_ep'+str(iteration)+'.para') 274 | torch.save(scheduler.order_network.weight, 'order_weight_ep'+str(iteration)+'.para') 275 | torch.save(scheduler.order_network.bias, 'order_bias_ep'+str(iteration)+'.para') -------------------------------------------------------------------------------- /Simu/workload.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import socket 4 | from enum import Enum 5 | 6 | class Job_Phase(Enum): 7 | Quick_drop = 0 8 | fluctuation = 1 9 | overfitting = 2 10 | 11 | class Task: 12 | def __init__(self, task_name, job_name, deadline, input_size, duration, demand, ps_port, task_index): 13 | self.task_name = task_name 14 | self.job_name = job_name 15 | self.demand = demand 16 | self.deadline = deadline 17 | self.arriving_time = 0 18 | self.input_size = input_size 19 | self.priority = 1 20 | self.ps_ip = socket.gethostbyname(socket.gethostname()) 21 | self.ps_port = ps_port 22 | self.index = task_index 23 | #self.duration = duration 24 | #self.pre_dur = duration 25 | self.worker_ip = None 26 | self.occupied_worker_port = None 27 | self.waiting_time = 0 28 | self.in_queue = False 29 | self.executing = False 30 | self.complete = False 31 | 32 | class Job: 33 | 34 | def __init__(self, job_name, deadline, tasks, input_size, ps_port): 35 | self.mloss_ind = -1 36 | self.removed_task = [] 37 | self.loss_phase = Job_Phase.Quick_drop 38 | self.pivot_index = 0 39 | 
self.samples = 0 40 | self.job_name = job_name 41 | self.task_list = tasks 42 | self.ps_ip = socket.gethostbyname(socket.gethostname()) 43 | self.ps_port = ps_port 44 | self.input_size = input_size 45 | self.num_task = len(tasks) 46 | self.deadline = deadline 47 | self.complete_time = 0 48 | self.initial_loss = 1.41 # loss of model without training 49 | self.min_loss = self.initial_loss 50 | self.loss_traj = [] 51 | self.num_beta = 0 52 | self.add_task_perc = 0.1 53 | self.num_removed_task = 0 54 | self.initialized = False 55 | #simulation loss trajectory 56 | #for i in range(self.num_task): 57 | # cur_loss = self.initial_loss * math.pow(0.85,i) 58 | # self.loss_traj.append(cur_loss) 59 | 60 | self.arriving_time = 0 61 | self.complete_time = 0 62 | 63 | def cutoff_task(self): 64 | for task in self.task_list: 65 | if(task.in_queue): 66 | task.in_queue = False 67 | task.complete = True 68 | self.removed_task.append(task.task_name) 69 | self.num_removed_task = len(self.removed_task) 70 | 71 | def update_arr_time(self, timestep): 72 | if(self.arriving_time == 0): 73 | self.arriving_time = timestep 74 | for task in self.task_list: 75 | task.arriving_time = timestep 76 | 77 | def add_task(task): 78 | self.task_list.append(task) 79 | self.num_task += 1 80 | 81 | def count_complete_task(self): 82 | num_complete_task = 0 83 | for task in self.task_list: 84 | if(task.complete): 85 | num_complete_task += 1 86 | 87 | return num_complete_task 88 | 89 | def get_num_scheduled_task(self): 90 | num_scheduled_task = 0 91 | for task in self.task_list: 92 | if(task.in_queue or task.complete or task.executing): 93 | num_scheduled_task += 1 94 | return num_scheduled_task 95 | 96 | def detect_overfitting(self): 97 | L = len(self.loss_traj) 98 | local_optimals = self.find_local_optimals() 99 | loss_value_overfitting = False 100 | 101 | if(len(local_optimals) > 1): 102 | first_local_opt = list(local_optimals.values())[0] 103 | last_local_opt = list(local_optimals.values())[-1] 104 | if(last_local_opt > first_local_opt): 105 | loss_value_overfitting = True 106 | 107 | num_loin_epoch = 0 # loss increase epochs 108 | last_loss = self.loss_traj[-1] 109 | for i in range(-2, -L-1, -1): 110 | loss_change = last_loss - self.loss_traj[i] 111 | last_loss = self.loss_traj[i] 112 | if(loss_change > 0): 113 | num_loin_epoch += 1 114 | else: 115 | break 116 | 117 | if(num_loin_epoch > 5 or loss_value_overfitting): 118 | self.loss_phase = Job_Phase.overfitting 119 | self.cutoff_task() 120 | 121 | def find_local_optimals(self): 122 | local_optimals = {} # {(index1:loss1),(index2:loss2),....} 123 | L = len(self.loss_traj) 124 | if(L < 3): 125 | return local_optimals 126 | 127 | for i in range(1,L-1): 128 | last_loss = self.loss_traj[i-1] 129 | cur_loss = self.loss_traj[i] 130 | next_loss = self.loss_traj[i+1] 131 | if(cur_loss < last_loss and cur_loss < next_loss): 132 | local_optimals[i] = cur_loss 133 | 134 | return local_optimals 135 | 136 | def update_priority(self): 137 | #num_complete_task = self.count_complete_task() 138 | num_complete_task = len(self.loss_traj) 139 | ind_last = num_complete_task - 1 140 | if(ind_last <= 0):#none finished task 141 | return 142 | 143 | num_total_task = self.num_task 144 | num_beta = self.num_beta 145 | 146 | num_scheduled_task = self.get_num_scheduled_task() 147 | ini_loss = self.initial_loss 148 | 149 | num_rest_task = num_total_task - num_complete_task 150 | last_loss_reduction = self.loss_traj[-2] - self.loss_traj[-1] 151 | 152 | if(last_loss_reduction < 0 and self.loss_phase == 
Job_Phase.Quick_drop): 153 | self.loss_phase = Job_Phase.fluctuation 154 | self.pivot_index = num_complete_task - 2 155 | self.samples = 1 156 | self.min_loss = self.loss_traj[-2] 157 | self.mloss_ind = num_complete_task - 2 158 | 159 | if(self.loss_phase == Job_Phase.fluctuation): 160 | self.detect_overfitting() 161 | if(self.loss_phase == Job_Phase.overfitting): 162 | return 163 | self.samples = num_complete_task - self.pivot_index - 1 164 | num_fluc_task = num_total_task - self.pivot_index - 1 165 | max_num_samples = int(num_fluc_task * 0.37) 166 | if(self.samples > max_num_samples): 167 | if(self.min_loss > self.loss_traj[-1]): 168 | self.min_loss = self.loss_traj[-1] 169 | self.mloss_ind = num_complete_task - 1 170 | self.cutoff_task() 171 | elif(self.min_loss > self.loss_traj[-1]): 172 | self.min_loss = self.loss_traj[-1] 173 | self.mloss_ind = num_complete_task - 1 174 | 175 | 176 | def display_task(task): 177 | print('task '+task.task_name) 178 | if(task.in_queue): 179 | print('status: in queue') 180 | elif(task.executing): 181 | print('status: executing') 182 | elif(task.complete): 183 | print('status: complete') 184 | else: 185 | print('status: not scheduled') 186 | 187 | 188 | def create_demand(num_rtype,min_rd,max_rd): 189 | demand = [] 190 | for i in range(num_rtype): 191 | demand.append(min_rd + (max_rd - min_rd) * random.random()) 192 | return demand 193 | 194 | 195 | def create_task(task_name, job_name, min_rd, max_rd, deadline, num_rtype,input_size, duration, flag_demand, demands, ps_port, task_index): 196 | demand = [] 197 | if(flag_demand): 198 | demand = create_demand(num_rtype,min_rd,max_rd) 199 | else: 200 | demand = demands 201 | task = Task(task_name, job_name, deadline,input_size, duration, demand, ps_port, task_index) 202 | return task 203 | 204 | def create_job(job_name, num_task, input_size, min_rd, max_rd, deadline, num_rtype, min_task_dur, max_task_dur, ps_node, flag_demand): 205 | tasks = [] 206 | for i in range(num_task): 207 | task_name = str(i) 208 | demands = [] 209 | duration = random.randint(min_task_dur,max_task_dur) 210 | ps_port = ps_node.get_available_port() 211 | task = create_task(task_name, job_name, min_rd, max_rd, deadline, num_rtype, input_size, duration, flag_demand, demands, ps_port, i) 212 | tasks.append(task) 213 | job = Job(job_name,deadline, tasks, input_size, ps_port) 214 | return job 215 | 216 | 217 | def display_job(job): 218 | 219 | print('\n\njob name: '+job.job_name) 220 | print('validation loss:') 221 | if(len(job.loss_traj) == 0): 222 | print('no finished task') 223 | else: 224 | print(job.loss_traj) 225 | if(len(job.removed_task) == 0): 226 | print('no removed task') 227 | else: 228 | print('removed tasks: '+str(job.removed_task)) 229 | for task in job.task_list: 230 | display_task(task) 231 | 232 | import random 233 | def generate_workload(length, min_task, max_task, min_in_size, max_in_size, min_rd, max_rd, min_ddl, max_ddl,min_task_dur,max_task_dur, num_rtype, ps_node): 234 | workload = [] 235 | for i in range(length): 236 | job_name = 'job'+str(i) 237 | num_task = random.randint(min_task,max_task) 238 | input_size = random.randint(min_in_size,max_in_size) 239 | deadline = random.randint(min_ddl, max_ddl) 240 | 241 | job = create_job(job_name,num_task,input_size,min_rd,max_rd, deadline, num_rtype,min_task_dur,max_task_dur, ps_node, flag_demand = True) 242 | workload.append(job) 243 | return workload 244 | 245 | def job_not_complete(job): 246 | for task in job.task_list: 247 | if(not task.complete): 248 | return True 249 | 
return False 250 | 251 | def find_unfinished_jobs(workload, cur_job_num, timestep): 252 | unfinished_job_list = [] 253 | for i in range(cur_job_num): 254 | if(job_not_complete(workload[i])): 255 | unfinished_job_list.append(workload[i]) 256 | elif(workload[i].complete_time < timestep and workload[i].complete_time == 0): 257 | workload[i].complete_time = timestep 258 | return unfinished_job_list 259 | 260 | 261 | def copy_task(task): 262 | cp_task = Task(task.task_name, task.job_name, task.deadline, task.input_size, task.duration, task.demand, task.ps_port) 263 | return cp_task 264 | 265 | def copy_job(job): 266 | cp_tasks = [] 267 | for task in job.task_list: 268 | cp_task = copy_task(task) 269 | cp_tasks.append(cp_task) 270 | cp_job = Job(job.job_name, job.deadline, cp_tasks, job.input_size, job.ps_port) 271 | return cp_job 272 | 273 | def copy_workload(workload): 274 | cp_wl = [] 275 | for job in workload: 276 | cp_job = copy_job(job) 277 | cp_wl.append(cp_job) 278 | return cp_wl 279 | 280 | -------------------------------------------------------------------------------- /Test/cluster.py: -------------------------------------------------------------------------------- 1 | import random 2 | #!/usr/bin/env python 3 | import psutil 4 | from server import * 5 | import threading 6 | import socket 7 | import server 8 | from time import sleep 9 | def get_cpu_memory(): 10 | # gives a single float value 11 | cpu_usage = psutil.cpu_percent() 12 | #print('cpu percentage: '+str(cpu_usage)+'%') 13 | # gives an object with many fields 14 | psutil.virtual_memory() 15 | # you can convert that object to a dictionary 16 | vm_dic = dict(psutil.virtual_memory()._asdict()) 17 | memory_usage = vm_dic['percent'] 18 | #print('memory percentage: '+str(memory_usage)+'%') 19 | 20 | return cpu_usage, memory_usage 21 | 22 | class Cluster: 23 | node_list = [] 24 | topology = {} 25 | completed_task = {} #job_name:[task_name, task_name, ...] 
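# NOTE (clarifying comment added for this listing; not in the original source):
# each Node keeps av_resource as a list with one free-capacity entry per
# resource type (CPU and memory), and every Task carries a demand list with
# the same layout. find_node() further down performs a best-fit placement: it
# keeps only the nodes whose av_resource exceeds the task's demand in every
# dimension and chooses the one with the smallest summed slack, while
# find_minload_node() ignores the per-task demand and picks a node from the
# overall load alone.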
26 | task_distribution = {} #job_name:{node_name: num_task, node_name: num_task, ...} 27 | def __init__(self, num_rack, max_rack_size): 28 | self.num_rack = num_rack 29 | self.max_rack_size = max_rack_size 30 | 31 | for i in range(num_rack): 32 | self.topology[i] = [] 33 | 34 | def process(self, cur_timestep): 35 | for node in self.node_list: 36 | pop_index = [] 37 | if(len(node.queue) != 0): 38 | for i in range(len(node.queue)): 39 | task = node.queue[i] 40 | if(task.job_name not in node.job_list): 41 | node.add_task(task,cur_timestep) 42 | pop_index.append(i) 43 | for index in sorted(pop_index, reverse = True): 44 | node.queue.pop(index) 45 | 46 | def find_master(self): 47 | for node in self.node_list: 48 | if(node.master): 49 | return node 50 | 51 | def add_node(self,node,rack): 52 | self.node_list.append(node) 53 | self.topology[rack].append(node) 54 | 55 | def has_av_res(self): 56 | for node in self.node_list: 57 | num_rtype = len(node.av_resource) 58 | node_available = True 59 | for i in range(num_rtype): 60 | if(node.av_resource[i] > 0.2): 61 | continue 62 | else: 63 | node_available = False 64 | break 65 | if(node_available): 66 | return True 67 | 68 | return False 69 | 70 | ''' 71 | def update_process_rate(self): 72 | new_message_distribution = {} # node_name: num_new_message 73 | for i in range(len(self.node_list)):#initialization 74 | node_name = self.node_list[i].name 75 | new_message_distribution[node_name] = 0 76 | 77 | for job_name in self.task_distribution: 78 | task_map = self.task_distribution[job_name] 79 | num_task = 0 80 | for exe_node_name,num_node_task in task_map.items(): 81 | num_task += num_node_task 82 | for exe_node_name,num_node_task in task_map.items(): 83 | num_other_node_task = num_task - num_node_task 84 | num_new_message = num_node_task * num_other_node_task 85 | new_message_distribution[exe_node_name] += num_new_message 86 | 87 | for node in self.node_list: 88 | node_new_message = new_message_distribution[node.name] 89 | node.process_rate = 1 - 0.005 * node_new_message 90 | if(node.process_rate < 0.2): 91 | node.process_rate = 0.2 92 | node.num_total_message += node_new_message 93 | ''' 94 | 95 | def step(self,time): 96 | #self.complete_task #job_name:[task_name, task_name, ... 
] 97 | 98 | #for job_name, task_list in self.completed_task.items(): 99 | # self.task_distribution[job_name][node.name] -= len(task_list) 100 | 101 | return self.completed_task 102 | 103 | def complete_task(self,task,node): 104 | task_map = self.task_distribution[task.job_name] 105 | task_map[node.name] -= 1 106 | if(task_map[node.name] == 0): 107 | del task_map[node.name] 108 | if(len(task_map) == 0): 109 | del self.task_distribution[task.job_name] 110 | 111 | def find_task_on_node(self, worker_ip): 112 | for node in self.node_list: 113 | if(worker_ip == node.ip): 114 | return node 115 | 116 | def find_node(self, task): 117 | demand = task.demand 118 | min_affinity = len(demand) + 1 119 | selected_node = None 120 | available = True 121 | for node in self.node_list: 122 | cur_affinity = 0 123 | available = True 124 | for i in range(len(demand)): 125 | if(demand[i] < node.av_resource[i]): 126 | cur_affinity += node.av_resource[i] - demand[i] 127 | else: 128 | available = False 129 | if(available and cur_affinity < min_affinity): 130 | min_affinity = cur_affinity 131 | selected_node = node 132 | 133 | return selected_node 134 | 135 | def find_minload_node(self): 136 | num_rtype = len(self.node_list[0].av_resource) 137 | minload = 0 138 | first_node = self.node_list[0] 139 | selected_node = first_node 140 | for i in range(num_rtype): 141 | minload += 1 - first_node.av_resource[i] 142 | 143 | for node in self.node_list: 144 | cur_load = 0 145 | for i in range(num_rtype): 146 | cur_load += node.av_resource[i] 147 | if(cur_load < minload): 148 | minload = cur_load 149 | selected_node = node 150 | return selected_node 151 | 152 | 153 | class Node: 154 | 155 | def __init__(self, name, num_rtype, ip): 156 | self.localIP = socket.gethostbyname(socket.gethostname()) 157 | self.ip = ip 158 | self.master = (self.localIP == self.ip) 159 | self.job_list = [] 160 | self.queue = [] 161 | self.workload = None 162 | self.cluster = None 163 | self.scheduler = None 164 | 165 | 166 | self.name = name 167 | self.server_port = 9999 168 | self.port_availability = {} 169 | for port in range(2222,8888+1): 170 | self.port_availability[port] = True 171 | 172 | self.av_resource = [] 173 | for i in range(num_rtype): 174 | self.av_resource.append(100) 175 | 176 | self.num_total_message = 0 177 | self.num_exe_task = 0 178 | self.num_total_task = 0 179 | self.task_list = [] 180 | if(not self.master): 181 | start_server_cmd = '/home/ubuntu/TF-scheduler/remote_start_slave '+ip 182 | thread = threading.Thread(target = execute_command, args = (start_server_cmd,), name = 'Slave-Thread'+self.ip) 183 | thread.start() 184 | 185 | def start_master(self, workload, cluster, scheduler): 186 | 187 | localIP = socket.gethostbyname(socket.gethostname()) 188 | Master(localIP, 9999, workload, cluster, scheduler).start() 189 | 190 | 191 | 192 | ''' 193 | def process(self): 194 | completed_task = {} #job_name:[task_name, task_name, ... 
] 195 | len_task_list = len(self.task_list) 196 | pop_index = [] 197 | max_resource = 0 198 | num_rtype = len(self.av_resource) 199 | for i in range(num_rtype): 200 | if(max_resource < 1 - self.av_resource[i]): 201 | max_resource = 1 - self.av_resource[i] 202 | 203 | true_process_rate = self.process_rate 204 | 205 | if(max_resource > 1): 206 | true_process_rate *= 1 / max_resource 207 | 208 | for i in range(len_task_list): 209 | task = self.task_list[i] 210 | if(task.duration - true_process_rate <= 0): 211 | if(task.job_name not in completed_task): 212 | completed_task[task.job_name] = [] 213 | self.num_exe_task -= 1 214 | task.duration = 0 215 | task.executing = False 216 | task.complete = True 217 | 218 | completed_task[task.job_name].append(task.task_name) 219 | 220 | num_rtype = len(self.av_resource) 221 | for j in range(num_rtype): 222 | self.av_resource[j] += task.demand[j] 223 | if(self.av_resource[j] > 1): 224 | self.av_resource[j] = 1 225 | pop_index.append(i) 226 | 227 | 228 | else: 229 | task.duration -= true_process_rate 230 | 231 | for ind in sorted(pop_index, reverse = True): 232 | self.task_list.pop(ind) 233 | 234 | return completed_task 235 | ''' 236 | 237 | def get_available_port(self): 238 | for port in range(2222,8888+1): 239 | if(self.port_availability[port]): 240 | self.port_availability[port] = False 241 | return str(port) 242 | return -1 243 | 244 | def execute_task(self, task): 245 | 246 | worker_ip = task.worker_ip 247 | ps_port = task.ps_port 248 | ps_ip = task.ps_ip 249 | 250 | request = 'execute,'+ps_ip+':'+str(ps_port)+','+worker_ip+':'+str(task.occupied_worker_port)+','+task.job_name+','+str(task.index) 251 | 252 | response = TCP_request(ps_ip, 9999, request) 253 | print('execute task of '+task.job_name+' on worker: '+worker_ip+':'+str(task.occupied_worker_port)) 254 | 255 | def add_task(self, task, cur_timestep): 256 | task.in_queue = False 257 | task.executing = True 258 | self.job_list.append(task.job_name) 259 | task.waiting_time = cur_timestep - task.arriving_time 260 | self.num_exe_task += 1 261 | self.num_total_task += 1 262 | #num_rtype = len(self.av_resource) 263 | #for i in range(num_rtype): 264 | # self.av_resource[i] -= task.demand[i] 265 | self.task_list.append(task) 266 | self.execute_task(task) 267 | 268 | def is_started(self): 269 | request = 'is_started' 270 | response = 'not start' 271 | try: 272 | response = TCP_request(self.ip, 9999, request) 273 | finally: 274 | return (response == 'start') 275 | def clear_removed_task(self): 276 | if(len(self.queue) == 0): 277 | return 278 | pop_index = [] 279 | for i in range(len(self.queue)): 280 | task = self.queue[i] 281 | if(task.complete): 282 | pop_index.append(i) 283 | 284 | for index in sorted(pop_index, reverse = True): 285 | self.queue.pop(index) 286 | 287 | 288 | def update_av_resource(self,cur_timestep): 289 | 290 | while(not self.is_started()): 291 | print(self.ip+' does not start, waiting...') 292 | sleep(5) 293 | 294 | request = 'cpu' 295 | response = TCP_request(self.ip, 9999, request) 296 | CPU = float(response) 297 | request = 'memory' 298 | response = TCP_request(self.ip, 9999, request) 299 | Memory = float(response) 300 | self.av_resource[0] = 100 - CPU 301 | self.av_resource[1] = 100 - Memory 302 | 303 | for task in self.queue: 304 | if(task.job_name not in self.job_list): 305 | self.add_task(task,cur_timestep) 306 | 307 | 308 | 309 | def create_node(node_name, num_rtype, ip): 310 | node = Node(node_name, num_rtype, ip) 311 | return node 312 | 313 | def create_cluster(max_rack, 
max_rack_size, max_num_nodes, num_rtype, ip_list): 314 | cur_num_node = 0 315 | cluster = Cluster(max_rack, max_rack_size) 316 | 317 | for i in range(max_num_nodes): 318 | cur_num_node += 1 319 | node_name = 'node'+str(cur_num_node) 320 | node = create_node(node_name,num_rtype, ip_list[i]) 321 | rack = random.randint(0,max_rack-1) 322 | cluster.add_node(node,rack) 323 | 324 | return cluster 325 | 326 | def display_available_port(port_availability): 327 | for port in range(2222,8888+1): 328 | if(not port_availability[port]): 329 | continue 330 | else: 331 | print('available ports: '+str(port)+'-8888') 332 | break 333 | 334 | def display_node(node): 335 | print('node name: '+node.name) 336 | print('node ip: '+node.ip) 337 | display_available_port(node.port_availability) 338 | print('available resource percent: ') 339 | num_rtype = len(node.av_resource) 340 | for i in range(num_rtype): 341 | print(str(node.av_resource[i])) 342 | if(len(node.task_list) != 0): 343 | print('task list') 344 | for task in node.task_list: 345 | print('task '+str(task.task_name)+'\tjob '+task.job_name) 346 | else: 347 | print('task list empty') 348 | 349 | if(len(node.queue) != 0): 350 | print('node queue') 351 | for task in node.queue: 352 | print('task '+str(task.task_name)+'\tjob '+task.job_name) 353 | else: 354 | print('node queue empty') 355 | 356 | if(len(node.job_list) != 0): 357 | print('node job list') 358 | for job_name in node.job_list: 359 | print(job_name) 360 | else: 361 | print('node job list empty') 362 | print('\n') 363 | 364 | 365 | def display_cluster(cluster, num_rtype): 366 | print('\nnumber of nodes: '+str(len(cluster.node_list))) 367 | for node in cluster.node_list: 368 | display_node(node) 369 | 370 | def find_node(cluster, target_ip): 371 | for node in cluster.node_list: 372 | if(node.ip == target_ip): 373 | return node 374 | 375 | if __name__ == "__main__": 376 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 377 | 378 | ip_list = [] 379 | for ip in servers.values(): 380 | ip_list.append(ip) 381 | 382 | max_num_rack = 3 383 | max_rack_size = 5 384 | max_num_nodes = len(servers) 385 | num_rtype = 2 386 | 387 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 388 | display_cluster(cluster,2) -------------------------------------------------------------------------------- /Test/real-test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | import time 5 | 6 | from statistics import * 7 | from workload import * 8 | from cluster import * 9 | from scheduler import * 10 | #from task import * 11 | from RLmodel import * 12 | from computation import * 13 | import threading 14 | import completion_task 15 | 16 | 17 | max_res_demand = 0.3 18 | min_res_demand = 0.1 19 | min_in_size = 100 20 | max_in_size = 1000 21 | min_task_dur = 10 22 | max_task_dur = 50 23 | max_length_q = 100 24 | max_num_nodes = 3 25 | min_task = 10 26 | max_task = 10 27 | 28 | max_rack_size = 5 29 | max_num_rack = 3 30 | num_rtype = 2 31 | 32 | max_q_length = 50 33 | max_num_task = 200 34 | min_num_task = 20 35 | min_ddl = max_task_dur * min_task 36 | max_ddl = max_task_dur * max_task 37 | add_task_perc = 0.1 38 | beta = 10 #percentage of tasks one time 39 | 40 | W_q_d = 0.5 41 | W_q_s = 0.5 42 | W_q_r = 0.5 43 | 44 | W_c_m = 0.5 45 | W_c_t = 0.5 46 | 47 | gama_q = 0.9 48 | gama_n = 0.9 49 | learning_rate = 0.1 50 | 51 | remove_threshold = 0.1 52 | 53 | num_coming_job = 5 54 
| 55 | remove_option = True 56 | order_net_option = True 57 | node_net_option = True 58 | 59 | node_net_in_dim = 2 * max_num_nodes 60 | node_net_out_dim = max_num_nodes 61 | 62 | order_net_in_dim = 3 * max_q_length 63 | order_net_out_dim = max_q_length 64 | 65 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 66 | 67 | ip_list = [] 68 | for ip in servers.values(): 69 | ip_list.append(ip) 70 | 71 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 72 | 73 | new_job = True 74 | 75 | scheduler = Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 76 | 77 | num_episodes = 1 78 | num_iteration = 1 79 | 80 | wl_len = 10 81 | 82 | Gandiva = False 83 | 84 | ps_ip = socket.gethostbyname(socket.gethostname()) 85 | ps_node = find_node(cluster, ps_ip) 86 | master = None 87 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 88 | for node in cluster.node_list: 89 | if(node.master): 90 | master = node 91 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 92 | thread.start() 93 | break 94 | 95 | timer = [] 96 | #workloads = [] 97 | #for i in range(num_episodes): 98 | # random.shuffle(workload) 99 | # workload_copy = copy_workload(workload) 100 | # workloads.append(workload_copy) 101 | 102 | cluster.node_list[0].update_av_resource() 103 | 104 | task = workload[0].task_list[0] 105 | task.ps_port = master.get_available_port() 106 | task.occupied_worker_port = 2222 107 | 108 | for node in cluster.node_list: 109 | if(node.ip == '172.31.4.135'): 110 | task.worker_ip = node.ip 111 | node.add_task(task,0) 112 | -------------------------------------------------------------------------------- /Test/server.py: -------------------------------------------------------------------------------- 1 | 2 | import socket 3 | import threading 4 | import sys 5 | import random 6 | #!/usr/bin/env python 7 | import psutil 8 | import os 9 | import completion_task 10 | from workload import * 11 | from cluster import * 12 | 13 | def get_files(job_dir, file_keyword): 14 | files = [] 15 | for file in os.listdir(job_dir): 16 | if(os.path.isfile(os.path.join(job_dir,file)) and file.startswith(file_keyword)): 17 | files.append(file) 18 | return files 19 | 20 | 21 | def del_old_model(job_dir, cur_serial, task_index): 22 | files1 = get_files(job_dir, "model.ckpt") 23 | files2 = get_files(job_dir, "latest_model") 24 | for file in files1: 25 | partition = file.split('.') 26 | part2 = partition[1] 27 | serial_number = int(part2[5:]) 28 | if(cur_serial - serial_number >=50): 29 | os.remove(job_dir+"/"+file) 30 | for file in files2: 31 | partition = file.split('.') 32 | part1 = partition[0] 33 | epoch = int(part1.split('_')[3][5:]) 34 | if(epoch != task_index): 35 | os.remove(job_dir+"/"+file) 36 | 37 | def getbytes(string): 38 | return string.encode() 39 | 40 | def getstring(byte): 41 | return byte.decode() 42 | 43 | def TCP_request(ip,port,request): 44 | c_s = socket.socket() 45 | response = 'request failed' 46 | try: 47 | c_s.connect((ip,int(port))) 48 | byte_request = getbytes(request) 49 | c_s.send(byte_request) 50 | byte_response = c_s.recv(1024) 51 | response = getstring(byte_response) 52 | 53 | except Exception, e: 54 | print('ip: '+ip+"\tport: "+str(port)+'\t request: '+request) 55 | print(e) 56 | 57 | 
finally: 58 | return response 59 | 60 | def execute_command(cmd): 61 | os.system(cmd) 62 | 63 | 64 | class Master: 65 | def __init__(self, ip, port, workload, cluster, scheduler): 66 | self.workload = workload 67 | self.sock = socket.socket() 68 | self.ip = ip 69 | self.port = port 70 | self.sock.bind((ip, int(port))) 71 | self.sock.listen(5) 72 | self.cluster = cluster 73 | self.scheduler = scheduler 74 | self.should_stop = False 75 | 76 | def TCP_reply(self, sock, addr): 77 | 78 | #print('new request from ' + str(addr) + ' is accepted') 79 | while (not self.should_stop): 80 | response = '' 81 | b_data=sock.recv(1024) 82 | data = getstring(b_data) 83 | if (data =='exit' or not data): 84 | break; 85 | 86 | elif(data =='ip'): 87 | response = str(self.ip) 88 | 89 | elif(data =='port'): 90 | response = str(self.port) 91 | 92 | elif(data == 'is_started'): 93 | response = 'start' 94 | 95 | elif(data == 'cpu'): 96 | cpu_usage = psutil.cpu_percent() 97 | response = str(cpu_usage) 98 | 99 | elif(data == 'memory'): 100 | vm_dic = dict(psutil.virtual_memory()._asdict()) 101 | memory_usage = vm_dic['percent'] 102 | response = str(memory_usage) 103 | 104 | elif(data.startswith('loss')): 105 | with open('/home/ubuntu/TF-scheduler/loss_folder/'+data,'r') as f:#data looks like: loss_jobname 106 | response = f.readline() 107 | 108 | elif(data.startswith('accuracy')): 109 | with open('/home/ubuntu/TF-scheduler/accuracy_folder/'+data,'r') as f:#data looks like: accuracy_jobname 110 | response = f.readline() 111 | elif(data.startswith('cluster')): 112 | response = 'nodes infomation' 113 | for node in self.cluster.node_list: 114 | response += 'node name: '+node.name+'\n' 115 | response += 'node ip: '+node.ip+'\n' 116 | 117 | if(len(node.task_list) != 0): 118 | response += 'task list\n' 119 | for task in node.task_list: 120 | response += 'task '+str(task.task_name)+'\tjob '+task.job_name+'\n' 121 | else: 122 | response += 'task list empty' 123 | 124 | if(len(node.queue) != 0): 125 | response += 'node queue\n' 126 | for task in node.queue: 127 | response += 'task '+str(task.task_name)+'\tjob '+task.job_name+'\n' 128 | else: 129 | response += 'node queue empty\n' 130 | 131 | 132 | elif(data.startswith('execute')): 133 | info = data.split(',') 134 | ps_addr = info[1] 135 | ps_ip = ps_addr.split(':')[0] 136 | ps_port = ps_addr.split(':')[1] 137 | worker_addr = info[2] 138 | worker_ip = worker_addr.split(':')[0] 139 | worker_port = worker_addr.split(':')[1] 140 | job_name = info[3] 141 | task_index = info[4] 142 | cmd = '/home/ubuntu/TF-scheduler/remote_task_executor '+ps_ip+' '+ps_port+' '+worker_ip+' '+worker_port+' '+job_name+' '+task_index 143 | 144 | thread = threading.Thread(target=execute_command, args=(cmd,), name = 'Task-Execution-Thread-'+worker_ip+'-'+job_name+'-'+task_index) 145 | thread.start() 146 | response = 'executing the task of '+job_name 147 | 148 | elif(data.startswith('checkpoint')): 149 | job_dir = data.split(',')[1] 150 | if(not os.path.exists(job_dir+'/checkpoint')): 151 | response = 'none' 152 | else: 153 | with open(job_dir+'/checkpoint', 'r') as f: 154 | response = f.read() 155 | 156 | elif(data.startswith('update_checkpoint')): 157 | partition = data.split(',') 158 | checkpoint_info = partition[1] 159 | job_dir = partition[2] 160 | with open(job_dir+'/checkpoint', 'w') as f: 161 | f.write(checkpoint_info) 162 | if(os.path.exists(job_dir+'/checkpoint')): 163 | response = 'update checkpoint file success!' 
164 | else: 165 | response = 'update failed' 166 | 167 | elif(data.startswith('task_completion')): 168 | info = data.split(',') #task_completion job_name task_index worker_ip worker_port 169 | job_name = info[1] 170 | job_index = int(job_name[3]) 171 | task_index = int(info[2]) 172 | worker_ip = info[3] 173 | worker_port = info[4] 174 | update_loss = info[5] 175 | job_dir = info[6] 176 | cur_serial = int(info[7]) 177 | 178 | job = self.workload[job_index] 179 | task = job.task_list[task_index] 180 | 181 | if(update_loss == 'YES'): 182 | request = 'loss_'+job_name 183 | response = TCP_request(worker_ip, 9999, request) 184 | cur_loss = float(response) 185 | job.loss_traj.append(cur_loss) 186 | 187 | del_old_model(job_dir, cur_serial, task_index) 188 | 189 | task.executing = False 190 | task.complete = True 191 | 192 | completion_task.shared_resource_lock.acquire() 193 | if(not self.cluster.completed_task.has_key(job_name)): 194 | self.cluster.completed_task[job_name] = [] 195 | 196 | self.cluster.completed_task[job_name].append(task.task_name) 197 | 198 | node = self.cluster.find_task_on_node(worker_ip) 199 | for i in range(len(node.task_list)): 200 | cur_task = node.task_list[i] 201 | if(cur_task.job_name == task.job_name and 202 | cur_task.task_name == task.task_name): 203 | node.task_list.pop(i) 204 | break 205 | for i in range(len(node.job_list)): 206 | cur_job_name = node.job_list[i] 207 | if(cur_job_name == task.job_name): 208 | node.job_list.pop(i) 209 | break 210 | #display_cluster(self.cluster, 2) 211 | completion_task.shared_resource_lock.release() 212 | print('port number is '+str(task.ps_port)) 213 | response = 'processed task completion!' 214 | completion_task.stop_ps(task.ps_port) 215 | 216 | b_response = getbytes(response) 217 | sock.send(b_response) 218 | sock.close() 219 | #print('Connection from %s:%s closed' % addr) 220 | 221 | def start(self): 222 | while (True): 223 | c_sock, addr = self.sock.accept() 224 | thread = threading.Thread(target=self.TCP_reply, args=(c_sock,addr), name = 'Reuqest-Listening-Thread-'+str(addr)) 225 | thread.start() 226 | 227 | class Slave: 228 | def __init__(self, ip, port): 229 | self.sock = socket.socket() 230 | self.ip = ip 231 | self.port = str(port) 232 | self.sock.bind((ip, int(port))) 233 | self.sock.listen(5) 234 | self.should_stop = False 235 | 236 | 237 | def TCP_reply(self, sock, addr): 238 | 239 | #print('new request from ' + str(addr) + ' is accepted') 240 | while (not self.should_stop): 241 | response = '' 242 | b_data=sock.recv(1024) 243 | data = getstring(b_data) 244 | if (data =='exit' or not data): 245 | break; 246 | 247 | elif(data =='ip'): 248 | response = str(self.ip) 249 | 250 | elif(data =='port'): 251 | response = str(self.port) 252 | 253 | elif(data == 'is_started'): 254 | response = 'start' 255 | 256 | elif(data == 'cpu'): 257 | cpu_usage = psutil.cpu_percent() 258 | response = str(cpu_usage) 259 | 260 | elif(data == 'memory'): 261 | vm_dic = dict(psutil.virtual_memory()._asdict()) 262 | memory_usage = vm_dic['percent'] 263 | response = str(memory_usage) 264 | 265 | elif(data.startswith('loss')): 266 | with open('/home/ubuntu/TF-scheduler/loss_folder/'+data,'r') as f:#data looks like: loss_jobname 267 | response = f.readline() 268 | 269 | elif(data.startswith('accuracy')): 270 | with open('/home/ubuntu/TF-scheduler/accuracy_folder/'+data,'r') as f:#data looks like: accuracy_jobname 271 | response = f.readline() 272 | 273 | elif(data.startswith('execute')): 274 | info = data.split(',') 275 | ps_addr = info[1] 276 | ps_ip 
= ps_addr.split(':')[0] 277 | ps_port = ps_addr.split(':')[1] 278 | worker_addr = info[2] 279 | worker_ip = worker_addr.split(':')[0] 280 | worker_port = worker_addr.split(':')[1] 281 | job_name = info[3] 282 | task_index = info[4] 283 | cmd = '/home/ubuntu/TF-scheduler/remote_task_executor '+ps_ip+' '+ps_port+' '+worker_ip+' '+worker_port+' '+job_name+' '+task_index 284 | 285 | thread = threading.Thread(target=execute_command, args=(cmd,), name = 'Task-Execution-Thread-'+worker_ip+'-'+job_name+'-'+task_index) 286 | thread.start() 287 | response = 'executing the task of '+job_name 288 | 289 | b_response = getbytes(response) 290 | sock.send(b_response) 291 | sock.close() 292 | #print('Connection from %s:%s closed' % addr) 293 | 294 | def start(self): 295 | while (True): 296 | c_sock, addr = self.sock.accept() 297 | thread = threading.Thread(target=self.TCP_reply, args=(c_sock,addr), name = 'Reuqest-Listening-Thread-'+str(addr)) 298 | thread.start() 299 | 300 | 301 | if (__name__ == '__main__'): 302 | #print(len(sys.argv)) 303 | if(len(sys.argv) != 2): 304 | print('usage: python server master/slave') 305 | print('exit') 306 | sys.exit(0) 307 | role = sys.argv[1] 308 | localIP = socket.gethostbyname(socket.gethostname()) 309 | node = None 310 | 311 | if(role == 'master'): 312 | print('master') 313 | node = Master(localIP,9999) 314 | print('register') 315 | node.start() 316 | print('listening') 317 | 318 | elif(role == 'slave'): 319 | print('slave') 320 | node = Slave(localIP,9999) 321 | print('register') 322 | node.start() 323 | print('listening') 324 | 325 | else: 326 | print('request') 327 | response = TCP_request(localIP,9999,sys.argv[1]) 328 | print("response is: "+response) 329 | 330 | 331 | 332 | 333 | -------------------------------------------------------------------------------- /Test/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | import time 5 | 6 | from statistics import * 7 | from workload import * 8 | from cluster import * 9 | from scheduler import * 10 | #from task import * 11 | from RLmodel import * 12 | from computation import * 13 | import threading 14 | import completion_task 15 | 16 | 17 | max_res_demand = 0.3 18 | min_res_demand = 0.1 19 | min_in_size = 100 20 | max_in_size = 1000 21 | min_task_dur = 10 22 | max_task_dur = 50 23 | max_length_q = 100 24 | max_num_nodes = 3 25 | min_task = 10 26 | max_task = 10 27 | 28 | max_rack_size = 5 29 | max_num_rack = 3 30 | num_rtype = 2 31 | 32 | max_q_length = 50 33 | max_num_task = 200 34 | min_num_task = 20 35 | min_ddl = max_task_dur * min_task 36 | max_ddl = max_task_dur * max_task 37 | add_task_perc = 0.1 38 | beta = 10 #percentage of tasks one time 39 | 40 | W_q_d = 0.5 41 | W_q_s = 0.5 42 | W_q_r = 0.5 43 | 44 | W_c_m = 0.5 45 | W_c_t = 0.5 46 | 47 | gama_q = 0.9 48 | gama_n = 0.9 49 | learning_rate = 0.1 50 | 51 | remove_threshold = 0.1 52 | 53 | num_coming_job = 5 54 | 55 | remove_option = True 56 | order_net_option = True 57 | node_net_option = True 58 | 59 | node_net_in_dim = 2 * max_num_nodes 60 | node_net_out_dim = max_num_nodes 61 | 62 | order_net_in_dim = 3 * max_q_length 63 | order_net_out_dim = max_q_length 64 | 65 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 66 | 67 | ip_list = [] 68 | for ip in servers.values(): 69 | ip_list.append(ip) 70 | 71 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 72 | 73 | new_job = True 74 | 75 | scheduler = 
Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 76 | 77 | num_episodes = 1 78 | num_iteration = 1 79 | 80 | wl_len = 10 81 | 82 | Gandiva = False 83 | 84 | ps_ip = socket.gethostbyname(socket.gethostname()) 85 | ps_node = find_node(cluster, ps_ip) 86 | master = None 87 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 88 | for node in cluster.node_list: 89 | if(node.master): 90 | master = node 91 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 92 | thread.start() 93 | break 94 | 95 | timer = [] 96 | #workloads = [] 97 | #for i in range(num_episodes): 98 | # random.shuffle(workload) 99 | # workload_copy = copy_workload(workload) 100 | # workloads.append(workload_copy) 101 | 102 | cluster.node_list[0].update_av_resource() 103 | 104 | task = workload[0].task_list[0] 105 | task.ps_port = master.get_available_port() 106 | task.occupied_worker_port = 2222 107 | 108 | for node in cluster.node_list: 109 | if(node.ip == '172.31.4.135'): 110 | task.worker_ip = node.ip 111 | node.add_task(task,0) 112 | -------------------------------------------------------------------------------- /Workload/workload.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import socket 4 | from enum import Enum 5 | 6 | class Job_Phase(Enum): 7 | Quick_drop = 0 8 | fluctuation = 1 9 | overfitting = 2 10 | 11 | class Task: 12 | def __init__(self, task_name, job_name, deadline, input_size, duration, demand, ps_port, task_index): 13 | self.task_name = task_name 14 | self.job_name = job_name 15 | self.demand = demand 16 | self.deadline = deadline 17 | self.arriving_time = 0 18 | self.input_size = input_size 19 | self.priority = 1 20 | self.ps_ip = socket.gethostbyname(socket.gethostname()) 21 | self.ps_port = ps_port 22 | self.index = task_index 23 | #self.duration = duration 24 | #self.pre_dur = duration 25 | self.worker_ip = None 26 | self.occupied_worker_port = None 27 | self.waiting_time = 0 28 | self.in_queue = False 29 | self.executing = False 30 | self.complete = False 31 | 32 | class Job: 33 | 34 | def __init__(self, job_name, deadline, tasks, input_size, ps_port): 35 | self.mloss_ind = -1 36 | self.removed_task = [] 37 | self.loss_phase = Job_Phase.Quick_drop 38 | self.pivot_index = 0 39 | self.samples = 0 40 | self.job_name = job_name 41 | self.task_list = tasks 42 | self.ps_ip = socket.gethostbyname(socket.gethostname()) 43 | self.ps_port = ps_port 44 | self.input_size = input_size 45 | self.num_task = len(tasks) 46 | self.deadline = deadline 47 | self.complete_time = 0 48 | self.initial_loss = 1.41 # loss of model without training 49 | self.min_loss = self.initial_loss 50 | self.loss_traj = [] 51 | self.num_beta = 0 52 | self.add_task_perc = 0.1 53 | self.num_removed_task = 0 54 | self.initialized = False 55 | #simulation loss trajectory 56 | #for i in range(self.num_task): 57 | # cur_loss = self.initial_loss * math.pow(0.85,i) 58 | # self.loss_traj.append(cur_loss) 59 | 60 | self.arriving_time = 0 61 | self.complete_time = 0 62 | 63 | def cutoff_task(self): 64 | for task in self.task_list: 65 | if(task.in_queue): 66 | task.in_queue = False 67 | task.complete = True 68 | self.removed_task.append(task.task_name) 69 | self.num_removed_task = len(self.removed_task) 70 | 71 | def 
update_arr_time(self, timestep): 72 | if(self.arriving_time == 0): 73 | self.arriving_time = timestep 74 | for task in self.task_list: 75 | task.arriving_time = timestep 76 | 77 | def add_task(task): 78 | self.task_list.append(task) 79 | self.num_task += 1 80 | 81 | def count_complete_task(self): 82 | num_complete_task = 0 83 | for task in self.task_list: 84 | if(task.complete): 85 | num_complete_task += 1 86 | 87 | return num_complete_task 88 | 89 | def get_num_scheduled_task(self): 90 | num_scheduled_task = 0 91 | for task in self.task_list: 92 | if(task.in_queue or task.complete or task.executing): 93 | num_scheduled_task += 1 94 | return num_scheduled_task 95 | 96 | def detect_overfitting(self): 97 | L = len(self.loss_traj) 98 | local_optimals = self.find_local_optimals() 99 | loss_value_overfitting = False 100 | 101 | if(len(local_optimals) > 1): 102 | first_local_opt = list(local_optimals.values())[0] 103 | last_local_opt = list(local_optimals.values())[-1] 104 | if(last_local_opt > first_local_opt): 105 | loss_value_overfitting = True 106 | 107 | num_loin_epoch = 0 # loss increase epochs 108 | last_loss = self.loss_traj[-1] 109 | for i in range(-2, -L-1, -1): 110 | loss_change = last_loss - self.loss_traj[i] 111 | last_loss = self.loss_traj[i] 112 | if(loss_change > 0): 113 | num_loin_epoch += 1 114 | else: 115 | break 116 | 117 | if(num_loin_epoch > 5 or loss_value_overfitting): 118 | self.loss_phase = Job_Phase.overfitting 119 | self.cutoff_task() 120 | 121 | def find_local_optimals(self): 122 | local_optimals = {} # {(index1:loss1),(index2:loss2),....} 123 | L = len(self.loss_traj) 124 | if(L < 3): 125 | return local_optimals 126 | 127 | for i in range(1,L-1): 128 | last_loss = self.loss_traj[i-1] 129 | cur_loss = self.loss_traj[i] 130 | next_loss = self.loss_traj[i+1] 131 | if(cur_loss < last_loss and cur_loss < next_loss): 132 | local_optimals[i] = cur_loss 133 | 134 | return local_optimals 135 | 136 | def update_priority(self): 137 | #num_complete_task = self.count_complete_task() 138 | num_complete_task = len(self.loss_traj) 139 | ind_last = num_complete_task - 1 140 | if(ind_last <= 0):#none finished task 141 | return 142 | 143 | num_total_task = self.num_task 144 | num_beta = self.num_beta 145 | 146 | num_scheduled_task = self.get_num_scheduled_task() 147 | ini_loss = self.initial_loss 148 | 149 | num_rest_task = num_total_task - num_complete_task 150 | last_loss_reduction = self.loss_traj[-2] - self.loss_traj[-1] 151 | 152 | if(last_loss_reduction < 0 and self.loss_phase == Job_Phase.Quick_drop): 153 | self.loss_phase = Job_Phase.fluctuation 154 | self.pivot_index = num_complete_task - 2 155 | self.samples = 1 156 | self.min_loss = self.loss_traj[-2] 157 | self.mloss_ind = num_complete_task - 2 158 | 159 | if(self.loss_phase == Job_Phase.fluctuation): 160 | self.detect_overfitting() 161 | if(self.loss_phase == Job_Phase.overfitting): 162 | return 163 | self.samples = num_complete_task - self.pivot_index - 1 164 | num_fluc_task = num_total_task - self.pivot_index - 1 165 | max_num_samples = int(num_fluc_task * 0.37) 166 | if(self.samples > max_num_samples): 167 | if(self.min_loss > self.loss_traj[-1]): 168 | self.min_loss = self.loss_traj[-1] 169 | self.mloss_ind = num_complete_task - 1 170 | self.cutoff_task() 171 | elif(self.min_loss > self.loss_traj[-1]): 172 | self.min_loss = self.loss_traj[-1] 173 | self.mloss_ind = num_complete_task - 1 174 | 175 | 176 | def display_task(task): 177 | print('task '+task.task_name) 178 | if(task.in_queue): 179 | print('status: in 
queue') 180 | elif(task.executing): 181 | print('status: executing') 182 | elif(task.complete): 183 | print('status: complete') 184 | else: 185 | print('status: not scheduled') 186 | 187 | 188 | def create_demand(num_rtype,min_rd,max_rd): 189 | demand = [] 190 | for i in range(num_rtype): 191 | demand.append(min_rd + (max_rd - min_rd) * random.random()) 192 | return demand 193 | 194 | 195 | def create_task(task_name, job_name, min_rd, max_rd, deadline, num_rtype,input_size, duration, flag_demand, demands, ps_port, task_index): 196 | demand = [] 197 | if(flag_demand): 198 | demand = create_demand(num_rtype,min_rd,max_rd) 199 | else: 200 | demand = demands 201 | task = Task(task_name, job_name, deadline,input_size, duration, demand, ps_port, task_index) 202 | return task 203 | 204 | def create_job(job_name, num_task, input_size, min_rd, max_rd, deadline, num_rtype, min_task_dur, max_task_dur, ps_node, flag_demand): 205 | tasks = [] 206 | for i in range(num_task): 207 | task_name = str(i) 208 | demands = [] 209 | duration = random.randint(min_task_dur,max_task_dur) 210 | ps_port = ps_node.get_available_port() 211 | task = create_task(task_name, job_name, min_rd, max_rd, deadline, num_rtype, input_size, duration, flag_demand, demands, ps_port, i) 212 | tasks.append(task) 213 | job = Job(job_name,deadline, tasks, input_size, ps_port) 214 | return job 215 | 216 | 217 | def display_job(job): 218 | 219 | print('\n\njob name: '+job.job_name) 220 | print('validation loss:') 221 | if(len(job.loss_traj) == 0): 222 | print('no finished task') 223 | else: 224 | print(job.loss_traj) 225 | if(len(job.removed_task) == 0): 226 | print('no removed task') 227 | else: 228 | print('removed tasks: '+str(job.removed_task)) 229 | for task in job.task_list: 230 | display_task(task) 231 | 232 | import random 233 | def generate_workload(length, min_task, max_task, min_in_size, max_in_size, min_rd, max_rd, min_ddl, max_ddl,min_task_dur,max_task_dur, num_rtype, ps_node): 234 | workload = [] 235 | for i in range(length): 236 | job_name = 'job'+str(i) 237 | num_task = random.randint(min_task,max_task) 238 | input_size = random.randint(min_in_size,max_in_size) 239 | deadline = random.randint(min_ddl, max_ddl) 240 | 241 | job = create_job(job_name,num_task,input_size,min_rd,max_rd, deadline, num_rtype,min_task_dur,max_task_dur, ps_node, flag_demand = True) 242 | workload.append(job) 243 | return workload 244 | 245 | def job_not_complete(job): 246 | for task in job.task_list: 247 | if(not task.complete): 248 | return True 249 | return False 250 | 251 | def find_unfinished_jobs(workload, cur_job_num, timestep): 252 | unfinished_job_list = [] 253 | for i in range(cur_job_num): 254 | if(job_not_complete(workload[i])): 255 | unfinished_job_list.append(workload[i]) 256 | elif(workload[i].complete_time < timestep and workload[i].complete_time == 0): 257 | workload[i].complete_time = timestep 258 | return unfinished_job_list 259 | 260 | 261 | def copy_task(task): 262 | cp_task = Task(task.task_name, task.job_name, task.deadline, task.input_size, task.duration, task.demand, task.ps_port) 263 | return cp_task 264 | 265 | def copy_job(job): 266 | cp_tasks = [] 267 | for task in job.task_list: 268 | cp_task = copy_task(task) 269 | cp_tasks.append(cp_task) 270 | cp_job = Job(job.job_name, job.deadline, cp_tasks, job.input_size, job.ps_port) 271 | return cp_job 272 | 273 | def copy_workload(workload): 274 | cp_wl = [] 275 | for job in workload: 276 | cp_job = copy_job(job) 277 | cp_wl.append(cp_job) 278 | return cp_wl 279 | 280 | 
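The Job class above implements the epoch-level cutoff used throughout this repository: a job stays in Job_Phase.Quick_drop while the validation loss keeps falling, switches to Job_Phase.fluctuation at the first loss increase (recording the pivot index), and drops its remaining queued tasks once the number of post-pivot samples exceeds roughly 37% of the tasks left after the pivot, or earlier if detect_overfitting sees a sustained loss rise. The standalone sketch below replays only this phase logic on a synthetic loss trajectory; the names QUICK_DROP, FLUCTUATION and update_phase, the loss values, and the 10-task job are illustrative additions for this document (they are not part of the repository), and detect_overfitting() plus the min-loss bookkeeping are intentionally omitted.

# Standalone sketch (not repository code): mirrors the phase transitions of
# Job.update_priority on a plain Python list of synthetic loss values.
QUICK_DROP = 'Quick_drop'
FLUCTUATION = 'fluctuation'

def update_phase(loss_traj, phase, pivot_index, num_total_task):
    """Return (phase, pivot_index, cutoff); cutoff=True means the real Job
    would call cutoff_task() and mark its still-queued tasks as removed."""
    n = len(loss_traj)
    cutoff = False
    if n < 2:
        return phase, pivot_index, cutoff
    if phase == QUICK_DROP and loss_traj[-1] > loss_traj[-2]:
        # first loss increase ends the quick-drop phase and fixes the pivot
        phase = FLUCTUATION
        pivot_index = n - 2
    if phase == FLUCTUATION:
        samples = n - pivot_index - 1                             # losses observed since the pivot
        budget = int((num_total_task - pivot_index - 1) * 0.37)   # ~37% of the tasks left after the pivot
        if samples > budget:
            cutoff = True
    return phase, pivot_index, cutoff

if __name__ == '__main__':
    num_total_task = 10                                   # hypothetical job with 10 tasks (epochs)
    traj = [1.41, 1.02, 0.71, 0.55, 0.57, 0.54, 0.56]     # synthetic validation losses
    phase, pivot = QUICK_DROP, 0
    for k in range(1, len(traj) + 1):
        phase, pivot, cutoff = update_phase(traj[:k], phase, pivot, num_total_task)
        print('epoch %d  loss %.2f  phase %-11s cutoff %s' % (k, traj[k - 1], phase, cutoff))

With these values the job leaves Quick_drop at the fifth loss (the first increase, so the pivot is index 3) and is cut off two epochs later, once more than int(6 * 0.37) = 2 post-pivot losses have been observed.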
-------------------------------------------------------------------------------- /other/RLmodel.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | import math 5 | class RLnetwork(object): 6 | def __init__(self, numInputs, numOutputs): 7 | self.input_dim = numInputs 8 | self.output_dim = numOutputs 9 | self.weight = torch.Tensor(numInputs, numOutputs).normal_(0, 0.01) 10 | self.weight_grads = torch.Tensor(numInputs, numOutputs) 11 | self.bias = torch.Tensor(numOutputs).zero_() 12 | self.bias_grads = torch.Tensor(numOutputs) 13 | 14 | def forward(self, state_vector): #input: 1 x n. weight: m x n. output: weight x input.t() = m x 1 15 | 16 | return torch.matmul(state_vector, self.weight) + self.bias 17 | 18 | def backward(self, state_vector, vt, learning_rate): 19 | log_grad = 1/(torch.matmul(state_vector, self.weight) + self.bias) 20 | sv = torch.Tensor(self.input_dim,1) 21 | lg = torch.Tensor(1,self.output_dim) 22 | sv = state_vector.unsqueeze_(-1) 23 | lg[0][:] = log_grad 24 | weight_grads = torch.matmul(sv,lg) 25 | self.weight_grads += learning_rate * weight_grads * vt 26 | self.bias_grads += learning_rate * log_grad * vt 27 | return self.weight_grads, self.bias_grads 28 | 29 | def zero_grad(self): 30 | self.weight_grads = 0 31 | self.bias_grads = 0 32 | 33 | def update_grads(self): 34 | self.weight -= self.weight_grads 35 | self.bias -= self.bias_grads -------------------------------------------------------------------------------- /other/cluster.py: -------------------------------------------------------------------------------- 1 | import random 2 | #!/usr/bin/env python 3 | import psutil 4 | from server import * 5 | import threading 6 | import socket 7 | import server 8 | from time import sleep 9 | def get_cpu_memory(): 10 | # gives a single float value 11 | cpu_usage = psutil.cpu_percent() 12 | #print('cpu percentage: '+str(cpu_usage)+'%') 13 | # gives an object with many fields 14 | psutil.virtual_memory() 15 | # you can convert that object to a dictionary 16 | vm_dic = dict(psutil.virtual_memory()._asdict()) 17 | memory_usage = vm_dic['percent'] 18 | #print('memory percentage: '+str(memory_usage)+'%') 19 | 20 | return cpu_usage, memory_usage 21 | 22 | class Cluster: 23 | node_list = [] 24 | topology = {} 25 | completed_task = {} #job_name:[task_name, task_name, ...] 
26 | task_distribution = {} #job_name:{node_name: num_task, node_name: num_task, ...} 27 | def __init__(self, num_rack, max_rack_size): 28 | self.num_rack = num_rack 29 | self.max_rack_size = max_rack_size 30 | 31 | for i in range(num_rack): 32 | self.topology[i] = [] 33 | 34 | def process(self, cur_timestep): 35 | for node in self.node_list: 36 | pop_index = [] 37 | if(len(node.queue) != 0): 38 | for i in range(len(node.queue)): 39 | task = node.queue[i] 40 | if(task.job_name not in node.job_list): 41 | node.add_task(task,cur_timestep) 42 | pop_index.append(i) 43 | for index in sorted(pop_index, reverse = True): 44 | node.queue.pop(index) 45 | 46 | def find_master(self): 47 | for node in self.node_list: 48 | if(node.master): 49 | return node 50 | 51 | def add_node(self,node,rack): 52 | self.node_list.append(node) 53 | self.topology[rack].append(node) 54 | 55 | def has_av_res(self): 56 | for node in self.node_list: 57 | num_rtype = len(node.av_resource) 58 | node_available = True 59 | for i in range(num_rtype): 60 | if(node.av_resource[i] > 0.2): 61 | continue 62 | else: 63 | node_available = False 64 | break 65 | if(node_available): 66 | return True 67 | 68 | return False 69 | 70 | ''' 71 | def update_process_rate(self): 72 | new_message_distribution = {} # node_name: num_new_message 73 | for i in range(len(self.node_list)):#initialization 74 | node_name = self.node_list[i].name 75 | new_message_distribution[node_name] = 0 76 | 77 | for job_name in self.task_distribution: 78 | task_map = self.task_distribution[job_name] 79 | num_task = 0 80 | for exe_node_name,num_node_task in task_map.items(): 81 | num_task += num_node_task 82 | for exe_node_name,num_node_task in task_map.items(): 83 | num_other_node_task = num_task - num_node_task 84 | num_new_message = num_node_task * num_other_node_task 85 | new_message_distribution[exe_node_name] += num_new_message 86 | 87 | for node in self.node_list: 88 | node_new_message = new_message_distribution[node.name] 89 | node.process_rate = 1 - 0.005 * node_new_message 90 | if(node.process_rate < 0.2): 91 | node.process_rate = 0.2 92 | node.num_total_message += node_new_message 93 | ''' 94 | 95 | def step(self,time): 96 | #self.complete_task #job_name:[task_name, task_name, ... 
] 97 | 98 | #for job_name, task_list in self.completed_task.items(): 99 | # self.task_distribution[job_name][node.name] -= len(task_list) 100 | 101 | return self.completed_task 102 | 103 | def complete_task(self,task,node): 104 | task_map = self.task_distribution[task.job_name] 105 | task_map[node.name] -= 1 106 | if(task_map[node.name] == 0): 107 | del task_map[node.name] 108 | if(len(task_map) == 0): 109 | del self.task_distribution[task.job_name] 110 | 111 | def find_task_on_node(self, worker_ip): 112 | for node in self.node_list: 113 | if(worker_ip == node.ip): 114 | return node 115 | 116 | def find_node(self, task): 117 | demand = task.demand 118 | min_affinity = len(demand) + 1 119 | selected_node = None 120 | available = True 121 | for node in self.node_list: 122 | cur_affinity = 0 123 | available = True 124 | for i in range(len(demand)): 125 | if(demand[i] < node.av_resource[i]): 126 | cur_affinity += node.av_resource[i] - demand[i] 127 | else: 128 | available = False 129 | if(available and cur_affinity < min_affinity): 130 | min_affinity = cur_affinity 131 | selected_node = node 132 | 133 | return selected_node 134 | 135 | def find_minload_node(self): 136 | num_rtype = len(self.node_list[0].av_resource) 137 | minload = 0 138 | first_node = self.node_list[0] 139 | selected_node = first_node 140 | for i in range(num_rtype): 141 | minload += 1 - first_node.av_resource[i] 142 | 143 | for node in self.node_list: 144 | cur_load = 0 145 | for i in range(num_rtype): 146 | cur_load += node.av_resource[i] 147 | if(cur_load < minload): 148 | minload = cur_load 149 | selected_node = node 150 | return selected_node 151 | 152 | 153 | class Node: 154 | 155 | def __init__(self, name, num_rtype, ip): 156 | self.localIP = socket.gethostbyname(socket.gethostname()) 157 | self.ip = ip 158 | self.master = (self.localIP == self.ip) 159 | self.job_list = [] 160 | self.queue = [] 161 | self.workload = None 162 | self.cluster = None 163 | self.scheduler = None 164 | 165 | 166 | self.name = name 167 | self.server_port = 9999 168 | self.port_availability = {} 169 | for port in range(2222,8888+1): 170 | self.port_availability[port] = True 171 | 172 | self.av_resource = [] 173 | for i in range(num_rtype): 174 | self.av_resource.append(100) 175 | 176 | self.num_total_message = 0 177 | self.num_exe_task = 0 178 | self.num_total_task = 0 179 | self.task_list = [] 180 | if(not self.master): 181 | start_server_cmd = '/home/ubuntu/TF-scheduler/remote_start_slave '+ip 182 | thread = threading.Thread(target = execute_command, args = (start_server_cmd,), name = 'Slave-Thread'+self.ip) 183 | thread.start() 184 | 185 | def start_master(self, workload, cluster, scheduler): 186 | 187 | localIP = socket.gethostbyname(socket.gethostname()) 188 | Master(localIP, 9999, workload, cluster, scheduler).start() 189 | 190 | 191 | 192 | ''' 193 | def process(self): 194 | completed_task = {} #job_name:[task_name, task_name, ... 
] 195 | len_task_list = len(self.task_list) 196 | pop_index = [] 197 | max_resource = 0 198 | num_rtype = len(self.av_resource) 199 | for i in range(num_rtype): 200 | if(max_resource < 1 - self.av_resource[i]): 201 | max_resource = 1 - self.av_resource[i] 202 | 203 | true_process_rate = self.process_rate 204 | 205 | if(max_resource > 1): 206 | true_process_rate *= 1 / max_resource 207 | 208 | for i in range(len_task_list): 209 | task = self.task_list[i] 210 | if(task.duration - true_process_rate <= 0): 211 | if(task.job_name not in completed_task): 212 | completed_task[task.job_name] = [] 213 | self.num_exe_task -= 1 214 | task.duration = 0 215 | task.executing = False 216 | task.complete = True 217 | 218 | completed_task[task.job_name].append(task.task_name) 219 | 220 | num_rtype = len(self.av_resource) 221 | for j in range(num_rtype): 222 | self.av_resource[j] += task.demand[j] 223 | if(self.av_resource[j] > 1): 224 | self.av_resource[j] = 1 225 | pop_index.append(i) 226 | 227 | 228 | else: 229 | task.duration -= true_process_rate 230 | 231 | for ind in sorted(pop_index, reverse = True): 232 | self.task_list.pop(ind) 233 | 234 | return completed_task 235 | ''' 236 | 237 | def get_available_port(self): 238 | for port in range(2222,8888+1): 239 | if(self.port_availability[port]): 240 | self.port_availability[port] = False 241 | return str(port) 242 | return -1 243 | 244 | def execute_task(self, task): 245 | 246 | worker_ip = task.worker_ip 247 | ps_port = task.ps_port 248 | ps_ip = task.ps_ip 249 | 250 | request = 'execute,'+ps_ip+':'+str(ps_port)+','+worker_ip+':'+str(task.occupied_worker_port)+','+task.job_name+','+str(task.index) 251 | 252 | response = TCP_request(ps_ip, 9999, request) 253 | print('execute task of '+task.job_name+' on worker: '+worker_ip+':'+str(task.occupied_worker_port)) 254 | 255 | def add_task(self, task, cur_timestep): 256 | task.in_queue = False 257 | task.executing = True 258 | self.job_list.append(task.job_name) 259 | task.waiting_time = cur_timestep - task.arriving_time 260 | self.num_exe_task += 1 261 | self.num_total_task += 1 262 | #num_rtype = len(self.av_resource) 263 | #for i in range(num_rtype): 264 | # self.av_resource[i] -= task.demand[i] 265 | self.task_list.append(task) 266 | self.execute_task(task) 267 | 268 | def is_started(self): 269 | request = 'is_started' 270 | response = 'not start' 271 | try: 272 | response = TCP_request(self.ip, 9999, request) 273 | finally: 274 | return (response == 'start') 275 | def clear_removed_task(self): 276 | if(len(self.queue) == 0): 277 | return 278 | pop_index = [] 279 | for i in range(len(self.queue)): 280 | task = self.queue[i] 281 | if(task.complete): 282 | pop_index.append(i) 283 | 284 | for index in sorted(pop_index, reverse = True): 285 | self.queue.pop(index) 286 | 287 | 288 | def update_av_resource(self,cur_timestep): 289 | 290 | while(not self.is_started()): 291 | print(self.ip+' does not start, waiting...') 292 | sleep(5) 293 | 294 | request = 'cpu' 295 | response = TCP_request(self.ip, 9999, request) 296 | CPU = float(response) 297 | request = 'memory' 298 | response = TCP_request(self.ip, 9999, request) 299 | Memory = float(response) 300 | self.av_resource[0] = 100 - CPU 301 | self.av_resource[1] = 100 - Memory 302 | 303 | for task in self.queue: 304 | if(task.job_name not in self.job_list): 305 | self.add_task(task,cur_timestep) 306 | 307 | 308 | 309 | def create_node(node_name, num_rtype, ip): 310 | node = Node(node_name, num_rtype, ip) 311 | return node 312 | 313 | def create_cluster(max_rack, 
max_rack_size, max_num_nodes, num_rtype, ip_list): 314 | cur_num_node = 0 315 | cluster = Cluster(max_rack, max_rack_size) 316 | 317 | for i in range(max_num_nodes): 318 | cur_num_node += 1 319 | node_name = 'node'+str(cur_num_node) 320 | node = create_node(node_name,num_rtype, ip_list[i]) 321 | rack = random.randint(0,max_rack-1) 322 | cluster.add_node(node,rack) 323 | 324 | return cluster 325 | 326 | def display_available_port(port_availability): 327 | for port in range(2222,8888+1): 328 | if(not port_availability[port]): 329 | continue 330 | else: 331 | print('available ports: '+str(port)+'-8888') 332 | break 333 | 334 | def display_node(node): 335 | print('node name: '+node.name) 336 | print('node ip: '+node.ip) 337 | display_available_port(node.port_availability) 338 | print('available resource percent: ') 339 | num_rtype = len(node.av_resource) 340 | for i in range(num_rtype): 341 | print(str(node.av_resource[i])) 342 | if(len(node.task_list) != 0): 343 | print('task list') 344 | for task in node.task_list: 345 | print('task '+str(task.task_name)+'\tjob '+task.job_name) 346 | else: 347 | print('task list empty') 348 | 349 | if(len(node.queue) != 0): 350 | print('node queue') 351 | for task in node.queue: 352 | print('task '+str(task.task_name)+'\tjob '+task.job_name) 353 | else: 354 | print('node queue empty') 355 | 356 | if(len(node.job_list) != 0): 357 | print('node job list') 358 | for job_name in node.job_list: 359 | print(job_name) 360 | else: 361 | print('node job list empty') 362 | print('\n') 363 | 364 | 365 | def display_cluster(cluster, num_rtype): 366 | print('\nnumber of nodes: '+str(len(cluster.node_list))) 367 | for node in cluster.node_list: 368 | display_node(node) 369 | 370 | def find_node(cluster, target_ip): 371 | for node in cluster.node_list: 372 | if(node.ip == target_ip): 373 | return node 374 | 375 | if __name__ == "__main__": 376 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 377 | 378 | ip_list = [] 379 | for ip in servers.values(): 380 | ip_list.append(ip) 381 | 382 | max_num_rack = 3 383 | max_rack_size = 5 384 | max_num_nodes = len(servers) 385 | num_rtype = 2 386 | 387 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 388 | display_cluster(cluster,2) -------------------------------------------------------------------------------- /other/completion_task.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import threading 3 | import server 4 | #TCP_request(ip,port,request): 5 | 6 | shared_resource_lock = threading.Lock() 7 | 8 | def stop_ps(ps_host): 9 | cmd = "kill -9 $(ps -ef | grep " + str(ps_host) + " | grep 'job_name=ps' | awk '{print $2}')" 10 | server.execute_command(cmd) 11 | 12 | 13 | def completion_task(job_name, task_index, worker_ip, worker_port, update_loss, job_dir, cur_serial): 14 | ip = '172.31.6.117' 15 | port = '9999' 16 | request = 'task_completion,'+job_name+','+task_index+','+worker_ip+','+worker_port+','+update_loss+','+job_dir+','+str(cur_serial) 17 | response = server.TCP_request(ip,port,request) 18 | print(response) 19 | -------------------------------------------------------------------------------- /other/computation.py: -------------------------------------------------------------------------------- 1 | import math 2 | def average(a): 3 | total = 0 4 | num_element = len(a) 5 | for element in a: 6 | total += element 7 | 8 | return total / num_element 9 | 10 | def std_dev(a): 11 | aver = average(a) 12 
| total = 0 13 | for element in a: 14 | total += (element - aver) * (element - aver) 15 | 16 | return math.sqrt(total) 17 | 18 | def get_num_message_list(cluster): 19 | num_message_list = [] 20 | for node in cluster.node_list: 21 | num_message_list.append(node.num_total_message) 22 | return num_message_list 23 | 24 | def get_num_task_list(cluster): 25 | num_task_list = [] 26 | for node in cluster.node_list: 27 | num_task_list.append(node.num_total_task) 28 | return num_task_list 29 | 30 | def get_remaining_time_list(queue,timestep): 31 | remaining_time_list = [] 32 | for task in queue: 33 | remaining_time_list.append(task.arriving_time + task.deadline - timestep) 34 | return remaining_time_list 35 | 36 | def get_input_size_list(queue): 37 | input_size_list = [] 38 | for task in queue: 39 | input_size_list.append(task.input_size) 40 | return input_size_list 41 | 42 | def node_reward(W_c_m, W_c_t, t,cluster): 43 | ''' 44 | node.num_total_message 45 | node.num_exe_task 46 | node.num_total_task 47 | ''' 48 | reward_t = 0 49 | num_task_list = get_num_task_list(cluster) 50 | num_message_list = get_num_message_list(cluster) 51 | 52 | aver_num_task = average(num_task_list) 53 | std_num_task = std_dev(num_task_list) 54 | 55 | aver_num_message = average(num_message_list) 56 | std_num_message = std_dev(num_message_list) 57 | 58 | for node in cluster.node_list: 59 | r1 = 0 60 | r2 = 0 61 | 62 | if(std_num_message != 0): 63 | r1 = (node.num_total_message - aver_num_message) / std_num_message 64 | 65 | if(std_num_task != 0): 66 | r2 = (aver_num_task - node.num_total_task) / std_num_task 67 | 68 | reward_t += W_c_m * r1 + W_c_t * r2 69 | 70 | return reward_t 71 | 72 | 73 | 74 | def queue_reward(W_q_d, W_q_s, W_q_r, t, aver_S, scheduler, beta, queue, workload): 75 | 76 | reward_t = 0 77 | input_size_list = get_input_size_list(queue) 78 | remaining_time_list = get_remaining_time_list(queue,t) 79 | 80 | 81 | aver_remaining_time = average(remaining_time_list) 82 | std_remaining_time = std_dev(remaining_time_list) 83 | 84 | aver_input_size = average(input_size_list) 85 | std_input_size = std_dev(input_size_list) 86 | 87 | 88 | for task in queue: 89 | job = workload[int(task.job_name[3])] 90 | num_scheduled_task = scheduler.get_num_scheduled_task(job) 91 | remaining_time = task.arriving_time + task.deadline - t 92 | r1 = 0 93 | r2 =0 94 | r3 = 0 95 | 96 | if(std_remaining_time != 0): 97 | r1 = W_q_d * (remaining_time - aver_remaining_time) / std_remaining_time 98 | 99 | if(std_input_size != 0): 100 | r2 = W_q_s * (task.input_size - aver_input_size / std_input_size) 101 | 102 | r3 = W_q_r * (100 - num_scheduled_task*beta)/beta 103 | 104 | reward_t += r1 + r2 + r3 105 | 106 | return reward_t 107 | 108 | 109 | def discounted_reward(gama,t,reward_traj): 110 | acc_reward = 0 111 | for ti in range(t): 112 | acc_reward += pow(gama,ti) * reward_traj[ti] 113 | return acc_reward -------------------------------------------------------------------------------- /other/epoch_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | import time 5 | 6 | from statistics import * 7 | from workload import * 8 | from cluster import * 9 | from scheduler import * 10 | #from task import * 11 | from RLmodel import * 12 | from computation import * 13 | import threading 14 | import completion_task 15 | 16 | 17 | max_res_demand = 0.3 18 | min_res_demand = 0.1 19 | min_in_size = 100 20 | max_in_size = 1000 21 | min_task_dur = 10 22 | max_task_dur = 50 23 
| max_length_q = 10000 24 | max_num_nodes = 3 25 | min_task = 3600 26 | max_task = 3600 27 | 28 | max_rack_size = 5 29 | max_num_rack = 3 30 | num_rtype = 2 31 | 32 | max_q_length = 10000 33 | max_num_task = 200 34 | min_num_task = 20 35 | min_ddl = max_task_dur * min_task 36 | max_ddl = max_task_dur * max_task 37 | add_task_perc = 0.1 38 | beta = 10 #percentage of tasks one time 39 | 40 | W_q_d = 0.5 41 | W_q_s = 0.5 42 | W_q_r = 0.5 43 | 44 | W_c_m = 0.5 45 | W_c_t = 0.5 46 | 47 | gama_q = 0.9 48 | gama_n = 0.9 49 | learning_rate = 0.1 50 | 51 | remove_threshold = 0.1 52 | 53 | num_coming_job = 5 54 | 55 | remove_option = True 56 | order_net_option = True 57 | node_net_option = True 58 | 59 | node_net_in_dim = 2 * max_num_nodes 60 | node_net_out_dim = max_num_nodes 61 | 62 | order_net_in_dim = 3 * max_q_length 63 | order_net_out_dim = max_q_length 64 | 65 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 66 | 67 | ip_list = [] 68 | for ip in servers.values(): 69 | ip_list.append(ip) 70 | 71 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 72 | 73 | new_job = True 74 | 75 | scheduler = Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 76 | 77 | num_episodes = 1 78 | num_iteration = 1 79 | 80 | wl_len = 1 81 | 82 | Gandiva = False 83 | 84 | ps_ip = socket.gethostbyname(socket.gethostname()) 85 | ps_node = find_node(cluster, ps_ip) 86 | master = None 87 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 88 | for node in cluster.node_list: 89 | if(node.master): 90 | master = node 91 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 92 | thread.start() 93 | break 94 | 95 | timer = [] 96 | #workloads = [] 97 | #for i in range(num_episodes): 98 | # random.shuffle(workload) 99 | # workload_copy = copy_workload(workload) 100 | # workloads.append(workload_copy) 101 | 102 | cluster.node_list[0].update_av_resource() 103 | node1 = None 104 | node2 = None 105 | 106 | for node in cluster.node_list: 107 | if(node.ip == '172.31.4.135'): 108 | node1 = node 109 | elif(node.ip == '172.31.3.225'): 110 | node2 = node 111 | 112 | num_task = 1 113 | for task in workload[0].task_list: 114 | if(num_task % 2 == 1): 115 | task.worker_ip = node1.ip 116 | task.ps_port = master.get_available_port() 117 | task.occupied_worker_port = node1.get_available_port() 118 | node1.queue.append(task) 119 | else: 120 | task.worker_ip = node2.ip 121 | task.ps_port = master.get_available_port() 122 | task.occupied_worker_port = node2.get_available_port() 123 | node2.queue.append(task) 124 | num_task += 1 125 | 126 | while(True): 127 | cluster.process(0) 128 | time.sleep(8) 129 | 130 | -------------------------------------------------------------------------------- /other/server.py: -------------------------------------------------------------------------------- 1 | 2 | import socket 3 | import threading 4 | import sys 5 | import random 6 | #!/usr/bin/env python 7 | import psutil 8 | import os 9 | import completion_task 10 | from workload import * 11 | from cluster import * 12 | 13 | def get_files(job_dir, file_keyword): 14 | files = [] 15 | for file in os.listdir(job_dir): 16 | if(os.path.isfile(os.path.join(job_dir,file)) and file.startswith(file_keyword)): 17 | files.append(file) 18 | return 
files 19 | 20 | 21 | def del_old_model(job_dir, cur_serial, task_index): 22 | files1 = get_files(job_dir, "model.ckpt") 23 | files2 = get_files(job_dir, "latest_model") 24 | for file in files1: 25 | partition = file.split('.') 26 | part2 = partition[1] 27 | serial_number = int(part2[5:]) 28 | if(cur_serial - serial_number >=50): 29 | os.remove(job_dir+"/"+file) 30 | for file in files2: 31 | partition = file.split('.') 32 | part1 = partition[0] 33 | epoch = int(part1.split('_')[3][5:]) 34 | if(epoch != task_index): 35 | os.remove(job_dir+"/"+file) 36 | 37 | def getbytes(string): 38 | return string.encode() 39 | 40 | def getstring(byte): 41 | return byte.decode() 42 | 43 | def TCP_request(ip,port,request): 44 | c_s = socket.socket() 45 | response = 'request failed' 46 | try: 47 | c_s.connect((ip,int(port))) 48 | byte_request = getbytes(request) 49 | c_s.send(byte_request) 50 | byte_response = c_s.recv(1024) 51 | response = getstring(byte_response) 52 | 53 | except Exception, e: 54 | print('ip: '+ip+"\tport: "+str(port)+'\t request: '+request) 55 | print(e) 56 | 57 | finally: 58 | return response 59 | 60 | def execute_command(cmd): 61 | os.system(cmd) 62 | 63 | 64 | class Master: 65 | def __init__(self, ip, port, workload, cluster, scheduler): 66 | self.workload = workload 67 | self.sock = socket.socket() 68 | self.ip = ip 69 | self.port = port 70 | self.sock.bind((ip, int(port))) 71 | self.sock.listen(5) 72 | self.cluster = cluster 73 | self.scheduler = scheduler 74 | self.should_stop = False 75 | 76 | def TCP_reply(self, sock, addr): 77 | 78 | #print('new request from ' + str(addr) + ' is accepted') 79 | while (not self.should_stop): 80 | response = '' 81 | b_data=sock.recv(1024) 82 | data = getstring(b_data) 83 | if (data =='exit' or not data): 84 | break; 85 | 86 | elif(data =='ip'): 87 | response = str(self.ip) 88 | 89 | elif(data =='port'): 90 | response = str(self.port) 91 | 92 | elif(data == 'is_started'): 93 | response = 'start' 94 | 95 | elif(data == 'cpu'): 96 | cpu_usage = psutil.cpu_percent() 97 | response = str(cpu_usage) 98 | 99 | elif(data == 'memory'): 100 | vm_dic = dict(psutil.virtual_memory()._asdict()) 101 | memory_usage = vm_dic['percent'] 102 | response = str(memory_usage) 103 | 104 | elif(data.startswith('loss')): 105 | with open('/home/ubuntu/TF-scheduler/loss_folder/'+data,'r') as f:#data looks like: loss_jobname 106 | response = f.readline() 107 | 108 | elif(data.startswith('accuracy')): 109 | with open('/home/ubuntu/TF-scheduler/accuracy_folder/'+data,'r') as f:#data looks like: accuracy_jobname 110 | response = f.readline() 111 | elif(data.startswith('cluster')): 112 | response = 'nodes infomation' 113 | for node in self.cluster.node_list: 114 | response += 'node name: '+node.name+'\n' 115 | response += 'node ip: '+node.ip+'\n' 116 | 117 | if(len(node.task_list) != 0): 118 | response += 'task list\n' 119 | for task in node.task_list: 120 | response += 'task '+str(task.task_name)+'\tjob '+task.job_name+'\n' 121 | else: 122 | response += 'task list empty' 123 | 124 | if(len(node.queue) != 0): 125 | response += 'node queue\n' 126 | for task in node.queue: 127 | response += 'task '+str(task.task_name)+'\tjob '+task.job_name+'\n' 128 | else: 129 | response += 'node queue empty\n' 130 | 131 | 132 | elif(data.startswith('execute')): 133 | info = data.split(',') 134 | ps_addr = info[1] 135 | ps_ip = ps_addr.split(':')[0] 136 | ps_port = ps_addr.split(':')[1] 137 | worker_addr = info[2] 138 | worker_ip = worker_addr.split(':')[0] 139 | worker_port = 
worker_addr.split(':')[1] 140 | job_name = info[3] 141 | task_index = info[4] 142 | cmd = '/home/ubuntu/TF-scheduler/remote_task_executor '+ps_ip+' '+ps_port+' '+worker_ip+' '+worker_port+' '+job_name+' '+task_index 143 | 144 | thread = threading.Thread(target=execute_command, args=(cmd,), name = 'Task-Execution-Thread-'+worker_ip+'-'+job_name+'-'+task_index) 145 | thread.start() 146 | response = 'executing the task of '+job_name 147 | 148 | elif(data.startswith('checkpoint')): 149 | job_dir = data.split(',')[1] 150 | if(not os.path.exists(job_dir+'/checkpoint')): 151 | response = 'none' 152 | else: 153 | with open(job_dir+'/checkpoint', 'r') as f: 154 | response = f.read() 155 | 156 | elif(data.startswith('update_checkpoint')): 157 | partition = data.split(',') 158 | checkpoint_info = partition[1] 159 | job_dir = partition[2] 160 | with open(job_dir+'/checkpoint', 'w') as f: 161 | f.write(checkpoint_info) 162 | if(os.path.exists(job_dir+'/checkpoint')): 163 | response = 'update checkpoint file success!' 164 | else: 165 | response = 'update failed' 166 | 167 | elif(data.startswith('task_completion')): 168 | info = data.split(',') #task_completion job_name task_index worker_ip worker_port 169 | job_name = info[1] 170 | job_index = int(job_name[3]) 171 | task_index = int(info[2]) 172 | worker_ip = info[3] 173 | worker_port = info[4] 174 | update_loss = info[5] 175 | job_dir = info[6] 176 | cur_serial = int(info[7]) 177 | 178 | job = self.workload[job_index] 179 | task = job.task_list[task_index] 180 | 181 | if(update_loss == 'YES'): 182 | request = 'loss_'+job_name 183 | response = TCP_request(worker_ip, 9999, request) 184 | cur_loss = float(response) 185 | job.loss_traj.append(cur_loss) 186 | 187 | del_old_model(job_dir, cur_serial, task_index) 188 | 189 | task.executing = False 190 | task.complete = True 191 | 192 | completion_task.shared_resource_lock.acquire() 193 | if(not self.cluster.completed_task.has_key(job_name)): 194 | self.cluster.completed_task[job_name] = [] 195 | 196 | self.cluster.completed_task[job_name].append(task.task_name) 197 | 198 | node = self.cluster.find_task_on_node(worker_ip) 199 | for i in range(len(node.task_list)): 200 | cur_task = node.task_list[i] 201 | if(cur_task.job_name == task.job_name and 202 | cur_task.task_name == task.task_name): 203 | node.task_list.pop(i) 204 | break 205 | for i in range(len(node.job_list)): 206 | cur_job_name = node.job_list[i] 207 | if(cur_job_name == task.job_name): 208 | node.job_list.pop(i) 209 | break 210 | #display_cluster(self.cluster, 2) 211 | completion_task.shared_resource_lock.release() 212 | print('port number is '+str(task.ps_port)) 213 | response = 'processed task completion!' 
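# the worker has reported completion; the next step stops the parameter-server process that was serving this task on task.ps_port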
214 | completion_task.stop_ps(task.ps_port) 215 | 216 | b_response = getbytes(response) 217 | sock.send(b_response) 218 | sock.close() 219 | #print('Connection from %s:%s closed' % addr) 220 | 221 | def start(self): 222 | while (True): 223 | c_sock, addr = self.sock.accept() 224 | thread = threading.Thread(target=self.TCP_reply, args=(c_sock,addr), name = 'Reuqest-Listening-Thread-'+str(addr)) 225 | thread.start() 226 | 227 | class Slave: 228 | def __init__(self, ip, port): 229 | self.sock = socket.socket() 230 | self.ip = ip 231 | self.port = str(port) 232 | self.sock.bind((ip, int(port))) 233 | self.sock.listen(5) 234 | self.should_stop = False 235 | 236 | 237 | def TCP_reply(self, sock, addr): 238 | 239 | #print('new request from ' + str(addr) + ' is accepted') 240 | while (not self.should_stop): 241 | response = '' 242 | b_data=sock.recv(1024) 243 | data = getstring(b_data) 244 | if (data =='exit' or not data): 245 | break; 246 | 247 | elif(data =='ip'): 248 | response = str(self.ip) 249 | 250 | elif(data =='port'): 251 | response = str(self.port) 252 | 253 | elif(data == 'is_started'): 254 | response = 'start' 255 | 256 | elif(data == 'cpu'): 257 | cpu_usage = psutil.cpu_percent() 258 | response = str(cpu_usage) 259 | 260 | elif(data == 'memory'): 261 | vm_dic = dict(psutil.virtual_memory()._asdict()) 262 | memory_usage = vm_dic['percent'] 263 | response = str(memory_usage) 264 | 265 | elif(data.startswith('loss')): 266 | with open('/home/ubuntu/TF-scheduler/loss_folder/'+data,'r') as f:#data looks like: loss_jobname 267 | response = f.readline() 268 | 269 | elif(data.startswith('accuracy')): 270 | with open('/home/ubuntu/TF-scheduler/accuracy_folder/'+data,'r') as f:#data looks like: accuracy_jobname 271 | response = f.readline() 272 | 273 | elif(data.startswith('execute')): 274 | info = data.split(',') 275 | ps_addr = info[1] 276 | ps_ip = ps_addr.split(':')[0] 277 | ps_port = ps_addr.split(':')[1] 278 | worker_addr = info[2] 279 | worker_ip = worker_addr.split(':')[0] 280 | worker_port = worker_addr.split(':')[1] 281 | job_name = info[3] 282 | task_index = info[4] 283 | cmd = '/home/ubuntu/TF-scheduler/remote_task_executor '+ps_ip+' '+ps_port+' '+worker_ip+' '+worker_port+' '+job_name+' '+task_index 284 | 285 | thread = threading.Thread(target=execute_command, args=(cmd,), name = 'Task-Execution-Thread-'+worker_ip+'-'+job_name+'-'+task_index) 286 | thread.start() 287 | response = 'executing the task of '+job_name 288 | 289 | b_response = getbytes(response) 290 | sock.send(b_response) 291 | sock.close() 292 | #print('Connection from %s:%s closed' % addr) 293 | 294 | def start(self): 295 | while (True): 296 | c_sock, addr = self.sock.accept() 297 | thread = threading.Thread(target=self.TCP_reply, args=(c_sock,addr), name = 'Reuqest-Listening-Thread-'+str(addr)) 298 | thread.start() 299 | 300 | 301 | if (__name__ == '__main__'): 302 | #print(len(sys.argv)) 303 | if(len(sys.argv) != 2): 304 | print('usage: python server master/slave') 305 | print('exit') 306 | sys.exit(0) 307 | role = sys.argv[1] 308 | localIP = socket.gethostbyname(socket.gethostname()) 309 | node = None 310 | 311 | if(role == 'master'): 312 | print('master') 313 | node = Master(localIP,9999) 314 | print('register') 315 | node.start() 316 | print('listening') 317 | 318 | elif(role == 'slave'): 319 | print('slave') 320 | node = Slave(localIP,9999) 321 | print('register') 322 | node.start() 323 | print('listening') 324 | 325 | else: 326 | print('request') 327 | response = TCP_request(localIP,9999,sys.argv[1]) 328 | 
print("response is: "+response) 329 | 330 | 331 | 332 | 333 | -------------------------------------------------------------------------------- /other/start_pytorch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python trainer.py \ 3 | --ps_hosts=172.31.6.117:2222\ 4 | --worker_hosts=172.31.15.67:2222\ 5 | --job_name=worker --task_index=0 6 | -------------------------------------------------------------------------------- /other/statistics.py: -------------------------------------------------------------------------------- 1 | def show_statistics(workload,cluster,timer): 2 | 3 | total_complete_time = 0 4 | num_deadline_satisfied = 0 5 | total_num_message = 0 6 | 7 | total_waiting_time = [] #0:size 100-300, 1: size 300-500, 2: size 500-700, 3: size 700-900 8 | num_job = [] #0:size 100-300, 1: size 300-500, 2: size 500-700, 3: size 700-900 9 | for i in range(4): 10 | total_waiting_time.append(0) 11 | num_job.append(0) 12 | 13 | acc_reduction = [] #0:0-0.15, 1:0.15-0.3, 2:0.3-0.45, 4:0.45-0.6, 5:0.6-0.75, 6:0.75-1 14 | aver_jct_removed_ratio = [] #0:0-0.15, 1:0.15-0.3, 2:0.3-0.45, 3:0.45-0.6, 4:0.6-0.75, 5:0.75-1 15 | num_job_removed_ratio = [] #0:0-0.15, 1:0.15-0.3, 2:0.3-0.45, 3:0.45-0.6, 4:0.6-0.75, 5:0.75-1 16 | 17 | for i in range(6): 18 | acc_reduction.append(0) 19 | aver_jct_removed_ratio.append(0) 20 | num_job_removed_ratio.append(0) 21 | 22 | 23 | #total_task = 0 24 | JCTs = [] 25 | for job in workload: 26 | arriving_time = job.task_list[0].arriving_time 27 | complete_time = job.complete_time 28 | JCTs.append(complete_time - arriving_time) 29 | 30 | num_task = len(job.task_list) 31 | job_waiting_time = 0 32 | 33 | for task in job.task_list: 34 | #total_task += 1 35 | job_waiting_time += task.waiting_time 36 | 37 | if(job.input_size<325):#100-325 38 | total_waiting_time[0] += job_waiting_time / num_task 39 | num_job[0] += 1 40 | elif(job.input_size<550):#325-550 41 | total_waiting_time[1] += job_waiting_time / num_task 42 | num_job[1] += 1 43 | elif(job.input_size<775):#550-775 44 | total_waiting_time[2] += job_waiting_time / num_task 45 | num_job[2] += 1 46 | elif(job.input_size<1000):#775-1000 47 | total_waiting_time[3] += job_waiting_time / num_task 48 | num_job[3] += 1 49 | 50 | removed_ratio = job.num_removed_task / len(job.task_list) 51 | total_num_task = len(job.task_list) 52 | num_removed_task = job.num_removed_task 53 | 54 | if(removed_ratio < 0.15): 55 | acc_reduction[0] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 56 | aver_jct_removed_ratio[0] += complete_time - arriving_time 57 | num_job_removed_ratio[0] += 1 58 | elif(removed_ratio < 0.3): 59 | acc_reduction[1] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 60 | aver_jct_removed_ratio[1] += complete_time - arriving_time 61 | num_job_removed_ratio[1] += 1 62 | elif(removed_ratio < 0.45): 63 | acc_reduction[2] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 64 | aver_jct_removed_ratio[2] += complete_time - arriving_time 65 | num_job_removed_ratio[2] += 1 66 | elif(removed_ratio < 0.6): 67 | acc_reduction[3] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 68 | aver_jct_removed_ratio[3] += complete_time - arriving_time 69 | num_job_removed_ratio[3] += 1 70 | elif(removed_ratio < 0.75): 71 | acc_reduction[4] += 1 - (job.initial_loss - 0.372) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 72 | 
aver_jct_removed_ratio[4] += complete_time - arriving_time 73 | num_job_removed_ratio[4] += 1 74 | elif(removed_ratio < 1): 75 | acc_reduction[5] += 1 - (job.initial_loss - 0.35246) / (job.initial_loss - job.loss_traj[job.mloss_ind]) 76 | aver_jct_removed_ratio[5] += complete_time - arriving_time 77 | num_job_removed_ratio[5] += 1 78 | 79 | jct = complete_time - arriving_time 80 | total_complete_time += jct 81 | 82 | if(jct < job.deadline): 83 | num_deadline_satisfied += 1 84 | 85 | sorted_JCTs = sorted(JCTs) 86 | num_job = len(workload) 87 | for i in range(num_job): 88 | jct = sorted_JCTs[i] 89 | perc = ((i+1)/num_job)*100 90 | print('CDF point( '+str(jct)+', '+str(perc)+'%)') 91 | 92 | avg_waiting_time = 0 93 | 94 | deadline_gurantee = num_deadline_satisfied / len(workload) 95 | print('job deadline gurantee = '+str(deadline_gurantee)) 96 | 97 | print('remove ratio: 0-15%, 15-30%, 30-45%, 45-60%, 60-75%') 98 | 99 | aver_job_acc_red = 0 100 | 101 | for i in range(6): 102 | if(num_job_removed_ratio[i] == 0): 103 | acc_reduction[i] = 0 104 | aver_jct_removed_ratio[i] = 0 105 | else: 106 | acc_reduction[i] /= num_job_removed_ratio[i] 107 | aver_jct_removed_ratio[i] /= num_job_removed_ratio[i] 108 | aver_job_acc_red += acc_reduction[i] 109 | 110 | print('percentage of this part: '+str(num_job_removed_ratio[i] * 100 / len(workload))+'%') 111 | print('job accuracy reduction = '+str(acc_reduction[i] / len(workload))) 112 | print('average JCT over removed ratio = '+str(aver_jct_removed_ratio[i])) 113 | 114 | print('average job accuracy reduction: '+str(aver_job_acc_red / len(workload))) 115 | 116 | for i in range(4): 117 | if(num_job[i] != 0): 118 | avg_waiting_time += total_waiting_time[i] 119 | total_waiting_time[i] /= num_job[i] 120 | print('job waiting time = '+str(total_waiting_time[i])) 121 | print('average job waiting time = '+str(avg_waiting_time/len(workload))) 122 | average_jct = total_complete_time / len(workload) 123 | print('average jct = '+str(average_jct)) 124 | 125 | total_num_message = 0 126 | 127 | for node in cluster.node_list: 128 | total_num_message += node.num_total_message 129 | 130 | avg_message = total_num_message/len(cluster.node_list) 131 | avg_bandwidth = avg_message * 0.0035 132 | print('avg_bandwidth = '+str(avg_bandwidth)) 133 | 134 | total_time = 0 135 | 136 | for time in timer: 137 | total_time += time 138 | aver_time = total_time / len(timer) 139 | print('average latency = '+str(aver_time)) 140 | 141 | 142 | def is_schedule_all_task(job_list): 143 | for job in job_list: 144 | for task in job.task_list: 145 | if(not task.in_queue and not task.executing and not task.complete): 146 | return False 147 | return True 148 | 149 | def schedule_policy(Gandiva, job_list, cluster): 150 | can_schedule = True 151 | if(not Gandiva): 152 | can_schedule = cluster.has_av_res() 153 | else: 154 | can_schedule = not is_schedule_all_task(job_list) 155 | return can_schedule -------------------------------------------------------------------------------- /other/task_executor.py: -------------------------------------------------------------------------------- 1 | #from task import * 2 | #from workload import * 3 | import os 4 | 5 | cmd = 'ls' 6 | os.system(cmd) 7 | 8 | -------------------------------------------------------------------------------- /other/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import random 4 | import time 5 | 6 | from statistics import * 7 | from workload import * 8 | from cluster 
import * 9 | from scheduler import * 10 | #from task import * 11 | from RLmodel import * 12 | from computation import * 13 | import threading 14 | import completion_task 15 | 16 | 17 | max_res_demand = 0.3 18 | min_res_demand = 0.1 19 | min_in_size = 100 20 | max_in_size = 1000 21 | min_task_dur = 10 22 | max_task_dur = 50 23 | max_length_q = 100 24 | max_num_nodes = 3 25 | min_task = 10 26 | max_task = 10 27 | 28 | max_rack_size = 5 29 | max_num_rack = 3 30 | num_rtype = 2 31 | 32 | max_q_length = 50 33 | max_num_task = 200 34 | min_num_task = 20 35 | min_ddl = max_task_dur * min_task 36 | max_ddl = max_task_dur * max_task 37 | add_task_perc = 0.1 38 | beta = 10 #percentage of tasks one time 39 | 40 | W_q_d = 0.5 41 | W_q_s = 0.5 42 | W_q_r = 0.5 43 | 44 | W_c_m = 0.5 45 | W_c_t = 0.5 46 | 47 | gama_q = 0.9 48 | gama_n = 0.9 49 | learning_rate = 0.1 50 | 51 | remove_threshold = 0.1 52 | 53 | num_coming_job = 5 54 | 55 | remove_option = True 56 | order_net_option = True 57 | node_net_option = True 58 | 59 | node_net_in_dim = 2 * max_num_nodes 60 | node_net_out_dim = max_num_nodes 61 | 62 | order_net_in_dim = 3 * max_q_length 63 | order_net_out_dim = max_q_length 64 | 65 | servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 66 | 67 | ip_list = [] 68 | for ip in servers.values(): 69 | ip_list.append(ip) 70 | 71 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 72 | 73 | new_job = True 74 | 75 | scheduler = Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 76 | 77 | num_episodes = 1 78 | num_iteration = 1 79 | 80 | wl_len = 10 81 | 82 | Gandiva = False 83 | 84 | ps_ip = socket.gethostbyname(socket.gethostname()) 85 | ps_node = find_node(cluster, ps_ip) 86 | master = None 87 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 88 | for node in cluster.node_list: 89 | if(node.master): 90 | master = node 91 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 92 | thread.start() 93 | break 94 | 95 | timer = [] 96 | #workloads = [] 97 | #for i in range(num_episodes): 98 | # random.shuffle(workload) 99 | # workload_copy = copy_workload(workload) 100 | # workloads.append(workload_copy) 101 | 102 | cluster.node_list[0].update_av_resource() 103 | 104 | task = workload[0].task_list[0] 105 | task.ps_port = master.get_available_port() 106 | task.occupied_worker_port = 2222 107 | 108 | for node in cluster.node_list: 109 | if(node.ip == '172.31.4.135'): 110 | task.worker_ip = node.ip 111 | node.add_task(task,0) 112 | -------------------------------------------------------------------------------- /other/train_RL_model.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | from statistics import * 5 | from workload import * 6 | from cluster import * 7 | from scheduler import * 8 | #from task import * 9 | from RLmodel import * 10 | from computation import * 11 | import threading 12 | import completion_task 13 | 14 | 15 | max_res_demand = 0.3 16 | min_res_demand = 0.1 17 | min_in_size = 100 18 | max_in_size = 1000 19 | min_task_dur = 10 20 | max_task_dur = 50 21 | max_length_q = 100 22 | max_num_nodes = 4 23 | min_task = 30 24 | max_task = 50 25 | 26 | max_rack_size = 5 27 | max_num_rack = 3 28 | num_rtype = 2 29 
| 30 | max_q_length = 50 31 | max_num_task = 200 32 | min_num_task = 20 33 | min_ddl = max_task_dur * min_task 34 | max_ddl = max_task_dur * max_task 35 | add_task_perc = 0.1 36 | beta = 10 #percentage of tasks one time 37 | 38 | W_q_d = 0.5 39 | W_q_s = 0.5 40 | W_q_r = 0.5 41 | 42 | W_c_m = 0.5 43 | W_c_t = 0.5 44 | 45 | gama_q = 0.9 46 | gama_n = 0.9 47 | learning_rate = 0.1 48 | 49 | remove_threshold = 0.0 50 | 51 | num_coming_job = 5 52 | 53 | remove_option = False 54 | order_net_option = True 55 | node_net_option = True 56 | 57 | node_net_in_dim = 2 * max_num_nodes 58 | node_net_out_dim = max_num_nodes 59 | 60 | order_net_in_dim = 3 * max_q_length 61 | order_net_out_dim = max_q_length 62 | 63 | #servers = {'node1':'172.31.6.117', 'node2':'172.31.4.135', 'node3':'172.31.3.225'} 64 | servers = {'node1':'172.31.6.117', 'node2':'172.31.44.243', 'node3':'172.31.35.92', 'node4':'172.31.35.219'} 65 | 66 | ip_list = [] 67 | for ip in servers.values(): 68 | ip_list.append(ip) 69 | 70 | cluster = create_cluster(max_num_rack, max_rack_size, max_num_nodes, num_rtype, ip_list) 71 | 72 | new_job = True 73 | 74 | scheduler = Scheduler(max_q_length, add_task_perc, node_net_in_dim, node_net_out_dim, order_net_in_dim, order_net_out_dim) 75 | 76 | num_episodes = 1 77 | num_iteration = 1 78 | 79 | wl_len = 10 80 | 81 | Gandiva = False 82 | 83 | ps_ip = socket.gethostbyname(socket.gethostname()) 84 | ps_node = find_node(cluster, ps_ip) 85 | 86 | workload = generate_workload(wl_len, min_task, max_task, min_in_size, max_in_size, min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype, ps_node) 87 | for node in cluster.node_list: 88 | if(node.master): 89 | thread = threading.Thread(target = node.start_master, args = (workload, cluster, scheduler), name = 'Master-Thread'+node.ip) 90 | thread.start() 91 | break 92 | 93 | timer = [] 94 | #workloads = [] 95 | #for i in range(num_episodes): 96 | # random.shuffle(workload) 97 | # workload_copy = copy_workload(workload) 98 | # workloads.append(workload_copy) 99 | 100 | for iteration in range(num_iteration): 101 | print('iteration: '+str(iteration)) 102 | #workload = generate_workload(wl_len,min_task,max_task,min_in_size,max_in_size,min_res_demand,max_res_demand,min_ddl,max_ddl,min_task_dur,max_task_dur,num_rtype) 103 | unfinished_job_list = workload 104 | queue_reward_traj = [] 105 | node_reward_traj = [] 106 | 107 | queue_sv_traj = [] 108 | node_sv_traj = [] 109 | 110 | queue_discounted_reward_traj = [] 111 | node_discounted_reward_traj = [] 112 | 113 | scheduler.node_network.zero_grad() 114 | scheduler.order_network.zero_grad() 115 | for episode in range(num_episodes): 116 | print('episode: '+str(episode)) 117 | #workload = workloads[episode] 118 | queue_reward_traj.append([]) 119 | node_reward_traj.append([]) 120 | 121 | queue_sv_traj.append([]) 122 | node_sv_traj.append([]) 123 | 124 | queue_discounted_reward_traj.append([]) 125 | node_discounted_reward_traj.append([]) 126 | 127 | cur_timestep = time.clock() 128 | cur_job_num = 0 129 | num_schedule = 0 130 | schedule_interval = 5 131 | 132 | while(len(unfinished_job_list) != 0): 133 | 134 | cur_timestep = time.clock() + num_schedule * schedule_interval 135 | if(cur_job_num < wl_len): 136 | workload[cur_job_num].update_arr_time(cur_timestep) 137 | scheduler.job_arriving(workload[cur_job_num]) 138 | cur_job_num += num_coming_job 139 | if(cur_job_num >= wl_len): 140 | cur_job_num = wl_len 141 | 142 | unfinished_job_list = find_unfinished_jobs(workload, cur_job_num, cur_timestep) 143 | 
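# once no released job has unfinished tasks left, the episode is over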
144 | if(len(unfinished_job_list) == 0): 145 | print('all jobs finish') 146 | break 147 | last_add_job = int(unfinished_job_list[0].job_name[3]) 148 | 149 | valid = True 150 | num_decision = 0 151 | 152 | while(valid): 153 | 154 | dead_loop = 0 155 | 156 | can_schedule = schedule_policy(Gandiva, unfinished_job_list, cluster) 157 | 158 | while(can_schedule and len(scheduler.queue) < scheduler.max_q_length): 159 | 160 | dead_loop += 1 161 | 162 | last_add_job = scheduler.fill_queue(unfinished_job_list, last_add_job, dead_loop, remove_threshold, remove_option) 163 | if(last_add_job < 0): 164 | break 165 | 166 | can_schedule = schedule_policy(Gandiva, unfinished_job_list, cluster) 167 | 168 | start = time.clock() 169 | if(not Gandiva): 170 | cluster.process(cur_timestep) 171 | valid, q_sv, n_sv, t_ind, n_ind = scheduler.schedule_one_task(cur_timestep, cluster, order_net_option, node_net_option) 172 | else: 173 | valid = scheduler.Gandiva_schedule_one_task(cur_timestep, cluster) 174 | elapsed = (time.clock() - start) 175 | 176 | 177 | num_decision += 1 178 | 179 | if(len(scheduler.queue) == 0): 180 | #print('schedule all tasks') 181 | break 182 | 183 | if(not valid): 184 | #print('invalid action') 185 | num_decision -= 1 186 | break 187 | 188 | timer.append(elapsed) 189 | if(not Gandiva): 190 | queue_sv_traj[episode].append(q_sv) 191 | node_sv_traj[episode].append(n_sv) 192 | 193 | aver_size = scheduler.average_size() 194 | 195 | beta = scheduler.add_task_perc 196 | queue = scheduler.queue 197 | 198 | q_rt = queue_reward(W_q_d, W_q_s, W_q_r, cur_timestep, aver_size, scheduler, beta, queue, workload) 199 | queue_reward_traj[episode].append(q_rt) 200 | 201 | n_rt = node_reward(W_c_m, W_c_t, cur_timestep, cluster) 202 | node_reward_traj[episode].append(n_rt) 203 | 204 | print('\n\ncurrent time: '+str(cur_timestep)) 205 | 206 | completion_task.shared_resource_lock.acquire() 207 | for job in workload: 208 | job.update_priority() 209 | for node in cluster.node_list: 210 | node.clear_removed_task() 211 | 212 | scheduler.clear_removed_task() 213 | display_scheduler(scheduler) 214 | display_cluster(cluster,2) 215 | for job in workload: 216 | display_job(job) 217 | 218 | print('scheduler now is sleeping... 
itertaion '+str(num_schedule)) 219 | time.sleep(schedule_interval) 220 | 221 | completion_task.shared_resource_lock.release() 222 | num_schedule += 1 223 | 224 | print('finish episode '+str(episode)+', makespan = '+str(cur_timestep)) 225 | 226 | if(not Gandiva): 227 | num_action = len(queue_sv_traj[episode]) 228 | for j in range(num_action): 229 | n_vt = discounted_reward(gama_n, j, node_reward_traj[episode]) 230 | q_vt = discounted_reward(gama_q, j, queue_reward_traj[episode]) 231 | node_discounted_reward_traj[episode].append(n_vt) 232 | queue_discounted_reward_traj[episode].append(q_vt) 233 | 234 | show_statistics(workload,cluster,timer) 235 | timer = [] 236 | 237 | unfinished_job_list = workload 238 | 239 | if(not Gandiva): 240 | num_action = 100000000 241 | 242 | for episode in range(num_episodes): 243 | if(num_action > len(queue_sv_traj[episode])): 244 | num_action = len(queue_sv_traj[episode]) 245 | 246 | q_r_episode_baseline = [] 247 | n_r_episode_baseline = [] 248 | for j in range(num_action): 249 | total_qr_b = 0 250 | total_nr_b = 0 251 | for episode in range(num_episodes): 252 | total_qr_b += queue_discounted_reward_traj[episode][j] 253 | total_nr_b += node_discounted_reward_traj[episode][j] 254 | 255 | q_r_episode_baseline.append(total_qr_b / num_episodes) 256 | n_r_episode_baseline.append(total_nr_b / num_episodes) 257 | 258 | for episode in range(num_episodes): 259 | for j in range(num_action): 260 | q_sv = queue_sv_traj[episode][j] 261 | n_sv = node_sv_traj[episode][j] 262 | q_vt = queue_discounted_reward_traj[episode][j] 263 | n_vt = node_discounted_reward_traj[episode][j] 264 | qr_b = q_r_episode_baseline[j] 265 | nr_b = n_r_episode_baseline[j] 266 | scheduler.order_network.backward(q_sv, q_vt - qr_b, learning_rate) 267 | scheduler.node_network.backward(n_sv, n_vt - nr_b, learning_rate) 268 | 269 | scheduler.node_network.update_grads() 270 | scheduler.order_network.update_grads() 271 | 272 | torch.save(scheduler.node_network.weight, 'node_weight_ep'+str(iteration)+'.para') 273 | torch.save(scheduler.node_network.bias, 'node_bias_ep'+str(iteration)+'.para') 274 | torch.save(scheduler.order_network.weight, 'order_weight_ep'+str(iteration)+'.para') 275 | torch.save(scheduler.order_network.bias, 'order_bias_ep'+str(iteration)+'.para') -------------------------------------------------------------------------------- /other/workload.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import socket 4 | from enum import Enum 5 | 6 | class Job_Phase(Enum): 7 | Quick_drop = 0 8 | fluctuation = 1 9 | overfitting = 2 10 | 11 | class Task: 12 | def __init__(self, task_name, job_name, deadline, input_size, duration, demand, ps_port, task_index): 13 | self.task_name = task_name 14 | self.job_name = job_name 15 | self.demand = demand 16 | self.deadline = deadline 17 | self.arriving_time = 0 18 | self.input_size = input_size 19 | self.priority = 1 20 | self.ps_ip = socket.gethostbyname(socket.gethostname()) 21 | self.ps_port = ps_port 22 | self.index = task_index 23 | #self.duration = duration 24 | #self.pre_dur = duration 25 | self.worker_ip = None 26 | self.occupied_worker_port = None 27 | self.waiting_time = 0 28 | self.in_queue = False 29 | self.executing = False 30 | self.complete = False 31 | 32 | class Job: 33 | 34 | def __init__(self, job_name, deadline, tasks, input_size, ps_port): 35 | self.mloss_ind = -1 36 | self.removed_task = [] 37 | self.loss_phase = Job_Phase.Quick_drop 38 | self.pivot_index = 0 39 | 
self.samples = 0 40 | self.job_name = job_name 41 | self.task_list = tasks 42 | self.ps_ip = socket.gethostbyname(socket.gethostname()) 43 | self.ps_port = ps_port 44 | self.input_size = input_size 45 | self.num_task = len(tasks) 46 | self.deadline = deadline 47 | self.complete_time = 0 48 | self.initial_loss = 1.41 # loss of model without training 49 | self.min_loss = self.initial_loss 50 | self.loss_traj = [] 51 | self.num_beta = 0 52 | self.add_task_perc = 0.1 53 | self.num_removed_task = 0 54 | self.initialized = False 55 | #simulation loss trajectory 56 | #for i in range(self.num_task): 57 | # cur_loss = self.initial_loss * math.pow(0.85,i) 58 | # self.loss_traj.append(cur_loss) 59 | 60 | self.arriving_time = 0 61 | self.complete_time = 0 62 | 63 | def cutoff_task(self): 64 | for task in self.task_list: 65 | if(task.in_queue): 66 | task.in_queue = False 67 | task.complete = True 68 | self.removed_task.append(task.task_name) 69 | self.num_removed_task = len(self.removed_task) 70 | 71 | def update_arr_time(self, timestep): 72 | if(self.arriving_time == 0): 73 | self.arriving_time = timestep 74 | for task in self.task_list: 75 | task.arriving_time = timestep 76 | 77 | def add_task(task): 78 | self.task_list.append(task) 79 | self.num_task += 1 80 | 81 | def count_complete_task(self): 82 | num_complete_task = 0 83 | for task in self.task_list: 84 | if(task.complete): 85 | num_complete_task += 1 86 | 87 | return num_complete_task 88 | 89 | def get_num_scheduled_task(self): 90 | num_scheduled_task = 0 91 | for task in self.task_list: 92 | if(task.in_queue or task.complete or task.executing): 93 | num_scheduled_task += 1 94 | return num_scheduled_task 95 | 96 | def detect_overfitting(self): 97 | L = len(self.loss_traj) 98 | local_optimals = self.find_local_optimals() 99 | loss_value_overfitting = False 100 | 101 | if(len(local_optimals) > 1): 102 | first_local_opt = list(local_optimals.values())[0] 103 | last_local_opt = list(local_optimals.values())[-1] 104 | if(last_local_opt > first_local_opt): 105 | loss_value_overfitting = True 106 | 107 | num_loin_epoch = 0 # loss increase epochs 108 | last_loss = self.loss_traj[-1] 109 | for i in range(-2, -L-1, -1): 110 | loss_change = last_loss - self.loss_traj[i] 111 | last_loss = self.loss_traj[i] 112 | if(loss_change > 0): 113 | num_loin_epoch += 1 114 | else: 115 | break 116 | 117 | if(num_loin_epoch > 5 or loss_value_overfitting): 118 | self.loss_phase = Job_Phase.overfitting 119 | self.cutoff_task() 120 | 121 | def find_local_optimals(self): 122 | local_optimals = {} # {(index1:loss1),(index2:loss2),....} 123 | L = len(self.loss_traj) 124 | if(L < 3): 125 | return local_optimals 126 | 127 | for i in range(1,L-1): 128 | last_loss = self.loss_traj[i-1] 129 | cur_loss = self.loss_traj[i] 130 | next_loss = self.loss_traj[i+1] 131 | if(cur_loss < last_loss and cur_loss < next_loss): 132 | local_optimals[i] = cur_loss 133 | 134 | return local_optimals 135 | 136 | def update_priority(self): 137 | #num_complete_task = self.count_complete_task() 138 | num_complete_task = len(self.loss_traj) 139 | ind_last = num_complete_task - 1 140 | if(ind_last <= 0):#none finished task 141 | return 142 | 143 | num_total_task = self.num_task 144 | num_beta = self.num_beta 145 | 146 | num_scheduled_task = self.get_num_scheduled_task() 147 | ini_loss = self.initial_loss 148 | 149 | num_rest_task = num_total_task - num_complete_task 150 | last_loss_reduction = self.loss_traj[-2] - self.loss_traj[-1] 151 | 152 | if(last_loss_reduction < 0 and self.loss_phase == 
Job_Phase.Quick_drop): 153 | self.loss_phase = Job_Phase.fluctuation 154 | self.pivot_index = num_complete_task - 2 155 | self.samples = 1 156 | self.min_loss = self.loss_traj[-2] 157 | self.mloss_ind = num_complete_task - 2 158 | 159 | if(self.loss_phase == Job_Phase.fluctuation): 160 | self.detect_overfitting() 161 | if(self.loss_phase == Job_Phase.overfitting): 162 | return 163 | self.samples = num_complete_task - self.pivot_index - 1 164 | num_fluc_task = num_total_task - self.pivot_index - 1 165 | max_num_samples = int(num_fluc_task * 0.37) 166 | if(self.samples > max_num_samples): 167 | if(self.min_loss > self.loss_traj[-1]): 168 | self.min_loss = self.loss_traj[-1] 169 | self.mloss_ind = num_complete_task - 1 170 | self.cutoff_task() 171 | elif(self.min_loss > self.loss_traj[-1]): 172 | self.min_loss = self.loss_traj[-1] 173 | self.mloss_ind = num_complete_task - 1 174 | 175 | 176 | def display_task(task): 177 | print('task '+task.task_name) 178 | if(task.in_queue): 179 | print('status: in queue') 180 | elif(task.executing): 181 | print('status: executing') 182 | elif(task.complete): 183 | print('status: complete') 184 | else: 185 | print('status: not scheduled') 186 | 187 | 188 | def create_demand(num_rtype,min_rd,max_rd): 189 | demand = [] 190 | for i in range(num_rtype): 191 | demand.append(min_rd + (max_rd - min_rd) * random.random()) 192 | return demand 193 | 194 | 195 | def create_task(task_name, job_name, min_rd, max_rd, deadline, num_rtype,input_size, duration, flag_demand, demands, ps_port, task_index): 196 | demand = [] 197 | if(flag_demand): 198 | demand = create_demand(num_rtype,min_rd,max_rd) 199 | else: 200 | demand = demands 201 | task = Task(task_name, job_name, deadline,input_size, duration, demand, ps_port, task_index) 202 | return task 203 | 204 | def create_job(job_name, num_task, input_size, min_rd, max_rd, deadline, num_rtype, min_task_dur, max_task_dur, ps_node, flag_demand): 205 | tasks = [] 206 | for i in range(num_task): 207 | task_name = str(i) 208 | demands = [] 209 | duration = random.randint(min_task_dur,max_task_dur) 210 | ps_port = ps_node.get_available_port() 211 | task = create_task(task_name, job_name, min_rd, max_rd, deadline, num_rtype, input_size, duration, flag_demand, demands, ps_port, i) 212 | tasks.append(task) 213 | job = Job(job_name,deadline, tasks, input_size, ps_port) 214 | return job 215 | 216 | 217 | def display_job(job): 218 | 219 | print('\n\njob name: '+job.job_name) 220 | print('validation loss:') 221 | if(len(job.loss_traj) == 0): 222 | print('no finished task') 223 | else: 224 | print(job.loss_traj) 225 | if(len(job.removed_task) == 0): 226 | print('no removed task') 227 | else: 228 | print('removed tasks: '+str(job.removed_task)) 229 | for task in job.task_list: 230 | display_task(task) 231 | 232 | import random 233 | def generate_workload(length, min_task, max_task, min_in_size, max_in_size, min_rd, max_rd, min_ddl, max_ddl,min_task_dur,max_task_dur, num_rtype, ps_node): 234 | workload = [] 235 | for i in range(length): 236 | job_name = 'job'+str(i) 237 | num_task = random.randint(min_task,max_task) 238 | input_size = random.randint(min_in_size,max_in_size) 239 | deadline = random.randint(min_ddl, max_ddl) 240 | 241 | job = create_job(job_name,num_task,input_size,min_rd,max_rd, deadline, num_rtype,min_task_dur,max_task_dur, ps_node, flag_demand = True) 242 | workload.append(job) 243 | return workload 244 | 245 | def job_not_complete(job): 246 | for task in job.task_list: 247 | if(not task.complete): 248 | return True 249 | 
return False 250 | 251 | def find_unfinished_jobs(workload, cur_job_num, timestep): 252 | unfinished_job_list = [] 253 | for i in range(cur_job_num): 254 | if(job_not_complete(workload[i])): 255 | unfinished_job_list.append(workload[i]) 256 | elif(workload[i].complete_time < timestep and workload[i].complete_time == 0): 257 | workload[i].complete_time = timestep 258 | return unfinished_job_list 259 | 260 | 261 | def copy_task(task): 262 | cp_task = Task(task.task_name, task.job_name, task.deadline, task.input_size, 0, task.demand, task.ps_port, task.index) # Task.__init__ requires a task_index and does not store duration, so a placeholder duration is passed 263 | return cp_task 264 | 265 | def copy_job(job): 266 | cp_tasks = [] 267 | for task in job.task_list: 268 | cp_task = copy_task(task) 269 | cp_tasks.append(cp_task) 270 | cp_job = Job(job.job_name, job.deadline, cp_tasks, job.input_size, job.ps_port) 271 | return cp_job 272 | 273 | def copy_workload(workload): 274 | cp_wl = [] 275 | for job in workload: 276 | cp_job = copy_job(job) 277 | cp_wl.append(cp_job) 278 | return cp_wl 279 | 280 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.8.0 2 | opencv-python==3.4.0.12 3 | tqdm==4.19.6 4 | pandas==0.22.0 5 | matplotlib==2.2.0 6 | numpy==1.16.2 7 | scikit-learn==0.19.1 8 | 9 | 10 | --------------------------------------------------------------------------------
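A note on the reward plumbing in other/computation.py and other/train_RL_model.py: per-decision queue and node rewards are accumulated into discounted returns (discounted_reward), the step-j return is averaged across episodes to form a baseline, and the difference between a trajectory's return and that baseline (the advantage) is what scales each backward() call on the order and node networks. The sketch below only illustrates that arithmetic on toy numbers; the function names are invented for illustration and are not part of the repository.

# Illustrative sketch (not repository code): the discounted-return / baseline
# arithmetic used by train_RL_model.py, shown on toy numbers.

def discounted_return(gamma, t, rewards):
    # same recurrence as discounted_reward() in other/computation.py:
    # sum of gamma**i * rewards[i] over the first t decisions
    return sum(pow(gamma, i) * rewards[i] for i in range(t))

def per_step_baseline(returns_per_episode):
    # average the step-j return over all episodes (truncated to the shortest episode)
    num_steps = min(len(r) for r in returns_per_episode)
    num_episodes = len(returns_per_episode)
    return [sum(r[j] for r in returns_per_episode) / num_episodes for j in range(num_steps)]

def advantages(returns_per_episode):
    # advantage = discounted return minus the cross-episode baseline;
    # train_RL_model.py passes this difference to the networks' backward() calls
    baseline = per_step_baseline(returns_per_episode)
    return [[r[j] - baseline[j] for j in range(len(baseline))] for r in returns_per_episode]

if __name__ == '__main__':
    gamma = 0.9
    reward_traj = [[1.0, 0.5, -0.2, 0.3], [0.8, 0.1, 0.0]]  # two toy episodes of per-decision rewards
    returns = [[discounted_return(gamma, t, r) for t in range(len(r))] for r in reward_traj]
    print(advantages(returns))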