├── README.md ├── configs ├── docker_containers ├── local └── ssh ├── env_builder.py ├── loader.py ├── logging.conf ├── main_agent.py ├── main_measurements.py ├── operations.txt ├── pqos_handler.py ├── rdt_env.py ├── requirements.txt ├── scheduler.py ├── scripts ├── study_abliation.sh ├── study_coeff.sh ├── study_coex.sh ├── study_different_bes.sh ├── study_feature.sh ├── study_interval.sh ├── study_lc_profiling.sh ├── study_measurement.sh ├── study_measurement_diff_bes.sh ├── study_multi.sh ├── study_quantile.sh ├── study_test.sh ├── study_transfer.sh └── study_transfer_train.sh └── utils ├── argparser.py ├── config_constants.py ├── constants.py └── functions.py /README.md: -------------------------------------------------------------------------------- 1 | ## Resource-Allocation-Reinforcement-Learning 2 | 3 | This repository is part of my Master Thesis titled: "__*Design and implementation of an intelligent agent, 4 | capable of sharing resources in multicore systems, using Deep Reinforcement Learning*__". 5 | 6 | Implementation of a Deep Reinforcement Learning agent that is capable of sharing the last-level cache of a multi-core system between a Latency Critical service and a number of Best Effort applications. By utilising the DQN family of algorithms, the agent managed to keep the SLA violations of the critical service below 3% while, at the same time, achieving up to a 4x speed-up for the Best Effort apps by allocating cache ways to them when possible. 7 | 8 | Please use this identifier to cite or link to this item: http://artemis.cslab.ece.ntua.gr:8080/jspui/handle/123456789/17662 9 | 10 | ### Dependencies 11 | 12 | In a new conda environment, execute: 13 | 14 | pip install -r requirements.txt 15 | 16 | -------------------------------------------------------------------------------- /configs/docker_containers: -------------------------------------------------------------------------------- 1 | { 2 | #'in-memory': ('zilutian/in-memory-analytics:amd64', '/data/ml-latest /data/myratings.csv --driver-memory 6g --executor-memory 16g', 'data'), 3 | #'in-memory-small': ('zilutian/in-memory-analytics:amd64', '/data/ml-latest-small /data/myratings.csv', 'data'), 4 | 'graphs': ('cloudsuite/graph-analytics', '--driver-memory 6g --executor-memory 16g', 'data-twitter'), 5 | 'DecisionTreeClassification': ('nikmand/bes:spark-example', 'ml.JavaDecisionTreeClassificationExample', 'data-svm'), 6 | 'GradientBoostedTreeRegressor': ('nikmand/bes:spark-example', 'ml.JavaGradientBoostedTreeRegressorExample', 'data-svm'), 7 | 'LinearSVC': ('nikmand/bes:spark-example', 'ml.JavaLinearSVCExample', 'data-svm'), 8 | 'LogisticRegressionWithElasticNet': ('nikmand/bes:spark-example', 'ml.JavaLogisticRegressionWithElasticNetExample', 'data-svm'), 9 | #'RandomForestClassifier': ('nikmand/bes:spark-example', 'ml.JavaRandomForestClassifierExample', 'data-svm'), 10 | #'libquantum': ('nikmand/spec:462', None, None), 11 | #'mcf': ('nikmand/spec:429', None, None), 12 | #'lbm': ('nikmand/spec:470', None, None) 13 | #'astar': ('nikmand/spec:473', None, None), 14 | 'hmmer': ('nikmand/spec:456', None, None), 15 | #'gems': ('nikmand/spec:459', None, None) 16 | } 17 | 18 | -------------------------------------------------------------------------------- /configs/local: -------------------------------------------------------------------------------- 1 | [env] 2 | latency_thr = 10 3 | num_ways = 9 4 | pen_coef = 2 5 | 6 | [pqos] 7 | cores_lc = 0 8 | pqos_interface = none 9 | 10 | [loader] 11 | hp_ip = 127.0.0.1 12 | hp_port = 42171 13 
| cores_loader = 2 14 | loader_dir = /home/nikmand/CLionProjects/memcached 15 | loader_threads = 1 16 | loader_conn = 10 17 | ratio = 0 18 | rps = 100 19 | exp_dist = 20 | interval = -1 21 | 22 | [scheduler] 23 | cores_be = 1-3 24 | cores_per_be = 3 25 | num_bes = 1 26 | be_repeated = 1 27 | docker_file = configs/docker_containers 28 | # scheduler subclasses 29 | seed = 1 30 | bes_list = ['graphs', 'in-memory'] 31 | 32 | [agent] 33 | lr = 1e-2 34 | layers_dim = [24, 48] 35 | target_update = 100 36 | batch_size = 32 37 | gamma = 0.99 38 | arch = dueling 39 | algo = ddqn 40 | mem_size = 10_000 41 | mem_type = per 42 | eps_decay = 0.001 43 | eps_start = 1 44 | eps_end = 0.01 45 | checkpoint = 46 | weights = noinit 47 | 48 | [misc] 49 | -------------------------------------------------------------------------------- /configs/ssh: -------------------------------------------------------------------------------- 1 | [env] 2 | latency_thr = 10 3 | num_ways = 18 4 | pen_coef = 2 5 | 6 | [pqos] 7 | cores_lc = 0 8 | pqos_interface = none 9 | 10 | [loader] 11 | hp_ip = 127.0.0.1 12 | hp_port = 42171 13 | cores_loader = 2 14 | loader_dir = /home/users/nmandil/memcached 15 | interval = -1 16 | rps = 100 17 | loader_threads = 1 18 | loader_conn = 10 19 | ratio = 0 20 | exp_dist = 21 | 22 | [scheduler] 23 | cores_per_be = 3 24 | num_bes = 1 25 | cores_be = 1-9 26 | be_repeated = 1 27 | docker_file = configs/docker_containers 28 | # scheduler subclasses 29 | seed = 1 30 | bes_list = ['in-memory', 'in-memory', 'in-memory'] 31 | 32 | [agent] 33 | lr = 1e-2 34 | layers_dim = [24, 48] 35 | target_update = 100 36 | batch_size = 32 37 | gamma = 0.99 38 | arch = dueling 39 | algo = ddqn 40 | mem_size = 10_000 41 | mem_type = per 42 | eps_decay = 0.001 43 | eps_start = 1 44 | eps_end = 0.01 45 | checkpoint = 46 | weights = noinit 47 | 48 | [misc] 49 | -------------------------------------------------------------------------------- /env_builder.py: -------------------------------------------------------------------------------- 1 | from loader import MemCachedLoader 2 | from pqos_handler import PqosHandlerMock, PqosHandlerPid, PqosHandlerCore 3 | from rdt_env import Rdt 4 | from scheduler import RandomScheduler, QueueScheduler 5 | from utils.constants import Loaders, Schedulers 6 | from utils.functions import parse_num_list 7 | 8 | 9 | def loader_factory(service_name, config): 10 | """ """ 11 | if service_name == Loaders.MEMCACHED: 12 | loader = MemCachedLoader(config) 13 | else: 14 | raise ValueError("Loader option {} is not supported".format(service_name)) 15 | 16 | return loader 17 | 18 | 19 | def scheduler_factory(scheduler_type, config): 20 | """ """ 21 | if scheduler_type == Schedulers.RANDOM: 22 | scheduler = RandomScheduler(config) 23 | elif scheduler_type == Schedulers.QUEUE: 24 | scheduler = QueueScheduler(config) 25 | else: 26 | raise ValueError("Scheduler option {} is not supported".format(scheduler_type)) 27 | 28 | return scheduler 29 | 30 | 31 | def pqos_factory(pqos_interface, cores_pid_hp, cores_pids_be): 32 | """ """ 33 | cores_pid_hp_range = parse_num_list(cores_pid_hp) 34 | cores_pids_be_range = parse_num_list(cores_pids_be) 35 | if pqos_interface == 'MSR': 36 | pqos_handler = PqosHandlerCore(cores_pid_hp_range, cores_pids_be_range) 37 | elif pqos_interface == 'OS': 38 | pqos_handler = PqosHandlerPid(cores_pid_hp_range, cores_pids_be_range) 39 | else: 40 | pqos_handler = PqosHandlerMock() 41 | 42 | return pqos_handler 43 | 44 | 45 | class EnvBuilder: 46 | """ It takes over the creation of the 
environment. """ 47 | 48 | def __init__(self): 49 | self.loader = None 50 | self.scheduler = None 51 | self.pqos_handler = None 52 | 53 | def build_loader(self, service_name, config): 54 | self.loader = loader_factory(service_name, config) 55 | 56 | return self 57 | 58 | def build_pqos(self, pqos_interface, cores_pid_hp_range, cores_pids_be_range): 59 | self.pqos_handler = pqos_factory(pqos_interface, cores_pid_hp_range, cores_pids_be_range) 60 | 61 | return self 62 | 63 | def build_scheduler(self, scheduler_type, config): 64 | self.scheduler = scheduler_factory(scheduler_type, config) 65 | 66 | return self 67 | 68 | def build(self, config): 69 | env = Rdt(config, self.loader, self.scheduler, self.pqos_handler) 70 | 71 | return env 72 | -------------------------------------------------------------------------------- /loader.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import struct 3 | import subprocess 4 | from time import sleep 5 | import logging.config 6 | from abc import ABC, abstractmethod 7 | from utils.config_constants import * 8 | 9 | logging.config.fileConfig('logging.conf') 10 | log = logging.getLogger('simpleExample') 11 | 12 | 13 | class Loader(ABC): 14 | """ Abstract class that handles all the functionality that concerns the service loader. """ 15 | def __init__(self, config): 16 | self.client = None 17 | self.service_ip = config[HP_IP] 18 | self.service_port = config.getint(HP_PORT) 19 | self.loader_dir = config[LOADER_DIR] 20 | self.quantile = config[QUANTILE] 21 | self.measurement_interval = config[ACTION_INTERVAL] 22 | self.rps = config.getint(LOADER_RPS) 23 | self.cores_loader = config[CORES_LOADER] 24 | 25 | @abstractmethod 26 | def start(self): 27 | """ Starts the loader as a subprocess. """ 28 | raise NotImplementedError 29 | 30 | def stop(self): 31 | """ Sends a signal to stop the loader and checks for proper termination. """ 32 | 33 | self.client.terminate() 34 | sleep(0.5) 35 | while self.client.poll() is None: 36 | log.debug("Unable to shut down loader. Retrying...") 37 | self.client.terminate() 38 | 39 | def reset(self): 40 | """ Restarts the loader. """ 41 | 42 | if self.client is not None: 43 | self.stop() 44 | self.start() 45 | 46 | def get_stats(self): 47 | """ Collects the stats from the loader. Currently we receive the tail latency at the specified quantile 48 | and the requests per second. """ 49 | 50 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 51 | s.connect((self.service_ip, self.service_port)) 52 | s.sendall(b'get q95') # the text can be anything; it just unblocks the loader 53 | 54 | fmt = "dd" 55 | fmt_size = struct.calcsize(fmt) 56 | data = s.recv(fmt_size) # this call will block 57 | latency, rps = struct.unpack(fmt, data[:fmt_size]) 58 | 59 | # log.debug('Tail latency {}: {}'.format(self.quantile, latency)) 60 | # log.debug('RPS: {}'.format(rps)) 61 | 62 | return latency, rps 63 | 64 | 65 | class MemCachedLoader(Loader): 66 | """ Wrapper class for the Memcached loader. """ 67 | def __init__(self, config): 68 | super().__init__(config) 69 | self.loader_threads = config[LOADER_THREADS] 70 | self.loader_conn = config[LOADER_CONN] 71 | self.ratio = config[GET_SET_RATIO] 72 | self.exponential_dist = config[EXP_DIST] 73 | 74 | def start(self): 75 | """ Starts the memcached loader with all necessary args. 
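A resulting command looks roughly like the following (illustrative values, taken from configs/local and the argparser defaults): taskset --cpu-list 2 <loader_dir>/loader -a <loader_dir>/twitter_dataset/twitter_dataset_30x -s <loader_dir>/docker_servers.txt -g 0 -c 10 -w 1 -T 200 -r 100 -q .95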
""" 76 | # TODO probably subprocess could be moved to superclass and only arguments should be defined in subclasses 77 | loader = '{}/loader'.format(self.loader_dir) 78 | dataset = '{}/twitter_dataset/twitter_dataset_30x'.format(self.loader_dir) 79 | servers = '{}/docker_servers.txt'.format(self.loader_dir) 80 | self.client = subprocess.Popen(['taskset', '--cpu-list', self.cores_loader, loader, '-a', dataset, '-s', 81 | servers, '-g', self.ratio, '-c', self.loader_conn, '-w', self.loader_threads, 82 | '-T', self.measurement_interval, '-r', str(self.rps), '-q', self.quantile, 83 | self.exponential_dist]) 84 | sleep(10) # wait in order to bind the socket 85 | 86 | log.debug("Loader started.") 87 | -------------------------------------------------------------------------------- /logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, simpleExample, rlsuite 3 | 4 | [handlers] 5 | keys=consoleHandler 6 | 7 | [formatters] 8 | keys=simpleFormatter 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=consoleHandler 13 | 14 | [logger_simpleExample] 15 | level=DEBUG 16 | handlers=consoleHandler 17 | qualname=simpleExample 18 | propagate=0 19 | 20 | [logger_rlsuite] 21 | level=DEBUG 22 | handlers=consoleHandler 23 | qualname=rlsuite 24 | propagate=0 25 | 26 | [handler_consoleHandler] 27 | class=StreamHandler 28 | level=DEBUG 29 | formatter=simpleFormatter 30 | args=(sys.stdout,) 31 | 32 | [formatter_simpleFormatter] 33 | format=%(asctime)s - %(filename)s:%(lineno)d - %(levelname)s - %(message)s 34 | datefmt= -------------------------------------------------------------------------------- /main_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import numpy as np 4 | import ast 5 | from rlsuite.builders.factories import memory_factory 6 | from env_builder import EnvBuilder 7 | import logging.config 8 | from rlsuite.utils.functions import log_parameters_histograms 9 | from rlsuite.builders.agent_builder import DQNAgentBuilder 10 | from torch.utils.tensorboard import SummaryWriter 11 | from utils.config_constants import * 12 | from utils.constants import Loaders, Schedulers 13 | from utils.functions import write_metrics, form_duration, config_parser 14 | from utils.argparser import cmd_parser 15 | from datetime import datetime 16 | import os 17 | 18 | logging.config.fileConfig('logging.conf') 19 | log = logging.getLogger('simpleExample') 20 | 21 | MEM_START_SIZE = 1000 22 | 23 | time_at_start = datetime.now().strftime('%b%d_%H-%M-%S') 24 | parser = cmd_parser() 25 | args = parser.parse_args() 26 | 27 | config = config_parser(args.config_file) 28 | 29 | # some arguments are set from command line args, that was useful for tuning 30 | if config[LOADER][ACTION_INTERVAL] == "-1": 31 | config[LOADER][ACTION_INTERVAL] = args.interval 32 | 33 | if config[AGENT][EPS_DECAY] == "-1": 34 | config[AGENT][EPS_DECAY] = args.decay 35 | 36 | config[LOADER][QUANTILE] = args.quantile 37 | config[ENV][FEATURE] = args.feature 38 | 39 | env = EnvBuilder() \ 40 | .build_pqos(config[PQOS][PQOS_INTERFACE], config[PQOS][CORES_LC], config[SCHEDULER][CORES_BE]) \ 41 | .build_loader(Loaders.MEMCACHED, config[LOADER]) \ 42 | .build_scheduler(Schedulers.QUEUE, config[SCHEDULER]) \ 43 | .build(config[ENV]) 44 | 45 | comment = f"_{args.comment}" 46 | writer = SummaryWriter(comment=comment) 47 | 48 | num_of_observations = env.observation_space.shape[0] 49 | num_of_actions = 
env.action_space.n 50 | 51 | log.info("Number of available actions: {}".format(num_of_actions)) 52 | log.info("Number of input features: {}".format(num_of_observations)) 53 | 54 | # TODO handle this in an elegant way, maybe we could use a dict that maps each field to a function that can be applied 55 | # in order to get the type. 56 | lr = config[AGENT].getfloat(LR) 57 | layers_dim = ast.literal_eval(config[AGENT][LAYERS_DIM]) 58 | target_update = config[AGENT].getint(TARGET_UPDATE) 59 | batch_size = config[AGENT].getint(BATCH_SIZE) 60 | arch = config[AGENT][ARCH] # Vanilla or Dueling DQN 61 | agent_algorithm = config[AGENT][ALGO] # DDQN or DQN 62 | mem_type = config[AGENT][MEM_PER] 63 | mem_size = config[AGENT].getint(MEM_SIZE) 64 | gamma = config[AGENT].getfloat(GAMMA) 65 | eps_decay = config[AGENT].getfloat(EPS_DECAY) 66 | eps_start = config[AGENT].getfloat(EPS_START) 67 | eps_end = config[AGENT].getfloat(EPS_END) 68 | checkpoint_path = config[AGENT][CHECKPOINT] 69 | init_weights = config[AGENT][WEIGHTS] 70 | 71 | criterion = torch.nn.MSELoss(reduction='none') # torch.nn.SmoothL1Loss() # Huber loss 72 | optimizer = optim.Adam 73 | 74 | memory = memory_factory(mem_type, mem_size) 75 | 76 | agent = DQNAgentBuilder(num_of_observations, num_of_actions, gamma, eps_decay, eps_start, eps_end) \ 77 | .set_criterion(criterion) \ 78 | .build_network(layers_dim, arch) \ 79 | .load_checkpoint(checkpoint_path) \ 80 | .build_optimizer(optimizer, lr) \ 81 | .build(agent_algorithm) 82 | 83 | done = False 84 | step = 0 85 | decaying_schedule = 0 86 | total_reward = 0 87 | exploration_viol = 0 88 | end_exploration_step = 1 89 | end_exploration_flag = False 90 | 91 | try: 92 | state = env.reset() 93 | state = np.float32(state) 94 | 95 | while not done: 96 | action = agent.choose_action(state) 97 | # measuring env step time 98 | # start_step_time = time.time() 99 | # could run in parallel with the rest of the loop but GIL prevents this 100 | next_state, reward, done, info = env.step(action) 101 | # end_step_time = time.time() 102 | # step_interval = (end_step_time - start_step_time) * 1000 103 | # writer.add_scalar('Timing/Env Step', step_interval, step) 104 | next_state = np.float32(next_state) 105 | memory.store(state, action, next_state, reward, done) # Store the transition in memory 106 | state = next_state 107 | 108 | step += 1 109 | 110 | if mem_type == 'per' and memory.tree.n_entries < MEM_START_SIZE: 111 | continue 112 | 113 | # measure the violations of the exploration phase separately 114 | if agent.epsilon < eps_end + 0.01 and not end_exploration_flag: 115 | log.info("Conventional end of exploration at step: {}".format(step)) 116 | exploration_viol = env.violations 117 | end_exploration_step = step 118 | end_exploration_flag = True 119 | 120 | total_reward += reward 121 | 122 | # experimental path used to create checkpoints: increase exploration when new be is started 123 | # if new_be: 124 | # log.info("New be started at step: {}. 
Exploration rate increased.".format(step)) 125 | # decaying_schedule = min(decaying_schedule, 0) # resets exploration rate at 0.2 with 3210, 4500 for 0.1 126 | # # memory.flush() # we didn't observe any benefit from emptying the memory 127 | # 128 | # save_file = os.path.join('checkpoints', time_at_start + comment + '_' + str(step) + '.pkl') 129 | # agent.save_checkpoint(save_file) 130 | 131 | try: 132 | transitions, indices, is_weights = memory.sample(batch_size) 133 | except ValueError: # not enough samples in memory 134 | continue 135 | 136 | decaying_schedule += 1 137 | 138 | loss, errors = agent.update(transitions, is_weights) # Perform one step of optimization on the policy net 139 | agent.adjust_exploration(decaying_schedule) # rate is updated at every step 140 | memory.batch_update(indices, errors) # only applicable for per 141 | 142 | if step % target_update == 0: # Update the target network 143 | agent.update_target_net() 144 | # creates an enormous amount of data and gives little information, so we disable the logging of weights 145 | # log_parameters_histograms(writer, agent.target_net, step, 'TargetNet') 146 | 147 | for key, value in info.items(): 148 | write_metrics(writer, key, value, step) 149 | writer.add_scalar('Agent/Action', action, step) 150 | writer.add_scalar('Agent/Reward', reward, step) 151 | writer.add_scalar('Agent/Reward Cumulative', total_reward, step) 152 | writer.add_scalar('Agent/Epsilon', agent.epsilon, step) 153 | writer.add_scalar('Agent/Loss', loss, step) 154 | writer.flush() 155 | # log_parameters_histograms(writer, agent.policy_net, step, 'PolicyNet') 156 | 157 | # measuring training time 158 | # end_training = time.time() 159 | # training_interval = (end_training - end_step_time) * 1000 160 | # writer.add_scalar('Timing/Training', training_interval, step) 161 | 162 | log.info("Experiment finished after {} steps.".format(step)) 163 | duration = env.get_experiment_duration() 164 | writer.add_graph(agent.policy_net, torch.tensor(state, device=agent.device)) 165 | writer.add_hparams({'lr': lr, 'gamma': gamma, 'HL Dims': str(layers_dim), 'Target_upd_interval': target_update, 166 | 'Algorithm': agent_algorithm, 'Arch': arch, 'Batch Size': batch_size, 'Mem Type': mem_type, 167 | 'Mem Size': mem_size}, 168 | {'Results/Viol. Post-Expl.': (env.violations - exploration_viol) / (step - end_exploration_step), 169 | 'Results/Viol. Exploration': exploration_viol / end_exploration_step, 170 | 'Results/Violations Total': env.violations / step, 171 | 'Results/Time': duration}) 172 | 173 | writer.add_text('duration', form_duration(duration)) 174 | 175 | finally: 176 | save_file = os.path.join('checkpoints', time_at_start + comment + '.pkl') 177 | agent.save_checkpoint(save_file) 178 | 179 | writer.flush() 180 | writer.close() 181 | env.stop() 182 | -------------------------------------------------------------------------------- /main_measurements.py: -------------------------------------------------------------------------------- 1 | from env_builder import EnvBuilder 2 | import logging.config 3 | from utils.argparser import cmd_parser 4 | from torch.utils.tensorboard import SummaryWriter 5 | from utils.functions import write_metrics, form_duration, config_parser 6 | from utils.constants import Loaders, Schedulers 7 | from utils.config_constants import * 8 | 9 | # This script enforces static allocation and writes the metrics of the execution. 
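# A typical invocation (hypothetical values) would be:
#   python main_measurements.py -c configs/local --ways-be 4 --comment static_4ways
# Passing --ways-be -1 leaves all the ways shared between the two groups.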
10 | 11 | logging.config.fileConfig('logging.conf') 12 | log = logging.getLogger('simpleExample') 13 | 14 | parser = cmd_parser() 15 | parser.add_argument('--ways-be', type=int, default=-1, help='Ways to be allocated to best effort group') 16 | args = parser.parse_args() 17 | 18 | config = config_parser(args.config_file) 19 | 20 | if config[LOADER][ACTION_INTERVAL] == "-1": 21 | config[LOADER][ACTION_INTERVAL] = args.interval 22 | 23 | config[LOADER][QUANTILE] = args.quantile 24 | config[ENV][FEATURE] = args.feature 25 | 26 | env = EnvBuilder() \ 27 | .build_pqos(config[PQOS][PQOS_INTERFACE], config[PQOS][CORES_LC], config[SCHEDULER][CORES_BE]) \ 28 | .build_loader(Loaders.MEMCACHED, config[LOADER]) \ 29 | .build_scheduler(Schedulers.QUEUE, config[SCHEDULER]) \ 30 | .build(config[ENV]) 31 | 32 | comment = "_measurement_action_{}_{}".format(args.ways_be, args.comment) 33 | writer = SummaryWriter(comment=comment) 34 | 35 | done = False 36 | log.info("Num of ways that are going to be statically allocated to BEs: {}".format(args.ways_be)) 37 | 38 | try: 39 | state = env.reset() 40 | 41 | while not done: 42 | next_state, reward, done, info = env.step(args.ways_be) 43 | 44 | for key, value in info.items(): 45 | write_metrics(writer, key, value, env.steps) 46 | 47 | duration = env.get_experiment_duration() 48 | log.info("Experiment finished after {} steps.".format(env.steps)) 49 | writer.add_hparams({'Action': args.ways_be}, 50 | {'Results/Violations Total': env.violations / env.steps, 'Results/Time': duration}) 51 | 52 | writer.add_text('duration', form_duration(duration)) 53 | writer.flush() 54 | 55 | finally: 56 | writer.flush() 57 | writer.close() 58 | env.stop() 59 | -------------------------------------------------------------------------------- /operations.txt: -------------------------------------------------------------------------------- 1 | ### This file includes instructions for the operations needed during this project. 2 | 3 | ### System operations 4 | 5 | # export the pqos library path on the server; alternatively it can be set permanently in ~/.bashrc (done). 
6 | export LD_LIBRARY_PATH=/home/users/nmandil/intel-cmt-cat/lib/ 7 | 8 | # information about the cache (size of ways etc) and about the capabilities of pqos 9 | ~/intel-cmt-cat/pqos/pqos -D 10 | 11 | # reset COS and associations 12 | ./pqos -R 13 | 14 | # check current pqos settings 15 | ./pqos -s 16 | 17 | # command to get the hyperthreaded pair of cpu10; the second core number listed is the hyperthread 18 | cat /sys/devices/system/cpu/cpu10/topology/thread_siblings_list 19 | 20 | ### Project operations 21 | 22 | # launch the Memcached Server on core 0 with 16GB of memory 23 | numactl -m 0 -N 0 -C 0 ~/memcached-server/memcached -l 127.0.0.1:11211 -t 1 -m 16384 -n 550 & 24 | 25 | # warm up the server by running the loader (as indicated by cloudsuite) 26 | ~/memcached/loader -a ~/memcached/twitter_dataset/twitter_dataset_30x -s ~/memcached/docker_servers.txt -w 4 -S 1 -D 16384 -j -T 1000 -Z 27 | 28 | # launch tensorboard (locally) 29 | tensorboard --logdir runs 30 | 31 | # launch tensorboard on 2nd server socket 32 | #taskset --cpu-list 15 tensorboard --logdir ~/path/to/runs & 33 | 34 | # to see Tensorboard locally, ssh with port forwarding from the server 35 | ssh -L 16006:127.0.0.1:6006 username@broady3.cslab.ece.ntua.gr 36 | 37 | # to run the agent 38 | time taskset --cpu-list 18-19 python main_agent.py args 39 | 40 | # to run the measurements main 41 | time taskset --cpu-list 18-19 python main_measurements.py args --ways-be $way 42 | 43 | # to update rlsuite library 44 | pip install git+https://github.com/nikmand/Reinforcement-Learning-Library.git#egg=rlsuite -U 45 | 46 | -------------------------------------------------------------------------------- /pqos_handler.py: -------------------------------------------------------------------------------- 1 | from pqos import Pqos 2 | from pqos.capability import PqosCap, CPqosMonitor 3 | from pqos.cpuinfo import PqosCpuInfo 4 | from pqos.monitoring import PqosMon 5 | from pqos.l3ca import PqosCatL3 6 | from pqos.allocation import PqosAlloc 7 | import logging.config 8 | from random import randint, randrange 9 | from abc import ABC, abstractmethod 10 | 11 | logging.config.fileConfig('logging.conf') 12 | log = logging.getLogger('simpleExample') 13 | 14 | # NOTE we define all possible masks for our server, which has 20 ways in LLC (L3) 15 | # Due to a pqos limitation the min mask width on the left side is 2, so the HP service will always have at least two ways 16 | 17 | L3_NUM_WAYS = 20 # NOTE consider getting this number from CpuInfo 18 | 19 | # ways that can be assigned to BEs 20 | ways = [0x00001, 0x00003, 0x00007, 0x0000f, 21 | 0x0001f, 0x0003f, 0x0007f, 0x000ff, 22 | 0x001ff, 0x003ff, 0x007ff, 0x00fff, 23 | 0x01fff, 0x03fff, 0x07fff, 0x0ffff, 24 | 0x1ffff, 0x3ffff, 0x7ffff, 0xfffff] 25 | 26 | # ways = [(1 << i) - 1 for i in range(1, L3_NUM_WAYS + 1)] 27 | 28 | base = (1 << L3_NUM_WAYS) - 1 # mask with all LLC ways set 29 | 30 | # base = ways[-1] 31 | 32 | 33 | def bytes_to_kb(num_bytes): 34 | """ 35 | Converts bytes to kilobytes. 36 | 37 | :param num_bytes: number of bytes 38 | :return: number of kilobytes 39 | """ 40 | 41 | return num_bytes / 1024.0 42 | 43 | 44 | def bytes_to_mb(num_bytes): 45 | """ 46 | Converts bytes to megabytes. 47 | 48 | :param num_bytes: number of bytes 49 | :returns: number of megabytes 50 | """ 51 | 52 | return num_bytes / (1024.0 * 1024.0) 53 | 54 | 55 | def get_event_name(event_type): 56 | """ 57 | Converts a monitoring event type to a string label required by the libpqos Python wrapper. 
58 | 59 | :param event_type: monitoring event type 60 | :return: a string label 61 | """ 62 | 63 | event_map = { 64 | CPqosMonitor.PQOS_MON_EVENT_L3_OCCUP: 'l3_occup', 65 | CPqosMonitor.PQOS_MON_EVENT_LMEM_BW: 'lmem_bw', 66 | CPqosMonitor.PQOS_MON_EVENT_TMEM_BW: 'tmem_bw', 67 | CPqosMonitor.PQOS_MON_EVENT_RMEM_BW: 'rmem_bw', 68 | CPqosMonitor.PQOS_PERF_EVENT_LLC_MISS: 'perf_llc_miss', 69 | CPqosMonitor.PQOS_PERF_EVENT_IPC: 'perf_ipc' 70 | } 71 | 72 | return event_map.get(event_type) 73 | 74 | 75 | def get_metrics(group_values, time_interval): 76 | """ Extracts the monitored metrics from a group's values; bandwidth deltas are scaled by the time interval. """ 77 | ipc = group_values.ipc 78 | misses = group_values.llc_misses_delta # / (group_values.ipc_unhalted_delta / 1000.) 79 | 80 | llc = bytes_to_mb(group_values.llc) 81 | mbl = bytes_to_mb(group_values.mbm_local_delta) 82 | mbr = bytes_to_mb(group_values.mbm_remote_delta) 83 | 84 | cycles = group_values.ipc_unhalted_delta 85 | instructions = group_values.ipc_retired_delta 86 | 87 | mbl_ps, mbr_ps = mbl / time_interval, mbr / time_interval 88 | 89 | return ipc, misses, llc, mbl_ps, mbr_ps, cycles, instructions 90 | 91 | 92 | def get_metrics_random(): 93 | """ Mock method that returns the same kind of values as get_metrics. """ 94 | ipc = randrange(0, 2) 95 | misses = randint(1e3, 1e5) 96 | llc = randint(1e3, 1e5) 97 | mbl = randint(1e2, 1e3) 98 | mbr = randint(1e2, 1e3) 99 | cycles = randint(1e2, 1e3) 100 | instructions = randint(1e2, 1e3) 101 | 102 | return ipc, misses, llc, mbl, mbr, cycles, instructions 103 | 104 | 105 | class PqosHandler(ABC): 106 | """ Generic class for monitoring """ 107 | 108 | def __init__(self, interface, socket=0, cos_id_hp=1, cos_id_be=2): 109 | self.pqos = Pqos() 110 | self.pqos.init(interface) 111 | self.mon = PqosMon() 112 | self.alloc = PqosAlloc() 113 | self.l3ca = PqosCatL3() 114 | self.cap = PqosCap() 115 | self.cpu_info = PqosCpuInfo() 116 | self.socket = socket # The experiment takes place on a single socket 117 | self.cos_id_hp = cos_id_hp 118 | self.cos_id_be = cos_id_be 119 | self.group_hp, self.group_be = None, None 120 | self.events = self.get_supported_events() 121 | 122 | @abstractmethod 123 | def setup_groups(self): # NOTE this MUST follow reset of monitoring 124 | """Sets up monitoring groups. Needs to be implemented by a derived class.""" 125 | raise NotImplementedError 126 | 127 | @abstractmethod 128 | def set_association_class(self): 129 | """ 130 | Associates the classes of service with the selected CPUs or PIDs 131 | """ 132 | raise NotImplementedError 133 | 134 | @abstractmethod 135 | def print_association_config(self): 136 | """ Logs the current core/PID to COS associations. """ 137 | raise NotImplementedError 138 | 139 | def finish(self): 140 | self.pqos.fini() 141 | 142 | def get_supported_events(self): 143 | """ Returns a list of supported monitoring events. """ 144 | 145 | mon_cap = self.cap.get_type('mon') 146 | 147 | events = [get_event_name(event.type) for event in mon_cap.events] 148 | 149 | # Filter out perf events 150 | # events = list(filter(lambda event: 'perf' not in event, events)) 151 | 152 | return events 153 | 154 | def get_all_cores(self): 155 | """ Returns a list of all available cores. Used for informational reasons only. """ 156 | 157 | cores = [] 158 | sockets = self.cpu_info.get_sockets() 159 | 160 | for socket in sockets: 161 | cores += self.cpu_info.get_cores(socket) 162 | 163 | return cores 164 | 165 | def reset(self): 166 | """ Resets monitoring and the allocation/association configuration. 
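The monitoring groups are then re-created by the caller; the order used by the environment (see rdt_env._reset_pqos) is: reset() -> setup_groups() -> set_association_class().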
""" 167 | 168 | self.mon.reset() 169 | self.reset_allocation_association() 170 | 171 | def update(self): 172 | """ Updates values for monitored events. """ 173 | 174 | self.mon.poll([self.group_hp, self.group_be]) 175 | 176 | def get_hp_metrics(self, time_interval): 177 | return get_metrics(self.group_hp.values, time_interval) 178 | 179 | def get_be_metrics(self, time_interval): 180 | return get_metrics(self.group_be.values, time_interval) 181 | 182 | def stop(self): 183 | """ Stops monitoring.""" 184 | 185 | self.group_hp.stop() 186 | self.group_be.stop() 187 | 188 | def set_allocation_class(self, ways_be): 189 | """ 190 | Sets up allocation classes of service on selected CPU sockets 191 | 192 | Parameters: 193 | ways_be: num of ways to be assigned for bes 194 | """ 195 | if ways_be == -1: # default setting, all ways can be accessed by both groups 196 | mask_be = ways[-1] 197 | mask_hp = ways[-1] 198 | else: 199 | mask_be = ways[ways_be] 200 | mask_hp = mask_be ^ base 201 | cos_hp = self.l3ca.COS(self.cos_id_hp, mask_hp) 202 | cos_be = self.l3ca.COS(self.cos_id_be, mask_be) 203 | 204 | try: 205 | self.l3ca.set(self.socket, [cos_hp, cos_be]) 206 | except: 207 | log.error("Setting up cache allocation class of service failed!") 208 | raise 209 | 210 | def print_allocation_config(self): 211 | """ """ 212 | sockets = [self.socket] # self.cpu_info.get_sockets() 213 | for socket in sockets: 214 | try: 215 | coses = self.l3ca.get(socket) 216 | 217 | log.debug("L3CA COS definitions for Socket %u:" % socket) 218 | 219 | for cos in coses: 220 | if cos.class_id == self.cos_id_be or cos.class_id == self.cos_id_hp: 221 | cos_params = (cos.class_id, cos.mask) 222 | log.debug(" L3CA COS%u => MASK 0x%x" % cos_params) 223 | except: 224 | log.warning("Error in getting allocation configuration") 225 | raise 226 | 227 | def reset_allocation_association(self): 228 | """ Resets allocation and association configuration. """ 229 | 230 | try: 231 | self.alloc.reset('any', 'any', 'any') 232 | log.debug("Allocation reset successful") 233 | except: 234 | log.warning("Allocation reset failed!") 235 | raise 236 | 237 | 238 | class PqosHandlerCore(PqosHandler): 239 | """ PqosHandler per core. """ 240 | 241 | def __init__(self, cores_hp, cores_be): 242 | """ 243 | Initializes object of this class with cores and events to monitor. 244 | 245 | Parameters: 246 | cores_hp: a list of cores assigned to hp 247 | cores_be: a list of cores assigned to bes 248 | """ 249 | 250 | interface = "MSR" 251 | super(PqosHandlerCore, self).__init__(interface) 252 | self.cores_hp = cores_hp 253 | self.cores_be = cores_be 254 | 255 | def setup_groups(self): 256 | """ Starts monitoring for each group of cores. """ 257 | 258 | self.group_hp = self.mon.start(self.cores_hp, self.events) 259 | self.group_be = self.mon.start(self.cores_be, self.events) 260 | 261 | def set_association_class(self): 262 | """ Sets up association classes of service on selected CPUs. 
""" 263 | 264 | try: 265 | for core_hp in self.cores_hp: 266 | self.alloc.assoc_set(core_hp, self.cos_id_hp) 267 | for core_be in self.cores_be: 268 | self.alloc.assoc_set(core_be, self.cos_id_be) 269 | except: 270 | log.error("Setting association between core and class of service failed!") 271 | raise 272 | 273 | def print_association_config(self): 274 | """ """ 275 | cores = self.cores_hp + self.cores_be # or self.get_all_cores() 276 | for core in cores: 277 | class_id = self.alloc.assoc_get(core) 278 | log.debug("Core %u => COS%u" % (core, class_id)) 279 | 280 | 281 | class PqosHandlerPid(PqosHandler): 282 | """ PqosHandler per PID (OS interface only). """ 283 | 284 | def __init__(self, pid_hp, pids_be): 285 | """ 286 | Initializes object of this class with PIDs and events to monitor. 287 | 288 | Parameters: 289 | pid_hp: pid of hp 290 | pids_be: a list of PIDs to monitor 291 | """ 292 | 293 | interface = "OS" 294 | super(PqosHandlerPid, self).__init__(interface) 295 | self.pid_hp = pid_hp 296 | self.pids_be = pids_be 297 | 298 | def setup_groups(self): 299 | """ Starts monitoring for group of PID(s). """ 300 | 301 | # NOTE there is the ability to add/remove pids_be to/from a group 302 | 303 | self.group_hp = self.mon.start_pids([self.pid_hp], self.events) 304 | self.group_be = self.mon.start_pids(self.pids_be, self.events) 305 | 306 | def set_association_class(self): 307 | """ Sets up association classes of service on hp pid as well as in be pids. """ 308 | 309 | try: 310 | self.alloc.assoc_set_pid(self.pid_hp, self.cos_id_hp) 311 | for pid in self.pids_be: 312 | self.alloc.assoc_set_pid(pid, self.cos_id_be) 313 | except: 314 | log.error("Setting association between pid and class of service failed!") 315 | raise 316 | 317 | def print_association_config(self): 318 | """ """ 319 | pids = [self.pid_hp] + self.pids_be 320 | for pid in pids: 321 | class_id = self.alloc.assoc_get_pid(pid) 322 | log.debug("Pid %u => COS%u" % (pid, class_id)) 323 | 324 | 325 | class PqosHandlerMock: 326 | """ Mock class for use in environments where pqos cannot be installed. 
""" 327 | 328 | def __int__(self, socket=0, cos_id_hp=1, cos_id_be=2): 329 | pass 330 | 331 | def setup_groups(self): 332 | pass 333 | 334 | def reset(self): 335 | pass 336 | 337 | def update(self): 338 | pass 339 | 340 | def get_hp_metrics(self, time_interval): 341 | return get_metrics_random() 342 | 343 | def get_be_metrics(self, time_interval): 344 | return get_metrics_random() 345 | 346 | def stop(self): 347 | pass 348 | 349 | def set_association_class(self): 350 | pass 351 | 352 | def set_allocation_class(self, ways_be): 353 | pass 354 | 355 | def reset_allocation_association(self): 356 | pass 357 | 358 | def print_association_config(self): 359 | pass 360 | 361 | def print_allocation_config(self): 362 | pass 363 | 364 | def finish(self): 365 | pass 366 | -------------------------------------------------------------------------------- /rdt_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | import logging.config 5 | from utils.constants import LC_TAG, BE_TAG 6 | from utils.config_constants import * 7 | import time 8 | 9 | from utils.functions import form_duration 10 | 11 | logging.config.fileConfig('logging.conf') 12 | log = logging.getLogger('simpleExample') 13 | 14 | features_min_max_values = { 15 | 'MPKC': (0, 14), 16 | 'MPKI': (0, 8), 17 | 'Misses': (0, 7*1e7), 18 | 'IPC': (0, 2.8), 19 | 'Bandwidth': (0, 3*1e4) 20 | } 21 | 22 | 23 | class Rdt(gym.Env): 24 | metadata = {'render.modes': ['human']} 25 | UPDATE_INTERVAL = 1000 # in ms, update status of BEs every 1s 26 | 27 | def __init__(self, config, loader, scheduler, pqos_handler): 28 | self.loader = loader 29 | self.scheduler = scheduler 30 | self.pqos_handler = pqos_handler 31 | 32 | self.latency_thr = int(config[LATENCY_thr]) 33 | self.violations = 0 34 | self.steps = 1 35 | self.penalty_coef = float(config[PEN_COEF]) 36 | self.feature = config[FEATURE] 37 | 38 | feature_min, feature_max = features_min_max_values[self.feature] 39 | log.info("Feature {} will be used with limits: {} - {}".format(self.feature, feature_min, feature_max)) 40 | 41 | self.action_space = spaces.Discrete(int(config[NUM_WAYS])) 42 | # latency, mpki_be # used to be 2*1e6, 5*1e7, ways_be # 14 me 30 gia mpc kai be=mcf 43 | # for gradient boost high in misses raised to 20 from 14 44 | self.observation_space = spaces.Box(low=np.array([feature_min, 0]), 45 | high=np.array([feature_max, self.action_space.n-1], dtype=np.float32), 46 | dtype=np.float32) 47 | 48 | self.previous_action = -1 # -1 action means all ways available to all groups 49 | 50 | self.update_interval_in_steps = self.UPDATE_INTERVAL // int(self.loader.measurement_interval) 51 | 52 | def _reset_pqos(self): 53 | self.pqos_handler.reset() 54 | self.pqos_handler.setup_groups() 55 | self.pqos_handler.set_association_class() 56 | self.pqos_handler.print_association_config() 57 | self.previous_action = -1 58 | 59 | def _stop_pqos(self): 60 | self.pqos_handler.stop() 61 | self.pqos_handler.reset() 62 | self.pqos_handler.finish() 63 | 64 | @staticmethod 65 | def _normalize(metric, min_val, max_val): 66 | """ Normalize the observed value between 1 and 0. 
""" 67 | if metric > max_val: 68 | return 1.0 69 | elif metric < min_val: 70 | return 0.0 71 | else: 72 | return (metric - min_val) / (max_val - min_val) 73 | 74 | def _get_next_state(self, action_be_ways): 75 | """ """ 76 | # poll metrics so the next poll will contains deltas from this point just after the action 77 | self.pqos_handler.update() 78 | start_time = time.time() 79 | # start the stats record, the recorder will go to sleep and the it 'll send the results 80 | tail_latency, rps = self.loader.get_stats() # NOTE this call will block 81 | 82 | self.pqos_handler.update() 83 | time_interval = time.time() - start_time 84 | ipc_hp, misses_hp, llc_hp, mbl_hp_ps, mbr_hp_ps, cycles_hp, instructions_hp =\ 85 | self.pqos_handler.get_hp_metrics(time_interval) 86 | ipc_be, misses_be, llc_be, mbl_be_ps, mbr_be_ps, cycles_be, instructions_be =\ 87 | self.pqos_handler.get_be_metrics(time_interval) 88 | 89 | # bw_socket_wide = mbl_hp_ps + mbl_be_ps 90 | # bw_lc = mbl_hp_ps + mbr_hp_ps 91 | 92 | if self.feature == 'IPC': 93 | feature = ipc_be 94 | elif self.feature == 'Misses': 95 | # normalization of misses on a specific time unit in order to compare with different action intervals 96 | # misses_be = misses_be / (int(self.action_interval) // 50) 97 | feature = misses_be 98 | elif self.feature == 'MPKC': 99 | misses_be = misses_be / (cycles_be / 1000.) 100 | misses_hp = misses_hp / (cycles_hp / 1000.) 101 | feature = misses_be 102 | elif self.feature == 'MPKI': 103 | misses_be = misses_be / (instructions_be / 1000.) 104 | misses_hp = misses_hp / (instructions_hp / 1000.) 105 | feature = misses_be 106 | elif self.feature == 'Bandwidth': 107 | feature = mbl_be_ps 108 | else: 109 | log.info("No such feature: {}".format(self.feature)) 110 | return 111 | 112 | info = {LC_TAG: (ipc_hp, misses_hp, llc_hp, mbl_hp_ps, mbr_hp_ps, tail_latency, rps), 113 | BE_TAG: (ipc_be, misses_be, llc_be, mbl_be_ps, mbr_be_ps, None, None)} 114 | 115 | state = [feature, action_be_ways] 116 | 117 | # we normalize as well the be_ways, as it is included in the state 118 | state_normalized = [self._normalize(metric, min_val, max_val) for metric, min_val, max_val in 119 | zip(state, self.observation_space.low, self.observation_space.high)] 120 | 121 | return state_normalized, info, tail_latency 122 | 123 | def _reward_func(self, action_be_ways, hp_tail_latency): 124 | """ Reward function. """ 125 | 126 | if hp_tail_latency < self.latency_thr: 127 | reward = action_be_ways 128 | # NOTE by shaping the reward function in this way, we are making the assumption that progress of BEs is 129 | # depended by the LLC ways that are allocated to them at any point of their execution. 130 | else: 131 | reward = - self.penalty_coef * self.action_space.n 132 | self.violations += 1 133 | 134 | return reward 135 | 136 | def reset(self): 137 | """ In case that this environment is used in episodic format. 
""" 138 | 139 | self._reset_pqos() 140 | self.loader.reset() 141 | self.scheduler.reset() 142 | 143 | state, _, _ = self._get_next_state(self.action_space.n) # we start with both groups sharing all ways 144 | 145 | log.info("Environment was successfully reset.") 146 | 147 | return state 148 | 149 | def step(self, action_be_ways): 150 | """ At each step the agent specifies the number of ways that are assigned to the be""" 151 | 152 | # log.debug("Action selected: {}".format(action_be_ways)) 153 | # self.new_be = False 154 | 155 | done = False # update the status of BEs once in a while to reduce docker demon cpu utilization 156 | if self.steps % self.update_interval_in_steps == 0: 157 | done = self.scheduler.update_status() 158 | 159 | # err_msg = "%r (%s) invalid" % (action_be_ways, type(action_be_ways)) 160 | # assert self.action_space.contains(action_be_ways), err_msg 161 | 162 | # avoid enforcing decision when nothing changes. Does this cause any inconsistencies ? 163 | if action_be_ways != self.previous_action: 164 | # enforce the decision with PQOS 165 | self.pqos_handler.set_allocation_class(action_be_ways) 166 | # self.pqos_handler.print_allocation_config() 167 | self.previous_action = action_be_ways 168 | 169 | state, info, tail_latency = self._get_next_state(action_be_ways) 170 | 171 | reward = self._reward_func(action_be_ways, tail_latency) # based on new metrics 172 | 173 | self.steps += 1 174 | 175 | return state, reward, done, info # , self.new_be 176 | 177 | def render(self, **kwargs): 178 | pass 179 | 180 | def get_experiment_duration(self): 181 | """ Properly shapes and returns the time needed for the experiment to finish. """ 182 | 183 | return self.scheduler.get_experiment_duration() 184 | 185 | def stop(self): 186 | log.warning('Stopping everything!') 187 | 188 | duration = form_duration(self.get_experiment_duration()) 189 | 190 | log.info('Percentage of violations: {}'.format(self.violations / self.steps)) 191 | log.info('Duration of experiment: {}'.format(duration)) 192 | 193 | self.scheduler.stop_bes() # stop and remove the be containers 194 | self.loader.stop() # stop the service loader 195 | self._stop_pqos() # stop pqos 196 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/nikmand/Reinforcement-Learning-Library.git#egg=rlsuite 2 | git+https://github.com/intel/intel-cmt-cat.git#egg=pqos&subdirectory=lib/python 3 | torch 4 | torchvision 5 | tb-nightly 6 | gym 7 | matplotlib 8 | docker 9 | numpy -------------------------------------------------------------------------------- /scheduler.py: -------------------------------------------------------------------------------- 1 | import time 2 | import ast 3 | import docker 4 | import random 5 | import logging.config 6 | from utils.functions import parse_num_list 7 | from utils.config_constants import * 8 | from abc import ABC, abstractmethod 9 | 10 | logging.config.fileConfig('logging.conf') 11 | log = logging.getLogger('simpleExample') 12 | 13 | 14 | def read_avail_dockers(docker_file): 15 | """ Gets a dictionary with the available BEs and their parameters needed for execution. """ 16 | 17 | with open(docker_file, "r") as file: 18 | contents = file.read() 19 | bes = ast.literal_eval(contents) 20 | 21 | return bes 22 | 23 | 24 | class Scheduler(ABC): 25 | """ Handles all the operations needed to execute the Best Effort applications. 
26 | Docker containers are used to handle the execution. """ 27 | 28 | def __init__(self, config): 29 | self.cores_per_be = config.getint(CORES_PER_BE) 30 | self.cores_pids_be_range = parse_num_list(config[CORES_BE]) 31 | self.container_bes = [] 32 | self.client = docker.from_env() 33 | 34 | self.finished_bes = 0 35 | self.bes_available = read_avail_dockers(config[DOCKER_FILE]) 36 | 37 | # self.issued_bes = 0 # what is it used for? 38 | self.be_repeated = config.getint(BE_REPEATED) 39 | self.be_quota = self.be_repeated # they are set equal so that in the first check a new BE will be issued 40 | self.last_be = None 41 | self.new_be = False 42 | self.num_total_bes = config.getint(NUM_BES) 43 | # even in the case of QueueScheduler we need this, as we may provide additional BEs so that the system stays full until 44 | # the desired number of BEs has completed. 45 | 46 | self.start_time_bes = None 47 | self.stop_time_bes = None 48 | self.experiment_duration = 0 # in minutes 49 | 50 | def cores_map(self, i): 51 | """ Returns the cores that correspond to the ith container. """ 52 | cores_range = self.cores_pids_be_range[i * self.cores_per_be: (i + 1) * self.cores_per_be] 53 | cores_range_string = map(str, cores_range) 54 | return ','.join(cores_range_string) 55 | 56 | @abstractmethod 57 | def _select_be(self): 58 | raise NotImplementedError 59 | 60 | @abstractmethod 61 | def reset(self): 62 | raise NotImplementedError 63 | 64 | def _restart_scheduling(self): 65 | """ Stops currently running BEs and starts new ones. """ 66 | 67 | self.stop_bes() 68 | self.start_bes() 69 | log.debug('BEs started') 70 | 71 | def _repeat_be(self): 72 | """ Checks if a new BE should be selected or the current one can be reintroduced. """ 73 | 74 | # NOTE do we still need this functionality for our experiments? If yes, it can be implemented as a decorator. 75 | # Actually the case of a repeated BE can be treated as a special case of QueueScheduler. 76 | if self.be_quota >= self.be_repeated: 77 | self.be_quota = 1 78 | self.new_be = True 79 | return self._select_be() 80 | else: 81 | self.be_quota += 1 82 | return self.last_be 83 | 84 | def _start_be(self, cores): 85 | """ Starts a container on the specified cores. """ 86 | 87 | # log.info('New BE will be issued on core(s): {} at step: {}'.format(cores, self.steps)) 88 | 89 | be = self._select_be() 90 | log.info('Selected Job: {}'.format(be)) 91 | container, command, volume = self.bes_available[be] 92 | container_be = self.client.containers.run(container, command=command, name='be_' + cores.replace(",", "_"), 93 | cpuset_cpus=cores, volumes_from=[volume] if volume is not None 94 | else [], detach=True) 95 | # self.issued_bes += 1 96 | 97 | return container_be 98 | 99 | def start_bes(self): 100 | """ Launches the BEs. """ 101 | 102 | num_startup_bes = len(self.cores_pids_be_range) // self.cores_per_be 103 | # NOTE: each launched BE should be placed directly in the containers list. Otherwise, if an error pops up 104 | # during the process, the list won't have been formed yet, so the launched BEs are not going to be stopped. 105 | for i in range(num_startup_bes): 106 | self.container_bes.append(self._start_be(self.cores_map(i))) 107 | 108 | self.start_time_bes = time.time() 109 | 110 | def reissue_bes(self, have_finished): 111 | """ Issues new BEs on cores that finished execution, if there are any. 
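Exited containers are stopped and removed, and a replacement BE is launched on the same cores, so that the system stays fully loaded until num_total_bes BEs have completed.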
""" 112 | 113 | for i, has_finished in enumerate(have_finished): 114 | if has_finished: 115 | self._stop_be(self.container_bes[i]) 116 | self.container_bes[i] = self._start_be(self.cores_map(i)) 117 | log.info("Finished Bes: {}/{}".format(self.finished_bes, self.num_total_bes)) 118 | 119 | def _poll_bes(self): 120 | """ Reloads the status of containers and checks if they have exited. """ 121 | 122 | have_finished = [] 123 | for container_be in self.container_bes: 124 | container_be.reload() 125 | if container_be.status == 'exited': 126 | have_finished.append(True) 127 | self.finished_bes += 1 128 | else: 129 | have_finished.append(False) 130 | 131 | return have_finished 132 | 133 | @staticmethod 134 | def _stop_be(container_be): 135 | """ Stops and removes exited containers. """ 136 | container_be.stop() 137 | container_be.remove() 138 | 139 | def stop_bes(self): 140 | """ Stops all the containers. """ 141 | for container_be in self.container_bes: 142 | self._stop_be(container_be) 143 | 144 | def update_status(self): 145 | """ Polls the status of the containers and determines which of them have finished. If the """ 146 | 147 | have_finished = self._poll_bes() 148 | done = self.finished_bes >= self.num_total_bes 149 | # done = any(have_finished) # done if any of the bes has finished execution 150 | if done: 151 | self.stop_time_bes = time.time() 152 | self.experiment_duration = (self.stop_time_bes - self.start_time_bes) / 60 153 | else: 154 | self.reissue_bes(have_finished) 155 | 156 | return done 157 | 158 | def get_experiment_duration(self): 159 | """ Returns the time (in minutes) needed to for the bes to be completed. """ 160 | 161 | return self.experiment_duration 162 | 163 | 164 | class RandomScheduler(Scheduler): 165 | """ Initializes a random generator given a specific seed. The choices of the bes are made by the generator. """ 166 | 167 | def __init__(self, config): 168 | super().__init__(config) 169 | self.seed = config.getint(SEED) 170 | self.generator = random.Random(self.seed) 171 | 172 | def _select_be(self): 173 | return self.generator.choice(list(self.bes_available.keys())) 174 | 175 | def reset(self): 176 | self.generator = random.Random(self.seed) 177 | self._restart_scheduling() 178 | 179 | 180 | class QueueScheduler(Scheduler): 181 | """ It takes a list of BEs as input. The choices of the bes are made as in a queue. 
""" 182 | 183 | def __init__(self, config): 184 | super().__init__(config) 185 | self.bes_list = config[BES_LIST] 186 | self.bes_selected = ast.literal_eval(self.bes_list) 187 | # num of total bes can be less than the provided BEs, but never bigger 188 | self.num_total_bes = min(self.num_total_bes, len(self.bes_selected)) 189 | 190 | def _select_be(self): 191 | return self.bes_selected.pop(0) 192 | 193 | def reset(self): 194 | self.bes_selected = ast.literal_eval(self.bes_list) 195 | self._restart_scheduling() 196 | -------------------------------------------------------------------------------- /scripts/study_abliation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for be in in-memory 5 | do 6 | echo " $be" 7 | #time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_abliation_all --be-name $be --comment "$be"_abliation_all 8 | #time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_abliation_noper --be-name $be --comment "$be"_abliation_noper 9 | #time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_abliation_nodueling --be-name $be --comment "$be"_abliation_nodueling 10 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_abliation_nodouble --be-name $be --comment "$be"_ababliation_nodouble 11 | done 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /scripts/study_coeff.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for quantile in .99 5 | do 6 | for be in graphs 7 | do 8 | echo "$quantile" " $be" 9 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_coeff -q $quantile --be-name $be --comment "$be"_q"$quantile"_coeff4 10 | done 11 | done 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /scripts/study_coex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for be in RandomForestClassifier #DecisionTreeClassification GradientBoostedTreeRegressor LinearSVC RandomForestClassifier astar hmmer LogisticRegressionWithElasticNet 5 | do 6 | echo " $be" 7 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_coex --be-name $be --comment "$be"_coex_agent 8 | done 9 | 10 | -------------------------------------------------------------------------------- /scripts/study_different_bes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_diff_bes_transfer --comment transfer_train_on_5Noflush_diff_bes 5 | 6 | -------------------------------------------------------------------------------- /scripts/study_feature.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for feature in MPKC MPKI Bandwidth IPC 5 | do 6 | for be in in-memory graphs 7 | do 8 | echo "$feature" " $be" 9 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_feature -f $feature --be-name $be --comment "$be"_"$feature"_feature 10 | done 11 | done 12 | 13 | 14 | 15 | 
-------------------------------------------------------------------------------- /scripts/study_interval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for interval in "200 0.002" 5 | do 6 | set -- $interval 7 | for be in graphs 8 | do 9 | echo "$1" " $be" " $2" 10 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_interval -d $2 -t $1 --be-name $be --comment "$be"_"$1"ms_"$2"decay_interval 11 | done 12 | done 13 | 14 | -------------------------------------------------------------------------------- /scripts/study_lc_profiling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for i in {0..18} 5 | do 6 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_measurements.py -c configs_sched/measurement_lc --ways-be $i --warm-up 150 --comment new_lc_profiling 7 | done 8 | 9 | -------------------------------------------------------------------------------- /scripts/study_measurement.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for way in -1 5 | do 6 | for be in RandomForestClassifier # hmmer astar LogisticRegressionWithElasticNet DecisionTreeClassification LinearSVC in-memory graphs GradientBoostedTreeRegressor 7 | do 8 | echo "$way" " $be" 9 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_measurements.py -c configs_sched/study_measurement --ways-be $way --be-name $be --comment "$be"_coex_alone 10 | done 11 | done 12 | 13 | -------------------------------------------------------------------------------- /scripts/study_measurement_diff_bes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for way in 0 -1 5 | do 6 | echo "$way" 7 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_measurements.py -c configs_sched/study_diff_bes_transfer --ways-be $way --comment diff_bes 8 | done 9 | 10 | -------------------------------------------------------------------------------- /scripts/study_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./study_transfer_train.sh 4 | 5 | ./study_transfer.sh 6 | -------------------------------------------------------------------------------- /scripts/study_quantile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for quantile in .95 5 | do 6 | for be in graphs 7 | do 8 | echo "$quantile" " $be" 9 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_quantile -q $quantile --be-name $be --comment "$be"_q"$quantile"_quantile 10 | done 11 | done 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /scripts/study_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in "50 0.0005" "e 5" 4 | do 5 | set -- $i 6 | echo $1 and $2 7 | done 8 | -------------------------------------------------------------------------------- /scripts/study_transfer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # TODO: take the checkpoint as a parameter 5 | 6 | for be in astar in-memory RandomForestClassifier hmmer 7 | do 8 | echo " $be" 9 | time taskset --cpu-list 18-19 python 
~/reinforcement-learning/main_rdt.py -c configs_sched/study_transfer --be-name $be --comment "$be"_transfer_from5NoFlush 10 | done 11 | 12 | #for be in astar in-memory graphs RandomForestClassifier 13 | # do 14 | # echo " $be" 15 | # time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_transfer_from4 --be-name $be --comment "$be"_transfer_from4 16 | #done 17 | 18 | -------------------------------------------------------------------------------- /scripts/study_transfer_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_transfer_train --comment transfer_train_on_5Noflush 5 | 6 | -------------------------------------------------------------------------------- /utils/argparser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def cmd_parser(): 5 | """ 6 | Parses command line arguments. 7 | 8 | Returns: 9 | an object with parsed command line arguments 10 | """ 11 | 12 | description = 'RL Agent' 13 | parser = argparse.ArgumentParser(description=description, fromfile_prefix_chars='@') 14 | parser.add_argument('-i', '--interface', default='MSR', help='select pqos interface') 15 | parser.add_argument('-r', '--rps', type=int, default=10000, help='Requests per second that client should generate') 16 | parser.add_argument('-g', '--ratio', default='0.8', help='Ratio of get/set requests') 17 | parser.add_argument('-p', '--loader-dir', help='Path to memcached loader') 18 | parser.add_argument('-t', '--interval', default='200', help='Interval to wait after a decision in ms') 19 | parser.add_argument('--cores-lc', default="0", help='Cores in which lc critical service already run') 20 | parser.add_argument('--cores-be', default='1-9', help='Cores in which be process will be launched') 21 | parser.add_argument('--cores-client', default='10-13', help='Cores in which load client will be launched') 22 | parser.add_argument('--loader-threads', default='1', help='Number of workers for the load testing') 23 | parser.add_argument('--latency-thr', type=int, default=10, help='Q95 latency threshold in ms') 24 | parser.add_argument('--be-name', default='in-memory-small', help='Be name') 25 | parser.add_argument('--num-bes', type=int, default=1, help='Number of BE containers to be launched') 26 | parser.add_argument('--tensorboard', action='store_true', help='Enable Tensorboard') # unused 27 | parser.add_argument('-c', '--config-file', default='configs/local', help='Path to config file') 28 | parser.add_argument('--comment', default='', help='Comment to add on tensorboard folder name as suffix') 29 | parser.add_argument('-q', '--quantile', default='.95', help='Choose quantile for which stats will be reported') 30 | parser.add_argument('-f', '--feature', default='MPKC', help='Hw feature to be used as input') 31 | parser.add_argument('-d', '--decay', default='0.0005', help='Epsilon decay rate') 32 | # parser.add_argument('--path-mem', help='') 33 | # nargs='+' all command-line args present are gathered into a list 34 | 35 | return parser 36 | -------------------------------------------------------------------------------- /utils/config_constants.py: -------------------------------------------------------------------------------- 1 | # sectors 2 | ENV = "env" 3 | LOADER = "loader" 4 | SCHEDULER = "scheduler" 5 | AGENT = "agent" 6 | PQOS = "pqos" 7 | 8 
--------------------------------------------------------------------------------
/utils/config_constants.py:
--------------------------------------------------------------------------------
1 | # sections of the config files
2 | ENV = "env"
3 | LOADER = "loader"
4 | SCHEDULER = "scheduler"
5 | AGENT = "agent"
6 | PQOS = "pqos"
7 | 
8 | # env
9 | LATENCY_thr = "latency_thr"
10 | NUM_WAYS = 'num_ways'
11 | PEN_COEF = 'pen_coef'
12 | FEATURE = 'feature'
13 | 
14 | # PQOS
15 | PQOS_INTERFACE = 'pqos_interface'
16 | CORES_LC = "cores_lc"
17 | # cores for the BEs are also needed; see CORES_BE under the scheduler section
18 | 
19 | # loader
20 | HP_IP = 'hp_ip'
21 | HP_PORT = 'hp_port'
22 | CORES_LOADER = 'cores_loader'
23 | LOADER_DIR = 'loader_dir'
24 | ACTION_INTERVAL = 'interval'
25 | LOADER_RPS = 'rps'
26 | LOADER_THREADS = 'loader_threads'
27 | LOADER_CONN = 'loader_conn'
28 | QUANTILE = 'quantile'
29 | EXP_DIST = 'exp_dist'
30 | GET_SET_RATIO = 'ratio'
31 | 
32 | # scheduler
33 | BE_REPEATED = 'be_repeated'
34 | CORES_BE = 'cores_be'
35 | CORES_PER_BE = 'cores_per_be'
36 | NUM_BES = 'num_bes'
37 | DOCKER_FILE = 'docker_file'
38 | # scheduler subclasses
39 | SEED = 'seed'
40 | BES_LIST = 'bes_list'
41 | 
42 | 
43 | # TODO get most, or maybe even all, of these constants from the suite library
44 | # agent
45 | LR = 'lr'
46 | LAYERS_DIM = 'layers_dim'
47 | TARGET_UPDATE = 'target_update'
48 | BATCH_SIZE = 'batch_size'
49 | GAMMA = 'gamma'
50 | ARCH = 'arch'
51 | ALGO = 'algo'
52 | MEM_SIZE = 'mem_size'
53 | MEM_PER = 'mem_type'
54 | EPS_DECAY = 'eps_decay'
55 | EPS_START = 'eps_start'
56 | EPS_END = 'eps_end'
57 | CHECKPOINT = 'checkpoint'
58 | WEIGHTS = 'weights'
59 | 
60 | # misc
61 | 
--------------------------------------------------------------------------------
/utils/constants.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | 
3 | LOGGER = 'simpleExample'
4 | LOGGER_PATH = 'logging.conf'
5 | 
6 | LC_TAG = "Latency Critical"
7 | BE_TAG = "Best Effort"
8 | 
9 | metric_names = ['IPC', 'Misses per k. cycles', 'LLC Occupancy', 'Bandwidth L.', 'Bandwidth R.', 'Latency', 'RPS']
10 | 
11 | 
12 | class Loaders(str, Enum):
13 |     MEMCACHED = "memcached"
14 | 
15 | 
16 | class Schedulers(str, Enum):
17 |     RANDOM = "random"
18 |     QUEUE = "queue"
19 | 
--------------------------------------------------------------------------------
/utils/functions.py:
--------------------------------------------------------------------------------
1 | from utils.constants import metric_names
2 | import re
3 | import configparser
4 | 
5 | 
6 | def config_parser(filename):
7 |     """ Reads an INI-style config file and returns the resulting ConfigParser. """
8 |     config = configparser.ConfigParser()
9 |     try:
10 |         with open(filename) as f:
11 |             config.read_file(f)
12 |     except FileNotFoundError:
13 |         raise FileNotFoundError("Filename {} does not exist. Exiting...".format(filename))
14 | 
15 |     return config
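16 | 
17 | # Usage sketch, assuming a config file with an [env] section such as configs/local:
18 | #
19 | #   config = config_parser('configs/local')
20 | #   num_ways = config['env'].getint('num_ways')
21 | #   latency_thr = config['env'].getint('latency_thr')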
""" 26 | header = '{}/'.format(tag) 27 | for metric, metric_name in zip(metrics, metric_names): 28 | if metric is not None: 29 | tboard_writer.add_scalar(header + metric_name, metric, step) 30 | tboard_writer.flush() 31 | 32 | 33 | def form_duration(duration_minutes): 34 | """ """ 35 | minutes = int(duration_minutes) 36 | seconds = int(round((duration_minutes % 1) * 60, 0)) 37 | duration = str(minutes) + 'm' + str(seconds) + 's' 38 | 39 | return duration 40 | 41 | # TODO remove this commented out code, check with jupyter notebook if similar code is present there 42 | # use to log latency with this 43 | # latency_per = np.percentile(latency_list, 99) 44 | # latency_list_per = [min(i, latency_per) for i in latency_list] 45 | # plt.plot(latency_list_per) 46 | # plt.title('Effect of collocation in tail latency') 47 | # plt.axvline(x=self.warm_up, color='g', linestyle='dashed', label='BEs starts') 48 | # plt.axvline(x=len(latency_list_per) - self.warm_up, color='r', linestyle='dashed', label='BEs stops') 49 | # plt.axhline(y=self.latency_thr, color='m', label='Latency threshold') 50 | # plt.xlabel('Steps') 51 | # plt.ylabel('Q95 Latency in ms') 52 | # plt.legend(loc='best') 53 | # plt.savefig('runs/collocation_{}.png'.format(datetime.today().strftime('%Y%m%d_%H%M%S'))) 54 | # plt.show() 55 | --------------------------------------------------------------------------------