├── README.md ├── configs ├── docker_containers ├── local └── ssh ├── env_builder.py ├── loader.py ├── logging.conf ├── main_agent.py ├── main_measurements.py ├── operations.txt ├── pqos_handler.py ├── rdt_env.py ├── requirements.txt ├── scheduler.py ├── scripts ├── study_abliation.sh ├── study_coeff.sh ├── study_coex.sh ├── study_different_bes.sh ├── study_feature.sh ├── study_interval.sh ├── study_lc_profiling.sh ├── study_measurement.sh ├── study_measurement_diff_bes.sh ├── study_multi.sh ├── study_quantile.sh ├── study_test.sh ├── study_transfer.sh └── study_transfer_train.sh └── utils ├── argparser.py ├── config_constants.py ├── constants.py └── functions.py /README.md: -------------------------------------------------------------------------------- 1 | ## Resource-Allocation-Reinforcement-Learning 2 | 3 | This repository is part of my Master Thesis titled: "__*Design and implementation of an intelligent agent, 4 | capable of sharing resources in multicore systems, using Deep Reinforcement Learning*__". 5 | 6 | Implementation of a Deep Reinforcement Learning agent that is capable of sharing the last-level cache of a multi-core system between a Latency Critical service and a number of Best Effort applications. By utilising the DQN family of algorithms, the agent managed to keep the SLA violations of the critical service below 3% while, at the same time, achieving up to a 4x speed-up for the Best Effort apps by allocating cache ways to them when possible. 7 | 8 | Please use this identifier to cite or link to this item: http://artemis.cslab.ece.ntua.gr:8080/jspui/handle/123456789/17662 9 | 10 | ### Dependencies 11 | 12 | In a new conda environment, execute: 13 | 14 | pip install -r requirements.txt 15 | 16 | -------------------------------------------------------------------------------- /configs/docker_containers: -------------------------------------------------------------------------------- 1 | { 2 | #'in-memory': ('zilutian/in-memory-analytics:amd64', '/data/ml-latest /data/myratings.csv --driver-memory 6g --executor-memory 16g', 'data'), 3 | #'in-memory-small': ('zilutian/in-memory-analytics:amd64', '/data/ml-latest-small /data/myratings.csv', 'data'), 4 | 'graphs': ('cloudsuite/graph-analytics', '--driver-memory 6g --executor-memory 16g', 'data-twitter'), 5 | 'DecisionTreeClassification': ('nikmand/bes:spark-example', 'ml.JavaDecisionTreeClassificationExample', 'data-svm'), 6 | 'GradientBoostedTreeRegressor': ('nikmand/bes:spark-example', 'ml.JavaGradientBoostedTreeRegressorExample', 'data-svm'), 7 | 'LinearSVC': ('nikmand/bes:spark-example', 'ml.JavaLinearSVCExample', 'data-svm'), 8 | 'LogisticRegressionWithElasticNet': ('nikmand/bes:spark-example', 'ml.JavaLogisticRegressionWithElasticNetExample', 'data-svm'), 9 | #'RandomForestClassifier': ('nikmand/bes:spark-example', 'ml.JavaRandomForestClassifierExample', 'data-svm'), 10 | #'libquantum': ('nikmand/spec:462', None, None), 11 | #'mcf': ('nikmand/spec:429', None, None), 12 | #'lbm': ('nikmand/spec:470', None, None) 13 | #'astar': ('nikmand/spec:473', None, None), 14 | 'hmmer': ('nikmand/spec:456', None, None), 15 | #'gems': ('nikmand/spec:459', None, None) 16 | } 17 | 18 | -------------------------------------------------------------------------------- /configs/local: -------------------------------------------------------------------------------- 1 | [env] 2 | latency_thr = 10 3 | num_ways = 9 4 | pen_coef = 2 5 | 6 | [pqos] 7 | cores_lc = 0 8 | pqos_interface = none 9 | 10 | [loader] 11 | hp_ip = 127.0.0.1 12 | hp_port = 42171 13 
| cores_loader = 2 14 | loader_dir = /home/nikmand/CLionProjects/memcached 15 | loader_threads = 1 16 | loader_conn = 10 17 | ratio = 0 18 | rps = 100 19 | exp_dist = 20 | interval = -1 21 | 22 | [scheduler] 23 | cores_be = 1-3 24 | cores_per_be = 3 25 | num_bes = 1 26 | be_repeated = 1 27 | docker_file = configs/docker_containers 28 | # scheduler subclasses 29 | seed = 1 30 | bes_list = ['graphs', 'in-memory'] 31 | 32 | [agent] 33 | lr = 1e-2 34 | layers_dim = [24, 48] 35 | target_update = 100 36 | batch_size = 32 37 | gamma = 0.99 38 | arch = dueling 39 | algo = ddqn 40 | mem_size = 10_000 41 | mem_type = per 42 | eps_decay = 0.001 43 | eps_start = 1 44 | eps_end = 0.01 45 | checkpoint = 46 | weights = noinit 47 | 48 | [misc] 49 | -------------------------------------------------------------------------------- /configs/ssh: -------------------------------------------------------------------------------- 1 | [env] 2 | latency_thr = 10 3 | num_ways = 18 4 | pen_coef = 2 5 | 6 | [pqos] 7 | cores_lc = 0 8 | pqos_interface = none 9 | 10 | [loader] 11 | hp_ip = 127.0.0.1 12 | hp_port = 42171 13 | cores_loader = 2 14 | loader_dir = /home/users/nmandil/memcached 15 | interval = -1 16 | rps = 100 17 | loader_threads = 1 18 | loader_conn = 10 19 | ratio = 0 20 | exp_dist = 21 | 22 | [scheduler] 23 | cores_per_be = 3 24 | num_bes = 1 25 | cores_be = 1-9 26 | be_repeated = 1 27 | docker_file = configs/docker_containers 28 | # scheduler subclasses 29 | seed = 1 30 | bes_list = ['in-memory', 'in-memory', 'in-memory'] 31 | 32 | [agent] 33 | lr = 1e-2 34 | layers_dim = [24, 48] 35 | target_update = 100 36 | batch_size = 32 37 | gamma = 0.99 38 | arch = dueling 39 | algo = ddqn 40 | mem_size = 10_000 41 | mem_type = per 42 | eps_decay = 0.001 43 | eps_start = 1 44 | eps_end = 0.01 45 | checkpoint = 46 | weights = noinit 47 | 48 | [misc] 49 | -------------------------------------------------------------------------------- /env_builder.py: -------------------------------------------------------------------------------- 1 | from loader import MemCachedLoader 2 | from pqos_handler import PqosHandlerMock, PqosHandlerPid, PqosHandlerCore 3 | from rdt_env import Rdt 4 | from scheduler import RandomScheduler, QueueScheduler 5 | from utils.constants import Loaders, Schedulers 6 | from utils.functions import parse_num_list 7 | 8 | 9 | def loader_factory(service_name, config): 10 | """ """ 11 | if service_name == Loaders.MEMCACHED: 12 | loader = MemCachedLoader(config) 13 | else: 14 | raise ValueError("Loader option {} is not supported".format(service_name)) 15 | 16 | return loader 17 | 18 | 19 | def scheduler_factory(scheduler_type, config): 20 | """ """ 21 | if scheduler_type == Schedulers.RANDOM: 22 | scheduler = RandomScheduler(config) 23 | elif scheduler_type == Schedulers.QUEUE: 24 | scheduler = QueueScheduler(config) 25 | else: 26 | raise ValueError("Scheduler option {} is not supported".format(scheduler_type)) 27 | 28 | return scheduler 29 | 30 | 31 | def pqos_factory(pqos_interface, cores_pid_hp, cores_pids_be): 32 | """ """ 33 | cores_pid_hp_range = parse_num_list(cores_pid_hp) 34 | cores_pids_be_range = parse_num_list(cores_pids_be) 35 | if pqos_interface == 'MSR': 36 | pqos_handler = PqosHandlerCore(cores_pid_hp_range, cores_pids_be_range) 37 | elif pqos_interface == 'OS': 38 | pqos_handler = PqosHandlerPid(cores_pid_hp_range, cores_pids_be_range) 39 | else: 40 | pqos_handler = PqosHandlerMock() 41 | 42 | return pqos_handler 43 | 44 | 45 | class EnvBuilder: 46 | """ It takes over the creation of the 
environment. """ 47 | 48 | def __init__(self): 49 | self.loader = None 50 | self.scheduler = None 51 | self.pqos_handler = None 52 | 53 | def build_loader(self, service_name, config): 54 | self.loader = loader_factory(service_name, config) 55 | 56 | return self 57 | 58 | def build_pqos(self, pqos_interface, cores_pid_hp_range, cores_pids_be_range): 59 | self.pqos_handler = pqos_factory(pqos_interface, cores_pid_hp_range, cores_pids_be_range) 60 | 61 | return self 62 | 63 | def build_scheduler(self, scheduler_type, config): 64 | self.scheduler = scheduler_factory(scheduler_type, config) 65 | 66 | return self 67 | 68 | def build(self, config): 69 | env = Rdt(config, self.loader, self.scheduler, self.pqos_handler) 70 | 71 | return env 72 | -------------------------------------------------------------------------------- /loader.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import struct 3 | import subprocess 4 | from time import sleep 5 | import logging.config 6 | from abc import ABC, abstractmethod 7 | from utils.config_constants import * 8 | 9 | logging.config.fileConfig('logging.conf') 10 | log = logging.getLogger('simpleExample') 11 | 12 | 13 | class Loader(ABC): 14 | """ Abstract class that handles all the functionality that concerns the service loader. """ 15 | def __init__(self, config): 16 | self.client = None 17 | self.service_ip = config[HP_IP] 18 | self.service_port = config.getint(HP_PORT) 19 | self.loader_dir = config[LOADER_DIR] 20 | self.quantile = config[QUANTILE] 21 | self.measurement_interval = config[ACTION_INTERVAL] 22 | self.rps = config.getint(LOADER_RPS) 23 | self.cores_loader = config[CORES_LOADER] 24 | 25 | @abstractmethod 26 | def start(self): 27 | """ Starts the loader as a subprocess. """ 28 | raise NotImplementedError 29 | 30 | def stop(self): 31 | """ Sends a signal to stop the loader and checks for proper termination. """ 32 | 33 | self.client.terminate() 34 | sleep(0.5) 35 | while self.client.poll() is None: 36 | log.debug("Unable to shut down loader. Retrying...") 37 | self.client.terminate() 38 | 39 | def reset(self): 40 | """ Restarts the loader. """ 41 | 42 | if self.client is not None: 43 | self.stop() 44 | self.start() 45 | 46 | def get_stats(self): 47 | """ Collects the stats from the loader. Currently we receive the tail latency at the specified quantile 48 | and the requests per second. """ 49 | 50 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 51 | s.connect((self.service_ip, self.service_port)) 52 | s.sendall(b'get q95') # the text can be anything; it just unblocks the loader 53 | 54 | fmt = "dd" 55 | fmt_size = struct.calcsize(fmt) 56 | data = s.recv(fmt_size) # this call will block 57 | latency, rps = struct.unpack(fmt, data[:fmt_size]) 58 | 59 | # log.debug('Tail latency {}: {}'.format(self.quantile, latency)) 60 | # log.debug('RPS: {}'.format(rps)) 61 | 62 | return latency, rps 63 | 64 | 65 | class MemCachedLoader(Loader): 66 | """ Wrapper class for the Memcached loader. """ 67 | def __init__(self, config): 68 | super().__init__(config) 69 | self.loader_threads = config[LOADER_THREADS] 70 | self.loader_conn = config[LOADER_CONN] 71 | self.ratio = config[GET_SET_RATIO] 72 | self.exponential_dist = config[EXP_DIST] 73 | 74 | def start(self): 75 | """ Starts the memcached loader with all necessary args. 
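A resulting command looks roughly like the following (illustrative values, taken from configs/local and the argparser defaults): taskset --cpu-list 2 <loader_dir>/loader -a <loader_dir>/twitter_dataset/twitter_dataset_30x -s <loader_dir>/docker_servers.txt -g 0 -c 10 -w 1 -T 200 -r 100 -q .95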
""" 76 | # TODO probably subprocess could be moved to superclass and only arguments should be defined in subclasses 77 | loader = '{}/loader'.format(self.loader_dir) 78 | dataset = '{}/twitter_dataset/twitter_dataset_30x'.format(self.loader_dir) 79 | servers = '{}/docker_servers.txt'.format(self.loader_dir) 80 | self.client = subprocess.Popen(['taskset', '--cpu-list', self.cores_loader, loader, '-a', dataset, '-s', 81 | servers, '-g', self.ratio, '-c', self.loader_conn, '-w', self.loader_threads, 82 | '-T', self.measurement_interval, '-r', str(self.rps), '-q', self.quantile, 83 | self.exponential_dist]) 84 | sleep(10) # wait in order to bind the socket 85 | 86 | log.debug("Loader started.") 87 | -------------------------------------------------------------------------------- /logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, simpleExample, rlsuite 3 | 4 | [handlers] 5 | keys=consoleHandler 6 | 7 | [formatters] 8 | keys=simpleFormatter 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=consoleHandler 13 | 14 | [logger_simpleExample] 15 | level=DEBUG 16 | handlers=consoleHandler 17 | qualname=simpleExample 18 | propagate=0 19 | 20 | [logger_rlsuite] 21 | level=DEBUG 22 | handlers=consoleHandler 23 | qualname=rlsuite 24 | propagate=0 25 | 26 | [handler_consoleHandler] 27 | class=StreamHandler 28 | level=DEBUG 29 | formatter=simpleFormatter 30 | args=(sys.stdout,) 31 | 32 | [formatter_simpleFormatter] 33 | format=%(asctime)s - %(filename)s:%(lineno)d - %(levelname)s - %(message)s 34 | datefmt= -------------------------------------------------------------------------------- /main_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import numpy as np 4 | import ast 5 | from rlsuite.builders.factories import memory_factory 6 | from env_builder import EnvBuilder 7 | import logging.config 8 | from rlsuite.utils.functions import log_parameters_histograms 9 | from rlsuite.builders.agent_builder import DQNAgentBuilder 10 | from torch.utils.tensorboard import SummaryWriter 11 | from utils.config_constants import * 12 | from utils.constants import Loaders, Schedulers 13 | from utils.functions import write_metrics, form_duration, config_parser 14 | from utils.argparser import cmd_parser 15 | from datetime import datetime 16 | import os 17 | 18 | logging.config.fileConfig('logging.conf') 19 | log = logging.getLogger('simpleExample') 20 | 21 | MEM_START_SIZE = 1000 22 | 23 | time_at_start = datetime.now().strftime('%b%d_%H-%M-%S') 24 | parser = cmd_parser() 25 | args = parser.parse_args() 26 | 27 | config = config_parser(args.config_file) 28 | 29 | # some arguments are set from command line args, that was useful for tuning 30 | if config[LOADER][ACTION_INTERVAL] == "-1": 31 | config[LOADER][ACTION_INTERVAL] = args.interval 32 | 33 | if config[AGENT][EPS_DECAY] == "-1": 34 | config[AGENT][EPS_DECAY] = args.decay 35 | 36 | config[LOADER][QUANTILE] = args.quantile 37 | config[ENV][FEATURE] = args.feature 38 | 39 | env = EnvBuilder() \ 40 | .build_pqos(config[PQOS][PQOS_INTERFACE], config[PQOS][CORES_LC], config[SCHEDULER][CORES_BE]) \ 41 | .build_loader(Loaders.MEMCACHED, config[LOADER]) \ 42 | .build_scheduler(Schedulers.QUEUE, config[SCHEDULER]) \ 43 | .build(config[ENV]) 44 | 45 | comment = f"_{args.comment}" 46 | writer = SummaryWriter(comment=comment) 47 | 48 | num_of_observations = env.observation_space.shape[0] 49 | num_of_actions = 
env.action_space.n 50 | 51 | log.info("Number of available actions: {}".format(num_of_actions)) 52 | log.info("Number of input features: {}".format(num_of_observations)) 53 | 54 | # TODO handle this in an elegant way, maybe we could use a dict that maps each field to a function that can be applied 55 | # in order to get the type. 56 | lr = config[AGENT].getfloat(LR) 57 | layers_dim = ast.literal_eval(config[AGENT][LAYERS_DIM]) 58 | target_update = config[AGENT].getint(TARGET_UPDATE) 59 | batch_size = config[AGENT].getint(BATCH_SIZE) 60 | arch = config[AGENT][ARCH] # Vanilla or Dueling DQN 61 | agent_algorithm = config[AGENT][ALGO] # DDQN or DQN 62 | mem_type = config[AGENT][MEM_PER] 63 | mem_size = config[AGENT].getint(MEM_SIZE) 64 | gamma = config[AGENT].getfloat(GAMMA) 65 | eps_decay = config[AGENT].getfloat(EPS_DECAY) 66 | eps_start = config[AGENT].getfloat(EPS_START) 67 | eps_end = config[AGENT].getfloat(EPS_END) 68 | checkpoint_path = config[AGENT][CHECKPOINT] 69 | init_weights = config[AGENT][WEIGHTS] 70 | 71 | criterion = torch.nn.MSELoss(reduction='none') # torch.nn.SmoothL1Loss() # Huber loss 72 | optimizer = optim.Adam 73 | 74 | memory = memory_factory(mem_type, mem_size) 75 | 76 | agent = DQNAgentBuilder(num_of_observations, num_of_actions, gamma, eps_decay, eps_start, eps_end) \ 77 | .set_criterion(criterion) \ 78 | .build_network(layers_dim, arch) \ 79 | .load_checkpoint(checkpoint_path) \ 80 | .build_optimizer(optimizer, lr) \ 81 | .build(agent_algorithm) 82 | 83 | done = False 84 | step = 0 85 | decaying_schedule = 0 86 | total_reward = 0 87 | exploration_viol = 0 88 | end_exploration_step = 1 89 | end_exploration_flag = False 90 | 91 | try: 92 | state = env.reset() 93 | state = np.float32(state) 94 | 95 | while not done: 96 | action = agent.choose_action(state) 97 | # measuring env step time 98 | # start_step_time = time.time() 99 | # could run in parallel with the rest of the loop but GIL prevents this 100 | next_state, reward, done, info = env.step(action) 101 | # end_step_time = time.time() 102 | # step_interval = (end_step_time - start_step_time) * 1000 103 | # writer.add_scalar('Timing/Env Step', step_interval, step) 104 | next_state = np.float32(next_state) 105 | memory.store(state, action, next_state, reward, done) # Store the transition in memory 106 | state = next_state 107 | 108 | step += 1 109 | 110 | if mem_type == 'per' and memory.tree.n_entries < MEM_START_SIZE: 111 | continue 112 | 113 | # measure the violations of the exploration phase separately 114 | if agent.epsilon < eps_end + 0.01 and not end_exploration_flag: 115 | log.info("Conventional end of exploration at step: {}".format(step)) 116 | exploration_viol = env.violations 117 | end_exploration_step = step 118 | end_exploration_flag = True 119 | 120 | total_reward += reward 121 | 122 | # experimental path used to create checkpoints: increase exploration when new be is started 123 | # if new_be: 124 | # log.info("New be started at step: {}. 
Exploration rate increased.".format(step)) 125 | # decaying_schedule = min(decaying_schedule, 0) # resets exploration rate at 0.2 with 3210, 4500 for 0.1 126 | # # memory.flush() # we didn't observe any benefit from emptying the memory 127 | # 128 | # save_file = os.path.join('checkpoints', time_at_start + comment + '_' + str(step) + '.pkl') 129 | # agent.save_checkpoint(save_file) 130 | 131 | try: 132 | transitions, indices, is_weights = memory.sample(batch_size) 133 | except ValueError: # not enough samples in memory 134 | continue 135 | 136 | decaying_schedule += 1 137 | 138 | loss, errors = agent.update(transitions, is_weights) # Perform one step of optimization on the policy net 139 | agent.adjust_exploration(decaying_schedule) # rate is updated at every step 140 | memory.batch_update(indices, errors) # only applicable for per 141 | 142 | if step % target_update == 0: # Update the target network 143 | agent.update_target_net() 144 | # creates an enormous amount of data and gives little information, so we disable the logging of weights 145 | # log_parameters_histograms(writer, agent.target_net, step, 'TargetNet') 146 | 147 | for key, value in info.items(): 148 | write_metrics(writer, key, value, step) 149 | writer.add_scalar('Agent/Action', action, step) 150 | writer.add_scalar('Agent/Reward', reward, step) 151 | writer.add_scalar('Agent/Reward Cumulative', total_reward, step) 152 | writer.add_scalar('Agent/Epsilon', agent.epsilon, step) 153 | writer.add_scalar('Agent/Loss', loss, step) 154 | writer.flush() 155 | # log_parameters_histograms(writer, agent.policy_net, step, 'PolicyNet') 156 | 157 | # measuring training time 158 | # end_training = time.time() 159 | # training_interval = (end_training - end_step_time) * 1000 160 | # writer.add_scalar('Timing/Training', training_interval, step) 161 | 162 | log.info("Experiment finished after {} steps.".format(step)) 163 | duration = env.get_experiment_duration() 164 | writer.add_graph(agent.policy_net, torch.tensor(state, device=agent.device)) 165 | writer.add_hparams({'lr': lr, 'gamma': gamma, 'HL Dims': str(layers_dim), 'Target_upd_interval': target_update, 166 | 'Algorithm': agent_algorithm, 'Arch': arch, 'Batch Size': batch_size, 'Mem Type': mem_type, 167 | 'Mem Size': mem_size}, 168 | {'Results/Viol. Post-Expl.': (env.violations - exploration_viol) / (step - end_exploration_step), 169 | 'Results/Viol. Exploration': exploration_viol / end_exploration_step, 170 | 'Results/Violations Total': env.violations / step, 171 | 'Results/Time': duration}) 172 | 173 | writer.add_text('duration', form_duration(duration)) 174 | 175 | finally: 176 | save_file = os.path.join('checkpoints', time_at_start + comment + '.pkl') 177 | agent.save_checkpoint(save_file) 178 | 179 | writer.flush() 180 | writer.close() 181 | env.stop() 182 | -------------------------------------------------------------------------------- /main_measurements.py: -------------------------------------------------------------------------------- 1 | from env_builder import EnvBuilder 2 | import logging.config 3 | from utils.argparser import cmd_parser 4 | from torch.utils.tensorboard import SummaryWriter 5 | from utils.functions import write_metrics, form_duration, config_parser 6 | from utils.constants import Loaders, Schedulers 7 | from utils.config_constants import * 8 | 9 | # This script enforces static allocation and writes the metrics of the execution. 
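# A typical invocation (hypothetical values) would be:
#   python main_measurements.py -c configs/local --ways-be 4 --comment static_4ways
# Passing --ways-be -1 leaves all the ways shared between the two groups.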
10 | 11 | logging.config.fileConfig('logging.conf') 12 | log = logging.getLogger('simpleExample') 13 | 14 | parser = cmd_parser() 15 | parser.add_argument('--ways-be', type=int, default=-1, help='Ways to be allocated to best effort group') 16 | args = parser.parse_args() 17 | 18 | config = config_parser(args.config_file) 19 | 20 | if config[LOADER][ACTION_INTERVAL] == "-1": 21 | config[LOADER][ACTION_INTERVAL] = args.interval 22 | 23 | config[LOADER][QUANTILE] = args.quantile 24 | config[ENV][FEATURE] = args.feature 25 | 26 | env = EnvBuilder() \ 27 | .build_pqos(config[PQOS][PQOS_INTERFACE], config[PQOS][CORES_LC], config[SCHEDULER][CORES_BE]) \ 28 | .build_loader(Loaders.MEMCACHED, config[LOADER]) \ 29 | .build_scheduler(Schedulers.QUEUE, config[SCHEDULER]) \ 30 | .build(config[ENV]) 31 | 32 | comment = "_measurement_action_{}_{}".format(args.ways_be, args.comment) 33 | writer = SummaryWriter(comment=comment) 34 | 35 | done = False 36 | log.info("Num of ways that are going to be statically allocated to BEs: {}".format(args.ways_be)) 37 | 38 | try: 39 | state = env.reset() 40 | 41 | while not done: 42 | next_state, reward, done, info = env.step(args.ways_be) 43 | 44 | for key, value in info.items(): 45 | write_metrics(writer, key, value, env.steps) 46 | 47 | duration = env.get_experiment_duration() 48 | log.info("Experiment finished after {} steps.".format(env.steps)) 49 | writer.add_hparams({'Action': args.ways_be}, 50 | {'Results/Violations Total': env.violations / env.steps, 'Results/Time': duration}) 51 | 52 | writer.add_text('duration', form_duration(duration)) 53 | writer.flush() 54 | 55 | finally: 56 | writer.flush() 57 | writer.close() 58 | env.stop() 59 | -------------------------------------------------------------------------------- /operations.txt: -------------------------------------------------------------------------------- 1 | ### This file includes instructions for the operations needed during this project. 2 | 3 | ### System operations 4 | 5 | # export the pqos library path on the server; alternatively it can be set permanently in ~/.bashrc (done). 
6 | export LD_LIBRARY_PATH=/home/users/nmandil/intel-cmt-cat/lib/ 7 | 8 | # information about the cache (size of ways etc) and about the capabilities of pqos 9 | ~/intel-cmt-cat/pqos/pqos -D 10 | 11 | # reset COS and associations 12 | ./pqos -R 13 | 14 | # check current pqos settings 15 | ./pqos -s 16 | 17 | # command to get the hyperthreaded pair of cpu10; the second core number listed is the hyperthread 18 | cat /sys/devices/system/cpu/cpu10/topology/thread_siblings_list 19 | 20 | ### Project operations 21 | 22 | # launch the Memcached Server on core 0 with 16GB of memory 23 | numactl -m 0 -N 0 -C 0 ~/memcached-server/memcached -l 127.0.0.1:11211 -t 1 -m 16384 -n 550 & 24 | 25 | # warm up the server by running the loader (as indicated by cloudsuite) 26 | ~/memcached/loader -a ~/memcached/twitter_dataset/twitter_dataset_30x -s ~/memcached/docker_servers.txt -w 4 -S 1 -D 16384 -j -T 1000 -Z 27 | 28 | # launch tensorboard (locally) 29 | tensorboard --logdir runs 30 | 31 | # launch tensorboard on 2nd server socket 32 | #taskset --cpu-list 15 tensorboard --logdir ~/path/to/runs & 33 | 34 | # to see Tensorboard locally, ssh with port forwarding from the server 35 | ssh -L 16006:127.0.0.1:6006 username@broady3.cslab.ece.ntua.gr 36 | 37 | # to run the agent 38 | time taskset --cpu-list 18-19 python main_agent.py args 39 | 40 | # to run the measurements main 41 | time taskset --cpu-list 18-19 python main_measurements.py args --ways-be $way 42 | 43 | # to update rlsuite library 44 | pip install git+https://github.com/nikmand/Reinforcement-Learning-Library.git#egg=rlsuite -U 45 | 46 | -------------------------------------------------------------------------------- /pqos_handler.py: -------------------------------------------------------------------------------- 1 | from pqos import Pqos 2 | from pqos.capability import PqosCap, CPqosMonitor 3 | from pqos.cpuinfo import PqosCpuInfo 4 | from pqos.monitoring import PqosMon 5 | from pqos.l3ca import PqosCatL3 6 | from pqos.allocation import PqosAlloc 7 | import logging.config 8 | from random import randint, randrange 9 | from abc import ABC, abstractmethod 10 | 11 | logging.config.fileConfig('logging.conf') 12 | log = logging.getLogger('simpleExample') 13 | 14 | # NOTE we define all possible masks for our server, which has 20 ways in LLC (L3) 15 | # Due to a pqos limitation the min mask width on the left side is 2, so the HP service will always have at least two ways 16 | 17 | L3_NUM_WAYS = 20 # NOTE consider getting this number from CpuInfo 18 | 19 | # ways that can be assigned to BEs 20 | ways = [0x00001, 0x00003, 0x00007, 0x0000f, 21 | 0x0001f, 0x0003f, 0x0007f, 0x000ff, 22 | 0x001ff, 0x003ff, 0x007ff, 0x00fff, 23 | 0x01fff, 0x03fff, 0x07fff, 0x0ffff, 24 | 0x1ffff, 0x3ffff, 0x7ffff, 0xfffff] 25 | 26 | # ways = [(1 << i) - 1 for i in range(1, L3_NUM_WAYS + 1)] 27 | 28 | base = (1 << L3_NUM_WAYS) - 1 # mask with all LLC ways set 29 | 30 | # base = ways[-1] 31 | 32 | 33 | def bytes_to_kb(num_bytes): 34 | """ 35 | Converts bytes to kilobytes. 36 | 37 | :param num_bytes: number of bytes 38 | :return: number of kilobytes 39 | """ 40 | 41 | return num_bytes / 1024.0 42 | 43 | 44 | def bytes_to_mb(num_bytes): 45 | """ 46 | Converts bytes to megabytes. 47 | 48 | :param num_bytes: number of bytes 49 | :returns: number of megabytes 50 | """ 51 | 52 | return num_bytes / (1024.0 * 1024.0) 53 | 54 | 55 | def get_event_name(event_type): 56 | """ 57 | Converts a monitoring event type to a string label required by the libpqos Python wrapper. 
58 | 59 | :param event_type: monitoring event type 60 | :return: a string label 61 | """ 62 | 63 | event_map = { 64 | CPqosMonitor.PQOS_MON_EVENT_L3_OCCUP: 'l3_occup', 65 | CPqosMonitor.PQOS_MON_EVENT_LMEM_BW: 'lmem_bw', 66 | CPqosMonitor.PQOS_MON_EVENT_TMEM_BW: 'tmem_bw', 67 | CPqosMonitor.PQOS_MON_EVENT_RMEM_BW: 'rmem_bw', 68 | CPqosMonitor.PQOS_PERF_EVENT_LLC_MISS: 'perf_llc_miss', 69 | CPqosMonitor.PQOS_PERF_EVENT_IPC: 'perf_ipc' 70 | } 71 | 72 | return event_map.get(event_type) 73 | 74 | 75 | def get_metrics(group_values, time_interval): 76 | """ Extracts the monitored metrics from a group's values; bandwidth deltas are scaled by the time interval. """ 77 | ipc = group_values.ipc 78 | misses = group_values.llc_misses_delta # / (group_values.ipc_unhalted_delta / 1000.) 79 | 80 | llc = bytes_to_mb(group_values.llc) 81 | mbl = bytes_to_mb(group_values.mbm_local_delta) 82 | mbr = bytes_to_mb(group_values.mbm_remote_delta) 83 | 84 | cycles = group_values.ipc_unhalted_delta 85 | instructions = group_values.ipc_retired_delta 86 | 87 | mbl_ps, mbr_ps = mbl / time_interval, mbr / time_interval 88 | 89 | return ipc, misses, llc, mbl_ps, mbr_ps, cycles, instructions 90 | 91 | 92 | def get_metrics_random(): 93 | """ Mock method that returns the same kind of values as get_metrics. """ 94 | ipc = randrange(0, 2) 95 | misses = randint(1e3, 1e5) 96 | llc = randint(1e3, 1e5) 97 | mbl = randint(1e2, 1e3) 98 | mbr = randint(1e2, 1e3) 99 | cycles = randint(1e2, 1e3) 100 | instructions = randint(1e2, 1e3) 101 | 102 | return ipc, misses, llc, mbl, mbr, cycles, instructions 103 | 104 | 105 | class PqosHandler(ABC): 106 | """ Generic class for monitoring """ 107 | 108 | def __init__(self, interface, socket=0, cos_id_hp=1, cos_id_be=2): 109 | self.pqos = Pqos() 110 | self.pqos.init(interface) 111 | self.mon = PqosMon() 112 | self.alloc = PqosAlloc() 113 | self.l3ca = PqosCatL3() 114 | self.cap = PqosCap() 115 | self.cpu_info = PqosCpuInfo() 116 | self.socket = socket # The experiment takes place on a single socket 117 | self.cos_id_hp = cos_id_hp 118 | self.cos_id_be = cos_id_be 119 | self.group_hp, self.group_be = None, None 120 | self.events = self.get_supported_events() 121 | 122 | @abstractmethod 123 | def setup_groups(self): # NOTE this MUST follow reset of monitoring 124 | """Sets up monitoring groups. Needs to be implemented by a derived class.""" 125 | raise NotImplementedError 126 | 127 | @abstractmethod 128 | def set_association_class(self): 129 | """ 130 | Associates the classes of service with the selected CPUs or PIDs 131 | """ 132 | raise NotImplementedError 133 | 134 | @abstractmethod 135 | def print_association_config(self): 136 | """ Logs the current core/PID to COS associations. """ 137 | raise NotImplementedError 138 | 139 | def finish(self): 140 | self.pqos.fini() 141 | 142 | def get_supported_events(self): 143 | """ Returns a list of supported monitoring events. """ 144 | 145 | mon_cap = self.cap.get_type('mon') 146 | 147 | events = [get_event_name(event.type) for event in mon_cap.events] 148 | 149 | # Filter out perf events 150 | # events = list(filter(lambda event: 'perf' not in event, events)) 151 | 152 | return events 153 | 154 | def get_all_cores(self): 155 | """ Returns a list of all available cores. Used for informational reasons only. """ 156 | 157 | cores = [] 158 | sockets = self.cpu_info.get_sockets() 159 | 160 | for socket in sockets: 161 | cores += self.cpu_info.get_cores(socket) 162 | 163 | return cores 164 | 165 | def reset(self): 166 | """ Resets monitoring and the allocation/association configuration. 
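The monitoring groups are then re-created by the caller; the order used by the environment (see rdt_env._reset_pqos) is: reset() -> setup_groups() -> set_association_class().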
""" 167 | 168 | self.mon.reset() 169 | self.reset_allocation_association() 170 | 171 | def update(self): 172 | """ Updates values for monitored events. """ 173 | 174 | self.mon.poll([self.group_hp, self.group_be]) 175 | 176 | def get_hp_metrics(self, time_interval): 177 | return get_metrics(self.group_hp.values, time_interval) 178 | 179 | def get_be_metrics(self, time_interval): 180 | return get_metrics(self.group_be.values, time_interval) 181 | 182 | def stop(self): 183 | """ Stops monitoring.""" 184 | 185 | self.group_hp.stop() 186 | self.group_be.stop() 187 | 188 | def set_allocation_class(self, ways_be): 189 | """ 190 | Sets up allocation classes of service on selected CPU sockets 191 | 192 | Parameters: 193 | ways_be: num of ways to be assigned for bes 194 | """ 195 | if ways_be == -1: # default setting, all ways can be accessed by both groups 196 | mask_be = ways[-1] 197 | mask_hp = ways[-1] 198 | else: 199 | mask_be = ways[ways_be] 200 | mask_hp = mask_be ^ base 201 | cos_hp = self.l3ca.COS(self.cos_id_hp, mask_hp) 202 | cos_be = self.l3ca.COS(self.cos_id_be, mask_be) 203 | 204 | try: 205 | self.l3ca.set(self.socket, [cos_hp, cos_be]) 206 | except: 207 | log.error("Setting up cache allocation class of service failed!") 208 | raise 209 | 210 | def print_allocation_config(self): 211 | """ """ 212 | sockets = [self.socket] # self.cpu_info.get_sockets() 213 | for socket in sockets: 214 | try: 215 | coses = self.l3ca.get(socket) 216 | 217 | log.debug("L3CA COS definitions for Socket %u:" % socket) 218 | 219 | for cos in coses: 220 | if cos.class_id == self.cos_id_be or cos.class_id == self.cos_id_hp: 221 | cos_params = (cos.class_id, cos.mask) 222 | log.debug(" L3CA COS%u => MASK 0x%x" % cos_params) 223 | except: 224 | log.warning("Error in getting allocation configuration") 225 | raise 226 | 227 | def reset_allocation_association(self): 228 | """ Resets allocation and association configuration. """ 229 | 230 | try: 231 | self.alloc.reset('any', 'any', 'any') 232 | log.debug("Allocation reset successful") 233 | except: 234 | log.warning("Allocation reset failed!") 235 | raise 236 | 237 | 238 | class PqosHandlerCore(PqosHandler): 239 | """ PqosHandler per core. """ 240 | 241 | def __init__(self, cores_hp, cores_be): 242 | """ 243 | Initializes object of this class with cores and events to monitor. 244 | 245 | Parameters: 246 | cores_hp: a list of cores assigned to hp 247 | cores_be: a list of cores assigned to bes 248 | """ 249 | 250 | interface = "MSR" 251 | super(PqosHandlerCore, self).__init__(interface) 252 | self.cores_hp = cores_hp 253 | self.cores_be = cores_be 254 | 255 | def setup_groups(self): 256 | """ Starts monitoring for each group of cores. """ 257 | 258 | self.group_hp = self.mon.start(self.cores_hp, self.events) 259 | self.group_be = self.mon.start(self.cores_be, self.events) 260 | 261 | def set_association_class(self): 262 | """ Sets up association classes of service on selected CPUs. 
""" 263 | 264 | try: 265 | for core_hp in self.cores_hp: 266 | self.alloc.assoc_set(core_hp, self.cos_id_hp) 267 | for core_be in self.cores_be: 268 | self.alloc.assoc_set(core_be, self.cos_id_be) 269 | except: 270 | log.error("Setting association between core and class of service failed!") 271 | raise 272 | 273 | def print_association_config(self): 274 | """ """ 275 | cores = self.cores_hp + self.cores_be # or self.get_all_cores() 276 | for core in cores: 277 | class_id = self.alloc.assoc_get(core) 278 | log.debug("Core %u => COS%u" % (core, class_id)) 279 | 280 | 281 | class PqosHandlerPid(PqosHandler): 282 | """ PqosHandler per PID (OS interface only). """ 283 | 284 | def __init__(self, pid_hp, pids_be): 285 | """ 286 | Initializes object of this class with PIDs and events to monitor. 287 | 288 | Parameters: 289 | pid_hp: pid of hp 290 | pids_be: a list of PIDs to monitor 291 | """ 292 | 293 | interface = "OS" 294 | super(PqosHandlerPid, self).__init__(interface) 295 | self.pid_hp = pid_hp 296 | self.pids_be = pids_be 297 | 298 | def setup_groups(self): 299 | """ Starts monitoring for group of PID(s). """ 300 | 301 | # NOTE there is the ability to add/remove pids_be to/from a group 302 | 303 | self.group_hp = self.mon.start_pids([self.pid_hp], self.events) 304 | self.group_be = self.mon.start_pids(self.pids_be, self.events) 305 | 306 | def set_association_class(self): 307 | """ Sets up association classes of service on hp pid as well as in be pids. """ 308 | 309 | try: 310 | self.alloc.assoc_set_pid(self.pid_hp, self.cos_id_hp) 311 | for pid in self.pids_be: 312 | self.alloc.assoc_set_pid(pid, self.cos_id_be) 313 | except: 314 | log.error("Setting association between pid and class of service failed!") 315 | raise 316 | 317 | def print_association_config(self): 318 | """ """ 319 | pids = [self.pid_hp] + self.pids_be 320 | for pid in pids: 321 | class_id = self.alloc.assoc_get_pid(pid) 322 | log.debug("Pid %u => COS%u" % (pid, class_id)) 323 | 324 | 325 | class PqosHandlerMock: 326 | """ Mock class for use in environments where pqos cannot be installed. 
""" 327 | 328 | def __int__(self, socket=0, cos_id_hp=1, cos_id_be=2): 329 | pass 330 | 331 | def setup_groups(self): 332 | pass 333 | 334 | def reset(self): 335 | pass 336 | 337 | def update(self): 338 | pass 339 | 340 | def get_hp_metrics(self, time_interval): 341 | return get_metrics_random() 342 | 343 | def get_be_metrics(self, time_interval): 344 | return get_metrics_random() 345 | 346 | def stop(self): 347 | pass 348 | 349 | def set_association_class(self): 350 | pass 351 | 352 | def set_allocation_class(self, ways_be): 353 | pass 354 | 355 | def reset_allocation_association(self): 356 | pass 357 | 358 | def print_association_config(self): 359 | pass 360 | 361 | def print_allocation_config(self): 362 | pass 363 | 364 | def finish(self): 365 | pass 366 | -------------------------------------------------------------------------------- /rdt_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | import logging.config 5 | from utils.constants import LC_TAG, BE_TAG 6 | from utils.config_constants import * 7 | import time 8 | 9 | from utils.functions import form_duration 10 | 11 | logging.config.fileConfig('logging.conf') 12 | log = logging.getLogger('simpleExample') 13 | 14 | features_min_max_values = { 15 | 'MPKC': (0, 14), 16 | 'MPKI': (0, 8), 17 | 'Misses': (0, 7*1e7), 18 | 'IPC': (0, 2.8), 19 | 'Bandwidth': (0, 3*1e4) 20 | } 21 | 22 | 23 | class Rdt(gym.Env): 24 | metadata = {'render.modes': ['human']} 25 | UPDATE_INTERVAL = 1000 # in ms, update status of BEs every 1s 26 | 27 | def __init__(self, config, loader, scheduler, pqos_handler): 28 | self.loader = loader 29 | self.scheduler = scheduler 30 | self.pqos_handler = pqos_handler 31 | 32 | self.latency_thr = int(config[LATENCY_thr]) 33 | self.violations = 0 34 | self.steps = 1 35 | self.penalty_coef = float(config[PEN_COEF]) 36 | self.feature = config[FEATURE] 37 | 38 | feature_min, feature_max = features_min_max_values[self.feature] 39 | log.info("Feature {} will be used with limits: {} - {}".format(self.feature, feature_min, feature_max)) 40 | 41 | self.action_space = spaces.Discrete(int(config[NUM_WAYS])) 42 | # latency, mpki_be # used to be 2*1e6, 5*1e7, ways_be # 14 me 30 gia mpc kai be=mcf 43 | # for gradient boost high in misses raised to 20 from 14 44 | self.observation_space = spaces.Box(low=np.array([feature_min, 0]), 45 | high=np.array([feature_max, self.action_space.n-1], dtype=np.float32), 46 | dtype=np.float32) 47 | 48 | self.previous_action = -1 # -1 action means all ways available to all groups 49 | 50 | self.update_interval_in_steps = self.UPDATE_INTERVAL // int(self.loader.measurement_interval) 51 | 52 | def _reset_pqos(self): 53 | self.pqos_handler.reset() 54 | self.pqos_handler.setup_groups() 55 | self.pqos_handler.set_association_class() 56 | self.pqos_handler.print_association_config() 57 | self.previous_action = -1 58 | 59 | def _stop_pqos(self): 60 | self.pqos_handler.stop() 61 | self.pqos_handler.reset() 62 | self.pqos_handler.finish() 63 | 64 | @staticmethod 65 | def _normalize(metric, min_val, max_val): 66 | """ Normalize the observed value between 1 and 0. 
""" 67 | if metric > max_val: 68 | return 1.0 69 | elif metric < min_val: 70 | return 0.0 71 | else: 72 | return (metric - min_val) / (max_val - min_val) 73 | 74 | def _get_next_state(self, action_be_ways): 75 | """ """ 76 | # poll metrics so the next poll will contains deltas from this point just after the action 77 | self.pqos_handler.update() 78 | start_time = time.time() 79 | # start the stats record, the recorder will go to sleep and the it 'll send the results 80 | tail_latency, rps = self.loader.get_stats() # NOTE this call will block 81 | 82 | self.pqos_handler.update() 83 | time_interval = time.time() - start_time 84 | ipc_hp, misses_hp, llc_hp, mbl_hp_ps, mbr_hp_ps, cycles_hp, instructions_hp =\ 85 | self.pqos_handler.get_hp_metrics(time_interval) 86 | ipc_be, misses_be, llc_be, mbl_be_ps, mbr_be_ps, cycles_be, instructions_be =\ 87 | self.pqos_handler.get_be_metrics(time_interval) 88 | 89 | # bw_socket_wide = mbl_hp_ps + mbl_be_ps 90 | # bw_lc = mbl_hp_ps + mbr_hp_ps 91 | 92 | if self.feature == 'IPC': 93 | feature = ipc_be 94 | elif self.feature == 'Misses': 95 | # normalization of misses on a specific time unit in order to compare with different action intervals 96 | # misses_be = misses_be / (int(self.action_interval) // 50) 97 | feature = misses_be 98 | elif self.feature == 'MPKC': 99 | misses_be = misses_be / (cycles_be / 1000.) 100 | misses_hp = misses_hp / (cycles_hp / 1000.) 101 | feature = misses_be 102 | elif self.feature == 'MPKI': 103 | misses_be = misses_be / (instructions_be / 1000.) 104 | misses_hp = misses_hp / (instructions_hp / 1000.) 105 | feature = misses_be 106 | elif self.feature == 'Bandwidth': 107 | feature = mbl_be_ps 108 | else: 109 | log.info("No such feature: {}".format(self.feature)) 110 | return 111 | 112 | info = {LC_TAG: (ipc_hp, misses_hp, llc_hp, mbl_hp_ps, mbr_hp_ps, tail_latency, rps), 113 | BE_TAG: (ipc_be, misses_be, llc_be, mbl_be_ps, mbr_be_ps, None, None)} 114 | 115 | state = [feature, action_be_ways] 116 | 117 | # we normalize as well the be_ways, as it is included in the state 118 | state_normalized = [self._normalize(metric, min_val, max_val) for metric, min_val, max_val in 119 | zip(state, self.observation_space.low, self.observation_space.high)] 120 | 121 | return state_normalized, info, tail_latency 122 | 123 | def _reward_func(self, action_be_ways, hp_tail_latency): 124 | """ Reward function. """ 125 | 126 | if hp_tail_latency < self.latency_thr: 127 | reward = action_be_ways 128 | # NOTE by shaping the reward function in this way, we are making the assumption that progress of BEs is 129 | # depended by the LLC ways that are allocated to them at any point of their execution. 130 | else: 131 | reward = - self.penalty_coef * self.action_space.n 132 | self.violations += 1 133 | 134 | return reward 135 | 136 | def reset(self): 137 | """ In case that this environment is used in episodic format. 
""" 138 | 139 | self._reset_pqos() 140 | self.loader.reset() 141 | self.scheduler.reset() 142 | 143 | state, _, _ = self._get_next_state(self.action_space.n) # we start with both groups sharing all ways 144 | 145 | log.info("Environment was successfully reset.") 146 | 147 | return state 148 | 149 | def step(self, action_be_ways): 150 | """ At each step the agent specifies the number of ways that are assigned to the be""" 151 | 152 | # log.debug("Action selected: {}".format(action_be_ways)) 153 | # self.new_be = False 154 | 155 | done = False # update the status of BEs once in a while to reduce docker demon cpu utilization 156 | if self.steps % self.update_interval_in_steps == 0: 157 | done = self.scheduler.update_status() 158 | 159 | # err_msg = "%r (%s) invalid" % (action_be_ways, type(action_be_ways)) 160 | # assert self.action_space.contains(action_be_ways), err_msg 161 | 162 | # avoid enforcing decision when nothing changes. Does this cause any inconsistencies ? 163 | if action_be_ways != self.previous_action: 164 | # enforce the decision with PQOS 165 | self.pqos_handler.set_allocation_class(action_be_ways) 166 | # self.pqos_handler.print_allocation_config() 167 | self.previous_action = action_be_ways 168 | 169 | state, info, tail_latency = self._get_next_state(action_be_ways) 170 | 171 | reward = self._reward_func(action_be_ways, tail_latency) # based on new metrics 172 | 173 | self.steps += 1 174 | 175 | return state, reward, done, info # , self.new_be 176 | 177 | def render(self, **kwargs): 178 | pass 179 | 180 | def get_experiment_duration(self): 181 | """ Properly shapes and returns the time needed for the experiment to finish. """ 182 | 183 | return self.scheduler.get_experiment_duration() 184 | 185 | def stop(self): 186 | log.warning('Stopping everything!') 187 | 188 | duration = form_duration(self.get_experiment_duration()) 189 | 190 | log.info('Percentage of violations: {}'.format(self.violations / self.steps)) 191 | log.info('Duration of experiment: {}'.format(duration)) 192 | 193 | self.scheduler.stop_bes() # stop and remove the be containers 194 | self.loader.stop() # stop the service loader 195 | self._stop_pqos() # stop pqos 196 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/nikmand/Reinforcement-Learning-Library.git#egg=rlsuite 2 | git+https://github.com/intel/intel-cmt-cat.git#egg=pqos&subdirectory=lib/python 3 | torch 4 | torchvision 5 | tb-nightly 6 | gym 7 | matplotlib 8 | docker 9 | numpy -------------------------------------------------------------------------------- /scheduler.py: -------------------------------------------------------------------------------- 1 | import time 2 | import ast 3 | import docker 4 | import random 5 | import logging.config 6 | from utils.functions import parse_num_list 7 | from utils.config_constants import * 8 | from abc import ABC, abstractmethod 9 | 10 | logging.config.fileConfig('logging.conf') 11 | log = logging.getLogger('simpleExample') 12 | 13 | 14 | def read_avail_dockers(docker_file): 15 | """ Gets a dictionary with the available BEs and their parameters needed for execution. """ 16 | 17 | with open(docker_file, "r") as file: 18 | contents = file.read() 19 | bes = ast.literal_eval(contents) 20 | 21 | return bes 22 | 23 | 24 | class Scheduler(ABC): 25 | """ Handles all the operations needed to execute the Best Effort applications. 
26 | Docker containers are used to handle the execution. """ 27 | 28 | def __init__(self, config): 29 | self.cores_per_be = config.getint(CORES_PER_BE) 30 | self.cores_pids_be_range = parse_num_list(config[CORES_BE]) 31 | self.container_bes = [] 32 | self.client = docker.from_env() 33 | 34 | self.finished_bes = 0 35 | self.bes_available = read_avail_dockers(config[DOCKER_FILE]) 36 | 37 | # self.issued_bes = 0 # what is it used for? 38 | self.be_repeated = config.getint(BE_REPEATED) 39 | self.be_quota = self.be_repeated # they are set equal so that in the first check a new BE will be issued 40 | self.last_be = None 41 | self.new_be = False 42 | self.num_total_bes = config.getint(NUM_BES) 43 | # even in the case of QueueScheduler we need this, as we may provide additional BEs so that the system stays full until 44 | # the desired number of BEs has completed. 45 | 46 | self.start_time_bes = None 47 | self.stop_time_bes = None 48 | self.experiment_duration = 0 # in minutes 49 | 50 | def cores_map(self, i): 51 | """ Returns the cores that correspond to the ith container. """ 52 | cores_range = self.cores_pids_be_range[i * self.cores_per_be: (i + 1) * self.cores_per_be] 53 | cores_range_string = map(str, cores_range) 54 | return ','.join(cores_range_string) 55 | 56 | @abstractmethod 57 | def _select_be(self): 58 | raise NotImplementedError 59 | 60 | @abstractmethod 61 | def reset(self): 62 | raise NotImplementedError 63 | 64 | def _restart_scheduling(self): 65 | """ Stops currently running BEs and starts new ones. """ 66 | 67 | self.stop_bes() 68 | self.start_bes() 69 | log.debug('BEs started') 70 | 71 | def _repeat_be(self): 72 | """ Checks if a new BE should be selected or the current one can be reintroduced. """ 73 | 74 | # NOTE do we still need this functionality for our experiments? If yes, it can be implemented as a decorator. 75 | # Actually the case of a repeated BE can be treated as a special case of QueueScheduler. 76 | if self.be_quota >= self.be_repeated: 77 | self.be_quota = 1 78 | self.new_be = True 79 | return self._select_be() 80 | else: 81 | self.be_quota += 1 82 | return self.last_be 83 | 84 | def _start_be(self, cores): 85 | """ Starts a container on the specified cores. """ 86 | 87 | # log.info('New BE will be issued on core(s): {} at step: {}'.format(cores, self.steps)) 88 | 89 | be = self._select_be() 90 | log.info('Selected Job: {}'.format(be)) 91 | container, command, volume = self.bes_available[be] 92 | container_be = self.client.containers.run(container, command=command, name='be_' + cores.replace(",", "_"), 93 | cpuset_cpus=cores, volumes_from=[volume] if volume is not None 94 | else [], detach=True) 95 | # self.issued_bes += 1 96 | 97 | return container_be 98 | 99 | def start_bes(self): 100 | """ Launches the BEs. """ 101 | 102 | num_startup_bes = len(self.cores_pids_be_range) // self.cores_per_be 103 | # NOTE: each launched BE should be placed directly in the containers list. Otherwise, if an error pops up 104 | # during the process, the list won't have been formed yet, so the launched BEs are not going to be stopped. 105 | for i in range(num_startup_bes): 106 | self.container_bes.append(self._start_be(self.cores_map(i))) 107 | 108 | self.start_time_bes = time.time() 109 | 110 | def reissue_bes(self, have_finished): 111 | """ Issues new BEs on cores that finished execution, if there are any. 
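Exited containers are stopped and removed, and a replacement BE is launched on the same cores, so that the system stays fully loaded until num_total_bes BEs have completed.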
""" 112 | 113 | for i, has_finished in enumerate(have_finished): 114 | if has_finished: 115 | self._stop_be(self.container_bes[i]) 116 | self.container_bes[i] = self._start_be(self.cores_map(i)) 117 | log.info("Finished Bes: {}/{}".format(self.finished_bes, self.num_total_bes)) 118 | 119 | def _poll_bes(self): 120 | """ Reloads the status of containers and checks if they have exited. """ 121 | 122 | have_finished = [] 123 | for container_be in self.container_bes: 124 | container_be.reload() 125 | if container_be.status == 'exited': 126 | have_finished.append(True) 127 | self.finished_bes += 1 128 | else: 129 | have_finished.append(False) 130 | 131 | return have_finished 132 | 133 | @staticmethod 134 | def _stop_be(container_be): 135 | """ Stops and removes exited containers. """ 136 | container_be.stop() 137 | container_be.remove() 138 | 139 | def stop_bes(self): 140 | """ Stops all the containers. """ 141 | for container_be in self.container_bes: 142 | self._stop_be(container_be) 143 | 144 | def update_status(self): 145 | """ Polls the status of the containers and determines which of them have finished. If the """ 146 | 147 | have_finished = self._poll_bes() 148 | done = self.finished_bes >= self.num_total_bes 149 | # done = any(have_finished) # done if any of the bes has finished execution 150 | if done: 151 | self.stop_time_bes = time.time() 152 | self.experiment_duration = (self.stop_time_bes - self.start_time_bes) / 60 153 | else: 154 | self.reissue_bes(have_finished) 155 | 156 | return done 157 | 158 | def get_experiment_duration(self): 159 | """ Returns the time (in minutes) needed to for the bes to be completed. """ 160 | 161 | return self.experiment_duration 162 | 163 | 164 | class RandomScheduler(Scheduler): 165 | """ Initializes a random generator given a specific seed. The choices of the bes are made by the generator. """ 166 | 167 | def __init__(self, config): 168 | super().__init__(config) 169 | self.seed = config.getint(SEED) 170 | self.generator = random.Random(self.seed) 171 | 172 | def _select_be(self): 173 | return self.generator.choice(list(self.bes_available.keys())) 174 | 175 | def reset(self): 176 | self.generator = random.Random(self.seed) 177 | self._restart_scheduling() 178 | 179 | 180 | class QueueScheduler(Scheduler): 181 | """ It takes a list of BEs as input. The choices of the bes are made as in a queue. 
""" 182 | 183 | def __init__(self, config): 184 | super().__init__(config) 185 | self.bes_list = config[BES_LIST] 186 | self.bes_selected = ast.literal_eval(self.bes_list) 187 | # num of total bes can be less than the provided BEs, but never bigger 188 | self.num_total_bes = min(self.num_total_bes, len(self.bes_selected)) 189 | 190 | def _select_be(self): 191 | return self.bes_selected.pop(0) 192 | 193 | def reset(self): 194 | self.bes_selected = ast.literal_eval(self.bes_list) 195 | self._restart_scheduling() 196 | -------------------------------------------------------------------------------- /scripts/study_abliation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for be in in-memory 5 | do 6 | echo " $be" 7 | #time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_abliation_all --be-name $be --comment "$be"_abliation_all 8 | #time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_abliation_noper --be-name $be --comment "$be"_abliation_noper 9 | #time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_abliation_nodueling --be-name $be --comment "$be"_abliation_nodueling 10 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_abliation_nodouble --be-name $be --comment "$be"_ababliation_nodouble 11 | done 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /scripts/study_coeff.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for quantile in .99 5 | do 6 | for be in graphs 7 | do 8 | echo "$quantile" " $be" 9 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_coeff -q $quantile --be-name $be --comment "$be"_q"$quantile"_coeff4 10 | done 11 | done 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /scripts/study_coex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for be in RandomForestClassifier #DecisionTreeClassification GradientBoostedTreeRegressor LinearSVC RandomForestClassifier astar hmmer LogisticRegressionWithElasticNet 5 | do 6 | echo " $be" 7 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_coex --be-name $be --comment "$be"_coex_agent 8 | done 9 | 10 | -------------------------------------------------------------------------------- /scripts/study_different_bes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_diff_bes_transfer --comment transfer_train_on_5Noflush_diff_bes 5 | 6 | -------------------------------------------------------------------------------- /scripts/study_feature.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for feature in MPKC MPKI Bandwidth IPC 5 | do 6 | for be in in-memory graphs 7 | do 8 | echo "$feature" " $be" 9 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_feature -f $feature --be-name $be --comment "$be"_"$feature"_feature 10 | done 11 | done 12 | 13 | 14 | 15 | 
-------------------------------------------------------------------------------- /scripts/study_interval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for interval in "200 0.002" 5 | do 6 | set -- $interval 7 | for be in graphs 8 | do 9 | echo "$1" " $be" " $2" 10 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_interval -d $2 -t $1 --be-name $be --comment "$be"_"$1"ms_"$2"decay_interval 11 | done 12 | done 13 | 14 | -------------------------------------------------------------------------------- /scripts/study_lc_profiling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for i in {0..18} 5 | do 6 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_measurements.py -c configs_sched/measurement_lc --ways-be $i --warm-up 150 --comment new_lc_profiling 7 | done 8 | 9 | -------------------------------------------------------------------------------- /scripts/study_measurement.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for way in -1 5 | do 6 | for be in RandomForestClassifier # hmmer astar LogisticRegressionWithElasticNet DecisionTreeClassification LinearSVC in-memory graphs GradientBoostedTreeRegressor 7 | do 8 | echo "$way" " $be" 9 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_measurements.py -c configs_sched/study_measurement --ways-be $way --be-name $be --comment "$be"_coex_alone 10 | done 11 | done 12 | 13 | -------------------------------------------------------------------------------- /scripts/study_measurement_diff_bes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for way in 0 -1 5 | do 6 | echo "$way" 7 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_measurements.py -c configs_sched/study_diff_bes_transfer --ways-be $way --comment diff_bes 8 | done 9 | 10 | -------------------------------------------------------------------------------- /scripts/study_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./study_transfer_train.sh 4 | 5 | ./study_transfer.sh 6 | -------------------------------------------------------------------------------- /scripts/study_quantile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | for quantile in .95 5 | do 6 | for be in graphs 7 | do 8 | echo "$quantile" " $be" 9 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_quantile -q $quantile --be-name $be --comment "$be"_q"$quantile"_quantile 10 | done 11 | done 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /scripts/study_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in "50 0.0005" "e 5" 4 | do 5 | set -- $i 6 | echo $1 and $2 7 | done 8 | -------------------------------------------------------------------------------- /scripts/study_transfer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # TODO: take the checkpoint as a parameter 5 | 6 | for be in astar in-memory RandomForestClassifier hmmer 7 | do 8 | echo " $be" 9 | time taskset --cpu-list 18-19 python 
~/reinforcement-learning/main_rdt.py -c configs_sched/study_transfer --be-name $be --comment "$be"_transfer_from5NoFlush 10 | done 11 | 12 | #for be in astar in-memory graphs RandomForestClassifier 13 | # do 14 | # echo " $be" 15 | # time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_transfer_from4 --be-name $be --comment "$be"_transfer_from4 16 | #done 17 | 18 | -------------------------------------------------------------------------------- /scripts/study_transfer_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | time taskset --cpu-list 18-19 python ~/reinforcement-learning/main_rdt.py -c configs_sched/study_transfer_train --comment transfer_train_on_5Noflush 5 | 6 | -------------------------------------------------------------------------------- /utils/argparser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def cmd_parser(): 5 | """ 6 | Parses command line arguments. 7 | 8 | Returns: 9 | an object with parsed command line arguments 10 | """ 11 | 12 | description = 'RL Agent' 13 | parser = argparse.ArgumentParser(description=description, fromfile_prefix_chars='@') 14 | parser.add_argument('-i', '--interface', default='MSR', help='select pqos interface') 15 | parser.add_argument('-r', '--rps', type=int, default=10000, help='Requests per second that client should generate') 16 | parser.add_argument('-g', '--ratio', default='0.8', help='Ratio of get/set requests') 17 | parser.add_argument('-p', '--loader-dir', help='Path to memcached loader') 18 | parser.add_argument('-t', '--interval', default='200', help='Interval to wait after a decision in ms') 19 | parser.add_argument('--cores-lc', default="0", help='Cores in which lc critical service already run') 20 | parser.add_argument('--cores-be', default='1-9', help='Cores in which be process will be launched') 21 | parser.add_argument('--cores-client', default='10-13', help='Cores in which load client will be launched') 22 | parser.add_argument('--loader-threads', default='1', help='Number of workers for the load testing') 23 | parser.add_argument('--latency-thr', type=int, default=10, help='Q95 latency threshold in ms') 24 | parser.add_argument('--be-name', default='in-memory-small', help='Be name') 25 | parser.add_argument('--num-bes', type=int, default=1, help='Number of BE containers to be launched') 26 | parser.add_argument('--tensorboard', action='store_true', help='Enable Tensorboard') # unused 27 | parser.add_argument('-c', '--config-file', default='configs/local', help='Path to config file') 28 | parser.add_argument('--comment', default='', help='Comment to add on tensorboard folder name as suffix') 29 | parser.add_argument('-q', '--quantile', default='.95', help='Choose quantile for which stats will be reported') 30 | parser.add_argument('-f', '--feature', default='MPKC', help='Hw feature to be used as input') 31 | parser.add_argument('-d', '--decay', default='0.0005', help='Epsilon decay rate') 32 | # parser.add_argument('--path-mem', help='') 33 | # nargs='+' all command-line args present are gathered into a list 34 | 35 | return parser 36 | -------------------------------------------------------------------------------- /utils/config_constants.py: -------------------------------------------------------------------------------- 1 | # sectors 2 | ENV = "env" 3 | LOADER = "loader" 4 | SCHEDULER = "scheduler" 5 | AGENT = "agent" 6 | PQOS = "pqos" 7 | 8 
--------------------------------------------------------------------------------
/utils/config_constants.py:
--------------------------------------------------------------------------------
1 | # sections of the config files
2 | ENV = "env"
3 | LOADER = "loader"
4 | SCHEDULER = "scheduler"
5 | AGENT = "agent"
6 | PQOS = "pqos"
7 | 
8 | # env
9 | LATENCY_thr = "latency_thr"
10 | NUM_WAYS = 'num_ways'
11 | PEN_COEF = 'pen_coef'
12 | FEATURE = 'feature'
13 | 
14 | # PQOS
15 | PQOS_INTERFACE = 'pqos_interface'
16 | CORES_LC = "cores_lc"
17 | # cores for the BEs are also needed; see CORES_BE under the scheduler section
18 | 
19 | # loader
20 | HP_IP = 'hp_ip'
21 | HP_PORT = 'hp_port'
22 | CORES_LOADER = 'cores_loader'
23 | LOADER_DIR = 'loader_dir'
24 | ACTION_INTERVAL = 'interval'
25 | LOADER_RPS = 'rps'
26 | LOADER_THREADS = 'loader_threads'
27 | LOADER_CONN = 'loader_conn'
28 | QUANTILE = 'quantile'
29 | EXP_DIST = 'exp_dist'
30 | GET_SET_RATIO = 'ratio'
31 | 
32 | # scheduler
33 | BE_REPEATED = 'be_repeated'
34 | CORES_BE = 'cores_be'
35 | CORES_PER_BE = 'cores_per_be'
36 | NUM_BES = 'num_bes'
37 | DOCKER_FILE = 'docker_file'
38 | # scheduler subclasses
39 | SEED = 'seed'
40 | BES_LIST = 'bes_list'
41 | 
42 | 
43 | # TODO get most, or maybe even all, of these constants from the suite library
44 | # agent
45 | LR = 'lr'
46 | LAYERS_DIM = 'layers_dim'
47 | TARGET_UPDATE = 'target_update'
48 | BATCH_SIZE = 'batch_size'
49 | GAMMA = 'gamma'
50 | ARCH = 'arch'
51 | ALGO = 'algo'
52 | MEM_SIZE = 'mem_size'
53 | MEM_PER = 'mem_type'
54 | EPS_DECAY = 'eps_decay'
55 | EPS_START = 'eps_start'
56 | EPS_END = 'eps_end'
57 | CHECKPOINT = 'checkpoint'
58 | WEIGHTS = 'weights'
59 | 
60 | # misc
61 | 
--------------------------------------------------------------------------------
/utils/constants.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | 
3 | LOGGER = 'simpleExample'
4 | LOGGER_PATH = 'logging.conf'
5 | 
6 | LC_TAG = "Latency Critical"
7 | BE_TAG = "Best Effort"
8 | 
9 | metric_names = ['IPC', 'Misses per k. cycles', 'LLC Occupancy', 'Bandwidth L.', 'Bandwidth R.', 'Latency', 'RPS']
10 | 
11 | 
12 | class Loaders(str, Enum):
13 |     MEMCACHED = "memcached"
14 | 
15 | 
16 | class Schedulers(str, Enum):
17 |     RANDOM = "random"
18 |     QUEUE = "queue"
19 | 
--------------------------------------------------------------------------------
/utils/functions.py:
--------------------------------------------------------------------------------
1 | from utils.constants import metric_names
2 | import re
3 | import configparser
4 | 
5 | 
6 | def config_parser(filename):
7 |     """ Reads an INI-style config file and returns the resulting ConfigParser. """
8 |     config = configparser.ConfigParser()
9 |     try:
10 |         with open(filename) as f:
11 |             config.read_file(f)
12 |     except FileNotFoundError:
13 |         raise FileNotFoundError("Filename {} does not exist. Exiting...".format(filename))
14 | 
15 |     return config
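16 | 
17 | # Usage sketch, assuming a config file with an [env] section such as configs/local:
18 | #
19 | #   config = config_parser('configs/local')
20 | #   num_ways = config['env'].getint('num_ways')
21 | #   latency_thr = config['env'].getint('latency_thr')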
""" 26 | header = '{}/'.format(tag) 27 | for metric, metric_name in zip(metrics, metric_names): 28 | if metric is not None: 29 | tboard_writer.add_scalar(header + metric_name, metric, step) 30 | tboard_writer.flush() 31 | 32 | 33 | def form_duration(duration_minutes): 34 | """ """ 35 | minutes = int(duration_minutes) 36 | seconds = int(round((duration_minutes % 1) * 60, 0)) 37 | duration = str(minutes) + 'm' + str(seconds) + 's' 38 | 39 | return duration 40 | 41 | # TODO remove this commented out code, check with jupyter notebook if similar code is present there 42 | # use to log latency with this 43 | # latency_per = np.percentile(latency_list, 99) 44 | # latency_list_per = [min(i, latency_per) for i in latency_list] 45 | # plt.plot(latency_list_per) 46 | # plt.title('Effect of collocation in tail latency') 47 | # plt.axvline(x=self.warm_up, color='g', linestyle='dashed', label='BEs starts') 48 | # plt.axvline(x=len(latency_list_per) - self.warm_up, color='r', linestyle='dashed', label='BEs stops') 49 | # plt.axhline(y=self.latency_thr, color='m', label='Latency threshold') 50 | # plt.xlabel('Steps') 51 | # plt.ylabel('Q95 Latency in ms') 52 | # plt.legend(loc='best') 53 | # plt.savefig('runs/collocation_{}.png'.format(datetime.today().strftime('%Y%m%d_%H%M%S'))) 54 | # plt.show() 55 | --------------------------------------------------------------------------------