├── __init__.py ├── logs └── temp.log ├── utils ├── __init__.py └── weights_logger.py ├── algorithms ├── __init__.py ├── deterministic.py ├── semi_gradient_sarsa_algorithm.py ├── reinforce_algorithm.py ├── true_online_sarsa_lambda.py ├── actor_critic_eligibility_trace_algorithm_nn.py ├── actor_critic_eligibility_trace_algorithm_linear.py ├── optimal_algorithm.py ├── actor_critic_eligibility_trace_algorithm_tc.py ├── actor_critic_one_step.py └── semi_gradient_n_step_sarsa_algorithm.py ├── policy_approximations ├── __init__.py └── linear_policy_approximation.py ├── replacement_policies ├── __init__.py ├── policy_base.py ├── lru.py ├── fifo.py └── lfu.py ├── state_approximations ├── __init__.py ├── state_approximation.py ├── linear_v_approximation.py ├── nn_v_approximation.py └── one_d_tc.py ├── state_action_approximations ├── __init__.py ├── state_action_approximation.py ├── linear_q_approximation.py ├── nn_q_approximation.py ├── tile_coding_state_action.py └── one_d_tc.py ├── experiments ├── graphs │ ├── lru.png │ ├── reinforce.png │ ├── semi_gradient_sarsa.png │ ├── true_online_sarsa_lambda.png │ └── semi_gradient_n_step_sarsa.png └── scratch ├── RLCar_Reinforcement_Learning_for_Cache_admission_and_Replacement.pdf ├── requirements.txt ├── LICENSE ├── .gitignore ├── README.md ├── replacement_agent.py ├── trace_generator.py ├── trace_loader.py ├── main.py ├── run.sh └── cache.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logs/temp.log: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /policy_approximations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /replacement_policies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /state_approximations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /state_action_approximations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/graphs/lru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Godric877/RLCaR/HEAD/experiments/graphs/lru.png -------------------------------------------------------------------------------- /experiments/graphs/reinforce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Godric877/RLCaR/HEAD/experiments/graphs/reinforce.png -------------------------------------------------------------------------------- /experiments/graphs/semi_gradient_sarsa.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Godric877/RLCaR/HEAD/experiments/graphs/semi_gradient_sarsa.png -------------------------------------------------------------------------------- /experiments/graphs/true_online_sarsa_lambda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Godric877/RLCaR/HEAD/experiments/graphs/true_online_sarsa_lambda.png -------------------------------------------------------------------------------- /experiments/graphs/semi_gradient_n_step_sarsa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Godric877/RLCaR/HEAD/experiments/graphs/semi_gradient_n_step_sarsa.png -------------------------------------------------------------------------------- /RLCar_Reinforcement_Learning_for_Cache_admission_and_Replacement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Godric877/RLCaR/HEAD/RLCar_Reinforcement_Learning_for_Cache_admission_and_Replacement.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cloudpickle==2.0.0 2 | cycler==0.11.0 3 | fonttools==4.32.0 4 | gym==0.23.1 5 | gym-notices==0.0.6 6 | importlib-metadata==4.11.3 7 | joblib==1.1.0 8 | kiwisolver==1.4.2 9 | matplotlib==3.5.1 10 | numpy==1.22.3 11 | packaging==21.3 12 | pandas==1.4.2 13 | Pillow==9.1.0 14 | pyparsing==3.0.8 15 | python-dateutil==2.8.2 16 | pytz==2022.1 17 | scikit-learn==1.0.2 18 | scipy==1.8.0 19 | six==1.16.0 20 | sklearn==0.0 21 | threadpoolctl==3.1.0 22 | torch==1.11.0 23 | typing-extensions==4.1.1 24 | zipp==3.8.0 25 | -------------------------------------------------------------------------------- /replacement_policies/policy_base.py: -------------------------------------------------------------------------------- 1 | class PolicyBase: 2 | 3 | def __init__(self, capacity: int): 4 | pass 5 | 6 | def update(self, key: int, val: int) -> bool: 7 | pass 8 | 9 | def get_remove_candidate(self): 10 | pass 11 | 12 | def update_history(self): 13 | pass 14 | 15 | def remove(self): 16 | pass 17 | 18 | def put(self, key: int, value : int) -> None: 19 | pass 20 | 21 | def remove_key(self, key): 22 | pass 23 | 24 | def reset(self): 25 | pass -------------------------------------------------------------------------------- /utils/weights_logger.py: -------------------------------------------------------------------------------- 1 | import csv 2 | class WeightLogger(): 3 | def __init__(self): 4 | self.weights = [] 5 | 6 | def update_weights(self, weight, tick): 7 | #print("weights = ", weight) 8 | list_weight = list(weight) 9 | list_weight.insert(0, tick) 10 | list_str_weight = [str(w) for w in list_weight] 11 | self.weights.append(list_str_weight) 12 | #print("weights in list = ", list_weight) 13 | 14 | def end(self, index): 15 | file = "weights_{}.csv".format(index) 16 | #print("weights = ", self.weights) 17 | with open(file, "w", newline="") as f: 18 | writer = csv.writer(f) 19 | writer.writerows(self.weights) 20 | self.weights =[] 21 | -------------------------------------------------------------------------------- /state_approximations/state_approximation.py: -------------------------------------------------------------------------------- 1 | class StateApproximation(object): 2 | def 
__call__(self,s) -> float: 3 | """ 4 | return the value of given state; \hat{v}(s) 5 | 6 | input: 7 | state 8 | output: 9 | value of the given state 10 | """ 11 | raise NotImplementedError() 12 | 13 | def update(self,alpha,G,state): 14 | """ 15 | Implement the update rule; 16 | w <- w + \alpha[G- \hat{v}(s_tau;w)] \nabla\hat{v}(s_tau;w) 17 | 18 | input: 19 | alpha: learning rate 20 | G: TD-target 21 | s_tau: target state for updating (yet, update will affect the other states) 22 | ouptut: 23 | None 24 | """ 25 | raise NotImplementedError() -------------------------------------------------------------------------------- /state_action_approximations/state_action_approximation.py: -------------------------------------------------------------------------------- 1 | class StateActionApproximation(object): 2 | def __call__(self,s, a) -> float: 3 | """ 4 | return the value of given state; \hat{v}(s) 5 | 6 | input: 7 | state 8 | output: 9 | value of the given state 10 | """ 11 | raise NotImplementedError() 12 | 13 | def update(self,alpha,G,state, action): 14 | """ 15 | Implement the update rule; 16 | w <- w + \alpha[G- \hat{v}(s_tau;w)] \nabla\hat{v}(s_tau;w) 17 | 18 | input: 19 | alpha: learning rate 20 | G: TD-target 21 | s_tau: target state for updating (yet, update will affect the other states) 22 | ouptut: 23 | None 24 | """ 25 | raise NotImplementedError() -------------------------------------------------------------------------------- /algorithms/deterministic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def always_evict(env, episodes): 4 | bhr_metric = {} 5 | rewards = {} 6 | for i in episodes: 7 | env.reset(i) 8 | done = False 9 | episode_rewards = [] 10 | while not done: 11 | act = 1 12 | obs, reward, done, info = env.step(act) 13 | episode_rewards.append(reward) 14 | if done: 15 | bhr_metric[i] = info[2] 16 | rewards[i] = episode_rewards 17 | return rewards, bhr_metric 18 | 19 | def random_eviction(env, episodes, p=[0.5, 0.5]): 20 | bhr_metric = {} 21 | rewards = {} 22 | for i in episodes: 23 | env.reset(i) 24 | done = False 25 | episode_rewards = [] 26 | while not done: 27 | act = np.random.choice(np.arange(2), p=p) 28 | obs, reward, done, info = env.step(act) 29 | episode_rewards.append(reward) 30 | if done: 31 | bhr_metric[i] = info[2] 32 | rewards[i] = episode_rewards 33 | return rewards, bhr_metric 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Syamantak Kumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /replacement_policies/lru.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict, Counter 2 | from replacement_policies.policy_base import PolicyBase 3 | 4 | class LRUCache(PolicyBase): 5 | 6 | def __init__(self, capacity: int): 7 | self.capacity = capacity 8 | self.cache=OrderedDict() 9 | self.history = [] 10 | self.history_dict = Counter() 11 | 12 | def update(self, key: int, val : int): 13 | self.cache.move_to_end(key) 14 | is_present = (key in self.history_dict) 15 | return is_present 16 | 17 | def get_remove_candidate(self): 18 | return next(iter(self.cache)) 19 | 20 | def update_history(self): 21 | candidate = self.get_remove_candidate() 22 | if (len(self.history) >= self.capacity): 23 | remove_item = self.history.pop(0) 24 | if(self.history_dict[remove_item] == 1): 25 | del self.history_dict[remove_item] 26 | else: 27 | self.history_dict[remove_item] -= 1 28 | self.history.append(candidate) 29 | self.history_dict[candidate] += 1 30 | 31 | def remove(self): 32 | self.update_history() 33 | return self.cache.popitem(last=False) 34 | 35 | def put(self, key: int, value : int) -> None: 36 | if len(self.cache) >= self.capacity: 37 | self.remove() 38 | self.cache[key]=value 39 | self.cache.move_to_end(key) 40 | 41 | def remove_key(self, key): 42 | self.update_history() 43 | if key in self.cache: 44 | del self.cache[key] 45 | 46 | def reset(self): 47 | self.cache = OrderedDict() 48 | self.history = [] 49 | self.history_dict = Counter() -------------------------------------------------------------------------------- /experiments/scratch: -------------------------------------------------------------------------------- 1 | 2 | Average BHR on test data : 0.32431377571990927 LFU 20 episodes semi-gradient-sarsa 3 | Average BHR on test data : 0.3048781347374865 LRU 20 episodes semi-gradient-sarsa 4 | Average BHR on test data : 0.2069968126235923 FIFO 20 episodes semi-gradient-sarsa 5 | 6 | Average BHR on test data : 0.31541169628008753 LFU 20 episodes semi-gradient-sarsa 7 | Average BHR on test data : 0.30545926748786245 LRU 20 episodes semi-gradient-sarsa 8 | Average BHR on test data : 0.21447233150446082 FIFO 20 episodes semi-gradient-sarsa 9 | Average BHR on test data : 0.21905908945736902 LRU with action = 1 10 | Average BHR on test data : 0.34458940077996336 LFU with action = 1 11 | Average BHR on test data : 0.17755621299854102 FIFO with action = 1 12 | 13 | Average BHR on test data : 0.31167647066116966 LRU+LFU 20 episodes semi-gradient-sarsa 14 | 15 | Average BHR on test data : 0.339925737554025 LFU with random action 16 | Average BHR on test data : 0.2376831244487361 LRU with random action 17 | 18 | Average BHR on test data : 0.3367723665646943 19 | 20 | max_frequency = 1200 21 | 22 | TODO : 23 | 24 | 25 | 1) 1-d Tiling 26 | 2) Actor-critic 27 | 3) tune n-step sarsa and reinforce 28 | 4) try zipf distribution 29 | 30 | 31 | 32 | 33 | 2) Q-learning 34 | 35 | {'always_evict': [0.22056036436 36 | 37227, 0.2198801156035919, 0.22439159912212325, 0.22364186974022077, 0.2211926687800138], 'semi_gradient_sarsa_1': [0.32004784380764345, 0.30892408845407265, 
0.32025700429202836, 0.25863111078096196, 0.2979258752894557], 'semi_gradient_sarsa_5': [0.2950395667965652, 0.24151138777404516, 0.2820994152510916, 0.29906585138712416, 0.26672195128955045], 'reinforce': [0.2719402382715548, 0.27601263900307127, 0.23157174971041872, 0.258354871289979, 0.2662656307278579]} 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /replacement_policies/fifo.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from replacement_policies.policy_base import PolicyBase 3 | 4 | class FifoCache(PolicyBase): 5 | 6 | def __init__(self, capacity: int): 7 | self.capacity = capacity 8 | self.cache_stack = [] 9 | self.value_dict = {} 10 | self.history = [] 11 | self.history_dict = Counter() 12 | 13 | def update(self, key: int, val: int): 14 | return (key in self.history_dict) 15 | 16 | def get_remove_candidate(self): 17 | return self.cache_stack[-1] 18 | 19 | def update_history(self): 20 | candidate = self.get_remove_candidate() 21 | if (len(self.history) >= self.capacity): 22 | remove_item = self.history.pop(0) 23 | if(self.history_dict[remove_item] == 1): 24 | del self.history_dict[remove_item] 25 | else: 26 | self.history_dict[remove_item] -= 1 27 | self.history.append(candidate) 28 | self.history_dict[candidate] += 1 29 | 30 | def remove(self): 31 | self.update_history() 32 | key = self.cache_stack.pop() 33 | val = self.value_dict[key] 34 | del self.value_dict[key] 35 | return key ,val 36 | 37 | def put(self, key: int, value : int) -> None: 38 | if len(self.cache_stack) >= self.capacity: 39 | self.remove() 40 | self.cache_stack.append(key) 41 | self.value_dict[key] = value 42 | 43 | def remove_key(self, key): 44 | self.update_history() 45 | if key in self.cache_stack: 46 | self.cache_stack.remove(key) 47 | del self.value_dict[key] 48 | 49 | def reset(self): 50 | self.cache_stack = [] 51 | self.value_dict = {} 52 | self.history = [] 53 | self.history_dict = Counter() -------------------------------------------------------------------------------- /algorithms/semi_gradient_sarsa_algorithm.py: -------------------------------------------------------------------------------- 1 | # One step Semi Gradient Sarsa 2 | 3 | import numpy as np 4 | 5 | from state_action_approximations.state_action_approximation import StateActionApproximation 6 | 7 | def epsilon_greedy(Q:StateActionApproximation, epsilon, state, actions): 8 | random = np.random.binomial(1, epsilon) 9 | max_ac = 0 10 | if random == 0: 11 | max_q = np.NINF 12 | for action in actions: 13 | current_q = Q(state, action) 14 | if current_q > max_q: 15 | max_ac = action 16 | max_q = current_q 17 | else: 18 | action_size = len(actions) 19 | index = np.random.randint(action_size) 20 | max_ac = actions[index] 21 | return max_ac 22 | 23 | def semi_gradient_sarsa(env, gamma, alpha, Q:StateActionApproximation, 24 | epsilon, episodes, actions): 25 | #episodes = np.arange(num_episode) 26 | #train, eval = train_test_split(episodes, test_size=0.2) 27 | bhr_metric = {} 28 | rewards = {} 29 | for i in episodes: 30 | s_current = env.reset(i) 31 | action = epsilon_greedy(Q, epsilon, s_current, actions) 32 | done = False 33 | episode_rewards = [] 34 | while not done: 35 | s_next, reward, done, info = env.step(action) 36 | # if action == 0: 37 | # print(action) 38 | episode_rewards.append(reward) 39 | if done: 40 | Q.update(alpha, reward, s_current, action) 41 | bhr_metric[i] = info[2] 42 | else: 43 | 
next_action = epsilon_greedy(Q, epsilon, s_next, actions) 44 | G = reward + gamma*Q(s_next, next_action) 45 | Q.update(alpha, G, s_current, action) 46 | s_current = s_next 47 | action = next_action 48 | rewards[i] = episode_rewards 49 | return rewards, bhr_metric 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /algorithms/reinforce_algorithm.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from state_approximations.linear_v_approximation import Baseline 4 | from policy_approximations.linear_policy_approximation import LinearPolicyApproximation 5 | 6 | def reinforce( 7 | env, #open-ai environment 8 | gamma:float, 9 | episodes, 10 | pi:LinearPolicyApproximation, 11 | V:Baseline): 12 | """ 13 | implement REINFORCE algorithm with and without baseline. 14 | 15 | input: 16 | env: target environment; openai gym 17 | gamma: discount factor 18 | num_episode: #episodes to iterate 19 | pi: policy 20 | V: baseline 21 | output: 22 | a list that includes the G_0 for every episodes. 23 | """ 24 | bhr_metric = {} 25 | return_rewards = {} 26 | for i in episodes: 27 | state = env.reset(i) 28 | done = False 29 | rewards = [0] 30 | states = [state] 31 | actions = [] 32 | episode_rewards = [] 33 | while not done: 34 | action = pi(state) 35 | state, r, done, info = env.step(action) 36 | rewards.append(r) 37 | episode_rewards.append(r) 38 | actions.append(action) 39 | if not done: 40 | states.append(state) 41 | else: 42 | bhr_metric[i] = info[2] 43 | return_rewards[i] = episode_rewards 44 | G = 0 45 | for t in range(len(states)): 46 | G += math.pow(gamma, t)*rewards[t + 1] 47 | delta = G - V(states[0]) 48 | V.update(states[0], G) 49 | pi.update(states[0], actions[0], 1, delta) 50 | i = 1 51 | gamma_t = 1 52 | while i < len(states): 53 | G = (G - rewards[i])/gamma 54 | gamma_t = gamma_t*gamma 55 | delta = G - V(states[i]) 56 | V.update(states[i], G) 57 | pi.update(states[i], actions[i], gamma_t, delta) 58 | i = i + 1 59 | 60 | return return_rewards, bhr_metric -------------------------------------------------------------------------------- /algorithms/true_online_sarsa_lambda.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | 5 | from state_action_approximations.one_d_tc import StateActionOneDTileCoding 6 | 7 | def TrueOnlineSarsaLambda( 8 | env, 9 | epsilon:float, # exploration factor 10 | gamma:float, # discount factor 11 | lam:float, # decay rate 12 | alpha:float, # step size 13 | X:StateActionOneDTileCoding, 14 | episodes, 15 | ) -> np.array: 16 | """ 17 | Implement True online Sarsa(\lambda) 18 | """ 19 | def epsilon_greedy_policy(s,done,w,epsilon=.0): 20 | nA = env.action_space.n 21 | Q = [np.dot(w, X(s,a,done)) for a in range(nA)] 22 | 23 | if np.random.rand() < epsilon: 24 | return np.random.randint(nA) 25 | else: 26 | return np.argmax(Q) 27 | 28 | w = np.zeros((X.feature_vector_len())) 29 | bhr_metric = {} 30 | rewards = {} 31 | #TODO: implement this function 32 | for i in episodes: 33 | state = env.reset(i) 34 | done = False 35 | eps = copy.deepcopy(epsilon) 36 | action = epsilon_greedy_policy(state, done, w, eps) 37 | q_old = 0 38 | x = X(state, action, done) 39 | z = np.zeros(shape=X.feature_vector_len()) 40 | t = 1 41 | episode_rewards = [] 42 | while not done: 43 | t+=1 44 | state_new, r, done, info = env.step(action) 45 | episode_rewards.append(r) 46 | action_new = 
epsilon_greedy_policy(state_new, done, w, epsilon) 47 | x_new = X(state_new, action_new, done) 48 | q = np.dot(w, x) 49 | q_new = np.dot(w, x_new) 50 | delta = r + gamma*q_new - q 51 | multiplier = alpha*lam*gamma*np.dot(z, x) 52 | z = gamma*lam*z + (1-multiplier)*x 53 | w += alpha*(delta + q - q_old)*z - alpha*(q - q_old)*x 54 | q_old = q_new 55 | x = x_new 56 | action = action_new 57 | eps/=t 58 | if done: 59 | bhr_metric[i] = info[2] 60 | rewards[i] = episode_rewards 61 | return rewards, bhr_metric -------------------------------------------------------------------------------- /algorithms/actor_critic_eligibility_trace_algorithm_nn.py: -------------------------------------------------------------------------------- 1 | # Actor Critic with Eligibility Traces using Neural Network Approximation for State and Linear Approximation Policy 2 | 3 | import numpy as np 4 | 5 | from state_approximations.nn_v_approximation import NNStateApproximation 6 | from policy_approximations.linear_policy_approximation import LinearPolicyApproximation 7 | 8 | def actor_critic_eligibility_trace_nn(env, gamma, alpha_theta, alpha_w, lambda_theta, lambda_w, 9 | V:NNStateApproximation, pi:LinearPolicyApproximation, 10 | episodes): 11 | bhr_metric = {} 12 | rewards = {} 13 | for i in episodes: 14 | s_current = env.reset(i) 15 | 16 | #print(s_current) 17 | #print("Feature length {}".format(list(V.model.model.weight.shape)) 18 | 19 | V_weights_shape = np.array(list(V.model.weight.shape)) 20 | pi_weights_shape = np.array(list(pi.model.model[0].weight.shape)) 21 | 22 | z_w = np.zeros(V_weights_shape) 23 | z_theta = np.zeros(pi_weights_shape) 24 | 25 | I = 1 26 | done = False 27 | episode_rewards = [] 28 | while not done: 29 | #s_current = [s_x /1000 for s_x in s_current] 30 | action = pi(s_current) 31 | s_next, reward, done, info = env.step(action) 32 | episode_rewards.append(reward) 33 | delta = reward + gamma*V(s_next)- V(s_current) 34 | z_w = gamma*lambda_w*z_w + V.return_gradient(s_current) 35 | z_theta = gamma*lambda_theta*z_theta + I*pi.return_gradient(s_current, action) 36 | # print("z_w = ", z_w) 37 | # print("z_theta = ", z_theta) 38 | # print("state: ", s_current) 39 | # print("pi.return_gradient ", pi.return_gradient(s_current, action)) 40 | V.manual_update(alpha_w*delta*z_w) 41 | pi.manual_update(alpha_theta*delta*z_theta) 42 | I = gamma*I 43 | s_current = s_next 44 | if done: 45 | bhr_metric[i] = info[2] 46 | rewards[i] = episode_rewards 47 | return rewards, bhr_metric 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /algorithms/actor_critic_eligibility_trace_algorithm_linear.py: -------------------------------------------------------------------------------- 1 | # Actor Critic with Eligibility Traces using Linear Approximation for State and Policy 2 | 3 | import numpy as np 4 | 5 | from state_approximations.linear_v_approximation import LinearStateApproximation 6 | from policy_approximations.linear_policy_approximation import LinearPolicyApproximation 7 | 8 | def actor_critic_eligibility_trace_linear(env, gamma, alpha_theta, alpha_w, lambda_theta, lambda_w, 9 | V:LinearStateApproximation, pi:LinearPolicyApproximation, 10 | episodes): 11 | bhr_metric = {} 12 | rewards = {} 13 | for i in episodes: 14 | s_current = env.reset(i) 15 | 16 | #print(s_current) 17 | #print("Feature length {}".format(list(V.model.model.weight.shape)) 18 | 19 | V_weights_shape = np.array(list(V.model.model.weight.shape)) 20 | pi_weights_shape = 
np.array(list(pi.model.model[0].weight.shape)) 21 | 22 | z_w = np.zeros(V_weights_shape) 23 | z_theta = np.zeros(pi_weights_shape) 24 | 25 | I = 1 26 | done = False 27 | episode_rewards = [] 28 | while not done: 29 | #s_current = [s_x /1000 for s_x in s_current] 30 | action = pi(s_current) 31 | s_next, reward, done, info = env.step(action) 32 | episode_rewards.append(reward) 33 | delta = reward + gamma*V(s_next)- V(s_current) 34 | z_w = gamma*lambda_w*z_w + V.return_gradient(s_current) 35 | z_theta = gamma*lambda_theta*z_theta + I*pi.return_gradient(s_current, action) 36 | # print("z_w = ", z_w) 37 | # print("z_theta = ", z_theta) 38 | # print("state: ", s_current) 39 | # print("pi.return_gradient ", pi.return_gradient(s_current, action)) 40 | V.manual_update(alpha_w*delta*z_w) 41 | pi.manual_update(alpha_theta*delta*z_theta) 42 | I = gamma*I 43 | s_current = s_next 44 | if done: 45 | bhr_metric[i] = info[2] 46 | rewards[i] = episode_rewards 47 | return rewards, bhr_metric 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /algorithms/optimal_algorithm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | from trace_loader import load_traces 4 | 5 | def get_next_access(id, current_time, next_access_times): 6 | if id not in next_access_times: 7 | return 20000 8 | else: 9 | for access_time in next_access_times[id]: 10 | if(access_time > current_time): 11 | return access_time 12 | return 20000 13 | 14 | def pre_process(ids): 15 | next_access_times = defaultdict(list) 16 | for time_step, id in enumerate(ids): 17 | next_access_times[id].append(time_step) 18 | return next_access_times 19 | 20 | def optimal_admission(episodes, cache_size, trace='test'): 21 | rewards = {} 22 | bhr_metric = {} 23 | for i in episodes: 24 | current_cache = {} 25 | bhr = 0 26 | ids = list(load_traces(trace, cache_size, i)[1]) 27 | next_access_times = pre_process(ids) 28 | episode_rewards = [] 29 | hits_since_previous_miss = 0 30 | for time_step, id in enumerate(ids): 31 | if id in current_cache: 32 | bhr += 1 33 | hits_since_previous_miss += 1 34 | else: 35 | episode_rewards.append(hits_since_previous_miss) 36 | hits_since_previous_miss = 0 37 | if len(current_cache) < cache_size: 38 | current_cache[id] = 1 39 | else: 40 | max_next_access = time_step 41 | max_next_access_id = -1 42 | for element in current_cache.keys(): 43 | next_access = get_next_access(element, time_step, next_access_times) 44 | if(next_access > max_next_access): 45 | max_next_access = next_access 46 | max_next_access_id = element 47 | if max_next_access > get_next_access(id, time_step, next_access_times): 48 | del current_cache[max_next_access_id] 49 | current_cache[id] = 1 50 | bhr_metric[i] = bhr/len(ids) 51 | rewards[i] = episode_rewards 52 | return rewards, bhr_metric 53 | -------------------------------------------------------------------------------- /algorithms/actor_critic_eligibility_trace_algorithm_tc.py: -------------------------------------------------------------------------------- 1 | # Actor Critic with Eligibility Traces using Linear Approximation for State and Policy 2 | 3 | import numpy as np 4 | 5 | from state_approximations.one_d_tc import StateOneDTileCoding 6 | from policy_approximations.linear_policy_approximation import LinearPolicyApproximation 7 | 8 | def actor_critic_eligibility_trace_tc(env, gamma, alpha_theta, alpha_w, lambda_theta, lambda_w, 9 | V:StateOneDTileCoding, 
pi:LinearPolicyApproximation, 10 | episodes): 11 | bhr_metric = {} 12 | rewards = {} 13 | for i in episodes: 14 | s_current = env.reset(i) 15 | 16 | #print(s_current) 17 | print("Feature length {}".format(V.feature_vector_len())) 18 | 19 | V_weights_shape = np.array(V.feature_vector_len()) 20 | pi_weights_shape = np.array(list(pi.model.model[0].weight.shape)) 21 | 22 | z_w = np.zeros(V_weights_shape) 23 | z_theta = np.zeros(pi_weights_shape) 24 | 25 | V_weights = np.zeros(V_weights_shape) 26 | 27 | I = 1 28 | done = False 29 | episode_rewards = [] 30 | while not done: 31 | #s_current = [s_x /1000 for s_x in s_current] 32 | action = pi(s_current) 33 | s_next, reward, done, info = env.step(action) 34 | episode_rewards.append(reward) 35 | delta = reward + gamma*np.dot(V(s_next, done), V_weights)- np.dot(V(s_current,done), V_weights) 36 | z_w = gamma*lambda_w*z_w + V(s_current,done) 37 | z_theta = gamma*lambda_theta*z_theta + I*pi.return_gradient(s_current, action) 38 | # print("z_w = ", z_w) 39 | # print("z_theta = ", z_theta) 40 | # print("state: ", s_current) 41 | # print("pi.return_gradient ", pi.return_gradient(s_current, action)) 42 | V_weights += alpha_w*delta*z_w 43 | pi.manual_update(alpha_theta*delta*z_theta) 44 | I = gamma*I 45 | s_current = s_next 46 | if done: 47 | bhr_metric[i] = info[2] 48 | rewards[i] = episode_rewards 49 | return rewards, bhr_metric 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /state_action_approximations/linear_q_approximation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from state_action_approximations.state_action_approximation import StateActionApproximation 5 | from torch import nn 6 | 7 | class NeuralNetwork(nn.Module): 8 | def __init__(self, dims): 9 | super(NeuralNetwork, self).__init__() 10 | self.model = nn.Linear(dims, 1) 11 | 12 | def forward(self, x): 13 | return self.model(x) 14 | 15 | class LinearStateActionApproximation(StateActionApproximation): 16 | def create_model(self): 17 | self.model = NeuralNetwork(self.state_dims * self.num_actions) 18 | self.loss_fn = nn.MSELoss() 19 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.alpha) 20 | 21 | def __init__(self, 22 | state_dims, num_actions, alpha): 23 | """ 24 | state_dims: the number of dimensions of state space 25 | """ 26 | # TODO: implement this method 27 | self.state_dims = state_dims 28 | self.num_actions = num_actions 29 | self.alpha = alpha 30 | self.create_model() 31 | 32 | def get_input(self, s, a): 33 | # if type(a) == int: 34 | # act = [a] 35 | input = np.zeros(shape = self.state_dims*self.num_actions) 36 | for i in range(self.state_dims): 37 | input[a*self.state_dims + i] = s[i] 38 | return torch.tensor(input) 39 | 40 | def __call__(self, s, a): 41 | # TODO: implement this method 42 | self.model.eval() 43 | input = self.get_input(s, a) 44 | pred = self.model(input.float()) 45 | return pred.detach().numpy()[0] 46 | 47 | def update(self, alpha, G, s, a): 48 | # TODO: implement this method 49 | self.model.train() 50 | input = self.get_input(s, a) 51 | pred = self.model(input.float()) 52 | G = torch.tensor([G], dtype=torch.float32) 53 | loss = 0.5 * self.loss_fn(pred, G) 54 | self.optimizer.zero_grad() 55 | loss.backward() 56 | self.optimizer.step() 57 | -------------------------------------------------------------------------------- /state_approximations/linear_v_approximation.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | 5 | class NeuralNetworkVA(nn.Module): 6 | def __init__(self, dims): 7 | super(NeuralNetworkVA, self).__init__() 8 | self.model = nn.Linear(dims, 1) 9 | 10 | def forward(self, x): 11 | return self.model(x) 12 | 13 | class Baseline(object): 14 | """ 15 | The dumbest baseline; a constant for every state 16 | """ 17 | def __init__(self,b): 18 | self.b = b 19 | 20 | def __call__(self,s) -> float: 21 | return self.b 22 | 23 | def update(self,s,G): 24 | pass 25 | 26 | class LinearStateApproximation(Baseline): 27 | 28 | def create_model(self): 29 | self.model = NeuralNetworkVA(self.state_dims) 30 | self.loss_fn = nn.MSELoss() 31 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr = self.alpha, betas=(0.9, 0.999)) 32 | 33 | def __init__(self, 34 | state_dims, 35 | alpha): 36 | """ 37 | state_dims: the number of dimensions of state space 38 | alpha: learning rate 39 | """ 40 | self.state_dims = state_dims 41 | self.alpha = alpha 42 | self.create_model() 43 | 44 | def __call__(self,s) -> float: 45 | self.model.eval() 46 | s = torch.tensor(s) 47 | pred = self.model(s.float()) 48 | return pred.detach().numpy()[0] 49 | 50 | def update(self,s,G): 51 | self.model.train() 52 | s = torch.tensor(s) 53 | pred = self.model(s.float()) 54 | G = torch.tensor([G], dtype=torch.float32) 55 | loss = 0.5 * self.loss_fn(pred, G) 56 | self.optimizer.zero_grad() 57 | loss.backward() 58 | self.optimizer.step() 59 | 60 | def return_gradient(self, s): 61 | self.model.train() 62 | s = torch.tensor(s) 63 | pred = self.model(s.float()) 64 | self.model.zero_grad() 65 | pred.backward() 66 | grad = self.model.model.weight.grad.numpy() 67 | #print("grad: ", grad) 68 | return grad 69 | 70 | def manual_update(self, update_vector): 71 | with torch.no_grad(): 72 | update_vector = torch.tensor(update_vector) 73 | self.model.model.weight += update_vector 74 | #print("Weights : ", self.model.model[0].weight) -------------------------------------------------------------------------------- /state_action_approximations/nn_q_approximation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from state_action_approximations.state_action_approximation import StateActionApproximation 5 | from collections import OrderedDict 6 | from torch import nn 7 | 8 | 9 | class NeuralNetwork(nn.Module): 10 | def __init__(self, dims): 11 | super(NeuralNetwork, self).__init__() 12 | self.model = nn.Sequential(OrderedDict([('fc1', nn.Linear(dims, dims)), 13 | ('fc2', nn.Linear(dims, 1))])) 14 | 15 | def forward(self, x): 16 | return self.model(x) 17 | 18 | class NeuralNetworkStateActionApproximation(StateActionApproximation): 19 | def create_model(self): 20 | self.model = NeuralNetwork(self.state_dims * self.num_actions) 21 | self.loss_fn = nn.MSELoss() 22 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.alpha) 23 | 24 | def __init__(self, 25 | state_dims, num_actions, alpha): 26 | """ 27 | state_dims: the number of dimensions of state space 28 | """ 29 | # TODO: implement this method 30 | self.state_dims = state_dims 31 | self.num_actions = num_actions 32 | self.alpha = alpha 33 | self.create_model() 34 | 35 | def get_input(self, s, a): 36 | # if type(a) == int: 37 | # act = [a] 38 | input = np.zeros(shape = self.state_dims*self.num_actions) 39 | for i in range(self.state_dims): 40 | 
input[a*self.state_dims + i] = s[i] 41 | return torch.tensor(input) 42 | 43 | def __call__(self, s, a): 44 | # TODO: implement this method 45 | self.model.eval() 46 | input = self.get_input(s, a) 47 | pred = self.model(input.float()) 48 | return pred.detach().numpy()[0] 49 | 50 | def update(self, alpha, G, s, a): 51 | # TODO: implement this method 52 | self.model.train() 53 | input = self.get_input(s, a) 54 | pred = self.model(input.float()) 55 | G = torch.tensor([G], dtype=torch.float32) 56 | loss = 0.5 * self.loss_fn(pred, G) 57 | self.optimizer.zero_grad() 58 | loss.backward() 59 | self.optimizer.step() 60 | -------------------------------------------------------------------------------- /algorithms/actor_critic_one_step.py: -------------------------------------------------------------------------------- 1 | # Actor Critic with Eligibility Traces using Linear Approximation for State and Policy 2 | 3 | import numpy as np 4 | 5 | from state_approximations.linear_v_approximation import LinearStateApproximation 6 | from state_approximations.nn_v_approximation import NNStateApproximation 7 | from policy_approximations.linear_policy_approximation import LinearPolicyApproximation 8 | 9 | def actor_critic_one_step_nn(env, gamma, alpha_theta, alpha_w, 10 | V:NNStateApproximation, pi:LinearPolicyApproximation, 11 | episodes): 12 | bhr_metric = {} 13 | rewards = {} 14 | for i in episodes: 15 | s_current = env.reset(i) 16 | I = 1 17 | done = False 18 | episode_rewards = [] 19 | while not done: 20 | action = pi(s_current) 21 | s_next, reward, done, info = env.step(action) 22 | episode_rewards.append(reward) 23 | delta = reward + gamma*V(s_next)- V(s_current) 24 | V.manual_update(V.return_gradient(s_current)*delta*alpha_w) 25 | pi.manual_update(pi.return_gradient(s_current, action)*alpha_theta*I*delta) 26 | I = gamma*I 27 | s_current = s_next 28 | if done: 29 | bhr_metric[i] = info[2] 30 | rewards[i] = episode_rewards 31 | return rewards, bhr_metric 32 | 33 | def actor_critic_one_step(env, gamma, alpha_theta, alpha_w, 34 | V:LinearStateApproximation, pi:LinearPolicyApproximation, 35 | episodes): 36 | bhr_metric = {} 37 | rewards = {} 38 | for i in episodes: 39 | s_current = env.reset(i) 40 | I = 1 41 | done = False 42 | episode_rewards = [] 43 | while not done: 44 | action = pi(s_current) 45 | s_next, reward, done, info = env.step(action) 46 | episode_rewards.append(reward) 47 | delta = reward + gamma*V(s_next)- V(s_current) 48 | V.manual_update(V.return_gradient(s_current)*delta*alpha_w) 49 | pi.manual_update(pi.return_gradient(s_current, action)*alpha_theta*I*delta) 50 | I = gamma*I 51 | s_current = s_next 52 | if done: 53 | bhr_metric[i] = info[2] 54 | rewards[i] = episode_rewards 55 | return rewards, bhr_metric 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /state_approximations/nn_v_approximation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from torch import nn 5 | from collections import OrderedDict 6 | 7 | class NeuralNetworkVA(nn.Module): 8 | def __init__(self, dims): 9 | super(NeuralNetworkVA, self).__init__() 10 | self.model = nn.Sequential(OrderedDict([('fc1', nn.Linear(dims, dims)), 11 | ('fc2', nn.Linear(dims, 1))])) 12 | 13 | def forward(self, x): 14 | return self.model(x) 15 | 16 | class Baseline(object): 17 | """ 18 | The dumbest baseline; a constant for every state 19 | """ 20 | def __init__(self,b): 21 | self.b = b 22 | 23 | def 
__call__(self,s) -> float: 24 | return self.b 25 | 26 | def update(self,s,G): 27 | pass 28 | 29 | class NNStateApproximation(Baseline): 30 | 31 | def create_model(self): 32 | self.model = NeuralNetworkVA(self.state_dims) 33 | self.loss_fn = nn.MSELoss() 34 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr = self.alpha, betas=(0.9, 0.999)) 35 | 36 | def __init__(self, 37 | state_dims, 38 | alpha): 39 | """ 40 | state_dims: the number of dimensions of state space 41 | alpha: learning rate 42 | """ 43 | self.state_dims = state_dims 44 | self.alpha = alpha 45 | self.create_model() 46 | 47 | def __call__(self,s) -> float: 48 | self.model.eval() 49 | s = torch.tensor(s) 50 | pred = self.model(s.float()) 51 | return pred.detach().numpy()[0] 52 | 53 | def update(self,s,G): 54 | self.model.train() 55 | s = torch.tensor(s) 56 | pred = self.model(s.float()) 57 | G = torch.tensor([G], dtype=torch.float32) 58 | loss = 0.5 * self.loss_fn(pred, G) 59 | self.optimizer.zero_grad() 60 | loss.backward() 61 | self.optimizer.step() 62 | 63 | def return_gradient(self, s): 64 | self.model.train() 65 | s = torch.tensor(s) 66 | pred = self.model(s.float()) 67 | self.model.zero_grad() 68 | pred.backward() 69 | grad = self.model.model.weight.grad.numpy() 70 | #print("grad: ", grad) 71 | return grad 72 | 73 | def manual_update(self, update_vector): 74 | with torch.no_grad(): 75 | update_vector = torch.tensor(update_vector) 76 | self.model.model.weight += update_vector 77 | #print("Weights : ", self.model.model[0].weight) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | trace/ 30 | .idea/ 31 | rlcar/ 32 | logs/ 33 | .DS_Store 34 | run.sh 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | *.csv 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /policy_approximations/linear_policy_approximation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | from collections import OrderedDict 5 | 6 | class NeuralNetworkPA(nn.Module): 7 | def __init__(self, dims, outputs): 8 | super(NeuralNetworkPA, self).__init__() 9 | self.model = nn.Sequential(OrderedDict([('fc1', nn.Linear(dims, outputs)), 10 | ('act1', nn.Softmax(dim=0))])) 11 | 12 | def forward(self, x): 13 | return self.model(x) 14 | 15 | class LinearPolicyApproximation(): 16 | 17 | def create_model(self): 18 | self.model = NeuralNetworkPA(self.state_dims, self.num_actions) 19 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.alpha, betas=(0.9, 0.999)) 20 | 21 | 22 | def __init__(self, 23 | state_dims, 24 | num_actions, 25 | alpha): 26 | """ 27 | state_dims: the number of dimensions of state space 28 | action_dims: the number of possible actions 29 | alpha: learning rate 30 | """ 31 | self.state_dims = state_dims 32 | self.num_actions = num_actions 33 | self.alpha = alpha 34 | self.create_model() 35 | 36 | def __call__(self, s) -> int: 37 | self.model.eval() 38 | s = torch.tensor(s) 39 | pred = self.model(s.float()) 40 | action_probs = pred.detach().numpy() 41 | action = np.random.choice(np.arange(len(action_probs)), p=action_probs) 42 | return action 43 | 44 | def update(self, s, a, gamma_t, delta): 45 | """ 46 | s: state S_t 47 | a: action A_t 48 | gamma_t: gamma^t 49 | delta: G-v(S_t,w) 50 | """ 51 | self.model.train() 52 | s = torch.tensor(s) 53 | pred = self.model(s.float()) 54 | log_prob = torch.log(pred)[a].unsqueeze(0) 55 | loss = - delta * gamma_t * log_prob 56 | self.optimizer.zero_grad() 57 | loss.backward() 58 | self.optimizer.step() 59 | 60 | def return_gradient(self, s, a): 61 | self.model.train() 62 | s = torch.tensor(s) 63 | pred = self.model(s.float()) 64 | self.model.zero_grad() 65 | log_prob = torch.log(pred)[a].unsqueeze(0) 66 | log_prob.backward() 67 | grad = self.model.model[0].weight.grad.numpy() 68 | #print("grad: ", grad) 69 | return grad 70 | 71 | def manual_update(self, update_vector): 72 | with torch.no_grad(): 73 | update_vector = torch.tensor(update_vector) 74 | self.model.model[0].weight += update_vector 75 | #print("Weights : ", self.model.model[0].weight) 76 | 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RLCaR 2 | Reinforcement Learning for Cache 
admission and Replacement 3 | 4 | We propose to apply reinforcement learning on caching systems. The first problem 5 | we consider is to decide whether we want to admit an object in the cache, when 6 | an object request leads to a cache miss. While cache replacement policies have 7 | received significant traction over the years, most systems use simple LRU for 8 | eviction, without explicit admission algorithms. The optimal algorithm for solving 9 | cache admission will require access to future requests, thus making it impractical. 10 | We train an RL agent to give a binary decision of admit/don’t admit for each cache 11 | miss. We show that using our RL agent gives a higher byte hit rate compared 12 | to always admitting on a cache miss or using a random policy to admit an item 13 | in the cache when LRU (Least Recently Used) is used as the cache replacement 14 | policy. The next problem that we consider is the more common problem of cache 15 | replacement, i.e, deciding which object to evict from the cache on a miss. We model 16 | this as an adversarial bandit problem, treating LRU, LFU (Least Frequently Used) 17 | and FIFO (First In First Out) as experts, and solve it using the Hedge algorithm, 18 | assuming full feedback. We show that the algorithm eventually converges to the 19 | best expert. Our experiments are based on a simulated environment, where the 20 | cache traces are generated using a Zip-f distribution, which has been widely used 21 | in simulations. 22 | 23 | ## Environment Setup 24 | Create a new python environment and install dependencies using `pip3 install -r requirements.txt` 25 | 26 | ## Instructions to Run 27 | To run with default arguments : `python3 main.py` 28 | 29 | Arguments : 30 | + `-ne NUM_EPISODES, --num_episodes NUM_EPISODES 31 | Number of episodes` 32 | 33 | + `-nr NUM_REPETITIONS, --num_repetitions NUM_REPETITIONS 34 | Number of repetitions` 35 | 36 | + `-fa FUNCTION_APPROXIMATION, --function_approximation FUNCTION_APPROXIMATION 37 | function approximation to use [linear, tc, nn]` 38 | 39 | + `-n_steps N_STEPS, --n_steps N_STEPS 40 | number of steps in sarsa` 41 | 42 | + `-lam LAM, --lam LAM lambda in sarsa` 43 | 44 | + `-rl RL_ALGO, --rl_algo RL_ALGO 45 | rl algorithm to use [always_evict, random_eviction, actor_critic, n_step_sarsa, optimal, sarsa_lambda]` 46 | 47 | + `-policy POLICY, --policy POLICY 48 | cache replacement policy space separated [LRU, LFU, FIFO]` 49 | 50 | + `-ts TEST_SIZE, --test_size TEST_SIZE 51 | test size` 52 | 53 | + `-cs CACHE_SIZE, --cache_size CACHE_SIZE 54 | cache size` 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /replacement_policies/lfu.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import defaultdict, Counter 3 | from replacement_policies.policy_base import PolicyBase 4 | 5 | class LFUCache(PolicyBase): 6 | 7 | def __init__(self, capacity: int): 8 | self.capacity = capacity 9 | self.object_to_count = {} 10 | self.count_to_object = defaultdict(defaultdict) 11 | self.min_count = None 12 | self.history = [] 13 | self.history_dict = Counter() 14 | 15 | def update(self, key: int, val: int): 16 | if key in self.object_to_count: 17 | count = self.object_to_count[key] 18 | self.object_to_count[key] += 1 19 | size = self.count_to_object[count][key] 20 | del self.count_to_object[count][key] 21 | self.count_to_object[count + 1][key] = size 22 | if not 
self.count_to_object[self.min_count]: 23 | self.min_count += 1 24 | is_present = (key in self.history_dict) 25 | return is_present 26 | 27 | def get_remove_candidate(self): 28 | return next(iter(self.count_to_object[self.min_count])) 29 | 30 | def update_history(self): 31 | candidate = self.get_remove_candidate() 32 | if (len(self.history) >= self.capacity): 33 | remove_item = self.history.pop(0) 34 | if(self.history_dict[remove_item] == 1): 35 | del self.history_dict[remove_item] 36 | else: 37 | self.history_dict[remove_item] -= 1 38 | self.history.append(candidate) 39 | self.history_dict[candidate] += 1 40 | 41 | def remove(self): 42 | self.update_history() 43 | count_dictionary = self.count_to_object[self.min_count] 44 | key = random.choice(list(count_dictionary.keys())) 45 | val = count_dictionary[key] 46 | del self.count_to_object[self.min_count][key] 47 | del self.object_to_count[key] 48 | return key ,val 49 | 50 | def put(self, key: int, value : int) -> None: 51 | if len(self.object_to_count) >= self.capacity: 52 | self.remove() 53 | self.min_count = 1 54 | self.object_to_count[key] = 1 55 | self.count_to_object[1][key] = value 56 | 57 | def remove_key(self, key): 58 | self.update_history() 59 | if key in self.object_to_count: 60 | count = self.object_to_count[key] 61 | del self.object_to_count[key] 62 | del self.count_to_object[count][key] 63 | if not self.count_to_object[self.min_count]: 64 | self.min_count += 1 65 | 66 | def reset(self): 67 | self.object_to_count = {} 68 | self.count_to_object = defaultdict(defaultdict) 69 | self.min_count = None 70 | self.history = [] 71 | self.history_dict = Counter() -------------------------------------------------------------------------------- /replacement_agent.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from replacement_policies.lru import LRUCache 6 | from replacement_policies.lfu import LFUCache 7 | from replacement_policies.fifo import FifoCache 8 | from utils.weights_logger import WeightLogger 9 | 10 | class ReplacementAgent: 11 | def __init__(self, capacity, policies, episode_index): 12 | self.capacity = capacity 13 | self.policies = policies 14 | self.experts = [] 15 | self.num_experts = len(policies) 16 | self.current_expert = 0 17 | self.hit_reward = 1 18 | self.miss_reward = -0.5 19 | self.epsilon = 0.1 20 | if "LRU" in policies: 21 | self.experts.append(LRUCache(capacity)) 22 | if "LFU" in policies: 23 | self.experts.append(LFUCache(capacity)) 24 | if "FIFO" in policies: 25 | self.experts.append(FifoCache(capacity)) 26 | self.running_reward = 0 27 | 28 | self.weights = np.ones(shape=self.num_experts) 29 | self.reward = np.zeros(shape=self.num_experts) 30 | self.weight_logger = WeightLogger() 31 | self.tick = 1 32 | self.episode_index = episode_index 33 | 34 | def update(self, key: int, obj_size) -> None: 35 | for index, expert in enumerate(self.experts): 36 | is_present_in_history = expert.update(key, obj_size) 37 | if is_present_in_history: 38 | self.reward[index] = self.miss_reward 39 | else: 40 | self.reward[index] = self.hit_reward 41 | if(self.num_experts > 1): 42 | self.weight_update() 43 | 44 | def remove(self): 45 | current_expert = int(np.random.choice(np.arange(self.num_experts), 1, p=self.weights/np.sum(self.weights))) 46 | key ,val = self.experts[current_expert].remove() 47 | for i in range(self.num_experts): 48 | if i != current_expert: 49 | self.experts[i].remove_key(key) 50 | self.running_reward = 0 51 | return key, val 52 
| 53 | def put(self, key: int, value : int) -> None: 54 | for expert in self.experts: 55 | expert.put(key, value) 56 | self.update(key, value) 57 | 58 | def weight_update(self): 59 | for index, expert in enumerate(self.experts): 60 | self.weights[index] *= math.pow(1 + self.epsilon, self.reward[index]) 61 | #print("weights = ", self.weights) 62 | self.weights = self.weights / np.sum(self.weights) 63 | self.weight_logger.update_weights(self.weights, self.tick) 64 | self.tick += 1 65 | 66 | def reset(self, index): 67 | for expert in self.experts: 68 | expert.reset() 69 | self.weights = self.weights/np.sum(self.weights) 70 | self.current_expert = 0 71 | self.weight_logger.end(self.episode_index) 72 | self.episode_index = index -------------------------------------------------------------------------------- /state_approximations/one_d_tc.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | class StateOneDTileCoding(): 4 | 5 | def get_state(self, state): 6 | states = [] 7 | for tile_index in range(self.num_tilings): 8 | indices = [] 9 | previous_index = 0 10 | for i in range(self.tiling_dimensions): 11 | start = self.state_low[i] - tile_index * self.tile_width[i]/self.num_tilings 12 | diff = state[i] - start 13 | index = math.floor(diff / self.tile_width[i]) 14 | index = min(index, self.tiles_per_dim[i] - 1) 15 | index += previous_index 16 | indices.append(index) 17 | previous_index += self.tiles_per_dim[i] 18 | states.append(indices) 19 | return states 20 | 21 | def __init__(self, 22 | state_low:np.array, 23 | state_high:np.array, 24 | num_tilings:int, 25 | tile_width:np.array): 26 | """ 27 | state_low: possible minimum value for each dimension in state 28 | state_high: possible maimum value for each dimension in state 29 | num_actions: the number of possible actions 30 | num_tilings: # tilings 31 | tile_width: tile width for each dimension 32 | """ 33 | self.num_tilings = num_tilings 34 | self.tile_width = tile_width 35 | self.state_low = state_low 36 | self.state_high = state_high 37 | self.tiling_dimensions = len(self.state_low) 38 | tiles_per_dim = np.zeros(shape=self.tiling_dimensions, dtype=np.int64) 39 | for i in range(self.tiling_dimensions): 40 | tiles_per_dim[i] = math.ceil((self.state_high[i] - self.state_low[i]) /self.tile_width[i]) + 1 41 | self.tiles_per_dim = tiles_per_dim 42 | self.num_tiles_per_tiling = np.sum(self.tiles_per_dim) 43 | self.feature_dims = self.num_tilings*self.num_tiles_per_tiling 44 | self.feature_array = np.zeros(shape=(2)) 45 | self.feature_array[0] = self.num_tilings 46 | self.feature_array[1] = self.num_tiles_per_tiling 47 | self.feature_array = self.feature_array.astype(int) 48 | 49 | def feature_vector_len(self) -> int: 50 | """ 51 | return dimension of feature_vector: d = num_actions * num_tilings * num_tiles 52 | """ 53 | return self.feature_dims 54 | 55 | def __call__(self, s, done) -> np.array: 56 | """ 57 | implement function x: S+ x A -> [0,1]^d 58 | if done is True, then return 0^d 59 | """ 60 | # feature = np.zeros(shape=self.feature_vector_len()) 61 | feature = np.zeros(tuple(self.feature_array)) 62 | if done: 63 | return feature.flatten() 64 | active_states = self.get_state(s) 65 | for i in range(len(active_states)): 66 | feature[i][active_states[i]] = 1 67 | return feature.flatten() -------------------------------------------------------------------------------- /trace_generator.py: -------------------------------------------------------------------------------- 1 | 
import matplotlib.pyplot as plt 2 | import numpy as np 3 | import math 4 | 5 | import pandas as pd 6 | 7 | from trace_loader import load_traces 8 | from collections import Counter 9 | 10 | def analyse_trace(index): 11 | trace = load_traces('test', 20, index) 12 | 13 | counts = Counter() 14 | total_count = 0 15 | for index, row in trace.iterrows(): 16 | counts[row[1]]+=1 17 | total_count+=1 18 | for key in counts: 19 | counts[key] /= total_count 20 | 21 | print("Total Count : ", total_count) 22 | print("Total Unique Objects : ", len(counts)) 23 | 24 | real_probabilities = np.sort(np.array(list(counts.values())))[::-1] 25 | 26 | predicted_probabilities = np.zeros(len(counts)) 27 | for i in range(1,len(counts)+1): 28 | predicted_probabilities[i-1] = math.pow(i, -(0.939)) 29 | predicted_probabilities /= np.sum(predicted_probabilities) 30 | 31 | # print("real probabilities : ", real_probabilities) 32 | # print("predicted probabilities : ", predicted_probabilities) 33 | 34 | # plt.figure() 35 | # plt.plot(real_probabilities, label='real_probabilities') 36 | # plt.plot(predicted_probabilities, label='predicted_probabilities') 37 | # plt.legend() 38 | # plt.show() 39 | 40 | def generate_zipf(alpha, total_requests, unique_requests): 41 | predicted_probabilities = np.zeros(unique_requests) 42 | for i in range(1, unique_requests + 1): 43 | predicted_probabilities[i - 1] = math.pow(i, -alpha) 44 | predicted_probabilities /= np.sum(predicted_probabilities) 45 | 46 | trace = np.random.choice(np.arange(unique_requests), p=predicted_probabilities, size=total_requests) 47 | trace_dict = {"timestamp" : np.arange(total_requests), 48 | "id" : trace, 49 | "obj_size" : np.ones(total_requests)} 50 | return pd.DataFrame(trace_dict) 51 | 52 | def generate_lru_optimal(total_requests, unique_requests): 53 | trace_subarray = np.zeros(2*unique_requests) 54 | trace_subarray[:unique_requests] = np.arange(unique_requests) 55 | trace_subarray[unique_requests:] = np.flip(np.arange(unique_requests)) 56 | trace = np.repeat(trace_subarray, int(total_requests/(2*unique_requests)) + 1) 57 | trace = trace[:total_requests] 58 | trace_dict = {"timestamp": np.arange(total_requests), 59 | "id": trace, 60 | "obj_size": np.ones(total_requests)} 61 | return pd.DataFrame(trace_dict) 62 | 63 | alpha = 0.5 64 | num_request = 100000 65 | unique_requests = 9000 66 | 67 | for i in range(1): 68 | trace_df = generate_zipf(alpha, num_request, unique_requests) 69 | file_name = "trace/zipf_{}/trace_0.tr".format(alpha, i) 70 | trace_df.to_csv(file_name, index=False, header=False, sep=" ") 71 | 72 | # for i in range(1): 73 | # trace_df = generate_lru_optimal(num_request, unique_requests) 74 | # file_name = "trace/lru_optimal/trace_{}.tr".format(i) 75 | # trace_df.to_csv(file_name, index=False, header=False, sep=" ") 76 | -------------------------------------------------------------------------------- /state_action_approximations/tile_coding_state_action.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | class StateActionFeatureVectorWithTile(): 4 | 5 | def get_state(self, state): 6 | states = [] 7 | for tile_index in range(self.num_tilings): 8 | indices = [tile_index] 9 | for i in range(self.tiling_dimensions): 10 | start = self.state_low[i] - tile_index * self.tile_width[i]/self.num_tilings 11 | diff = state[i] - start 12 | index = math.floor(diff / self.tile_width[i]) 13 | index = min(index, self.tiles_per_dim[i] - 1) 14 | indices.append(index) 15 | 
states.append(indices) 16 | return states 17 | 18 | def __init__(self, 19 | state_low:np.array, 20 | state_high:np.array, 21 | num_actions:int, 22 | num_tilings:int, 23 | tile_width:np.array): 24 | """ 25 | state_low: possible minimum value for each dimension in state 26 | state_high: possible maimum value for each dimension in state 27 | num_actions: the number of possible actions 28 | num_tilings: # tilings 29 | tile_width: tile width for each dimension 30 | """ 31 | self.num_tilings = num_tilings 32 | self.tile_width = tile_width 33 | self.state_low = state_low 34 | self.state_high = state_high 35 | self.num_actions = num_actions 36 | self.tiling_dimensions = len(self.state_low) 37 | tiles_per_dim = np.zeros(shape=self.tiling_dimensions, dtype=np.int64) 38 | for i in range(self.tiling_dimensions): 39 | tiles_per_dim[i] = math.ceil((self.state_high[i] - self.state_low[i]) /self.tile_width[i]) + 1 40 | self.tiles_per_dim = tiles_per_dim 41 | self.num_tiles_per_tiling = np.prod(self.tiles_per_dim) 42 | self.feature_dims = self.num_actions * self.num_tilings*self.num_tiles_per_tiling 43 | self.feature_array = np.zeros(shape=(2 + self.tiling_dimensions)) 44 | self.feature_array[0] = self.num_actions 45 | self.feature_array[1] = self.num_tilings 46 | self.feature_array[2:] = self.tiles_per_dim 47 | self.feature_array = self.feature_array.astype(int) 48 | 49 | def feature_vector_len(self) -> int: 50 | """ 51 | return dimension of feature_vector: d = num_actions * num_tilings * num_tiles 52 | """ 53 | return self.feature_dims 54 | 55 | def __call__(self, s, done, a) -> np.array: 56 | """ 57 | implement function x: S+ x A -> [0,1]^d 58 | if done is True, then return 0^d 59 | """ 60 | # feature = np.zeros(shape=self.feature_vector_len()) 61 | feature = np.zeros(tuple(self.feature_array)) 62 | if done: 63 | return feature.flatten() 64 | active_states = self.get_state(s) 65 | for i in range(len(active_states)): 66 | feature[a][tuple(active_states[i])] = 1 67 | return feature.flatten() -------------------------------------------------------------------------------- /state_action_approximations/one_d_tc.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | class StateActionOneDTileCoding(): 5 | 6 | def get_state(self, state): 7 | states = [] 8 | for tile_index in range(self.num_tilings): 9 | indices = [] 10 | previous_index = 0 11 | for i in range(self.tiling_dimensions): 12 | start = self.state_low[i] - tile_index * self.tile_width[i]/self.num_tilings 13 | diff = state[i] - start 14 | index = math.floor(diff / self.tile_width[i]) 15 | index = min(index, self.tiles_per_dim[i] - 1) 16 | index += previous_index 17 | indices.append(index) 18 | previous_index += self.tiles_per_dim[i] 19 | states.append(indices) 20 | return states 21 | 22 | def __init__(self, 23 | state_low:np.array, 24 | state_high:np.array, 25 | num_tilings:int, 26 | tile_width:np.array, 27 | num_actions:int): 28 | """ 29 | state_low: possible minimum value for each dimension in state 30 | state_high: possible maimum value for each dimension in state 31 | num_actions: the number of possible actions 32 | num_tilings: # tilings 33 | tile_width: tile width for each dimension 34 | """ 35 | self.num_tilings = num_tilings 36 | self.tile_width = tile_width 37 | self.state_low = state_low 38 | self.state_high = state_high 39 | self.tiling_dimensions = len(self.state_low) 40 | self.num_actions = num_actions 41 | tiles_per_dim = np.zeros(shape=self.tiling_dimensions, 
dtype=np.int64) 42 | for i in range(self.tiling_dimensions): 43 | tiles_per_dim[i] = math.ceil((self.state_high[i] - self.state_low[i]) /self.tile_width[i]) + 1 44 | self.tiles_per_dim = tiles_per_dim 45 | self.num_tiles_per_tiling = np.sum(self.tiles_per_dim) 46 | self.feature_dims = self.num_tilings*self.num_tiles_per_tiling*self.num_actions 47 | self.feature_array = np.zeros(shape=(3)) 48 | self.feature_array[0] = self.num_actions 49 | self.feature_array[1] = self.num_tilings 50 | self.feature_array[2] = self.num_tiles_per_tiling 51 | self.feature_array = self.feature_array.astype(int) 52 | 53 | def feature_vector_len(self) -> int: 54 | """ 55 | return dimension of feature_vector: d = num_actions * num_tilings * num_tiles 56 | """ 57 | return self.feature_dims 58 | 59 | def __call__(self, s, a, done) -> np.array: 60 | """ 61 | implement function x: S+ x A -> [0,1]^d 62 | if done is True, then return 0^d 63 | """ 64 | # feature = np.zeros(shape=self.feature_vector_len()) 65 | feature = np.zeros(tuple(self.feature_array)) 66 | if done: 67 | return feature.flatten() 68 | active_states = self.get_state(s) 69 | for i in range(len(active_states)): 70 | feature[a][i][active_states[i]] = 1 71 | return feature.flatten() -------------------------------------------------------------------------------- /trace_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import statistics 6 | 7 | from collections import Counter 8 | 9 | 10 | def load_traces(trace : str, cache_size, rnd): 11 | if trace == 'test': 12 | trace_folder = os.curdir + '/trace/' 13 | print(trace_folder) 14 | 15 | print('Load #%i trace for cache size of %i' % (rnd, cache_size)) 16 | 17 | # load time, request id, request size 18 | df = pd.read_csv(trace_folder + 'test_trace/test_' + str(rnd) + '.tr', sep=' ', header=None) 19 | # remaining cache size, object last access time 20 | df[3], df[4] = cache_size, 0 21 | df[2] = 1 22 | else: 23 | trace_folder = os.curdir + '/trace/' 24 | print(trace_folder) 25 | 26 | print('Load #%i trace for cache size of %i' % (rnd, cache_size)) 27 | 28 | # load time, request id, request size 29 | df = pd.read_csv(trace_folder + trace + '/trace_' + str(rnd) + '.tr', sep=' ', header=None) 30 | # remaining cache size, object last access time 31 | df[3], df[4] = cache_size, 0 32 | df[2] = 1 33 | 34 | # elif trace == 'real': 35 | # df = [] 36 | # else: 37 | # # load user's trace 38 | # df = pd.read_csv(trace, sep=' ', header=None) 39 | # df[3], df[4] = cache_size, 0 40 | 41 | return df 42 | 43 | def get_stats(df): 44 | cache_unseen_default = 500 45 | 46 | obj_freq = Counter() 47 | obj_interarrival_time = {} 48 | all_interarrival_times = [] 49 | last_access_time = [] 50 | for index, row in df.iterrows(): 51 | obj_freq[row[1]] += 1 52 | if(row[1] not in obj_interarrival_time): 53 | obj_interarrival_time[row[1]] = index 54 | last_access_time.append(cache_unseen_default) 55 | else: 56 | all_interarrival_times.append(index - obj_interarrival_time[row[1]]) 57 | last_access_time.append(index - obj_interarrival_time[row[1]]) 58 | obj_interarrival_time[row[1]] = index 59 | 60 | # stats for object frequency 61 | obj_freq_mean = statistics.mean(obj_freq.values()) 62 | obj_freq_stdev = statistics.stdev(obj_freq.values()) 63 | 64 | # stats for object size 65 | obj_size_mean = statistics.mean(df[2]) 66 | obj_size_stdev = statistics.stdev(df[2]) 67 | 68 | # stats for interarrival times 69 | obj_interarrival_time_mean = 
statistics.mean(all_interarrival_times) 70 | obj_interarrival_time_stdev = statistics.stdev(all_interarrival_times) 71 | 72 | # stats for last access time 73 | last_access_time_mean = statistics.mean(last_access_time) 74 | last_access_time_stdev = statistics.stdev(last_access_time) 75 | 76 | #stats for rank 77 | ranks = np.arange(len(obj_freq)) 78 | rank_mean = statistics.mean(ranks) 79 | rank_stdev = statistics.stdev(ranks) 80 | 81 | means = [obj_size_mean, 0, last_access_time_mean, obj_freq_mean, obj_interarrival_time_mean, rank_mean] 82 | stddevs = [obj_size_stdev, 1, last_access_time_stdev, obj_freq_stdev, obj_interarrival_time_stdev, rank_stdev] 83 | 84 | for index, stddev in enumerate(stddevs): 85 | if(stddev == 0): 86 | stddevs[index] = 1 87 | 88 | return means, stddevs 89 | 90 | 91 | print("running") 92 | df = load_traces('test', 20, 5) 93 | obj_freq = Counter() 94 | for index, row in df.iterrows(): 95 | obj_freq[row[1]] += 1 96 | freq = pd.DataFrame.from_records(list(dict(obj_freq).items()), columns=['id','count']) 97 | file_name = "freq.csv" 98 | freq.to_csv(file_name, index=False, header=False, sep=",") 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /algorithms/semi_gradient_n_step_sarsa_algorithm.py: -------------------------------------------------------------------------------- 1 | # One step Semi Gradient Sarsa 2 | 3 | import numpy as np 4 | import math 5 | import copy 6 | 7 | from state_action_approximations.state_action_approximation import StateActionApproximation 8 | from state_action_approximations.one_d_tc import StateActionOneDTileCoding 9 | 10 | def epsilon_greedy(Q:StateActionApproximation, epsilon, state, actions): 11 | random = np.random.binomial(1, epsilon) 12 | max_ac = 0 13 | if random == 0: 14 | max_q = np.NINF 15 | for action in actions: 16 | current_q = Q(state, action) 17 | if current_q > max_q: 18 | max_ac = action 19 | max_q = current_q 20 | else: 21 | action_size = len(actions) 22 | index = np.random.randint(action_size) 23 | max_ac = actions[index] 24 | return max_ac 25 | 26 | def semi_gradient_n_step_sarsa(env, gamma, alpha, Q:StateActionApproximation, 27 | epsilon, episodes, actions, n): 28 | bhr_metric = {} 29 | rewards = {} 30 | for i in episodes: 31 | s_current = env.reset(i) 32 | action = epsilon_greedy(Q, epsilon, s_current, actions) 33 | done = False 34 | episode_rewards = [] 35 | episode = [] 36 | t = 0 37 | T = math.inf 38 | while not done: 39 | if(t < T): 40 | s_next, reward, done, info = env.step(action) 41 | if done: 42 | episode.append((s_current, action, s_next, reward, -1)) 43 | T = t + 1 44 | bhr_metric[i] = info[2] 45 | else: 46 | next_action = epsilon_greedy(Q, epsilon, s_next, actions) 47 | episode.append((s_current, action, s_next, reward, next_action)) 48 | s_current = s_next 49 | action = next_action 50 | tau = t - n + 1 51 | if(tau >= 0): 52 | G = 0 53 | discount = 1 54 | for j in range(tau, min(tau+n, T)): 55 | G += discount*episode[j][3] 56 | discount *= gamma 57 | if(tau+n < T): 58 | G = G + np.power(gamma,n)*Q(episode[tau+n-1][2], episode[tau+n-1][4]) 59 | Q.update(alpha, G, episode[tau][0], episode[tau][1]) 60 | t += 1 61 | if(tau == T-1): 62 | break 63 | rewards[i] = episode_rewards 64 | return rewards, bhr_metric 65 | 66 | def epsilon_greedy_tc(Q, epsilon, state, actions, X, done): 67 | random = np.random.binomial(1, epsilon) 68 | max_ac = 0 69 | if random == 0: 70 | max_q = np.NINF 71 | for action in actions: 72 | current_q = Q(state, action, X, done) 73 
| if current_q > max_q: 74 | max_ac = action 75 | max_q = current_q 76 | else: 77 | action_size = len(actions) 78 | index = np.random.randint(action_size) 79 | max_ac = actions[index] 80 | return max_ac 81 | 82 | def semi_gradient_n_step_sarsa_tc(env, gamma, alpha, 83 | X:StateActionOneDTileCoding, 84 | epsilon, episodes, actions, n): 85 | 86 | bhr_metric = {} 87 | rewards = {} 88 | weights = np.zeros(X.feature_vector_len()) 89 | 90 | def Q(state, action, X, done): 91 | features = X(state, action, done) 92 | return np.dot(features, weights) 93 | 94 | for i in episodes: 95 | s_current = env.reset(i) 96 | done = False 97 | action = epsilon_greedy_tc(Q, epsilon, s_current, actions, X, done) 98 | episode_rewards = [] 99 | episode = [] 100 | t = 0 101 | T = math.inf 102 | while not done: 103 | if(t < T): 104 | s_next, reward, done, info = env.step(action) 105 | if done: 106 | episode.append((s_current, action, s_next, reward, -1)) 107 | T = t + 1 108 | bhr_metric[i] = info[2] 109 | else: 110 | next_action = epsilon_greedy_tc(Q, epsilon, s_next, actions, X, done) 111 | episode.append((s_current, action, s_next, reward, next_action)) 112 | s_current = s_next 113 | action = next_action 114 | tau = t - n + 1 115 | if(tau >= 0): 116 | G = 0 117 | discount = 1 118 | for j in range(tau, min(tau+n, T)): 119 | G += discount*episode[j][3] 120 | discount *= gamma 121 | if(tau+n < T): 122 | G = G + np.power(gamma,n)*Q(episode[tau+n-1][2], episode[tau+n-1][4], X, done) 123 | delta = alpha*(G - Q(episode[tau][0], episode[tau][1], X, done)) 124 | weights += X(episode[tau][0], episode[tau][1], done)*delta 125 | t += 1 126 | if(tau == T-1): 127 | break 128 | rewards[i] = episode_rewards 129 | return rewards, bhr_metric -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from sklearn.model_selection import train_test_split 7 | from cache import CacheEnv 8 | 9 | from algorithms.deterministic import always_evict, random_eviction 10 | from algorithms.semi_gradient_n_step_sarsa_algorithm import semi_gradient_n_step_sarsa, semi_gradient_n_step_sarsa_tc 11 | from algorithms.actor_critic_eligibility_trace_algorithm_linear import actor_critic_eligibility_trace_linear 12 | from algorithms.actor_critic_eligibility_trace_algorithm_tc import actor_critic_eligibility_trace_tc 13 | from algorithms.optimal_algorithm import optimal_admission 14 | from algorithms.reinforce_algorithm import reinforce 15 | from algorithms.true_online_sarsa_lambda import TrueOnlineSarsaLambda 16 | from state_approximations.linear_v_approximation import LinearStateApproximation 17 | from state_approximations.one_d_tc import StateOneDTileCoding 18 | from policy_approximations.linear_policy_approximation import LinearPolicyApproximation 19 | from state_action_approximations.one_d_tc import StateActionOneDTileCoding 20 | from state_action_approximations.linear_q_approximation import LinearStateActionApproximation 21 | from state_action_approximations.nn_q_approximation import NeuralNetworkStateActionApproximation 22 | 23 | def plot_reward(rewards, filename): 24 | plt.plot(np.arange(0, len(rewards)), np.cumsum(rewards)) 25 | plt.xlabel("time") 26 | plt.ylabel("cumulative reward") 27 | plt.savefig(filename) 28 | plt.close() 29 | #plt.show() 30 | 31 | def get_metrics(test, episode, rewards, bhr, filename): 32 | avg_test_bhr = 0.0 33 
| for index in test: 34 | avg_test_bhr += bhr[index] 35 | avg_test_bhr /= len(test) 36 | print("Average BHR on test data : ", avg_test_bhr) 37 | return avg_test_bhr 38 | # plot_reward(rewards[test[episode]], filename) 39 | 40 | def run_n_step_sarsa_linear(env, train, test, n=1): 41 | print("Running ", n, "-step Sarsa") 42 | # Semi-gradient n-step Sarsa with linear function approximation 43 | epsilon = 0.1 44 | actions = [0, 1] 45 | gamma = 1 46 | if n == 1: 47 | alpha = 1e-2 48 | else: 49 | alpha = 1e-3 50 | L = LinearStateActionApproximation(5, 2, alpha) 51 | semi_gradient_n_step_sarsa(env, gamma, alpha, L, epsilon, train, actions, n) 52 | rewards, bhr = semi_gradient_n_step_sarsa(env, gamma, alpha, L, epsilon, test, actions, n) 53 | print("======================") 54 | return get_metrics(test, 0, rewards, bhr, "experiments/graphs/semi_gradient_n_step_sarsa_with_linear_approx.png") 55 | 56 | def run_n_step_sarsa_nn(env, train, test, n=1): 57 | print("Running ", n, "-step Sarsa with nn approximation") 58 | # Semi-gradient n-step Sarsa with neural network function approximation 59 | epsilon = 0.1 60 | actions = [0, 1] 61 | gamma = 1 62 | alpha = 1e-2 63 | L = NeuralNetworkStateActionApproximation(5, 2, alpha) 64 | semi_gradient_n_step_sarsa(env, gamma, alpha, L, epsilon, train, actions, n) 65 | rewards, bhr = semi_gradient_n_step_sarsa(env, gamma, alpha, L, epsilon, test, actions, n) 66 | print("======================") 67 | return get_metrics(test, 0, rewards, bhr, "experiments/graphs/semi_gradient_n_step_sarsa_with_nn.png") 68 | 69 | def run_actor_critic_tc(env, train, test): 70 | print("Running actor critic with eligibility traces tc") 71 | # Actor critic with 1-D tile coding 72 | gamma = 1 73 | alpha_theta = 1e-3 74 | alpha_w = 1e-3 75 | lambda_theta = 0.8 76 | lamdba_w = 0.8 77 | state_low = np.array([1, 0, 0, 1, 0]) 78 | state_high = np.array([1, 20, 500, 1200, 500]) 79 | tile_width = np.array([1, 1, 10, 50, 10]) 80 | V = StateOneDTileCoding( 81 | state_low, 82 | state_high, 83 | num_tilings=1, 84 | tile_width=tile_width 85 | ) 86 | pi = LinearPolicyApproximation(5, 2, alpha_theta) 87 | actor_critic_eligibility_trace_tc(env, gamma, 88 | alpha_theta, alpha_w, 89 | lambda_theta, lamdba_w, 90 | V, pi, 91 | train) 92 | rewards, bhr = actor_critic_eligibility_trace_tc(env, gamma, 93 | alpha_theta, alpha_w, 94 | lambda_theta, lamdba_w, 95 | V, pi, 96 | test) 97 | print("======================") 98 | return get_metrics(test, 0, rewards, bhr, "experiments/graphs/actor_critic_eligibility_trace.png") 99 | 100 | def run_actor_critic_linear(env, train, test): 101 | print("Running actor critic with eligibility traces linear") 102 | # Actor critic with neural network and eligibility traces 103 | gamma = 1 104 | alpha_theta = 1e-3 105 | alpha_w = 1e-3 106 | lambda_theta = 0.8 107 | lamdba_w = 0.8 108 | V = LinearStateApproximation(5, alpha_w) 109 | pi = LinearPolicyApproximation(5, 2, alpha_theta) 110 | actor_critic_eligibility_trace_linear(env, gamma, 111 | alpha_theta, alpha_w, 112 | lambda_theta, lamdba_w, 113 | V, pi, 114 | train) 115 | rewards, bhr = actor_critic_eligibility_trace_linear(env, gamma, 116 | alpha_theta, alpha_w, 117 | lambda_theta, lamdba_w, 118 | V, pi, 119 | test) 120 | print("======================") 121 | return get_metrics(test, 0, rewards, bhr, "experiments/graphs/actor_critic_eligibility_trace_linear.png") 122 | 123 | def run_reinforce(env, train, test): 124 | print("Running Reinforce") 125 | # Reinforce with linear function approximation 126 | epsilon = 0.1 127 | actions 
= [0, 1] 128 | gamma = 1 129 | alpha_theta = 1e-3 130 | alpha_w = 1e-3 131 | n = 1 132 | L = LinearStateApproximation(5, alpha_w) 133 | pi = LinearPolicyApproximation(5, 2, alpha_theta) 134 | reinforce(env, gamma, train,pi, L) 135 | rewards, bhr = reinforce(env, gamma, test,pi, L) 136 | print("======================") 137 | return get_metrics(test, 0, rewards, bhr, "experiments/graphs/reinforce.png") 138 | 139 | def run_sarsa_lambda(env, train, test, lam): 140 | print("Running True Online Sarsa Lambda") 141 | # True Online Sarsa Lambda with One Dimensional Tile Coding 142 | epsilon = 0.1 143 | gamma = 1 144 | alpha = 1e-2 145 | actions = [0, 1] 146 | state_low = np.array([1, 0, 0, 1, 0]) 147 | state_high = np.array([1, 20, 500, 1200, 500]) 148 | tile_width = np.array([1, 1, 10, 50, 10]) 149 | Q = StateActionOneDTileCoding( 150 | state_low, 151 | state_high, 152 | num_tilings=2, 153 | tile_width=tile_width, 154 | num_actions=len(actions) 155 | ) 156 | TrueOnlineSarsaLambda(env, epsilon, gamma, lam, alpha, Q, train) 157 | rewards, bhr = TrueOnlineSarsaLambda(env, epsilon, gamma, lam, alpha, Q, test) 158 | print("======================") 159 | return get_metrics(test, 0, rewards, bhr, "experiments/graphs/true_online_sarsa_lambda.png") 160 | 161 | def run_always_evict(env, train, test): 162 | print("Running Always Evict") 163 | rewards, bhr = always_evict(env, test) 164 | print("======================") 165 | return get_metrics(test, 0, rewards, bhr, "experiments/graphs/lru.png") 166 | 167 | def run_random_eviction(env, train, test): 168 | p = [0.5,0.5] 169 | print("Running Random Policy ") 170 | rewards, bhr = random_eviction(env, test, p) 171 | print("======================") 172 | return get_metrics(test, 0, rewards, bhr, "experiments/graphs/random_eviction_lru.png") 173 | 174 | def run_optimal(test, cache_size): 175 | print("Running Optimal Policy") 176 | rewards, bhr = optimal_admission(episodes=test, cache_size=cache_size) 177 | print("======================") 178 | return get_metrics(test, 0, rewards, bhr, "experiments/graphs/optimal_algorithm.png") 179 | 180 | if __name__ == "__main__": 181 | parser = argparse.ArgumentParser(description='RL CaR') 182 | parser.add_argument( 183 | '-ne', 184 | '--num_episodes', 185 | help='Number of episodes', 186 | type=int, 187 | default=70 188 | ) 189 | parser.add_argument( 190 | '-nr', 191 | '--num_repetitions', 192 | help='Number of repetitions', 193 | type=int, 194 | default=10 195 | ) 196 | parser.add_argument( 197 | '-fa', 198 | '--function_approximation', 199 | help='function approximation to use', 200 | default='tc' 201 | ) 202 | 203 | parser.add_argument( 204 | '-n_steps', 205 | '--n_steps', 206 | help='number of steps in sarsa', 207 | type=int, 208 | default=2 209 | ) 210 | 211 | parser.add_argument( 212 | '-lam', 213 | '--lam', 214 | help='lambda in sarsa', 215 | type=float, 216 | default=0.5 217 | ) 218 | 219 | parser.add_argument( 220 | '-rl', 221 | '--rl_algo', 222 | help='rl algo to use', 223 | default='actor_critic' 224 | ) 225 | 226 | parser.add_argument( 227 | '-policy', 228 | '--policy', 229 | help='cache replacement policy space separated', 230 | default="LRU" 231 | ) 232 | 233 | parser.add_argument( 234 | '-ts', 235 | '--test_size', 236 | help='test size', 237 | type=int, 238 | default="20" 239 | ) 240 | 241 | parser.add_argument( 242 | '-cs', 243 | '--cache_size', 244 | help='cache size', 245 | type=int, 246 | default="20" 247 | ) 248 | seeds = [10, 20, 30, 40 ,50, 60, 70, 80, 90, 100] 249 | 250 | args = parser.parse_args() 251 | 
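# The runner is looked up as 'run_' + rl_algo (+ '_' + function_approximation when one is set)
# in function_dict below; e.g. (from run.sh): python3.9 main.py -rl n_step_sarsa -fa linear -cs 20 -n_steps 2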
print("Arguments ", args) 252 | num_repetitions = args.num_repetitions 253 | cache_size = args.cache_size 254 | 255 | num_episodes = args.num_episodes 256 | episodes = np.arange(num_episodes) 257 | function_name = 'run_' + args.rl_algo 258 | if args.function_approximation is not None: 259 | function_name += '_' + args.function_approximation 260 | 261 | print("Using Function:", function_name) 262 | 263 | function_dict = {"run_reinforce" : run_reinforce, 264 | "run_actor_critic_tc" : run_actor_critic_tc, 265 | "run_actor_critic_linear" : run_actor_critic_linear, 266 | "run_random_eviction" : run_random_eviction, 267 | "run_always_evict": run_always_evict, 268 | "run_n_step_sarsa_nn" : run_n_step_sarsa_nn, 269 | "run_n_step_sarsa_linear" : run_n_step_sarsa_linear, 270 | "run_optimal" : run_optimal, 271 | "run_sarsa_lambda" : run_sarsa_lambda} 272 | 273 | logging.basicConfig(level=logging.INFO, 274 | datefmt='%Y-%m-%d %H:%M:%S', handlers=[ 275 | logging.FileHandler('logs/{}'.format(function_name)), 276 | logging.StreamHandler() 277 | ]) 278 | 279 | policies = args.policy.split(" ") 280 | env = CacheEnv(policies, cache_size=cache_size) 281 | bhr_metrics = [] 282 | for r in range(num_repetitions): 283 | train, test = train_test_split(episodes, test_size=args.test_size, random_state=seeds[r]) 284 | if function_name.startswith("run_n"): 285 | n =args.n_steps 286 | bhr_metrics.append(function_dict[function_name](env, train, test, n)) 287 | elif function_name.startswith("run_optimal"): 288 | bhr_metrics.append(function_dict[function_name](test, cache_size)) 289 | elif function_name.startswith("run_sarsa_lambda"): 290 | bhr_metrics.append(function_dict[function_name](env, train, test, args.lam)) 291 | else: 292 | bhr_metrics.append(function_dict[function_name](env, train, test)) 293 | print("bhr_metrics", bhr_metrics) 294 | mean_bhr = np.mean(np.array(bhr_metrics)) 295 | print("mean_bhr = ", mean_bhr) 296 | log_string = "args = {}, bhr_metrics = {}, mean_bhr = {}".format(args, ', '.join(str(e) for e in bhr_metrics), str(mean_bhr)) 297 | logging.info(log_string) 298 | 299 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -cs 10"' & 2 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -cs 20"' & 3 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -cs 50"' & 4 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -cs 75"' & 5 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -cs 100"' & 6 | 7 | 8 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl actor_critic -fa tc -cs 10"' & 9 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl actor_critic -fa tc -cs 20"' & 10 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 
main.py -rl actor_critic -fa tc -cs 50"' & 11 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl actor_critic -fa tc -cs 75"' & 12 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl actor_critic -fa tc -cs 100"' & 13 | 14 | 15 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl actor_critic -fa linear -cs 20"' & 16 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl actor_critic -fa linear -cs 50"' & 17 | 18 | 19 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 10 -n_steps 1"' & 20 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 20 -n_steps 1"' & 21 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 50 -n_steps 1"' & 22 | 23 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 100 -n_steps 1"' & 24 | 25 | 26 | 27 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 10 -n_steps 2"' & 28 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 20 -n_steps 2"' & 29 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 50 -n_steps 2"' & 30 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 100 -n_steps 2"' & 31 | 32 | 33 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 100 -n_steps 2"' & 34 | 35 | 36 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 10 -n_steps 4"' & 37 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 20 -n_steps 4"' & 38 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 50 -n_steps 4"' & 39 | 40 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 100 -n_steps 2"' & 41 | 42 | 43 | 44 | 45 | osascript -e 'tell application "Terminal" to do script "cd
/Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 10 -n_steps 8"' & 46 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 20 -n_steps 8"' & 47 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 50 -n_steps 8"' & 48 | 49 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 100 -n_steps 2"' & 50 | 51 | 52 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 10 -n_steps 16"' & 53 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 20 -n_steps 16"' & 54 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 50 -n_steps 16"' & 55 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa linear -cs 100 -n_steps 2"' & 56 | 57 | 58 | 59 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 10 -n_steps 1"' & 60 | 61 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 20 -n_steps 1"' & 62 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 50 -n_steps 1"' & 63 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 75 -n_steps 1"' & 64 | 65 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 10 -n_steps 2"' & 66 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 20 -n_steps 2"' & 67 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 50 -n_steps 2"' & 68 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 75 -n_steps 2"' & 69 | 70 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 10 -n_steps 16"' & 71 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 20 -n_steps 16"' & 72 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 50 -n_steps 16"' & 73 | osascript -e 'tell application "Terminal" to do 
script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl n_step_sarsa -fa nn -cs 75 -n_steps 16"' & 74 | 75 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 10 -lam 0.5 -rl sarsa_lambda"' & 76 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 20 -lam 0.5 -rl sarsa_lambda"' & 77 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 50 -lam 0.5 -rl sarsa_lambda"' & 78 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 75 -lam 0.5 -rl sarsa_lambda"' & 79 | 80 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 10 -lam 0.75 -rl sarsa_lambda"' & 81 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 20 -lam 0.75 -rl sarsa_lambda"' & 82 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 50 -lam 0.75 -rl sarsa_lambda"' & 83 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 75 -lam 0.75 -rl sarsa_lambda"' & 84 | 85 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 10 -lam 0.95 -rl sarsa_lambda"' & 86 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 20 -lam 0.95 -rl sarsa_lambda"' & 87 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 50 -lam 0.95 -rl sarsa_lambda"' & 88 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -cs 75 -lam 0.95 -rl sarsa_lambda"' & 89 | 90 | 91 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy LFU -cs 10"' & 92 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy LFU -cs 20"' & 93 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy LFU -cs 50"' & 94 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy LFU -cs 75"' & 95 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy LFU -cs 100"' & 96 | 97 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -policy LFU -cs 10"' & 98 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -policy LFU -cs 20"' & 99 | osascript -e 'tell application "Terminal" to do script "cd 
/Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -policy LFU -cs 50"' & 100 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -policy LFU -cs 75"' & 101 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl reinforce -policy LFU -cs 100"' & 102 | 103 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy FIFO -cs 10"' & 104 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy FIFO -cs 20"' & 105 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy FIFO -cs 50"' & 106 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy FIFO -cs 75"' & 107 | osascript -e 'tell application "Terminal" to do script "cd /Users/isha/Desktop/Courses/RL/RLCaR && /usr/local/bin/python3.9 main.py -rl always_evict -policy FIFO -cs 100"' & 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /cache.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import pandas as pd 4 | from collections import Counter, defaultdict, OrderedDict 5 | import heapq 6 | from gym import spaces 7 | from replacement_agent import ReplacementAgent 8 | 9 | from trace_loader import load_traces, get_stats 10 | 11 | # from park import core, spaces, logger 12 | # from park.param import config 13 | # from park.utils import seeding 14 | # from park.envs.cache.trace_loader import load_traces 15 | 16 | accept = 1 17 | reject = 0 18 | 19 | cache_unseen_default = 500 20 | cache_size_default = 20 21 | cache_trace_default = "test" 22 | 23 | 24 | class TraceSrc(object): 25 | ''' 26 | Tracesrc is the Trace Loader 27 | 28 | @param trace: The file name of the trace file 29 | @param cache_size: The fixed size of the whole cache 30 | @param load_trace: The list of trace data. 
Items can be accessed via load_trace.iloc[self.req] 31 | @param n_request: length of the trace 32 | @param min_values, max_values: Used to bound the value space 33 | @param req: Index of the current request in the trace 34 | ''' 35 | 36 | def __init__(self, trace, cache_size): 37 | self.trace = trace 38 | self.cache_size = cache_size 39 | self.load_trace = load_traces(self.trace, self.cache_size, 0) 40 | self.means, self.stddevs = get_stats(self.load_trace) 41 | self.n_request = len(self.load_trace) 42 | self.cache_size = cache_size 43 | self.min_values = np.asarray([1, 0, 0]) 44 | self.max_values = np.asarray([self.cache_size, self.cache_size, max(self.load_trace[0])]) 45 | self.req = 0 46 | 47 | def reset(self, random): 48 | if self.trace == 'test' or self.trace.startswith('zipf'): 49 | self.load_trace = load_traces(self.trace, self.cache_size, random) 50 | self.means, self.stddevs = get_stats(self.load_trace) 51 | self.n_request = len(self.load_trace) 52 | self.min_values = np.asarray([1, 0, 0]) 53 | self.max_values = np.asarray([self.cache_size, self.cache_size, max(self.load_trace[0])]) 54 | self.req = 0 55 | 56 | def step(self): 57 | # Obs is: (obj_time, obj_id, obj_size) 58 | # print("req id in trace step:", self.req) 59 | obs = self.load_trace.iloc[self.req].values 60 | self.req += 1 61 | done = self.req >= self.n_request 62 | return obs, done 63 | 64 | def next(self): 65 | obs = self.load_trace.iloc[self.req].values 66 | done = (self.req + 1) >= self.n_request 67 | return obs, done 68 | 69 | def get_trace_stats(self): 70 | return self.means, self.stddevs 71 | 72 | class CacheSim(object): 73 | def __init__(self, cache_size, policy, action_space, state_space, replacement_policies, trace_means, trace_stddevs, episode_index=0): 74 | # invariant 75 | ''' 76 | This is the simulator for the cache. 77 | @param cache_size 78 | @param policy: Not implemented yet. Maybe we should instead put this part in the action 79 | @param action_space: The restriction on the action space.
For the cache admission agent, it is [0, 1]: 0 is for reject and 1 is for admit 80 | @param req: The index of the current request 81 | @param non_cache: Dict of requested objects that are not cached, mapping obj_id to [obj_size, last request time] 82 | @param cache: Dict of requested objects that are cached, mapping obj_id to [obj_size, last request time] 83 | @param count_ohr: ohr is (sigma hit) / req 84 | @param count_bhr: bhr is (sigma object_size * hit) / sigma object_size 85 | @param size_all: size_all is sigma object_size 86 | ''' 87 | 88 | self.cache_size = cache_size 89 | self.policy = policy 90 | self.action_space = action_space 91 | self.observation_space = state_space 92 | self.req = 0 93 | self.non_cache = defaultdict(list) 94 | self.cache = defaultdict(list) # requested items with caching 95 | self.cache_pq = [] 96 | # self.lru_cache = LRUCache(self.cache_size) 97 | self.agent = ReplacementAgent(capacity=self.cache_size, policies=replacement_policies,episode_index=episode_index) 98 | self.cache_remain = self.cache_size 99 | self.count_ohr = 0 100 | self.count_bhr = 0 101 | self.size_all = 0 102 | self.object_frequency = Counter() 103 | self.object_average_interarrival = Counter() 104 | self.trace_means = trace_means 105 | self.trace_stddevs = trace_stddevs 106 | 107 | def reset(self, trace_means, trace_stddevs, episode_index): 108 | self.req = 0 109 | self.non_cache = defaultdict(list) 110 | self.cache = defaultdict(list) 111 | self.cache_pq = [] 112 | self.cache_remain = self.cache_size 113 | self.count_ohr = 0 114 | self.count_bhr = 0 115 | self.size_all = 0 116 | self.agent.reset(index=episode_index) 117 | self.object_frequency = Counter() 118 | self.object_average_interarrival = Counter() 119 | self.trace_means = trace_means 120 | self.trace_stddevs = trace_stddevs 121 | 122 | def step(self, action, obj): 123 | #print("object_freq in step(): {}".format(self.object_frequency)) 124 | req = self.req 125 | # print(self.req) 126 | cache_size_online_remain = self.cache_remain 127 | discard_obj_if_admit = [] 128 | obj_time, obj_id, obj_size = obj[0], obj[1], obj[2] 129 | self.object_frequency[obj_id] += 1 130 | 131 | 132 | # create the current state for cache simulator 133 | cost = 0 134 | 135 | # simulation 136 | # if the object size is larger than cache size 137 | if obj_size >= self.cache_size: 138 | # record the request 139 | cost += obj_size 140 | hit = 0 141 | try: 142 | self.non_cache[obj_id][1] = req 143 | except IndexError: 144 | self.non_cache[obj_id] = [obj_size, req] 145 | 146 | else: 147 | # Search the object in the cache 148 | # If hit 149 | try: 150 | self.cache[obj_id][1] = req 151 | self.count_bhr += obj_size 152 | self.count_ohr += 1 153 | hit = 1 154 | cost += obj_size 155 | self.agent.update(obj_id, obj_size) 156 | 157 | # If not hit 158 | except IndexError: 159 | # accept request 160 | if action == 1: 161 | # find the object in the cache, no cost, OHR and BHR ++ 162 | # can't find the object in the cache, add the object into cache after replacement, cost ++ 163 | while cache_size_online_remain < obj_size: 164 | # rm_id = self.cache_pq[0][1] 165 | # cache_size_online_remain += self.cache_pq[0][0] 166 | # cost += self.cache_pq[0][0] 167 | # discard_obj_if_admit.append(rm_id) 168 | # heapq.heappop(self.cache_pq) 169 | # del self.cache[rm_id] 170 | rm_id, size = self.agent.remove() 171 | #print("rm_id = ",rm_id, " size = ", size) 172 | cache_size_online_remain += size 173 | cost += size 174 | discard_obj_if_admit.append(rm_id) 175 | del self.cache[rm_id] 176 | 177 | 178
| # add into cache 179 | self.cache[obj_id] = [obj_size, req] 180 | # heapq.heappush(self.cache_pq, (obj_size, obj_id)) 181 | self.agent.put(obj_id, obj_size) 182 | cache_size_online_remain -= obj_size 183 | 184 | # cost value is based on size, can be changed 185 | cost += obj_size 186 | hit = 0 187 | 188 | # reject request 189 | else: 190 | hit = 0 191 | # record the request to non_cache 192 | try: 193 | self.non_cache[obj_id][1] = req 194 | except IndexError: 195 | self.non_cache[obj_id] = [obj_size, req] 196 | 197 | self.size_all += obj_size 198 | bhr = float(self.count_bhr / self.size_all) 199 | ohr = float(self.count_ohr / (req + 1)) 200 | # print("debug:", bhr, ohr) 201 | reward = hit * cost 202 | 203 | if self.object_frequency[obj_id] != 1: 204 | new_count = self.object_frequency[obj_id] - 1 205 | cur_avg = self.object_average_interarrival[obj_id] 206 | try: 207 | last_interarrival = self.req - self.cache[obj_id][1] 208 | except IndexError: 209 | last_interarrival = self.req - self.non_cache[obj_id][1] 210 | new_avg = cur_avg + (last_interarrival - cur_avg)/new_count 211 | self.object_average_interarrival[obj_id] = new_avg 212 | 213 | self.req += 1 214 | self.cache_remain = cache_size_online_remain 215 | 216 | info = [self.count_bhr, self.size_all, float(float(self.count_bhr) / float(self.size_all))] 217 | return reward, info 218 | 219 | def next_hit(self, obj): 220 | try: 221 | obj_id = obj[1] 222 | self.cache[obj_id][1] = self.cache[obj_id][1] 223 | return True 224 | 225 | except IndexError: 226 | return False 227 | 228 | def get_normalized_state(self, state): 229 | normalized_state = [] 230 | for index, s in enumerate(state): 231 | normalized_state.append( (s-self.trace_means[index])/self.trace_stddevs[index]) 232 | normalized_state[1] /= self.cache_size 233 | return normalized_state 234 | 235 | def get_state(self, obj=[0, 0, 0, 0]): 236 | ''' 237 | Return the state of the object, [obj_size, cache_size_online_remain, recency (steps since object was last visited) = req - last visited time] 238 | If an object has never been seen before, assigned a constant for the recency feature. 239 | For more information, see Learning Caching policy_approximations with Subsampling: 240 | http://mlforsystems.org/assets/papers/neurips2019/learning_wang_2019.pdf 241 | ''' 242 | obj_time, obj_id, obj_size = obj[0], obj[1], obj[2] 243 | try: 244 | req = self.req - self.cache[obj_id][1] 245 | except IndexError: 246 | try: 247 | req = self.req - self.non_cache[obj_id][1] 248 | except IndexError: 249 | # Unseen objects (not in non_cache or cache) are assigned this recency constant 250 | req = cache_unseen_default 251 | 252 | #print("object_freq in get_state: {}".format(self.object_frequency)) 253 | # sorted_frequency = dict(sorted(self.object_frequency.items(), key=lambda item: item[1])) 254 | # rank = -1 255 | # if obj_id in sorted_frequency: 256 | # rank = list(sorted_frequency.keys()).index(obj_id) 257 | # cache_min_freq = math.inf 258 | # for object in self.cache: 259 | # freq = self.object_frequency[object] 260 | # cache_min_freq = min(cache_min_freq, freq) 261 | #print("obj_id = {}, rank = {}".format(obj_id, rank)) 262 | state = [obj_size, self.cache_remain, req, self.object_frequency[obj_id], 263 | self.object_average_interarrival[obj_id]] 264 | 265 | return self.get_normalized_state(state) 266 | 267 | 268 | class CacheEnv(): 269 | """ 270 | Cache description. 
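The environment couples a TraceSrc trace loader with a CacheSim cache simulator whose evictions are delegated to a ReplacementAgent built from the configured replacement policies; each call to reset(trace_index) replays one trace file as one episode.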
271 | 272 | * STATE * 273 | The state is represented as a vector: 274 | [request object size, 275 | cache remaining size, 276 | time of last request to the same object] 277 | 278 | * ACTIONS * 279 | TODO: should be fixed here, there should be both 280 | Whether the cache accept the incoming request, represented as an 281 | integer in [0, 1]. 282 | 283 | * REWARD * (BHR) 284 | Cost of previous step (object size) * hit 285 | 286 | * REFERENCE * 287 | """ 288 | 289 | def __init__(self, replacement_policies, cache_size=cache_size_default, 290 | trace=cache_trace_default, seed=42): 291 | self.seed(seed) 292 | self.cache_size = cache_size 293 | 294 | # load trace, attach initial online feature values 295 | self.src = TraceSrc(trace=trace, cache_size=self.cache_size) 296 | 297 | # set up the state and action space 298 | self.action_space = spaces.Discrete(2) 299 | self.observation_space = spaces.Box(self.src.min_values, \ 300 | self.src.max_values, \ 301 | dtype=np.float32) 302 | 303 | # cache simulator 304 | trace_means, trace_stddevs = self.src.get_trace_stats() 305 | self.sim = CacheSim(cache_size=self.cache_size, \ 306 | policy='lru', \ 307 | action_space=self.action_space, \ 308 | state_space=self.observation_space, 309 | replacement_policies=replacement_policies, 310 | trace_means=trace_means, 311 | trace_stddevs=trace_stddevs, 312 | episode_index=0) 313 | 314 | # reset environment (generate new jobs) 315 | self.reset(1, 2) 316 | 317 | def reset(self, trace_index, low=0, high=1000): 318 | #new_trace = np.random.randint(low, high) 319 | self.src.reset(trace_index) 320 | trace_means, trace_stddevs = self.src.get_trace_stats() 321 | self.sim.reset(trace_means, trace_stddevs, episode_index=trace_index) 322 | if cache_trace_default == 'test': 323 | print("New Env Start", trace_index) 324 | elif cache_trace_default == 'real': 325 | print("New Env Start Real") 326 | return self.sim.get_state() 327 | 328 | def seed(self, seed): 329 | self.np_random = np.random.seed(seed) 330 | 331 | def step(self, action): 332 | # 0 <= action < num_servers 333 | global accept 334 | assert self.action_space.contains(action) 335 | state, done = self.src.step() 336 | reward, info = self.sim.step(action, state) 337 | obj, done = self.src.next() 338 | while self.sim.next_hit(obj): 339 | state, done = self.src.step() 340 | hit_reward, info = self.sim.step(accept, state) 341 | reward += hit_reward 342 | if done is True: 343 | break 344 | obj, done = self.src.next() 345 | 346 | obs = self.sim.get_state(obj) 347 | #info = {} 348 | return obs, reward, done, info 349 | 350 | def render(self, mode='human', close=False): 351 | pass 352 | --------------------------------------------------------------------------------
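The following is a minimal, illustrative driver (not one of the repository's entry points) showing how the algorithms in algorithms/ interact with CacheEnv: reset to an episode index, step with an admit (1) / reject (0) action, and read the running byte hit ratio from info[2]. It assumes the trace files referenced by trace_loader.py (trace/test_trace/test_<i>.tr) are present; the random admission policy below is only a placeholder.

import numpy as np

from cache import CacheEnv

env = CacheEnv(replacement_policies=["LRU"], cache_size=20)
bhr_per_episode = {}
for episode in range(3):
    state = env.reset(episode)                       # loads trace/test_trace/test_<episode>.tr
    done, info = False, None
    while not done:
        action = 1 if np.random.rand() < 0.5 else 0  # placeholder admit/reject policy
        state, reward, done, info = env.step(action)
    bhr_per_episode[episode] = info[2]               # info = [count_bhr, size_all, byte hit ratio]
print(bhr_per_episode)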