├── .gitignore ├── 0. Old ├── 1. Reinforcement Learning - Pickup problem.ipynb ├── 2. Reinforcement Learning - Multi-armed bandits.ipynb └── scripts │ ├── algorithms.py │ ├── grid_world.py │ ├── maze.py │ ├── multi_armed_bandit.py │ ├── open_ai_gym.py │ └── pickup_problem.py ├── 0. Solving Gym environments ├── Agents development - Breakout.ipynb ├── breakout_with_rl.py ├── cartpole_with_deepqlearning.py └── pendulum_with_actorcritic.py ├── 1. Tic Tac Toe ├── 1. Solving Tic Tac Toe with Policy gradients.ipynb └── images │ ├── game_random_rl_agents.gif │ ├── game_random_rl_agents2.gif │ ├── game_random_rl_agents3.gif │ ├── game_random_rules_agents.gif │ ├── game_random_rules_agents2.gif │ ├── game_rules_rl_agents.gif │ └── game_two_random_agents.gif ├── 2. Data Center Cooling ├── 0. Explaining the Data Center Cooling environment.ipynb ├── 1. Reinforcement Learning - Q Learning.ipynb ├── 2. Reinforcement Learning - Deep-Q-Learning.ipynb ├── README.md └── app.py ├── 3. Robotics ├── Minitaur pybullet environment.ipynb └── minitaur.py ├── 4. Chrome Dino ├── 20180102 - Chrome Dino development.ipynb ├── 20180203 - Genetic algorithms experiments.ipynb ├── README.md ├── dino.py ├── experiments.py └── images │ ├── capture1.png │ ├── dino_hardcoded_agent.gif │ ├── dino_ml_agent1.gif │ └── dino_ml_agent1_bad.gif ├── 5. Delivery Optimization ├── Optimizing delivery with Reinforcement Learning.ipynb ├── README.md ├── Routing optimization with Deep Reinforcement Learning.ipynb ├── delivery.py ├── env1.png ├── env2.png ├── env3.png ├── training.png ├── training_100_stops.gif ├── training_100_stops_traffic.gif ├── training_10_stops.gif ├── training_500_stops.gif ├── training_500_stops_traffic.gif └── training_50_stops.gif ├── 6. Solving a Rubik's Cube ├── Solving a Rubik's cube with RL.ipynb └── rubik.py ├── 7. Multi-Agents Simulations ├── 20191018 - Sugarscape playground.ipynb ├── 20191112 - Chicken game.ipynb ├── 20200318 - Hyperion dev.ipynb ├── README.md ├── pygame_test.py ├── test.gif └── test2.gif ├── 8. Unity ML agents tests ├── README.md └── rolling_a_ball │ ├── 20200202 - Rolling a Ball.ipynb │ └── rollingaball1.png ├── 9. 
Discrete optimization with RL ├── README.md ├── Reinforcement Learning for knapsack problem.ipynb ├── knapsack_problem │ └── knapsack │ │ ├── Solver.java │ │ ├── _coursera │ │ ├── data │ │ ├── ks_10000_0 │ │ ├── ks_1000_0 │ │ ├── ks_100_0 │ │ ├── ks_100_1 │ │ ├── ks_100_2 │ │ ├── ks_106_0 │ │ ├── ks_19_0 │ │ ├── ks_200_0 │ │ ├── ks_200_1 │ │ ├── ks_300_0 │ │ ├── ks_30_0 │ │ ├── ks_400_0 │ │ ├── ks_40_0 │ │ ├── ks_45_0 │ │ ├── ks_4_0 │ │ ├── ks_500_0 │ │ ├── ks_50_0 │ │ ├── ks_50_1 │ │ ├── ks_60_0 │ │ ├── ks_82_0 │ │ ├── ks_lecture_dp_1 │ │ └── ks_lecture_dp_2 │ │ ├── handout.pdf │ │ ├── solver.py │ │ ├── solverJava.py │ │ └── submit.py └── lessons │ ├── README.md │ ├── discrete_optimization.md │ ├── dynamic_programming.md │ └── knapsack_problem.md ├── README.md └── rl ├── __init__.py ├── agents ├── __init__.py ├── actor_critic_agent.py ├── base_agent.py ├── dqn2d_agent.py ├── dqn_agent.py ├── q_agent.py └── sarsa_agent.py ├── envs ├── __init__.py ├── data_center_cooling.py └── tictactoe.py ├── memory.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | .spyproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | # mkdocs documentation 93 | /site 94 | 95 | # mypy 96 | .mypy_cache/ 97 | -------------------------------------------------------------------------------- /0. 
Old/scripts/algorithms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | First RL script done using Keras and policy gradients 8 | 9 | - Inspired by @steinbrecher script on https://gym.openai.com/evaluations/eval_usjJ7onVTTwrn43wrbBiAv 10 | - Still inspired by Karpathy's work too 11 | 12 | Started on the 30/12/2016 13 | 14 | 15 | 16 | theo.alves.da.costa@gmail.com 17 | https://github.com/theolvs 18 | ------------------------------------------------------------------------ 19 | """ 20 | 21 | 22 | import numpy as np 23 | # import gym 24 | import os 25 | from keras.models import load_model, Sequential 26 | from keras.layers import Dense, Activation, Dropout 27 | from keras.optimizers import SGD, RMSprop 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | class Brain(): 38 | def __init__(self,env,env_name = "default",H = 500,learning_rate = 0.01,dropout = 0.0,hidden_layers = 1,reload = False,input_dim = 0,output_dim = 0): 39 | 40 | self.env_name = env_name 41 | self.base_path = "C:/Data Science/15. Reinforcement Learning/0. Models/" 42 | file = [x for x in os.listdir(self.base_path) if self.env_name in x] 43 | 44 | self.H = H 45 | self.gamma = 0.5 46 | self.batch_size = 10 47 | self.learning_rate = learning_rate 48 | self.dropout = dropout 49 | self.hidden_layers = hidden_layers 50 | 51 | if input_dim == 0: 52 | try: 53 | self.observation_space = env.observation_space.n 54 | self.observation_to_vectorize = True 55 | except Exception as e: 56 | self.observation_space = env.observation_space.shape[0] 57 | self.observation_to_vectorize = False 58 | else: 59 | self.observation_space = input_dim 60 | self.observation_to_vectorize = False 61 | 62 | if output_dim == 0: 63 | self.action_space = env.action_space.n 64 | else: 65 | self.action_space = output_dim 66 | 67 | 68 | if len(file) == 0 or reload: 69 | print('>> Building a fully connected neural network') 70 | self.episode_number = 0 71 | self.model = self.build_fcc_model_with_regularization(H,input_dim = self.observation_space,output_dim = self.action_space,dropout = self.dropout,hidden_layers = self.hidden_layers) 72 | else: 73 | print('>> Loading the previously trained model') 74 | self.episode_number = int(file[0][file[0].find("(")+1:file[0].find(")")]) 75 | self.model = load_model(self.base_path + file[0]) 76 | 77 | 78 | 79 | self.inputs,self.actions,self.probas,self.rewards,self.step_rewards = [],[],[],[],[] 80 | self.episode_rewards,self.episode_running_rewards = [],[] 81 | self.reward_sum = 0 82 | self.running_reward = 0 83 | 84 | 85 | def rebuild_model(self): 86 | self.model = self.build_fcc_model_with_regularization(self.H,input_dim = self.observation_space,output_dim = self.action_space,dropout = self.dropout,hidden_layers = self.hidden_layers) 87 | 88 | 89 | 90 | def build_fcc_model(self,H = 500,input_dim = 4,output_dim = 2): 91 | model = Sequential() 92 | model.add(Dense(H, input_dim=input_dim)) 93 | model.add(Activation('relu')) 94 | model.add(Dense(H)) 95 | model.add(Activation('relu')) 96 | 97 | sgd = SGD(lr=self.learning_rate, decay=1e-6, momentum=0.9, nesterov=True) 98 | 99 | if output_dim <= 2: 100 | model.add(Dense(1)) 101 | model.add(Activation('sigmoid')) 102 | model.compile(loss='mse', 103 | optimizer=sgd, 104 | metrics=['accuracy']) 105 | else: 106 | model.add(Dense(output_dim)) 107 | model.add(Activation('softmax')) 108 | 
model.compile(loss='categorical_crossentropy', 109 | optimizer=sgd, 110 | metrics=['accuracy']) 111 | 112 | return model 113 | 114 | 115 | 116 | def build_fcc_model_with_regularization(self,H = 500,input_dim = 4,output_dim = 2,dropout = 0.0,hidden_layers = 1): 117 | model = Sequential() 118 | model.add(Dense(H, input_dim=input_dim,init='uniform')) 119 | model.add(Activation('relu')) 120 | model.add(Dropout(dropout)) 121 | 122 | for i in range(hidden_layers): 123 | model.add(Dense(H,init='uniform')) 124 | model.add(Activation('relu')) 125 | model.add(Dropout(dropout)) 126 | 127 | sgd = SGD(lr=self.learning_rate, decay=1e-6, momentum=0.9, nesterov=True) 128 | 129 | if output_dim <= 2: 130 | model.add(Dense(1)) 131 | model.add(Activation('sigmoid')) 132 | model.compile(loss='mse', 133 | optimizer=sgd, 134 | metrics=['accuracy']) 135 | else: 136 | model.add(Dense(output_dim)) 137 | model.add(Activation('softmax')) 138 | model.compile(loss='categorical_crossentropy', 139 | optimizer=sgd, 140 | metrics=['accuracy']) 141 | 142 | return model 143 | 144 | 145 | 146 | def to_input(self,observation): 147 | if self.observation_to_vectorize: 148 | observation = self.vectorize_observation(observation,self.observation_space) 149 | return np.reshape(observation,(1,self.observation_space)) 150 | 151 | 152 | def predict(self,observation,possible_moves = []): 153 | 154 | x = self.to_input(observation) 155 | 156 | # getting the probability of action 157 | probas = self.model.predict(x)[0] 158 | 159 | 160 | if len(possible_moves) > 0: 161 | probas += 1e-9 162 | probas *= possible_moves 163 | probas /= np.sum(probas) 164 | 165 | # sampling the correct action 166 | action= self.sample_action(probas) 167 | 168 | return x,action,probas 169 | 170 | 171 | def sample_action(self,probabilities): 172 | if len(probabilities)<=2: 173 | action = 1 if np.random.uniform() < probabilities[0] else 0 174 | else: 175 | action = np.random.choice(len(probabilities),p = np.array(probabilities)) 176 | 177 | return action 178 | 179 | def vectorize_action(self,action): 180 | if self.action_space <= 2: 181 | return action 182 | else: 183 | onehot_vector = np.zeros(self.action_space) 184 | onehot_vector[action] = 1 185 | return onehot_vector 186 | 187 | def vectorize_observation(self,value,size): 188 | onehot_vector = np.zeros(size) 189 | onehot_vector[value] = 1 190 | return onehot_vector 191 | 192 | 193 | 194 | def record(self,input = None,action = None,proba = None,reward = None): 195 | if type(input) != type(None): 196 | self.inputs.append(input) 197 | 198 | if type(action) != type(None): 199 | self.actions.append(action) 200 | 201 | if type(proba) != type(None): 202 | self.probas.append(proba) 203 | 204 | if type(reward) != type(None): 205 | self.rewards.append(reward) 206 | self.reward_sum += reward 207 | 208 | 209 | 210 | 211 | def discounting_rewards(self,r,normalization = True): 212 | discounted_r = np.zeros_like(r) 213 | running_add = 0 214 | for t in reversed(range(0, r.size)): 215 | running_add = running_add * self.gamma + r[t] 216 | discounted_r[t] = running_add 217 | 218 | if normalization: 219 | discounted_r = np.subtract(discounted_r,np.mean(discounted_r),casting = "unsafe") 220 | discounted_r = np.divide(discounted_r,np.std(discounted_r),casting = "unsafe") 221 | 222 | return discounted_r 223 | 224 | 225 | def discount_rewards(self,normalization = True): 226 | rewards = np.vstack(self.rewards) 227 | return self.discounting_rewards(rewards,normalization) 228 | 229 | 230 | def record_episode(self): 231 | # 
self.step_rewards.extend(self.discount_rewards(normalization = True)) 232 | 233 | # self.rewards = np.array([self.rewards[-1]]*len(self.rewards)) 234 | # self.reward_sum = self.rewards[-1]*100 235 | 236 | self.reward_sum = np.sum(self.rewards) 237 | self.rewards = self.discount_rewards(normalization = False) 238 | self.step_rewards.extend(self.rewards) 239 | 240 | 241 | self.episode_rewards.append(self.reward_sum) 242 | self.running_reward = np.mean(self.episode_rewards) 243 | self.episode_number += 1 244 | 245 | def reset_episode(self): 246 | self.rewards = [] 247 | self.reward_sum = 0 248 | 249 | def update_on_batch(self,show = False): 250 | if show: print('... Training on batch of size %s'%self.batch_size) 251 | self.actions = np.vstack(self.actions) 252 | self.probas = np.vstack(self.probas) 253 | self.step_rewards = np.vstack(self.step_rewards) 254 | self.inputs = np.vstack(self.inputs) 255 | 256 | self.targets = self.step_rewards * (self.actions - self.probas) + self.probas 257 | # print(self.targets) 258 | 259 | #ajouter la protection de la max rewards 260 | 261 | self.model.train_on_batch(self.inputs,self.targets) 262 | 263 | self.inputs,self.actions,self.probas,self.step_rewards = [],[],[],[] 264 | 265 | def save_model(self): 266 | file = [x for x in os.listdir(self.base_path) if self.env_name in x] 267 | self.model.save(self.base_path+"%s(%s).h5"%(self.env_name,self.episode_number)) 268 | if len(file)>0: 269 | os.remove(self.base_path+file[0]) 270 | # self.model.save(self.base_path+"%s.h5"%(self.env_name)) 271 | 272 | 273 | def build_cnn_model(self,input_dim,output_dim): 274 | model = Sequential() 275 | 276 | model.add(Convolution2D(32, 3, 3, border_mode='same',input_shape=input_dim)) 277 | model.add(Activation('relu')) 278 | model.add(Convolution2D(32, 3, 3)) 279 | model.add(Activation('relu')) 280 | model.add(MaxPooling2D(pool_size=(2, 2))) 281 | model.add(Dropout(0.25)) 282 | 283 | model.add(Convolution2D(64, 3, 3, border_mode='same')) 284 | model.add(Activation('relu')) 285 | model.add(Convolution2D(64, 3, 3)) 286 | model.add(Activation('relu')) 287 | model.add(MaxPooling2D(pool_size=(2, 2))) 288 | model.add(Dropout(0.25)) 289 | 290 | model.add(Flatten()) 291 | model.add(Dense(512)) 292 | model.add(Activation('relu')) 293 | model.add(Dropout(0.5)) 294 | model.add(Dense(output_dim)) 295 | model.add(Activation('softmax')) 296 | 297 | # Let's train the model using RMSprop 298 | model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy']) 299 | 300 | return model 301 | -------------------------------------------------------------------------------- /0. 
Old/scripts/grid_world.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | Grid World 8 | 9 | Started on the 08/08/2017 10 | 11 | 12 | References : 13 | - https://www.youtube.com/watch?v=A5eihauRQvo&t=5s 14 | - https://github.com/llSourcell/q_learning_demo 15 | - http://firsttimeprogrammer.blogspot.fr/2016/09/getting-ai-smarter-with-q-learning.html 16 | 17 | 18 | theo.alves.da.costa@gmail.com 19 | https://github.com/theolvs 20 | ------------------------------------------------------------------------ 21 | """ 22 | 23 | 24 | import os 25 | import matplotlib.pyplot as plt 26 | import pandas as pd 27 | import numpy as np 28 | import sys 29 | import random 30 | import time 31 | 32 | 33 | 34 | 35 | 36 | #=========================================================================================================== 37 | # CELLS DEFINITION 38 | #=========================================================================================================== 39 | 40 | 41 | class Cell(object): 42 | def __init__(self,reward = 0,is_terminal = False,is_occupied = False,is_wall = False,is_start = False): 43 | self.reward = reward 44 | self.is_terminal = is_terminal 45 | self.is_occupied = is_occupied 46 | self.is_wall = is_wall 47 | self.is_start = is_start 48 | 49 | def __repr__(self): 50 | if self.is_occupied: 51 | return "x" 52 | else: 53 | return " " 54 | 55 | 56 | def __str__(self): 57 | return self.__repr__() 58 | 59 | 60 | 61 | 62 | class Start(Cell): 63 | def __init__(self): 64 | super().__init__(is_occupied = True,is_start = True) 65 | 66 | 67 | 68 | 69 | class End(Cell): 70 | def __init__(self,reward = 10): 71 | super().__init__(reward = reward,is_terminal = True) 72 | 73 | def __repr__(self): 74 | return "O" 75 | 76 | 77 | 78 | class Hole(Cell): 79 | def __init__(self,reward = -10): 80 | super().__init__(reward = reward,is_terminal = True) 81 | 82 | def __repr__(self): 83 | return "X" 84 | 85 | 86 | 87 | class Wall(Cell): 88 | def __init__(self): 89 | super().__init__(is_wall = True) 90 | 91 | def __repr__(self): 92 | return "#" 93 | 94 | 95 | 96 | 97 | #=========================================================================================================== 98 | # GRIDS DEFINITION 99 | #=========================================================================================================== 100 | 101 | 102 | 103 | 104 | class Grid(object): 105 | def __init__(self,cells): 106 | self.grid = cells 107 | 108 | 109 | def __repr__(self): 110 | return "\n".join("".join(repr(cell) for cell in row) for row in self.grid) # assumes cells is a 2D list of Cell objects 111 | 112 | 113 | def __str__(self): 114 | return self.__repr__() 115 | 116 | -------------------------------------------------------------------------------- /0. 
Old/scripts/multi_armed_bandit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | Multi Armed Bandit Problem 8 | 9 | Started on the 14/04/2017 10 | 11 | 12 | theo.alves.da.costa@gmail.com 13 | https://github.com/theolvs 14 | ------------------------------------------------------------------------ 15 | """ 16 | 17 | 18 | import os 19 | import matplotlib.pyplot as plt 20 | import pandas as pd 21 | import numpy as np 22 | import sys 23 | 24 | 25 | # Deep Learning (Keras, Tensorflow) 26 | import tensorflow as tf 27 | from keras.models import Sequential 28 | from keras.optimizers import SGD,RMSprop, Adam 29 | from keras.layers import Dense, Dropout, Activation, Flatten 30 | from keras.layers import MaxPooling2D,ZeroPadding2D,Conv2D 31 | from keras.utils.np_utils import to_categorical 32 | 33 | 34 | 35 | 36 | #=========================================================================================================== 37 | # BANDIT DEFINITION 38 | #=========================================================================================================== 39 | 40 | 41 | 42 | class Bandit(object): 43 | def __init__(self,p = None): 44 | '''Simple bandit initialization''' 45 | self.p = p if p is not None else np.random.random() 46 | 47 | def pull(self): 48 | '''Simulate a pull from the bandit 49 | 50 | ''' 51 | if np.random.random() < self.p: 52 | return 1 53 | else: 54 | return -1 55 | 56 | 57 | 58 | def create_list_bandits(n = 4,p = None): 59 | if p is None: p = [None]*n 60 | bandits = [Bandit(p = p[i]) for i in range(n)] 61 | return bandits 62 | 63 | 64 | 65 | 66 | 67 | #=========================================================================================================== 68 | # NEURAL NETWORK 69 | #=========================================================================================================== 70 | 71 | 72 | 73 | def build_fcc_model(H = 100,lr = 0.1,dim = 4): 74 | model = Sequential() 75 | model.add(Dense(H, input_dim=dim)) 76 | model.add(Activation('relu')) 77 | model.add(Dense(H)) 78 | model.add(Activation('relu')) 79 | 80 | sgd = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True) 81 | 82 | 83 | model.add(Dense(dim)) 84 | model.add(Activation('softmax')) 85 | model.compile(loss='categorical_crossentropy', 86 | optimizer=sgd, 87 | metrics=['accuracy']) 88 | 89 | return model 90 | 91 | 92 | model = build_fcc_model() 93 | 94 | 95 | 96 | 97 | 98 | #=========================================================================================================== 99 | # SAMPLING ACTION 100 | #=========================================================================================================== 101 | 102 | 103 | def sample_action(probas,epsilon = 0.2): 104 | probas = probas[0] 105 | if np.random.rand() < epsilon: 106 | choice = np.random.randint(0,len(probas)) 107 | else: 108 | choice = np.random.choice(range(len(probas)),p = probas) 109 | return choice 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | #=========================================================================================================== 120 | # EPISODE 121 | #=========================================================================================================== 122 | 123 | 124 | 125 | 126 | def run_episode(bandits,model,probas = None,train = True,epsilon = 0.2): 127 | 128 | if probas is None: 129 | 
probas = np.ones((1,len(bandits)))/len(bandits) 130 | 131 | # sampling action 132 | bandit_to_pull = sample_action(probas,epsilon = epsilon) 133 | action = to_categorical(bandit_to_pull,num_classes=probas.shape[1]) 134 | 135 | # reward 136 | reward = bandits[bandit_to_pull].pull() 137 | 138 | # feed vectors 139 | X = action 140 | y = (action - probas)*reward 141 | 142 | if train: 143 | model.train_on_batch(X,y) 144 | 145 | # update probabilities 146 | probas = model.predict(X) 147 | 148 | return reward,probas 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | #=========================================================================================================== 157 | # GAME 158 | #=========================================================================================================== 159 | 160 | 161 | def run_game(n_episodes = 100,lr = 0.1,n_bandits = 4,p = None,epsilon = 0.2): 162 | 163 | # DEFINE THE BANDITS 164 | bandits = create_list_bandits(n = n_bandits,p = p) 165 | probabilities_to_win = [x.p for x in bandits] 166 | best_bandit = np.argmax(probabilities_to_win) 167 | print(">> Probabilities to win : {} -> Best bandit : {}".format(probabilities_to_win,best_bandit)) 168 | 169 | # INITIALIZE THE NEURAL NETWORK 170 | model = build_fcc_model(lr = lr,dim = n_bandits) 171 | 172 | # INITIALIZE BUFFERS 173 | rewards = [] 174 | avg_rewards = [] 175 | all_probas = np.array([]) 176 | 177 | # EPISODES LOOP 178 | for i in range(n_episodes): 179 | print("\r[{}/{}] episodes completed".format(i+1,n_episodes),end = "") 180 | 181 | # Random choice at the first episode 182 | if i == 0: 183 | reward,probas = run_episode(bandits = bandits,model = model,epsilon = epsilon) 184 | 185 | # Updated probabilities at the following episodes 186 | else: 187 | reward,probas = run_episode(bandits = bandits,model = model,probas = probas) 188 | 189 | 190 | # Store the rewards and the probas 191 | rewards.append(reward) 192 | avg_rewards.append(np.mean(rewards)) 193 | all_probas = np.append(all_probas,probas) 194 | 195 | print("") 196 | 197 | 198 | # GET THE BEST PREDICTED BANDIT 199 | predicted_bandit = np.argmax(probas) 200 | print(">> Predicted bandit : {} - {}".format(predicted_bandit,"CORRECT !!!" if predicted_bandit == best_bandit else "INCORRECT")) 201 | 202 | 203 | # PLOT THE EVOLUTION OF PROBABILITIES OVER TRAINING 204 | all_probas = all_probas.reshape((n_episodes,n_bandits)).transpose() 205 | plt.figure(figsize = (12,5)) 206 | plt.title("Probabilities on Bandit choice - {} episodes - learning rate {}".format(n_episodes,lr)) 207 | for i,p in enumerate(list(all_probas)): 208 | plt.plot(p,label = "Bandit {}".format(i),lw = 1) 209 | 210 | plt.plot(avg_rewards,linestyle="-", dashes=(5, 4),color = "black",lw = 0.5,label = "average running reward") 211 | plt.legend() 212 | plt.ylim([-0.2,1]) 213 | 214 | plt.show() 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /0. 
Old/scripts/open_ai_gym.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | First RL script done using Keras and policy gradients 8 | 9 | - Inspired by @steinbrecher script on https://gym.openai.com/evaluations/eval_usjJ7onVTTwrn43wrbBiAv 10 | - Still inspired by Karpathy's work too 11 | 12 | Started on the 30/12/2016 13 | 14 | 15 | https://github.com/rybskej/atari-py 16 | https://sourceforge.net/projects/vcxsrv/ 17 | 18 | 19 | Environment which works with script: 20 | - CartPole-v0 21 | - MountainCar-v0 22 | - Taxi-v1 23 | 24 | 25 | theo.alves.da.costa@gmail.com 26 | https://github.com/theolvs 27 | ------------------------------------------------------------------------ 28 | """ 29 | 30 | 31 | import numpy as np 32 | import gym 33 | import os 34 | from keras.models import load_model, Sequential 35 | from keras.layers import Dense, Activation 36 | from keras.optimizers import SGD, RMSprop 37 | 38 | 39 | 40 | #------------------------------------------------------------------------------- 41 | 42 | 43 | 44 | 45 | 46 | # def main(n_episodes = 20): 47 | # for i_episode in range(n_episodes): 48 | # observation = env.reset() 49 | # print(observation) 50 | # break 51 | # for t in range(1000): 52 | # if render: env.render 53 | # print(observation) 54 | # action = env.action_space.sample() 55 | # observation, reward, done, info = env.step(action) 56 | # if done: 57 | # print("Episode finished after {} timesteps".format(t+1)) 58 | # break 59 | 60 | 61 | 62 | 63 | 64 | 65 | #------------------------------------------------------------------------------- 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | class Brain(): 75 | def __init__(self,env,env_name = "default",H = 500,reload = False): 76 | 77 | self.env_name = env_name 78 | self.base_path = "C:/Users/talvesdacosta/Documents/Perso/Data Science/15. Reinforcement Learning/3. 
Open AI Gym/models/" 79 | file = [x for x in os.listdir(self.base_path) if self.env_name in x] 80 | 81 | self.H = H 82 | self.gamma = 0.975 83 | self.batch_size = 10 84 | 85 | try: 86 | self.observation_space = env.observation_space.n 87 | self.observation_to_vectorize = True 88 | except Exception as e: 89 | self.observation_space = env.observation_space.shape[0] 90 | self.observation_to_vectorize = False 91 | 92 | self.action_space = env.action_space.n 93 | 94 | 95 | if len(file) == 0 or reload: 96 | print('>> Building a fully connected neural network') 97 | self.episode_number = 0 98 | self.model = self.build_fcc_model(H,input_dim = self.observation_space,output_dim = self.action_space) 99 | else: 100 | print('>> Loading the previously trained model') 101 | self.episode_number = int(file[0][file[0].find("(")+1:file[0].find(")")]) 102 | self.model = load_model(self.base_path + file[0]) 103 | 104 | 105 | 106 | self.inputs,self.actions,self.probas,self.rewards,self.step_rewards = [],[],[],[],[] 107 | self.episode_rewards,self.episode_running_rewards = [],[] 108 | self.reward_sum = 0 109 | self.running_reward = 0 110 | 111 | 112 | 113 | 114 | def build_fcc_model(self,H = 500,input_dim = 4,output_dim = 2): 115 | model = Sequential() 116 | model.add(Dense(H, input_dim=input_dim)) 117 | model.add(Activation('relu')) 118 | model.add(Dense(H)) 119 | model.add(Activation('relu')) 120 | 121 | sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True) 122 | 123 | if output_dim <= 2: 124 | model.add(Dense(1)) 125 | model.add(Activation('sigmoid')) 126 | model.compile(loss='mse', 127 | optimizer=sgd, 128 | metrics=['accuracy']) 129 | else: 130 | model.add(Dense(output_dim)) 131 | model.add(Activation('softmax')) 132 | model.compile(loss='categorical_crossentropy', 133 | optimizer=sgd, 134 | metrics=['accuracy']) 135 | 136 | return model 137 | 138 | 139 | 140 | def to_input(self,observation): 141 | if self.observation_to_vectorize: 142 | observation = self.vectorize_observation(observation,self.observation_space) 143 | return np.reshape(observation,(1,self.observation_space)) 144 | 145 | 146 | def predict(self,observation): 147 | 148 | x = self.to_input(observation) 149 | 150 | # getting the probability of action 151 | probas = self.model.predict(x)[0] 152 | 153 | # sampling the correct action 154 | action= self.sample_action(probas) 155 | 156 | return x,action,probas 157 | 158 | 159 | def sample_action(self,probabilities): 160 | if len(probabilities)<=2: 161 | action = 1 if np.random.uniform() < probabilities[0] else 0 162 | else: 163 | action = np.random.choice(len(probabilities),p = np.array(probabilities)) 164 | 165 | return action 166 | 167 | def vectorize_action(self,action): 168 | if self.action_space <= 2: 169 | return action 170 | else: 171 | onehot_vector = np.zeros(self.action_space) 172 | onehot_vector[action] = 1 173 | return onehot_vector 174 | 175 | def vectorize_observation(self,value,size): 176 | onehot_vector = np.zeros(size) 177 | onehot_vector[value] = 1 178 | return onehot_vector 179 | 180 | 181 | 182 | def record(self,input = None,action = None,proba = None,reward = None): 183 | if type(input) != type(None): 184 | self.inputs.append(input) 185 | 186 | if type(action) != type(None): 187 | self.actions.append(action) 188 | 189 | if type(proba) != type(None): 190 | self.probas.append(proba) 191 | 192 | if type(reward) != type(None): 193 | self.rewards.append(reward) 194 | self.reward_sum += reward 195 | 196 | 197 | 198 | 199 | def discounting_rewards(self,r,normalization = True): 
200 | discounted_r = np.zeros_like(r) 201 | running_add = 0 202 | for t in reversed(range(0, r.size)): 203 | running_add = running_add * self.gamma + r[t] 204 | discounted_r[t] = running_add 205 | 206 | if normalization: 207 | discounted_r = np.subtract(discounted_r,np.mean(discounted_r),casting = "unsafe") 208 | discounted_r = np.divide(discounted_r,np.std(discounted_r),casting = "unsafe") 209 | 210 | return discounted_r 211 | 212 | 213 | def discount_rewards(self,normalization = True): 214 | rewards = np.vstack(self.rewards) 215 | return self.discounting_rewards(rewards,normalization) 216 | 217 | 218 | def record_episode(self): 219 | self.step_rewards.extend(self.discount_rewards(normalization = True)) 220 | self.episode_rewards.append(self.reward_sum) 221 | self.running_reward = np.mean(self.episode_rewards) 222 | self.episode_number += 1 223 | 224 | def reset_episode(self): 225 | self.rewards = [] 226 | self.reward_sum = 0 227 | 228 | def update_on_batch(self): 229 | print('... Training on batch of size %s'%self.batch_size) 230 | self.actions = np.vstack(self.actions) 231 | self.probas = np.vstack(self.probas) 232 | self.step_rewards = np.vstack(self.step_rewards) 233 | self.inputs = np.vstack(self.inputs) 234 | 235 | self.targets = self.step_rewards * (self.actions - self.probas) + self.probas 236 | 237 | #ajouter la protection de la max rewards 238 | 239 | self.model.train_on_batch(self.inputs,self.targets) 240 | 241 | self.inputs,self.actions,self.probas,self.step_rewards = [],[],[],[] 242 | 243 | def save_model(self): 244 | file = [x for x in os.listdir(self.base_path) if self.env_name in x] 245 | self.model.save(self.base_path+"%s(%s).h5"%(self.env_name,self.episode_number)) 246 | if len(file)>0: 247 | os.remove(self.base_path+file[0]) 248 | # self.model.save(self.base_path+"%s.h5"%(self.env_name)) 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | def main(env_name = 'CartPole-v0',n_episodes = 20,render = False,reload = False,n_by_episode = 1000): 261 | env = gym.make(env_name) 262 | brain = Brain(env,env_name = env_name,reload = reload) 263 | # env.monitor.start(brain.base_path+'monitor/%s'%env_name) 264 | 265 | 266 | for i_episode in range(1,n_episodes+1): 267 | observation = env.reset() 268 | for t in range(n_by_episode): 269 | if render: env.render() 270 | 271 | x,action,proba = brain.predict(observation) 272 | 273 | observation, reward, done, info = env.step(action) 274 | action = brain.vectorize_action(action) 275 | brain.record(input = x,action = action,proba = proba,reward = reward) 276 | 277 | if done or t == n_by_episode - 1: 278 | brain.record_episode() 279 | print("Episode {} : total reward was {:0.03f} and running mean {:0.03f}".format(brain.episode_number, brain.reward_sum, brain.running_reward)) 280 | 281 | 282 | if i_episode % brain.batch_size == 0: 283 | brain.update_on_batch() 284 | 285 | if i_episode % 100 == 0: 286 | brain.save_model() 287 | 288 | 289 | brain.reset_episode() 290 | 291 | break 292 | 293 | # env.monitor.close() -------------------------------------------------------------------------------- /0. 
Solving Gym environments/breakout_with_rl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | theo.alves.da.costa@gmail.com 11 | https://github.com/theolvs 12 | ------------------------------------------------------------------------ 13 | """ 14 | 15 | 16 | 17 | 18 | 19 | # Usual libraries 20 | import os 21 | import matplotlib.pyplot as plt 22 | import pandas as pd 23 | import numpy as np 24 | import sys 25 | import random 26 | import time 27 | from tqdm import tqdm 28 | import random 29 | import gym 30 | import numpy as np 31 | 32 | 33 | # Keras (Deep Learning) 34 | from keras.models import Sequential 35 | from keras.layers import Dense 36 | from keras.optimizers import Adam 37 | 38 | 39 | # Custom RL library 40 | import sys 41 | sys.path.insert(0,'..') 42 | 43 | from rl import utils 44 | from rl.agents.dqn2d_agent import DQN2DAgent 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | #---------------------------------------------------------------- 53 | # CONSTANTS 54 | 55 | 56 | N_EPISODES = 1000 57 | MAX_STEPS = 10000 58 | RENDER = True 59 | RENDER_EVERY = 50 60 | BATCH_SIZE = 256 61 | MAX_MEMORY = MAX_STEPS 62 | 63 | 64 | 65 | #---------------------------------------------------------------- 66 | # MAIN LOOP 67 | 68 | 69 | if __name__ == "__main__": 70 | 71 | # Define the gym environment 72 | env = gym.make('Pong-v0') 73 | 74 | # Get the environement action and observation space 75 | state_size = env.observation_space.shape 76 | action_size = env.action_space.n 77 | 78 | # Create the RL Agent 79 | agent = DQN2DAgent(state_size,action_size,max_memory = MAX_MEMORY) 80 | 81 | # Initialize a list to store the rewards 82 | rewards = [] 83 | 84 | 85 | 86 | #--------------------------------------------- 87 | # ITERATION OVER EPISODES 88 | for i_episode in range(N_EPISODES): 89 | 90 | 91 | 92 | # Reset the environment 93 | s = env.reset() 94 | 95 | 96 | #----------------------------------------- 97 | # EPISODE RUN 98 | for i_step in range(MAX_STEPS): 99 | 100 | # Render the environement 101 | if RENDER : env.render() #and (i_step % RENDER_EVERY == 0) 102 | 103 | # Store s before 104 | if i_step == 0: 105 | s_before = s 106 | 107 | 108 | # The agent chose the action considering the given current state 109 | a = agent.act(s_before,s) 110 | 111 | 112 | # Take the action, get the reward from environment and go to the next state 113 | s_next,r,done,info = env.step(a) 114 | 115 | # print(r) 116 | 117 | # Tweaking the reward to make it negative when we lose 118 | # r = r if not done else -10 119 | 120 | # Remember the important variables 121 | agent.remember( 122 | np.expand_dims(s,axis=0), 123 | a, 124 | r, 125 | np.expand_dims(s_next,axis=0), 126 | np.expand_dims(s_before,axis=0), 127 | done) 128 | 129 | # Go to the next state 130 | s_before = s 131 | s = s_next 132 | 133 | # If the episode is terminated 134 | if done: 135 | print("Episode {}/{} finished after {} timesteps - epsilon : {:.2}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon)) 136 | break 137 | 138 | 139 | #----------------------------------------- 140 | 141 | # Store the rewards 142 | rewards.append(i_step) 143 | 144 | 145 | # Training 146 | agent.train(batch_size = BATCH_SIZE) 147 | 148 | 149 | 150 | 151 | 152 | # Plot the average running rewards 153 | utils.plot_average_running_rewards(rewards) 154 | 
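The Gym scripts in this folder (this Breakout/Pong script and the CartPole and Pendulum scripts that follow) all drive an agent from the repo's custom `rl` package through the same three calls: `act(state)`, `remember(state, action, reward, next_state, done)` and `train(...)`. The actual `rl.agents` implementations are not included in this dump, so the following is only a minimal, hypothetical sketch of an epsilon-greedy DQN agent exposing that interface, closest in spirit to the flat-state `DQNAgent` used by the CartPole script; the class name, layer sizes and hyperparameters are illustrative assumptions, not the repo's values.

```python
# Hypothetical sketch of a DQN agent with the act/remember/train interface used
# by these scripts (NOT the repo's rl.agents.dqn_agent, which is not shown here).
import random
from collections import deque

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


class SketchDQNAgent(object):
    def __init__(self, state_size, action_size, gamma=0.95, lr=0.001,
                 epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995, max_memory=2000):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=max_memory)        # replay buffer
        self.gamma = gamma                            # discount factor
        self.epsilon = epsilon                        # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.model = Sequential([
            Dense(24, input_dim=state_size, activation="relu"),
            Dense(24, activation="relu"),
            Dense(action_size, activation="linear"),  # one Q-value per action
        ])
        self.model.compile(loss="mse", optimizer=Adam(lr=lr))

    def act(self, s):
        # Epsilon-greedy action selection
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(np.reshape(s, (1, self.state_size)))[0]
        return int(np.argmax(q_values))

    def remember(self, s, a, r, s_next, done):
        self.memory.append((s, a, r, s_next, done))

    def train(self, batch_size=32):
        # Experience replay on a random minibatch of past transitions
        if len(self.memory) < batch_size:
            return
        for s, a, r, s_next, done in random.sample(self.memory, batch_size):
            target = r
            if not done:
                q_next = self.model.predict(np.reshape(s_next, (1, self.state_size)))[0]
                target = r + self.gamma * np.amax(q_next)
            q = self.model.predict(np.reshape(s, (1, self.state_size)))
            q[0][a] = target
            self.model.fit(np.reshape(s, (1, self.state_size)), q, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
```

The `DQN2DAgent` used above presumably swaps the dense layers for a convolutional front-end and consumes the two-frame (`s_before`, `s`) state handled in the loop, but the remember/replay training pattern stays the same.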
-------------------------------------------------------------------------------- /0. Solving Gym environments/cartpole_with_deepqlearning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | theo.alves.da.costa@gmail.com 11 | https://github.com/theolvs 12 | ------------------------------------------------------------------------ 13 | """ 14 | 15 | 16 | 17 | 18 | 19 | # Usual libraries 20 | import os 21 | import matplotlib.pyplot as plt 22 | import pandas as pd 23 | import numpy as np 24 | import sys 25 | import random 26 | import time 27 | from tqdm import tqdm 28 | import random 29 | import gym 30 | import numpy as np 31 | 32 | 33 | # Keras (Deep Learning) 34 | from keras.models import Sequential 35 | from keras.layers import Dense 36 | from keras.optimizers import Adam 37 | 38 | 39 | # Custom RL library 40 | import sys 41 | sys.path.insert(0,'..') 42 | 43 | from rl import utils 44 | from rl.agents.dqn_agent import DQNAgent 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | #---------------------------------------------------------------- 53 | # CONSTANTS 54 | 55 | 56 | N_EPISODES = 1000 57 | MAX_STEPS = 1000 58 | RENDER = True 59 | RENDER_EVERY = 50 60 | 61 | 62 | 63 | #---------------------------------------------------------------- 64 | # MAIN LOOP 65 | 66 | 67 | if __name__ == "__main__": 68 | 69 | # Define the gym environment 70 | env = gym.make('CartPole-v1') 71 | 72 | # Get the environement action and observation space 73 | state_size = env.observation_space.shape[0] 74 | action_size = env.action_space.n 75 | 76 | # Create the RL Agent 77 | agent = DQNAgent(state_size,action_size) 78 | 79 | # Initialize a list to store the rewards 80 | rewards = [] 81 | 82 | 83 | 84 | 85 | 86 | #--------------------------------------------- 87 | # ITERATION OVER EPISODES 88 | for i_episode in range(N_EPISODES): 89 | 90 | 91 | 92 | # Reset the environment 93 | s = env.reset() 94 | 95 | 96 | #----------------------------------------- 97 | # EPISODE RUN 98 | for i_step in range(MAX_STEPS): 99 | 100 | # Render the environement 101 | if RENDER : env.render() #and (i_step % RENDER_EVERY == 0) 102 | 103 | # The agent chose the action considering the given current state 104 | a = agent.act(s) 105 | 106 | # Take the action, get the reward from environment and go to the next state 107 | s_next,r,done,info = env.step(a) 108 | 109 | # Tweaking the reward to make it negative when we lose 110 | r = r if not done else -10 111 | 112 | # Remember the important variables 113 | agent.remember(s,a,r,s_next,done) 114 | 115 | # Go to the next state 116 | s = s_next 117 | 118 | # If the episode is terminated 119 | if done: 120 | print("Episode {}/{} finished after {} timesteps - epsilon : {:.2}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon)) 121 | break 122 | 123 | 124 | #----------------------------------------- 125 | 126 | # Store the rewards 127 | rewards.append(i_step) 128 | 129 | 130 | # Training 131 | agent.train() 132 | 133 | 134 | 135 | 136 | 137 | # Plot the average running rewards 138 | utils.plot_average_running_rewards(rewards) 139 | -------------------------------------------------------------------------------- /0. 
Solving Gym environments/pendulum_with_actorcritic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 13/11/2017 9 | 10 | theo.alves.da.costa@gmail.com 11 | https://github.com/theolvs 12 | ------------------------------------------------------------------------ 13 | """ 14 | 15 | 16 | 17 | 18 | 19 | # Usual libraries 20 | import os 21 | import matplotlib.pyplot as plt 22 | import pandas as pd 23 | import numpy as np 24 | import sys 25 | import random 26 | import time 27 | from tqdm import tqdm 28 | import random 29 | import gym 30 | import numpy as np 31 | 32 | 33 | # Keras (Deep Learning) 34 | from keras.models import Sequential 35 | from keras.layers import Dense 36 | from keras.optimizers import Adam 37 | import tensorflow as tf 38 | import keras.backend as K 39 | 40 | # Custom RL library 41 | import sys 42 | sys.path.insert(0,'..') 43 | 44 | from rl import utils 45 | from rl.agents.actor_critic_agent import ActorCriticAgent 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | #---------------------------------------------------------------- 54 | # CONSTANTS 55 | 56 | 57 | N_EPISODES = 10000 58 | MAX_STEPS = 500 59 | RENDER = True 60 | RENDER_EVERY = 50 61 | 62 | 63 | 64 | #---------------------------------------------------------------- 65 | # MAIN LOOP 66 | 67 | 68 | if __name__ == "__main__": 69 | 70 | # Define the gym environment 71 | sess = tf.Session() 72 | K.set_session(sess) 73 | env = gym.make('Pendulum-v0') 74 | 75 | # Define the agent 76 | agent = ActorCriticAgent(env, sess) 77 | 78 | # Initialize a list to store the rewards 79 | rewards = [] 80 | 81 | 82 | 83 | 84 | 85 | #--------------------------------------------- 86 | # ITERATION OVER EPISODES 87 | for i_episode in range(N_EPISODES): 88 | 89 | 90 | 91 | # Reset the environment 92 | s = env.reset() 93 | 94 | reward = 0 95 | 96 | 97 | #----------------------------------------- 98 | # EPISODE RUN 99 | for i_step in range(MAX_STEPS): 100 | 101 | # Render the environement 102 | if RENDER : env.render() #and (i_step % RENDER_EVERY == 0) 103 | 104 | # The agent chose the action considering the given current state 105 | s = s.reshape((1, env.observation_space.shape[0])) 106 | a = agent.act(s) 107 | a = a.reshape((1, env.action_space.shape[0])) 108 | 109 | # Take the action, get the reward from environment and go to the next state 110 | s_next,r,done,_ = env.step(a) 111 | s_next = s_next.reshape((1, env.observation_space.shape[0])) 112 | reward += r 113 | 114 | # Tweaking the reward to make it negative when we lose 115 | 116 | # Remember the important variables 117 | agent.remember(s,a,r,s_next,done) 118 | 119 | # Go to the next state 120 | s = s_next 121 | 122 | # If the episode is terminated 123 | if done: 124 | print("Episode {}/{} finished after {} timesteps - epsilon : {:.2} - reward : {}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon,reward)) 125 | break 126 | 127 | 128 | #----------------------------------------- 129 | 130 | # Store the rewards 131 | rewards.append(i_step) 132 | 133 | 134 | # Training 135 | agent.train() 136 | 137 | 138 | 139 | 140 | 141 | # Plot the average running rewards 142 | utils.plot_average_running_rewards(rewards) 143 | -------------------------------------------------------------------------------- /1. 
Tic Tac Toe/images/game_random_rl_agents.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rl_agents.gif -------------------------------------------------------------------------------- /1. Tic Tac Toe/images/game_random_rl_agents2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rl_agents2.gif -------------------------------------------------------------------------------- /1. Tic Tac Toe/images/game_random_rl_agents3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rl_agents3.gif -------------------------------------------------------------------------------- /1. Tic Tac Toe/images/game_random_rules_agents.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rules_agents.gif -------------------------------------------------------------------------------- /1. Tic Tac Toe/images/game_random_rules_agents2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rules_agents2.gif -------------------------------------------------------------------------------- /1. Tic Tac Toe/images/game_rules_rl_agents.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_rules_rl_agents.gif -------------------------------------------------------------------------------- /1. Tic Tac Toe/images/game_two_random_agents.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_two_random_agents.gif -------------------------------------------------------------------------------- /2. 
Data Center Cooling/README.md: -------------------------------------------------------------------------------- 1 | # Data Center Cooling 2 | ![](https://s-media-cache-ak0.pinimg.com/originals/36/d1/87/36d18741bdd4d2ac0033c53bcc669148.jpg) 3 | 4 | Inspired by [DeepMind's work](https://deepmind.com/blog/deepmind-ai-reduces-google-data-centre-cooling-bill-40/) 5 | 6 | This repository holds the development of a business problem that can be solved with Reinforcement Learning: cooling data centers 7 | - The environment, modelled in the fashion of OpenAI Gym's environments 8 | - Solving the problem with different RL algorithms (Q-Learning, Deep-Q-Learning, Policy Gradients) 9 | - An interactive Dash app to test the environment and the agents 10 | 11 | 12 | *** 13 | ## Data Center Cooling environment 14 | 15 | To try out the app, launch it with ``python app.py`` and go to ``localhost:8050`` 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /2. Data Center Cooling/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | DATA CENTER COOLING APP 7 | 8 | Started on the 22/06/2017 9 | 10 | 11 | https://plot.ly/dash/live-updates 12 | https://plot.ly/dash/getting-started 13 | https://plot.ly/dash/getting-started-part-2 14 | https://plot.ly/dash/gallery/new-york-oil-and-gas/ 15 | 16 | theo.alves.da.costa@gmail.com 17 | https://github.com/theolvs 18 | ------------------------------------------------------------------------ 19 | """ 20 | 21 | # USUAL 22 | import os 23 | import numpy as np 24 | from tqdm import tqdm 25 | from copy import deepcopy 26 | 27 | # DASH IMPORT 28 | import dash 29 | import dash_core_components as dcc 30 | import dash_html_components as html 31 | from dash.dependencies import Input, Output, Event, State 32 | import plotly.graph_objs as go 33 | 34 | import sys 35 | sys.path.append("C:/git/reinforcement-learning/") 36 | 37 | 38 | 39 | #-------------------------------------------------------------------------------- 40 | from rl.envs.data_center_cooling import DataCenterCooling 41 | from rl.agents.q_agent import QAgent 42 | from rl.agents.dqn_agent import DQNAgent 43 | from rl.agents.sarsa_agent import SarsaAgent 44 | from rl import utils 45 | 46 | 47 | 48 | 49 | def run_episode(env,agent,max_step = 100,verbose = 1): 50 | 51 | s = env.reset() 52 | 53 | episode_reward = 0 54 | 55 | i = 0 56 | while i < max_step: 57 | 58 | # Choose an action 59 | a = agent.act(s) 60 | 61 | # Take the action, and get the reward from environment 62 | s_next,r,done = env.step(a) 63 | 64 | if verbose: print(s_next,r,done) 65 | 66 | # Update our knowledge in the Q-table 67 | agent.train(s,a,r,s_next) 68 | 69 | # Update the caches 70 | episode_reward += r 71 | s = s_next 72 | 73 | # If the episode is terminated 74 | i += 1 75 | if done: 76 | break 77 | 78 | return env,agent,episode_reward 79 | 80 | 81 | 82 | 83 | def run_n_episodes(env,type_agent = "Q Agent",n_episodes = 2000,lr = 0.8,gamma = 0.95): 84 | 85 | environment = deepcopy(env) 86 | 87 | # Initialize the agent 88 | states_size = len(env.observation_space) 89 | actions_size = len(env.action_space) 90 | 91 | if type_agent == "Q Agent": 92 | print("... Using Q Agent") 93 | agent = QAgent(states_size,actions_size,lr = lr,gamma = gamma) 94 | elif type_agent == "SARSA Agent": 95 | print("... 
Using SARSA Agent") 96 | agent = SarsaAgent(states_size,actions_size,lr = lr,gamma = gamma) 97 | 98 | # Store the rewards 99 | rewards = [] 100 | 101 | # Experience replay 102 | for i in tqdm(range(n_episodes)): 103 | 104 | # Run the episode 105 | environment,agent,episode_reward = run_episode(environment,agent,verbose = 0) 106 | rewards.append(episode_reward) 107 | 108 | return environment,agent,rewards 109 | 110 | 111 | class Clicks(object): 112 | def __init__(self): 113 | self.count = 0 114 | 115 | reset_clicks = Clicks() 116 | train_clicks = Clicks() 117 | env = DataCenterCooling() 118 | np.random.seed() 119 | 120 | #--------------------------------------------------------------------------------- 121 | # CREATE THE APP 122 | app = dash.Dash("Data Cooling Center") 123 | 124 | 125 | # # Making the app available offline 126 | offline = False 127 | app.css.config.serve_locally = offline 128 | app.scripts.config.serve_locally = offline 129 | 130 | 131 | style = { 132 | 'font-weight': 'bolder', 133 | 'font-family': 'Product Sans', 134 | } 135 | 136 | container_style = { 137 | "margin":"20px", 138 | } 139 | 140 | 141 | 142 | AGENTS = [{"label":x,"value":x} for x in ["Q Agent","SARSA Agent","Deep-Q-Network Agent","Policy Gradient Agent"]] 143 | 144 | #--------------------------------------------------------------------------------- 145 | # LAYOUT 146 | app.layout = html.Div(children=[ 147 | 148 | 149 | 150 | 151 | 152 | # HEADER FIRST CONTAINER 153 | html.Div([ 154 | html.H2("Data Center Cooling",style = {'color': "rgba(117, 117, 117, 0.95)",**style}), 155 | 156 | html.Div([ 157 | html.H4("Environment",style = {'color': "rgba(117, 117, 117, 0.95)",**style}), 158 | html.P("Cooling levels",id = "cooling"), 159 | dcc.Slider(min=10,max=100,step=10,value=10,id = "levels-cooling"), 160 | html.P("Cost factor",id = "cost-factor"), 161 | dcc.Slider(min=0.0,max=5,step=0.1,value=1,id = "levels-cost-factor"), 162 | html.P("Risk factor",id = "risk-factor"), 163 | dcc.Slider(min=0.0,max=5,step=0.1,value=1,id = "levels-risk-factor"), 164 | html.Br(), 165 | html.Button("Reset",id = "reset-env",style = style,n_clicks = 0), 166 | ],style = {"height":"50%"}), 167 | 168 | 169 | html.Div([ 170 | html.H4("Agent",style = {'color': "rgba(117, 117, 117, 0.95)",**style}), 171 | dcc.Dropdown(id = "input-agent",options = AGENTS,value = "Q Agent",multi = False), 172 | html.P("N episodes",id = "input-episodes"), 173 | dcc.Slider(min=500,max=10000,step=500,value=5000,id = "n-episodes"), 174 | html.P("Learning rate",id = "input-lr"), 175 | dcc.Slider(min=0.001,max=1.0,step=0.005,value=0.1,id = "lr"), 176 | html.Br(), 177 | html.Button("Train",id = "training",style = style,n_clicks = 0), 178 | ],style = {"height":"50%"}), 179 | 180 | 181 | 182 | ],style={**style,**container_style,'width': '20%',"height":"800px", 'float' : 'left', 'display': 'inline'}, className="container"), 183 | 184 | 185 | 186 | 187 | # ANALYTICS CONTAINER 188 | html.Div([ 189 | 190 | dcc.Graph(id='render',animate = False,figure = env.render(with_plotly = True),style = {"height":"100%"}), 191 | 192 | 193 | ],style={**style,**container_style,'width': '55%',"height":"800px", 'float' : 'right', 'display': 'inline'}, className="container"), 194 | 195 | 196 | ]) 197 | 198 | 199 | 200 | 201 | #--------------------------------------------------------------------------------- 202 | # CALLBACKS 203 | 204 | 205 | 206 | # Callback to stop the streaming 207 | @app.callback( 208 | Output("render","figure"), 209 | 
[Input('reset-env','n_clicks'),Input('training','n_clicks'),Input('levels-cost-factor','value'),Input('levels-risk-factor','value')], 210 | state = [State('levels-cooling','value'),State('lr','value'),State('n-episodes','value'),State('input-agent','value')] 211 | 212 | ) 213 | def render(click_reset,click_training,cost_factor,risk_factor,levels_cooling,lr,n_episodes,type_agent): 214 | 215 | 216 | print("Reset ",click_reset," - ",reset_clicks.count) 217 | print("Train ",click_training," - ",train_clicks.count) 218 | 219 | 220 | if click_reset > reset_clicks.count: 221 | reset_clicks.count = click_reset 222 | env.__init__(levels_cooling = levels_cooling,risk_factor = risk_factor,cost_factor = cost_factor,keep_cooling = True) 223 | 224 | elif click_training > train_clicks.count: 225 | train_clicks.count = click_training 226 | env_temp,agent,rewards = run_n_episodes(env,n_episodes = n_episodes,lr = lr,type_agent = type_agent) 227 | utils.plot_average_running_rewards(rewards,"C:/Users/talvesdacosta/Desktop/results.png") 228 | # os.system("start "+"C:/Users/talvesdacosta/Desktop/results.png") 229 | env.cooling = env_temp.cooling 230 | else: 231 | env.risk_factor = risk_factor 232 | env.cost_factor = cost_factor 233 | 234 | 235 | 236 | return env.render(with_plotly = True) 237 | 238 | 239 | 240 | 241 | @app.callback( 242 | Output("cooling","children"), 243 | [Input('levels-cooling','value')]) 244 | def update_cooling(value): 245 | env.levels_cooling = value 246 | env.define_cooling(value) 247 | return "Cooling levels : {}".format(value) 248 | 249 | 250 | 251 | @app.callback( 252 | Output("risk-factor","children"), 253 | [Input('levels-risk-factor','value')]) 254 | def update_risk(value): 255 | return "Risk factor : {}".format(value) 256 | 257 | 258 | 259 | @app.callback( 260 | Output("cost-factor","children"), 261 | [Input('levels-cost-factor','value')]) 262 | def update_cost(value): 263 | return "Cost factor : {}".format(value) 264 | 265 | @app.callback( 266 | Output("input-episodes","children"), 267 | [Input('n-episodes','value')]) 268 | def update_episodes(value): 269 | return "N episodes : {}".format(value) 270 | 271 | @app.callback( 272 | Output("input-lr","children"), 273 | [Input('lr','value')]) 274 | def update_lr(value): 275 | return "Learning rate : {}".format(value) 276 | 277 | 278 | 279 | 280 | 281 | 282 | #--------------------------------------------------------------------------------- 283 | # ADD EXTERNAL CSS 284 | 285 | external_css = ["https://fonts.googleapis.com/css?family=Product+Sans:400,400i,700,700i", 286 | "https://cdn.rawgit.com/plotly/dash-app-stylesheets/2cc54b8c03f4126569a3440aae611bbef1d7a5dd/stylesheet.css"] 287 | 288 | for css in external_css: 289 | app.css.append_css({"external_url": css}) 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | #--------------------------------------------------------------------------------- 298 | # RUN SERVER 299 | if __name__ == '__main__': 300 | app.run_server(debug=True) 301 | np.random.seed() -------------------------------------------------------------------------------- /3. 
Robotics/minitaur.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | theo.alves.da.costa@gmail.com 11 | https://github.com/theolvs 12 | ------------------------------------------------------------------------ 13 | """ 14 | 15 | 16 | 17 | 18 | 19 | # Usual libraries 20 | import os 21 | import matplotlib.pyplot as plt 22 | import pandas as pd 23 | import numpy as np 24 | import sys 25 | import random 26 | import time 27 | from tqdm import tqdm 28 | import random 29 | import gym 30 | import numpy as np 31 | 32 | 33 | # Keras (Deep Learning) 34 | from keras.models import Sequential 35 | from keras.layers import Dense 36 | from keras.optimizers import Adam 37 | 38 | 39 | # Custom RL library 40 | import sys 41 | sys.path.insert(0,'..') 42 | 43 | from rl import utils 44 | from rl.agents.dqn_agent import DQNAgent 45 | 46 | import pybullet_envs.bullet.minitaur_gym_env as e 47 | 48 | 49 | 50 | 51 | 52 | #---------------------------------------------------------------- 53 | # CONSTANTS 54 | 55 | 56 | N_EPISODES = 1000 57 | MAX_STEPS = 2000 58 | RENDER = True 59 | RENDER_EVERY = 50 60 | 61 | 62 | 63 | #---------------------------------------------------------------- 64 | # MAIN LOOP 65 | 66 | 67 | if __name__ == "__main__": 68 | 69 | # Define the gym environment 70 | env = e.MinitaurBulletEnv(render=True) 71 | 72 | # Get the environement action and observation space 73 | state_size = env.observation_space.shape[0] 74 | action_size = env.action_space.shape[0] 75 | 76 | # Create the RL Agent 77 | agent = DQNAgent(state_size,action_size,low = -1,high = 1,action_type="continuous") 78 | 79 | # Initialize a list to store the rewards 80 | rewards = [] 81 | 82 | 83 | 84 | 85 | 86 | #--------------------------------------------- 87 | # ITERATION OVER EPISODES 88 | for i_episode in range(N_EPISODES): 89 | 90 | 91 | 92 | # Reset the environment 93 | s = env.reset() 94 | reward = 0 95 | 96 | 97 | #----------------------------------------- 98 | # EPISODE RUN 99 | for i_step in range(MAX_STEPS): 100 | 101 | # Render the environement 102 | if RENDER : env.render() #and (i_step % RENDER_EVERY == 0) 103 | 104 | # The agent chose the action considering the given current state 105 | a = agent.act(s) 106 | 107 | # Take the action, get the reward from environment and go to the next state 108 | s_next,r,done,info = env.step(a) 109 | reward += r 110 | 111 | # Remember the important variables 112 | agent.remember(s,a,r,s_next,done) 113 | 114 | # Go to the next state 115 | s = s_next 116 | 117 | # If the episode is terminated 118 | if done: 119 | print("Episode {}/{} finished after {} timesteps - epsilon : {:.2} - reward : {:.2}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon,reward)) 120 | break 121 | 122 | 123 | #----------------------------------------- 124 | 125 | # Store the rewards 126 | rewards.append(i_step) 127 | 128 | 129 | # Training 130 | agent.train(batch_size = 128) 131 | 132 | 133 | 134 | 135 | 136 | # Plot the average running rewards 137 | utils.plot_average_running_rewards(rewards) 138 | -------------------------------------------------------------------------------- /4. 
Chrome Dino/README.md: -------------------------------------------------------------------------------- 1 | # Chrome Dino Project 2 | ## Playing and solving the Chrome Dinosaur Game with Evolution Strategies and PyTorch 3 | ![](http://www.skipser.com/test/trex-game/promotion/trex-chrome-game.png) 4 | 5 | 6 | ##### Summary 7 | - Capturing image from the game - **OK** 8 | - Allowing control programmatically - **OK** 9 | - Trying a simple implementation of rules-based agent with classic CV algorithms - **OK** 10 | - Capturing scores for fitness and reward - **OK** 11 | - Creating the environment for RL - **OK** 12 | - Developing a RL agent that learns via evolution strategies - **OK** 13 | - Different experiments on both agent and method of learning 14 | 15 | 16 | ##### Ideas 17 | - Taking as input of the neural network 18 | - The boundaries of the obstacles in a 1D vector 19 | - The raw image 20 | - The processed image 21 | - Initialize the agent with hard coded policy 22 | - Combine the RL agent and the rules-based Agent 23 | - Try other evolution strategies 24 | - Crossover on the fitness 25 | - Simple ES 26 | - CMA-ES 27 | 28 | 29 | ##### Experiments : 30 | 1. **Genetic algorithm** : Generation of 20 dinos, 5 survive, and make 10 offsprings. 10 random dinos are created to complete the 20 population. Did not work at all after 100 generations, still an average score of 50 which is stopping at the first obstacle. This was tested without mutations. The Neural Network is very shallow MLP with one 100-unit hidden layer. 31 | 2. **Genetic algorithm** : Generation of 40 dinos, 10 survive, make 45 offsprings, but only 40 are selected at random to recreate the 40-population. Added mutations with gaussian noise at this step. Tried as well with a shallow MLP but also with a simple logistic regression in PyTorch 32 | 3. **Genetic algorithm** : Generation of 50 dinos, 12 survive, make 66 offsprings, but only 38 are selected at random to recreate the population. The input is now modelled by a vector with the position on the x axis of the next 2 obstacles. Thus I went back to a shallow MLP with the following structure ``(2 input features,50 hidden layers,1 output)`` giving me the probability to jump. When ensuring a high mutation factor for the gaussian noise to have more exploration. The dinosaurs reach a max score of 600 in about 70 generations of 50 dinos (6 hours on my laptop). But they fail when reaching the birds that were not included in the training. 33 | 4. **Evolution Strategy** : I went back to a simple evolution strategy to focus the training on the dino with the good behavior. The selection will be the top 10 or 20% at each generation. Then the next generation is created based on the fittest on which is adding gaussian noise as the mutations. With this strategy the dinosaur reach a max score of 600 in about 20 generations of 50 dinos. This works better than the last solution, but it is always falling to local optimas with dino jumping all the time to maximize their score. 34 | 5. **Evolution Strategy** : to correct the bad behavior of jumping all the time, I added a discount factor if moves are done when there is no obstacles. By counting the number of obstacles passed and the number of moves. The new reward is then modelled in the fashion of the Bellman equation, by incrementing a discounted reward to the previous reward. With this correction, after one generation the "always-jumping" behavior has disappeared, and with a few generations the dinos reach a good enough policy. 
In 10 generations of 10 dinos only (only 10 minutes on my laptop) we reach easily the max score of 600 previously reached, with a good enough average policy. But new issues arise : birds that come after 600 points which require to duck, speed increasing over time, long obstacles which would require to jump before. Here is a screen capture of the game at this state : 35 | ![](images/dino_ml_agent1.gif) 36 | 37 | 38 | 39 | ##### Misc 40 | - Finding parameter on when to jump 41 | - Logreg/NN on the first and second position of obstacles 42 | - ML + Heuristics model 43 | - Bayesian priors -------------------------------------------------------------------------------- /4. Chrome Dino/experiments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | GENETIC ALGORITHMS EXPERIMENTS 7 | Started on the 2018/01/03 8 | theo.alves.da.costa@gmail.com 9 | https://github.com/theolvs 10 | ------------------------------------------------------------------------ 11 | """ 12 | 13 | from scipy import stats 14 | import seaborn as sns 15 | import os 16 | import matplotlib.pyplot as plt 17 | import pandas as pd 18 | import numpy as np 19 | import sys 20 | import time 21 | from tqdm import tqdm 22 | import itertools 23 | 24 | 25 | 26 | 27 | #============================================================================================================================= 28 | # DISTRIBUTIONS 29 | #============================================================================================================================= 30 | 31 | 32 | 33 | 34 | 35 | class Dist(object): 36 | def __init__(self,mu = None,std = None,label = None): 37 | self.mu = np.random.rand()*20 - 10 if mu is None else mu 38 | self.std = np.random.rand()*10 if std is None else std 39 | self.label = "" if not label else " - "+label 40 | self.func = lambda x : stats.norm.cdf(x,loc = self.mu,scale = self.std) 41 | 42 | def __repr__(self,markdown = False): 43 | return "Norm {1}mu={2}{0}, {0}std={3}{0}{4}".format("$" if markdown else "","$\\" if markdown else "", 44 | round(self.mu,2),round(self.std,2),self.label) 45 | 46 | def plot(self,fill = True): 47 | x = np.linspace(-20, 20, 100) 48 | y = stats.norm.pdf(x,loc = self.mu,scale = self.std) 49 | plt.plot(x,y,label = self.__repr__(markdown = True)) 50 | if fill: 51 | plt.fill_between(x, 0, y, alpha=0.4) 52 | 53 | 54 | def __add__(self,other): 55 | mu = np.mean([self.mu,other.mu]) 56 | std = np.mean([self.std,other.std]) 57 | return Dist(mu,std) 58 | 59 | def mutate(self,alpha = 1): 60 | self.mu = self.mu + 1/(1+np.log(1+alpha)) * np.random.randn() 61 | self.std = max(self.std + 1/(1+np.log(1+alpha)) * np.random.randn(),0.5) 62 | 63 | def fitness(self,x): 64 | return 1 - stats.kstest(x,self.func).statistic 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | class Population(object): 76 | def __init__(self,distributions = None,n = 100): 77 | if distributions is not None: 78 | self.distributions = distributions 79 | else: 80 | self.distributions = [Dist() for i in range(n)] 81 | 82 | def __getitem__(self,key): 83 | if type(key) == tuple or type(key) == list: 84 | d = [] 85 | for i in key: 86 | d.append(self.distributions[i]) 87 | return d 88 | else: 89 | return self.distributions[key] 90 | 91 | def __iter__(self): 92 | return iter(self.distributions) 93 | 94 | def __len__(self): 95 | return len(self.distributions) 96 | 97 | def 
plot(self,title = "Normal distributions",figsize = None): 98 | if figsize: 99 | plt.figure(figsize = figsize) 100 | plt.title(title) 101 | fill = len(self) < 5 102 | for d in self: 103 | d.plot(fill = fill) 104 | plt.legend() 105 | plt.xlabel("x") 106 | plt.show() 107 | 108 | def evaluate(self,x): 109 | fitnesses = [(i,dist.fitness(x)) for i,dist in enumerate(self)] 110 | indices,fitnesses = zip(*sorted(fitnesses,key = lambda x : x[1],reverse = True)) 111 | return indices,fitnesses 112 | 113 | def selection(self,x,top = 0.1): 114 | indices,fitnesses = self.evaluate(x) 115 | n = int(top*len(fitnesses)) 116 | return indices[:n] 117 | 118 | 119 | def crossover(self,indices): 120 | combinations = list(itertools.combinations(indices,2)) 121 | np.random.shuffle(combinations) 122 | combinations = combinations[:len(self)] 123 | new_population = [] 124 | for i,j in combinations: 125 | new_population.append(self[i]+self[j]) 126 | self.distributions = new_population 127 | 128 | def mutate(self,generation = 1): 129 | for d in self: 130 | d.mutate(generation) 131 | 132 | 133 | def evolve(self,x,top = 0.25,n_generations = 20,last_selection = True): 134 | all_fitnesses = [self.evaluate(x)[1]] 135 | 136 | for generation in tqdm(range(n_generations)): 137 | 138 | indices = self.selection(x,top) 139 | self.crossover(indices) 140 | self.mutate(generation) 141 | 142 | indices,fitnesses = self.evaluate(x) 143 | all_fitnesses.append(fitnesses) 144 | 145 | self._plot_fitnesses(all_fitnesses) 146 | 147 | if last_selection: 148 | indices = self.selection(x,top) 149 | 150 | return Population(self[indices]) 151 | 152 | 153 | def _plot_fitnesses(self,fitnesses): 154 | sups = [] 155 | infs = [] 156 | means = [] 157 | for step in fitnesses: 158 | sups.append(np.max(step)) 159 | infs.append(np.min(step)) 160 | means.append(np.mean(step)) 161 | 162 | plt.figure(figsize=(10,6)) 163 | plt.plot(means) 164 | plt.fill_between(range(len(means)),sups,infs, alpha = 0.2) 165 | plt.xlabel('# Generation') 166 | plt.ylabel('Fitness') 167 | plt.legend() 168 | plt.show() 169 | 170 | 171 | 172 | 173 | 174 | #============================================================================================================================= 175 | # LOGREG 176 | #============================================================================================================================= 177 | 178 | 179 | 180 | import torch 181 | from torch.autograd import Variable 182 | import torch.nn as nn 183 | import torch.nn.functional as F 184 | 185 | 186 | 187 | 188 | class LogReg(torch.nn.Module): 189 | def __init__(self, n_feature,n_output = 1,alpha = 10e-1): 190 | self.alpha = alpha 191 | self.args = n_feature,n_output 192 | super(LogReg, self).__init__() 193 | self.out = torch.nn.Linear(n_feature,n_output,bias = False) # output layer 194 | 195 | def forward(self, x): 196 | x = Variable(torch.FloatTensor(x)) 197 | x = F.sigmoid(self.out(x)) 198 | return x 199 | 200 | 201 | def __add__(self,other): 202 | new = LogReg(*self.args) 203 | new.out.weight.data = torch.FloatTensor(0.5 * (self.out.weight.data.numpy() + other.out.weight.data.numpy())) 204 | return new 205 | 206 | 207 | def mutate(self,generation): 208 | out = self.out.weight.data.numpy() 209 | noise_out = self.alpha * np.random.randn(*out.shape) 210 | self.out.weight.data = torch.FloatTensor(self.out.weight.data.numpy() + noise_out) 211 | 212 | 213 | def evaluate(self,x,y): 214 | pred = self.forward(x).data.numpy() 215 | loss_1 = np.sum(np.log(pred + 10e-9)*y.reshape(-1,1)) 216 | loss_0 = 
np.sum(np.log(1-pred + 10e-9)*(1-y).reshape(-1,1)) 217 | return loss_1 + loss_0 218 | 219 | 220 | def plot_coefs(self): 221 | plt.figure(figsize = (15,4)) 222 | plt.title("Coefficients") 223 | plt.axhline(0,c = "black") 224 | plt.plot(self.out.weight.data.numpy()[0]) 225 | plt.xlabel("# Pixel") 226 | plt.show() 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | class PopulationLogReg(object): 236 | def __init__(self,x,y,regs = None,n = 20,top = 0.25,**kwargs): 237 | 238 | self.x = x 239 | self.y = y 240 | self.kwargs = kwargs 241 | 242 | if regs is None: 243 | self.regs = [LogReg(**kwargs) for i in range(n)] 244 | else: 245 | self.regs = regs 246 | 247 | 248 | def __getitem__(self,key): 249 | if type(key) == tuple or type(key) == list: 250 | d = [] 251 | for i in key: 252 | d.append(self.regs[i]) 253 | return d 254 | else: 255 | return self.regs[key] 256 | 257 | def __iter__(self): 258 | return iter(self.regs) 259 | 260 | def __len__(self): 261 | return len(self.regs) 262 | 263 | 264 | 265 | def evaluate(self): 266 | fitnesses = [(i,element.evaluate(self.x,self.y)) for i,element in enumerate(self)] 267 | indices,fitnesses = zip(*sorted(fitnesses,key = lambda x : x[1],reverse = True)) 268 | return indices,fitnesses 269 | 270 | 271 | 272 | def selection(self,top = 0.5): 273 | indices,fitnesses = self.evaluate() 274 | n = int(top*len(fitnesses)) 275 | return indices[:n] 276 | 277 | 278 | 279 | def crossover(self,indices): 280 | combinations = list(itertools.combinations(indices,2)) 281 | np.random.shuffle(combinations) 282 | combinations = combinations[:len(self)] 283 | new_population = [] 284 | for i,j in combinations: 285 | new_population.append(self[i]+self[j]) 286 | 287 | if len(new_population) < len(self): 288 | new_population.extend([LogReg(**self.kwargs) for i in range(len(self)-len(new_population))]) 289 | self.regs = new_population 290 | 291 | 292 | 293 | def mutate(self,generation): 294 | for d in self: 295 | d.mutate(generation) 296 | 297 | 298 | 299 | def evolve(self,top = 0.25,n_generations = 20,last_selection = True): 300 | n_fittest = int(top*len(self)) 301 | offsprings = len(list(itertools.combinations(range(n_fittest),2))) 302 | print("- Generations {}".format(len(self))) 303 | print("- Fittest : {}".format(n_fittest)) 304 | print("- Offsprings : {}".format(offsprings)) 305 | 306 | all_fitnesses = [self.evaluate()[1]] 307 | 308 | for generation in tqdm(range(n_generations)): 309 | 310 | indices = self.selection(top) 311 | self.crossover(indices) 312 | self.mutate(generation) 313 | 314 | indices,fitnesses = self.evaluate() 315 | all_fitnesses.append(fitnesses) 316 | 317 | self._plot_fitnesses(all_fitnesses) 318 | 319 | if last_selection: 320 | indices = self.selection(top) 321 | 322 | return PopulationLogReg(self.x,self.y,regs = self[indices]) 323 | 324 | 325 | 326 | def _plot_fitnesses(self,fitnesses): 327 | 328 | from sklearn.linear_model import LogisticRegression 329 | lr = LogisticRegression() 330 | lr.fit(self.x,self.y) 331 | pred_bench = lr.predict_proba(self.x) 332 | loss_bench = np.sum(np.log(pred_bench + 10e-9)*self.y.reshape(-1,1)) + np.sum(np.log(1-pred_bench + 10e-9)*(1-self.y).reshape(-1,1)) 333 | 334 | sups = [] 335 | infs = [] 336 | means = [] 337 | for step in fitnesses: 338 | sups.append(np.max(step)) 339 | infs.append(np.min(step)) 340 | means.append(np.mean(step)) 341 | 342 | plt.figure(figsize=(10,6)) 343 | plt.plot(means) 344 | plt.fill_between(range(len(means)),sups,infs, alpha = 0.2) 345 | plt.axhline(loss_bench) 346 | plt.xlabel('# Generation') 347 | 
plt.ylabel('Fitness') 348 | plt.legend() 349 | plt.show() 350 | 351 | 352 | -------------------------------------------------------------------------------- /4. Chrome Dino/images/capture1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/capture1.png -------------------------------------------------------------------------------- /4. Chrome Dino/images/dino_hardcoded_agent.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/dino_hardcoded_agent.gif -------------------------------------------------------------------------------- /4. Chrome Dino/images/dino_ml_agent1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/dino_ml_agent1.gif -------------------------------------------------------------------------------- /4. Chrome Dino/images/dino_ml_agent1_bad.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/dino_ml_agent1_bad.gif -------------------------------------------------------------------------------- /5. Delivery Optimization/README.md: -------------------------------------------------------------------------------- 1 | # Delivery optimization with Reinforcement Learning 2 | ![](http://img.chefdentreprise.com/Img/BREVE/2018/3/328332/recette-nestor-atteindre-rentabilite-F.jpg) 3 | 4 | This folder contains experiments to solve transportation optimization using **Reinforcement Learning** algorithm
5 | It will use the code of RL agents previously created in this repo. 6 | 7 | > The overall goal is to optimize routing between deliveries via **experience replay**
8 | > And be robust to anomalies such as traffic slowing down the vehicles in a zone
9 | 
10 | ##### Preliminary remarks
11 | Such a problem (the Travelling Salesman Problem) has many possible solution approaches, including brute force and heuristics.
12 | The goal here was to demonstrate the use of Reinforcement Learning, in particular **when the cost function between two points is stochastic**.
13 | It also shows a different kind of resolution: an algorithm that could be used in a live system and that automatically improves over time towards the best strategies.
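To make the "stochastic cost" idea concrete, here is a minimal sketch; the function name and signature are illustrative only (the actual reward logic lives in `delivery.py` further down): the travel time is the Euclidean distance plus Gaussian noise, with an extra penalty when the leg crosses the traffic zone.

```python
import numpy as np

def travel_time(xy_a, xy_b, traffic_distance=0.0, traffic_intensity=0.0):
    """Illustrative stochastic cost between two stops (names are hypothetical).

    Base cost is the Euclidean distance, perturbed by Gaussian noise,
    plus a penalty proportional to the distance crossed inside the
    traffic zone and to the traffic intensity parameter."""
    base = np.linalg.norm(np.asarray(xy_a) - np.asarray(xy_b))
    noise = np.random.randn()  # the same leg rarely costs the same twice
    traffic_penalty = traffic_distance * traffic_intensity * np.random.rand()
    return base + noise + traffic_penalty
```

Because the same leg can have a different cost at every episode, the agent cannot rely on a fixed distance matrix and has to learn from repeated experience.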
14 | 
15 | 
16 | # The environment
17 | 
18 | ## Environment implementation
19 | 
20 | The whole environment was coded from scratch, with the following parameters:
21 | - Number of stops for delivery
22 | - Traffic zone size (optional)
23 | - Traffic intensity (optional)
24 | 
25 | *The conventions used are the same as for OpenAI Gym environments*
26 | *Only numpy and other basic libraries are used here for the environment*
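Since the environment follows the Gym-style `reset()` / `step(action)` convention, interacting with it looks like any other Gym loop. The sketch below (the function name is mine) mirrors the `run_episode` helper defined later in `delivery.py`; note that `step` returns three values (there is no `info` dict) and that the travel time is negated before being used as a reward by the agent (a Q-learning agent, described below):

```python
def sketch_episode(env, agent):
    """Minimal interaction loop, mirroring run_episode in delivery.py."""
    s = env.reset()                      # index of a random first stop
    agent.reset_memory()                 # the agent forgets previously visited stops
    episode_reward = 0

    for _ in range(env.n_stops):
        agent.remember_state(s)          # mark the current stop as visited
        a = agent.act(s)                 # epsilon-greedy pick among unvisited stops
        s_next, r, done = env.step(a)    # r is the (stochastic) travel time of the leg
        r = -1 * r                       # shorter legs => higher reward
        agent.train(s, a, r, s_next)     # tabular Q-learning update
        episode_reward += r
        s = s_next
        if done:
            break

    return episode_reward
```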
27 | 
28 | ##### Base environment with one trajectory
29 | ![](env1.png)
30 | 
31 | ##### Base environment with 500 stops
32 | ![](env2.png)
33 | 
34 | ##### Base environment with traffic zone
35 | ![](env3.png)
36 | 
37 | ## Rewards
38 | - The reward between two delivery stops is simply the time elapsed on that leg, computed as the Euclidean distance between the two points plus Gaussian noise
39 | - If the trajectory between two stops goes through the traffic zone, the time elapsed is increased by a noise term proportional to the distance crossed inside the zone and to the traffic intensity parameter
40 | 
41 | 
42 | # The algorithm
43 | 
44 | ## Q-Learning
45 | - A simple **Q-Learning** algorithm already gave interesting results
46 | - The **reward** used is the negative of the time elapsed returned by the environment
47 | - An **epsilon-greedy** strategy allows the agent to discover new paths and strategies while exploring
48 | 
49 | ##### Training
50 | Over experience replays, the delivery takes less and less time
51 | ![](training.png)
52 | 
53 | ### Results
54 | ##### 50 stops with no traffic
55 | ![](training_50_stops.gif)
56 | 
57 | 
58 | ##### 100 stops with no traffic
59 | ![](training_100_stops.gif)
60 | 
61 | ##### 500 stops with no traffic
62 | ![](training_500_stops.gif)
63 | 
64 | ##### 100 stops with intense traffic
65 | ![](training_100_stops_traffic.gif)
66 | 
67 | ##### 500 stops with intense traffic
68 | ![](training_500_stops_traffic.gif)
69 | 
70 | 
71 | # Next steps
72 | - Test other simple algorithms like SARSA
73 | - Switch from discrete to continuous problems, first with Deep-Q-Learning (continuous observation space) and then with DDPG (continuous action space as well)
74 | 
75 | 
76 | 
77 | 
-------------------------------------------------------------------------------- /5.
Delivery Optimization/Routing optimization with Deep Reinforcement Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Routing optimization using Deep Reinforcement Learning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "ExecuteTime": { 15 | "end_time": "2019-09-17T17:47:22.119995Z", 16 | "start_time": "2019-09-17T17:47:20.289509Z" 17 | } 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "# Base Data Science snippet\n", 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import os\n", 26 | "import time\n", 27 | "from tqdm import tqdm_notebook\n", 28 | "\n", 29 | "%matplotlib inline\n", 30 | "%load_ext autoreload\n", 31 | "%autoreload 2" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [] 40 | } 41 | ], 42 | "metadata": { 43 | "kernelspec": { 44 | "display_name": "Python 3", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.6.4" 59 | }, 60 | "toc": { 61 | "base_numbering": 1, 62 | "nav_menu": {}, 63 | "number_sections": true, 64 | "sideBar": true, 65 | "skip_h1_title": false, 66 | "title_cell": "Table of Contents", 67 | "title_sidebar": "Contents", 68 | "toc_cell": false, 69 | "toc_position": {}, 70 | "toc_section_display": true, 71 | "toc_window_display": false 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /5. 
Delivery Optimization/delivery.py: -------------------------------------------------------------------------------- 1 | # Base Data Science snippet 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import os 6 | import time 7 | from tqdm import tqdm_notebook 8 | from scipy.spatial.distance import cdist 9 | import imageio 10 | from matplotlib.patches import Rectangle 11 | from matplotlib.collections import PatchCollection 12 | 13 | plt.style.use("seaborn-dark") 14 | 15 | import sys 16 | sys.path.append("../") 17 | from rl.agents.q_agent import QAgent 18 | 19 | 20 | 21 | 22 | class DeliveryEnvironment(object): 23 | def __init__(self,n_stops = 10,max_box = 10,method = "distance",**kwargs): 24 | 25 | print(f"Initialized Delivery Environment with {n_stops} random stops") 26 | print(f"Target metric for optimization is {method}") 27 | 28 | # Initialization 29 | self.n_stops = n_stops 30 | self.action_space = self.n_stops 31 | self.observation_space = self.n_stops 32 | self.max_box = max_box 33 | self.stops = [] 34 | self.method = method 35 | 36 | # Generate stops 37 | self._generate_constraints(**kwargs) 38 | self._generate_stops() 39 | self._generate_q_values() 40 | self.render() 41 | 42 | # Initialize first point 43 | self.reset() 44 | 45 | 46 | def _generate_constraints(self,box_size = 0.2,traffic_intensity = 5): 47 | 48 | if self.method == "traffic_box": 49 | 50 | x_left = np.random.rand() * (self.max_box) * (1-box_size) 51 | y_bottom = np.random.rand() * (self.max_box) * (1-box_size) 52 | 53 | x_right = x_left + np.random.rand() * box_size * self.max_box 54 | y_top = y_bottom + np.random.rand() * box_size * self.max_box 55 | 56 | self.box = (x_left,x_right,y_bottom,y_top) 57 | self.traffic_intensity = traffic_intensity 58 | 59 | 60 | 61 | def _generate_stops(self): 62 | 63 | if self.method == "traffic_box": 64 | 65 | points = [] 66 | while len(points) < self.n_stops: 67 | x,y = np.random.rand(2)*self.max_box 68 | if not self._is_in_box(x,y,self.box): 69 | points.append((x,y)) 70 | 71 | xy = np.array(points) 72 | 73 | else: 74 | # Generate geographical coordinates 75 | xy = np.random.rand(self.n_stops,2)*self.max_box 76 | 77 | self.x = xy[:,0] 78 | self.y = xy[:,1] 79 | 80 | 81 | def _generate_q_values(self,box_size = 0.2): 82 | 83 | # Generate actual Q Values corresponding to time elapsed between two points 84 | if self.method in ["distance","traffic_box"]: 85 | xy = np.column_stack([self.x,self.y]) 86 | self.q_stops = cdist(xy,xy) 87 | elif self.method=="time": 88 | self.q_stops = np.random.rand(self.n_stops,self.n_stops)*self.max_box 89 | np.fill_diagonal(self.q_stops,0) 90 | else: 91 | raise Exception("Method not recognized") 92 | 93 | 94 | def render(self,return_img = False): 95 | 96 | fig = plt.figure(figsize=(7,7)) 97 | ax = fig.add_subplot(111) 98 | ax.set_title("Delivery Stops") 99 | 100 | # Show stops 101 | ax.scatter(self.x,self.y,c = "red",s = 50) 102 | 103 | # Show START 104 | if len(self.stops)>0: 105 | xy = self._get_xy(initial = True) 106 | xytext = xy[0]+0.1,xy[1]-0.05 107 | ax.annotate("START",xy=xy,xytext=xytext,weight = "bold") 108 | 109 | # Show itinerary 110 | if len(self.stops) > 1: 111 | ax.plot(self.x[self.stops],self.y[self.stops],c = "blue",linewidth=1,linestyle="--") 112 | 113 | # Annotate END 114 | xy = self._get_xy(initial = False) 115 | xytext = xy[0]+0.1,xy[1]-0.05 116 | ax.annotate("END",xy=xy,xytext=xytext,weight = "bold") 117 | 118 | 119 | if hasattr(self,"box"): 120 | left,bottom = self.box[0],self.box[2] 121 | width = 
self.box[1] - self.box[0] 122 | height = self.box[3] - self.box[2] 123 | rect = Rectangle((left,bottom), width, height) 124 | collection = PatchCollection([rect],facecolor = "red",alpha = 0.2) 125 | ax.add_collection(collection) 126 | 127 | 128 | plt.xticks([]) 129 | plt.yticks([]) 130 | 131 | if return_img: 132 | # From https://ndres.me/post/matplotlib-animated-gifs-easily/ 133 | fig.canvas.draw_idle() 134 | image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8') 135 | image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 136 | plt.close() 137 | return image 138 | else: 139 | plt.show() 140 | 141 | 142 | 143 | def reset(self): 144 | 145 | # Stops placeholder 146 | self.stops = [] 147 | 148 | # Random first stop 149 | first_stop = np.random.randint(self.n_stops) 150 | self.stops.append(first_stop) 151 | 152 | return first_stop 153 | 154 | 155 | def step(self,destination): 156 | 157 | # Get current state 158 | state = self._get_state() 159 | new_state = destination 160 | 161 | # Get reward for such a move 162 | reward = self._get_reward(state,new_state) 163 | 164 | # Append new_state to stops 165 | self.stops.append(destination) 166 | done = len(self.stops) == self.n_stops 167 | 168 | return new_state,reward,done 169 | 170 | 171 | def _get_state(self): 172 | return self.stops[-1] 173 | 174 | 175 | def _get_xy(self,initial = False): 176 | state = self.stops[0] if initial else self._get_state() 177 | x = self.x[state] 178 | y = self.y[state] 179 | return x,y 180 | 181 | 182 | def _get_reward(self,state,new_state): 183 | base_reward = self.q_stops[state,new_state] 184 | 185 | if self.method == "distance": 186 | return base_reward 187 | elif self.method == "time": 188 | return base_reward + np.random.randn() 189 | elif self.method == "traffic_box": 190 | 191 | # Additional reward correspond to slowing down in traffic 192 | xs,ys = self.x[state],self.y[state] 193 | xe,ye = self.x[new_state],self.y[new_state] 194 | intersections = self._calculate_box_intersection(xs,xe,ys,ye,self.box) 195 | if len(intersections) > 0: 196 | i1,i2 = intersections 197 | distance_traffic = np.sqrt((i2[1]-i1[1])**2 + (i2[0]-i1[0])**2) 198 | additional_reward = distance_traffic * self.traffic_intensity * np.random.rand() 199 | else: 200 | additional_reward = np.random.rand() 201 | 202 | return base_reward + additional_reward 203 | 204 | 205 | @staticmethod 206 | def _calculate_point(x1,x2,y1,y2,x = None,y = None): 207 | 208 | if y1 == y2: 209 | return y1 210 | elif x1 == x2: 211 | return x1 212 | else: 213 | a = (y2-y1)/(x2-x1) 214 | b = y2 - a * x2 215 | 216 | if x is None: 217 | x = (y-b)/a 218 | return x 219 | elif y is None: 220 | y = a*x+b 221 | return y 222 | else: 223 | raise Exception("Provide x or y") 224 | 225 | 226 | def _is_in_box(self,x,y,box): 227 | # Get box coordinates 228 | x_left,x_right,y_bottom,y_top = box 229 | return x >= x_left and x <= x_right and y >= y_bottom and y <= y_top 230 | 231 | 232 | def _calculate_box_intersection(self,x1,x2,y1,y2,box): 233 | 234 | # Get box coordinates 235 | x_left,x_right,y_bottom,y_top = box 236 | 237 | # Intersections 238 | intersections = [] 239 | 240 | # Top intersection 241 | i_top = self._calculate_point(x1,x2,y1,y2,y=y_top) 242 | if i_top > x_left and i_top < x_right: 243 | intersections.append((i_top,y_top)) 244 | 245 | # Bottom intersection 246 | i_bottom = self._calculate_point(x1,x2,y1,y2,y=y_bottom) 247 | if i_bottom > x_left and i_bottom < x_right: 248 | intersections.append((i_bottom,y_bottom)) 249 | 250 | # Left intersection 251 | 
i_left = self._calculate_point(x1,x2,y1,y2,x=x_left) 252 | if i_left > y_bottom and i_left < y_top: 253 | intersections.append((x_left,i_left)) 254 | 255 | # Right intersection 256 | i_right = self._calculate_point(x1,x2,y1,y2,x=x_right) 257 | if i_right > y_bottom and i_right < y_top: 258 | intersections.append((x_right,i_right)) 259 | 260 | return intersections 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | def run_episode(env,agent,verbose = 1): 269 | 270 | s = env.reset() 271 | agent.reset_memory() 272 | 273 | max_step = env.n_stops 274 | 275 | episode_reward = 0 276 | 277 | i = 0 278 | while i < max_step: 279 | 280 | # Remember the states 281 | agent.remember_state(s) 282 | 283 | # Choose an action 284 | a = agent.act(s) 285 | 286 | # Take the action, and get the reward from environment 287 | s_next,r,done = env.step(a) 288 | 289 | # Tweak the reward 290 | r = -1 * r 291 | 292 | if verbose: print(s_next,r,done) 293 | 294 | # Update our knowledge in the Q-table 295 | agent.train(s,a,r,s_next) 296 | 297 | # Update the caches 298 | episode_reward += r 299 | s = s_next 300 | 301 | # If the episode is terminated 302 | i += 1 303 | if done: 304 | break 305 | 306 | return env,agent,episode_reward 307 | 308 | 309 | 310 | 311 | 312 | 313 | class DeliveryQAgent(QAgent): 314 | 315 | def __init__(self,*args,**kwargs): 316 | super().__init__(*args,**kwargs) 317 | self.reset_memory() 318 | 319 | def act(self,s): 320 | 321 | # Get Q Vector 322 | q = np.copy(self.Q[s,:]) 323 | 324 | # Avoid already visited states 325 | q[self.states_memory] = -np.inf 326 | 327 | if np.random.rand() > self.epsilon: 328 | a = np.argmax(q) 329 | else: 330 | a = np.random.choice([x for x in range(self.actions_size) if x not in self.states_memory]) 331 | 332 | return a 333 | 334 | 335 | def remember_state(self,s): 336 | self.states_memory.append(s) 337 | 338 | def reset_memory(self): 339 | self.states_memory = [] 340 | 341 | 342 | 343 | def run_n_episodes(env,agent,name="training.gif",n_episodes=1000,render_each=10,fps=10): 344 | 345 | # Store the rewards 346 | rewards = [] 347 | imgs = [] 348 | 349 | # Experience replay 350 | for i in tqdm_notebook(range(n_episodes)): 351 | 352 | # Run the episode 353 | env,agent,episode_reward = run_episode(env,agent,verbose = 0) 354 | rewards.append(episode_reward) 355 | 356 | if i % render_each == 0: 357 | img = env.render(return_img = True) 358 | imgs.append(img) 359 | 360 | # Show rewards 361 | plt.figure(figsize = (15,3)) 362 | plt.title("Rewards over training") 363 | plt.plot(rewards) 364 | plt.show() 365 | 366 | # Save imgs as gif 367 | imageio.mimsave(name,imgs,fps = fps) 368 | 369 | return env,agent -------------------------------------------------------------------------------- /5. Delivery Optimization/env1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/env1.png -------------------------------------------------------------------------------- /5. Delivery Optimization/env2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/env2.png -------------------------------------------------------------------------------- /5. 
Delivery Optimization/env3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/env3.png -------------------------------------------------------------------------------- /5. Delivery Optimization/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training.png -------------------------------------------------------------------------------- /5. Delivery Optimization/training_100_stops.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_100_stops.gif -------------------------------------------------------------------------------- /5. Delivery Optimization/training_100_stops_traffic.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_100_stops_traffic.gif -------------------------------------------------------------------------------- /5. Delivery Optimization/training_10_stops.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_10_stops.gif -------------------------------------------------------------------------------- /5. Delivery Optimization/training_500_stops.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_500_stops.gif -------------------------------------------------------------------------------- /5. Delivery Optimization/training_500_stops_traffic.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_500_stops_traffic.gif -------------------------------------------------------------------------------- /5. Delivery Optimization/training_50_stops.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_50_stops.gif -------------------------------------------------------------------------------- /6. 
Solving a Rubik's Cube/rubik.py: -------------------------------------------------------------------------------- 1 | # Base Data Science snippet 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import os 6 | import time 7 | from tqdm import tqdm_notebook 8 | from matplotlib.patches import Rectangle 9 | from matplotlib.collections import PatchCollection 10 | 11 | plt.style.use("seaborn-dark") 12 | 13 | # import sys 14 | # sys.path.append("../") 15 | # from rl.agents.q_agent import QAgent 16 | 17 | #---------------------------------------------------------------------------------------------------------------------------- 18 | # CONSTANTS 19 | #---------------------------------------------------------------------------------------------------------------------------- 20 | 21 | COLORS = ["red","white","orange","yellow","green","blue"] 22 | WIDTH_SQUARE = 0.05 23 | FACES = ["LEFT","FRONT","RIGHT","BACK","TOP","BOTTOM"] 24 | 25 | LEFT_SLICE = np.s_[0,:] 26 | RIGHT_SLICE = np.s_[-1,:] 27 | TOP_SLICE = np.s_[:,0] 28 | BOTTOM_SLICE = np.s_[:,-1] 29 | 30 | FACES_LINK = { 31 | "LEFT":[ 32 | ("BACK",RIGHT_SLICE), 33 | ("BOTTOM",LEFT_SLICE), 34 | ("FRONT",LEFT_SLICE), 35 | ("TOP",LEFT_SLICE), 36 | ], 37 | "FRONT":[ 38 | ("LEFT",RIGHT_SLICE), 39 | ("BOTTOM",BOTTOM_SLICE), 40 | ("RIGHT",LEFT_SLICE), 41 | ("TOP",TOP_SLICE), 42 | ], 43 | "RIGHT":[ 44 | ("TOP",RIGHT_SLICE), 45 | ("FRONT",RIGHT_SLICE), 46 | ("BOTTOM",RIGHT_SLICE), 47 | ("BACK",LEFT_SLICE), 48 | ], 49 | "BACK":[ 50 | ("TOP",BOTTOM_SLICE), 51 | ("RIGHT",RIGHT_SLICE), 52 | ("BOTTOM",TOP_SLICE), 53 | ("LEFT",LEFT_SLICE), 54 | ], 55 | "TOP":[ 56 | ("LEFT",BOTTOM_SLICE), 57 | ("FRONT",BOTTOM_SLICE), 58 | ("RIGHT",BOTTOM_SLICE), 59 | ("BACK",BOTTOM_SLICE), 60 | ], 61 | "BOTTOM":[ 62 | ("BACK",TOP_SLICE), 63 | ("RIGHT",TOP_SLICE), 64 | ("FRONT",TOP_SLICE), 65 | ("LEFT",TOP_SLICE), 66 | ], 67 | } 68 | 69 | 70 | 71 | 72 | #---------------------------------------------------------------------------------------------------------------------------- 73 | # RUBIKS CUBE ENVIRONMENT CLASS 74 | #---------------------------------------------------------------------------------------------------------------------------- 75 | 76 | class RubiksCube(object): 77 | def __init__(self,shuffle = True): 78 | 79 | print(f"Initialized RubiksCube") 80 | self.data = np.array([[i]*9 for i in range(6)]) 81 | self.data = self._to_1D(self.data) 82 | 83 | if shuffle: 84 | np.random.shuffle(self.data) 85 | 86 | @staticmethod 87 | def _to_1D(array): 88 | return np.squeeze(array.reshape(1,-1)) 89 | 90 | @staticmethod 91 | def _to_2D(array): 92 | return array.reshape(6,9) 93 | 94 | @staticmethod 95 | def _to_square(face): 96 | return face.reshape(3,3) 97 | 98 | @staticmethod 99 | def _to_array(face): 100 | return face.reshape(9) 101 | 102 | 103 | @staticmethod 104 | def _facestr_to_faceid(face): 105 | """Convert face as string to face ID (between 0 and 5) 106 | """ 107 | if isinstance(face,str): 108 | assert face in FACES 109 | face = FACES.index(face) 110 | return face 111 | 112 | 113 | @staticmethod 114 | def _rotate_array(array,clockwise = True): 115 | if clockwise: 116 | return array[1:] + [array[0]] 117 | else: 118 | return [array[-1]] + array[:-1] 119 | 120 | 121 | def get_face(self,face,as_square = True): 122 | """Function to get one face of the Rubik's cube 123 | """ 124 | 125 | # Convert face as string to face ID (between 0 and 5) 126 | face = self._facestr_to_faceid(face) 127 | 128 | # Select matching face in the data array 129 | 
face = self.data[face*9:(face+1)*9] 130 | 131 | # Reshape face data to a square 132 | if as_square: 133 | face = self._to_square(face) 134 | 135 | # Return face data 136 | return face 137 | 138 | 139 | 140 | 141 | def set_face(self,face,array): 142 | 143 | # Convert face as string to face ID (between 0 and 5) 144 | face = self._facestr_to_faceid(face) 145 | 146 | # Reshape array 147 | if array.shape == (3,3): 148 | array = self._to_array(array) 149 | 150 | # Set face 151 | self.data[face*9:(face+1)*9] = array 152 | 153 | 154 | 155 | 156 | 157 | def rotate(self,face,clockwise = True): 158 | """Rotate one face of the Rubik's cube 159 | """ 160 | # Convert face as string to face ID (between 0 and 5) 161 | face_id = self._facestr_to_faceid(face) 162 | 163 | # Get face 164 | face_data = self.get_face(face_id) 165 | 166 | # Rotate selected face 167 | sense = -1 if clockwise else 1 168 | face_data = np.rot90(face_data,k=sense) 169 | self.set_face(face,face_data) 170 | 171 | # Get other faces 172 | linked_faces,slices = zip(*FACES_LINK[face]) 173 | slices_data = [np.copy(self.get_face(linked_faces[i])[slices[i]]) for i in range(4)] 174 | 175 | # Rotate arrays 176 | slices_data = self._rotate_array(slices_data,clockwise = clockwise) 177 | 178 | # Set new rotated arrays 179 | for i in range(4): 180 | face = linked_faces[i] 181 | face_data = self.get_face(face) 182 | face_data[slices[i]] = slices_data[i] 183 | self.set_face(face,face_data) 184 | 185 | 186 | 187 | def render3D(self): 188 | pass 189 | 190 | 191 | def render(self): 192 | 193 | fig = plt.figure(figsize=(7,7)) 194 | ax = fig.add_subplot(111) 195 | 196 | for i in range(4): 197 | face_data = self.data[i*9:(i+1)*9] 198 | face = RubiksFace(face_data) 199 | face.render(ax = ax,init_height = 0.4,init_width = 0.15 + i*3*(WIDTH_SQUARE+0.005)) 200 | 201 | 202 | for i in range(4,6): 203 | face_data = self.data[i*9:(i+1)*9] 204 | face = RubiksFace(face_data) 205 | init_height = 0.4 + 3*(WIDTH_SQUARE+0.005) if i == 4 else 0.4 - 3*(WIDTH_SQUARE+0.005) 206 | face.render(ax = ax,init_height = init_height,init_width = 0.15 + 3*(WIDTH_SQUARE+0.005)) 207 | 208 | plt.xticks([]) 209 | plt.yticks([]) 210 | plt.show() 211 | 212 | 213 | 214 | 215 | 216 | class RubiksFace(object): 217 | def __init__(self,array): 218 | if array.shape == (3,3): 219 | self.array = array 220 | else: 221 | assert len(array) == 9 222 | self.array = array.reshape(3,3) 223 | 224 | def render(self,ax = None,init_height = 0,init_width = 0): 225 | 226 | if ax is None: 227 | fig = plt.figure(figsize=(7,7)) 228 | ax = fig.add_subplot(111) 229 | 230 | 231 | 232 | for i in range(3): 233 | for j in range(3): 234 | 235 | square = self.array[i,j] 236 | color = COLORS[square] 237 | 238 | rect = Rectangle((init_width + i*WIDTH_SQUARE,init_height + j*WIDTH_SQUARE), WIDTH_SQUARE, WIDTH_SQUARE) 239 | collection = PatchCollection([rect],facecolor = color,alpha = 0.8,edgecolor="black") 240 | ax.add_collection(collection) 241 | 242 | 243 | 244 | -------------------------------------------------------------------------------- /7. 
Multi-Agents Simulations/20200318 - Hyperion dev.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hyperion Library development" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "ExecuteTime": { 15 | "end_time": "2020-03-18T17:16:33.161399Z", 16 | "start_time": "2020-03-18T17:16:31.745503Z" 17 | } 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "# Base Data Science snippet\n", 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import os\n", 26 | "import time\n", 27 | "from tqdm import tqdm_notebook\n", 28 | "\n", 29 | "%matplotlib inline\n", 30 | "%load_ext autoreload\n", 31 | "%autoreload 2" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import sys\n", 41 | "sys.path.append(\"c:/git/reinforcement-learning/\")\n", 42 | "\n", 43 | "from hyperion.grid import *" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Playground" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import pygame\n", 60 | "\n", 61 | "pygame.init()\n", 62 | "\n", 63 | "ecran = pygame.display.set_mode((300, 200))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "pygame.quit()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.7.4" 100 | }, 101 | "toc": { 102 | "base_numbering": 1, 103 | "nav_menu": {}, 104 | "number_sections": true, 105 | "sideBar": true, 106 | "skip_h1_title": false, 107 | "title_cell": "Table of Contents", 108 | "title_sidebar": "Contents", 109 | "toc_cell": false, 110 | "toc_position": {}, 111 | "toc_section_display": true, 112 | "toc_window_display": false 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 4 117 | } 118 | -------------------------------------------------------------------------------- /7. Multi-Agents Simulations/README.md: -------------------------------------------------------------------------------- 1 | # Multi-Agents simulation 2 | ![](https://thumbs.gfycat.com/EvergreenGenuineAmethystgemclam-size_restricted.gif) 3 | 4 | Simulations including multiple agents are present everywhere in our daily lives, from large-scale economics policies to epidemiology.
5 | Agent-based modeling is even more effective when merged with modern AI techniques such as Reinforcement Learning.
6 | This folder contains experiments on these topics.
7 | 
8 | # Experiments summary
9 | - **October 2019** - First attempts to create a Sugarscape experiment. Developed a framework using DataFrames for accelerated computations. Yet there were too many interactions to code from scratch, and performance was low
10 | - **December 2019** - Discovered Unity + ML Agents for such simulations
11 | - **March 2020** - Due to the COVID19 outbreak, I started experiments on multi-agent modeling and social distancing. PyGame is a good candidate for 2D simulations, similar to Unity but in Python. It offers many possibilities, and the spatial O(n²) interactions are greatly sped up thanks to the colliders embedded in PyGame. Movement is still feasible with at least 10k agents in my first experiments. Experiments have since moved to the [westworld](https://github.com/TheoLvs/westworld) repo.
12 | 
13 | 
14 | # References
15 | ## Libraries & software
16 | - Unity
17 | - NetLogo
18 | - [MESA](https://github.com/projectmesa/mesa) - Python
19 | - [SPADE](https://spade-mas.readthedocs.io/en/latest/readme.html) - Python
20 | - [abcEconomics](https://abce.readthedocs.io/en/master/)
21 | - [GAMA-Platform](https://gama-platform.github.io/)
22 | - [Manim](https://github.com/3b1b/manim) by the great Grant Sanderson
23 | - PyGame
24 | 
25 | ## Tutorials
26 | - [Introduction to Agent Based Modeling in Python](https://towardsdatascience.com/introduction-to-mesa-agent-based-modeling-in-python-bcb0596e1c9a)
27 | 
28 | ## Inspiration
29 | - https://www.complexity-explorables.org/
30 | - Sugarscape https://www.youtube.com/watch?v=r_It_X7v-1E
31 | - youtube.com/watch?v=uVpN136q7N8
32 | - youtube.com/watch?v=Bot5_DouTWg
33 | - Ant-based modeling
34 | 
35 | ## Features to implement
36 | - Set and reload data -> ok
37 | - Animation over the simulation (gif ok, ipywidgets to go)
38 | - Action framework with delayed/deferred actions
39 | - Metrics storage for each agent
40 | - Set up geographical zones and 2D maps with impossible moves
41 | - Find closest agent method
42 | - Wander method
43 | - Launch simulation until a certain time + early stopping
44 | - Circle collider
45 | - Optimization tutorial towardsdatascience.com/speeding-up-python-code-fast-filtering-and-slow-loops-8e11a09a9c2f for further speed-ups
46 | - A* algorithm for shortest path
47 | - Heatmaps of navigation presence for retail use cases
-------------------------------------------------------------------------------- /7.
Multi-Agents Simulations/pygame_test.py: -------------------------------------------------------------------------------- 1 | """Pygame test for multi agent modeling 2 | 3 | Tutorials 4 | https://zestedesavoir.com/tutoriels/846/pygame-pour-les-zesteurs/1381_a-la-decouverte-de-pygame/creer-une-simple-fenetre-personnalisable/#1-15425_creons-une-fenetre-basique 5 | https://www.pygame.org/docs/ref/rect.html#pygame.Rect.move_ip 6 | https://stackoverflow.com/questions/32061507/moving-a-rectangle-in-pygame 7 | 8 | 9 | Ideas: 10 | - Add circles 11 | - Pathfinding algorithm 12 | - Obstacles 13 | - Colliders 14 | - Clicking to add agent or wall 15 | - Grid 16 | - AutoMaze 17 | - Raytracing 18 | - Change Icon 19 | - Heatmaps of where agents were located (for retail purposes) 20 | 21 | Projects: 22 | - Epidemiology 23 | - See MESA or NetLogo examples 24 | - Bunny & Rabbits 25 | """ 26 | 27 | import numpy as np 28 | import pygame 29 | import time 30 | import uuid 31 | 32 | # import os 33 | # os.environ['SDL_VIDEO_WINDOW_POS'] = "%d,%d" % (320,240) 34 | 35 | pygame.init() 36 | pygame.display.set_caption("Multi Agent Modeling Environment") 37 | # ecran = pygame.display.set_mode((0, 0), pygame.FULLSCREEN) 38 | 39 | screen = pygame.display.set_mode((1000, 600)) 40 | 41 | simulation_on = True 42 | # time.sleep(5) 43 | 44 | background_colour = (0, 0, 0) 45 | 46 | 47 | 48 | 49 | 50 | 51 | class RectangleAgent: 52 | 53 | def __init__(self,width,height,x,y,screen = None): 54 | # Rect left top width height 55 | 56 | self.screen = screen 57 | self.fig = pygame.rect.Rect((x,y,width,height)) 58 | # print(f"Initialized rect at {self.pos}") 59 | 60 | self.change_direction() 61 | 62 | self.agent_id = str(uuid.uuid1()) 63 | 64 | 65 | @property 66 | def pos(self): 67 | return self.fig.x,self.fig.y,self.fig.width,self.fig.height 68 | 69 | def move_at(self,x,y): 70 | self.x = x 71 | self.y = y 72 | 73 | 74 | def wander(self,dl): 75 | 76 | self.move(angle = self.direction_angle,dl = dl) 77 | 78 | 79 | def change_direction(self): 80 | self.direction_angle = np.random.uniform(0,2*np.pi) 81 | 82 | 83 | def move_towards(self): 84 | pass 85 | 86 | 87 | def collides(self,agents): 88 | 89 | if len(agents) == 0: 90 | collisions = [] 91 | else: 92 | other_agents = [agent.fig for agent in agents if agent.agent_id != self.agent_id] 93 | collisions = self.fig.collidelistall(other_agents) 94 | 95 | if len(collisions) > 0: 96 | return True,collisions 97 | else: 98 | return False,collisions 99 | 100 | 101 | def if_collides(self,agents): 102 | 103 | is_collision,collisions = self.collides(agents) 104 | 105 | if is_collision: 106 | self.direction_angle += np.pi 107 | 108 | 109 | 110 | def move(self,dx = 0,dy = 0,angle = None,dl = None,colliders = None): 111 | 112 | if angle is not None: 113 | assert dl is not None 114 | 115 | # Compute delta directions with basic trigonometry 116 | dx = dl * np.cos(angle) 117 | dy = dl * np.sin(angle) 118 | self.move(dx = dx,dy = dy) 119 | 120 | else: 121 | 122 | screen_width = self.screen.get_width() 123 | screen_height = self.screen.get_height() 124 | 125 | old_x = self.fig.x 126 | old_y = self.fig.y 127 | 128 | if self.fig.x + dx > screen_width: 129 | self.fig.x = 0 130 | elif self.fig.x + dx < 0: 131 | self.fig.x = screen_width 132 | else: 133 | self.fig.x = self.fig.x + dx 134 | 135 | if self.fig.y + dy > screen_height: 136 | self.fig.y = 0 137 | elif self.fig.y + dy < 0: 138 | self.fig.y = screen_height 139 | else: 140 | self.fig.y = self.fig.y + dy 141 | 142 | if colliders is not None: 143 | if 
self.collides(colliders): 144 | self.fig.x = old_x 145 | self.fig.y = old_y 146 | 147 | 148 | # print(f"Position at {self.fig.x},{self.fig.y}") 149 | 150 | 151 | def render(self,color = (180,20,150)): 152 | pygame.draw.rect(self.screen,color,self.pos) 153 | # pygame.draw.circle(self.screen,color,(self.fig.x,self.fig.y),10) 154 | # pass 155 | 156 | 157 | 158 | 159 | class Obstacle: 160 | def __init__(self,width,height,x,y,screen = None): 161 | # Rect left top width height 162 | 163 | self.screen = screen 164 | self.fig = pygame.rect.Rect((x,y,width,height)) 165 | # print(f"Initialized rect at {self.pos}") 166 | self.agent_id = str(uuid.uuid1()) 167 | 168 | 169 | def render(self,color = (10,150,10)): 170 | pygame.draw.rect(self.screen,color,self.pos) 171 | 172 | 173 | @property 174 | def pos(self): 175 | return self.fig.x,self.fig.y,self.fig.width,self.fig.height 176 | 177 | 178 | 179 | size = 10 180 | n_rects = 500 181 | 182 | rects = [] 183 | 184 | for i in range(n_rects): 185 | new_rect = RectangleAgent( 186 | size,size, 187 | np.random.uniform(0,screen.get_width()), 188 | np.random.uniform(0,screen.get_height()), 189 | screen, 190 | ) 191 | 192 | rects.append(new_rect) 193 | 194 | 195 | 196 | 197 | i = 0 198 | stop = 1000 199 | 200 | obstacles = [ 201 | Obstacle(200,200,300,300,screen) 202 | ] 203 | 204 | 205 | while simulation_on: 206 | screen.fill(background_colour) 207 | 208 | for rect in rects: 209 | rect.wander(size) 210 | rect.if_collides(rects + obstacles) 211 | 212 | for rect in rects + obstacles: 213 | rect.render() 214 | 215 | for event in pygame.event.get(): 216 | if event.type == pygame.KEYDOWN: 217 | simulation_on = False 218 | 219 | elif event.type == pygame.MOUSEBUTTONUP: 220 | new_x,new_y = pygame.mouse.get_pos() 221 | # new_rect = RectangleAgent(size,size,new_x,new_y,screen) 222 | # rects.append(new_rect) 223 | 224 | new_obs = Obstacle(20,20,new_x,new_y,screen) 225 | obstacles.append(new_obs) 226 | 227 | 228 | 229 | pygame.display.update() 230 | # pygame.display.flip() 231 | 232 | time.sleep(0.05) 233 | 234 | 235 | if i == stop: 236 | simulation_on = False 237 | else: 238 | i+=1 239 | 240 | 241 | pygame.quit() -------------------------------------------------------------------------------- /7. Multi-Agents Simulations/test.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/7. Multi-Agents Simulations/test.gif -------------------------------------------------------------------------------- /7. Multi-Agents Simulations/test2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/7. Multi-Agents Simulations/test2.gif -------------------------------------------------------------------------------- /8. Unity ML agents tests/README.md: -------------------------------------------------------------------------------- 1 | # Unity ML Agents test 2 | 3 | > I've been creating environments directly with Python for a few years now, yet I've been facing lots of limitation due to the nature of Python
4 | > In 2020, the best option is probably Unity ML Agents.
5 | > This repo will hold experiments on custom Unity environments/games and Reinforcement Learning agents that attempt to solve them
6 | 
7 | ## Rolling a ball (January 2020)
8 | ![](rolling_a_ball/rollingaball1.png)
9 | > My first experiment is a simple game about rolling a ball, affected by gravity and inertia, that tries to catch all 10 pickups randomly placed in the environment. To create the same env, follow the official Unity tutorial https://learn.unity.com/project/roll-a-ball-tutorial
10 | 
11 | 
12 | 
13 | ## References ✨
14 | ### To learn about Unity
15 | - YouTube holds great resources such as the Brackeys, Sebastian Lague or Jason Weimann channels. Huge thanks to those videos for teaching about Unity in such an entertaining way.
16 | - Unity official tutorials are great as well.
17 | 
18 | 
19 | ### To learn about Unity ML Agents
20 | - Documentation at https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Readme.md
21 | - Creating custom environments https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Create-New.md
22 | - Overview of how ML Agents works https://github.com/Unity-Technologies/ml-agents/blob/master/docs/ML-Agents-Overview.md
23 | - [This great video](https://www.youtube.com/watch?v=x2RBxmooh8w)
24 | 
25 | 
26 | 
27 | ## Installing ML Agents
28 | Follow the [tutorial at this link](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md)
29 | 
30 | - Install the Python wrapper with pip
31 | ```
32 | pip install mlagents
33 | ```
34 | - Clone the ML Agents repo
35 | ```
36 | git clone --branch latest_release https://github.com/Unity-Technologies/ml-agents.git
37 | ```
38 | - Install Barracuda
39 | - Copy the ML-Agents folder from the cloned repo at ``UnitySDK/Assets`` into your project's Assets folder
-------------------------------------------------------------------------------- /8.
Unity ML agents tests/rolling_a_ball/20200202 - Rolling a Ball.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Rolling a Ball" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "![](rollingaball1.png)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# Interaction test" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "> This comes from the getting started tutorial applied to the 3D Ball Agent" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Python version:\n", 41 | "3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 10:22:32) [MSC v.1900 64 bit (AMD64)]\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import matplotlib.pyplot as plt\n", 47 | "import numpy as np\n", 48 | "import sys\n", 49 | "\n", 50 | "from mlagents_envs.environment import UnityEnvironment\n", 51 | "from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel\n", 52 | "\n", 53 | "%matplotlib inline\n", 54 | "\n", 55 | "print(\"Python version:\")\n", 56 | "print(sys.version)\n", 57 | "\n", 58 | "# check Python version\n", 59 | "if (sys.version_info[0] < 3):\n", 60 | " raise Exception(\"ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stderr", 70 | "output_type": "stream", 71 | "text": [ 72 | "INFO:mlagents_envs:Listening on port 5004. 
Start training by pressing the Play button in the Unity Editor.\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "engine_configuration_channel = EngineConfigurationChannel()\n", 78 | "env = UnityEnvironment(base_port = 5004)#, file_name=env_name, side_channels = [engine_configuration_channel])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stderr", 88 | "output_type": "stream", 89 | "text": [ 90 | "INFO:mlagents_envs:Connected new brain:\n", 91 | "3DBall?team=0\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "#Reset the environment\n", 97 | "env.reset()\n", 98 | "\n", 99 | "# Set the default brain to work with\n", 100 | "group_name = env.get_agent_groups()[0]\n", 101 | "group_spec = env.get_agent_group_spec(group_name)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 7, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "Number of observations : 1\n", 114 | "Agent state looks like: \n", 115 | "[[-1.4673042e-02 -1.4683060e-02 -5.2082062e-01 4.0000000e+00\n", 116 | " -7.9952097e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 117 | " [-2.6140258e-02 3.4010161e-02 -4.5768166e-01 4.0000000e+00\n", 118 | " -5.5027008e-03 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 119 | " [ 6.3632242e-02 3.7996579e-02 -1.1360741e+00 4.0000000e+00\n", 120 | " -4.1505909e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 121 | " [-4.6871606e-02 -3.9161425e-02 -6.1104012e-01 4.0000000e+00\n", 122 | " 5.6867313e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 123 | " [ 3.8746696e-02 7.7085062e-03 1.1423024e+00 4.0000000e+00\n", 124 | " -1.4589405e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 125 | " [ 4.8017994e-02 -7.4483551e-02 -5.7353783e-01 4.0000000e+00\n", 126 | " -3.8447380e-03 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 127 | " [ 3.9585244e-02 -8.3357669e-02 -9.4123268e-01 4.0000000e+00\n", 128 | " -7.9583311e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 129 | " [ 8.0520153e-02 -2.9333552e-02 1.7612720e-01 4.0000000e+00\n", 130 | " 5.6848335e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 131 | " [ 8.3218820e-02 -7.4690364e-02 1.4817381e+00 4.0000000e+00\n", 132 | " 4.3329239e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 133 | " [ 5.2080988e-03 4.5170397e-03 1.4738545e+00 4.0000000e+00\n", 134 | " 6.0955667e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 135 | " [-4.5549396e-02 1.7029690e-02 -1.4121037e+00 4.0000000e+00\n", 136 | " -1.0720904e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n", 137 | " [ 5.7741486e-02 8.4876612e-02 5.8971786e-01 4.0000000e+00\n", 138 | " -7.8450203e-02 0.0000000e+00 0.0000000e+00 0.0000000e+00]]\n", 139 | "Agent state looks like: \n", 140 | "[-0.01467304 -0.01468306 -0.5208206 4. -0.79952097 0.\n", 141 | " 0. 0. ]\n", 142 | "Is there a visual observation ? 
False\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "# Get the state of the agents\n", 148 | "step_result = env.get_step_result(group_name)\n", 149 | "\n", 150 | "# Examine the number of observations per Agent\n", 151 | "print(\"Number of observations : \", len(group_spec.observation_shapes))\n", 152 | "\n", 153 | "# Examine the state space for the first observation for all agents\n", 154 | "print(\"Agent state looks like: \\n{}\".format(step_result.obs[0]))\n", 155 | "\n", 156 | "# Examine the state space for the first observation for the first agent\n", 157 | "print(\"Agent state looks like: \\n{}\".format(step_result.obs[0][0]))\n", 158 | "\n", 159 | "# Is there a visual observation ?\n", 160 | "vis_obs = any([len(shape) == 3 for shape in group_spec.observation_shapes])\n", 161 | "print(\"Is there a visual observation ?\", vis_obs)\n", 162 | "\n", 163 | "# Examine the visual observations\n", 164 | "if vis_obs:\n", 165 | " vis_obs_index = next(i for i,v in enumerate(group_spec.observation_shapes) if len(v) == 3)\n", 166 | " print(\"Agent visual observation look like:\")\n", 167 | " obs = step_result.obs[vis_obs_index]\n", 168 | " plt.imshow(obs[0,:,:,:])" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 9, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "Total reward this episode: 1.1000000312924385\n", 181 | "Total reward this episode: 0.6000000238418579\n", 182 | "Total reward this episode: 0.6000000238418579\n", 183 | "Total reward this episode: 2.300000049173832\n", 184 | "Total reward this episode: 1.1000000312924385\n", 185 | "Total reward this episode: 2.0000000447034836\n", 186 | "Total reward this episode: 1.1000000312924385\n", 187 | "Total reward this episode: 0.6000000238418579\n", 188 | "Total reward this episode: 1.4901161193847656e-08\n", 189 | "Total reward this episode: 1.2000000327825546\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "for episode in range(10):\n", 195 | " env.reset()\n", 196 | " step_result = env.get_step_result(group_name)\n", 197 | " done = False\n", 198 | " episode_rewards = 0\n", 199 | " while not done:\n", 200 | " action_size = group_spec.action_size\n", 201 | " if group_spec.is_action_continuous():\n", 202 | " action = np.random.randn(step_result.n_agents(), group_spec.action_size)\n", 203 | " \n", 204 | " if group_spec.is_action_discrete():\n", 205 | " branch_size = group_spec.discrete_action_branches\n", 206 | " action = np.column_stack([np.random.randint(0, branch_size[i], size=(step_result.n_agents())) for i in range(len(branch_size))])\n", 207 | " env.set_actions(group_name, action)\n", 208 | " env.step()\n", 209 | " step_result = env.get_step_result(group_name)\n", 210 | " episode_rewards += step_result.reward[0]\n", 211 | " done = step_result.done[0]\n", 212 | " print(\"Total reward this episode: {}\".format(episode_rewards))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "# Interaction test with custom environment" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stderr", 229 | "output_type": "stream", 230 | "text": [ 231 | "INFO:mlagents_envs:Listening on port 5004. 
Start training by pressing the Play button in the Unity Editor.\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "import matplotlib.pyplot as plt\n", 237 | "import numpy as np\n", 238 | "import sys\n", 239 | "\n", 240 | "from mlagents_envs.environment import UnityEnvironment\n", 241 | "from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel\n", 242 | "\n", 243 | "engine_configuration_channel = EngineConfigurationChannel()\n", 244 | "env = UnityEnvironment(base_port = 5004, side_channels = [engine_configuration_channel])" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.6.4" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 2 276 | } 277 | -------------------------------------------------------------------------------- /8. Unity ML agents tests/rolling_a_ball/rollingaball1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/8. Unity ML agents tests/rolling_a_ball/rollingaball1.png -------------------------------------------------------------------------------- /9. Discrete optimization with RL/README.md: -------------------------------------------------------------------------------- 1 | # Discrete Optimization with RL 2 | 3 | > Comparison between classical techniques and RL in discrete optimization.
4 | > These experiments are run alongside the MOOC on Discrete Optimization by the University of Melbourne 5 | 6 | 7 | ## Folder structure 8 | ``` 9 | - lessons - personal notes on discrete optimization, mostly from the Coursera MOOC 10 | - knapsack_problem - experiments on the knapsack problem, from classical optimization to RL 11 | ``` 12 | 13 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/Solver.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | import java.util.List; 3 | import java.util.ArrayList; 4 | 5 | /** 6 | * The class Solver is an implementation of a greedy algorithm to solve the knapsack problem. 7 | * 8 | */ 9 | public class Solver { 10 | 11 | /** 12 | * The main method 13 | */ 14 | public static void main(String[] args) { 15 | try { 16 | solve(args); 17 | } catch (IOException e) { 18 | e.printStackTrace(); 19 | } 20 | } 21 | 22 | /** 23 | * Read the instance, solve it, and print the solution in the standard output 24 | */ 25 | public static void solve(String[] args) throws IOException { 26 | String fileName = null; 27 | 28 | // get the temp file name 29 | for(String arg : args){ 30 | if(arg.startsWith("-file=")){ 31 | fileName = arg.substring(6); 32 | } 33 | } 34 | if(fileName == null) 35 | return; 36 | 37 | // read the lines out of the file 38 | List<String> lines = new ArrayList<String>(); 39 | 40 | BufferedReader input = new BufferedReader(new FileReader(fileName)); 41 | try { 42 | String line = null; 43 | while (( line = input.readLine()) != null){ 44 | lines.add(line); 45 | } 46 | } 47 | finally { 48 | input.close(); 49 | } 50 | 51 | 52 | // parse the data in the file 53 | String[] firstLine = lines.get(0).split("\\s+"); 54 | int items = Integer.parseInt(firstLine[0]); 55 | int capacity = Integer.parseInt(firstLine[1]); 56 | 57 | int[] values = new int[items]; 58 | int[] weights = new int[items]; 59 | 60 | for(int i=1; i < items+1; i++){ 61 | String line = lines.get(i); 62 | String[] parts = line.split("\\s+"); 63 | 64 | values[i-1] = Integer.parseInt(parts[0]); 65 | weights[i-1] = Integer.parseInt(parts[1]); 66 | } 67 | 68 | // a trivial greedy algorithm for filling the knapsack 69 | // it takes items in-order until the knapsack is full 70 | int value = 0; 71 | int weight = 0; 72 | int[] taken = new int[items]; 73 | 74 | for(int i=0; i < items; i++){ 75 | if(weight + weights[i] <= capacity){ 76 | taken[i] = 1; 77 | value += values[i]; 78 | weight += weights[i]; 79 | } else { 80 | taken[i] = 0; 81 | } 82 | } 83 | 84 | // prepare the solution in the specified output format 85 | System.out.println(value+" 0"); 86 | for(int i=0; i < items; i++){ 87 | System.out.print(taken[i]+" "); 88 | } 89 | System.out.println(""); 90 | } 91 | } -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/_coursera: -------------------------------------------------------------------------------- 1 | _le-pVv_EeasJA5dVmWj2w 2 | Knapsack 3 | awPVV, ./data/ks_30_0, solver.py, Knapsack Problem 1 4 | hHYWS, ./data/ks_50_0, solver.py, Knapsack Problem 2 5 | JwWnx, ./data/ks_200_0, solver.py, Knapsack Problem 3 6 | Z2tMt, ./data/ks_400_0, solver.py, Knapsack Problem 4 7 | PUIxa, ./data/ks_1000_0, solver.py, Knapsack Problem 5 8 | AKXWc, ./data/ks_10000_0, solver.py, Knapsack Problem 6 9 | -------------------------------------------------------------------------------- /9. 
Discrete optimization with RL/knapsack_problem/knapsack/data/ks_100_0: -------------------------------------------------------------------------------- 1 | 100 100000 2 | 90000 90001 3 | 89750 89751 4 | 10001 10002 5 | 89500 89501 6 | 10252 10254 7 | 89250 89251 8 | 10503 10506 9 | 89000 89001 10 | 10754 10758 11 | 88750 88751 12 | 11005 11010 13 | 88500 88501 14 | 11256 11262 15 | 88250 88251 16 | 11507 11514 17 | 88000 88001 18 | 11758 11766 19 | 87750 87751 20 | 12009 12018 21 | 87500 87501 22 | 12260 12270 23 | 87250 87251 24 | 12511 12522 25 | 87000 87001 26 | 12762 12774 27 | 86750 86751 28 | 13013 13026 29 | 86500 86501 30 | 13264 13278 31 | 86250 86251 32 | 13515 13530 33 | 86000 86001 34 | 13766 13782 35 | 85750 85751 36 | 14017 14034 37 | 85500 85501 38 | 14268 14286 39 | 85250 85251 40 | 14519 14538 41 | 85000 85001 42 | 14770 14790 43 | 84750 84751 44 | 15021 15042 45 | 84500 84501 46 | 15272 15294 47 | 84250 84251 48 | 15523 15546 49 | 84000 84001 50 | 15774 15798 51 | 83750 83751 52 | 16025 16050 53 | 83500 83501 54 | 16276 16302 55 | 83250 83251 56 | 16527 16554 57 | 83000 83001 58 | 16778 16806 59 | 82750 82751 60 | 17029 17058 61 | 82500 82501 62 | 17280 17310 63 | 82250 82251 64 | 17531 17562 65 | 82000 82001 66 | 17782 17814 67 | 81750 81751 68 | 18033 18066 69 | 81500 81501 70 | 18284 18318 71 | 81250 81251 72 | 18535 18570 73 | 81000 81001 74 | 18786 18822 75 | 80750 80751 76 | 19037 19074 77 | 80500 80501 78 | 19288 19326 79 | 80250 80251 80 | 19539 19578 81 | 80000 80001 82 | 19790 19830 83 | 79750 79751 84 | 20041 20082 85 | 79500 79501 86 | 20292 20334 87 | 79250 79251 88 | 20543 20586 89 | 79000 79001 90 | 20794 20838 91 | 78750 78751 92 | 21045 21090 93 | 78500 78501 94 | 21296 21342 95 | 78250 78251 96 | 21547 21594 97 | 78000 78001 98 | 21798 21846 99 | 77750 77751 100 | 22049 22098 101 | 77500 77501 102 | -------------------------------------------------------------------------------- /9. 
Discrete optimization with RL/knapsack_problem/knapsack/data/ks_100_1: -------------------------------------------------------------------------------- 1 | 100 3190802 2 | 1491 3882 3 | 399 1298 4 | 77 654 5 | 969 2638 6 | 8485 20670 7 | 55 610 8 | 1904 4908 9 | 703 2106 10 | 657 2014 11 | 932 2564 12 | 1201 3302 13 | 1697 4494 14 | 462 1424 15 | 1201 3302 16 | 111632 267364 17 | 9044 21988 18 | 147380 352660 19 | 31852 76604 20 | 9044 21988 21 | 9300 22700 22 | 8660 21020 23 | 174684 418068 24 | 19844 47788 25 | 9044 21988 26 | 1635 4370 27 | 62788 150476 28 | 6932 16964 29 | 6308 15516 30 | 50 600 31 | 4600 11300 32 | 565204 1351508 33 | 7463 18226 34 | 2988 7476 35 | 9044 21988 36 | 9044 21988 37 | 4040 9980 38 | 137732 329764 39 | 7150 17400 40 | 9300 22700 41 | 177 854 42 | 372 1244 43 | 499 1498 44 | 15108 36516 45 | 11108 26916 46 | 2468 6236 47 | 1133 3166 48 | 1490 3880 49 | 865 2430 50 | 2468 6236 51 | 2468 6236 52 | 5974 14648 53 | 5972 14644 54 | 9532 23164 55 | 1872 4844 56 | 3964 9828 57 | 2799 7098 58 | 527708 1261916 59 | 7212 17724 60 | 3002 7504 61 | 21004 50708 62 | 47728 114556 63 | 565204 1351508 64 | 100600 240900 65 | 118920 284740 66 | 2822 7144 67 | 612 1924 68 | 6324 15548 69 | 9508 23116 70 | 9268 22636 71 | 11636 28172 72 | 210708 504116 73 | 2176944 5204588 74 | 930 2560 75 | 4481 11062 76 | 50 600 77 | 112 724 78 | 14434 34968 79 | 0 500 80 | 248 996 81 | 48 596 82 | 820 2340 83 | 278 1056 84 | 643 1986 85 | 1413 3726 86 | 1408 3716 87 | 0 500 88 | 2581 6662 89 | 287 1074 90 | 2040 5180 91 | 289 1078 92 | 1380 3660 93 | 372 1244 94 | 0 500 95 | 472 1444 96 | 360 1220 97 | 0 500 98 | 622 1944 99 | 3504 8708 100 | 5924 14548 101 | 2784 7068 102 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_100_2: -------------------------------------------------------------------------------- 1 | 100 10000 2 | 339 342 3 | 1629 1514 4 | 697 696 5 | 1299 1433 6 | 1613 1762 7 | 36 40 8 | 1737 1635 9 | 473 442 10 | 1859 1899 11 | 2055 1960 12 | 362 378 13 | 1104 1177 14 | 1880 1970 15 | 1349 1434 16 | 1545 1691 17 | 132 139 18 | 341 371 19 | 1430 1350 20 | 1878 1775 21 | 1870 1980 22 | 1536 1651 23 | 818 814 24 | 289 282 25 | 1690 1573 26 | 1437 1587 27 | 310 302 28 | 53 56 29 | 720 726 30 | 1707 1820 31 | 258 269 32 | 1842 1680 33 | 757 842 34 | 1642 1730 35 | 1149 1243 36 | 1970 1794 37 | 749 775 38 | 1904 1810 39 | 2 3 40 | 967 970 41 | 1310 1261 42 | 1004 997 43 | 1295 1192 44 | 1056 1036 45 | 51 52 46 | 1320 1453 47 | 1580 1673 48 | 480 440 49 | 604 624 50 | 1766 1813 51 | 1198 1326 52 | 1762 1637 53 | 2046 1902 54 | 315 323 55 | 714 746 56 | 434 471 57 | 1461 1366 58 | 1652 1511 59 | 1876 1785 60 | 906 1002 61 | 1483 1560 62 | 1355 1403 63 | 510 513 64 | 2114 1958 65 | 1479 1505 66 | 1618 1538 67 | 1472 1378 68 | 310 315 69 | 1478 1493 70 | 970 1066 71 | 43 40 72 | 1231 1172 73 | 1792 1972 74 | 870 956 75 | 1484 1541 76 | 1049 1014 77 | 56 55 78 | 814 793 79 | 978 985 80 | 1215 1311 81 | 720 737 82 | 210 204 83 | 460 492 84 | 1798 1961 85 | 1944 1952 86 | 208 204 87 | 1836 1872 88 | 882 806 89 | 239 234 90 | 141 136 91 | 49 49 92 | 1352 1363 93 | 915 883 94 | 1318 1259 95 | 72 70 96 | 937 886 97 | 1783 1843 98 | 1253 1319 99 | 1268 1375 100 | 1144 1234 101 | 878 818 102 | -------------------------------------------------------------------------------- /9. 
Discrete optimization with RL/knapsack_problem/knapsack/data/ks_106_0: -------------------------------------------------------------------------------- 1 | 106 106925262 2 | 45276 45276 3 | 90552 90552 4 | 181104 181104 5 | 362208 362208 6 | 724416 724416 7 | 1448832 1448832 8 | 2897664 2897664 9 | 5795328 5795328 10 | 11590656 11590656 11 | 23181312 23181312 12 | 46362624 46362624 13 | 92725248 92725248 14 | 70778 70778 15 | 141556 141556 16 | 283112 283112 17 | 566224 566224 18 | 1132448 1132448 19 | 2264896 2264896 20 | 4529792 4529792 21 | 9059584 9059584 22 | 18119168 18119168 23 | 36238336 36238336 24 | 72476672 72476672 25 | 86911 86911 26 | 173822 173822 27 | 347644 347644 28 | 695288 695288 29 | 1390576 1390576 30 | 2781152 2781152 31 | 5562304 5562304 32 | 11124608 11124608 33 | 22249216 22249216 34 | 44498432 44498432 35 | 88996864 88996864 36 | 92634 92634 37 | 185268 185268 38 | 370536 370536 39 | 741072 741072 40 | 1482144 1482144 41 | 2964288 2964288 42 | 5928576 5928576 43 | 11857152 11857152 44 | 23714304 23714304 45 | 47428608 47428608 46 | 94857216 94857216 47 | 97839 97839 48 | 195678 195678 49 | 391356 391356 50 | 782712 782712 51 | 1565424 1565424 52 | 3130848 3130848 53 | 6261696 6261696 54 | 12523392 12523392 55 | 25046784 25046784 56 | 50093568 50093568 57 | 100187136 100187136 58 | 125941 125941 59 | 251882 251882 60 | 503764 503764 61 | 1007528 1007528 62 | 2015056 2015056 63 | 4030112 4030112 64 | 8060224 8060224 65 | 16120448 16120448 66 | 32240896 32240896 67 | 64481792 64481792 68 | 134269 134269 69 | 268538 268538 70 | 537076 537076 71 | 1074152 1074152 72 | 2148304 2148304 73 | 4296608 4296608 74 | 8593216 8593216 75 | 17186432 17186432 76 | 34372864 34372864 77 | 68745728 68745728 78 | 141033 141033 79 | 282066 282066 80 | 564132 564132 81 | 1128264 1128264 82 | 2256528 2256528 83 | 4513056 4513056 84 | 9026112 9026112 85 | 18052224 18052224 86 | 36104448 36104448 87 | 72208896 72208896 88 | 147279 147279 89 | 294558 294558 90 | 589116 589116 91 | 1178232 1178232 92 | 2356464 2356464 93 | 4712928 4712928 94 | 9425856 9425856 95 | 18851712 18851712 96 | 37703424 37703424 97 | 75406848 75406848 98 | 153525 153525 99 | 307050 307050 100 | 614100 614100 101 | 1228200 1228200 102 | 2456400 2456400 103 | 4912800 4912800 104 | 9825600 9825600 105 | 19651200 19651200 106 | 39302400 39302400 107 | 78604800 78604800 108 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_19_0: -------------------------------------------------------------------------------- 1 | 19 31181 2 | 1945 4990 3 | 321 1142 4 | 2945 7390 5 | 4136 10372 6 | 1107 3114 7 | 1022 2744 8 | 1101 3102 9 | 2890 7280 10 | 962 2624 11 | 1060 3020 12 | 805 2310 13 | 689 2078 14 | 1513 3926 15 | 3878 9656 16 | 13504 32708 17 | 1865 4830 18 | 667 2034 19 | 1833 4766 20 | 16553 40006 21 | -------------------------------------------------------------------------------- /9. 
Discrete optimization with RL/knapsack_problem/knapsack/data/ks_200_0: -------------------------------------------------------------------------------- 1 | 200 100000 2 | 90001 90000 3 | 89751 89750 4 | 10002 10001 5 | 89501 89500 6 | 10254 10252 7 | 89251 89250 8 | 10506 10503 9 | 89001 89000 10 | 10758 10754 11 | 88751 88750 12 | 11010 11005 13 | 88501 88500 14 | 11262 11256 15 | 88251 88250 16 | 11514 11507 17 | 88001 88000 18 | 11766 11758 19 | 87751 87750 20 | 12018 12009 21 | 87501 87500 22 | 12270 12260 23 | 87251 87250 24 | 12522 12511 25 | 87001 87000 26 | 12774 12762 27 | 86751 86750 28 | 13026 13013 29 | 86501 86500 30 | 13278 13264 31 | 86251 86250 32 | 13530 13515 33 | 86001 86000 34 | 13782 13766 35 | 85751 85750 36 | 14034 14017 37 | 85501 85500 38 | 14286 14268 39 | 85251 85250 40 | 14538 14519 41 | 85001 85000 42 | 14790 14770 43 | 84751 84750 44 | 15042 15021 45 | 84501 84500 46 | 15294 15272 47 | 84251 84250 48 | 15546 15523 49 | 84001 84000 50 | 15798 15774 51 | 83751 83750 52 | 16050 16025 53 | 83501 83500 54 | 16302 16276 55 | 83251 83250 56 | 16554 16527 57 | 83001 83000 58 | 16806 16778 59 | 82751 82750 60 | 17058 17029 61 | 82501 82500 62 | 17310 17280 63 | 82251 82250 64 | 17562 17531 65 | 82001 82000 66 | 17814 17782 67 | 81751 81750 68 | 18066 18033 69 | 81501 81500 70 | 18318 18284 71 | 81251 81250 72 | 18570 18535 73 | 81001 81000 74 | 18822 18786 75 | 80751 80750 76 | 19074 19037 77 | 80501 80500 78 | 19326 19288 79 | 80251 80250 80 | 19578 19539 81 | 80001 80000 82 | 19830 19790 83 | 79751 79750 84 | 20082 20041 85 | 79501 79500 86 | 20334 20292 87 | 79251 79250 88 | 20586 20543 89 | 79001 79000 90 | 20838 20794 91 | 78751 78750 92 | 21090 21045 93 | 78501 78500 94 | 21342 21296 95 | 78251 78250 96 | 21594 21547 97 | 78001 78000 98 | 21846 21798 99 | 77751 77750 100 | 22098 22049 101 | 77501 77500 102 | 22350 22300 103 | 77251 77250 104 | 22602 22551 105 | 77001 77000 106 | 22854 22802 107 | 76751 76750 108 | 23106 23053 109 | 76501 76500 110 | 23358 23304 111 | 76251 76250 112 | 23610 23555 113 | 76001 76000 114 | 23862 23806 115 | 75751 75750 116 | 24114 24057 117 | 75501 75500 118 | 24366 24308 119 | 75251 75250 120 | 24618 24559 121 | 75001 75000 122 | 24870 24810 123 | 74751 74750 124 | 25122 25061 125 | 74501 74500 126 | 25374 25312 127 | 74251 74250 128 | 25626 25563 129 | 74001 74000 130 | 25878 25814 131 | 73751 73750 132 | 26130 26065 133 | 73501 73500 134 | 26382 26316 135 | 73251 73250 136 | 26634 26567 137 | 73001 73000 138 | 26886 26818 139 | 72751 72750 140 | 27138 27069 141 | 72501 72500 142 | 27390 27320 143 | 72251 72250 144 | 27642 27571 145 | 72001 72000 146 | 27894 27822 147 | 71751 71750 148 | 28146 28073 149 | 71501 71500 150 | 28398 28324 151 | 71251 71250 152 | 28650 28575 153 | 71001 71000 154 | 28902 28826 155 | 70751 70750 156 | 29154 29077 157 | 70501 70500 158 | 29406 29328 159 | 70251 70250 160 | 29658 29579 161 | 70001 70000 162 | 29910 29830 163 | 69751 69750 164 | 30162 30081 165 | 69501 69500 166 | 30414 30332 167 | 69251 69250 168 | 30666 30583 169 | 69001 69000 170 | 30918 30834 171 | 68751 68750 172 | 31170 31085 173 | 68501 68500 174 | 31422 31336 175 | 68251 68250 176 | 31674 31587 177 | 68001 68000 178 | 31926 31838 179 | 67751 67750 180 | 32178 32089 181 | 67501 67500 182 | 32430 32340 183 | 67251 67250 184 | 32682 32591 185 | 67001 67000 186 | 32934 32842 187 | 66751 66750 188 | 33186 33093 189 | 66501 66500 190 | 33438 33344 191 | 66251 66250 192 | 33690 33595 193 | 66001 66000 194 | 33942 33846 195 | 65751 65750 
196 | 34194 34097 197 | 65501 65500 198 | 34446 34348 199 | 65251 65250 200 | 34698 34599 201 | 68451 68450 202 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_200_1: -------------------------------------------------------------------------------- 1 | 200 2640230 2 | 31860 76620 3 | 11884 28868 4 | 10492 25484 5 | 901 2502 6 | 43580 104660 7 | 9004 21908 8 | 6700 16500 9 | 29940 71980 10 | 7484 18268 11 | 5932 14564 12 | 7900 19300 13 | 6564 16028 14 | 6596 16092 15 | 8172 19844 16 | 5324 13148 17 | 8436 20572 18 | 7332 17964 19 | 6972 17044 20 | 7668 18636 21 | 6524 15948 22 | 6244 15388 23 | 635 1970 24 | 5396 13292 25 | 13596 32892 26 | 51188 122676 27 | 13684 33068 28 | 8596 20892 29 | 156840 375380 30 | 7900 19300 31 | 6460 15820 32 | 14132 34164 33 | 4980 12260 34 | 5216 12932 35 | 6276 15452 36 | 701 2102 37 | 3084 7868 38 | 6924 16948 39 | 5500 13500 40 | 3148 7996 41 | 47844 114788 42 | 226844 542788 43 | 25748 61996 44 | 7012 17124 45 | 3440 8580 46 | 15580 37660 47 | 314 1128 48 | 2852 7204 49 | 15500 37500 50 | 9348 22796 51 | 17768 42836 52 | 16396 39692 53 | 16540 39980 54 | 395124 944948 55 | 10196 24692 56 | 6652 16204 57 | 4848 11996 58 | 74372 178244 59 | 4556 11212 60 | 4900 12100 61 | 3508 8716 62 | 3820 9540 63 | 5460 13420 64 | 16564 40028 65 | 3896 9692 66 | 3832 9564 67 | 9012 21924 68 | 4428 10956 69 | 57796 138492 70 | 12052 29204 71 | 7052 17204 72 | 85864 205628 73 | 5068 12436 74 | 10484 25468 75 | 4516 11132 76 | 3620 9140 77 | 18052 43604 78 | 21 542 79 | 15804 38108 80 | 19020 45940 81 | 170844 408788 82 | 3732 9364 83 | 2920 7340 84 | 4120 10340 85 | 6828 16756 86 | 26252 63204 87 | 11676 28252 88 | 19916 47932 89 | 65488 156876 90 | 7172 17644 91 | 3772 9444 92 | 132868 318036 93 | 8332 20364 94 | 5308 13116 95 | 3780 9460 96 | 5208 12916 97 | 56788 136076 98 | 7172 17644 99 | 7868 19236 100 | 31412 75524 101 | 9252 22604 102 | 12276 29652 103 | 3712 9324 104 | 4516 11132 105 | 105876 253452 106 | 20084 48468 107 | 11492 27884 108 | 49092 117684 109 | 83452 199804 110 | 71372 171044 111 | 66572 159644 112 | 25268 60836 113 | 64292 154084 114 | 21228 51156 115 | 16812 40524 116 | 19260 46420 117 | 7740 18980 118 | 5632 13964 119 | 3256 8212 120 | 15580 37660 121 | 4824 11948 122 | 59700 143100 123 | 14500 35100 124 | 7208 17716 125 | 6028 14756 126 | 75716 181332 127 | 22364 53828 128 | 7636 18572 129 | 6444 15788 130 | 5192 12884 131 | 7388 18076 132 | 33156 79612 133 | 3032 7564 134 | 6628 16156 135 | 7036 17172 136 | 3200 8100 137 | 7300 17900 138 | 4452 11004 139 | 26364 63428 140 | 14036 33972 141 | 16932 40964 142 | 5788 14276 143 | 70476 168852 144 | 4552 11204 145 | 33980 81660 146 | 19300 46500 147 | 39628 95156 148 | 4484 11068 149 | 55044 131988 150 | 574 1848 151 | 29644 71188 152 | 9460 23020 153 | 106284 254468 154 | 304 1108 155 | 3580 8860 156 | 6308 15516 157 | 10492 25484 158 | 12820 31140 159 | 14436 34972 160 | 5044 12388 161 | 1155 3210 162 | 12468 30236 163 | 4380 10860 164 | 9876 24052 165 | 8752 21404 166 | 8676 21052 167 | 42848 102796 168 | 22844 54988 169 | 6244 15388 170 | 314 1128 171 | 314 1128 172 | 314 1128 173 | 314 1128 174 | 314 1128 175 | 314 1128 176 | 387480 926660 177 | 314 1128 178 | 314 1128 179 | 314 1128 180 | 314 1128 181 | 314 1128 182 | 15996 38692 183 | 8372 20444 184 | 65488 156876 185 | 304 1108 186 | 4756 11812 187 | 5012 12324 188 | 304 1108 189 | 314 1128 190 | 314 1128 191 | 314 1128 192 | 314 1128 193 | 
314 1128 194 | 314 1128 195 | 314 1128 196 | 304 1108 197 | 1208 3316 198 | 47728 114556 199 | 314 1128 200 | 314 1128 201 | 314 1128 202 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_300_0: -------------------------------------------------------------------------------- 1 | 300 4040184 2 | 31860 76620 3 | 11884 28868 4 | 10492 25484 5 | 901 2502 6 | 43580 104660 7 | 9004 21908 8 | 6700 16500 9 | 29940 71980 10 | 7484 18268 11 | 5932 14564 12 | 7900 19300 13 | 6564 16028 14 | 6596 16092 15 | 8172 19844 16 | 5324 13148 17 | 8436 20572 18 | 7332 17964 19 | 6972 17044 20 | 7668 18636 21 | 6524 15948 22 | 6244 15388 23 | 635 1970 24 | 5396 13292 25 | 13596 32892 26 | 51188 122676 27 | 13684 33068 28 | 8596 20892 29 | 156840 375380 30 | 7900 19300 31 | 6460 15820 32 | 14132 34164 33 | 4980 12260 34 | 5216 12932 35 | 6276 15452 36 | 701 2102 37 | 3084 7868 38 | 6924 16948 39 | 5500 13500 40 | 3148 7996 41 | 47844 114788 42 | 226844 542788 43 | 25748 61996 44 | 7012 17124 45 | 3440 8580 46 | 15580 37660 47 | 314 1128 48 | 2852 7204 49 | 15500 37500 50 | 9348 22796 51 | 17768 42836 52 | 16396 39692 53 | 16540 39980 54 | 395124 944948 55 | 10196 24692 56 | 6652 16204 57 | 4848 11996 58 | 74372 178244 59 | 4556 11212 60 | 4900 12100 61 | 3508 8716 62 | 3820 9540 63 | 5460 13420 64 | 16564 40028 65 | 3896 9692 66 | 3832 9564 67 | 9012 21924 68 | 4428 10956 69 | 57796 138492 70 | 12052 29204 71 | 7052 17204 72 | 85864 205628 73 | 5068 12436 74 | 10484 25468 75 | 4516 11132 76 | 3620 9140 77 | 18052 43604 78 | 21 542 79 | 15804 38108 80 | 19020 45940 81 | 170844 408788 82 | 3732 9364 83 | 2920 7340 84 | 4120 10340 85 | 6828 16756 86 | 26252 63204 87 | 11676 28252 88 | 19916 47932 89 | 65488 156876 90 | 7172 17644 91 | 3772 9444 92 | 132868 318036 93 | 8332 20364 94 | 5308 13116 95 | 3780 9460 96 | 5208 12916 97 | 56788 136076 98 | 7172 17644 99 | 7868 19236 100 | 31412 75524 101 | 9252 22604 102 | 12276 29652 103 | 3712 9324 104 | 4516 11132 105 | 105876 253452 106 | 20084 48468 107 | 11492 27884 108 | 49092 117684 109 | 83452 199804 110 | 71372 171044 111 | 66572 159644 112 | 25268 60836 113 | 64292 154084 114 | 21228 51156 115 | 16812 40524 116 | 19260 46420 117 | 7740 18980 118 | 5632 13964 119 | 3256 8212 120 | 15580 37660 121 | 4824 11948 122 | 59700 143100 123 | 14500 35100 124 | 7208 17716 125 | 6028 14756 126 | 75716 181332 127 | 22364 53828 128 | 7636 18572 129 | 6444 15788 130 | 5192 12884 131 | 7388 18076 132 | 33156 79612 133 | 3032 7564 134 | 6628 16156 135 | 7036 17172 136 | 3200 8100 137 | 7300 17900 138 | 4452 11004 139 | 26364 63428 140 | 14036 33972 141 | 16932 40964 142 | 5788 14276 143 | 70476 168852 144 | 4552 11204 145 | 33980 81660 146 | 19300 46500 147 | 39628 95156 148 | 4484 11068 149 | 55044 131988 150 | 574 1848 151 | 29644 71188 152 | 9460 23020 153 | 106284 254468 154 | 304 1108 155 | 3580 8860 156 | 6308 15516 157 | 10492 25484 158 | 12820 31140 159 | 14436 34972 160 | 5044 12388 161 | 1155 3210 162 | 12468 30236 163 | 4380 10860 164 | 9876 24052 165 | 8752 21404 166 | 8676 21052 167 | 42848 102796 168 | 22844 54988 169 | 6244 15388 170 | 314 1128 171 | 314 1128 172 | 314 1128 173 | 314 1128 174 | 314 1128 175 | 314 1128 176 | 387480 926660 177 | 314 1128 178 | 314 1128 179 | 314 1128 180 | 314 1128 181 | 314 1128 182 | 15996 38692 183 | 8372 20444 184 | 65488 156876 185 | 304 1108 186 | 4756 11812 187 | 5012 12324 188 | 304 1108 189 | 314 1128 190 | 314 1128 191 | 314 
1128 192 | 314 1128 193 | 314 1128 194 | 314 1128 195 | 314 1128 196 | 304 1108 197 | 1208 3316 198 | 47728 114556 199 | 314 1128 200 | 314 1128 201 | 314 1128 202 | 314 1128 203 | 314 1128 204 | 314 1128 205 | 104036 249172 206 | 5248 12996 207 | 312 1124 208 | 24468 58836 209 | 7716 18932 210 | 30180 72460 211 | 4824 11948 212 | 1120 3140 213 | 11496 27892 214 | 4916 12132 215 | 14428 34956 216 | 24948 59996 217 | 41100 98700 218 | 28692 69084 219 | 826 2352 220 | 3073 7846 221 | 7684 18868 222 | 5604 13708 223 | 17188 41476 224 | 34828 83756 225 | 7540 18380 226 | 8004 19508 227 | 2648 6796 228 | 5124 12748 229 | 3096 7892 230 | 166516 398532 231 | 13756 33212 232 | 9980 24260 233 | 15980 38660 234 | 9056 22012 235 | 5052 12404 236 | 8212 20124 237 | 11164 27028 238 | 13036 31572 239 | 23596 56892 240 | 2028 5156 241 | 7584 18468 242 | 5772 14244 243 | 4124 10348 244 | 5368 13236 245 | 4364 10828 246 | 5604 13708 247 | 8500 20700 248 | 7676 18652 249 | 8636 20972 250 | 4588 11276 251 | 4152 10404 252 | 4860 12020 253 | 5484 13468 254 | 8636 20972 255 | 5140 12780 256 | 236380 565460 257 | 116500 278900 258 | 36480 87660 259 | 16968 41036 260 | 5232 12964 261 | 13280 32060 262 | 138032 330364 263 | 9044 21988 264 | 22028 53156 265 | 4632 11564 266 | 13196 31892 267 | 65404 156708 268 | 28940 69580 269 | 865 2430 270 | 45988 110276 271 | 670 2040 272 | 4820 11940 273 | 41356 99212 274 | 39844 95588 275 | 897 2494 276 | 4028 9956 277 | 7924 19348 278 | 47756 114612 279 | 47036 112772 280 | 25908 62316 281 | 4516 11132 282 | 29460 70820 283 | 7964 19428 284 | 16964 41028 285 | 22196 53492 286 | 68140 163380 287 | 80924 193948 288 | 63700 152700 289 | 20860 50220 290 | 1682 4464 291 | 16804 40508 292 | 3195 8090 293 | 60348 144596 294 | 1901 4902 295 | 67468 161636 296 | 4772 11844 297 | 11196 27092 298 | 25836 62172 299 | 49676 119252 300 | 6188 15276 301 | 15588 37676 302 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_30_0: -------------------------------------------------------------------------------- 1 | 30 100000 2 | 90000 90001 3 | 89750 89751 4 | 10001 10002 5 | 89500 89501 6 | 10252 10254 7 | 89250 89251 8 | 10503 10506 9 | 89000 89001 10 | 10754 10758 11 | 88750 88751 12 | 11005 11010 13 | 88500 88501 14 | 11256 11262 15 | 88250 88251 16 | 11507 11514 17 | 88000 88001 18 | 11758 11766 19 | 87750 87751 20 | 12009 12018 21 | 87500 87501 22 | 12260 12270 23 | 87250 87251 24 | 12511 12522 25 | 87000 87001 26 | 12762 12774 27 | 86750 86751 28 | 13013 13026 29 | 86500 86501 30 | 13264 13278 31 | 86250 86251 32 | -------------------------------------------------------------------------------- /9. 
Discrete optimization with RL/knapsack_problem/knapsack/data/ks_400_0: -------------------------------------------------------------------------------- 1 | 400 9486367 2 | 31860 76620 3 | 11884 28868 4 | 10492 25484 5 | 901 2502 6 | 43580 104660 7 | 9004 21908 8 | 6700 16500 9 | 29940 71980 10 | 7484 18268 11 | 5932 14564 12 | 7900 19300 13 | 6564 16028 14 | 6596 16092 15 | 8172 19844 16 | 5324 13148 17 | 8436 20572 18 | 7332 17964 19 | 6972 17044 20 | 7668 18636 21 | 6524 15948 22 | 6244 15388 23 | 635 1970 24 | 5396 13292 25 | 13596 32892 26 | 51188 122676 27 | 13684 33068 28 | 8596 20892 29 | 156840 375380 30 | 7900 19300 31 | 6460 15820 32 | 14132 34164 33 | 4980 12260 34 | 5216 12932 35 | 6276 15452 36 | 701 2102 37 | 3084 7868 38 | 6924 16948 39 | 5500 13500 40 | 3148 7996 41 | 47844 114788 42 | 226844 542788 43 | 25748 61996 44 | 7012 17124 45 | 3440 8580 46 | 15580 37660 47 | 314 1128 48 | 2852 7204 49 | 15500 37500 50 | 9348 22796 51 | 17768 42836 52 | 16396 39692 53 | 16540 39980 54 | 395124 944948 55 | 10196 24692 56 | 6652 16204 57 | 4848 11996 58 | 74372 178244 59 | 4556 11212 60 | 4900 12100 61 | 3508 8716 62 | 3820 9540 63 | 5460 13420 64 | 16564 40028 65 | 3896 9692 66 | 3832 9564 67 | 9012 21924 68 | 4428 10956 69 | 57796 138492 70 | 12052 29204 71 | 7052 17204 72 | 85864 205628 73 | 5068 12436 74 | 10484 25468 75 | 4516 11132 76 | 3620 9140 77 | 18052 43604 78 | 21 542 79 | 15804 38108 80 | 19020 45940 81 | 170844 408788 82 | 3732 9364 83 | 2920 7340 84 | 4120 10340 85 | 6828 16756 86 | 26252 63204 87 | 11676 28252 88 | 19916 47932 89 | 65488 156876 90 | 7172 17644 91 | 3772 9444 92 | 132868 318036 93 | 8332 20364 94 | 5308 13116 95 | 3780 9460 96 | 5208 12916 97 | 56788 136076 98 | 7172 17644 99 | 7868 19236 100 | 31412 75524 101 | 9252 22604 102 | 12276 29652 103 | 3712 9324 104 | 4516 11132 105 | 105876 253452 106 | 20084 48468 107 | 11492 27884 108 | 49092 117684 109 | 83452 199804 110 | 71372 171044 111 | 66572 159644 112 | 25268 60836 113 | 64292 154084 114 | 21228 51156 115 | 16812 40524 116 | 19260 46420 117 | 7740 18980 118 | 5632 13964 119 | 3256 8212 120 | 15580 37660 121 | 4824 11948 122 | 59700 143100 123 | 14500 35100 124 | 7208 17716 125 | 6028 14756 126 | 75716 181332 127 | 22364 53828 128 | 7636 18572 129 | 6444 15788 130 | 5192 12884 131 | 7388 18076 132 | 33156 79612 133 | 3032 7564 134 | 6628 16156 135 | 7036 17172 136 | 3200 8100 137 | 7300 17900 138 | 4452 11004 139 | 26364 63428 140 | 14036 33972 141 | 16932 40964 142 | 5788 14276 143 | 70476 168852 144 | 4552 11204 145 | 33980 81660 146 | 19300 46500 147 | 39628 95156 148 | 4484 11068 149 | 55044 131988 150 | 574 1848 151 | 29644 71188 152 | 9460 23020 153 | 106284 254468 154 | 304 1108 155 | 3580 8860 156 | 6308 15516 157 | 10492 25484 158 | 12820 31140 159 | 14436 34972 160 | 5044 12388 161 | 1155 3210 162 | 12468 30236 163 | 4380 10860 164 | 9876 24052 165 | 8752 21404 166 | 8676 21052 167 | 42848 102796 168 | 22844 54988 169 | 6244 15388 170 | 314 1128 171 | 314 1128 172 | 314 1128 173 | 314 1128 174 | 314 1128 175 | 314 1128 176 | 387480 926660 177 | 314 1128 178 | 314 1128 179 | 314 1128 180 | 314 1128 181 | 314 1128 182 | 15996 38692 183 | 8372 20444 184 | 65488 156876 185 | 304 1108 186 | 4756 11812 187 | 5012 12324 188 | 304 1108 189 | 314 1128 190 | 314 1128 191 | 314 1128 192 | 314 1128 193 | 314 1128 194 | 314 1128 195 | 314 1128 196 | 304 1108 197 | 1208 3316 198 | 47728 114556 199 | 314 1128 200 | 314 1128 201 | 314 1128 202 | 314 1128 203 | 314 1128 204 | 314 1128 205 | 104036 249172 
206 | 5248 12996 207 | 312 1124 208 | 24468 58836 209 | 7716 18932 210 | 30180 72460 211 | 4824 11948 212 | 1120 3140 213 | 11496 27892 214 | 4916 12132 215 | 14428 34956 216 | 24948 59996 217 | 41100 98700 218 | 28692 69084 219 | 826 2352 220 | 3073 7846 221 | 7684 18868 222 | 5604 13708 223 | 17188 41476 224 | 34828 83756 225 | 7540 18380 226 | 8004 19508 227 | 2648 6796 228 | 5124 12748 229 | 3096 7892 230 | 166516 398532 231 | 13756 33212 232 | 9980 24260 233 | 15980 38660 234 | 9056 22012 235 | 5052 12404 236 | 8212 20124 237 | 11164 27028 238 | 13036 31572 239 | 23596 56892 240 | 2028 5156 241 | 7584 18468 242 | 5772 14244 243 | 4124 10348 244 | 5368 13236 245 | 4364 10828 246 | 5604 13708 247 | 8500 20700 248 | 7676 18652 249 | 8636 20972 250 | 4588 11276 251 | 4152 10404 252 | 4860 12020 253 | 5484 13468 254 | 8636 20972 255 | 5140 12780 256 | 236380 565460 257 | 116500 278900 258 | 36480 87660 259 | 16968 41036 260 | 5232 12964 261 | 13280 32060 262 | 138032 330364 263 | 9044 21988 264 | 22028 53156 265 | 4632 11564 266 | 13196 31892 267 | 65404 156708 268 | 28940 69580 269 | 865 2430 270 | 45988 110276 271 | 670 2040 272 | 4820 11940 273 | 41356 99212 274 | 39844 95588 275 | 897 2494 276 | 4028 9956 277 | 7924 19348 278 | 47756 114612 279 | 47036 112772 280 | 25908 62316 281 | 4516 11132 282 | 29460 70820 283 | 7964 19428 284 | 16964 41028 285 | 22196 53492 286 | 68140 163380 287 | 80924 193948 288 | 63700 152700 289 | 20860 50220 290 | 1682 4464 291 | 16804 40508 292 | 3195 8090 293 | 60348 144596 294 | 1901 4902 295 | 67468 161636 296 | 4772 11844 297 | 11196 27092 298 | 25836 62172 299 | 49676 119252 300 | 6188 15276 301 | 15588 37676 302 | 4412 10924 303 | 26564 63828 304 | 16412 39724 305 | 8108 19716 306 | 6084 14868 307 | 9884 24068 308 | 4224 10548 309 | 14660 35420 310 | 25708 61916 311 | 39228 94156 312 | 40748 97796 313 | 40748 97796 314 | 64276 154052 315 | 114356 273812 316 | 14724 35548 317 | 4540 11180 318 | 11612 28124 319 | 4972 12244 320 | 10060 24420 321 | 14548 35196 322 | 3136 7972 323 | 9132 22164 324 | 5752 14204 325 | 10100 24500 326 | 12172 29444 327 | 24428 58756 328 | 3336 8372 329 | 4356 10812 330 | 8652 21004 331 | 14492 35084 332 | 8796 21492 333 | 6408 15716 334 | 6056 14812 335 | 10124 24548 336 | 387480 926660 337 | 18188 43876 338 | 7732 18964 339 | 9492 23084 340 | 7300 17900 341 | 10052 24404 342 | 19604 47308 343 | 6644 16188 344 | 107364 257028 345 | 91812 219924 346 | 4620 11540 347 | 42848 102796 348 | 33268 79836 349 | 13260 32020 350 | 6564 16028 351 | 6524 15948 352 | 13596 32892 353 | 13596 32892 354 | 47844 114788 355 | 226844 542788 356 | 226844 542788 357 | 226844 542788 358 | 226844 542788 359 | 85864 205628 360 | 170844 408788 361 | 56788 136076 362 | 6628 16156 363 | 10492 25484 364 | 104036 249172 365 | 14428 34956 366 | 14428 34956 367 | 22028 53156 368 | 22028 53156 369 | 22028 53156 370 | 25836 62172 371 | 11612 28124 372 | 11612 28124 373 | 11612 28124 374 | 85872 205644 375 | 1377 3654 376 | 1365820 3265540 377 | 562272 1344644 378 | 1445900 3457100 379 | 501060 1198220 380 | 106224 254348 381 | 492496 1177692 382 | 387824 927548 383 | 151320 362140 384 | 109924 263148 385 | 105696 253092 386 | 96404 230908 387 | 107732 257964 388 | 42140 101180 389 | 102896 246292 390 | 4036 9972 391 | 19616 47332 392 | 100948 241796 393 | 1417728 3389756 394 | 62604 150108 395 | 491820 1176140 396 | 33740 80980 397 | 25216 60732 398 | 111716 267532 399 | 400156 957012 400 | 108800 260500 401 | 1211040 2895580 402 | 
-------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_40_0: -------------------------------------------------------------------------------- 1 | 40 100000 2 | 90001 90000 3 | 89751 89750 4 | 10002 10001 5 | 89501 89500 6 | 10254 10252 7 | 89251 89250 8 | 10506 10503 9 | 89001 89000 10 | 10758 10754 11 | 88751 88750 12 | 11010 11005 13 | 88501 88500 14 | 11262 11256 15 | 88251 88250 16 | 11514 11507 17 | 88001 88000 18 | 11766 11758 19 | 87751 87750 20 | 12018 12009 21 | 87501 87500 22 | 12270 12260 23 | 87251 87250 24 | 12522 12511 25 | 87001 87000 26 | 12774 12762 27 | 86751 86750 28 | 13026 13013 29 | 86501 86500 30 | 13278 13264 31 | 86251 86250 32 | 13530 13515 33 | 86001 86000 34 | 13782 13766 35 | 85751 85750 36 | 14034 14017 37 | 85501 85500 38 | 14286 14268 39 | 85251 85250 40 | 14538 14519 41 | 86131 86130 42 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_45_0: -------------------------------------------------------------------------------- 1 | 45 58181 2 | 1945 4990 3 | 321 1142 4 | 2945 7390 5 | 4136 10372 6 | 1107 3114 7 | 1022 2744 8 | 1101 3102 9 | 2890 7280 10 | 47019 112738 11 | 1530 3960 12 | 3432 8564 13 | 2165 5630 14 | 1703 4506 15 | 1106 3112 16 | 370 1240 17 | 657 2014 18 | 962 2624 19 | 1060 3020 20 | 805 2310 21 | 689 2078 22 | 1513 3926 23 | 3878 9656 24 | 13504 32708 25 | 1865 4830 26 | 667 2034 27 | 1833 4766 28 | 16553 40006 29 | 1261 3422 30 | 2593 6686 31 | 1170 3240 32 | 794 2288 33 | 671 2042 34 | 7421 18142 35 | 6009 14718 36 | 1767 4634 37 | 2622 6744 38 | 831 2362 39 | 701 2102 40 | 5222 12944 41 | 3086 7872 42 | 900 2500 43 | 3121 7942 44 | 1029 2958 45 | 52555 126010 46 | 389 1278 47 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_4_0: -------------------------------------------------------------------------------- 1 | 4 11 2 | 8 4 3 | 10 5 4 | 15 8 5 | 4 3 6 | -------------------------------------------------------------------------------- /9. 
Discrete optimization with RL/knapsack_problem/knapsack/data/ks_500_0: -------------------------------------------------------------------------------- 1 | 500 50000 2 | 384 412 3 | 7060 7285 4 | 8475 8103 5 | 5028 4876 6 | 9741 9369 7 | 3360 3538 8 | 1426 1394 9 | 2084 2204 10 | 4865 5362 11 | 1885 1779 12 | 8191 8376 13 | 6296 6460 14 | 3292 3193 15 | 10227 9957 16 | 5744 5513 17 | 2163 2365 18 | 10738 9786 19 | 5099 4865 20 | 9193 9406 21 | 7777 7455 22 | 8538 8090 23 | 9597 9224 24 | 1275 1257 25 | 6317 5831 26 | 7598 7177 27 | 2241 2297 28 | 1398 1271 29 | 4083 4216 30 | 6033 5634 31 | 1694 1560 32 | 7563 6878 33 | 12 12 34 | 7406 6872 35 | 7679 7142 36 | 6619 6945 37 | 9222 8778 38 | 1869 1785 39 | 6809 7485 40 | 4961 5033 41 | 2616 2719 42 | 6406 6156 43 | 1703 1826 44 | 6415 6795 45 | 4898 4790 46 | 7601 7620 47 | 2145 1971 48 | 6559 6310 49 | 1691 1874 50 | 8734 8092 51 | 9570 9321 52 | 7649 7955 53 | 0 1 54 | 5652 5146 55 | 475 517 56 | 8789 8341 57 | 1366 1400 58 | 3325 3230 59 | 5487 5443 60 | 7316 7097 61 | 10232 9979 62 | 1788 1873 63 | 9179 9259 64 | 3790 3940 65 | 7820 8611 66 | 4462 4552 67 | 832 893 68 | 6798 7209 69 | 5467 5319 70 | 5573 6065 71 | 5489 5010 72 | 8246 8770 73 | 2815 2918 74 | 8766 8355 75 | 7043 7760 76 | 8834 8052 77 | 8549 8969 78 | 6511 6415 79 | 9253 9812 80 | 831 861 81 | 4587 4755 82 | 202 210 83 | 1022 950 84 | 867 823 85 | 1989 2194 86 | 2813 2594 87 | 1711 1642 88 | 9343 9828 89 | 1840 2029 90 | 2772 2575 91 | 6035 5564 92 | 8815 9345 93 | 9329 8485 94 | 354 353 95 | 3488 3792 96 | 2701 2645 97 | 102 102 98 | 3711 4046 99 | 10505 9897 100 | 8471 9201 101 | 3406 3157 102 | 10171 9442 103 | 6862 7425 104 | 3747 3887 105 | 7132 7137 106 | 7386 7590 107 | 3073 3179 108 | 7566 8244 109 | 2269 2467 110 | 7134 7291 111 | 7750 7078 112 | 8126 8991 113 | 1803 1824 114 | 8229 8894 115 | 9725 9514 116 | 1468 1498 117 | 844 771 118 | 2939 2868 119 | 7538 7210 120 | 380 406 121 | 10182 9845 122 | 176 188 123 | 8874 8977 124 | 5461 5808 125 | 7833 7831 126 | 9668 9122 127 | 3381 3255 128 | 8534 7808 129 | 10002 9684 130 | 8881 9703 131 | 3503 3884 132 | 2774 2742 133 | 6546 6754 134 | 3368 3227 135 | 2269 2521 136 | 3229 3149 137 | 6703 6895 138 | 9740 9718 139 | 1660 1779 140 | 4724 4906 141 | 10161 9765 142 | 2460 2712 143 | 1221 1161 144 | 893 956 145 | 3922 3736 146 | 3837 3854 147 | 4564 4211 148 | 6844 7195 149 | 7300 7204 150 | 550 509 151 | 3347 3315 152 | 8141 8090 153 | 7173 7121 154 | 1386 1366 155 | 2216 2053 156 | 4182 4310 157 | 6496 6753 158 | 7540 7923 159 | 6576 7072 160 | 745 774 161 | 10510 9710 162 | 5294 5494 163 | 6752 6259 164 | 3818 4235 165 | 6704 6462 166 | 212 222 167 | 6247 5995 168 | 7948 8543 169 | 2763 2688 170 | 5698 5186 171 | 2307 2186 172 | 7426 7303 173 | 5292 5134 174 | 9295 8645 175 | 2578 2430 176 | 6097 5571 177 | 2925 3243 178 | 1223 1123 179 | 8720 8978 180 | 4240 4139 181 | 4344 4244 182 | 6250 6864 183 | 6547 7189 184 | 4989 4641 185 | 732 753 186 | 4440 4445 187 | 7861 8726 188 | 147 147 189 | 3066 3394 190 | 5265 5044 191 | 6723 7050 192 | 7443 7655 193 | 6062 6387 194 | 3793 3529 195 | 6167 6689 196 | 1965 1918 197 | 1479 1530 198 | 7177 7624 199 | 3624 3782 200 | 6602 7203 201 | 9195 9398 202 | 8667 8091 203 | 4802 4637 204 | 3317 3035 205 | 10496 9631 206 | 2441 2467 207 | 8759 7973 208 | 320 325 209 | 3459 3770 210 | 4805 4396 211 | 6153 5990 212 | 5076 5513 213 | 6003 6084 214 | 2143 2027 215 | 2915 3169 216 | 6150 6074 217 | 5077 4948 218 | 3335 3361 219 | 8400 8116 220 | 9711 9158 221 | 1375 1467 222 | 6421 
6150 223 | 8784 8277 224 | 3085 2946 225 | 247 228 226 | 6182 6208 227 | 7543 7284 228 | 2056 2048 229 | 1198 1190 230 | 4033 4380 231 | 2527 2603 232 | 4158 4618 233 | 2552 2607 234 | 668 609 235 | 7843 8591 236 | 3986 3670 237 | 8463 8184 238 | 6382 6242 239 | 3103 3422 240 | 397 385 241 | 10619 9845 242 | 8138 8106 243 | 8370 8192 244 | 4321 3974 245 | 4514 4964 246 | 4041 4063 247 | 6558 6871 248 | 397 438 249 | 1943 2122 250 | 319 305 251 | 8557 8465 252 | 10517 9695 253 | 7573 8139 254 | 9981 9433 255 | 8833 8354 256 | 5854 5944 257 | 3796 3761 258 | 2043 2109 259 | 7288 7949 260 | 7280 7744 261 | 2163 2065 262 | 2469 2264 263 | 5532 5066 264 | 2318 2387 265 | 7179 6779 266 | 8381 9284 267 | 5665 5694 268 | 3544 3303 269 | 3108 2872 270 | 3050 2801 271 | 7307 6760 272 | 528 536 273 | 8598 8444 274 | 1282 1404 275 | 1912 1919 276 | 6096 6018 277 | 2305 2211 278 | 3787 3723 279 | 7142 6631 280 | 950 965 281 | 7389 7413 282 | 2823 2941 283 | 2097 1979 284 | 7066 6576 285 | 3447 3779 286 | 2727 2493 287 | 7624 8353 288 | 764 776 289 | 4578 4617 290 | 2503 2653 291 | 7276 7099 292 | 6643 6991 293 | 2786 2972 294 | 2422 2349 295 | 6811 6498 296 | 5584 5951 297 | 10727 9755 298 | 3882 3987 299 | 9566 9211 300 | 4396 4126 301 | 8930 8192 302 | 831 849 303 | 4712 4675 304 | 657 602 305 | 2738 3006 306 | 6995 6708 307 | 5598 5844 308 | 8939 9020 309 | 6861 6674 310 | 9795 9952 311 | 2090 2208 312 | 4661 4726 313 | 3258 3155 314 | 6520 6999 315 | 3040 3298 316 | 7137 6758 317 | 8379 8963 318 | 7682 7553 319 | 5225 5634 320 | 5653 5459 321 | 6605 6957 322 | 8226 7939 323 | 7947 8831 324 | 6663 6956 325 | 9263 8743 326 | 8527 7914 327 | 110 116 328 | 486 526 329 | 916 863 330 | 6285 6030 331 | 8658 8005 332 | 9627 9516 333 | 777 752 334 | 5208 5569 335 | 7641 7249 336 | 2961 2726 337 | 255 252 338 | 6656 6447 339 | 10101 9887 340 | 124 133 341 | 8303 7584 342 | 7576 8318 343 | 2428 2643 344 | 4008 4090 345 | 2645 2517 346 | 756 717 347 | 3980 4407 348 | 2950 3236 349 | 9529 9690 350 | 3644 3814 351 | 260 276 352 | 7840 8345 353 | 4601 4493 354 | 7423 7117 355 | 1692 1817 356 | 6957 7465 357 | 2923 3073 358 | 1677 1792 359 | 1138 1088 360 | 5317 5247 361 | 9705 9127 362 | 840 838 363 | 1209 1309 364 | 2481 2369 365 | 7686 8119 366 | 6022 5554 367 | 8029 8016 368 | 5418 5101 369 | 646 613 370 | 9511 8848 371 | 2350 2335 372 | 2544 2444 373 | 6819 7518 374 | 1055 1044 375 | 7563 7599 376 | 4530 4369 377 | 2249 2154 378 | 2244 2095 379 | 2976 3034 380 | 6533 6184 381 | 1518 1625 382 | 2484 2603 383 | 6100 6072 384 | 6326 6297 385 | 7341 7384 386 | 8751 8748 387 | 7195 7352 388 | 2487 2548 389 | 6846 7003 390 | 1049 1102 391 | 3670 3525 392 | 2538 2691 393 | 5378 5906 394 | 1530 1403 395 | 8675 8179 396 | 5411 5421 397 | 308 342 398 | 8138 8884 399 | 3751 4000 400 | 5392 5535 401 | 8288 7690 402 | 3425 3797 403 | 6599 6118 404 | 1855 2050 405 | 8516 8028 406 | 5331 5379 407 | 8180 7989 408 | 708 746 409 | 1217 1315 410 | 5753 5983 411 | 2918 3035 412 | 8370 8675 413 | 9502 9840 414 | 10584 9793 415 | 6538 6077 416 | 3678 3780 417 | 5013 5327 418 | 8374 8415 419 | 2038 1965 420 | 6129 5741 421 | 6622 6292 422 | 7569 7366 423 | 942 963 424 | 1259 1194 425 | 4277 3984 426 | 1121 1021 427 | 6333 5974 428 | 8989 9647 429 | 9265 8860 430 | 8344 8231 431 | 3112 3138 432 | 3347 3355 433 | 1352 1450 434 | 9712 9502 435 | 2307 2209 436 | 5520 5095 437 | 10137 9833 438 | 4583 4634 439 | 4444 4676 440 | 6024 5990 441 | 2481 2671 442 | 9522 9498 443 | 9993 9209 444 | 5687 6004 445 | 420 414 446 | 5365 5480 447 | 
834 836 448 | 4767 4745 449 | 2409 2497 450 | 1897 1847 451 | 8698 9047 452 | 4612 4405 453 | 3524 3486 454 | 1156 1173 455 | 6516 5996 456 | 7741 7139 457 | 8546 9331 458 | 2349 2219 459 | 6095 6103 460 | 835 872 461 | 724 666 462 | 5288 5114 463 | 5659 6134 464 | 2847 3042 465 | 9627 9511 466 | 189 189 467 | 1509 1378 468 | 3609 3963 469 | 3802 3926 470 | 134 139 471 | 5689 6206 472 | 9097 9077 473 | 6347 5951 474 | 3007 2835 475 | 4305 3972 476 | 3155 3228 477 | 4130 3764 478 | 3904 3631 479 | 1915 2109 480 | 9014 9897 481 | 8504 8943 482 | 651 708 483 | 8947 8695 484 | 6239 5900 485 | 8311 8054 486 | 1412 1422 487 | 6513 7166 488 | 8244 8159 489 | 8127 8361 490 | 5552 5782 491 | 4068 4325 492 | 1013 935 493 | 10274 9984 494 | 2977 3181 495 | 2751 2876 496 | 10479 9715 497 | 2260 2159 498 | 5603 5520 499 | 3074 3065 500 | 9406 9789 501 | 9416 9939 502 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_50_0: -------------------------------------------------------------------------------- 1 | 50 341045 2 | 1906 4912 3 | 41516 99732 4 | 23527 56554 5 | 559 1818 6 | 45136 108372 7 | 2625 6750 8 | 492 1484 9 | 1086 3072 10 | 5516 13532 11 | 4875 12050 12 | 7570 18440 13 | 4436 10972 14 | 620 1940 15 | 50897 122094 16 | 2129 5558 17 | 4265 10630 18 | 706 2112 19 | 2721 6942 20 | 16494 39888 21 | 29688 71276 22 | 3383 8466 23 | 2181 5662 24 | 96601 231302 25 | 1795 4690 26 | 7512 18324 27 | 1242 3384 28 | 2889 7278 29 | 2133 5566 30 | 103 706 31 | 4446 10992 32 | 11326 27552 33 | 3024 7548 34 | 217 934 35 | 13269 32038 36 | 281 1062 37 | 77174 184848 38 | 952 2604 39 | 15572 37644 40 | 566 1832 41 | 4103 10306 42 | 313 1126 43 | 14393 34886 44 | 1313 3526 45 | 348 1196 46 | 419 1338 47 | 246 992 48 | 445 1390 49 | 23552 56804 50 | 23552 56804 51 | 67 634 52 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_50_1: -------------------------------------------------------------------------------- 1 | 50 5000 2 | 995 945 3 | 259 242 4 | 258 244 5 | 279 281 6 | 576 582 7 | 126 119 8 | 280 303 9 | 859 913 10 | 270 279 11 | 389 408 12 | 927 925 13 | 281 305 14 | 624 662 15 | 961 938 16 | 757 718 17 | 231 250 18 | 838 767 19 | 154 158 20 | 649 595 21 | 277 268 22 | 180 167 23 | 895 957 24 | 23 22 25 | 930 948 26 | 93 102 27 | 61 62 28 | 626 604 29 | 342 349 30 | 262 279 31 | 215 221 32 | 183 203 33 | 958 889 34 | 205 213 35 | 859 835 36 | 171 166 37 | 566 575 38 | 779 758 39 | 704 706 40 | 196 182 41 | 26 28 42 | 726 729 43 | 621 671 44 | 800 864 45 | 580 579 46 | 535 553 47 | 647 632 48 | 168 163 49 | 90 95 50 | 679 745 51 | 440 438 52 | -------------------------------------------------------------------------------- /9. 
Discrete optimization with RL/knapsack_problem/knapsack/data/ks_60_0: -------------------------------------------------------------------------------- 1 | 60 100000 2 | 90000 90001 3 | 89750 89751 4 | 10001 10002 5 | 89500 89501 6 | 10252 10254 7 | 89250 89251 8 | 10503 10506 9 | 89000 89001 10 | 10754 10758 11 | 88750 88751 12 | 11005 11010 13 | 88500 88501 14 | 11256 11262 15 | 88250 88251 16 | 11507 11514 17 | 88000 88001 18 | 11758 11766 19 | 87750 87751 20 | 12009 12018 21 | 87500 87501 22 | 12260 12270 23 | 87250 87251 24 | 12511 12522 25 | 87000 87001 26 | 12762 12774 27 | 86750 86751 28 | 13013 13026 29 | 86500 86501 30 | 13264 13278 31 | 86250 86251 32 | 13515 13530 33 | 86000 86001 34 | 13766 13782 35 | 85750 85751 36 | 14017 14034 37 | 85500 85501 38 | 14268 14286 39 | 85250 85251 40 | 14519 14538 41 | 85000 85001 42 | 14770 14790 43 | 84750 84751 44 | 15021 15042 45 | 84500 84501 46 | 15272 15294 47 | 84250 84251 48 | 15523 15546 49 | 84000 84001 50 | 15774 15798 51 | 83750 83751 52 | 16025 16050 53 | 83500 83501 54 | 16276 16302 55 | 83250 83251 56 | 16527 16554 57 | 83000 83001 58 | 16778 16806 59 | 82750 82751 60 | 17029 17058 61 | 82500 82501 62 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_82_0: -------------------------------------------------------------------------------- 1 | 82 104723596 2 | 13211 13211 3 | 26422 26422 4 | 52844 52844 5 | 105688 105688 6 | 211376 211376 7 | 422752 422752 8 | 845504 845504 9 | 1691008 1691008 10 | 3382016 3382016 11 | 6764032 6764032 12 | 13528064 13528064 13 | 27056128 27056128 14 | 54112256 54112256 15 | 13212 13212 16 | 26424 26424 17 | 52848 52848 18 | 105696 105696 19 | 211392 211392 20 | 422784 422784 21 | 845568 845568 22 | 1691136 1691136 23 | 3382272 3382272 24 | 6764544 6764544 25 | 13529088 13529088 26 | 27058176 27058176 27 | 54116352 54116352 28 | 39638 39638 29 | 79276 79276 30 | 158552 158552 31 | 317104 317104 32 | 634208 634208 33 | 1268416 1268416 34 | 2536832 2536832 35 | 5073664 5073664 36 | 10147328 10147328 37 | 20294656 20294656 38 | 40589312 40589312 39 | 81178624 81178624 40 | 52844 52844 41 | 105688 105688 42 | 211376 211376 43 | 422752 422752 44 | 845504 845504 45 | 1691008 1691008 46 | 3382016 3382016 47 | 6764032 6764032 48 | 13528064 13528064 49 | 27056128 27056128 50 | 54112256 54112256 51 | 66060 66060 52 | 132120 132120 53 | 264240 264240 54 | 528480 528480 55 | 1056960 1056960 56 | 2113920 2113920 57 | 4227840 4227840 58 | 8455680 8455680 59 | 16911360 16911360 60 | 33822720 33822720 61 | 67645440 67645440 62 | 79268 79268 63 | 158536 158536 64 | 317072 317072 65 | 634144 634144 66 | 1268288 1268288 67 | 2536576 2536576 68 | 5073152 5073152 69 | 10146304 10146304 70 | 20292608 20292608 71 | 40585216 40585216 72 | 81170432 81170432 73 | 92482 92482 74 | 184964 184964 75 | 369928 369928 76 | 739856 739856 77 | 1479712 1479712 78 | 2959424 2959424 79 | 5918848 5918848 80 | 11837696 11837696 81 | 23675392 23675392 82 | 47350784 47350784 83 | 94701568 94701568 84 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_lecture_dp_1: -------------------------------------------------------------------------------- 1 | 3 9 2 | 5 4 3 | 6 5 4 | 3 2 5 | -------------------------------------------------------------------------------- /9. 
Discrete optimization with RL/knapsack_problem/knapsack/data/ks_lecture_dp_2: -------------------------------------------------------------------------------- 1 | 4 7 2 | 16 2 3 | 19 3 4 | 23 4 5 | 28 5 6 | 7 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/9. Discrete optimization with RL/knapsack_problem/knapsack/handout.pdf -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/solver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from collections import namedtuple 5 | Item = namedtuple("Item", ['index', 'value', 'weight']) 6 | 7 | def solve_it(input_data): 8 | # Modify this code to run your optimization algorithm 9 | 10 | # parse the input 11 | lines = input_data.split('\n') 12 | 13 | firstLine = lines[0].split() 14 | item_count = int(firstLine[0]) 15 | capacity = int(firstLine[1]) 16 | 17 | items = [] 18 | 19 | for i in range(1, item_count+1): 20 | line = lines[i] 21 | parts = line.split() 22 | items.append(Item(i-1, int(parts[0]), int(parts[1]))) 23 | 24 | # a trivial algorithm for filling the knapsack 25 | # it takes items in-order until the knapsack is full 26 | value = 0 27 | weight = 0 28 | taken = [0]*len(items) 29 | 30 | for item in items: 31 | if weight + item.weight <= capacity: 32 | taken[item.index] = 1 33 | value += item.value 34 | weight += item.weight 35 | 36 | # prepare the solution in the specified output format 37 | output_data = str(value) + ' ' + str(0) + '\n' 38 | output_data += ' '.join(map(str, taken)) 39 | return output_data 40 | 41 | 42 | if __name__ == '__main__': 43 | import sys 44 | if len(sys.argv) > 1: 45 | file_location = sys.argv[1].strip() 46 | with open(file_location, 'r') as input_data_file: 47 | input_data = input_data_file.read() 48 | print(solve_it(input_data)) 49 | else: 50 | print('This test requires an input file. Please select one from the data directory. (i.e. python solver.py ./data/ks_4_0)') 51 | 52 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/knapsack_problem/knapsack/solverJava.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | from subprocess import Popen, PIPE 6 | 7 | def solve_it(input_data): 8 | 9 | # Writes the inputData to a temporary file 10 | 11 | tmp_file_name = 'tmp.data' 12 | tmp_file = open(tmp_file_name, 'w') 13 | tmp_file.write(input_data) 14 | tmp_file.close() 15 | 16 | # Runs the command: java Solver -file=tmp.data 17 | 18 | process = Popen(['java', 'Solver', '-file=' + tmp_file_name], stdout=PIPE, universal_newlines=True) 19 | (stdout, stderr) = process.communicate() 20 | 21 | # removes the temporary file 22 | os.remove(tmp_file_name) 23 | 24 | return stdout.strip() 25 | 26 | 27 | import sys 28 | 29 | if __name__ == '__main__': 30 | if len(sys.argv) > 1: 31 | file_location = sys.argv[1].strip() 32 | with open(file_location, 'r') as input_data_file: 33 | input_data = input_data_file.read() 34 | print(solve_it(input_data)) 35 | else: 36 | print('This test requires an input file. 
Please select one from the data directory. (i.e. python solver.py ./data/ks_4_0)') 37 | 38 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/lessons/README.md: -------------------------------------------------------------------------------- 1 | # Personal notes on Discrete Optimization 2 | 3 | > These notes were taken during the Coursera course on discrete optimization 4 | 5 | https://www.coursera.org/learn/discrete-optimization/home/welcome -------------------------------------------------------------------------------- /9. Discrete optimization with RL/lessons/discrete_optimization.md: -------------------------------------------------------------------------------- 1 | # Discrete Optimization 2 | 3 | 4 | - The goal of optimization is to find the optimal, or at least a high-quality, solution in a reasonable amount of time, even when we face exponential growth in the number of possible solutions 5 | 6 | 7 | ## How to solve an optimization problem ? 8 | - Formalize the problem as a mathematical model 9 | - Start with a greedy algorithm as a baseline 10 | 11 | 12 | ## Formalizing an optimization task 13 | **How to model an optimization problem?**
14 | Agreeing on a mathematical form of the problem: 15 | - Choose some decision variables (they typically encode the result we are interested in) 16 | - Express the problem constraints in terms of these variables (they define what the feasible solutions are) 17 | - Express the objective function to be maximized (it specifies the quality of a solution) 18 | 19 | > There can be many ways to model an optimization problem 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /9. Discrete optimization with RL/lessons/dynamic_programming.md: -------------------------------------------------------------------------------- 1 | # Dynamic Programming 2 | ![](https://caseine.org/pluginfile.php/2558/course/section/269/Capture%20d%E2%80%99%C3%A9cran%202016-05-17%20%C3%A0%2022.15.49.png) 3 | 4 | ## What is dynamic programming ? 5 | **A widely used optimization technique** 6 | - for certain classes of problems 7 | - heavily used in computational biology 8 | 9 | **Basic principle** 10 | - Divide and conquer 11 | - Bottom-up computation 12 | 13 | 14 | 15 | 16 | 17 | ## 📚 References 18 | - [Wikipedia page on Dynamic Programming](https://en.wikipedia.org/wiki/Dynamic_programming) -------------------------------------------------------------------------------- /9. Discrete optimization with RL/lessons/knapsack_problem.md: -------------------------------------------------------------------------------- 1 | # Knapsack Problem 2 | ![](https://miro.medium.com/max/684/0*3dS6Jw8NzzSD-mn8.jpg) 3 | 4 | 5 | ## 📝 Conventions & notations 6 | - ``I = {1,2,...,n}`` denotes the set of items 7 | - ``O(k,j)`` denotes the value of an optimal solution to the knapsack problem with capacity k and items [1,...,j]. This is what we want to solve. 8 | 9 | ## 👜 Modeling the Knapsack Problem 10 | 11 | ### Defining the problem 12 | - **Variables** 13 | - Decision variables 14 | - ``xi`` denotes whether item i is selected in the solution (0 or 1) 15 | - Other variables 16 | - ``wi`` denotes the weight of item i 17 | - ``vi`` denotes the value of item i 18 | - **Problem constraint** 19 | - The selected items cannot exceed the capacity of the knapsack: ``sum(wi*xi) <= K`` 20 | - **Objective function** 21 | - We want to maximize ``sum(vi*xi)`` 22 | 23 | 24 | ### Number of configurations 25 | - How many possible configurations of 1 and 0 for ``(x1,x2,...,xn)`` ? -> Search space 26 | - Not all of them are feasible -> Feasible search space 27 | - How many are there? ``2^n`` -> exponential growth -> brute force is not possible for more than a few objects 28 | 29 | 30 | ## 🤗 Greedy algorithms 31 | 32 | ### Greedy algorithms to solve the knapsack problem 33 | 1. Take the lightest items first 34 | 2. Take the most valuable items first 35 | 3. Compute the value density ratio (value/weight) and take the items with the highest ratio first 36 | 37 | For one problem, **there are many greedy algorithms**, with no guarantee of optimality: the quality really depends on the input. But a greedy algorithm is quick to implement, often fast to run, and it serves as a baseline (see the sketch below).
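As an illustration of the third variant, here is a minimal value-density greedy sketch. It is not the course's reference implementation: the ``greedy_value_density`` name and the plain ``(value, weight)`` input format are just assumptions made for this example.

```python
def greedy_value_density(items, capacity):
    """Greedy baseline: take items by decreasing value/weight ratio.

    items    : list of (value, weight) tuples
    capacity : knapsack capacity K
    Returns (total_value, taken) where taken[i] is 1 if item i is selected.
    """
    # Sort item indices by value density, best ratio first
    order = sorted(range(len(items)),
                   key=lambda i: items[i][0] / items[i][1],
                   reverse=True)

    taken = [0] * len(items)
    total_value, total_weight = 0, 0
    for i in order:
        value, weight = items[i]
        if total_weight + weight <= capacity:  # only keep items that still fit
            taken[i] = 1
            total_value += value
            total_weight += weight
    return total_value, taken

# ks_lecture_dp_1 instance: capacity 9, items given as (value, weight)
print(greedy_value_density([(5, 4), (6, 5), (3, 2)], capacity=9))  # -> (8, [1, 0, 1])
```

On this tiny instance the greedy returns a value of 8, while the optimal selection (items 1 and 2, weight 4 + 5 = 9) reaches 11, which illustrates the missing quality guarantee discussed below.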
38 | 39 | ### Advantages 40 | - Quick to design and implement 41 | - Can be very fast 42 | 43 | ### Problems 44 | - No quality guarantee 45 | - Quality can vary widely with the input 46 | - Checking problem feasibility needs to be easy 47 | 48 | 49 | 50 | 51 | ## ⚡ Dynamic Programming 52 | ### Recurrence relations (Bellman equations) 53 | We want to solve ``O(k,j)`` by recurrence: 54 | - Assume we know how to solve ``O(k,j-1)`` for all k, and we want to solve ``O(k,j)`` by adding one more item: item ``j`` 55 | - If ``wj <= k`` there are two cases: 56 | - Either we don't select item j and the best solution is then ``O(k,j-1)`` 57 | - Or we select item j and the best solution is ``vj + O(k-wj,j-1)`` 58 | - Or written mathematically 59 | ``` 60 | - O(k,j) = max(O(k,j-1), vj + O(k-wj,j-1)) if wj <= k 61 | - O(k,j) = O(k,j-1) otherwise 62 | ``` 63 | - And of course ``O(k,0) = 0`` for all k (with no items, there is no value) 64 | 65 | ### Recursive function in Python 66 | ```python 67 | # Variables (1-indexed: index 0 unused so w[j], v[j] match the recurrence) 68 | w = list(...)  # weights w[1..n] 69 | v = list(...)  # values v[1..n] 70 | 71 | def O(k,j): 72 | if (j == 0): 73 | return 0  # no items -> no value 74 | elif w[j] <= k: 75 | return max([O(k,j-1),v[j] + O(k-w[j],j-1)])  # skip item j, or take it 76 | else: 77 | return O(k,j-1)  # item j does not fit 78 | ``` 79 | How efficient is this approach? Not very if we go top down: the same subproblems get recomputed over and over, which is often the case with naive recursive functions (see the memoized sketch below).
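To make the recomputation issue concrete, here is a minimal memoized variant of the same recurrence. The ``knapsack_value`` name, the use of ``functools.lru_cache`` and the 1-indexed ``w``/``v`` toy lists are assumptions made for this sketch, not part of the course material.

```python
from functools import lru_cache

# Toy instance (1-indexed: index 0 is a dummy so w[j], v[j] match the recurrence)
w = [None, 4, 5, 2]   # weights
v = [None, 5, 6, 3]   # values
K = 9                 # knapsack capacity

@lru_cache(maxsize=None)
def knapsack_value(k, j):
    """Optimal value O(k, j) for capacity k and items 1..j."""
    if j == 0:
        return 0                      # no items -> no value
    if w[j] <= k:
        # either skip item j, or take it and solve the smaller subproblem
        return max(knapsack_value(k, j - 1),
                   v[j] + knapsack_value(k - w[j], j - 1))
    return knapsack_value(k, j - 1)   # item j does not fit

print(knapsack_value(K, len(w) - 1))  # -> 11 on this instance
```

Each ``(k, j)`` pair is now computed at most once, which corresponds exactly to the capacity x items table that the bottom-up computation described next fills explicitly.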
80 | That's why Dynamic Programming is all about the bottom-up approach. 81 | 82 | ### Bottom-up computation 83 | - Compute the recursive equations bottom up 84 | - Start with zero items 85 | - Add one more item, then two ... 86 | 87 | The computation is often best thought of as filling a table (capacity x items) 88 | 89 | ![](https://sadakurapati.files.wordpress.com/2013/11/knapsack2.png?w=584) 90 | 91 | - Building the table cell by cell using the recurrence formula 92 | - Tracing back through the table to recover the optimal selection 93 | 94 | ### Efficiency 95 | - Complexity of the algorithm -> the time to fill the table, i.e. O(Kn), which looks polynomial but is not exactly 96 | - It is actually exponential in the input size, because K is represented in a computer with log(K) bits. Such algorithms are called pseudo-polynomial: they are only efficient when K is small 97 | 98 | 99 | 100 | ## 🌴 Branch, bound & relaxation 101 | Exhaustive search basically builds a decision tree with 2^n branches. Branch & bound uses relaxation to explore this tree without computing all of its nodes, by iterating two steps: 102 | - **Branching** (splitting the problem into a number of subproblems, like in exhaustive search) 103 | - **Bounding** (finding an optimistic estimate of the best solution to the subproblem: an upper bound for maximization, a lower bound for minimization) 104 | 105 | ### How to find an optimistic evaluation? How can I relax my problem? 106 | > - We relax a constraint 107 | > - Build the tree and evaluate an optimistic estimate at each node 108 | > - If a branch's optimistic estimate is lower than the best solution found so far, we don't even need to go further in that branch and we can prune it. 109 | 110 | *Branching & bounding can be done in many different ways, see the Search strategies section below* 111 | 112 | ### What can we relax in the knapsack problem? 113 | - The capacity constraint -> take everything in the knapsack 114 | - The integrality of the selection variables: we can imagine taking a fraction of each item (``xi`` becomes a decimal between 0 and 1), which is called **linear relaxation** 115 | 116 | Linear relaxation for the knapsack problem works by: 117 | - Sorting the items by value density ratio and taking them greedily 118 | - Filling the remaining capacity with a fraction of the first item that no longer fits entirely; the total value obtained is an optimistic estimate that can be used for pruning 119 | 120 | 121 | ## 🔍 Search strategies 122 | 123 | ### Depth-first 124 | Prunes when a node's optimistic estimate is worse than the best solution found so far 125 | - Go deep 126 | - When does it prune? When it finds a node whose estimate is worse than the best solution already found 127 | - Is it memory efficient? It can be, since only the current branch needs to be kept in memory 128 | 129 | ### Best-first 130 | Select the node with the best optimistic estimate 131 | - Go for the best 132 | - When does it prune? When all the remaining nodes are worse than a solution already found 133 | - Is it memory efficient? If we exaggerate and think of a knapsack with infinite capacity, we would compute the entire tree, so infinite time and infinite space would be required. When the problem is small, it can be efficient. 134 | 135 | 136 | ### Least discrepancy or limited discrepancy search 137 | Trust a greedy heuristic 138 | - Assume a good heuristic is available 139 | - It makes very few mistakes 140 | - The search tree is binary 141 | - Following the heuristic means branching left; branching right means the heuristic was wrong 142 | - Limited Discrepancy Search (LDS) 143 | - Avoid mistakes at all costs 144 | - Explore the search space in increasing order of mistakes 145 | - Trusting the heuristic less and less 146 | 147 | We explore the search space in waves, trusting the heuristic less and less.
Its efficiency really depends on a trade off between space and time. 148 | 149 | 150 | ### And many others search strategies 151 | 152 | 153 | 154 | 155 | ## 📚 References 156 | - [Wikipedia page on Knapsack problem](https://en.wikipedia.org/wiki/Knapsack_problem) 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning 2 | 3 | ![](https://cdn-images-1.medium.com/max/1600/1*D7JNcbvhP5UOR6_Ul-WJaw.gif) 4 | 5 | ##### Realizations 6 | - Old experiments on RL (2016) 7 | - Solving OpenAI Gym environments (2017-2018) 8 | - Developing an multi agent Tic Tac Toe environment and solving it with Policy Gradients (May 2017) 9 | - Using RL to automatically adapt the cooling in a Data Center (August 2017) 10 | - Controlling Robots via Reinforcement Learning (November 2017) 11 | - Playing and solving the Chrome Dinosaur Game with Evolution Strategies and PyTorch (January 2018) 12 | - Delivery optimization using Reinforcement Learning (January 2019) 13 | - Rubik's Cube optimization (February 2019) 14 | - Multi-Agents simulations (November 2019) 15 | 16 | 17 | ##### Libraries 18 | - ``rl`` is a simple library to do Reinforcement Learning with Keras, it uses old Keras versions and should be updated 19 | - ``hyperion`` is a simple multi agent simulation library 20 | 21 | 22 | *** 23 | ### References and inspiration 24 | ###### RL references 25 | 26 | - [Udemy course on RL](https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python/) 27 | - [David Silver course on RL at UCL](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 28 | - [Berkeley course on AI](http://ai.berkeley.edu/lecture_slides.html) 29 | - [Spinning up course by OpenAI](https://spinningup.openai.com/en/latest/) 30 | 31 | 32 | ##### Q Learning references 33 | - [Q Learning tutorial by Arthur Juliani](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0) 34 | - [Q Learning tutorial on Keon.io](https://keon.io/deep-q-learning/) 35 | - [Q Learning tutorial by Udacity](https://github.com/udacity/deep-learning/blob/master/reinforcement/Q-learning-cart.ipynb) 36 | 37 | 38 | ##### Deep Q Learning 39 | - [David Silver's Deep Q Learning course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Resources_files/deep_rl.pdf) 40 | - [Demystyfing Deep Reinforcement Learning](http://neuro.cs.ut.ee/demystifying-deep-reinforcement-learning/) 41 | - [Siraj Raval's notebook on Deep Q Learning](https://github.com/llSourcell/deep_q_learning/blob/master/03_PlayingAgent.ipynb) 42 | 43 | ##### Policy Gradient 44 | - [Deep Reinforcement Learning: Pong from Pixels](http://karpathy.github.io/2016/05/31/rl/) Andrej Karpathy's blog article on RL (always a reference) 45 | 46 | 47 | 48 | ##### Evolution strategies 49 | - [Evolution strategies](https://blog.openai.com/evolution-strategies/) - OpenAI 50 | - [How evolution taught us the “genetic algorithm”](https://blog.sicara.com/was-darwin-a-great-computer-scientist-81ffa1dd72f9) 51 | - [Making a robot learn how to move, part 1 — Evolutionary algorithms](https://medium.com/towards-data-science/making-a-robot-learn-how-to-move-part-1-evolutionary-algorithms-340f239c9cd2) 52 | - [Optimize a quadratic function with ES](https://gist.github.com/karpathy/77fbb6a8dac5395f1b73e7a89300318d) - Andrej Karpathy 53 | - [Evolution modelling with 
creatures](https://www.youtube.com/watch?v=GOFws_hhZs8) 54 | - [Genetic biwalkers](http://rednuht.org/genetic_walkers/) 55 | - [Evolving stable strategies](http://blog.otoro.net/2017/11/12/evolving-stable-strategies/) 56 | 57 | ##### Actor Critic, A2C, ACKTR 58 | - [A3C tutorial tutorial by Arthur Juliani](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2) 59 | - [A3C tutorial with Keras and OpenAI](http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html) 60 | - [A3C explananations and implementations](https://mpatacchiola.github.io/blog/2017/02/11/dissecting-reinforcement-learning-4.html) 61 | - [ACKTR & A2C](https://blog.openai.com/baselines-acktr-a2c) - by OpenAI 62 | - [ACKTR & A3C implementation in PyTorch](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr) 63 | - [Actor Critic model with Keras](https://towardsdatascience.com/reinforcement-learning-w-keras-openai-actor-critic-models-f084612cfd69) 64 | - [Car Racing solving with A3C](https://fr.scribd.com/document/358019044/Reinforcement-Car-Racing-with-A3C) and [this solution as well](https://web.stanford.edu/class/cs221/2017/restricted/p-final/elibol/final.pdf) 65 | 66 | ##### PPO, TRPO 67 | - [Proximal Policy Optimization](https://blog.openai.com/openai-baselines-ppo/) - by OpenAI 68 | - [PPO,TRPO tutorials](https://learningai.io/projects/2017/07/28/ai-gym-workout.html) 69 | 70 | 71 | 72 | ##### AlphaGo 73 | - [ELI5 MCTS](https://www.reddit.com/r/explainlikeimfive/comments/4aimqo/eli5_alpha_go_and_its_decision_making_process/) 74 | - [How AlphaGo works](https://www.tastehit.com/blog/google-deepmind-alphago-how-it-works/) 75 | - [Original Paper for AlphaGo](http://airesearch.com/wp-content/uploads/2016/01/deepmind-mastering-go.pdf) by David Silver 76 | 77 | 78 | ##### Monte Carlo Tree Search 79 | - [Udacity videos on MCTS](https://www.youtube.com/watch?v=onBYsen2_eA) 80 | 81 | 82 | ##### Misc 83 | - [Learning to optimize with RL](http://bair.berkeley.edu/blog/2017/09/12/learning-to-optimize-with-rl/) 84 | 85 | 86 | ##### Environment 87 | - [Unity Agents](https://blogs.unity3d.com/2017/09/19/introducing-unity-machine-learning-agents/) 88 | - [SerpentAI](https://github.com/SerpentAI/SerpentAI) 89 | - [Pybullet](https://docs.google.com/document/d/10sXEhzFRSnvFcl3XxNGhnD4N2SedqwdAvK3dsihxVUA/edit) 90 | 91 | *** 92 | ### Papers 93 | 94 | - [Discrete Sequential Prediction of Continuous Actions for Deep RL](https://arxiv.org/abs/1705.05035) 95 | - [Emotion in Reinforcement Learning Agents and Robots: A Survey](https://arxiv.org/abs/1705.05172) 96 | - [Combating Reinforcement Learning's Sisyphean Curse with Intrinsic Fear](https://arxiv.org/abs/1611.01211) 97 | - [Curiosity-driven Exploration by Self-supervised Prediction](https://arxiv.org/abs/1705.05363) 98 | - [End-to-end optimization of goal-driven and visually grounded dialogue systems](https://arxiv.org/abs/1703.05423) 99 | - [Deep reinforcement learning from human preferences](https://arxiv.org/abs/1706.03741) - OpenAI 100 | - [Programmable Agents](https://arxiv.org/abs/1706.06383) - Deepmind 101 | - [Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments](https://arxiv.org/pdf/1706.02275.pdf) - OpenAI 102 | - [Actor-Critic Reinforcement Learning with Simultaneous Human Control and Feedback](https://arxiv.org/abs/1703.01274) 103 | - [Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295) 104 | - [Hindsight Experience 
Replay](https://arxiv.org/abs/1707.01495) 105 | - [DARLA: Improving Zero-Shot Transfer in Reinforcement Learning](https://arxiv.org/pdf/1707.08475.pdf) 106 | - [Leveraging Demonstrations for Deep Reinforcement Learning on Robotics Problems with Sparse Rewards](https://arxiv.org/pdf/1707.08817.pdf) 107 | - [Evolution Strategies as a Scalable Alternative to Reinforcement Learning](https://arxiv.org/abs/1703.03864) 108 | - [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/abs/1707.06887) 109 | - [Intrinsically Motivated Goal Exploration Processes with Automatic Curriculum Learning](https://arxiv.org/abs/1708.02190?) 110 | - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) 111 | - [Value Iteration Networks](https://arxiv.org/pdf/1602.02867.pdf) 112 | - [A deep reinforcement learning chatbot](https://arxiv.org/pdf/1709.02349.pdf) - MILA 113 | - [The Uncertainty Bellman Equation and Exploration](https://arxiv.org/abs/1709.05380) 114 | - [Deep Reinforcement Learning that Matters](https://arxiv.org/abs/1709.06560) 115 | - [Overcoming Exploration in Reinforcement Learning with Demonstrations](https://arxiv.org/abs/1709.10089) 116 | - [Using Simulation and Domain Adaptation to Improve Efficiency of Deep Robotic Grasping](https://arxiv.org/abs/1709.07857) 117 | - [Rainbow: Combining Improvements in Deep Reinforcement Learning](https://arxiv.org/pdf/1710.02298.pdf) 118 | - [Optimizing Long Short-Term Memory Recurrent Neural Networks UsingAnt Colony Optimization to Predict Turbine Engine Vibration](https://arxiv.org/pdf/1710.03753.pdf) 119 | - [Continuous Adaptation via Meta-Learning in Nonstationary and Competitive Environments](https://arxiv.org/pdf/1710.03641.pdf) 120 | - [Emergent Complexity via Multi-Agent Competition](https://arxiv.org/pdf/1710.03748.pdf) 121 | - [A Unified Game-Theoretic Approach to Multiagent Reinforcement Learning](https://arxiv.org/pdf/1711.00832.pdf) 122 | -------------------------------------------------------------------------------- /rl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/rl/__init__.py -------------------------------------------------------------------------------- /rl/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/rl/agents/__init__.py -------------------------------------------------------------------------------- /rl/agents/actor_critic_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | Inspiration from https://keon.io/deep-q-learning/ 11 | https://towardsdatascience.com/reinforcement-learning-w-keras-openai-actor-critic-models-f084612cfd69 12 | 13 | theo.alves.da.costa@gmail.com 14 | https://github.com/theolvs 15 | ------------------------------------------------------------------------ 16 | """ 17 | 18 | 19 | 20 | import os 21 | import matplotlib.pyplot as plt 22 | import pandas as pd 23 | import numpy as np 24 | import sys 25 | import random 26 | import time 27 | import random 28 | import numpy as np 29 | 30 | from keras.models import 
Sequential, Model 31 | from keras.layers import Dense, Dropout, Input 32 | from keras.layers.merge import Add, Multiply 33 | from keras.optimizers import Adam 34 | import keras.backend as K 35 | import tensorflow as tf 36 | 37 | from rl import utils 38 | from rl.memory import Memory 39 | from rl.agents.base_agent import Agent 40 | 41 | 42 | 43 | class ActorCriticAgent(Agent): 44 | def __init__(self,env,sess,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.995,gamma = 0.95,lr = 0.001,tau = 0.125,actor_activation = "linear"): 45 | 46 | # Main parameters 47 | self.env = env 48 | self.sess = sess 49 | 50 | # Other parameters 51 | self.memory = Memory() 52 | self.epsilon = epsilon 53 | self.epsilon_min = epsilon_min 54 | self.epsilon_decay = epsilon_decay 55 | self.gamma = gamma 56 | self.tau = tau 57 | self.lr = lr 58 | 59 | # Models 60 | self.initialize_actor_model(actor_activation) 61 | self.initialize_critic_model() 62 | 63 | 64 | def initialize_actor_model(self,actor_activation): 65 | self.actor_state_input, self.actor_model = self.build_actor_model(actor_activation) 66 | _, self.target_actor_model = self.build_actor_model(actor_activation) 67 | 68 | self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.env.action_space.shape[0]]) # where we will feed de/dC (from critic) 69 | 70 | actor_model_weights = self.actor_model.trainable_weights 71 | self.actor_grads = tf.gradients(self.actor_model.output, actor_model_weights, -self.actor_critic_grad) # dC/dA (from actor) 72 | grads = zip(self.actor_grads, actor_model_weights) 73 | self.optimize = tf.train.AdamOptimizer(self.lr).apply_gradients(grads) 74 | 75 | 76 | 77 | def build_actor_model(self,activation = ""): 78 | # Define the layers of the network 79 | state_input = Input(shape=self.env.observation_space.shape) 80 | h1 = Dense(24, activation='relu')(state_input) 81 | h2 = Dense(48, activation='relu')(h1) 82 | h3 = Dense(24, activation='relu')(h2) 83 | output = Dense(self.env.action_space.shape[0],activation='relu')(h3) 84 | 85 | # Compute the model 86 | model = Model(input=state_input, output=output) 87 | model.compile(loss="mse", optimizer=Adam(lr=self.lr)) 88 | return state_input, model 89 | 90 | 91 | def initialize_critic_model(self): 92 | self.critic_state_input, self.critic_action_input, self.critic_model = self.build_critic_model() 93 | _, _, self.target_critic_model = self.build_critic_model() 94 | 95 | self.critic_grads = tf.gradients(self.critic_model.output,self.critic_action_input) # where we calcaulte de/dC for feeding above 96 | 97 | # Initialize for later gradient calculations 98 | self.sess.run(tf.initialize_all_variables()) 99 | 100 | 101 | 102 | 103 | def build_critic_model(self): 104 | state_input = Input(shape=self.env.observation_space.shape) 105 | state_h1 = Dense(24, activation='relu')(state_input) 106 | state_h2 = Dense(48)(state_h1) 107 | 108 | action_input = Input(shape=self.env.action_space.shape) 109 | action_h1 = Dense(48)(action_input) 110 | 111 | merged = Add()([state_h2, action_h1]) 112 | merged_h1 = Dense(24, activation='relu')(merged) 113 | output = Dense(1, activation='relu')(merged_h1) 114 | model = Model(input=[state_input,action_input], output=output) 115 | 116 | model.compile(loss="mse", optimizer=Adam(lr=self.lr)) 117 | return state_input, action_input, model 118 | 119 | 120 | 121 | 122 | 123 | 124 | def train(self,batch_size = 32): 125 | if self.epsilon > self.epsilon_min: 126 | self.epsilon *= self.epsilon_decay 127 | 128 | if len(self.memory.cache) > batch_size: 129 | batch = 
random.sample(self.memory.cache, batch_size) 130 | else: 131 | batch = self.memory.cache 132 | 133 | self._train_actor(batch) 134 | self._train_critic(batch) 135 | 136 | 137 | 138 | 139 | 140 | def _train_actor(self,batch): 141 | for state,action,reward,next_state,_ in batch: 142 | predicted_action = self.actor_model.predict(state) 143 | grads = self.sess.run(self.critic_grads, feed_dict={ 144 | self.critic_state_input: state, 145 | self.critic_action_input: predicted_action 146 | })[0] 147 | 148 | self.sess.run(self.optimize, feed_dict={ 149 | self.actor_state_input: state, 150 | self.actor_critic_grad: grads 151 | }) 152 | 153 | 154 | 155 | def _train_critic(self,batch): 156 | for state,action,reward,next_state,done in batch: 157 | if not done: 158 | target_action = self.target_actor_model.predict(next_state) 159 | future_reward = self.target_critic_model.predict([next_state, target_action])[0][0] 160 | reward += self.gamma * future_reward 161 | self.critic_model.fit([state, action], reward, verbose=0) 162 | 163 | 164 | 165 | def _update_actor_target(self): 166 | actor_model_weights = self.actor_model.get_weights() 167 | actor_target_weights = self.target_critic_model.get_weights() 168 | 169 | for i in range(len(actor_target_weights)): 170 | actor_target_weights[i] = actor_model_weights[i] 171 | self.target_critic_model.set_weights(actor_target_weights) 172 | 173 | 174 | def _update_critic_target(self): 175 | critic_model_weights = self.critic_model.get_weights() 176 | critic_target_weights = self.critic_target_model.get_weights() 177 | 178 | for i in range(len(critic_target_weights)): 179 | critic_target_weights[i] = critic_model_weights[i] 180 | self.critic_target_model.set_weights(critic_target_weights) 181 | 182 | 183 | def update_target(self): 184 | self._update_actor_target() 185 | self._update_critic_target() 186 | 187 | 188 | 189 | 190 | def act(self, state): 191 | 192 | 193 | 194 | 195 | if np.random.random() < self.epsilon: 196 | return self.env.action_space.sample() 197 | return self.actor_model.predict(state) -------------------------------------------------------------------------------- /rl/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | theo.alves.da.costa@gmail.com 11 | https://github.com/theolvs 12 | ------------------------------------------------------------------------ 13 | """ 14 | 15 | 16 | import os 17 | import matplotlib.pyplot as plt 18 | import pandas as pd 19 | import numpy as np 20 | import sys 21 | import random 22 | import time 23 | import random 24 | import numpy as np 25 | 26 | 27 | 28 | 29 | 30 | class Agent(object): 31 | def __init__(self): 32 | pass 33 | 34 | 35 | def expand_state_vector(self,state): 36 | if len(state.shape) == 1 or len(state.shape)==3: 37 | return np.expand_dims(state,axis = 0) 38 | else: 39 | return state 40 | 41 | 42 | 43 | def remember(self,*args): 44 | self.memory.save(args) 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /rl/agents/dqn2d_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 
19/10/2018 9 | 10 | theo.alves.da.costa@gmail.com 11 | https://github.com/theolvs 12 | ------------------------------------------------------------------------ 13 | """ 14 | 15 | 16 | 17 | import os 18 | import matplotlib.pyplot as plt 19 | import pandas as pd 20 | import numpy as np 21 | import sys 22 | import random 23 | import time 24 | import random 25 | import numpy as np 26 | 27 | from keras.models import Sequential 28 | from keras.layers import Dense 29 | from keras.optimizers import Adam 30 | 31 | from keras.layers import Input, LSTM, Dense, Conv2D, MaxPooling2D, Dropout, Flatten 32 | from keras.layers import concatenate 33 | from keras.models import Model 34 | from keras.utils import plot_model,to_categorical 35 | 36 | from rl import utils 37 | from rl.memory import Memory 38 | from rl.agents.base_agent import Agent 39 | from rl.agents.dqn_agent import DQNAgent 40 | 41 | 42 | 43 | 44 | 45 | def create_vision_model(input_shape): 46 | input_image = Input(shape=input_shape) 47 | conv1 = Conv2D(32,(3,3),padding="same",activation="relu")(input_image) 48 | pool1 = MaxPooling2D(pool_size=(2,2))(conv1) 49 | drop1 = Dropout(0.25)(pool1) 50 | 51 | conv2 = Conv2D(64,(3,3),padding="same",activation="relu")(drop1) 52 | pool2 = MaxPooling2D(pool_size=(2,2))(conv2) 53 | drop2 = Dropout(0.25)(pool2) 54 | 55 | out = Flatten()(drop2) 56 | 57 | vision_model = Model(inputs=input_image, outputs=out) 58 | return vision_model 59 | 60 | 61 | def create_model(input_shape,output_dim): 62 | 63 | input1 = Input(shape=input_shape) 64 | input2 = Input(shape=input_shape) 65 | 66 | vision_model = create_vision_model(input_shape) 67 | 68 | out1 = vision_model(input1) 69 | out2 = vision_model(input2) 70 | 71 | concatenated = concatenate([out1,out2]) 72 | 73 | hidden = Dense(128, activation='relu')(concatenated) 74 | output = Dense(output_dim, activation='softmax')(hidden) 75 | 76 | model = Model([input1, input2], output) 77 | 78 | return model 79 | 80 | 81 | 82 | 83 | 84 | class DQN2DAgent(DQNAgent): 85 | 86 | 87 | 88 | def build_model(self,states_size,actions_size): 89 | model = create_model(states_size,actions_size) 90 | model.compile(loss='categorical_crossentropy', 91 | metrics=['accuracy'], 92 | optimizer="adam") 93 | return model 94 | 95 | 96 | 97 | def train(self,batch_size = 32): 98 | if len(self.memory.cache) > batch_size: 99 | batch = random.sample(self.memory.cache, batch_size) 100 | else: 101 | batch = self.memory.cache 102 | 103 | # Unzip batch 104 | states,actions,rewards,next_states,before_states,dones = zip(*batch) 105 | 106 | # Concat states 107 | states = np.vstack(states) 108 | next_states = np.vstack(next_states) 109 | before_states = np.vstack(before_states) 110 | 111 | # Compute targets 112 | targets = self.model.predict([before_states,states]) 113 | 114 | # Compute new targets 115 | rewards = np.array(rewards).reshape(-1,1) 116 | dones = 1-np.array(dones,dtype=np.int32).reshape(-1,1) 117 | predictions = (self.gamma * np.max(self.model.predict([before_states,states]),axis = 1)).reshape(-1,1) 118 | new_targets = rewards + dones * predictions 119 | new_targets = new_targets.astype("float32") 120 | 121 | # Correct targets 122 | actions = to_categorical(np.array(actions).reshape(-1,1),self.actions_size) 123 | np.place(targets,actions,new_targets) 124 | 125 | # Training 126 | self.model.fit([states,next_states],targets,epochs = 1,verbose = 0) 127 | 128 | if self.epsilon > self.epsilon_min: 129 | self.epsilon *= self.epsilon_decay 130 | 131 | 132 | 133 | 134 | 135 | def 
act(self,before_state,state): 136 | before_state = self.expand_state_vector(before_state) 137 | state = self.expand_state_vector(state) 138 | 139 | 140 | if np.random.rand() > self.epsilon: 141 | q = self.model.predict([before_state,state]) 142 | 143 | if self.observation_type == "discrete": 144 | a = np.argmax(q[0]) 145 | elif self.observation_type == "continuous": 146 | a = np.squeeze(np.clip(q,self.low,self.high)) 147 | 148 | else: 149 | if self.observation_type == "discrete": 150 | a = np.random.randint(self.actions_size) 151 | elif self.observation_type == "continuous": 152 | a = np.random.uniform(self.low,self.high,self.actions_size) 153 | return a 154 | 155 | 156 | -------------------------------------------------------------------------------- /rl/agents/dqn_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | Inspiration from https://keon.io/deep-q-learning/ 11 | 12 | theo.alves.da.costa@gmail.com 13 | https://github.com/theolvs 14 | ------------------------------------------------------------------------ 15 | """ 16 | 17 | 18 | 19 | import os 20 | import matplotlib.pyplot as plt 21 | import pandas as pd 22 | import numpy as np 23 | import sys 24 | import random 25 | import time 26 | import random 27 | import numpy as np 28 | 29 | from keras.models import Sequential 30 | from keras.layers import Dense 31 | from keras.optimizers import Adam 32 | 33 | 34 | from rl import utils 35 | from rl.memory import Memory 36 | from rl.agents.base_agent import Agent 37 | 38 | 39 | 40 | class DQNAgent(Agent): 41 | def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.995,gamma = 0.95,lr = 0.001,low = 0,high = 1,max_memory = 2000,observation_type = "discrete"): 42 | assert observation_type in ["discrete","continuous"] 43 | self.states_size = states_size 44 | self.actions_size = actions_size 45 | self.memory = Memory(max_memory = max_memory) 46 | self.epsilon = epsilon 47 | self.low = low 48 | self.high = high 49 | self.observation_type = observation_type 50 | self.epsilon_min = epsilon_min 51 | self.epsilon_decay = epsilon_decay 52 | self.gamma = gamma 53 | self.lr = lr 54 | self.model = self.build_model(states_size,actions_size) 55 | 56 | 57 | 58 | 59 | 60 | def build_model(self,states_size,actions_size): 61 | model = Sequential() 62 | model.add(Dense(24,input_dim = states_size,activation = "relu")) 63 | model.add(Dense(24,activation = "relu")) 64 | model.add(Dense(actions_size,activation = "linear")) 65 | model.compile(loss='mse', 66 | optimizer=Adam(lr=self.lr)) 67 | return model 68 | 69 | 70 | 71 | 72 | 73 | 74 | def train(self,batch_size = 32): 75 | if len(self.memory.cache) > batch_size: 76 | batch = random.sample(self.memory.cache, batch_size) 77 | else: 78 | batch = self.memory.cache 79 | 80 | for state,action,reward,next_state,done in batch: 81 | state = self.expand_state_vector(state) 82 | next_state = self.expand_state_vector(next_state) 83 | 84 | 85 | targets = self.model.predict(state) 86 | 87 | if not done: 88 | target = reward + self.gamma * np.max(self.model.predict(next_state)) 89 | else: 90 | target = reward 91 | 92 | targets[0][action] = target 93 | 94 | self.model.fit(state,targets,epochs = 1,verbose = 0) 95 | 96 | 97 | if self.epsilon > self.epsilon_min: 98 | self.epsilon *= self.epsilon_decay 99 
| 100 | 101 | 102 | 103 | 104 | def act(self,state): 105 | state = self.expand_state_vector(state) 106 | 107 | 108 | if np.random.rand() > self.epsilon: 109 | q = self.model.predict(state) 110 | 111 | if self.observation_type == "discrete": 112 | a = np.argmax(q[0]) 113 | elif self.observation_type == "continuous": 114 | a = np.squeeze(np.clip(q,self.low,self.high)) 115 | 116 | else: 117 | if self.observation_type == "discrete": 118 | a = np.random.randint(self.actions_size) 119 | elif self.observation_type == "continuous": 120 | a = np.random.uniform(self.low,self.high,self.actions_size) 121 | return a 122 | 123 | 124 | -------------------------------------------------------------------------------- /rl/agents/q_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | 11 | theo.alves.da.costa@gmail.com 12 | https://github.com/theolvs 13 | ------------------------------------------------------------------------ 14 | """ 15 | 16 | 17 | 18 | import os 19 | import matplotlib.pyplot as plt 20 | import pandas as pd 21 | import numpy as np 22 | import sys 23 | import random 24 | import time 25 | import random 26 | import numpy as np 27 | 28 | 29 | 30 | 31 | from rl import utils 32 | from rl.memory import Memory 33 | from rl.agents.base_agent import Agent 34 | 35 | 36 | 37 | class QAgent(Agent): 38 | def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.999,gamma = 0.95,lr = 0.8): 39 | self.states_size = states_size 40 | self.actions_size = actions_size 41 | self.epsilon = epsilon 42 | self.epsilon_min = epsilon_min 43 | self.epsilon_decay = epsilon_decay 44 | self.gamma = gamma 45 | self.lr = lr 46 | self.Q = self.build_model(states_size,actions_size) 47 | 48 | 49 | def build_model(self,states_size,actions_size): 50 | Q = np.zeros([states_size,actions_size]) 51 | return Q 52 | 53 | 54 | def train(self,s,a,r,s_next): 55 | self.Q[s,a] = self.Q[s,a] + self.lr * (r + self.gamma*np.max(self.Q[s_next,a]) - self.Q[s,a]) 56 | 57 | if self.epsilon > self.epsilon_min: 58 | self.epsilon *= self.epsilon_decay 59 | 60 | 61 | def act(self,s): 62 | 63 | q = self.Q[s,:] 64 | 65 | if np.random.rand() > self.epsilon: 66 | a = np.argmax(q) 67 | else: 68 | a = np.random.randint(self.actions_size) 69 | 70 | return a 71 | 72 | 73 | -------------------------------------------------------------------------------- /rl/agents/sarsa_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | 11 | theo.alves.da.costa@gmail.com 12 | https://github.com/theolvs 13 | ------------------------------------------------------------------------ 14 | """ 15 | 16 | 17 | 18 | import os 19 | import matplotlib.pyplot as plt 20 | import pandas as pd 21 | import numpy as np 22 | import sys 23 | import random 24 | import time 25 | import random 26 | import numpy as np 27 | 28 | 29 | 30 | 31 | from rl import utils 32 | from rl.memory import Memory 33 | from rl.agents.base_agent import Agent 34 | 35 | 36 | 37 | class SarsaAgent(Agent): 38 | def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 
0.999,gamma = 0.95,lr = 0.8): 39 | self.states_size = states_size 40 | self.actions_size = actions_size 41 | self.epsilon = epsilon 42 | self.epsilon_min = epsilon_min 43 | self.epsilon_decay = epsilon_decay 44 | self.gamma = gamma 45 | self.lr = lr 46 | self.Q = self.build_model(states_size,actions_size) 47 | 48 | 49 | 50 | 51 | 52 | def build_model(self,states_size,actions_size): 53 | Q = np.zeros([states_size,actions_size]) 54 | return Q 55 | 56 | 57 | 58 | 59 | 60 | 61 | def train(self,s,a,r,s_next): 62 | a_next = self.act(s_next) 63 | self.Q[s,a] = self.Q[s,a] + self.lr * (r + self.gamma*self.Q[s_next,a_next] - self.Q[s,a]) 64 | 65 | if self.epsilon > self.epsilon_min: 66 | self.epsilon *= self.epsilon_decay 67 | 68 | 69 | 70 | 71 | def act(self,s): 72 | 73 | q = self.Q[s,:] 74 | 75 | if np.random.rand() > self.epsilon: 76 | a = np.argmax(q) 77 | else: 78 | a = np.random.randint(self.actions_size) 79 | 80 | return a 81 | 82 | 83 | -------------------------------------------------------------------------------- /rl/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/rl/envs/__init__.py -------------------------------------------------------------------------------- /rl/envs/data_center_cooling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | DATA CENTER COOLING 8 | 9 | Started on the 25/08/2017 10 | 11 | 12 | theo.alves.da.costa@gmail.com 13 | https://github.com/theolvs 14 | ------------------------------------------------------------------------ 15 | """ 16 | 17 | 18 | import os 19 | import matplotlib.pyplot as plt 20 | import pandas as pd 21 | import numpy as np 22 | import sys 23 | import random 24 | import time 25 | from tqdm import tqdm 26 | from collections import Counter 27 | from scipy import stats 28 | 29 | # Deep Learning (Keras, Tensorflow) 30 | import tensorflow as tf 31 | from keras.models import Sequential 32 | from keras.optimizers import SGD,RMSprop, Adam 33 | from keras.layers import Dense, Dropout, Activation, Flatten 34 | from keras.layers import MaxPooling2D,ZeroPadding2D,Conv2D 35 | from keras.utils.np_utils import to_categorical 36 | 37 | 38 | # Plotly 39 | import plotly.graph_objs as go 40 | from plotly import tools 41 | 42 | np.random.seed(1) 43 | 44 | 45 | #=========================================================================================================== 46 | # COOLING CENTER ENVIRONMENT 47 | #=========================================================================================================== 48 | 49 | 50 | 51 | class DataCenterCooling(object): 52 | def __init__(self,levels_activity = 20,levels_cooling = 10,cost_factor = 5,risk_factor = 1.6,keep_cooling = False): 53 | 54 | self.hour = 0 55 | self.cost_factor = cost_factor 56 | self.risk_factor = risk_factor 57 | self.levels_activity = levels_activity 58 | self.levels_cooling = levels_cooling 59 | self.define_activity(levels_activity) 60 | if not hasattr(self,"cooling") or not keep_cooling: 61 | self.define_cooling(levels_cooling) 62 | 63 | 64 | def define_activity(self,levels_activity): 65 | # Define the peaks of activity 66 | peak_morning = np.random.randint(7,10) 67 | peak_evening = np.random.randint(17,22) 68 | 69 | # Build the distribution 
70 | x1 = np.array(stats.poisson.pmf(range(24),peak_morning)) 71 | x2 = np.array(stats.poisson.pmf(range(24),peak_evening)) 72 | x = x1 + x2 73 | x *= (100/0.14) 74 | 75 | # Discretize the distribution 76 | take_closest = lambda j,vector:min(vector,key=lambda x:abs(x-j)) 77 | percentiles = np.percentile(x,range(0,100,int(100/levels_activity))) 78 | assert len(percentiles) == levels_activity 79 | x_disc = np.array([take_closest(y,percentiles) for y in x]) 80 | 81 | # Store the variable 82 | self.observation_space = percentiles 83 | self.activity = np.expand_dims(x_disc,axis = 0) 84 | 85 | 86 | 87 | def define_cooling(self,levels_cooling): 88 | self.action_space = list([int(100/levels_cooling*i) for i in range(levels_cooling)]) 89 | assert len(self.action_space) == levels_cooling 90 | 91 | initial_value = random.choice(self.action_space) 92 | self.cooling = np.full((1,24),initial_value) 93 | 94 | 95 | 96 | def reset(self): 97 | self.__init__(self.levels_activity,self.levels_cooling,self.cost_factor) 98 | return self.reset_state() 99 | 100 | def reset_state(self): 101 | activity = self.activity[0][0] 102 | activity_state = self.convert_activity_to_state(activity) 103 | return activity_state 104 | 105 | 106 | def convert_activity_to_state(self,activity): 107 | state = int(np.where(self.observation_space == activity)[0][0]) 108 | return state 109 | 110 | 111 | 112 | def render(self,with_plotly = False): 113 | 114 | rewards,winnings,losses,failures = self.compute_daily_rewards() 115 | 116 | if not with_plotly: 117 | # Show the activity and cooling 118 | plt.figure(figsize = (14,5)) 119 | plt.plot(np.squeeze(self.activity),c ="red",label = "activity") 120 | plt.plot(np.squeeze(self.cooling),c = "blue",label = "cooling") 121 | plt.legend() 122 | plt.show() 123 | 124 | # Show the rewards 125 | plt.figure(figsize = (14,5)) 126 | plt.title("Total reward : {}".format(int(np.sum(rewards)))) 127 | plt.plot(rewards,c = "blue",label = "profits") 128 | plt.plot(losses*(-1),c = "red",label = "costs") 129 | plt.plot(winnings,c = "green",label = "revenues") 130 | plt.legend() 131 | plt.show() 132 | else: 133 | data_states = self.render_states_plotly()["data"] 134 | data_rewards = self.render_rewards_plotly()["data"] 135 | data_states 136 | fig = tools.make_subplots(rows=2, cols=1, specs=[[{}], [{}]], 137 | shared_xaxes=True, shared_yaxes=False, 138 | vertical_spacing=0.1) 139 | 140 | for i,trace in enumerate(data_rewards): 141 | fig.append_trace(trace, 2, 1) 142 | 143 | for i,trace in enumerate(data_states): 144 | fig.append_trace(trace, 1, 1) 145 | 146 | # print(len(failures)) 147 | # print(len(rewards)) 148 | 149 | # shapes = [{"type":"line","x0":hour+1,"y0":0,"x1":hour+1,"y1":failure} for hour,failure in enumerate(failures) if failure > 0] 150 | fig['layout'].update(title="Total reward : {}".format(int(np.sum(rewards)))) 151 | fig['layout']['xaxis'].update(dtick = 1) 152 | # fig['layout'].update(shapes=shapes) 153 | return fig 154 | 155 | 156 | def render_states_plotly(self): 157 | # Create a trace 158 | x = list(range(24)) 159 | trace_activity = go.Scatter(x = x,y = np.squeeze(self.activity),name = "activity",line = dict(color = "red",width = 2),ysrc = "activity") 160 | trace_cooling = go.Scatter(x = x,y = np.squeeze(self.cooling),name = "cooling",line = dict(color = "#34aac1",width = 2)) 161 | 162 | data = [trace_activity,trace_cooling] 163 | fig = {"data":data} 164 | return fig 165 | 166 | 167 | def render_rewards_plotly(self): 168 | rewards,winnings,losses,failures = self.compute_daily_rewards() 169 | 
# Create a trace 170 | x = list(range(24)) 171 | trace_rewards = go.Scatter(x = x,y = np.squeeze(rewards),name = "rewards",line = dict(color = "#34aac1",width = 2),ysrc = "rewards") 172 | trace_winnings = go.Scatter(x = x,y = np.squeeze(winnings),name = "revenues",line = dict(color = "#10c576",width = 1),mode = "lines+markers") 173 | trace_losses = go.Scatter(x = x,y = np.squeeze(losses),name = "costs",line = dict(color = "red",width = 1),mode = "lines+markers") 174 | 175 | data = [trace_rewards,trace_winnings,trace_losses] 176 | fig = {"data":data} 177 | return fig 178 | 179 | 180 | 181 | 182 | 183 | def compute_reward(self,activity,cooling): 184 | 185 | # CALCULATING THE WINNINGS 186 | win = activity 187 | 188 | # CALCULATING THE LOSSES 189 | if cooling >= activity: 190 | cost = (0 if self.cost_factor < 1.0 else 1)*(cooling)**np.sqrt(self.cost_factor) 191 | failure = 0 192 | else: 193 | difference = (activity-cooling)/(cooling+1) 194 | default_probability = np.tanh(difference) 195 | if np.random.rand() > default_probability or self.risk_factor < 1.0: 196 | cost = 0 197 | else: 198 | cost = np.random.normal(loc = self.risk_factor,scale = 0.4) * 150 199 | 200 | # cost += (cooling * min(1,self.cost_factor))**2 201 | cost += (0 if self.cost_factor < 1.0 else (1-1/(self.cost_factor+0.1)))*(cooling) 202 | 203 | failure = cost 204 | 205 | return win,cost,failure 206 | 207 | 208 | 209 | 210 | 211 | 212 | def compute_daily_rewards(self): 213 | winnings = [] 214 | losses = [] 215 | rewards = [] 216 | failures = [] 217 | for i in range(24): 218 | activity = self.activity[0][i] 219 | cooling = self.cooling[0][i] 220 | win,loss,failure = self.compute_reward(activity,cooling) 221 | winnings.append(win) 222 | losses.append(loss) 223 | rewards.append(win-loss) 224 | failures.append(failure) 225 | 226 | return np.array(rewards),np.array(winnings),np.array(losses),np.array(failures) 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | def step(self,cooling_action): 235 | 236 | # Convert cooling_action to cooling_value 237 | cooling = self.action_space[cooling_action] 238 | 239 | # Update the cooling 240 | self.cooling[0][self.hour] = cooling 241 | 242 | activity = self.activity[0][self.hour] 243 | win,loss,failure = self.compute_reward(activity,cooling) 244 | reward = win-loss 245 | 246 | self.hour += 1 247 | 248 | if int(self.hour) == 24: 249 | new_state = self.reset_state() 250 | done = True 251 | else: 252 | new_activity = self.activity[0][self.hour] 253 | new_state = self.convert_activity_to_state(new_activity) 254 | done = False 255 | 256 | 257 | return new_state,reward,done 258 | 259 | 260 | 261 | 262 | 263 | 264 | -------------------------------------------------------------------------------- /rl/memory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | theo.alves.da.costa@gmail.com 11 | https://github.com/theolvs 12 | ------------------------------------------------------------------------ 13 | """ 14 | 15 | 16 | 17 | from collections import deque 18 | 19 | 20 | 21 | 22 | class Memory(object): 23 | def __init__(self,max_memory = 2000): 24 | self.cache = deque(maxlen=max_memory) 25 | 26 | def save(self,args): 27 | self.cache.append(args) 28 | 29 | def empty_cache(self): 30 | self.__init__() 31 | 32 | 33 | 34 | 
-------------------------------------------------------------------------------- /rl/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """-------------------------------------------------------------------- 6 | REINFORCEMENT LEARNING 7 | 8 | Started on the 25/08/2017 9 | 10 | theo.alves.da.costa@gmail.com 11 | https://github.com/theolvs 12 | ------------------------------------------------------------------------ 13 | """ 14 | 15 | 16 | 17 | import os 18 | import matplotlib.pyplot as plt 19 | import pandas as pd 20 | import numpy as np 21 | import sys 22 | import random 23 | import time 24 | import random 25 | import numpy as np 26 | import pylab 27 | 28 | 29 | 30 | def plot_average_running_rewards(rewards,save = None): 31 | average_running_rewards = np.cumsum(rewards)/np.array(range(1,len(rewards)+1)) 32 | figure = plt.figure(figsize = (15,4)) 33 | plt.plot(average_running_rewards) 34 | 35 | if save is None: 36 | plt.show() 37 | else: 38 | plt.savefig(save) 39 | 40 | 41 | 42 | 43 | --------------------------------------------------------------------------------