├── .gitignore
├── 0. Old
    ├── 1. Reinforcement Learning - Pickup problem.ipynb
    ├── 2. Reinforcement Learning - Multi-armed bandits.ipynb
    └── scripts
    │   ├── algorithms.py
    │   ├── grid_world.py
    │   ├── maze.py
    │   ├── multi_armed_bandit.py
    │   ├── open_ai_gym.py
    │   └── pickup_problem.py
├── 0. Solving Gym environments
    ├── Agents development -  Breakout.ipynb
    ├── breakout_with_rl.py
    ├── cartpole_with_deepqlearning.py
    └── pendulum_with_actorcritic.py
├── 1. Tic Tac Toe
    ├── 1. Solving Tic Tac Toe with Policy gradients.ipynb
    └── images
    │   ├── game_random_rl_agents.gif
    │   ├── game_random_rl_agents2.gif
    │   ├── game_random_rl_agents3.gif
    │   ├── game_random_rules_agents.gif
    │   ├── game_random_rules_agents2.gif
    │   ├── game_rules_rl_agents.gif
    │   └── game_two_random_agents.gif
├── 2. Data Center Cooling
    ├── 0. Explaining the Data Center Cooling environment.ipynb
    ├── 1. Reinforcement Learning - Q Learning.ipynb
    ├── 2. Reinforcement Learning - Deep-Q-Learning.ipynb
    ├── README.md
    └── app.py
├── 3. Robotics
    ├── Minitaur pybullet environment.ipynb
    └── minitaur.py
├── 4. Chrome Dino
    ├── 20180102 - Chrome Dino development.ipynb
    ├── 20180203 - Genetic algorithms experiments.ipynb
    ├── README.md
    ├── dino.py
    ├── experiments.py
    └── images
    │   ├── capture1.png
    │   ├── dino_hardcoded_agent.gif
    │   ├── dino_ml_agent1.gif
    │   └── dino_ml_agent1_bad.gif
├── 5. Delivery Optimization
    ├── Optimizing delivery with Reinforcement Learning.ipynb
    ├── README.md
    ├── Routing optimization with Deep Reinforcement Learning.ipynb
    ├── delivery.py
    ├── env1.png
    ├── env2.png
    ├── env3.png
    ├── training.png
    ├── training_100_stops.gif
    ├── training_100_stops_traffic.gif
    ├── training_10_stops.gif
    ├── training_500_stops.gif
    ├── training_500_stops_traffic.gif
    └── training_50_stops.gif
├── 6. Solving a Rubik's Cube
    ├── Solving a Rubik's cube with RL.ipynb
    └── rubik.py
├── 7. Multi-Agents Simulations
    ├── 20191018 - Sugarscape playground.ipynb
    ├── 20191112 - Chicken game.ipynb
    ├── 20200318 - Hyperion dev.ipynb
    ├── README.md
    ├── pygame_test.py
    ├── test.gif
    └── test2.gif
├── 8. Unity ML agents tests
    ├── README.md
    └── rolling_a_ball
    │   ├── 20200202 - Rolling a Ball.ipynb
    │   └── rollingaball1.png
├── 9. Discrete optimization with RL
    ├── README.md
    ├── Reinforcement Learning for knapsack problem.ipynb
    ├── knapsack_problem
    │   └── knapsack
    │   │   ├── Solver.java
    │   │   ├── _coursera
    │   │   ├── data
    │   │       ├── ks_10000_0
    │   │       ├── ks_1000_0
    │   │       ├── ks_100_0
    │   │       ├── ks_100_1
    │   │       ├── ks_100_2
    │   │       ├── ks_106_0
    │   │       ├── ks_19_0
    │   │       ├── ks_200_0
    │   │       ├── ks_200_1
    │   │       ├── ks_300_0
    │   │       ├── ks_30_0
    │   │       ├── ks_400_0
    │   │       ├── ks_40_0
    │   │       ├── ks_45_0
    │   │       ├── ks_4_0
    │   │       ├── ks_500_0
    │   │       ├── ks_50_0
    │   │       ├── ks_50_1
    │   │       ├── ks_60_0
    │   │       ├── ks_82_0
    │   │       ├── ks_lecture_dp_1
    │   │       └── ks_lecture_dp_2
    │   │   ├── handout.pdf
    │   │   ├── solver.py
    │   │   ├── solverJava.py
    │   │   └── submit.py
    └── lessons
    │   ├── README.md
    │   ├── discrete_optimization.md
    │   ├── dynamic_programming.md
    │   └── knapsack_problem.md
├── README.md
└── rl
    ├── __init__.py
    ├── agents
        ├── __init__.py
        ├── actor_critic_agent.py
        ├── base_agent.py
        ├── dqn2d_agent.py
        ├── dqn_agent.py
        ├── q_agent.py
        └── sarsa_agent.py
    ├── envs
        ├── __init__.py
        ├── data_center_cooling.py
        └── tictactoe.py
    ├── memory.py
    └── utils.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | #  Usually these files are written by a python script from a template
30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # SageMath parsed files
80 | *.sage.py
81 | 
82 | # dotenv
83 | .env
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | .spyproject
88 | 
89 | # Rope project settings
90 | .ropeproject
91 | 
92 | # mkdocs documentation
93 | /site
94 | 
95 | # mypy
96 | .mypy_cache/
97 | 


--------------------------------------------------------------------------------
/0. Old/scripts/algorithms.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | First RL script done using Keras and policy gradients
  8 | 
  9 | - Inspired by @steinbrecher script on https://gym.openai.com/evaluations/eval_usjJ7onVTTwrn43wrbBiAv
 10 | - Still inspired by Karpathy's work too
 11 | 
 12 | Started on the 30/12/2016
 13 | 
 14 | 
 15 | 
 16 | theo.alves.da.costa@gmail.com
 17 | https://github.com/theolvs
 18 | ------------------------------------------------------------------------
 19 | """
 20 | 
 21 | 
 22 | import numpy as np
 23 | # import gym
 24 | import os
 25 | from keras.models import load_model, Sequential
 26 | from keras.layers import Dense, Activation, Dropout
 27 | from keras.optimizers import SGD, RMSprop
 28 | 
 29 | 
 30 | 
 31 | 
 32 | 
 33 | 
 34 | 
 35 | 
 36 | 
 37 | class Brain():
 38 |     def __init__(self,env,env_name = "default",H = 500,learning_rate = 0.01,dropout = 0.0,hidden_layers = 1,reload = False,input_dim = 0,output_dim = 0):
 39 | 
 40 |         self.env_name = env_name
 41 |         self.base_path = "C:/Data Science/15. Reinforcement Learning/0. Models/"
 42 |         file = [x for x in os.listdir(self.base_path) if self.env_name in x]
 43 | 
 44 |         self.H = H
 45 |         self.gamma = 0.5
 46 |         self.batch_size = 10
 47 |         self.learning_rate = learning_rate
 48 |         self.dropout = dropout
 49 |         self.hidden_layers = hidden_layers
 50 | 
 51 |         if input_dim == 0:
 52 |             try:
 53 |                 self.observation_space = env.observation_space.n
 54 |                 self.observation_to_vectorize = True
 55 |             except Exception as e:
 56 |                 self.observation_space = env.observation_space.shape[0]
 57 |                 self.observation_to_vectorize = False
 58 |         else:
 59 |             self.observation_space = input_dim
 60 |             self.observation_to_vectorize = False
 61 | 
 62 |         if output_dim == 0:
 63 |             self.action_space = env.action_space.n
 64 |         else:
 65 |             self.action_space = output_dim
 66 | 
 67 | 
 68 |         if len(file) == 0 or reload:
 69 |             print('>> Building a fully connected neural network')
 70 |             self.episode_number = 0
 71 |             self.model = self.build_fcc_model_with_regularization(H,input_dim = self.observation_space,output_dim = self.action_space,dropout = self.dropout,hidden_layers = self.hidden_layers)
 72 |         else:
 73 |             print('>> Loading the previously trained model')
 74 |             self.episode_number = int(file[0][file[0].find("(")+1:file[0].find(")")])
 75 |             self.model = load_model(self.base_path + file[0])
 76 | 
 77 | 
 78 | 
 79 |         self.inputs,self.actions,self.probas,self.rewards,self.step_rewards = [],[],[],[],[]
 80 |         self.episode_rewards,self.episode_running_rewards = [],[]
 81 |         self.reward_sum = 0
 82 |         self.running_reward = 0
 83 | 
 84 | 
 85 |     def rebuild_model(self):
 86 |         self.model = self.build_fcc_model_with_regularization(self.H,input_dim = self.observation_space,output_dim = self.action_space,dropout = self.dropout,hidden_layers = self.hidden_layers)
 87 | 
 88 | 
 89 | 
 90 |     def build_fcc_model(self,H = 500,input_dim = 4,output_dim = 2):
 91 |         model = Sequential()
 92 |         model.add(Dense(H, input_dim=input_dim))
 93 |         model.add(Activation('relu'))
 94 |         model.add(Dense(H))
 95 |         model.add(Activation('relu'))
 96 | 
 97 |         sgd = SGD(lr=self.learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
 98 | 
 99 |         if output_dim <= 2:
100 |             model.add(Dense(1))
101 |             model.add(Activation('sigmoid'))
102 |             model.compile(loss='mse',
103 |                           optimizer=sgd,
104 |                           metrics=['accuracy'])
105 |         else:
106 |             model.add(Dense(output_dim))
107 |             model.add(Activation('softmax'))
108 |             model.compile(loss='categorical_crossentropy',
109 |                           optimizer=sgd,
110 |                           metrics=['accuracy'])
111 | 
112 |         return model
113 | 
114 | 
115 | 
116 |     def build_fcc_model_with_regularization(self,H = 500,input_dim = 4,output_dim = 2,dropout = 0.0,hidden_layers = 1):
117 |         model = Sequential()
118 |         model.add(Dense(H, input_dim=input_dim,init='uniform'))
119 |         model.add(Activation('relu'))
120 |         model.add(Dropout(dropout))
121 | 
122 |         for i in range(hidden_layers):
123 |             model.add(Dense(H,init='uniform'))
124 |             model.add(Activation('relu'))
125 |             model.add(Dropout(dropout))
126 | 
127 |         sgd = SGD(lr=self.learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
128 | 
129 |         if output_dim <= 2:
130 |             model.add(Dense(1))
131 |             model.add(Activation('sigmoid'))
132 |             model.compile(loss='mse',
133 |                           optimizer=sgd,
134 |                           metrics=['accuracy'])
135 |         else:
136 |             model.add(Dense(output_dim))
137 |             model.add(Activation('softmax'))
138 |             model.compile(loss='categorical_crossentropy',
139 |                           optimizer=sgd,
140 |                           metrics=['accuracy'])
141 | 
142 |         return model
143 | 
144 | 
145 | 
146 |     def to_input(self,observation):
147 |         if self.observation_to_vectorize:
148 |             observation = self.vectorize_observation(observation,self.observation_space)
149 |         return np.reshape(observation,(1,self.observation_space))
150 | 
151 | 
152 |     def predict(self,observation,possible_moves = []):
153 | 
154 |         x = self.to_input(observation)
155 | 
156 |         # getting the probability of action
157 |         probas = self.model.predict(x)[0]
158 |         
159 | 
160 |         if len(possible_moves) > 0:
161 |             probas += 1e-9
162 |             probas *= possible_moves
163 |             probas /= np.sum(probas)
164 | 
165 |         # sampling the correct action
166 |         action= self.sample_action(probas)
167 | 
168 |         return x,action,probas
169 | 
170 | 
171 |     def sample_action(self,probabilities):
172 |         if len(probabilities)<=2:
173 |             action = 1 if np.random.uniform() < probabilities[0] else 0
174 |         else:
175 |             action = np.random.choice(len(probabilities),p = np.array(probabilities))
176 | 
177 |         return action
178 | 
179 |     def vectorize_action(self,action):
180 |         if self.action_space <= 2:
181 |             return action
182 |         else:
183 |             onehot_vector = np.zeros(self.action_space)
184 |             onehot_vector[action] = 1
185 |             return onehot_vector
186 | 
187 |     def vectorize_observation(self,value,size):
188 |         onehot_vector = np.zeros(size)
189 |         onehot_vector[value] = 1
190 |         return onehot_vector
191 | 
192 | 
193 | 
194 |     def record(self,input = None,action = None,proba = None,reward = None):
195 |         if type(input) != type(None):
196 |             self.inputs.append(input)
197 | 
198 |         if type(action) != type(None):
199 |             self.actions.append(action)
200 | 
201 |         if type(proba) != type(None):
202 |             self.probas.append(proba)
203 | 
204 |         if type(reward) != type(None):
205 |             self.rewards.append(reward)
206 |             self.reward_sum += reward
207 | 
208 | 
209 | 
210 | 
211 |     def discounting_rewards(self,r,normalization = True):
212 |         discounted_r = np.zeros_like(r)
213 |         running_add = 0
214 |         for t in reversed(range(0, r.size)):
215 |             running_add = running_add * self.gamma + r[t]
216 |             discounted_r[t] = running_add
217 | 
218 |         if normalization:
219 |             discounted_r = np.subtract(discounted_r,np.mean(discounted_r),casting = "unsafe")
220 |             discounted_r = np.divide(discounted_r,np.std(discounted_r),casting = "unsafe")
221 | 
222 |         return discounted_r
223 | 
224 | 
225 |     def discount_rewards(self,normalization = True):
226 |         rewards = np.vstack(self.rewards)
227 |         return self.discounting_rewards(rewards,normalization)
228 | 
229 | 
230 |     def record_episode(self):
231 |         # self.step_rewards.extend(self.discount_rewards(normalization = True))
232 | 
233 |         # self.rewards = np.array([self.rewards[-1]]*len(self.rewards))
234 |         # self.reward_sum = self.rewards[-1]*100
235 | 
236 |         self.reward_sum = np.sum(self.rewards)
237 |         self.rewards = self.discount_rewards(normalization = False)
238 |         self.step_rewards.extend(self.rewards)
239 | 
240 | 
241 |         self.episode_rewards.append(self.reward_sum)
242 |         self.running_reward = np.mean(self.episode_rewards)
243 |         self.episode_number += 1
244 | 
245 |     def reset_episode(self):
246 |         self.rewards = []
247 |         self.reward_sum = 0
248 | 
249 |     def update_on_batch(self,show = False):
250 |         if show: print('... Training on batch of size %s'%self.batch_size)
251 |         self.actions = np.vstack(self.actions)
252 |         self.probas = np.vstack(self.probas)
253 |         self.step_rewards = np.vstack(self.step_rewards)
254 |         self.inputs = np.vstack(self.inputs)
255 | 
256 |         self.targets = self.step_rewards * (self.actions - self.probas) + self.probas
257 |         # print(self.targets)
258 | 
259 |         #ajouter la protection de la max rewards
260 | 
261 |         self.model.train_on_batch(self.inputs,self.targets)
262 | 
263 |         self.inputs,self.actions,self.probas,self.step_rewards = [],[],[],[]
264 | 
265 |     def save_model(self):
266 |         file = [x for x in os.listdir(self.base_path) if self.env_name in x]
267 |         self.model.save(self.base_path+"%s(%s).h5"%(self.env_name,self.episode_number))
268 |         if len(file)>0:
269 |             os.remove(self.base_path+file[0])
270 |         # self.model.save(self.base_path+"%s.h5"%(self.env_name))
271 | 
272 | 
273 |     def build_cnn_model(self,input_dim,output_dim):
274 |         model = Sequential()
275 | 
276 |         model.add(Convolution2D(32, 3, 3, border_mode='same',input_shape=input_dim))
277 |         model.add(Activation('relu'))
278 |         model.add(Convolution2D(32, 3, 3))
279 |         model.add(Activation('relu'))
280 |         model.add(MaxPooling2D(pool_size=(2, 2)))
281 |         model.add(Dropout(0.25))
282 | 
283 |         model.add(Convolution2D(64, 3, 3, border_mode='same'))
284 |         model.add(Activation('relu'))
285 |         model.add(Convolution2D(64, 3, 3))
286 |         model.add(Activation('relu'))
287 |         model.add(MaxPooling2D(pool_size=(2, 2)))
288 |         model.add(Dropout(0.25))
289 | 
290 |         model.add(Flatten())
291 |         model.add(Dense(512))
292 |         model.add(Activation('relu'))
293 |         model.add(Dropout(0.5))
294 |         model.add(Dense(output_dim))
295 |         model.add(Activation('softmax'))
296 | 
297 |         # Let's train the model using RMSprop
298 |         model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
299 | 
300 |         return model
301 | 


--------------------------------------------------------------------------------
/0. Old/scripts/grid_world.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | Grid World
  8 | 
  9 | Started on the 08/08/2017
 10 | 
 11 | 
 12 | References : 
 13 | - https://www.youtube.com/watch?v=A5eihauRQvo&t=5s
 14 | - https://github.com/llSourcell/q_learning_demo
 15 | - http://firsttimeprogrammer.blogspot.fr/2016/09/getting-ai-smarter-with-q-learning.html
 16 | 
 17 | 
 18 | theo.alves.da.costa@gmail.com
 19 | https://github.com/theolvs
 20 | ------------------------------------------------------------------------
 21 | """
 22 | 
 23 | 
 24 | import os
 25 | import matplotlib.pyplot as plt
 26 | import pandas as pd
 27 | import numpy as np 
 28 | import sys
 29 | import random
 30 | import time
 31 | 
 32 | 
 33 | 
 34 | 
 35 | 
 36 | #===========================================================================================================
 37 | # CELLS DEFINITION
 38 | #===========================================================================================================
 39 | 
 40 | 
 41 | class Cell(object):
 42 |     def __init__(self,reward = 0,is_terminal = False,is_occupied = False,is_wall = False,is_start = False):
 43 |         self.reward = reward
 44 |         self.is_terminal = is_terminal
 45 |         self.is_occupied = is_occupied
 46 |         self.is_wall = is_wall
 47 |         self.is_start = is_start
 48 | 
 49 |     def __repr__(self):
 50 |         if self.is_occupied:
 51 |             return "x"
 52 |         else:
 53 |             return " "
 54 | 
 55 | 
 56 |     def __str__(self):
 57 |         return self.__str__()
 58 | 
 59 | 
 60 | 
 61 | 
 62 | class Start(Cell):
 63 |     def __init__(self):
 64 |         super().__init__(is_occupied = True,is_start = True)
 65 | 
 66 | 
 67 | 
 68 | 
 69 | class End(Cell):
 70 |     def __init__(self,reward = 10):
 71 |         super().__init__(reward = reward,is_terminal = True)
 72 | 
 73 |     def __repr__(self):
 74 |         return "O"
 75 | 
 76 | 
 77 | 
 78 | class Hole(Cell):
 79 |     def __init__(self,reward = -10):
 80 |         super().__init__(reward = reward,is_terminal = True)
 81 | 
 82 |     def __repr__(self):
 83 |         return "X"
 84 | 
 85 | 
 86 | 
 87 | class Wall(Cell):
 88 |     def __init__(self):
 89 |         super().__init__(is_wall = True)
 90 | 
 91 |     def __repr__(self):
 92 |         return "#"
 93 | 
 94 | 
 95 | 
 96 | 
 97 | #===========================================================================================================
 98 | # GRIDS DEFINITION
 99 | #===========================================================================================================
100 | 
101 | 
102 | 
103 | 
104 | class Grid(object):
105 |     def __init__(self,cells):
106 |         self.grid = cells
107 | 
108 | 
109 |     def __repr__(self):
110 |         pass
111 | 
112 | 
113 |     def __str__(self):
114 |         pass
115 | 
116 | 


--------------------------------------------------------------------------------
/0. Old/scripts/multi_armed_bandit.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | Multi Armed Bandit Problem
  8 | 
  9 | Started on the 14/04/2017
 10 | 
 11 | 
 12 | theo.alves.da.costa@gmail.com
 13 | https://github.com/theolvs
 14 | ------------------------------------------------------------------------
 15 | """
 16 | 
 17 | 
 18 | import os
 19 | import matplotlib.pyplot as plt
 20 | import pandas as pd
 21 | import numpy as np
 22 | import sys
 23 | 
 24 | 
 25 | # Deep Learning (Keras, Tensorflow)
 26 | import tensorflow as tf
 27 | from keras.models import Sequential
 28 | from keras.optimizers import SGD,RMSprop, Adam
 29 | from keras.layers import Dense, Dropout, Activation, Flatten
 30 | from keras.layers import MaxPooling2D,ZeroPadding2D,Conv2D
 31 | from keras.utils.np_utils import to_categorical
 32 | 
 33 | 
 34 | 
 35 | 
 36 | #===========================================================================================================
 37 | # BANDIT DEFINITION
 38 | #===========================================================================================================
 39 | 
 40 | 
 41 | 
 42 | class Bandit(object):
 43 |     def __init__(self,p = None):
 44 |         '''Simple bandit initialization'''
 45 |         self.p = p if p is not None else np.random.random()
 46 |         
 47 |     def pull(self):
 48 |         '''Simulate a pull from the bandit
 49 |            
 50 |         '''
 51 |         if np.random.random() < self.p:
 52 |             return 1
 53 |         else:
 54 |             return -1
 55 |         
 56 | 
 57 | 
 58 | def create_list_bandits(n = 4,p = None):
 59 |     if p is None: p = [None]*n
 60 |     bandits = [Bandit(p = p[i]) for i in range(n)]
 61 |     return bandits
 62 | 
 63 | 
 64 | 
 65 | 
 66 | 
 67 | #===========================================================================================================
 68 | # NEURAL NETWORK
 69 | #===========================================================================================================
 70 | 
 71 | 
 72 | 
 73 | def build_fcc_model(H = 100,lr = 0.1,dim = 4):
 74 |     model = Sequential()
 75 |     model.add(Dense(H, input_dim=dim))
 76 |     model.add(Activation('relu'))
 77 |     model.add(Dense(H))
 78 |     model.add(Activation('relu'))
 79 | 
 80 |     sgd = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
 81 | 
 82 | 
 83 |     model.add(Dense(dim))
 84 |     model.add(Activation('softmax'))
 85 |     model.compile(loss='categorical_crossentropy',
 86 |                   optimizer=sgd,
 87 |                   metrics=['accuracy'])
 88 | 
 89 |     return model
 90 | 
 91 | 
 92 | model = build_fcc_model()
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | #===========================================================================================================
 99 | # SAMPLING ACTION
100 | #===========================================================================================================
101 | 
102 | 
103 | def sample_action(probas,epsilon = 0.2):
104 |     probas = probas[0]
105 |     if np.random.rand() < epsilon:
106 |         choice = np.random.randint(0,len(probas))
107 |     else:
108 |         choice = np.random.choice(range(len(probas)),p = probas)
109 |     return choice
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | #===========================================================================================================
120 | # EPISODE
121 | #===========================================================================================================
122 | 
123 | 
124 | 
125 | 
126 | def run_episode(bandits,model,probas = None,train = True,epsilon = 0.2):
127 |     
128 |     if probas is None:
129 |         probas = np.ones((1,len(bandits)))/len(bandits)
130 |     
131 |     # sampling action
132 |     bandit_to_pull = sample_action(probas,epsilon = epsilon)
133 |     action = to_categorical(bandit_to_pull,num_classes=probas.shape[1])
134 |     
135 |     # reward
136 |     reward = bandits[bandit_to_pull].pull()
137 |     
138 |     # feed vectors
139 |     X = action
140 |     y = (action - probas)*reward
141 |         
142 |     if train:
143 |         model.train_on_batch(X,y)
144 |         
145 |     # update probabilities
146 |     probas = model.predict(X)
147 |     
148 |     return reward,probas
149 | 
150 | 
151 | 
152 | 
153 | 
154 | 
155 | 
156 | #===========================================================================================================
157 | # GAME
158 | #===========================================================================================================
159 | 
160 | 
161 | def run_game(n_episodes = 100,lr = 0.1,n_bandits = 4,p = None,epsilon = 0.2):
162 | 
163 |     # DEFINE THE BANDITS
164 |     bandits = create_list_bandits(n = n_bandits,p = p)
165 |     probabilities_to_win = [x.p for x in bandits]
166 |     best_bandit = np.argmax(probabilities_to_win)
167 |     print(">> Probabilities to win : {} -> Best bandit : {}".format(probabilities_to_win,best_bandit))
168 | 
169 |     # INITIALIZE THE NEURAL NETWORK
170 |     model = build_fcc_model(lr = lr,dim = n_bandits)
171 |     
172 |     # INITIALIZE BUFFERS
173 |     rewards = []
174 |     avg_rewards = []
175 |     all_probas = np.array([])
176 |     
177 |     # EPISODES LOOP
178 |     for i in range(n_episodes):
179 |         print("\r[{}/{}] episodes completed".format(i+1,n_episodes),end = "")
180 | 
181 |         # Random choice at the first episode
182 |         if i == 0:
183 |             reward,probas = run_episode(bandits = bandits,model = model,epsilon = epsilon)
184 |             
185 |         # Updated probabilities at the following episodes
186 |         else:
187 |             reward,probas = run_episode(bandits = bandits,model = model,probas = probas)
188 | 
189 |             
190 |         # Store the rewards and the probas
191 |         rewards.append(reward)
192 |         avg_rewards.append(np.mean(rewards))
193 |         all_probas = np.append(all_probas,probas)
194 |         
195 |     print("")
196 |     
197 |     
198 |     # GET THE BEST PREDICTED BANDIT
199 |     predicted_bandit = np.argmax(probas)
200 |     print(">> Predicted bandit : {} - {}".format(predicted_bandit,"CORRECT !!!" if predicted_bandit == best_bandit else "INCORRECT"))
201 | 
202 |     
203 |     # PLOT THE EVOLUTION OF PROBABILITIES OVER TRAINING
204 |     all_probas = all_probas.reshape((n_episodes,n_bandits)).transpose()
205 |     plt.figure(figsize = (12,5))
206 |     plt.title("Probabilities on Bandit choice - {} episodes - learning rate {}".format(n_episodes,lr))
207 |     for i,p in enumerate(list(all_probas)):
208 |         plt.plot(p,label = "Bandit {}".format(i),lw = 1)
209 |         
210 |     plt.plot(avg_rewards,linestyle="-", dashes=(5, 4),color = "black",lw = 0.5,label = "average running reward")
211 |     plt.legend()
212 |     plt.ylim([-0.2,1])
213 |     
214 |     plt.show()
215 | 
216 | 
217 | 
218 | 
219 | 
220 | 


--------------------------------------------------------------------------------
/0. Old/scripts/open_ai_gym.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | First RL script done using Keras and policy gradients
  8 | 
  9 | - Inspired by @steinbrecher script on https://gym.openai.com/evaluations/eval_usjJ7onVTTwrn43wrbBiAv
 10 | - Still inspired by Karpathy's work too
 11 | 
 12 | Started on the 30/12/2016
 13 | 
 14 | 
 15 | https://github.com/rybskej/atari-py
 16 | https://sourceforge.net/projects/vcxsrv/
 17 | 
 18 | 
 19 | Environment which works with script:  
 20 | - CartPole-v0
 21 | - MountainCar-v0
 22 | - Taxi-v1
 23 | 
 24 | 
 25 | theo.alves.da.costa@gmail.com
 26 | https://github.com/theolvs
 27 | ------------------------------------------------------------------------
 28 | """
 29 | 
 30 | 
 31 | import numpy as np
 32 | import gym
 33 | import os
 34 | from keras.models import load_model, Sequential
 35 | from keras.layers import Dense, Activation
 36 | from keras.optimizers import SGD, RMSprop
 37 | 
 38 | 
 39 | 
 40 | #-------------------------------------------------------------------------------
 41 | 
 42 | 
 43 | 
 44 | 
 45 |     
 46 | # def main(n_episodes = 20):
 47 | #     for i_episode in range(n_episodes):
 48 | #         observation = env.reset()
 49 | #         print(observation)
 50 | #         break
 51 | #         for t in range(1000):
 52 | #             if render: env.render
 53 | #             print(observation)
 54 | #             action = env.action_space.sample()
 55 | #             observation, reward, done, info = env.step(action)
 56 | #             if done:
 57 | #                 print("Episode finished after {} timesteps".format(t+1))
 58 | #                 break
 59 | 
 60 | 
 61 | 
 62 | 
 63 | 
 64 | 
 65 | #-------------------------------------------------------------------------------
 66 | 
 67 | 
 68 | 
 69 | 
 70 | 
 71 | 
 72 | 
 73 | 
 74 | class Brain():
 75 |     def __init__(self,env,env_name = "default",H = 500,reload = False):
 76 | 
 77 |         self.env_name = env_name
 78 |         self.base_path = "C:/Users/talvesdacosta/Documents/Perso/Data Science/15. Reinforcement Learning/3. Open AI Gym/models/"
 79 |         file = [x for x in os.listdir(self.base_path) if self.env_name in x]
 80 | 
 81 |         self.H = H
 82 |         self.gamma = 0.975
 83 |         self.batch_size = 10
 84 | 
 85 |         try:
 86 |             self.observation_space = env.observation_space.n
 87 |             self.observation_to_vectorize = True
 88 |         except Exception as e:
 89 |             self.observation_space = env.observation_space.shape[0]
 90 |             self.observation_to_vectorize = False
 91 | 
 92 |         self.action_space = env.action_space.n
 93 | 
 94 | 
 95 |         if len(file) == 0 or reload:
 96 |             print('>> Building a fully connected neural network')
 97 |             self.episode_number = 0
 98 |             self.model = self.build_fcc_model(H,input_dim = self.observation_space,output_dim = self.action_space)
 99 |         else:
100 |             print('>> Loading the previously trained model')
101 |             self.episode_number = int(file[0][file[0].find("(")+1:file[0].find(")")])
102 |             self.model = load_model(self.base_path + file[0])
103 | 
104 | 
105 | 
106 |         self.inputs,self.actions,self.probas,self.rewards,self.step_rewards = [],[],[],[],[]
107 |         self.episode_rewards,self.episode_running_rewards = [],[]
108 |         self.reward_sum = 0
109 |         self.running_reward = 0
110 | 
111 | 
112 | 
113 | 
114 |     def build_fcc_model(self,H = 500,input_dim = 4,output_dim = 2):
115 |         model = Sequential()
116 |         model.add(Dense(H, input_dim=input_dim))
117 |         model.add(Activation('relu'))
118 |         model.add(Dense(H))
119 |         model.add(Activation('relu'))
120 | 
121 |         sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
122 | 
123 |         if output_dim <= 2:
124 |             model.add(Dense(1))
125 |             model.add(Activation('sigmoid'))
126 |             model.compile(loss='mse',
127 |                           optimizer=sgd,
128 |                           metrics=['accuracy'])
129 |         else:
130 |             model.add(Dense(output_dim))
131 |             model.add(Activation('softmax'))
132 |             model.compile(loss='categorical_crossentropy',
133 |                           optimizer=sgd,
134 |                           metrics=['accuracy'])
135 | 
136 |         return model
137 | 
138 | 
139 | 
140 |     def to_input(self,observation):
141 |         if self.observation_to_vectorize:
142 |             observation = self.vectorize_observation(observation,self.observation_space)
143 |         return np.reshape(observation,(1,self.observation_space))
144 | 
145 | 
146 |     def predict(self,observation):
147 | 
148 |         x = self.to_input(observation)
149 | 
150 |         # getting the probability of action
151 |         probas = self.model.predict(x)[0]
152 | 
153 |         # sampling the correct action
154 |         action= self.sample_action(probas)
155 | 
156 |         return x,action,probas
157 | 
158 | 
159 |     def sample_action(self,probabilities):
160 |         if len(probabilities)<=2:
161 |             action = 1 if np.random.uniform() < probabilities[0] else 0
162 |         else:
163 |             action = np.random.choice(len(probabilities),p = np.array(probabilities))
164 | 
165 |         return action
166 | 
167 |     def vectorize_action(self,action):
168 |         if self.action_space <= 2:
169 |             return action
170 |         else:
171 |             onehot_vector = np.zeros(self.action_space)
172 |             onehot_vector[action] = 1
173 |             return onehot_vector
174 | 
175 |     def vectorize_observation(self,value,size):
176 |         onehot_vector = np.zeros(size)
177 |         onehot_vector[value] = 1
178 |         return onehot_vector
179 | 
180 | 
181 | 
182 |     def record(self,input = None,action = None,proba = None,reward = None):
183 |         if type(input) != type(None):
184 |             self.inputs.append(input)
185 | 
186 |         if type(action) != type(None):
187 |             self.actions.append(action)
188 | 
189 |         if type(proba) != type(None):
190 |             self.probas.append(proba)
191 | 
192 |         if type(reward) != type(None):
193 |             self.rewards.append(reward)
194 |             self.reward_sum += reward
195 | 
196 | 
197 | 
198 | 
199 |     def discounting_rewards(self,r,normalization = True):
200 |         discounted_r = np.zeros_like(r)
201 |         running_add = 0
202 |         for t in reversed(range(0, r.size)):
203 |             running_add = running_add * self.gamma + r[t]
204 |             discounted_r[t] = running_add
205 | 
206 |         if normalization:
207 |             discounted_r = np.subtract(discounted_r,np.mean(discounted_r),casting = "unsafe")
208 |             discounted_r = np.divide(discounted_r,np.std(discounted_r),casting = "unsafe")
209 | 
210 |         return discounted_r
211 | 
212 | 
213 |     def discount_rewards(self,normalization = True):
214 |         rewards = np.vstack(self.rewards)
215 |         return self.discounting_rewards(rewards,normalization)
216 | 
217 | 
218 |     def record_episode(self):
219 |         self.step_rewards.extend(self.discount_rewards(normalization = True))
220 |         self.episode_rewards.append(self.reward_sum)
221 |         self.running_reward = np.mean(self.episode_rewards)
222 |         self.episode_number += 1
223 | 
224 |     def reset_episode(self):
225 |         self.rewards = []
226 |         self.reward_sum = 0
227 | 
228 |     def update_on_batch(self):
229 |         print('... Training on batch of size %s'%self.batch_size)
230 |         self.actions = np.vstack(self.actions)
231 |         self.probas = np.vstack(self.probas)
232 |         self.step_rewards = np.vstack(self.step_rewards)
233 |         self.inputs = np.vstack(self.inputs)
234 | 
235 |         self.targets = self.step_rewards * (self.actions - self.probas) + self.probas
236 | 
237 |         #ajouter la protection de la max rewards
238 | 
239 |         self.model.train_on_batch(self.inputs,self.targets)
240 | 
241 |         self.inputs,self.actions,self.probas,self.step_rewards = [],[],[],[]
242 | 
243 |     def save_model(self):
244 |         file = [x for x in os.listdir(self.base_path) if self.env_name in x]
245 |         self.model.save(self.base_path+"%s(%s).h5"%(self.env_name,self.episode_number))
246 |         if len(file)>0:
247 |             os.remove(self.base_path+file[0])
248 |         # self.model.save(self.base_path+"%s.h5"%(self.env_name))
249 | 
250 | 
251 | 
252 | 
253 | 
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | def main(env_name = 'CartPole-v0',n_episodes = 20,render = False,reload = False,n_by_episode = 1000):
261 |     env = gym.make(env_name)
262 |     brain = Brain(env,env_name = env_name,reload = reload)
263 |     # env.monitor.start(brain.base_path+'monitor/%s'%env_name)
264 | 
265 | 
266 |     for i_episode in range(1,n_episodes+1):
267 |         observation = env.reset()   
268 |         for t in range(n_by_episode):
269 |             if render: env.render()
270 | 
271 |             x,action,proba = brain.predict(observation)
272 | 
273 |             observation, reward, done, info = env.step(action)
274 |             action = brain.vectorize_action(action)
275 |             brain.record(input = x,action = action,proba = proba,reward = reward)
276 | 
277 |             if done or t == n_by_episode - 1:
278 |                 brain.record_episode()
279 |                 print("Episode {} : total reward was {:0.03f} and running mean {:0.03f}".format(brain.episode_number, brain.reward_sum, brain.running_reward))
280 | 
281 | 
282 |                 if i_episode % brain.batch_size == 0:
283 |                     brain.update_on_batch()
284 | 
285 |                 if i_episode % 100 == 0:
286 |                     brain.save_model()
287 | 
288 | 
289 |                 brain.reset_episode()
290 | 
291 |                 break
292 | 
293 |     # env.monitor.close()


--------------------------------------------------------------------------------
/0. Solving Gym environments/breakout_with_rl.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | 
  8 | Started on the 25/08/2017
  9 | 
 10 | theo.alves.da.costa@gmail.com
 11 | https://github.com/theolvs
 12 | ------------------------------------------------------------------------
 13 | """
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | # Usual libraries
 20 | import os
 21 | import matplotlib.pyplot as plt
 22 | import pandas as pd
 23 | import numpy as np 
 24 | import sys
 25 | import random
 26 | import time
 27 | from tqdm import tqdm
 28 | import random
 29 | import gym
 30 | import numpy as np
 31 | 
 32 | 
 33 | # Keras (Deep Learning)
 34 | from keras.models import Sequential
 35 | from keras.layers import Dense
 36 | from keras.optimizers import Adam
 37 | 
 38 | 
 39 | # Custom RL library
 40 | import sys
 41 | sys.path.insert(0,'..')
 42 | 
 43 | from rl import utils
 44 | from rl.agents.dqn2d_agent import DQN2DAgent
 45 | 
 46 | 
 47 | 
 48 | 
 49 | 
 50 | 
 51 | 
 52 | #----------------------------------------------------------------
 53 | # CONSTANTS
 54 | 
 55 | 
 56 | N_EPISODES = 1000
 57 | MAX_STEPS = 10000
 58 | RENDER = True
 59 | RENDER_EVERY = 50
 60 | BATCH_SIZE = 256
 61 | MAX_MEMORY = MAX_STEPS
 62 | 
 63 | 
 64 | 
 65 | #----------------------------------------------------------------
 66 | # MAIN LOOP
 67 | 
 68 | 
 69 | if __name__ == "__main__":
 70 | 
 71 |     # Define the gym environment
 72 |     env = gym.make('Pong-v0')
 73 | 
 74 |     # Get the environement action and observation space
 75 |     state_size = env.observation_space.shape
 76 |     action_size = env.action_space.n
 77 | 
 78 |     # Create the RL Agent
 79 |     agent = DQN2DAgent(state_size,action_size,max_memory = MAX_MEMORY)
 80 | 
 81 |     # Initialize a list to store the rewards
 82 |     rewards = []
 83 | 
 84 | 
 85 | 
 86 |     #---------------------------------------------
 87 |     # ITERATION OVER EPISODES
 88 |     for i_episode in range(N_EPISODES):
 89 | 
 90 | 
 91 | 
 92 |         # Reset the environment
 93 |         s = env.reset()
 94 | 
 95 | 
 96 |         #-----------------------------------------
 97 |         # EPISODE RUN
 98 |         for i_step in range(MAX_STEPS):
 99 |         
100 |             # Render the environement
101 |             if RENDER : env.render() #and (i_step % RENDER_EVERY == 0)
102 | 
103 |             # Store s before
104 |             if i_step == 0:
105 |                 s_before = s
106 | 
107 | 
108 |             # The agent chose the action considering the given current state
109 |             a = agent.act(s_before,s)
110 | 
111 | 
112 |             # Take the action, get the reward from environment and go to the next state
113 |             s_next,r,done,info = env.step(a)
114 | 
115 |             # print(r)
116 | 
117 |             # Tweaking the reward to make it negative when we lose
118 |             # r = r if not done else -10
119 | 
120 |             # Remember the important variables
121 |             agent.remember(
122 |                 np.expand_dims(s,axis=0),
123 |                 a,
124 |                 r,
125 |                 np.expand_dims(s_next,axis=0),
126 |                 np.expand_dims(s_before,axis=0),
127 |                 done)
128 |                 
129 |             # Go to the next state
130 |             s_before = s
131 |             s = s_next
132 |             
133 |             # If the episode is terminated
134 |             if done:
135 |                 print("Episode {}/{} finished after {} timesteps - epsilon : {:.2}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon))
136 |                 break
137 | 
138 | 
139 |         #-----------------------------------------
140 | 
141 |         # Store the rewards
142 |         rewards.append(i_step)
143 | 
144 | 
145 |         # Training
146 |         agent.train(batch_size = BATCH_SIZE)
147 | 
148 | 
149 | 
150 | 
151 | 
152 |     # Plot the average running rewards
153 |     utils.plot_average_running_rewards(rewards)
154 | 


--------------------------------------------------------------------------------
/0. Solving Gym environments/cartpole_with_deepqlearning.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | 
  8 | Started on the 25/08/2017
  9 | 
 10 | theo.alves.da.costa@gmail.com
 11 | https://github.com/theolvs
 12 | ------------------------------------------------------------------------
 13 | """
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | # Usual libraries
 20 | import os
 21 | import matplotlib.pyplot as plt
 22 | import pandas as pd
 23 | import numpy as np 
 24 | import sys
 25 | import random
 26 | import time
 27 | from tqdm import tqdm
 28 | import random
 29 | import gym
 30 | import numpy as np
 31 | 
 32 | 
 33 | # Keras (Deep Learning)
 34 | from keras.models import Sequential
 35 | from keras.layers import Dense
 36 | from keras.optimizers import Adam
 37 | 
 38 | 
 39 | # Custom RL library
 40 | import sys
 41 | sys.path.insert(0,'..')
 42 | 
 43 | from rl import utils
 44 | from rl.agents.dqn_agent import DQNAgent
 45 | 
 46 | 
 47 | 
 48 | 
 49 | 
 50 | 
 51 | 
 52 | #----------------------------------------------------------------
 53 | # CONSTANTS
 54 | 
 55 | 
 56 | N_EPISODES = 1000
 57 | MAX_STEPS = 1000
 58 | RENDER = True
 59 | RENDER_EVERY = 50
 60 | 
 61 | 
 62 | 
 63 | #----------------------------------------------------------------
 64 | # MAIN LOOP
 65 | 
 66 | 
 67 | if __name__ == "__main__":
 68 | 
 69 |     # Define the gym environment
 70 |     env = gym.make('CartPole-v1')
 71 | 
 72 |     # Get the environement action and observation space
 73 |     state_size = env.observation_space.shape[0]
 74 |     action_size = env.action_space.n
 75 | 
 76 |     # Create the RL Agent
 77 |     agent = DQNAgent(state_size,action_size)
 78 | 
 79 |     # Initialize a list to store the rewards
 80 |     rewards = []
 81 | 
 82 | 
 83 | 
 84 | 
 85 | 
 86 |     #---------------------------------------------
 87 |     # ITERATION OVER EPISODES
 88 |     for i_episode in range(N_EPISODES):
 89 | 
 90 | 
 91 | 
 92 |         # Reset the environment
 93 |         s = env.reset()
 94 | 
 95 | 
 96 |         #-----------------------------------------
 97 |         # EPISODE RUN
 98 |         for i_step in range(MAX_STEPS):
 99 |         
100 |             # Render the environement
101 |             if RENDER : env.render() #and (i_step % RENDER_EVERY == 0)
102 | 
103 |             # The agent chose the action considering the given current state
104 |             a = agent.act(s)
105 |             
106 |             # Take the action, get the reward from environment and go to the next state
107 |             s_next,r,done,info = env.step(a)
108 | 
109 |             # Tweaking the reward to make it negative when we lose
110 |             r = r if not done else -10
111 | 
112 |             # Remember the important variables
113 |             agent.remember(s,a,r,s_next,done)
114 |                 
115 |             # Go to the next state
116 |             s = s_next
117 |             
118 |             # If the episode is terminated
119 |             if done:
120 |                 print("Episode {}/{} finished after {} timesteps - epsilon : {:.2}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon))
121 |                 break
122 | 
123 | 
124 |         #-----------------------------------------
125 | 
126 |         # Store the rewards
127 |         rewards.append(i_step)
128 | 
129 | 
130 |         # Training
131 |         agent.train()
132 | 
133 | 
134 | 
135 | 
136 | 
137 |     # Plot the average running rewards
138 |     utils.plot_average_running_rewards(rewards)
139 | 


--------------------------------------------------------------------------------
/0. Solving Gym environments/pendulum_with_actorcritic.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | 
  8 | Started on the 13/11/2017
  9 | 
 10 | theo.alves.da.costa@gmail.com
 11 | https://github.com/theolvs
 12 | ------------------------------------------------------------------------
 13 | """
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | # Usual libraries
 20 | import os
 21 | import matplotlib.pyplot as plt
 22 | import pandas as pd
 23 | import numpy as np 
 24 | import sys
 25 | import random
 26 | import time
 27 | from tqdm import tqdm
 28 | import random
 29 | import gym
 30 | import numpy as np
 31 | 
 32 | 
 33 | # Keras (Deep Learning)
 34 | from keras.models import Sequential
 35 | from keras.layers import Dense
 36 | from keras.optimizers import Adam
 37 | import tensorflow as tf
 38 | import keras.backend as K
 39 | 
 40 | # Custom RL library
 41 | import sys
 42 | sys.path.insert(0,'..')
 43 | 
 44 | from rl import utils
 45 | from rl.agents.actor_critic_agent import ActorCriticAgent
 46 | 
 47 | 
 48 | 
 49 | 
 50 | 
 51 | 
 52 | 
 53 | #----------------------------------------------------------------
 54 | # CONSTANTS
 55 | 
 56 | 
 57 | N_EPISODES = 10000
 58 | MAX_STEPS = 500
 59 | RENDER = True
 60 | RENDER_EVERY = 50
 61 | 
 62 | 
 63 | 
 64 | #----------------------------------------------------------------
 65 | # MAIN LOOP
 66 | 
 67 | 
 68 | if __name__ == "__main__":
 69 | 
 70 |     # Define the gym environment
 71 |     sess = tf.Session()
 72 |     K.set_session(sess)
 73 |     env = gym.make('Pendulum-v0')
 74 | 
 75 |     # Define the agent
 76 |     agent = ActorCriticAgent(env, sess)
 77 | 
 78 |     # Initialize a list to store the rewards
 79 |     rewards = []
 80 | 
 81 | 
 82 | 
 83 | 
 84 | 
 85 |     #---------------------------------------------
 86 |     # ITERATION OVER EPISODES
 87 |     for i_episode in range(N_EPISODES):
 88 | 
 89 | 
 90 | 
 91 |         # Reset the environment
 92 |         s = env.reset()
 93 | 
 94 |         reward = 0
 95 | 
 96 | 
 97 |         #-----------------------------------------
 98 |         # EPISODE RUN
 99 |         for i_step in range(MAX_STEPS):
100 | 
101 |             # Render the environement
102 |             if RENDER : env.render() #and (i_step % RENDER_EVERY == 0)
103 | 
104 |             # The agent chose the action considering the given current state
105 |             s = s.reshape((1, env.observation_space.shape[0]))
106 |             a = agent.act(s)
107 |             a = a.reshape((1, env.action_space.shape[0]))
108 |             
109 |             # Take the action, get the reward from environment and go to the next state
110 |             s_next,r,done,_ = env.step(a)
111 |             s_next = s_next.reshape((1, env.observation_space.shape[0]))
112 |             reward += r
113 | 
114 |             # Tweaking the reward to make it negative when we lose
115 | 
116 |             # Remember the important variables
117 |             agent.remember(s,a,r,s_next,done)
118 |                 
119 |             # Go to the next state
120 |             s = s_next
121 |             
122 |             # If the episode is terminated
123 |             if done:
124 |                 print("Episode {}/{} finished after {} timesteps - epsilon : {:.2} - reward : {}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon,reward))
125 |                 break
126 | 
127 | 
128 |         #-----------------------------------------
129 | 
130 |         # Store the rewards
131 |         rewards.append(i_step)
132 | 
133 | 
134 |         # Training
135 |         agent.train()
136 | 
137 | 
138 | 
139 | 
140 | 
141 |     # Plot the average running rewards
142 |     utils.plot_average_running_rewards(rewards)
143 | 


--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rl_agents.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rl_agents.gif


--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rl_agents2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rl_agents2.gif


--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rl_agents3.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rl_agents3.gif


--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rules_agents.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rules_agents.gif


--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rules_agents2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rules_agents2.gif


--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_rules_rl_agents.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_rules_rl_agents.gif


--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_two_random_agents.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_two_random_agents.gif


--------------------------------------------------------------------------------
/2. Data Center Cooling/README.md:
--------------------------------------------------------------------------------
 1 | # Data Center Cooling
 2 | ![](https://s-media-cache-ak0.pinimg.com/originals/36/d1/87/36d18741bdd4d2ac0033c53bcc669148.jpg)
 3 | 
 4 | Inspired by [DeepMind's realization](https://deepmind.com/blog/deepmind-ai-reduces-google-data-centre-cooling-bill-40/)
 5 | 
 6 | This repository hold the development of a business question that can be solved with Reinforcement Learning : cooling data centers
 7 | - The environment modelled with the fashion of OpenAI Gym's environments
 8 | - Solving the problem with different RL algorithms (Q-Learning, Deep-Q-Learning, Policy Gradients)
 9 | - A interactive Dash app to test the environment and the agents
10 | 
11 | 
12 | ***
13 | ## Data Center Cooling environment
14 | 
15 | To try out the app, launch with ``python app.py`` and go to port ``localhost:8050``
16 | 
17 | 
18 | 
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/2. Data Center Cooling/app.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | TWITTER APP
  7 | 
  8 | Started on the 22/06/2017
  9 | 
 10 | 
 11 | https://plot.ly/dash/live-updates
 12 | https://plot.ly/dash/getting-started
 13 | https://plot.ly/dash/getting-started-part-2
 14 | https://plot.ly/dash/gallery/new-york-oil-and-gas/
 15 | 
 16 | theo.alves.da.costa@gmail.com
 17 | https://github.com/theolvs
 18 | ------------------------------------------------------------------------
 19 | """
 20 | 
 21 | # USUAL
 22 | import os
 23 | import numpy as np
 24 | from tqdm import tqdm
 25 | from copy import deepcopy
 26 | 
 27 | # DASH IMPORT
 28 | import dash
 29 | import dash_core_components as dcc
 30 | import dash_html_components as html
 31 | from dash.dependencies import Input, Output, Event, State
 32 | import plotly.graph_objs as go
 33 | 
 34 | import sys
 35 | sys.path.append("C:/git/reinforcement-learning/")
 36 | 
 37 | 
 38 | 
 39 | #--------------------------------------------------------------------------------
 40 | from rl.envs.data_center_cooling import DataCenterCooling
 41 | from rl.agents.q_agent import QAgent
 42 | from rl.agents.dqn_agent import DQNAgent
 43 | from rl.agents.sarsa_agent import SarsaAgent
 44 | from rl import utils
 45 | 
 46 | 
 47 | 
 48 | 
 49 | def run_episode(env,agent,max_step = 100,verbose = 1):
 50 | 
 51 |     s = env.reset()
 52 |     
 53 |     episode_reward = 0
 54 |     
 55 |     i = 0
 56 |     while i < max_step:
 57 |         
 58 |         # Choose an action
 59 |         a = agent.act(s)
 60 |         
 61 |         # Take the action, and get the reward from environment
 62 |         s_next,r,done = env.step(a)
 63 |         
 64 |         if verbose: print(s_next,r,done)
 65 |         
 66 |         # Update our knowledge in the Q-table
 67 |         agent.train(s,a,r,s_next)
 68 |         
 69 |         # Update the caches
 70 |         episode_reward += r
 71 |         s = s_next
 72 |         
 73 |         # If the episode is terminated
 74 |         i += 1
 75 |         if done:
 76 |             break
 77 |             
 78 |     return env,agent,episode_reward
 79 | 
 80 | 
 81 | 
 82 | 
 83 | def run_n_episodes(env,type_agent = "Q Agent",n_episodes = 2000,lr = 0.8,gamma = 0.95):
 84 | 
 85 |     environment = deepcopy(env)
 86 |     
 87 |     # Initialize the agent
 88 |     states_size = len(env.observation_space)
 89 |     actions_size = len(env.action_space)
 90 | 
 91 |     if type_agent == "Q Agent":
 92 |         print("... Using Q Agent")
 93 |         agent = QAgent(states_size,actions_size,lr = lr,gamma = gamma)
 94 |     elif type_agent == "SARSA Agent":    
 95 |         print("... Using SARSA Agent")
 96 |         agent = SarsaAgent(states_size,actions_size,lr = lr,gamma = gamma)
 97 | 
 98 |     # Store the rewards
 99 |     rewards = []
100 |     
101 |     # Experience replay
102 |     for i in tqdm(range(n_episodes)):
103 |         
104 |         # Run the episode
105 |         environment,agent,episode_reward = run_episode(environment,agent,verbose = 0)
106 |         rewards.append(episode_reward)
107 | 
108 |     return environment,agent,rewards
109 |         
110 | 
111 | class Clicks(object):
112 |     def __init__(self):
113 |         self.count = 0
114 | 
115 | reset_clicks = Clicks()
116 | train_clicks = Clicks()
117 | env = DataCenterCooling()
118 | np.random.seed()
119 | 
120 | #---------------------------------------------------------------------------------
121 | # CREATE THE APP
122 | app = dash.Dash("Data Cooling Center")
123 | 
124 | 
125 | # # Making the app available offline
126 | offline = False
127 | app.css.config.serve_locally = offline
128 | app.scripts.config.serve_locally = offline
129 | 
130 | 
131 | style = {
132 |     'font-weight': 'bolder',
133 |     'font-family': 'Product Sans',
134 |     }
135 | 
136 | container_style = {
137 |     "margin":"20px",
138 | }
139 | 
140 | 
141 | 
142 | AGENTS = [{"label":x,"value":x} for x in ["Q Agent","SARSA Agent","Deep-Q-Network Agent","Policy Gradient Agent"]]
143 | 
144 | #---------------------------------------------------------------------------------
145 | # LAYOUT
146 | app.layout = html.Div(children=[
147 | 
148 | 
149 | 
150 | 
151 | 
152 |     # HEADER FIRST CONTAINER
153 |     html.Div([
154 |         html.H2("Data Center Cooling",style = {'color': "rgba(117, 117, 117, 0.95)",**style}),
155 | 
156 |         html.Div([
157 |             html.H4("Environment",style = {'color': "rgba(117, 117, 117, 0.95)",**style}),
158 |             html.P("Cooling levels",id = "cooling"),
159 |             dcc.Slider(min=10,max=100,step=10,value=10,id = "levels-cooling"),
160 |             html.P("Cost factor",id = "cost-factor"),
161 |             dcc.Slider(min=0.0,max=5,step=0.1,value=1,id = "levels-cost-factor"),
162 |             html.P("Risk factor",id = "risk-factor"),
163 |             dcc.Slider(min=0.0,max=5,step=0.1,value=1,id = "levels-risk-factor"),    
164 |             html.Br(),
165 |             html.Button("Reset",id = "reset-env",style = style,n_clicks = 0),
166 |         ],style = {"height":"50%"}),
167 | 
168 | 
169 |         html.Div([
170 |             html.H4("Agent",style = {'color': "rgba(117, 117, 117, 0.95)",**style}),
171 |             dcc.Dropdown(id = "input-agent",options = AGENTS,value = "Q Agent",multi = False),
172 |             html.P("N episodes",id = "input-episodes"),
173 |             dcc.Slider(min=500,max=10000,step=500,value=5000,id = "n-episodes"),
174 |             html.P("Learning rate",id = "input-lr"),
175 |             dcc.Slider(min=0.001,max=1.0,step=0.005,value=0.1,id = "lr"),
176 |             html.Br(),
177 |             html.Button("Train",id = "training",style = style,n_clicks = 0),
178 |         ],style = {"height":"50%"}),
179 | 
180 | 
181 | 
182 |     ],style={**style,**container_style,'width': '20%',"height":"800px", 'float' : 'left', 'display': 'inline'}, className="container"),
183 | 
184 | 
185 | 
186 | 
187 |     # ANALYTICS CONTAINER
188 |     html.Div([
189 | 
190 |         dcc.Graph(id='render',animate = False,figure = env.render(with_plotly = True),style = {"height":"100%"}),
191 | 
192 | 
193 |     ],style={**style,**container_style,'width': '55%',"height":"800px", 'float' : 'right', 'display': 'inline'}, className="container"),
194 | 
195 | 
196 | ])
197 | 
198 | 
199 | 
200 | 
201 | #---------------------------------------------------------------------------------
202 | # CALLBACKS
203 | 
204 | 
205 | 
206 | # Callback to stop the streaming
207 | @app.callback(
208 |     Output("render","figure"),
209 |     [Input('reset-env','n_clicks'),Input('training','n_clicks'),Input('levels-cost-factor','value'),Input('levels-risk-factor','value')],
210 |     state = [State('levels-cooling','value'),State('lr','value'),State('n-episodes','value'),State('input-agent','value')]
211 | 
212 |     )
213 | def render(click_reset,click_training,cost_factor,risk_factor,levels_cooling,lr,n_episodes,type_agent):
214 | 
215 | 
216 |     print("Reset ",click_reset," - ",reset_clicks.count)
217 |     print("Train ",click_training," - ",train_clicks.count)
218 | 
219 | 
220 |     if click_reset > reset_clicks.count:
221 |         reset_clicks.count = click_reset
222 |         env.__init__(levels_cooling = levels_cooling,risk_factor = risk_factor,cost_factor = cost_factor,keep_cooling = True)
223 | 
224 |     elif click_training > train_clicks.count:
225 |         train_clicks.count = click_training
226 |         env_temp,agent,rewards = run_n_episodes(env,n_episodes = n_episodes,lr = lr,type_agent = type_agent)
227 |         utils.plot_average_running_rewards(rewards,"C:/Users/talvesdacosta/Desktop/results.png")
228 |         # os.system("start "+"C:/Users/talvesdacosta/Desktop/results.png")
229 |         env.cooling = env_temp.cooling
230 |     else:
231 |         env.risk_factor = risk_factor
232 |         env.cost_factor = cost_factor
233 | 
234 | 
235 | 
236 |     return env.render(with_plotly = True)
237 | 
238 | 
239 | 
240 | 
241 | @app.callback(
242 |     Output("cooling","children"),
243 |     [Input('levels-cooling','value')])
244 | def update_cooling(value):
245 |     env.levels_cooling = value
246 |     env.define_cooling(value)
247 |     return "Cooling levels : {}".format(value)
248 | 
249 | 
250 | 
251 | @app.callback(
252 |     Output("risk-factor","children"),
253 |     [Input('levels-risk-factor','value')])
254 | def update_risk(value):
255 |     return "Risk factor : {}".format(value)
256 | 
257 | 
258 | 
259 | @app.callback(
260 |     Output("cost-factor","children"),
261 |     [Input('levels-cost-factor','value')])
262 | def update_cost(value):
263 |     return "Cost factor : {}".format(value)
264 | 
265 | @app.callback(
266 |     Output("input-episodes","children"),
267 |     [Input('n-episodes','value')])
268 | def update_episodes(value):
269 |     return "N episodes : {}".format(value)
270 | 
271 | @app.callback(
272 |     Output("input-lr","children"),
273 |     [Input('lr','value')])
274 | def update_lr(value):
275 |     return "Learning rate : {}".format(value)
276 | 
277 | 
278 | 
279 | 
280 | 
281 | 
282 | #---------------------------------------------------------------------------------
283 | # ADD EXTERNAL CSS
284 | 
285 | external_css = ["https://fonts.googleapis.com/css?family=Product+Sans:400,400i,700,700i",
286 |                 "https://cdn.rawgit.com/plotly/dash-app-stylesheets/2cc54b8c03f4126569a3440aae611bbef1d7a5dd/stylesheet.css"]
287 | 
288 | for css in external_css:
289 |     app.css.append_css({"external_url": css})
290 | 
291 | 
292 | 
293 | 
294 | 
295 | 
296 | 
297 | #---------------------------------------------------------------------------------
298 | # RUN SERVER
299 | if __name__ == '__main__':
300 |     app.run_server(debug=True)
301 |     np.random.seed()


--------------------------------------------------------------------------------
/3. Robotics/minitaur.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | 
  8 | Started on the 25/08/2017
  9 | 
 10 | theo.alves.da.costa@gmail.com
 11 | https://github.com/theolvs
 12 | ------------------------------------------------------------------------
 13 | """
 14 | 
 15 | 
 16 | 
 17 | 
 18 | 
 19 | # Usual libraries
 20 | import os
 21 | import matplotlib.pyplot as plt
 22 | import pandas as pd
 23 | import numpy as np 
 24 | import sys
 25 | import random
 26 | import time
 27 | from tqdm import tqdm
 28 | import random
 29 | import gym
 30 | import numpy as np
 31 | 
 32 | 
 33 | # Keras (Deep Learning)
 34 | from keras.models import Sequential
 35 | from keras.layers import Dense
 36 | from keras.optimizers import Adam
 37 | 
 38 | 
 39 | # Custom RL library
 40 | import sys
 41 | sys.path.insert(0,'..')
 42 | 
 43 | from rl import utils
 44 | from rl.agents.dqn_agent import DQNAgent
 45 | 
 46 | import pybullet_envs.bullet.minitaur_gym_env as e
 47 | 
 48 | 
 49 | 
 50 | 
 51 | 
 52 | #----------------------------------------------------------------
 53 | # CONSTANTS
 54 | 
 55 | 
 56 | N_EPISODES = 1000
 57 | MAX_STEPS = 2000
 58 | RENDER = True
 59 | RENDER_EVERY = 50
 60 | 
 61 | 
 62 | 
 63 | #----------------------------------------------------------------
 64 | # MAIN LOOP
 65 | 
 66 | 
 67 | if __name__ == "__main__":
 68 | 
 69 |     # Define the gym environment
 70 |     env = e.MinitaurBulletEnv(render=True)
 71 | 
 72 |     # Get the environement action and observation space
 73 |     state_size = env.observation_space.shape[0]
 74 |     action_size = env.action_space.shape[0]
 75 | 
 76 |     # Create the RL Agent
 77 |     agent = DQNAgent(state_size,action_size,low = -1,high = 1,action_type="continuous")
 78 | 
 79 |     # Initialize a list to store the rewards
 80 |     rewards = []
 81 | 
 82 | 
 83 | 
 84 | 
 85 | 
 86 |     #---------------------------------------------
 87 |     # ITERATION OVER EPISODES
 88 |     for i_episode in range(N_EPISODES):
 89 | 
 90 | 
 91 | 
 92 |         # Reset the environment
 93 |         s = env.reset()
 94 |         reward = 0
 95 | 
 96 | 
 97 |         #-----------------------------------------
 98 |         # EPISODE RUN
 99 |         for i_step in range(MAX_STEPS):
100 |         
101 |             # Render the environement
102 |             if RENDER : env.render() #and (i_step % RENDER_EVERY == 0)
103 | 
104 |             # The agent chose the action considering the given current state
105 |             a = agent.act(s)
106 |             
107 |             # Take the action, get the reward from environment and go to the next state
108 |             s_next,r,done,info = env.step(a)
109 |             reward += r
110 | 
111 |             # Remember the important variables
112 |             agent.remember(s,a,r,s_next,done)
113 |                 
114 |             # Go to the next state
115 |             s = s_next
116 |             
117 |             # If the episode is terminated
118 |             if done:
119 |                 print("Episode {}/{} finished after {} timesteps - epsilon : {:.2} - reward : {:.2}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon,reward))
120 |                 break
121 | 
122 | 
123 |         #-----------------------------------------
124 | 
125 |         # Store the rewards
126 |         rewards.append(i_step)
127 | 
128 | 
129 |         # Training
130 |         agent.train(batch_size = 128)
131 | 
132 | 
133 | 
134 | 
135 | 
136 |     # Plot the average running rewards
137 |     utils.plot_average_running_rewards(rewards)
138 | 


--------------------------------------------------------------------------------
/4. Chrome Dino/README.md:
--------------------------------------------------------------------------------
 1 | # Chrome Dino Project
 2 | ## Playing and solving the Chrome Dinosaur Game with Evolution Strategies and PyTorch
 3 | ![](http://www.skipser.com/test/trex-game/promotion/trex-chrome-game.png)
 4 | 
 5 | 
 6 | ##### Summary
 7 | - Capturing image from the game - **OK**
 8 | - Allowing control programmatically - **OK**
 9 | - Trying a simple implementation of rules-based agent with classic CV algorithms - **OK** 
10 | - Capturing scores for fitness and reward - **OK**
11 | - Creating the environment for RL - **OK**
12 | - Developing a RL agent that learns via evolution strategies - **OK**
13 | - Different experiments on both agent and method of learning
14 | 
15 | 
16 | ##### Ideas 
17 | - Taking as input of the neural network
18 |   - The boundaries of the obstacles in a 1D vector
19 |   - The raw image
20 |   - The processed image
21 | - Initialize the agent with hard coded policy
22 | - Combine the RL agent and the rules-based Agent
23 | - Try other evolution strategies
24 |   - Crossover on the fitness
25 |   - Simple ES
26 |   - CMA-ES
27 | 
28 | 
29 | ##### Experiments : 
30 | 1. **Genetic algorithm** : Generation of 20 dinos, 5 survive, and make 10 offsprings. 10 random dinos are created to complete the 20 population. Did not work at all after 100 generations, still an average score of 50 which is stopping at the first obstacle. This was tested without mutations. The Neural Network is very shallow MLP with one 100-unit hidden layer. 
31 | 2. **Genetic algorithm** : Generation of 40 dinos, 10 survive, make 45 offsprings, but only 40 are selected at random to recreate the 40-population. Added mutations with gaussian noise at this step. Tried as well with a shallow MLP but also with a simple logistic regression in PyTorch
32 | 3. **Genetic algorithm** : Generation of 50 dinos, 12 survive, make 66 offsprings, but only 38 are selected at random to recreate the population. The input is now modelled by a vector with the position on the x axis of the next 2 obstacles. Thus I went back to a shallow MLP with the following structure ``(2 input features,50 hidden layers,1 output)`` giving me the probability to jump. When ensuring a high mutation factor for the gaussian noise to have more exploration. The dinosaurs reach a max score of 600 in about 70 generations of 50 dinos (6 hours on my laptop). But they fail when reaching the birds that were not included in the training. 
33 | 4. **Evolution Strategy** : I went back to a simple evolution strategy to focus the training on the dino with the good behavior. The selection will be the top 10 or 20% at each generation. Then the next generation is created based on the fittest on which is adding gaussian noise as the mutations. With this strategy the dinosaur reach a max score of 600 in about 20 generations of 50 dinos. This works better than the last solution, but it is always falling to local optimas with dino jumping all the time to maximize their score. 
34 | 5. **Evolution Strategy** : to correct the bad behavior of jumping all the time, I added a discount factor if moves are done when there is no obstacles. By counting the number of obstacles passed and the number of moves. The new reward is then modelled in the fashion of the Bellman equation, by incrementing a discounted reward to the previous reward. With this correction, after one generation the "always-jumping" behavior has disappeared, and with a few generations the dinos reach a good enough policy. In 10 generations of 10 dinos only (only 10 minutes on my laptop) we reach easily the max score of 600 previously reached, with a good enough average policy. But new issues arise : birds that come after 600 points which require to duck, speed increasing over time, long obstacles which would require to jump before. Here is a screen capture of the game at this state : 
35 | ![](images/dino_ml_agent1.gif)
36 |  
37 | 
38 | 
39 | ##### Misc
40 | - Finding parameter on when to jump
41 | - Logreg/NN on the first and second position of obstacles
42 | - ML + Heuristics model
43 | - Bayesian priors


--------------------------------------------------------------------------------
/4. Chrome Dino/experiments.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | GENETIC ALGORITHMS EXPERIMENTS
  7 | Started on the 2018/01/03
  8 | theo.alves.da.costa@gmail.com
  9 | https://github.com/theolvs
 10 | ------------------------------------------------------------------------
 11 | """
 12 | 
 13 | from scipy import stats
 14 | import seaborn as sns
 15 | import os
 16 | import matplotlib.pyplot as plt
 17 | import pandas as pd
 18 | import numpy as np
 19 | import sys
 20 | import time
 21 | from tqdm import tqdm
 22 | import itertools
 23 | 
 24 | 
 25 | 
 26 | 
 27 | #=============================================================================================================================
 28 | # DISTRIBUTIONS
 29 | #=============================================================================================================================
 30 | 
 31 | 
 32 | 
 33 | 
 34 | 
 35 | class Dist(object):
 36 |     def __init__(self,mu = None,std = None,label = None):
 37 |         self.mu = np.random.rand()*20 - 10 if mu is None else mu
 38 |         self.std = np.random.rand()*10 if std is None else std
 39 |         self.label = "" if not label else " - "+label
 40 |         self.func = lambda x : stats.norm.cdf(x,loc = self.mu,scale = self.std)
 41 |         
 42 |     def __repr__(self,markdown = False):
 43 |         return "Norm {1}mu={2}{0}, {0}std={3}{0}{4}".format("$" if markdown else "","$\\" if markdown else "",
 44 |                                                              round(self.mu,2),round(self.std,2),self.label)
 45 |         
 46 |     def plot(self,fill = True):
 47 |         x = np.linspace(-20, 20, 100)
 48 |         y = stats.norm.pdf(x,loc = self.mu,scale = self.std)
 49 |         plt.plot(x,y,label = self.__repr__(markdown = True))
 50 |         if fill:
 51 |             plt.fill_between(x, 0, y, alpha=0.4)
 52 |         
 53 |         
 54 |     def __add__(self,other):
 55 |         mu = np.mean([self.mu,other.mu])
 56 |         std = np.mean([self.std,other.std])
 57 |         return Dist(mu,std)
 58 |     
 59 |     def mutate(self,alpha = 1):
 60 |         self.mu = self.mu + 1/(1+np.log(1+alpha)) * np.random.randn()
 61 |         self.std = max(self.std + 1/(1+np.log(1+alpha)) * np.random.randn(),0.5)
 62 |         
 63 |     def fitness(self,x):
 64 |         return 1 - stats.kstest(x,self.func).statistic
 65 | 
 66 | 
 67 | 
 68 | 
 69 | 
 70 | 
 71 | 
 72 | 
 73 | 
 74 | 
 75 | class Population(object):
 76 |     def __init__(self,distributions = None,n = 100):
 77 |         if distributions is not None:
 78 |             self.distributions = distributions
 79 |         else:
 80 |             self.distributions = [Dist() for i in range(n)]
 81 |             
 82 |     def __getitem__(self,key):
 83 |         if type(key) == tuple or type(key) == list:
 84 |             d = []
 85 |             for i in key:
 86 |                 d.append(self.distributions[i])
 87 |             return d
 88 |         else:
 89 |             return self.distributions[key]
 90 |     
 91 |     def __iter__(self):
 92 |         return iter(self.distributions)
 93 |     
 94 |     def __len__(self):
 95 |         return len(self.distributions)
 96 |     
 97 |     def plot(self,title = "Normal distributions",figsize = None):
 98 |         if figsize:
 99 |             plt.figure(figsize = figsize)
100 |         plt.title(title)
101 |         fill = len(self) < 5
102 |         for d in self:
103 |             d.plot(fill = fill)
104 |         plt.legend()
105 |         plt.xlabel("x")
106 |         plt.show()
107 |     
108 |     def evaluate(self,x):
109 |         fitnesses = [(i,dist.fitness(x)) for i,dist in enumerate(self)]
110 |         indices,fitnesses = zip(*sorted(fitnesses,key = lambda x : x[1],reverse = True))
111 |         return indices,fitnesses
112 |     
113 |     def selection(self,x,top = 0.1):
114 |         indices,fitnesses = self.evaluate(x)
115 |         n = int(top*len(fitnesses))
116 |         return indices[:n]
117 |     
118 |     
119 |     def crossover(self,indices):
120 |         combinations = list(itertools.combinations(indices,2))
121 |         np.random.shuffle(combinations)
122 |         combinations = combinations[:len(self)]
123 |         new_population = []
124 |         for i,j in combinations:
125 |             new_population.append(self[i]+self[j])
126 |         self.distributions = new_population
127 |             
128 |     def mutate(self,generation = 1):
129 |         for d in self:
130 |             d.mutate(generation)
131 |             
132 |             
133 |     def evolve(self,x,top = 0.25,n_generations = 20,last_selection = True):
134 |         all_fitnesses = [self.evaluate(x)[1]]
135 | 
136 |         for generation in tqdm(range(n_generations)):
137 | 
138 |             indices = self.selection(x,top)
139 |             self.crossover(indices)
140 |             self.mutate(generation)
141 |             
142 |             indices,fitnesses = self.evaluate(x)
143 |             all_fitnesses.append(fitnesses)
144 |             
145 |         self._plot_fitnesses(all_fitnesses)
146 |         
147 |         if last_selection:
148 |             indices = self.selection(x,top)
149 |     
150 |         return Population(self[indices])
151 |     
152 |     
153 |     def _plot_fitnesses(self,fitnesses):
154 |         sups = []
155 |         infs = []
156 |         means = []
157 |         for step in fitnesses:
158 |             sups.append(np.max(step))
159 |             infs.append(np.min(step))
160 |             means.append(np.mean(step))
161 |             
162 |         plt.figure(figsize=(10,6))
163 |         plt.plot(means)
164 |         plt.fill_between(range(len(means)),sups,infs, alpha = 0.2)
165 |         plt.xlabel('# Generation')
166 |         plt.ylabel('Fitness')
167 |         plt.legend()
168 |         plt.show()
169 | 
170 | 
171 | 
172 | 
173 | 
174 | #=============================================================================================================================
175 | # LOGREG
176 | #=============================================================================================================================
177 | 
178 | 
179 | 
180 | import torch
181 | from torch.autograd import Variable
182 | import torch.nn as nn
183 | import torch.nn.functional as F
184 | 
185 | 
186 | 
187 | 
188 | class LogReg(torch.nn.Module):
189 |     def __init__(self, n_feature,n_output = 1,alpha = 10e-1):
190 |         self.alpha = alpha
191 |         self.args = n_feature,n_output
192 |         super(LogReg, self).__init__()
193 |         self.out = torch.nn.Linear(n_feature,n_output,bias = False)   # output layer
194 | 
195 |     def forward(self, x):
196 |         x = Variable(torch.FloatTensor(x))
197 |         x = F.sigmoid(self.out(x))
198 |         return x
199 | 
200 | 
201 |     def __add__(self,other):
202 |         new = LogReg(*self.args)
203 |         new.out.weight.data = torch.FloatTensor(0.5 * (self.out.weight.data.numpy() + other.out.weight.data.numpy()))
204 |         return new
205 | 
206 | 
207 |     def mutate(self,generation):
208 |         out = self.out.weight.data.numpy()
209 |         noise_out = self.alpha * np.random.randn(*out.shape)
210 |         self.out.weight.data = torch.FloatTensor(self.out.weight.data.numpy() + noise_out)
211 | 
212 | 
213 |     def evaluate(self,x,y):
214 |         pred = self.forward(x).data.numpy()
215 |         loss_1 = np.sum(np.log(pred + 10e-9)*y.reshape(-1,1))
216 |         loss_0 = np.sum(np.log(1-pred + 10e-9)*(1-y).reshape(-1,1))
217 |         return loss_1 + loss_0
218 | 
219 | 
220 |     def plot_coefs(self):
221 |         plt.figure(figsize = (15,4))
222 |         plt.title("Coefficients")
223 |         plt.axhline(0,c = "black")
224 |         plt.plot(self.out.weight.data.numpy()[0])
225 |         plt.xlabel("# Pixel")
226 |         plt.show()
227 | 
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
234 | 
235 | class PopulationLogReg(object):
236 |     def __init__(self,x,y,regs = None,n = 20,top = 0.25,**kwargs):
237 | 
238 |         self.x = x
239 |         self.y = y
240 |         self.kwargs = kwargs
241 | 
242 |         if regs is None:
243 |             self.regs = [LogReg(**kwargs) for i in range(n)]
244 |         else:
245 |             self.regs = regs
246 | 
247 | 
248 |     def __getitem__(self,key):
249 |         if type(key) == tuple or type(key) == list:
250 |             d = []
251 |             for i in key:
252 |                 d.append(self.regs[i])
253 |             return d
254 |         else:
255 |             return self.regs[key]
256 |     
257 |     def __iter__(self):
258 |         return iter(self.regs)
259 |     
260 |     def __len__(self):
261 |         return len(self.regs)
262 | 
263 | 
264 | 
265 |     def evaluate(self):
266 |         fitnesses = [(i,element.evaluate(self.x,self.y)) for i,element in enumerate(self)]
267 |         indices,fitnesses = zip(*sorted(fitnesses,key = lambda x : x[1],reverse = True))
268 |         return indices,fitnesses
269 | 
270 | 
271 | 
272 |     def selection(self,top = 0.5):
273 |         indices,fitnesses = self.evaluate()
274 |         n = int(top*len(fitnesses))
275 |         return indices[:n]
276 | 
277 | 
278 | 
279 |     def crossover(self,indices):
280 |         combinations = list(itertools.combinations(indices,2))
281 |         np.random.shuffle(combinations)
282 |         combinations = combinations[:len(self)]
283 |         new_population = []
284 |         for i,j in combinations:
285 |             new_population.append(self[i]+self[j])
286 | 
287 |         if len(new_population) < len(self):
288 |             new_population.extend([LogReg(**self.kwargs) for i in range(len(self)-len(new_population))])
289 |         self.regs = new_population
290 | 
291 | 
292 | 
293 |     def mutate(self,generation):
294 |         for d in self:
295 |             d.mutate(generation)
296 | 
297 | 
298 |             
299 |     def evolve(self,top = 0.25,n_generations = 20,last_selection = True):
300 |         n_fittest = int(top*len(self))
301 |         offsprings = len(list(itertools.combinations(range(n_fittest),2)))
302 |         print("- Generations {}".format(len(self)))
303 |         print("- Fittest : {}".format(n_fittest))
304 |         print("- Offsprings : {}".format(offsprings))
305 | 
306 |         all_fitnesses = [self.evaluate()[1]]
307 | 
308 |         for generation in tqdm(range(n_generations)):
309 | 
310 |             indices = self.selection(top)
311 |             self.crossover(indices)
312 |             self.mutate(generation)
313 |             
314 |             indices,fitnesses = self.evaluate()
315 |             all_fitnesses.append(fitnesses)
316 |             
317 |         self._plot_fitnesses(all_fitnesses)
318 |         
319 |         if last_selection:
320 |             indices = self.selection(top)
321 |     
322 |         return PopulationLogReg(self.x,self.y,regs = self[indices])
323 | 
324 |     
325 |     
326 |     def _plot_fitnesses(self,fitnesses):
327 | 
328 |         from sklearn.linear_model import LogisticRegression
329 |         lr = LogisticRegression()
330 |         lr.fit(self.x,self.y)
331 |         pred_bench = lr.predict_proba(self.x)
332 |         loss_bench = np.sum(np.log(pred_bench + 10e-9)*self.y.reshape(-1,1)) + np.sum(np.log(1-pred_bench + 10e-9)*(1-self.y).reshape(-1,1))
333 | 
334 |         sups = []
335 |         infs = []
336 |         means = []
337 |         for step in fitnesses:
338 |             sups.append(np.max(step))
339 |             infs.append(np.min(step))
340 |             means.append(np.mean(step))
341 |             
342 |         plt.figure(figsize=(10,6))
343 |         plt.plot(means)
344 |         plt.fill_between(range(len(means)),sups,infs, alpha = 0.2)
345 |         plt.axhline(loss_bench)
346 |         plt.xlabel('# Generation')
347 |         plt.ylabel('Fitness')
348 |         plt.legend()
349 |         plt.show()
350 | 
351 | 
352 | 


--------------------------------------------------------------------------------
/4. Chrome Dino/images/capture1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/capture1.png


--------------------------------------------------------------------------------
/4. Chrome Dino/images/dino_hardcoded_agent.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/dino_hardcoded_agent.gif


--------------------------------------------------------------------------------
/4. Chrome Dino/images/dino_ml_agent1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/dino_ml_agent1.gif


--------------------------------------------------------------------------------
/4. Chrome Dino/images/dino_ml_agent1_bad.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/dino_ml_agent1_bad.gif


--------------------------------------------------------------------------------
/5. Delivery Optimization/README.md:
--------------------------------------------------------------------------------
 1 | # Delivery optimization with Reinforcement Learning
 2 | ![](http://img.chefdentreprise.com/Img/BREVE/2018/3/328332/recette-nestor-atteindre-rentabilite-F.jpg)
 3 | 
 4 | This folder contains experiments to solve transportation optimization using **Reinforcement Learning** algorithm <br>
 5 | It will use the code of RL agents previously created in this repo.
 6 | 
 7 | > The overall goal is to optimize routing between deliveries via **experience replay** <br>
 8 | > And be robust to anomalies such as traffic slowing down the vehicles in a zone
 9 | 
10 | ##### Preliminary remarks
11 | Such a problem (Travelling Salesman Problem) has many possible solutions including brute force or heuristic solutions. <br>
12 | The goal here was to demonstrate the use of Reinforcement Learning in particular **when the cost function between two points is stochastic**<br>
13 | It shows also a different resolution with a algorithm that could be used in a live system and automatically improves other time towards the best stragies.  
14 | 
15 | 
16 | # The environment 
17 | 
18 | ## Environment implementation
19 | 
20 | All the environment was coded from scratch with parameterized : 
21 | - Number of stops for delivery
22 | - Traffic zone size (optional)
23 | - Traffic intensity (optional)
24 | 
25 | *The convention used are the same as for OpenAI Gym's environments*<br>
26 | *Only numpy and other basic libraries are used here for the environment*
27 | 
28 | ##### Base environment with one trajectory
29 | ![](env1.png)
30 | 
31 | ##### Base environment with 500 stops
32 | ![](env2.png)
33 | 
34 | ##### Base environment with traffic zone
35 | ![](env3.png)
36 | 
37 | ## Rewards
38 | - Rewards from the environment between two delivery stops are simply the time elapsed between two travels which is calculated by taking the euclidean distance between two points plus a gaussian noise
39 | - If the trajectory between two stops goes through the traffic zone, the time elapsed is longer via a noise proportional to the distance through the zone and the traffic intensity parameter
40 | 
41 | 
42 | # The algorithm 
43 | 
44 | ## Q-Learning
45 | - A simple **Q-Learning** algorithm already gave interesting results. 
46 | - The **reward** taken was the opposite of the time elapsed taken from the environment
47 | - An **epsilon-greedy** strategy allow to discover new paths and strategies while exploring
48 | 
49 | ##### Training
50 | Over experience replays, the delivery takes less and less time
51 | ![](training.png)
52 | 
53 | ### Results
54 | ##### 50 stops with no traffic
55 | ![](training_50_stops.gif)
56 | 
57 | 
58 | ##### 100 stops with no traffic
59 | ![](training_100_stops.gif)
60 | 
61 | ##### 500 stops with no traffic
62 | ![](training_500_stops.gif)
63 | 
64 | ##### 100 stops with intense traffic
65 | ![](training_100_stops_traffic.gif)
66 | 
67 | ##### 500 stops with intense traffic
68 | ![](training_500_stops_traffic.gif)
69 | 
70 | 
71 | # Next steps
72 | - Test other simple algorithms like SARSA
73 | - Switch from discrete to continuous problems with Deep-Q-Learning (start including continuous observation space) and then DDPG (including continuous action space)
74 | 
75 | 
76 | 
77 | 


--------------------------------------------------------------------------------
/5. Delivery Optimization/Routing optimization with Deep Reinforcement Learning.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# Routing optimization using Deep Reinforcement Learning"
 8 |    ]
 9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": 1,
13 |    "metadata": {
14 |     "ExecuteTime": {
15 |      "end_time": "2019-09-17T17:47:22.119995Z",
16 |      "start_time": "2019-09-17T17:47:20.289509Z"
17 |     }
18 |    },
19 |    "outputs": [],
20 |    "source": [
21 |     "# Base Data Science snippet\n",
22 |     "import pandas as pd\n",
23 |     "import numpy as np\n",
24 |     "import matplotlib.pyplot as plt\n",
25 |     "import os\n",
26 |     "import time\n",
27 |     "from tqdm import tqdm_notebook\n",
28 |     "\n",
29 |     "%matplotlib inline\n",
30 |     "%load_ext autoreload\n",
31 |     "%autoreload 2"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": null,
37 |    "metadata": {},
38 |    "outputs": [],
39 |    "source": []
40 |   }
41 |  ],
42 |  "metadata": {
43 |   "kernelspec": {
44 |    "display_name": "Python 3",
45 |    "language": "python",
46 |    "name": "python3"
47 |   },
48 |   "language_info": {
49 |    "codemirror_mode": {
50 |     "name": "ipython",
51 |     "version": 3
52 |    },
53 |    "file_extension": ".py",
54 |    "mimetype": "text/x-python",
55 |    "name": "python",
56 |    "nbconvert_exporter": "python",
57 |    "pygments_lexer": "ipython3",
58 |    "version": "3.6.4"
59 |   },
60 |   "toc": {
61 |    "base_numbering": 1,
62 |    "nav_menu": {},
63 |    "number_sections": true,
64 |    "sideBar": true,
65 |    "skip_h1_title": false,
66 |    "title_cell": "Table of Contents",
67 |    "title_sidebar": "Contents",
68 |    "toc_cell": false,
69 |    "toc_position": {},
70 |    "toc_section_display": true,
71 |    "toc_window_display": false
72 |   }
73 |  },
74 |  "nbformat": 4,
75 |  "nbformat_minor": 2
76 | }
77 | 


--------------------------------------------------------------------------------
/5. Delivery Optimization/delivery.py:
--------------------------------------------------------------------------------
  1 | # Base Data Science snippet
  2 | import pandas as pd
  3 | import numpy as np
  4 | import matplotlib.pyplot as plt
  5 | import os
  6 | import time
  7 | from tqdm import tqdm_notebook
  8 | from scipy.spatial.distance import cdist
  9 | import imageio
 10 | from matplotlib.patches import Rectangle
 11 | from matplotlib.collections import PatchCollection
 12 | 
 13 | plt.style.use("seaborn-dark")
 14 | 
 15 | import sys
 16 | sys.path.append("../")
 17 | from rl.agents.q_agent import QAgent
 18 | 
 19 | 
 20 | 
 21 | 
 22 | class DeliveryEnvironment(object):
 23 |     def __init__(self,n_stops = 10,max_box = 10,method = "distance",**kwargs):
 24 | 
 25 |         print(f"Initialized Delivery Environment with {n_stops} random stops")
 26 |         print(f"Target metric for optimization is {method}")
 27 | 
 28 |         # Initialization
 29 |         self.n_stops = n_stops
 30 |         self.action_space = self.n_stops
 31 |         self.observation_space = self.n_stops
 32 |         self.max_box = max_box
 33 |         self.stops = []
 34 |         self.method = method
 35 | 
 36 |         # Generate stops
 37 |         self._generate_constraints(**kwargs)
 38 |         self._generate_stops()
 39 |         self._generate_q_values()
 40 |         self.render()
 41 | 
 42 |         # Initialize first point
 43 |         self.reset()
 44 | 
 45 | 
 46 |     def _generate_constraints(self,box_size = 0.2,traffic_intensity = 5):
 47 | 
 48 |         if self.method == "traffic_box":
 49 | 
 50 |             x_left = np.random.rand() * (self.max_box) * (1-box_size)
 51 |             y_bottom = np.random.rand() * (self.max_box) * (1-box_size)
 52 | 
 53 |             x_right = x_left + np.random.rand() * box_size * self.max_box
 54 |             y_top = y_bottom + np.random.rand() * box_size * self.max_box
 55 | 
 56 |             self.box = (x_left,x_right,y_bottom,y_top)
 57 |             self.traffic_intensity = traffic_intensity 
 58 | 
 59 | 
 60 | 
 61 |     def _generate_stops(self):
 62 | 
 63 |         if self.method == "traffic_box":
 64 | 
 65 |             points = []
 66 |             while len(points) < self.n_stops:
 67 |                 x,y = np.random.rand(2)*self.max_box
 68 |                 if not self._is_in_box(x,y,self.box):
 69 |                     points.append((x,y))
 70 | 
 71 |             xy = np.array(points)
 72 | 
 73 |         else:
 74 |             # Generate geographical coordinates
 75 |             xy = np.random.rand(self.n_stops,2)*self.max_box
 76 | 
 77 |         self.x = xy[:,0]
 78 |         self.y = xy[:,1]
 79 | 
 80 | 
 81 |     def _generate_q_values(self,box_size = 0.2):
 82 | 
 83 |         # Generate actual Q Values corresponding to time elapsed between two points
 84 |         if self.method in ["distance","traffic_box"]:
 85 |             xy = np.column_stack([self.x,self.y])
 86 |             self.q_stops = cdist(xy,xy)
 87 |         elif self.method=="time":
 88 |             self.q_stops = np.random.rand(self.n_stops,self.n_stops)*self.max_box
 89 |             np.fill_diagonal(self.q_stops,0)
 90 |         else:
 91 |             raise Exception("Method not recognized")
 92 |     
 93 | 
 94 |     def render(self,return_img = False):
 95 |         
 96 |         fig = plt.figure(figsize=(7,7))
 97 |         ax = fig.add_subplot(111)
 98 |         ax.set_title("Delivery Stops")
 99 | 
100 |         # Show stops
101 |         ax.scatter(self.x,self.y,c = "red",s = 50)
102 | 
103 |         # Show START
104 |         if len(self.stops)>0:
105 |             xy = self._get_xy(initial = True)
106 |             xytext = xy[0]+0.1,xy[1]-0.05
107 |             ax.annotate("START",xy=xy,xytext=xytext,weight = "bold")
108 | 
109 |         # Show itinerary
110 |         if len(self.stops) > 1:
111 |             ax.plot(self.x[self.stops],self.y[self.stops],c = "blue",linewidth=1,linestyle="--")
112 |             
113 |             # Annotate END
114 |             xy = self._get_xy(initial = False)
115 |             xytext = xy[0]+0.1,xy[1]-0.05
116 |             ax.annotate("END",xy=xy,xytext=xytext,weight = "bold")
117 | 
118 | 
119 |         if hasattr(self,"box"):
120 |             left,bottom = self.box[0],self.box[2]
121 |             width = self.box[1] - self.box[0]
122 |             height = self.box[3] - self.box[2]
123 |             rect = Rectangle((left,bottom), width, height)
124 |             collection = PatchCollection([rect],facecolor = "red",alpha = 0.2)
125 |             ax.add_collection(collection)
126 | 
127 | 
128 |         plt.xticks([])
129 |         plt.yticks([])
130 |         
131 |         if return_img:
132 |             # From https://ndres.me/post/matplotlib-animated-gifs-easily/
133 |             fig.canvas.draw_idle()
134 |             image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8')
135 |             image  = image.reshape(fig.canvas.get_width_height()[::-1] + (3,))
136 |             plt.close()
137 |             return image
138 |         else:
139 |             plt.show()
140 | 
141 | 
142 | 
143 |     def reset(self):
144 | 
145 |         # Stops placeholder
146 |         self.stops = []
147 | 
148 |         # Random first stop
149 |         first_stop = np.random.randint(self.n_stops)
150 |         self.stops.append(first_stop)
151 | 
152 |         return first_stop
153 | 
154 | 
155 |     def step(self,destination):
156 | 
157 |         # Get current state
158 |         state = self._get_state()
159 |         new_state = destination
160 | 
161 |         # Get reward for such a move
162 |         reward = self._get_reward(state,new_state)
163 | 
164 |         # Append new_state to stops
165 |         self.stops.append(destination)
166 |         done = len(self.stops) == self.n_stops
167 | 
168 |         return new_state,reward,done
169 |     
170 | 
171 |     def _get_state(self):
172 |         return self.stops[-1]
173 | 
174 | 
175 |     def _get_xy(self,initial = False):
176 |         state = self.stops[0] if initial else self._get_state()
177 |         x = self.x[state]
178 |         y = self.y[state]
179 |         return x,y
180 | 
181 | 
182 |     def _get_reward(self,state,new_state):
183 |         base_reward = self.q_stops[state,new_state]
184 | 
185 |         if self.method == "distance":
186 |             return base_reward
187 |         elif self.method == "time":
188 |             return base_reward + np.random.randn()
189 |         elif self.method == "traffic_box":
190 | 
191 |             # Additional reward correspond to slowing down in traffic
192 |             xs,ys = self.x[state],self.y[state]
193 |             xe,ye = self.x[new_state],self.y[new_state]
194 |             intersections = self._calculate_box_intersection(xs,xe,ys,ye,self.box)
195 |             if len(intersections) > 0:
196 |                 i1,i2 = intersections
197 |                 distance_traffic = np.sqrt((i2[1]-i1[1])**2 + (i2[0]-i1[0])**2)
198 |                 additional_reward = distance_traffic * self.traffic_intensity * np.random.rand()
199 |             else:
200 |                 additional_reward = np.random.rand()
201 | 
202 |             return base_reward + additional_reward
203 | 
204 | 
205 |     @staticmethod
206 |     def _calculate_point(x1,x2,y1,y2,x = None,y = None):
207 | 
208 |         if y1 == y2:
209 |             return y1
210 |         elif x1 == x2:
211 |             return x1
212 |         else:
213 |             a = (y2-y1)/(x2-x1)
214 |             b = y2 - a * x2
215 | 
216 |             if x is None:
217 |                 x = (y-b)/a
218 |                 return x
219 |             elif y is None:
220 |                 y = a*x+b
221 |                 return y
222 |             else:
223 |                 raise Exception("Provide x or y")
224 | 
225 | 
226 |     def _is_in_box(self,x,y,box):
227 |         # Get box coordinates
228 |         x_left,x_right,y_bottom,y_top = box
229 |         return x >= x_left and x <= x_right and y >= y_bottom and y <= y_top
230 | 
231 | 
232 |     def _calculate_box_intersection(self,x1,x2,y1,y2,box):
233 | 
234 |         # Get box coordinates
235 |         x_left,x_right,y_bottom,y_top = box
236 | 
237 |         # Intersections
238 |         intersections = []
239 | 
240 |         # Top intersection
241 |         i_top = self._calculate_point(x1,x2,y1,y2,y=y_top)
242 |         if i_top > x_left and i_top < x_right:
243 |             intersections.append((i_top,y_top))
244 | 
245 |         # Bottom intersection
246 |         i_bottom = self._calculate_point(x1,x2,y1,y2,y=y_bottom)
247 |         if i_bottom > x_left and i_bottom < x_right:
248 |             intersections.append((i_bottom,y_bottom))
249 | 
250 |         # Left intersection
251 |         i_left = self._calculate_point(x1,x2,y1,y2,x=x_left)
252 |         if i_left > y_bottom and i_left < y_top:
253 |             intersections.append((x_left,i_left))
254 | 
255 |         # Right intersection
256 |         i_right = self._calculate_point(x1,x2,y1,y2,x=x_right)
257 |         if i_right > y_bottom and i_right < y_top:
258 |             intersections.append((x_right,i_right))
259 | 
260 |         return intersections
261 | 
262 | 
263 | 
264 | 
265 | 
266 | 
267 | 
268 | def run_episode(env,agent,verbose = 1):
269 | 
270 |     s = env.reset()
271 |     agent.reset_memory()
272 | 
273 |     max_step = env.n_stops
274 |     
275 |     episode_reward = 0
276 |     
277 |     i = 0
278 |     while i < max_step:
279 | 
280 |         # Remember the states
281 |         agent.remember_state(s)
282 | 
283 |         # Choose an action
284 |         a = agent.act(s)
285 |         
286 |         # Take the action, and get the reward from environment
287 |         s_next,r,done = env.step(a)
288 | 
289 |         # Tweak the reward
290 |         r = -1 * r
291 |         
292 |         if verbose: print(s_next,r,done)
293 |         
294 |         # Update our knowledge in the Q-table
295 |         agent.train(s,a,r,s_next)
296 |         
297 |         # Update the caches
298 |         episode_reward += r
299 |         s = s_next
300 |         
301 |         # If the episode is terminated
302 |         i += 1
303 |         if done:
304 |             break
305 |             
306 |     return env,agent,episode_reward
307 | 
308 | 
309 | 
310 | 
311 | 
312 | 
313 | class DeliveryQAgent(QAgent):
314 | 
315 |     def __init__(self,*args,**kwargs):
316 |         super().__init__(*args,**kwargs)
317 |         self.reset_memory()
318 | 
319 |     def act(self,s):
320 | 
321 |         # Get Q Vector
322 |         q = np.copy(self.Q[s,:])
323 | 
324 |         # Avoid already visited states
325 |         q[self.states_memory] = -np.inf
326 | 
327 |         if np.random.rand() > self.epsilon:
328 |             a = np.argmax(q)
329 |         else:
330 |             a = np.random.choice([x for x in range(self.actions_size) if x not in self.states_memory])
331 | 
332 |         return a
333 | 
334 | 
335 |     def remember_state(self,s):
336 |         self.states_memory.append(s)
337 | 
338 |     def reset_memory(self):
339 |         self.states_memory = []
340 | 
341 | 
342 | 
343 | def run_n_episodes(env,agent,name="training.gif",n_episodes=1000,render_each=10,fps=10):
344 | 
345 |     # Store the rewards
346 |     rewards = []
347 |     imgs = []
348 | 
349 |     # Experience replay
350 |     for i in tqdm_notebook(range(n_episodes)):
351 | 
352 |         # Run the episode
353 |         env,agent,episode_reward = run_episode(env,agent,verbose = 0)
354 |         rewards.append(episode_reward)
355 |         
356 |         if i % render_each == 0:
357 |             img = env.render(return_img = True)
358 |             imgs.append(img)
359 | 
360 |     # Show rewards
361 |     plt.figure(figsize = (15,3))
362 |     plt.title("Rewards over training")
363 |     plt.plot(rewards)
364 |     plt.show()
365 | 
366 |     # Save imgs as gif
367 |     imageio.mimsave(name,imgs,fps = fps)
368 | 
369 |     return env,agent


--------------------------------------------------------------------------------
/5. Delivery Optimization/env1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/env1.png


--------------------------------------------------------------------------------
/5. Delivery Optimization/env2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/env2.png


--------------------------------------------------------------------------------
/5. Delivery Optimization/env3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/env3.png


--------------------------------------------------------------------------------
/5. Delivery Optimization/training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training.png


--------------------------------------------------------------------------------
/5. Delivery Optimization/training_100_stops.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_100_stops.gif


--------------------------------------------------------------------------------
/5. Delivery Optimization/training_100_stops_traffic.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_100_stops_traffic.gif


--------------------------------------------------------------------------------
/5. Delivery Optimization/training_10_stops.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_10_stops.gif


--------------------------------------------------------------------------------
/5. Delivery Optimization/training_500_stops.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_500_stops.gif


--------------------------------------------------------------------------------
/5. Delivery Optimization/training_500_stops_traffic.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_500_stops_traffic.gif


--------------------------------------------------------------------------------
/5. Delivery Optimization/training_50_stops.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_50_stops.gif


--------------------------------------------------------------------------------
/6. Solving a Rubik's Cube/rubik.py:
--------------------------------------------------------------------------------
  1 | # Base Data Science snippet
  2 | import pandas as pd
  3 | import numpy as np
  4 | import matplotlib.pyplot as plt
  5 | import os
  6 | import time
  7 | from tqdm import tqdm_notebook
  8 | from matplotlib.patches import Rectangle
  9 | from matplotlib.collections import PatchCollection
 10 | 
 11 | plt.style.use("seaborn-dark")
 12 | 
 13 | # import sys
 14 | # sys.path.append("../")
 15 | # from rl.agents.q_agent import QAgent
 16 | 
 17 | #----------------------------------------------------------------------------------------------------------------------------
 18 | # CONSTANTS
 19 | #----------------------------------------------------------------------------------------------------------------------------
 20 | 
 21 | COLORS = ["red","white","orange","yellow","green","blue"]
 22 | WIDTH_SQUARE = 0.05
 23 | FACES = ["LEFT","FRONT","RIGHT","BACK","TOP","BOTTOM"]
 24 | 
 25 | LEFT_SLICE = np.s_[0,:]
 26 | RIGHT_SLICE = np.s_[-1,:]
 27 | TOP_SLICE = np.s_[:,0]
 28 | BOTTOM_SLICE = np.s_[:,-1]
 29 | 
 30 | FACES_LINK = {
 31 |     "LEFT":[
 32 |         ("BACK",RIGHT_SLICE),
 33 |         ("BOTTOM",LEFT_SLICE),
 34 |         ("FRONT",LEFT_SLICE),
 35 |         ("TOP",LEFT_SLICE),
 36 |     ],
 37 |     "FRONT":[
 38 |         ("LEFT",RIGHT_SLICE),
 39 |         ("BOTTOM",BOTTOM_SLICE),
 40 |         ("RIGHT",LEFT_SLICE),
 41 |         ("TOP",TOP_SLICE),
 42 |     ],
 43 |     "RIGHT":[
 44 |         ("TOP",RIGHT_SLICE),
 45 |         ("FRONT",RIGHT_SLICE),
 46 |         ("BOTTOM",RIGHT_SLICE),
 47 |         ("BACK",LEFT_SLICE),
 48 |     ],
 49 |     "BACK":[
 50 |         ("TOP",BOTTOM_SLICE),
 51 |         ("RIGHT",RIGHT_SLICE),
 52 |         ("BOTTOM",TOP_SLICE),
 53 |         ("LEFT",LEFT_SLICE),
 54 |     ],
 55 |     "TOP":[
 56 |         ("LEFT",BOTTOM_SLICE),
 57 |         ("FRONT",BOTTOM_SLICE),
 58 |         ("RIGHT",BOTTOM_SLICE),
 59 |         ("BACK",BOTTOM_SLICE),
 60 |     ],
 61 |     "BOTTOM":[
 62 |         ("BACK",TOP_SLICE),
 63 |         ("RIGHT",TOP_SLICE),
 64 |         ("FRONT",TOP_SLICE),
 65 |         ("LEFT",TOP_SLICE),
 66 |     ],
 67 | }
 68 | 
 69 | 
 70 | 
 71 | 
 72 | #----------------------------------------------------------------------------------------------------------------------------
 73 | # RUBIKS CUBE ENVIRONMENT CLASS
 74 | #----------------------------------------------------------------------------------------------------------------------------
 75 | 
 76 | class RubiksCube(object):
 77 |     def __init__(self,shuffle = True):
 78 | 
 79 |         print(f"Initialized RubiksCube")
 80 |         self.data = np.array([[i]*9 for i in range(6)])
 81 |         self.data = self._to_1D(self.data)
 82 | 
 83 |         if shuffle:
 84 |             np.random.shuffle(self.data)
 85 | 
 86 |     @staticmethod
 87 |     def _to_1D(array):
 88 |         return np.squeeze(array.reshape(1,-1))
 89 | 
 90 |     @staticmethod
 91 |     def _to_2D(array):
 92 |         return array.reshape(6,9)
 93 | 
 94 |     @staticmethod
 95 |     def _to_square(face):
 96 |         return face.reshape(3,3)
 97 | 
 98 |     @staticmethod
 99 |     def _to_array(face):
100 |         return face.reshape(9)
101 | 
102 | 
103 |     @staticmethod
104 |     def _facestr_to_faceid(face):
105 |         """Convert face as string to face ID (between 0 and 5)
106 |         """
107 |         if isinstance(face,str):
108 |             assert face in FACES
109 |             face = FACES.index(face)
110 |         return face
111 | 
112 | 
113 |     @staticmethod
114 |     def _rotate_array(array,clockwise = True):
115 |         if clockwise:
116 |             return array[1:] + [array[0]]
117 |         else:
118 |             return [array[-1]] + array[:-1]
119 | 
120 | 
121 |     def get_face(self,face,as_square = True):
122 |         """Function to get one face of the Rubik's cube
123 |         """
124 | 
125 |         # Convert face as string to face ID (between 0 and 5)
126 |         face = self._facestr_to_faceid(face)
127 | 
128 |         # Select matching face in the data array
129 |         face = self.data[face*9:(face+1)*9]
130 | 
131 |         # Reshape face data to a square 
132 |         if as_square:
133 |             face = self._to_square(face)
134 | 
135 |         # Return face data
136 |         return face
137 | 
138 | 
139 | 
140 | 
141 |     def set_face(self,face,array):
142 | 
143 |         # Convert face as string to face ID (between 0 and 5)
144 |         face = self._facestr_to_faceid(face)
145 | 
146 |         # Reshape array
147 |         if array.shape == (3,3):
148 |             array = self._to_array(array)
149 | 
150 |         # Set face
151 |         self.data[face*9:(face+1)*9] = array
152 | 
153 | 
154 | 
155 | 
156 | 
157 |     def rotate(self,face,clockwise = True):
158 |         """Rotate one face of the Rubik's cube
159 |         """
160 |         # Convert face as string to face ID (between 0 and 5)
161 |         face_id = self._facestr_to_faceid(face)
162 | 
163 |         # Get face
164 |         face_data = self.get_face(face_id)
165 | 
166 |         # Rotate selected face
167 |         sense = -1 if clockwise else 1
168 |         face_data = np.rot90(face_data,k=sense)
169 |         self.set_face(face,face_data)
170 | 
171 |         # Get other faces
172 |         linked_faces,slices = zip(*FACES_LINK[face])
173 |         slices_data = [np.copy(self.get_face(linked_faces[i])[slices[i]]) for i in range(4)]
174 | 
175 |         # Rotate arrays
176 |         slices_data = self._rotate_array(slices_data,clockwise = clockwise)
177 | 
178 |         # Set new rotated arrays
179 |         for i in range(4):
180 |             face = linked_faces[i]
181 |             face_data = self.get_face(face)
182 |             face_data[slices[i]] = slices_data[i]
183 |             self.set_face(face,face_data)
184 |         
185 | 
186 | 
187 |     def render3D(self):
188 |         pass   
189 | 
190 | 
191 |     def render(self):
192 |         
193 |         fig = plt.figure(figsize=(7,7))
194 |         ax = fig.add_subplot(111)
195 | 
196 |         for i in range(4):
197 |             face_data = self.data[i*9:(i+1)*9]
198 |             face = RubiksFace(face_data)
199 |             face.render(ax = ax,init_height = 0.4,init_width = 0.15 + i*3*(WIDTH_SQUARE+0.005))
200 | 
201 | 
202 |         for i in range(4,6):
203 |             face_data = self.data[i*9:(i+1)*9]
204 |             face = RubiksFace(face_data)
205 |             init_height = 0.4 + 3*(WIDTH_SQUARE+0.005) if i == 4 else 0.4 - 3*(WIDTH_SQUARE+0.005)
206 |             face.render(ax = ax,init_height = init_height,init_width = 0.15 + 3*(WIDTH_SQUARE+0.005))
207 | 
208 |         plt.xticks([])
209 |         plt.yticks([])
210 |         plt.show()
211 | 
212 | 
213 | 
214 | 
215 | 
216 | class RubiksFace(object):
217 |     def __init__(self,array):
218 |         if array.shape == (3,3):
219 |             self.array = array
220 |         else:
221 |             assert len(array) == 9
222 |             self.array = array.reshape(3,3)
223 |         
224 |     def render(self,ax = None,init_height = 0,init_width = 0):
225 | 
226 |         if ax is None:
227 |             fig = plt.figure(figsize=(7,7))
228 |             ax = fig.add_subplot(111)
229 | 
230 | 
231 |         
232 |         for i in range(3):
233 |             for j in range(3):
234 | 
235 |                 square = self.array[i,j]
236 |                 color = COLORS[square]
237 | 
238 |                 rect = Rectangle((init_width + i*WIDTH_SQUARE,init_height + j*WIDTH_SQUARE), WIDTH_SQUARE, WIDTH_SQUARE)
239 |                 collection = PatchCollection([rect],facecolor = color,alpha = 0.8,edgecolor="black")
240 |                 ax.add_collection(collection)
241 | 
242 | 
243 | 
244 | 


--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/20200318 - Hyperion dev.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Hyperion Library development"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "ExecuteTime": {
 15 |      "end_time": "2020-03-18T17:16:33.161399Z",
 16 |      "start_time": "2020-03-18T17:16:31.745503Z"
 17 |     }
 18 |    },
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "# Base Data Science snippet\n",
 22 |     "import pandas as pd\n",
 23 |     "import numpy as np\n",
 24 |     "import matplotlib.pyplot as plt\n",
 25 |     "import os\n",
 26 |     "import time\n",
 27 |     "from tqdm import tqdm_notebook\n",
 28 |     "\n",
 29 |     "%matplotlib inline\n",
 30 |     "%load_ext autoreload\n",
 31 |     "%autoreload 2"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 1,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "import sys\n",
 41 |     "sys.path.append(\"c:/git/reinforcement-learning/\")\n",
 42 |     "\n",
 43 |     "from hyperion.grid import *"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "markdown",
 48 |    "metadata": {},
 49 |    "source": [
 50 |     "# Playground"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 3,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "import pygame\n",
 60 |     "\n",
 61 |     "pygame.init()\n",
 62 |     "\n",
 63 |     "ecran = pygame.display.set_mode((300, 200))"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 4,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "pygame.quit()"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": []
 81 |   }
 82 |  ],
 83 |  "metadata": {
 84 |   "kernelspec": {
 85 |    "display_name": "Python 3",
 86 |    "language": "python",
 87 |    "name": "python3"
 88 |   },
 89 |   "language_info": {
 90 |    "codemirror_mode": {
 91 |     "name": "ipython",
 92 |     "version": 3
 93 |    },
 94 |    "file_extension": ".py",
 95 |    "mimetype": "text/x-python",
 96 |    "name": "python",
 97 |    "nbconvert_exporter": "python",
 98 |    "pygments_lexer": "ipython3",
 99 |    "version": "3.7.4"
100 |   },
101 |   "toc": {
102 |    "base_numbering": 1,
103 |    "nav_menu": {},
104 |    "number_sections": true,
105 |    "sideBar": true,
106 |    "skip_h1_title": false,
107 |    "title_cell": "Table of Contents",
108 |    "title_sidebar": "Contents",
109 |    "toc_cell": false,
110 |    "toc_position": {},
111 |    "toc_section_display": true,
112 |    "toc_window_display": false
113 |   }
114 |  },
115 |  "nbformat": 4,
116 |  "nbformat_minor": 4
117 | }
118 | 


--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/README.md:
--------------------------------------------------------------------------------
 1 | # Multi-Agents simulation
 2 | ![](https://thumbs.gfycat.com/EvergreenGenuineAmethystgemclam-size_restricted.gif)
 3 | 
 4 | Simulations including multiple agents are present everywhere in our daily lives, from large-scale economics policies to epidemiology. <br>
 5 | Agent-based modeling is even more effective when merged with modern AI techniques such as Reinforcement Learning. <br>
 6 | This folder contains experiments on this topics
 7 | 
 8 | # Experiments summary
 9 | - **October 2019** - First attempts to create a Sugarscape experiment. Developed a framework using Dataframes for accelerated computations. Yet too many interactions to code from scratch and low performance
10 | - **December 2019** - Discovered Unity for such simulations + ML Agents
11 | - **March 2019** - Due to COVID19 outbreak, I started experiments on Multi Agent modeling and social distancing. PyGame is a good candidate for 2D simulations similar to Unity but in Python. Many possibilities and spatial O(n2) interactions are really sped up thanks to colliders embedded in PyGame. Movements are still feasible up to 10k agents at least in my first experiments.  Moved experiments in the [westworld](https://github.com/TheoLvs/westworld) repo. 
12 | 
13 | 
14 | # References
15 | ## Libraries & softwares
16 | - Unity
17 | - NetLogo
18 | - [MESA](https://github.com/projectmesa/mesa) - Python
19 | - [SPADE](https://spade-mas.readthedocs.io/en/latest/readme.html) - Python
20 | - [abcEconomics](https://abce.readthedocs.io/en/master/)
21 | - [GAMA-Platform](https://gama-platform.github.io/)
22 | - [Manim](https://github.com/3b1b/manim) by the great Grant Sanderson
23 | - PyGame
24 | 
25 | ## Tutorials
26 | - [Introduction to Agent Based Modeling in Python](https://towardsdatascience.com/introduction-to-mesa-agent-based-modeling-in-python-bcb0596e1c9a)
27 | 
28 | ## Inspiration
29 | - https://www.complexity-explorables.org/
30 | - Sugarscape https://www.youtube.com/watch?v=r_It_X7v-1E
31 | - youtube.com/watch?v=uVpN136q7N8
32 | - youtube.com/watch?v=Bot5_DouTWg
33 | - Ant-based modeling
34 | 
35 | ## Features to implement
36 | - Set and reload data -> ok
37 | - Animation over the simulation (gif ok, ipywidgets to go)
38 | - Action framework with delayed deferrence
39 | - Metrics storage for each agent
40 | - Set up geographical zones and 2D maps with impossible moves
41 | - Find closest agent method
42 | - Wander method
43 | - Launch simulation until certain time + early stopping
44 | - Circle collider
45 | - Optimizing tutorial towardsdatascience.com/speeding-up-python-code-fast-filtering-and-slow-loops-8e11a09a9c2f for more optimization
46 | - A* algorithm for shortest path
47 | - Heatmaps of navigation presence for retail use cases


--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/pygame_test.py:
--------------------------------------------------------------------------------
  1 | """Pygame test for multi agent modeling
  2 | 
  3 | Tutorials
  4 | https://zestedesavoir.com/tutoriels/846/pygame-pour-les-zesteurs/1381_a-la-decouverte-de-pygame/creer-une-simple-fenetre-personnalisable/#1-15425_creons-une-fenetre-basique
  5 | https://www.pygame.org/docs/ref/rect.html#pygame.Rect.move_ip
  6 | https://stackoverflow.com/questions/32061507/moving-a-rectangle-in-pygame 
  7 | 
  8 | 
  9 | Ideas:
 10 | - Add circles
 11 | - Pathfinding algorithm
 12 | - Obstacles
 13 | - Colliders
 14 | - Clicking to add agent or wall
 15 | - Grid
 16 | - AutoMaze
 17 | - Raytracing
 18 | - Change Icon
 19 | - Heatmaps of where agents were located (for retail purposes)
 20 | 
 21 | Projects:
 22 | - Epidemiology
 23 | - See MESA or NetLogo examples
 24 | - Bunny & Rabbits
 25 | """
 26 | 
 27 | import numpy as np
 28 | import pygame
 29 | import time
 30 | import uuid
 31 | 
 32 | # import os
 33 | # os.environ['SDL_VIDEO_WINDOW_POS'] = "%d,%d" % (320,240)
 34 | 
 35 | pygame.init()
 36 | pygame.display.set_caption("Multi Agent Modeling Environment")
 37 | # ecran = pygame.display.set_mode((0, 0), pygame.FULLSCREEN)
 38 | 
 39 | screen = pygame.display.set_mode((1000, 600))
 40 | 
 41 | simulation_on = True
 42 | # time.sleep(5)
 43 | 
 44 | background_colour = (0, 0, 0)
 45 | 
 46 | 
 47 | 
 48 | 
 49 | 
 50 | 
 51 | class RectangleAgent:
 52 | 
 53 |     def __init__(self,width,height,x,y,screen = None):
 54 |         # Rect left top width height
 55 | 
 56 |         self.screen = screen
 57 |         self.fig = pygame.rect.Rect((x,y,width,height))
 58 |         # print(f"Initialized rect at {self.pos}")
 59 | 
 60 |         self.change_direction()
 61 | 
 62 |         self.agent_id = str(uuid.uuid1())
 63 | 
 64 | 
 65 |     @property
 66 |     def pos(self):
 67 |         return self.fig.x,self.fig.y,self.fig.width,self.fig.height
 68 | 
 69 |     def move_at(self,x,y):
 70 |         self.x = x
 71 |         self.y = y
 72 | 
 73 |     
 74 |     def wander(self,dl):
 75 | 
 76 |         self.move(angle = self.direction_angle,dl = dl)
 77 | 
 78 | 
 79 |     def change_direction(self):
 80 |         self.direction_angle = np.random.uniform(0,2*np.pi)
 81 | 
 82 | 
 83 |     def move_towards(self):
 84 |         pass
 85 | 
 86 | 
 87 |     def collides(self,agents):
 88 | 
 89 |         if len(agents) == 0:
 90 |             collisions = []
 91 |         else:
 92 |             other_agents = [agent.fig for agent in agents if agent.agent_id != self.agent_id]
 93 |             collisions = self.fig.collidelistall(other_agents)
 94 | 
 95 |         if len(collisions) > 0:
 96 |             return True,collisions
 97 |         else:
 98 |             return False,collisions
 99 | 
100 | 
101 |     def if_collides(self,agents):
102 | 
103 |         is_collision,collisions = self.collides(agents)
104 | 
105 |         if is_collision:
106 |             self.direction_angle += np.pi
107 | 
108 |     
109 | 
110 |     def move(self,dx = 0,dy = 0,angle = None,dl = None,colliders = None):
111 | 
112 |         if angle is not None:
113 |             assert dl is not None
114 | 
115 |             # Compute delta directions with basic trigonometry
116 |             dx = dl * np.cos(angle)
117 |             dy = dl * np.sin(angle)
118 |             self.move(dx = dx,dy = dy)
119 | 
120 |         else:
121 |     
122 |             screen_width = self.screen.get_width()
123 |             screen_height = self.screen.get_height()
124 | 
125 |             old_x = self.fig.x
126 |             old_y = self.fig.y
127 | 
128 |             if self.fig.x + dx > screen_width:
129 |                 self.fig.x = 0
130 |             elif self.fig.x + dx < 0:
131 |                 self.fig.x = screen_width
132 |             else:
133 |                 self.fig.x = self.fig.x + dx
134 | 
135 |             if self.fig.y + dy > screen_height:
136 |                 self.fig.y = 0
137 |             elif self.fig.y + dy < 0:
138 |                 self.fig.y = screen_height
139 |             else:
140 |                 self.fig.y = self.fig.y + dy
141 | 
142 |         if colliders is not None:
143 |             if self.collides(colliders):
144 |                 self.fig.x = old_x
145 |                 self.fig.y = old_y
146 | 
147 | 
148 |         # print(f"Position at {self.fig.x},{self.fig.y}")
149 | 
150 | 
151 |     def render(self,color = (180,20,150)):
152 |         pygame.draw.rect(self.screen,color,self.pos)
153 |         # pygame.draw.circle(self.screen,color,(self.fig.x,self.fig.y),10)
154 |         # pass
155 | 
156 | 
157 | 
158 | 
159 | class Obstacle:
160 |     def __init__(self,width,height,x,y,screen = None):
161 |         # Rect left top width height
162 | 
163 |         self.screen = screen
164 |         self.fig = pygame.rect.Rect((x,y,width,height))
165 |         # print(f"Initialized rect at {self.pos}")
166 |         self.agent_id = str(uuid.uuid1())
167 | 
168 | 
169 |     def render(self,color = (10,150,10)):
170 |         pygame.draw.rect(self.screen,color,self.pos)
171 | 
172 | 
173 |     @property
174 |     def pos(self):
175 |         return self.fig.x,self.fig.y,self.fig.width,self.fig.height
176 | 
177 | 
178 | 
179 | size = 10
180 | n_rects = 500
181 | 
182 | rects = []
183 | 
184 | for i in range(n_rects):
185 |     new_rect = RectangleAgent(
186 |         size,size,
187 |         np.random.uniform(0,screen.get_width()),
188 |         np.random.uniform(0,screen.get_height()),
189 |         screen,
190 |     )
191 | 
192 |     rects.append(new_rect)
193 | 
194 | 
195 |     
196 | 
197 | i = 0
198 | stop = 1000
199 | 
200 | obstacles = [
201 |     Obstacle(200,200,300,300,screen)
202 | ]
203 | 
204 | 
205 | while simulation_on:
206 |     screen.fill(background_colour)
207 | 
208 |     for rect in rects:
209 |         rect.wander(size)
210 |         rect.if_collides(rects + obstacles)
211 | 
212 |     for rect in rects + obstacles:
213 |         rect.render()
214 | 
215 |     for event in pygame.event.get():
216 |         if event.type == pygame.KEYDOWN:
217 |             simulation_on = False
218 | 
219 |         elif event.type == pygame.MOUSEBUTTONUP:
220 |             new_x,new_y = pygame.mouse.get_pos()
221 |             # new_rect = RectangleAgent(size,size,new_x,new_y,screen)
222 |             # rects.append(new_rect)
223 | 
224 |             new_obs = Obstacle(20,20,new_x,new_y,screen)
225 |             obstacles.append(new_obs)
226 | 
227 | 
228 | 
229 |     pygame.display.update()
230 |     # pygame.display.flip()
231 | 
232 |     time.sleep(0.05)
233 | 
234 | 
235 |     if i == stop:
236 |         simulation_on = False
237 |     else:
238 |         i+=1
239 | 
240 | 
241 | pygame.quit()


--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/test.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/7. Multi-Agents Simulations/test.gif


--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/test2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/7. Multi-Agents Simulations/test2.gif


--------------------------------------------------------------------------------
/8. Unity ML agents tests/README.md:
--------------------------------------------------------------------------------
 1 | # Unity ML Agents test
 2 | 
 3 | > I've been creating environments directly with Python for a few years now, yet I've been facing lots of limitation due to the nature of Python<br>
 4 | > In 2020, the best option is probably Unity ML Agents. 
 5 | > This repo will hold experiments on custom Unity environments/games and Reinforcement Learning to attempt solving them
 6 | 
 7 | ## Rolling a ball (January 2020)
 8 | ![](rolling_a_ball/rollingaball1.png)
 9 | > My first experiment is a simple game about rolling a ball affected by gravity trying to catch all 10 pickups randomly placed in the environment. Movement is directly affected by inertia. To create the same env, follow Unity official tutorial https://learn.unity.com/project/roll-a-ball-tutorial
10 | 
11 | 
12 | 
13 | ## References ✨
14 | ### To learn about Unity
15 | - Youtube holds greats resources such as Brackeys, Sebastian Lague or Jason Weimann channels. Huge thanks to those videos for teaching about Unity in such an entertaining way. 
16 | - Unity official tutorials are great as well. 
17 | 
18 | 
19 | ### To learn about Unity ML Agents
20 | - Documentation at https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Readme.md
21 | - Creating custom environments https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Create-New.md
22 | - Overview of how UML works https://github.com/Unity-Technologies/ml-agents/blob/master/docs/ML-Agents-Overview.md
23 | - [This great video](https://www.youtube.com/watch?v=x2RBxmooh8w)
24 | 
25 | 
26 | 
27 | ## Installing ML Agents
28 | Follow [tutorial at this link](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md)
29 | 
30 | - Install Python wrapper with pip
31 | ```
32 | pip install mlagents
33 | ```
34 | - Clone ML Agents repo
35 | ``` 
36 | git clone --branch latest_release https://github.com/Unity-Technologies/ml-agents.git
37 | ```
38 | - Install Barracuda
39 | - Copy ML-Agents folder from the cloned repo at ``UnitySDK/Assets`` in your Assets project folder


--------------------------------------------------------------------------------
/8. Unity ML agents tests/rolling_a_ball/20200202 - Rolling a Ball.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Rolling a Ball"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "![](rollingaball1.png)"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "# Interaction test"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "> This comes from the getting started tutorial applied to the 3D Ball Agent"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 1,
 34 |    "metadata": {},
 35 |    "outputs": [
 36 |     {
 37 |      "name": "stdout",
 38 |      "output_type": "stream",
 39 |      "text": [
 40 |       "Python version:\n",
 41 |       "3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 10:22:32) [MSC v.1900 64 bit (AMD64)]\n"
 42 |      ]
 43 |     }
 44 |    ],
 45 |    "source": [
 46 |     "import matplotlib.pyplot as plt\n",
 47 |     "import numpy as np\n",
 48 |     "import sys\n",
 49 |     "\n",
 50 |     "from mlagents_envs.environment import UnityEnvironment\n",
 51 |     "from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel\n",
 52 |     "\n",
 53 |     "%matplotlib inline\n",
 54 |     "\n",
 55 |     "print(\"Python version:\")\n",
 56 |     "print(sys.version)\n",
 57 |     "\n",
 58 |     "# check Python version\n",
 59 |     "if (sys.version_info[0] < 3):\n",
 60 |     "    raise Exception(\"ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3\")"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 3,
 66 |    "metadata": {},
 67 |    "outputs": [
 68 |     {
 69 |      "name": "stderr",
 70 |      "output_type": "stream",
 71 |      "text": [
 72 |       "INFO:mlagents_envs:Listening on port 5004. Start training by pressing the Play button in the Unity Editor.\n"
 73 |      ]
 74 |     }
 75 |    ],
 76 |    "source": [
 77 |     "engine_configuration_channel = EngineConfigurationChannel()\n",
 78 |     "env = UnityEnvironment(base_port = 5004)#, file_name=env_name, side_channels = [engine_configuration_channel])"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 4,
 84 |    "metadata": {},
 85 |    "outputs": [
 86 |     {
 87 |      "name": "stderr",
 88 |      "output_type": "stream",
 89 |      "text": [
 90 |       "INFO:mlagents_envs:Connected new brain:\n",
 91 |       "3DBall?team=0\n"
 92 |      ]
 93 |     }
 94 |    ],
 95 |    "source": [
 96 |     "#Reset the environment\n",
 97 |     "env.reset()\n",
 98 |     "\n",
 99 |     "# Set the default brain to work with\n",
100 |     "group_name = env.get_agent_groups()[0]\n",
101 |     "group_spec = env.get_agent_group_spec(group_name)"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": 7,
107 |    "metadata": {},
108 |    "outputs": [
109 |     {
110 |      "name": "stdout",
111 |      "output_type": "stream",
112 |      "text": [
113 |       "Number of observations :  1\n",
114 |       "Agent state looks like: \n",
115 |       "[[-1.4673042e-02 -1.4683060e-02 -5.2082062e-01  4.0000000e+00\n",
116 |       "  -7.9952097e-01  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
117 |       " [-2.6140258e-02  3.4010161e-02 -4.5768166e-01  4.0000000e+00\n",
118 |       "  -5.5027008e-03  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
119 |       " [ 6.3632242e-02  3.7996579e-02 -1.1360741e+00  4.0000000e+00\n",
120 |       "  -4.1505909e-01  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
121 |       " [-4.6871606e-02 -3.9161425e-02 -6.1104012e-01  4.0000000e+00\n",
122 |       "   5.6867313e-01  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
123 |       " [ 3.8746696e-02  7.7085062e-03  1.1423024e+00  4.0000000e+00\n",
124 |       "  -1.4589405e-01  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
125 |       " [ 4.8017994e-02 -7.4483551e-02 -5.7353783e-01  4.0000000e+00\n",
126 |       "  -3.8447380e-03  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
127 |       " [ 3.9585244e-02 -8.3357669e-02 -9.4123268e-01  4.0000000e+00\n",
128 |       "  -7.9583311e-01  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
129 |       " [ 8.0520153e-02 -2.9333552e-02  1.7612720e-01  4.0000000e+00\n",
130 |       "   5.6848335e-01  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
131 |       " [ 8.3218820e-02 -7.4690364e-02  1.4817381e+00  4.0000000e+00\n",
132 |       "   4.3329239e-01  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
133 |       " [ 5.2080988e-03  4.5170397e-03  1.4738545e+00  4.0000000e+00\n",
134 |       "   6.0955667e-01  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
135 |       " [-4.5549396e-02  1.7029690e-02 -1.4121037e+00  4.0000000e+00\n",
136 |       "  -1.0720904e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00]\n",
137 |       " [ 5.7741486e-02  8.4876612e-02  5.8971786e-01  4.0000000e+00\n",
138 |       "  -7.8450203e-02  0.0000000e+00  0.0000000e+00  0.0000000e+00]]\n",
139 |       "Agent state looks like: \n",
140 |       "[-0.01467304 -0.01468306 -0.5208206   4.         -0.79952097  0.\n",
141 |       "  0.          0.        ]\n",
142 |       "Is there a visual observation ? False\n"
143 |      ]
144 |     }
145 |    ],
146 |    "source": [
147 |     "# Get the state of the agents\n",
148 |     "step_result = env.get_step_result(group_name)\n",
149 |     "\n",
150 |     "# Examine the number of observations per Agent\n",
151 |     "print(\"Number of observations : \", len(group_spec.observation_shapes))\n",
152 |     "\n",
153 |     "# Examine the state space for the first observation for all agents\n",
154 |     "print(\"Agent state looks like: \\n{}\".format(step_result.obs[0]))\n",
155 |     "\n",
156 |     "# Examine the state space for the first observation for the first agent\n",
157 |     "print(\"Agent state looks like: \\n{}\".format(step_result.obs[0][0]))\n",
158 |     "\n",
159 |     "# Is there a visual observation ?\n",
160 |     "vis_obs = any([len(shape) == 3 for shape in group_spec.observation_shapes])\n",
161 |     "print(\"Is there a visual observation ?\", vis_obs)\n",
162 |     "\n",
163 |     "# Examine the visual observations\n",
164 |     "if vis_obs:\n",
165 |     "    vis_obs_index = next(i for i,v in enumerate(group_spec.observation_shapes) if len(v) == 3)\n",
166 |     "    print(\"Agent visual observation look like:\")\n",
167 |     "    obs = step_result.obs[vis_obs_index]\n",
168 |     "    plt.imshow(obs[0,:,:,:])"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 9,
174 |    "metadata": {},
175 |    "outputs": [
176 |     {
177 |      "name": "stdout",
178 |      "output_type": "stream",
179 |      "text": [
180 |       "Total reward this episode: 1.1000000312924385\n",
181 |       "Total reward this episode: 0.6000000238418579\n",
182 |       "Total reward this episode: 0.6000000238418579\n",
183 |       "Total reward this episode: 2.300000049173832\n",
184 |       "Total reward this episode: 1.1000000312924385\n",
185 |       "Total reward this episode: 2.0000000447034836\n",
186 |       "Total reward this episode: 1.1000000312924385\n",
187 |       "Total reward this episode: 0.6000000238418579\n",
188 |       "Total reward this episode: 1.4901161193847656e-08\n",
189 |       "Total reward this episode: 1.2000000327825546\n"
190 |      ]
191 |     }
192 |    ],
193 |    "source": [
194 |     "for episode in range(10):\n",
195 |     "    env.reset()\n",
196 |     "    step_result = env.get_step_result(group_name)\n",
197 |     "    done = False\n",
198 |     "    episode_rewards = 0\n",
199 |     "    while not done:\n",
200 |     "        action_size = group_spec.action_size\n",
201 |     "        if group_spec.is_action_continuous():\n",
202 |     "            action = np.random.randn(step_result.n_agents(), group_spec.action_size)\n",
203 |     "            \n",
204 |     "        if group_spec.is_action_discrete():\n",
205 |     "            branch_size = group_spec.discrete_action_branches\n",
206 |     "            action = np.column_stack([np.random.randint(0, branch_size[i], size=(step_result.n_agents())) for i in range(len(branch_size))])\n",
207 |     "        env.set_actions(group_name, action)\n",
208 |     "        env.step()\n",
209 |     "        step_result = env.get_step_result(group_name)\n",
210 |     "        episode_rewards += step_result.reward[0]\n",
211 |     "        done = step_result.done[0]\n",
212 |     "    print(\"Total reward this episode: {}\".format(episode_rewards))"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "metadata": {},
218 |    "source": [
219 |     "# Interaction test with custom environment"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {},
226 |    "outputs": [
227 |     {
228 |      "name": "stderr",
229 |      "output_type": "stream",
230 |      "text": [
231 |       "INFO:mlagents_envs:Listening on port 5004. Start training by pressing the Play button in the Unity Editor.\n"
232 |      ]
233 |     }
234 |    ],
235 |    "source": [
236 |     "import matplotlib.pyplot as plt\n",
237 |     "import numpy as np\n",
238 |     "import sys\n",
239 |     "\n",
240 |     "from mlagents_envs.environment import UnityEnvironment\n",
241 |     "from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel\n",
242 |     "\n",
243 |     "engine_configuration_channel = EngineConfigurationChannel()\n",
244 |     "env = UnityEnvironment(base_port = 5004, side_channels = [engine_configuration_channel])"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": null,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": []
253 |   }
254 |  ],
255 |  "metadata": {
256 |   "kernelspec": {
257 |    "display_name": "Python 3",
258 |    "language": "python",
259 |    "name": "python3"
260 |   },
261 |   "language_info": {
262 |    "codemirror_mode": {
263 |     "name": "ipython",
264 |     "version": 3
265 |    },
266 |    "file_extension": ".py",
267 |    "mimetype": "text/x-python",
268 |    "name": "python",
269 |    "nbconvert_exporter": "python",
270 |    "pygments_lexer": "ipython3",
271 |    "version": "3.6.4"
272 |   }
273 |  },
274 |  "nbformat": 4,
275 |  "nbformat_minor": 2
276 | }
277 | 


--------------------------------------------------------------------------------
/8. Unity ML agents tests/rolling_a_ball/rollingaball1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/8. Unity ML agents tests/rolling_a_ball/rollingaball1.png


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/README.md:
--------------------------------------------------------------------------------
 1 | # Discrete Optimization with RL
 2 | 
 3 | > Comparison between classical techniques and RL in discrete optimization. <br>
 4 | > These experiments are run alongside the MOOC on Discrete Optimization by the University of Melbourne
 5 | 
 6 | 
 7 | ## Folder structure
 8 | ```
 9 | - lessons - personal notes on discrete Optimization, mostly from Coursera MOOC
10 | - knapsack_problem - experiments on the knapsack problem, from classical optimization to RL
11 | ```
12 | 
13 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/Solver.java:
--------------------------------------------------------------------------------
 1 | import java.io.*;
 2 | import java.util.List;
 3 | import java.util.ArrayList;
 4 | 
 5 | /**
 6 |  * The class <code>Solver</code> is an implementation of a greedy algorithm to solve the knapsack problem.
 7 |  *
 8 |  */
 9 | public class Solver {
10 |     
11 |     /**
12 |      * The main class
13 |      */
14 |     public static void main(String[] args) {
15 |         try {
16 |             solve(args);
17 |         } catch (IOException e) {
18 |             e.printStackTrace();
19 |         }
20 |     }
21 |     
22 |     /**
23 |      * Read the instance, solve it, and print the solution in the standard output
24 |      */
25 |     public static void solve(String[] args) throws IOException {
26 |         String fileName = null;
27 |         
28 |         // get the temp file name
29 |         for(String arg : args){
30 |             if(arg.startsWith("-file=")){
31 |                 fileName = arg.substring(6);
32 |             } 
33 |         }
34 |         if(fileName == null)
35 |             return;
36 |         
37 |         // read the lines out of the file
38 |         List<String> lines = new ArrayList<String>();
39 | 
40 |         BufferedReader input =  new BufferedReader(new FileReader(fileName));
41 |         try {
42 |             String line = null;
43 |             while (( line = input.readLine()) != null){
44 |                 lines.add(line);
45 |             }
46 |         }
47 |         finally {
48 |             input.close();
49 |         }
50 |         
51 |         
52 |         // parse the data in the file
53 |         String[] firstLine = lines.get(0).split("\\s+");
54 |         int items = Integer.parseInt(firstLine[0]);
55 |         int capacity = Integer.parseInt(firstLine[1]);
56 | 
57 |         int[] values = new int[items];
58 |         int[] weights = new int[items];
59 | 
60 |         for(int i=1; i < items+1; i++){
61 |           String line = lines.get(i);
62 |           String[] parts = line.split("\\s+");
63 | 
64 |           values[i-1] = Integer.parseInt(parts[0]);
65 |           weights[i-1] = Integer.parseInt(parts[1]);
66 |         }
67 | 
68 |         // a trivial greedy algorithm for filling the knapsack
69 |         // it takes items in-order until the knapsack is full
70 |         int value = 0;
71 |         int weight = 0;
72 |         int[] taken = new int[items];
73 | 
74 |         for(int i=0; i < items; i++){
75 |             if(weight + weights[i] <= capacity){
76 |                 taken[i] = 1;
77 |                 value += values[i];
78 |                 weight += weights[i];
79 |             } else {
80 |                 taken[i] = 0;
81 |             }
82 |         }
83 |         
84 |         // prepare the solution in the specified output format
85 |         System.out.println(value+" 0");
86 |         for(int i=0; i < items; i++){
87 |             System.out.print(taken[i]+" ");
88 |         }
89 |         System.out.println("");        
90 |     }
91 | }


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/_coursera:
--------------------------------------------------------------------------------
1 | _le-pVv_EeasJA5dVmWj2w
2 | Knapsack
3 | awPVV, ./data/ks_30_0, solver.py, Knapsack Problem 1
4 | hHYWS, ./data/ks_50_0, solver.py, Knapsack Problem 2
5 | JwWnx, ./data/ks_200_0, solver.py, Knapsack Problem 3
6 | Z2tMt, ./data/ks_400_0, solver.py, Knapsack Problem 4
7 | PUIxa, ./data/ks_1000_0, solver.py, Knapsack Problem 5
8 | AKXWc, ./data/ks_10000_0, solver.py, Knapsack Problem 6
9 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_100_0:
--------------------------------------------------------------------------------
  1 | 100 100000
  2 | 90000 90001
  3 | 89750 89751
  4 | 10001 10002
  5 | 89500 89501
  6 | 10252 10254
  7 | 89250 89251
  8 | 10503 10506
  9 | 89000 89001
 10 | 10754 10758
 11 | 88750 88751
 12 | 11005 11010
 13 | 88500 88501
 14 | 11256 11262
 15 | 88250 88251
 16 | 11507 11514
 17 | 88000 88001
 18 | 11758 11766
 19 | 87750 87751
 20 | 12009 12018
 21 | 87500 87501
 22 | 12260 12270
 23 | 87250 87251
 24 | 12511 12522
 25 | 87000 87001
 26 | 12762 12774
 27 | 86750 86751
 28 | 13013 13026
 29 | 86500 86501
 30 | 13264 13278
 31 | 86250 86251
 32 | 13515 13530
 33 | 86000 86001
 34 | 13766 13782
 35 | 85750 85751
 36 | 14017 14034
 37 | 85500 85501
 38 | 14268 14286
 39 | 85250 85251
 40 | 14519 14538
 41 | 85000 85001
 42 | 14770 14790
 43 | 84750 84751
 44 | 15021 15042
 45 | 84500 84501
 46 | 15272 15294
 47 | 84250 84251
 48 | 15523 15546
 49 | 84000 84001
 50 | 15774 15798
 51 | 83750 83751
 52 | 16025 16050
 53 | 83500 83501
 54 | 16276 16302
 55 | 83250 83251
 56 | 16527 16554
 57 | 83000 83001
 58 | 16778 16806
 59 | 82750 82751
 60 | 17029 17058
 61 | 82500 82501
 62 | 17280 17310
 63 | 82250 82251
 64 | 17531 17562
 65 | 82000 82001
 66 | 17782 17814
 67 | 81750 81751
 68 | 18033 18066
 69 | 81500 81501
 70 | 18284 18318
 71 | 81250 81251
 72 | 18535 18570
 73 | 81000 81001
 74 | 18786 18822
 75 | 80750 80751
 76 | 19037 19074
 77 | 80500 80501
 78 | 19288 19326
 79 | 80250 80251
 80 | 19539 19578
 81 | 80000 80001
 82 | 19790 19830
 83 | 79750 79751
 84 | 20041 20082
 85 | 79500 79501
 86 | 20292 20334
 87 | 79250 79251
 88 | 20543 20586
 89 | 79000 79001
 90 | 20794 20838
 91 | 78750 78751
 92 | 21045 21090
 93 | 78500 78501
 94 | 21296 21342
 95 | 78250 78251
 96 | 21547 21594
 97 | 78000 78001
 98 | 21798 21846
 99 | 77750 77751
100 | 22049 22098
101 | 77500 77501
102 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_100_1:
--------------------------------------------------------------------------------
  1 | 100 3190802
  2 | 1491 3882
  3 | 399 1298
  4 | 77 654
  5 | 969 2638
  6 | 8485 20670
  7 | 55 610
  8 | 1904 4908
  9 | 703 2106
 10 | 657 2014
 11 | 932 2564
 12 | 1201 3302
 13 | 1697 4494
 14 | 462 1424
 15 | 1201 3302
 16 | 111632 267364
 17 | 9044 21988
 18 | 147380 352660
 19 | 31852 76604
 20 | 9044 21988
 21 | 9300 22700
 22 | 8660 21020
 23 | 174684 418068
 24 | 19844 47788
 25 | 9044 21988
 26 | 1635 4370
 27 | 62788 150476
 28 | 6932 16964
 29 | 6308 15516
 30 | 50 600
 31 | 4600 11300
 32 | 565204 1351508
 33 | 7463 18226
 34 | 2988 7476
 35 | 9044 21988
 36 | 9044 21988
 37 | 4040 9980
 38 | 137732 329764
 39 | 7150 17400
 40 | 9300 22700
 41 | 177 854
 42 | 372 1244
 43 | 499 1498
 44 | 15108 36516
 45 | 11108 26916
 46 | 2468 6236
 47 | 1133 3166
 48 | 1490 3880
 49 | 865 2430
 50 | 2468 6236
 51 | 2468 6236
 52 | 5974 14648
 53 | 5972 14644
 54 | 9532 23164
 55 | 1872 4844
 56 | 3964 9828
 57 | 2799 7098
 58 | 527708 1261916
 59 | 7212 17724
 60 | 3002 7504
 61 | 21004 50708
 62 | 47728 114556
 63 | 565204 1351508
 64 | 100600 240900
 65 | 118920 284740
 66 | 2822 7144
 67 | 612 1924
 68 | 6324 15548
 69 | 9508 23116
 70 | 9268 22636
 71 | 11636 28172
 72 | 210708 504116
 73 | 2176944 5204588
 74 | 930 2560
 75 | 4481 11062
 76 | 50 600
 77 | 112 724
 78 | 14434 34968
 79 | 0 500
 80 | 248 996
 81 | 48 596
 82 | 820 2340
 83 | 278 1056
 84 | 643 1986
 85 | 1413 3726
 86 | 1408 3716
 87 | 0 500
 88 | 2581 6662
 89 | 287 1074
 90 | 2040 5180
 91 | 289 1078
 92 | 1380 3660
 93 | 372 1244
 94 | 0 500
 95 | 472 1444
 96 | 360 1220
 97 | 0 500
 98 | 622 1944
 99 | 3504 8708
100 | 5924 14548
101 | 2784 7068
102 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_100_2:
--------------------------------------------------------------------------------
  1 | 100 10000
  2 | 339 342
  3 | 1629 1514
  4 | 697 696
  5 | 1299 1433
  6 | 1613 1762
  7 | 36 40
  8 | 1737 1635
  9 | 473 442
 10 | 1859 1899
 11 | 2055 1960
 12 | 362 378
 13 | 1104 1177
 14 | 1880 1970
 15 | 1349 1434
 16 | 1545 1691
 17 | 132 139
 18 | 341 371
 19 | 1430 1350
 20 | 1878 1775
 21 | 1870 1980
 22 | 1536 1651
 23 | 818 814
 24 | 289 282
 25 | 1690 1573
 26 | 1437 1587
 27 | 310 302
 28 | 53 56
 29 | 720 726
 30 | 1707 1820
 31 | 258 269
 32 | 1842 1680
 33 | 757 842
 34 | 1642 1730
 35 | 1149 1243
 36 | 1970 1794
 37 | 749 775
 38 | 1904 1810
 39 | 2 3
 40 | 967 970
 41 | 1310 1261
 42 | 1004 997
 43 | 1295 1192
 44 | 1056 1036
 45 | 51 52
 46 | 1320 1453
 47 | 1580 1673
 48 | 480 440
 49 | 604 624
 50 | 1766 1813
 51 | 1198 1326
 52 | 1762 1637
 53 | 2046 1902
 54 | 315 323
 55 | 714 746
 56 | 434 471
 57 | 1461 1366
 58 | 1652 1511
 59 | 1876 1785
 60 | 906 1002
 61 | 1483 1560
 62 | 1355 1403
 63 | 510 513
 64 | 2114 1958
 65 | 1479 1505
 66 | 1618 1538
 67 | 1472 1378
 68 | 310 315
 69 | 1478 1493
 70 | 970 1066
 71 | 43 40
 72 | 1231 1172
 73 | 1792 1972
 74 | 870 956
 75 | 1484 1541
 76 | 1049 1014
 77 | 56 55
 78 | 814 793
 79 | 978 985
 80 | 1215 1311
 81 | 720 737
 82 | 210 204
 83 | 460 492
 84 | 1798 1961
 85 | 1944 1952
 86 | 208 204
 87 | 1836 1872
 88 | 882 806
 89 | 239 234
 90 | 141 136
 91 | 49 49
 92 | 1352 1363
 93 | 915 883
 94 | 1318 1259
 95 | 72 70
 96 | 937 886
 97 | 1783 1843
 98 | 1253 1319
 99 | 1268 1375
100 | 1144 1234
101 | 878 818
102 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_106_0:
--------------------------------------------------------------------------------
  1 | 106 106925262
  2 | 45276 45276   
  3 | 90552 90552   
  4 | 181104 181104   
  5 | 362208 362208   
  6 | 724416 724416   
  7 | 1448832 1448832   
  8 | 2897664 2897664   
  9 | 5795328 5795328   
 10 | 11590656 11590656   
 11 | 23181312 23181312   
 12 | 46362624 46362624   
 13 | 92725248 92725248   
 14 | 70778 70778   
 15 | 141556 141556   
 16 | 283112 283112   
 17 | 566224 566224   
 18 | 1132448 1132448   
 19 | 2264896 2264896   
 20 | 4529792 4529792   
 21 | 9059584 9059584   
 22 | 18119168 18119168   
 23 | 36238336 36238336   
 24 | 72476672 72476672   
 25 | 86911 86911   
 26 | 173822 173822   
 27 | 347644 347644   
 28 | 695288 695288   
 29 | 1390576 1390576   
 30 | 2781152 2781152   
 31 | 5562304 5562304   
 32 | 11124608 11124608   
 33 | 22249216 22249216   
 34 | 44498432 44498432   
 35 | 88996864 88996864   
 36 | 92634 92634   
 37 | 185268 185268   
 38 | 370536 370536   
 39 | 741072 741072   
 40 | 1482144 1482144   
 41 | 2964288 2964288   
 42 | 5928576 5928576   
 43 | 11857152 11857152   
 44 | 23714304 23714304   
 45 | 47428608 47428608   
 46 | 94857216 94857216   
 47 | 97839 97839   
 48 | 195678 195678   
 49 | 391356 391356   
 50 | 782712 782712   
 51 | 1565424 1565424   
 52 | 3130848 3130848   
 53 | 6261696 6261696   
 54 | 12523392 12523392   
 55 | 25046784 25046784   
 56 | 50093568 50093568   
 57 | 100187136 100187136   
 58 | 125941 125941   
 59 | 251882 251882   
 60 | 503764 503764   
 61 | 1007528 1007528   
 62 | 2015056 2015056   
 63 | 4030112 4030112   
 64 | 8060224 8060224   
 65 | 16120448 16120448   
 66 | 32240896 32240896   
 67 | 64481792 64481792   
 68 | 134269 134269   
 69 | 268538 268538   
 70 | 537076 537076   
 71 | 1074152 1074152   
 72 | 2148304 2148304   
 73 | 4296608 4296608   
 74 | 8593216 8593216   
 75 | 17186432 17186432   
 76 | 34372864 34372864   
 77 | 68745728 68745728   
 78 | 141033 141033   
 79 | 282066 282066   
 80 | 564132 564132   
 81 | 1128264 1128264   
 82 | 2256528 2256528   
 83 | 4513056 4513056   
 84 | 9026112 9026112   
 85 | 18052224 18052224   
 86 | 36104448 36104448   
 87 | 72208896 72208896   
 88 | 147279 147279   
 89 | 294558 294558   
 90 | 589116 589116   
 91 | 1178232 1178232   
 92 | 2356464 2356464   
 93 | 4712928 4712928   
 94 | 9425856 9425856   
 95 | 18851712 18851712   
 96 | 37703424 37703424   
 97 | 75406848 75406848   
 98 | 153525 153525   
 99 | 307050 307050   
100 | 614100 614100   
101 | 1228200 1228200   
102 | 2456400 2456400   
103 | 4912800 4912800   
104 | 9825600 9825600   
105 | 19651200 19651200   
106 | 39302400 39302400   
107 | 78604800 78604800 
108 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_19_0:
--------------------------------------------------------------------------------
 1 | 19 31181
 2 | 1945 4990
 3 | 321 1142
 4 | 2945 7390
 5 | 4136 10372
 6 | 1107 3114
 7 | 1022 2744
 8 | 1101 3102
 9 | 2890 7280
10 | 962 2624
11 | 1060 3020
12 | 805 2310
13 | 689 2078
14 | 1513 3926
15 | 3878 9656
16 | 13504 32708
17 | 1865 4830
18 | 667 2034
19 | 1833 4766
20 | 16553 40006
21 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_200_0:
--------------------------------------------------------------------------------
  1 | 200 100000
  2 | 90001 90000
  3 | 89751 89750
  4 | 10002 10001
  5 | 89501 89500
  6 | 10254 10252
  7 | 89251 89250
  8 | 10506 10503
  9 | 89001 89000
 10 | 10758 10754
 11 | 88751 88750
 12 | 11010 11005
 13 | 88501 88500
 14 | 11262 11256
 15 | 88251 88250
 16 | 11514 11507
 17 | 88001 88000
 18 | 11766 11758
 19 | 87751 87750
 20 | 12018 12009
 21 | 87501 87500
 22 | 12270 12260
 23 | 87251 87250
 24 | 12522 12511
 25 | 87001 87000
 26 | 12774 12762
 27 | 86751 86750
 28 | 13026 13013
 29 | 86501 86500
 30 | 13278 13264
 31 | 86251 86250
 32 | 13530 13515
 33 | 86001 86000
 34 | 13782 13766
 35 | 85751 85750
 36 | 14034 14017
 37 | 85501 85500
 38 | 14286 14268
 39 | 85251 85250
 40 | 14538 14519
 41 | 85001 85000
 42 | 14790 14770
 43 | 84751 84750
 44 | 15042 15021
 45 | 84501 84500
 46 | 15294 15272
 47 | 84251 84250
 48 | 15546 15523
 49 | 84001 84000
 50 | 15798 15774
 51 | 83751 83750
 52 | 16050 16025
 53 | 83501 83500
 54 | 16302 16276
 55 | 83251 83250
 56 | 16554 16527
 57 | 83001 83000
 58 | 16806 16778
 59 | 82751 82750
 60 | 17058 17029
 61 | 82501 82500
 62 | 17310 17280
 63 | 82251 82250
 64 | 17562 17531
 65 | 82001 82000
 66 | 17814 17782
 67 | 81751 81750
 68 | 18066 18033
 69 | 81501 81500
 70 | 18318 18284
 71 | 81251 81250
 72 | 18570 18535
 73 | 81001 81000
 74 | 18822 18786
 75 | 80751 80750
 76 | 19074 19037
 77 | 80501 80500
 78 | 19326 19288
 79 | 80251 80250
 80 | 19578 19539
 81 | 80001 80000
 82 | 19830 19790
 83 | 79751 79750
 84 | 20082 20041
 85 | 79501 79500
 86 | 20334 20292
 87 | 79251 79250
 88 | 20586 20543
 89 | 79001 79000
 90 | 20838 20794
 91 | 78751 78750
 92 | 21090 21045
 93 | 78501 78500
 94 | 21342 21296
 95 | 78251 78250
 96 | 21594 21547
 97 | 78001 78000
 98 | 21846 21798
 99 | 77751 77750
100 | 22098 22049
101 | 77501 77500
102 | 22350 22300
103 | 77251 77250
104 | 22602 22551
105 | 77001 77000
106 | 22854 22802
107 | 76751 76750
108 | 23106 23053
109 | 76501 76500
110 | 23358 23304
111 | 76251 76250
112 | 23610 23555
113 | 76001 76000
114 | 23862 23806
115 | 75751 75750
116 | 24114 24057
117 | 75501 75500
118 | 24366 24308
119 | 75251 75250
120 | 24618 24559
121 | 75001 75000
122 | 24870 24810
123 | 74751 74750
124 | 25122 25061
125 | 74501 74500
126 | 25374 25312
127 | 74251 74250
128 | 25626 25563
129 | 74001 74000
130 | 25878 25814
131 | 73751 73750
132 | 26130 26065
133 | 73501 73500
134 | 26382 26316
135 | 73251 73250
136 | 26634 26567
137 | 73001 73000
138 | 26886 26818
139 | 72751 72750
140 | 27138 27069
141 | 72501 72500
142 | 27390 27320
143 | 72251 72250
144 | 27642 27571
145 | 72001 72000
146 | 27894 27822
147 | 71751 71750
148 | 28146 28073
149 | 71501 71500
150 | 28398 28324
151 | 71251 71250
152 | 28650 28575
153 | 71001 71000
154 | 28902 28826
155 | 70751 70750
156 | 29154 29077
157 | 70501 70500
158 | 29406 29328
159 | 70251 70250
160 | 29658 29579
161 | 70001 70000
162 | 29910 29830
163 | 69751 69750
164 | 30162 30081
165 | 69501 69500
166 | 30414 30332
167 | 69251 69250
168 | 30666 30583
169 | 69001 69000
170 | 30918 30834
171 | 68751 68750
172 | 31170 31085
173 | 68501 68500
174 | 31422 31336
175 | 68251 68250
176 | 31674 31587
177 | 68001 68000
178 | 31926 31838
179 | 67751 67750
180 | 32178 32089
181 | 67501 67500
182 | 32430 32340
183 | 67251 67250
184 | 32682 32591
185 | 67001 67000
186 | 32934 32842
187 | 66751 66750
188 | 33186 33093
189 | 66501 66500
190 | 33438 33344
191 | 66251 66250
192 | 33690 33595
193 | 66001 66000
194 | 33942 33846
195 | 65751 65750
196 | 34194 34097
197 | 65501 65500
198 | 34446 34348
199 | 65251 65250
200 | 34698 34599
201 | 68451 68450
202 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_200_1:
--------------------------------------------------------------------------------
  1 | 200 2640230
  2 | 31860 76620
  3 | 11884 28868
  4 | 10492 25484
  5 | 901 2502
  6 | 43580 104660
  7 | 9004 21908
  8 | 6700 16500
  9 | 29940 71980
 10 | 7484 18268
 11 | 5932 14564
 12 | 7900 19300
 13 | 6564 16028
 14 | 6596 16092
 15 | 8172 19844
 16 | 5324 13148
 17 | 8436 20572
 18 | 7332 17964
 19 | 6972 17044
 20 | 7668 18636
 21 | 6524 15948
 22 | 6244 15388
 23 | 635 1970
 24 | 5396 13292
 25 | 13596 32892
 26 | 51188 122676
 27 | 13684 33068
 28 | 8596 20892
 29 | 156840 375380
 30 | 7900 19300
 31 | 6460 15820
 32 | 14132 34164
 33 | 4980 12260
 34 | 5216 12932
 35 | 6276 15452
 36 | 701 2102
 37 | 3084 7868
 38 | 6924 16948
 39 | 5500 13500
 40 | 3148 7996
 41 | 47844 114788
 42 | 226844 542788
 43 | 25748 61996
 44 | 7012 17124
 45 | 3440 8580
 46 | 15580 37660
 47 | 314 1128
 48 | 2852 7204
 49 | 15500 37500
 50 | 9348 22796
 51 | 17768 42836
 52 | 16396 39692
 53 | 16540 39980
 54 | 395124 944948
 55 | 10196 24692
 56 | 6652 16204
 57 | 4848 11996
 58 | 74372 178244
 59 | 4556 11212
 60 | 4900 12100
 61 | 3508 8716
 62 | 3820 9540
 63 | 5460 13420
 64 | 16564 40028
 65 | 3896 9692
 66 | 3832 9564
 67 | 9012 21924
 68 | 4428 10956
 69 | 57796 138492
 70 | 12052 29204
 71 | 7052 17204
 72 | 85864 205628
 73 | 5068 12436
 74 | 10484 25468
 75 | 4516 11132
 76 | 3620 9140
 77 | 18052 43604
 78 | 21 542
 79 | 15804 38108
 80 | 19020 45940
 81 | 170844 408788
 82 | 3732 9364
 83 | 2920 7340
 84 | 4120 10340
 85 | 6828 16756
 86 | 26252 63204
 87 | 11676 28252
 88 | 19916 47932
 89 | 65488 156876
 90 | 7172 17644
 91 | 3772 9444
 92 | 132868 318036
 93 | 8332 20364
 94 | 5308 13116
 95 | 3780 9460
 96 | 5208 12916
 97 | 56788 136076
 98 | 7172 17644
 99 | 7868 19236
100 | 31412 75524
101 | 9252 22604
102 | 12276 29652
103 | 3712 9324
104 | 4516 11132
105 | 105876 253452
106 | 20084 48468
107 | 11492 27884
108 | 49092 117684
109 | 83452 199804
110 | 71372 171044
111 | 66572 159644
112 | 25268 60836
113 | 64292 154084
114 | 21228 51156
115 | 16812 40524
116 | 19260 46420
117 | 7740 18980
118 | 5632 13964
119 | 3256 8212
120 | 15580 37660
121 | 4824 11948
122 | 59700 143100
123 | 14500 35100
124 | 7208 17716
125 | 6028 14756
126 | 75716 181332
127 | 22364 53828
128 | 7636 18572
129 | 6444 15788
130 | 5192 12884
131 | 7388 18076
132 | 33156 79612
133 | 3032 7564
134 | 6628 16156
135 | 7036 17172
136 | 3200 8100
137 | 7300 17900
138 | 4452 11004
139 | 26364 63428
140 | 14036 33972
141 | 16932 40964
142 | 5788 14276
143 | 70476 168852
144 | 4552 11204
145 | 33980 81660
146 | 19300 46500
147 | 39628 95156
148 | 4484 11068
149 | 55044 131988
150 | 574 1848
151 | 29644 71188
152 | 9460 23020
153 | 106284 254468
154 | 304 1108
155 | 3580 8860
156 | 6308 15516
157 | 10492 25484
158 | 12820 31140
159 | 14436 34972
160 | 5044 12388
161 | 1155 3210
162 | 12468 30236
163 | 4380 10860
164 | 9876 24052
165 | 8752 21404
166 | 8676 21052
167 | 42848 102796
168 | 22844 54988
169 | 6244 15388
170 | 314 1128
171 | 314 1128
172 | 314 1128
173 | 314 1128
174 | 314 1128
175 | 314 1128
176 | 387480 926660
177 | 314 1128
178 | 314 1128
179 | 314 1128
180 | 314 1128
181 | 314 1128
182 | 15996 38692
183 | 8372 20444
184 | 65488 156876
185 | 304 1108
186 | 4756 11812
187 | 5012 12324
188 | 304 1108
189 | 314 1128
190 | 314 1128
191 | 314 1128
192 | 314 1128
193 | 314 1128
194 | 314 1128
195 | 314 1128
196 | 304 1108
197 | 1208 3316
198 | 47728 114556
199 | 314 1128
200 | 314 1128
201 | 314 1128
202 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_300_0:
--------------------------------------------------------------------------------
  1 | 300 4040184
  2 | 31860 76620
  3 | 11884 28868
  4 | 10492 25484
  5 | 901 2502
  6 | 43580 104660
  7 | 9004 21908
  8 | 6700 16500
  9 | 29940 71980
 10 | 7484 18268
 11 | 5932 14564
 12 | 7900 19300
 13 | 6564 16028
 14 | 6596 16092
 15 | 8172 19844
 16 | 5324 13148
 17 | 8436 20572
 18 | 7332 17964
 19 | 6972 17044
 20 | 7668 18636
 21 | 6524 15948
 22 | 6244 15388
 23 | 635 1970
 24 | 5396 13292
 25 | 13596 32892
 26 | 51188 122676
 27 | 13684 33068
 28 | 8596 20892
 29 | 156840 375380
 30 | 7900 19300
 31 | 6460 15820
 32 | 14132 34164
 33 | 4980 12260
 34 | 5216 12932
 35 | 6276 15452
 36 | 701 2102
 37 | 3084 7868
 38 | 6924 16948
 39 | 5500 13500
 40 | 3148 7996
 41 | 47844 114788
 42 | 226844 542788
 43 | 25748 61996
 44 | 7012 17124
 45 | 3440 8580
 46 | 15580 37660
 47 | 314 1128
 48 | 2852 7204
 49 | 15500 37500
 50 | 9348 22796
 51 | 17768 42836
 52 | 16396 39692
 53 | 16540 39980
 54 | 395124 944948
 55 | 10196 24692
 56 | 6652 16204
 57 | 4848 11996
 58 | 74372 178244
 59 | 4556 11212
 60 | 4900 12100
 61 | 3508 8716
 62 | 3820 9540
 63 | 5460 13420
 64 | 16564 40028
 65 | 3896 9692
 66 | 3832 9564
 67 | 9012 21924
 68 | 4428 10956
 69 | 57796 138492
 70 | 12052 29204
 71 | 7052 17204
 72 | 85864 205628
 73 | 5068 12436
 74 | 10484 25468
 75 | 4516 11132
 76 | 3620 9140
 77 | 18052 43604
 78 | 21 542
 79 | 15804 38108
 80 | 19020 45940
 81 | 170844 408788
 82 | 3732 9364
 83 | 2920 7340
 84 | 4120 10340
 85 | 6828 16756
 86 | 26252 63204
 87 | 11676 28252
 88 | 19916 47932
 89 | 65488 156876
 90 | 7172 17644
 91 | 3772 9444
 92 | 132868 318036
 93 | 8332 20364
 94 | 5308 13116
 95 | 3780 9460
 96 | 5208 12916
 97 | 56788 136076
 98 | 7172 17644
 99 | 7868 19236
100 | 31412 75524
101 | 9252 22604
102 | 12276 29652
103 | 3712 9324
104 | 4516 11132
105 | 105876 253452
106 | 20084 48468
107 | 11492 27884
108 | 49092 117684
109 | 83452 199804
110 | 71372 171044
111 | 66572 159644
112 | 25268 60836
113 | 64292 154084
114 | 21228 51156
115 | 16812 40524
116 | 19260 46420
117 | 7740 18980
118 | 5632 13964
119 | 3256 8212
120 | 15580 37660
121 | 4824 11948
122 | 59700 143100
123 | 14500 35100
124 | 7208 17716
125 | 6028 14756
126 | 75716 181332
127 | 22364 53828
128 | 7636 18572
129 | 6444 15788
130 | 5192 12884
131 | 7388 18076
132 | 33156 79612
133 | 3032 7564
134 | 6628 16156
135 | 7036 17172
136 | 3200 8100
137 | 7300 17900
138 | 4452 11004
139 | 26364 63428
140 | 14036 33972
141 | 16932 40964
142 | 5788 14276
143 | 70476 168852
144 | 4552 11204
145 | 33980 81660
146 | 19300 46500
147 | 39628 95156
148 | 4484 11068
149 | 55044 131988
150 | 574 1848
151 | 29644 71188
152 | 9460 23020
153 | 106284 254468
154 | 304 1108
155 | 3580 8860
156 | 6308 15516
157 | 10492 25484
158 | 12820 31140
159 | 14436 34972
160 | 5044 12388
161 | 1155 3210
162 | 12468 30236
163 | 4380 10860
164 | 9876 24052
165 | 8752 21404
166 | 8676 21052
167 | 42848 102796
168 | 22844 54988
169 | 6244 15388
170 | 314 1128
171 | 314 1128
172 | 314 1128
173 | 314 1128
174 | 314 1128
175 | 314 1128
176 | 387480 926660
177 | 314 1128
178 | 314 1128
179 | 314 1128
180 | 314 1128
181 | 314 1128
182 | 15996 38692
183 | 8372 20444
184 | 65488 156876
185 | 304 1108
186 | 4756 11812
187 | 5012 12324
188 | 304 1108
189 | 314 1128
190 | 314 1128
191 | 314 1128
192 | 314 1128
193 | 314 1128
194 | 314 1128
195 | 314 1128
196 | 304 1108
197 | 1208 3316
198 | 47728 114556
199 | 314 1128
200 | 314 1128
201 | 314 1128
202 | 314 1128
203 | 314 1128
204 | 314 1128
205 | 104036 249172
206 | 5248 12996
207 | 312 1124
208 | 24468 58836
209 | 7716 18932
210 | 30180 72460
211 | 4824 11948
212 | 1120 3140
213 | 11496 27892
214 | 4916 12132
215 | 14428 34956
216 | 24948 59996
217 | 41100 98700
218 | 28692 69084
219 | 826 2352
220 | 3073 7846
221 | 7684 18868
222 | 5604 13708
223 | 17188 41476
224 | 34828 83756
225 | 7540 18380
226 | 8004 19508
227 | 2648 6796
228 | 5124 12748
229 | 3096 7892
230 | 166516 398532
231 | 13756 33212
232 | 9980 24260
233 | 15980 38660
234 | 9056 22012
235 | 5052 12404
236 | 8212 20124
237 | 11164 27028
238 | 13036 31572
239 | 23596 56892
240 | 2028 5156
241 | 7584 18468
242 | 5772 14244
243 | 4124 10348
244 | 5368 13236
245 | 4364 10828
246 | 5604 13708
247 | 8500 20700
248 | 7676 18652
249 | 8636 20972
250 | 4588 11276
251 | 4152 10404
252 | 4860 12020
253 | 5484 13468
254 | 8636 20972
255 | 5140 12780
256 | 236380 565460
257 | 116500 278900
258 | 36480 87660
259 | 16968 41036
260 | 5232 12964
261 | 13280 32060
262 | 138032 330364
263 | 9044 21988
264 | 22028 53156
265 | 4632 11564
266 | 13196 31892
267 | 65404 156708
268 | 28940 69580
269 | 865 2430
270 | 45988 110276
271 | 670 2040
272 | 4820 11940
273 | 41356 99212
274 | 39844 95588
275 | 897 2494
276 | 4028 9956
277 | 7924 19348
278 | 47756 114612
279 | 47036 112772
280 | 25908 62316
281 | 4516 11132
282 | 29460 70820
283 | 7964 19428
284 | 16964 41028
285 | 22196 53492
286 | 68140 163380
287 | 80924 193948
288 | 63700 152700
289 | 20860 50220
290 | 1682 4464
291 | 16804 40508
292 | 3195 8090
293 | 60348 144596
294 | 1901 4902
295 | 67468 161636
296 | 4772 11844
297 | 11196 27092
298 | 25836 62172
299 | 49676 119252
300 | 6188 15276
301 | 15588 37676
302 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_30_0:
--------------------------------------------------------------------------------
 1 | 30 100000
 2 | 90000 90001
 3 | 89750 89751
 4 | 10001 10002
 5 | 89500 89501
 6 | 10252 10254
 7 | 89250 89251
 8 | 10503 10506
 9 | 89000 89001
10 | 10754 10758
11 | 88750 88751
12 | 11005 11010
13 | 88500 88501
14 | 11256 11262
15 | 88250 88251
16 | 11507 11514
17 | 88000 88001
18 | 11758 11766
19 | 87750 87751
20 | 12009 12018
21 | 87500 87501
22 | 12260 12270
23 | 87250 87251
24 | 12511 12522
25 | 87000 87001
26 | 12762 12774
27 | 86750 86751
28 | 13013 13026
29 | 86500 86501
30 | 13264 13278
31 | 86250 86251
32 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_400_0:
--------------------------------------------------------------------------------
  1 | 400 9486367
  2 | 31860 76620
  3 | 11884 28868
  4 | 10492 25484
  5 | 901 2502
  6 | 43580 104660
  7 | 9004 21908
  8 | 6700 16500
  9 | 29940 71980
 10 | 7484 18268
 11 | 5932 14564
 12 | 7900 19300
 13 | 6564 16028
 14 | 6596 16092
 15 | 8172 19844
 16 | 5324 13148
 17 | 8436 20572
 18 | 7332 17964
 19 | 6972 17044
 20 | 7668 18636
 21 | 6524 15948
 22 | 6244 15388
 23 | 635 1970
 24 | 5396 13292
 25 | 13596 32892
 26 | 51188 122676
 27 | 13684 33068
 28 | 8596 20892
 29 | 156840 375380
 30 | 7900 19300
 31 | 6460 15820
 32 | 14132 34164
 33 | 4980 12260
 34 | 5216 12932
 35 | 6276 15452
 36 | 701 2102
 37 | 3084 7868
 38 | 6924 16948
 39 | 5500 13500
 40 | 3148 7996
 41 | 47844 114788
 42 | 226844 542788
 43 | 25748 61996
 44 | 7012 17124
 45 | 3440 8580
 46 | 15580 37660
 47 | 314 1128
 48 | 2852 7204
 49 | 15500 37500
 50 | 9348 22796
 51 | 17768 42836
 52 | 16396 39692
 53 | 16540 39980
 54 | 395124 944948
 55 | 10196 24692
 56 | 6652 16204
 57 | 4848 11996
 58 | 74372 178244
 59 | 4556 11212
 60 | 4900 12100
 61 | 3508 8716
 62 | 3820 9540
 63 | 5460 13420
 64 | 16564 40028
 65 | 3896 9692
 66 | 3832 9564
 67 | 9012 21924
 68 | 4428 10956
 69 | 57796 138492
 70 | 12052 29204
 71 | 7052 17204
 72 | 85864 205628
 73 | 5068 12436
 74 | 10484 25468
 75 | 4516 11132
 76 | 3620 9140
 77 | 18052 43604
 78 | 21 542
 79 | 15804 38108
 80 | 19020 45940
 81 | 170844 408788
 82 | 3732 9364
 83 | 2920 7340
 84 | 4120 10340
 85 | 6828 16756
 86 | 26252 63204
 87 | 11676 28252
 88 | 19916 47932
 89 | 65488 156876
 90 | 7172 17644
 91 | 3772 9444
 92 | 132868 318036
 93 | 8332 20364
 94 | 5308 13116
 95 | 3780 9460
 96 | 5208 12916
 97 | 56788 136076
 98 | 7172 17644
 99 | 7868 19236
100 | 31412 75524
101 | 9252 22604
102 | 12276 29652
103 | 3712 9324
104 | 4516 11132
105 | 105876 253452
106 | 20084 48468
107 | 11492 27884
108 | 49092 117684
109 | 83452 199804
110 | 71372 171044
111 | 66572 159644
112 | 25268 60836
113 | 64292 154084
114 | 21228 51156
115 | 16812 40524
116 | 19260 46420
117 | 7740 18980
118 | 5632 13964
119 | 3256 8212
120 | 15580 37660
121 | 4824 11948
122 | 59700 143100
123 | 14500 35100
124 | 7208 17716
125 | 6028 14756
126 | 75716 181332
127 | 22364 53828
128 | 7636 18572
129 | 6444 15788
130 | 5192 12884
131 | 7388 18076
132 | 33156 79612
133 | 3032 7564
134 | 6628 16156
135 | 7036 17172
136 | 3200 8100
137 | 7300 17900
138 | 4452 11004
139 | 26364 63428
140 | 14036 33972
141 | 16932 40964
142 | 5788 14276
143 | 70476 168852
144 | 4552 11204
145 | 33980 81660
146 | 19300 46500
147 | 39628 95156
148 | 4484 11068
149 | 55044 131988
150 | 574 1848
151 | 29644 71188
152 | 9460 23020
153 | 106284 254468
154 | 304 1108
155 | 3580 8860
156 | 6308 15516
157 | 10492 25484
158 | 12820 31140
159 | 14436 34972
160 | 5044 12388
161 | 1155 3210
162 | 12468 30236
163 | 4380 10860
164 | 9876 24052
165 | 8752 21404
166 | 8676 21052
167 | 42848 102796
168 | 22844 54988
169 | 6244 15388
170 | 314 1128
171 | 314 1128
172 | 314 1128
173 | 314 1128
174 | 314 1128
175 | 314 1128
176 | 387480 926660
177 | 314 1128
178 | 314 1128
179 | 314 1128
180 | 314 1128
181 | 314 1128
182 | 15996 38692
183 | 8372 20444
184 | 65488 156876
185 | 304 1108
186 | 4756 11812
187 | 5012 12324
188 | 304 1108
189 | 314 1128
190 | 314 1128
191 | 314 1128
192 | 314 1128
193 | 314 1128
194 | 314 1128
195 | 314 1128
196 | 304 1108
197 | 1208 3316
198 | 47728 114556
199 | 314 1128
200 | 314 1128
201 | 314 1128
202 | 314 1128
203 | 314 1128
204 | 314 1128
205 | 104036 249172
206 | 5248 12996
207 | 312 1124
208 | 24468 58836
209 | 7716 18932
210 | 30180 72460
211 | 4824 11948
212 | 1120 3140
213 | 11496 27892
214 | 4916 12132
215 | 14428 34956
216 | 24948 59996
217 | 41100 98700
218 | 28692 69084
219 | 826 2352
220 | 3073 7846
221 | 7684 18868
222 | 5604 13708
223 | 17188 41476
224 | 34828 83756
225 | 7540 18380
226 | 8004 19508
227 | 2648 6796
228 | 5124 12748
229 | 3096 7892
230 | 166516 398532
231 | 13756 33212
232 | 9980 24260
233 | 15980 38660
234 | 9056 22012
235 | 5052 12404
236 | 8212 20124
237 | 11164 27028
238 | 13036 31572
239 | 23596 56892
240 | 2028 5156
241 | 7584 18468
242 | 5772 14244
243 | 4124 10348
244 | 5368 13236
245 | 4364 10828
246 | 5604 13708
247 | 8500 20700
248 | 7676 18652
249 | 8636 20972
250 | 4588 11276
251 | 4152 10404
252 | 4860 12020
253 | 5484 13468
254 | 8636 20972
255 | 5140 12780
256 | 236380 565460
257 | 116500 278900
258 | 36480 87660
259 | 16968 41036
260 | 5232 12964
261 | 13280 32060
262 | 138032 330364
263 | 9044 21988
264 | 22028 53156
265 | 4632 11564
266 | 13196 31892
267 | 65404 156708
268 | 28940 69580
269 | 865 2430
270 | 45988 110276
271 | 670 2040
272 | 4820 11940
273 | 41356 99212
274 | 39844 95588
275 | 897 2494
276 | 4028 9956
277 | 7924 19348
278 | 47756 114612
279 | 47036 112772
280 | 25908 62316
281 | 4516 11132
282 | 29460 70820
283 | 7964 19428
284 | 16964 41028
285 | 22196 53492
286 | 68140 163380
287 | 80924 193948
288 | 63700 152700
289 | 20860 50220
290 | 1682 4464
291 | 16804 40508
292 | 3195 8090
293 | 60348 144596
294 | 1901 4902
295 | 67468 161636
296 | 4772 11844
297 | 11196 27092
298 | 25836 62172
299 | 49676 119252
300 | 6188 15276
301 | 15588 37676
302 | 4412 10924
303 | 26564 63828
304 | 16412 39724
305 | 8108 19716
306 | 6084 14868
307 | 9884 24068
308 | 4224 10548
309 | 14660 35420
310 | 25708 61916
311 | 39228 94156
312 | 40748 97796
313 | 40748 97796
314 | 64276 154052
315 | 114356 273812
316 | 14724 35548
317 | 4540 11180
318 | 11612 28124
319 | 4972 12244
320 | 10060 24420
321 | 14548 35196
322 | 3136 7972
323 | 9132 22164
324 | 5752 14204
325 | 10100 24500
326 | 12172 29444
327 | 24428 58756
328 | 3336 8372
329 | 4356 10812
330 | 8652 21004
331 | 14492 35084
332 | 8796 21492
333 | 6408 15716
334 | 6056 14812
335 | 10124 24548
336 | 387480 926660
337 | 18188 43876
338 | 7732 18964
339 | 9492 23084
340 | 7300 17900
341 | 10052 24404
342 | 19604 47308
343 | 6644 16188
344 | 107364 257028
345 | 91812 219924
346 | 4620 11540
347 | 42848 102796
348 | 33268 79836
349 | 13260 32020
350 | 6564 16028
351 | 6524 15948
352 | 13596 32892
353 | 13596 32892
354 | 47844 114788
355 | 226844 542788
356 | 226844 542788
357 | 226844 542788
358 | 226844 542788
359 | 85864 205628
360 | 170844 408788
361 | 56788 136076
362 | 6628 16156
363 | 10492 25484
364 | 104036 249172
365 | 14428 34956
366 | 14428 34956
367 | 22028 53156
368 | 22028 53156
369 | 22028 53156
370 | 25836 62172
371 | 11612 28124
372 | 11612 28124
373 | 11612 28124
374 | 85872 205644
375 | 1377 3654
376 | 1365820 3265540
377 | 562272 1344644
378 | 1445900 3457100
379 | 501060 1198220
380 | 106224 254348
381 | 492496 1177692
382 | 387824 927548
383 | 151320 362140
384 | 109924 263148
385 | 105696 253092
386 | 96404 230908
387 | 107732 257964
388 | 42140 101180
389 | 102896 246292
390 | 4036 9972
391 | 19616 47332
392 | 100948 241796
393 | 1417728 3389756
394 | 62604 150108
395 | 491820 1176140
396 | 33740 80980
397 | 25216 60732
398 | 111716 267532
399 | 400156 957012
400 | 108800 260500
401 | 1211040 2895580
402 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_40_0:
--------------------------------------------------------------------------------
 1 | 40 100000
 2 | 90001 90000
 3 | 89751 89750
 4 | 10002 10001
 5 | 89501 89500
 6 | 10254 10252
 7 | 89251 89250
 8 | 10506 10503
 9 | 89001 89000
10 | 10758 10754
11 | 88751 88750
12 | 11010 11005
13 | 88501 88500
14 | 11262 11256
15 | 88251 88250
16 | 11514 11507
17 | 88001 88000
18 | 11766 11758
19 | 87751 87750
20 | 12018 12009
21 | 87501 87500
22 | 12270 12260
23 | 87251 87250
24 | 12522 12511
25 | 87001 87000
26 | 12774 12762
27 | 86751 86750
28 | 13026 13013
29 | 86501 86500
30 | 13278 13264
31 | 86251 86250
32 | 13530 13515
33 | 86001 86000
34 | 13782 13766
35 | 85751 85750
36 | 14034 14017
37 | 85501 85500
38 | 14286 14268
39 | 85251 85250
40 | 14538 14519
41 | 86131 86130
42 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_45_0:
--------------------------------------------------------------------------------
 1 | 45 58181
 2 | 1945 4990
 3 | 321 1142
 4 | 2945 7390
 5 | 4136 10372
 6 | 1107 3114
 7 | 1022 2744
 8 | 1101 3102
 9 | 2890 7280
10 | 47019 112738
11 | 1530 3960
12 | 3432 8564
13 | 2165 5630
14 | 1703 4506
15 | 1106 3112
16 | 370 1240
17 | 657 2014
18 | 962 2624
19 | 1060 3020
20 | 805 2310
21 | 689 2078
22 | 1513 3926
23 | 3878 9656
24 | 13504 32708
25 | 1865 4830
26 | 667 2034
27 | 1833 4766
28 | 16553 40006
29 | 1261 3422
30 | 2593 6686
31 | 1170 3240
32 | 794 2288
33 | 671 2042
34 | 7421 18142
35 | 6009 14718
36 | 1767 4634
37 | 2622 6744
38 | 831 2362
39 | 701 2102
40 | 5222 12944
41 | 3086 7872
42 | 900 2500
43 | 3121 7942
44 | 1029 2958
45 | 52555 126010
46 | 389 1278
47 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_4_0:
--------------------------------------------------------------------------------
1 | 4 11
2 | 8 4
3 | 10 5
4 | 15 8
5 | 4 3
6 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_500_0:
--------------------------------------------------------------------------------
  1 | 500 50000
  2 | 384 412
  3 | 7060 7285
  4 | 8475 8103
  5 | 5028 4876
  6 | 9741 9369
  7 | 3360 3538
  8 | 1426 1394
  9 | 2084 2204
 10 | 4865 5362
 11 | 1885 1779
 12 | 8191 8376
 13 | 6296 6460
 14 | 3292 3193
 15 | 10227 9957
 16 | 5744 5513
 17 | 2163 2365
 18 | 10738 9786
 19 | 5099 4865
 20 | 9193 9406
 21 | 7777 7455
 22 | 8538 8090
 23 | 9597 9224
 24 | 1275 1257
 25 | 6317 5831
 26 | 7598 7177
 27 | 2241 2297
 28 | 1398 1271
 29 | 4083 4216
 30 | 6033 5634
 31 | 1694 1560
 32 | 7563 6878
 33 | 12 12
 34 | 7406 6872
 35 | 7679 7142
 36 | 6619 6945
 37 | 9222 8778
 38 | 1869 1785
 39 | 6809 7485
 40 | 4961 5033
 41 | 2616 2719
 42 | 6406 6156
 43 | 1703 1826
 44 | 6415 6795
 45 | 4898 4790
 46 | 7601 7620
 47 | 2145 1971
 48 | 6559 6310
 49 | 1691 1874
 50 | 8734 8092
 51 | 9570 9321
 52 | 7649 7955
 53 | 0 1
 54 | 5652 5146
 55 | 475 517
 56 | 8789 8341
 57 | 1366 1400
 58 | 3325 3230
 59 | 5487 5443
 60 | 7316 7097
 61 | 10232 9979
 62 | 1788 1873
 63 | 9179 9259
 64 | 3790 3940
 65 | 7820 8611
 66 | 4462 4552
 67 | 832 893
 68 | 6798 7209
 69 | 5467 5319
 70 | 5573 6065
 71 | 5489 5010
 72 | 8246 8770
 73 | 2815 2918
 74 | 8766 8355
 75 | 7043 7760
 76 | 8834 8052
 77 | 8549 8969
 78 | 6511 6415
 79 | 9253 9812
 80 | 831 861
 81 | 4587 4755
 82 | 202 210
 83 | 1022 950
 84 | 867 823
 85 | 1989 2194
 86 | 2813 2594
 87 | 1711 1642
 88 | 9343 9828
 89 | 1840 2029
 90 | 2772 2575
 91 | 6035 5564
 92 | 8815 9345
 93 | 9329 8485
 94 | 354 353
 95 | 3488 3792
 96 | 2701 2645
 97 | 102 102
 98 | 3711 4046
 99 | 10505 9897
100 | 8471 9201
101 | 3406 3157
102 | 10171 9442
103 | 6862 7425
104 | 3747 3887
105 | 7132 7137
106 | 7386 7590
107 | 3073 3179
108 | 7566 8244
109 | 2269 2467
110 | 7134 7291
111 | 7750 7078
112 | 8126 8991
113 | 1803 1824
114 | 8229 8894
115 | 9725 9514
116 | 1468 1498
117 | 844 771
118 | 2939 2868
119 | 7538 7210
120 | 380 406
121 | 10182 9845
122 | 176 188
123 | 8874 8977
124 | 5461 5808
125 | 7833 7831
126 | 9668 9122
127 | 3381 3255
128 | 8534 7808
129 | 10002 9684
130 | 8881 9703
131 | 3503 3884
132 | 2774 2742
133 | 6546 6754
134 | 3368 3227
135 | 2269 2521
136 | 3229 3149
137 | 6703 6895
138 | 9740 9718
139 | 1660 1779
140 | 4724 4906
141 | 10161 9765
142 | 2460 2712
143 | 1221 1161
144 | 893 956
145 | 3922 3736
146 | 3837 3854
147 | 4564 4211
148 | 6844 7195
149 | 7300 7204
150 | 550 509
151 | 3347 3315
152 | 8141 8090
153 | 7173 7121
154 | 1386 1366
155 | 2216 2053
156 | 4182 4310
157 | 6496 6753
158 | 7540 7923
159 | 6576 7072
160 | 745 774
161 | 10510 9710
162 | 5294 5494
163 | 6752 6259
164 | 3818 4235
165 | 6704 6462
166 | 212 222
167 | 6247 5995
168 | 7948 8543
169 | 2763 2688
170 | 5698 5186
171 | 2307 2186
172 | 7426 7303
173 | 5292 5134
174 | 9295 8645
175 | 2578 2430
176 | 6097 5571
177 | 2925 3243
178 | 1223 1123
179 | 8720 8978
180 | 4240 4139
181 | 4344 4244
182 | 6250 6864
183 | 6547 7189
184 | 4989 4641
185 | 732 753
186 | 4440 4445
187 | 7861 8726
188 | 147 147
189 | 3066 3394
190 | 5265 5044
191 | 6723 7050
192 | 7443 7655
193 | 6062 6387
194 | 3793 3529
195 | 6167 6689
196 | 1965 1918
197 | 1479 1530
198 | 7177 7624
199 | 3624 3782
200 | 6602 7203
201 | 9195 9398
202 | 8667 8091
203 | 4802 4637
204 | 3317 3035
205 | 10496 9631
206 | 2441 2467
207 | 8759 7973
208 | 320 325
209 | 3459 3770
210 | 4805 4396
211 | 6153 5990
212 | 5076 5513
213 | 6003 6084
214 | 2143 2027
215 | 2915 3169
216 | 6150 6074
217 | 5077 4948
218 | 3335 3361
219 | 8400 8116
220 | 9711 9158
221 | 1375 1467
222 | 6421 6150
223 | 8784 8277
224 | 3085 2946
225 | 247 228
226 | 6182 6208
227 | 7543 7284
228 | 2056 2048
229 | 1198 1190
230 | 4033 4380
231 | 2527 2603
232 | 4158 4618
233 | 2552 2607
234 | 668 609
235 | 7843 8591
236 | 3986 3670
237 | 8463 8184
238 | 6382 6242
239 | 3103 3422
240 | 397 385
241 | 10619 9845
242 | 8138 8106
243 | 8370 8192
244 | 4321 3974
245 | 4514 4964
246 | 4041 4063
247 | 6558 6871
248 | 397 438
249 | 1943 2122
250 | 319 305
251 | 8557 8465
252 | 10517 9695
253 | 7573 8139
254 | 9981 9433
255 | 8833 8354
256 | 5854 5944
257 | 3796 3761
258 | 2043 2109
259 | 7288 7949
260 | 7280 7744
261 | 2163 2065
262 | 2469 2264
263 | 5532 5066
264 | 2318 2387
265 | 7179 6779
266 | 8381 9284
267 | 5665 5694
268 | 3544 3303
269 | 3108 2872
270 | 3050 2801
271 | 7307 6760
272 | 528 536
273 | 8598 8444
274 | 1282 1404
275 | 1912 1919
276 | 6096 6018
277 | 2305 2211
278 | 3787 3723
279 | 7142 6631
280 | 950 965
281 | 7389 7413
282 | 2823 2941
283 | 2097 1979
284 | 7066 6576
285 | 3447 3779
286 | 2727 2493
287 | 7624 8353
288 | 764 776
289 | 4578 4617
290 | 2503 2653
291 | 7276 7099
292 | 6643 6991
293 | 2786 2972
294 | 2422 2349
295 | 6811 6498
296 | 5584 5951
297 | 10727 9755
298 | 3882 3987
299 | 9566 9211
300 | 4396 4126
301 | 8930 8192
302 | 831 849
303 | 4712 4675
304 | 657 602
305 | 2738 3006
306 | 6995 6708
307 | 5598 5844
308 | 8939 9020
309 | 6861 6674
310 | 9795 9952
311 | 2090 2208
312 | 4661 4726
313 | 3258 3155
314 | 6520 6999
315 | 3040 3298
316 | 7137 6758
317 | 8379 8963
318 | 7682 7553
319 | 5225 5634
320 | 5653 5459
321 | 6605 6957
322 | 8226 7939
323 | 7947 8831
324 | 6663 6956
325 | 9263 8743
326 | 8527 7914
327 | 110 116
328 | 486 526
329 | 916 863
330 | 6285 6030
331 | 8658 8005
332 | 9627 9516
333 | 777 752
334 | 5208 5569
335 | 7641 7249
336 | 2961 2726
337 | 255 252
338 | 6656 6447
339 | 10101 9887
340 | 124 133
341 | 8303 7584
342 | 7576 8318
343 | 2428 2643
344 | 4008 4090
345 | 2645 2517
346 | 756 717
347 | 3980 4407
348 | 2950 3236
349 | 9529 9690
350 | 3644 3814
351 | 260 276
352 | 7840 8345
353 | 4601 4493
354 | 7423 7117
355 | 1692 1817
356 | 6957 7465
357 | 2923 3073
358 | 1677 1792
359 | 1138 1088
360 | 5317 5247
361 | 9705 9127
362 | 840 838
363 | 1209 1309
364 | 2481 2369
365 | 7686 8119
366 | 6022 5554
367 | 8029 8016
368 | 5418 5101
369 | 646 613
370 | 9511 8848
371 | 2350 2335
372 | 2544 2444
373 | 6819 7518
374 | 1055 1044
375 | 7563 7599
376 | 4530 4369
377 | 2249 2154
378 | 2244 2095
379 | 2976 3034
380 | 6533 6184
381 | 1518 1625
382 | 2484 2603
383 | 6100 6072
384 | 6326 6297
385 | 7341 7384
386 | 8751 8748
387 | 7195 7352
388 | 2487 2548
389 | 6846 7003
390 | 1049 1102
391 | 3670 3525
392 | 2538 2691
393 | 5378 5906
394 | 1530 1403
395 | 8675 8179
396 | 5411 5421
397 | 308 342
398 | 8138 8884
399 | 3751 4000
400 | 5392 5535
401 | 8288 7690
402 | 3425 3797
403 | 6599 6118
404 | 1855 2050
405 | 8516 8028
406 | 5331 5379
407 | 8180 7989
408 | 708 746
409 | 1217 1315
410 | 5753 5983
411 | 2918 3035
412 | 8370 8675
413 | 9502 9840
414 | 10584 9793
415 | 6538 6077
416 | 3678 3780
417 | 5013 5327
418 | 8374 8415
419 | 2038 1965
420 | 6129 5741
421 | 6622 6292
422 | 7569 7366
423 | 942 963
424 | 1259 1194
425 | 4277 3984
426 | 1121 1021
427 | 6333 5974
428 | 8989 9647
429 | 9265 8860
430 | 8344 8231
431 | 3112 3138
432 | 3347 3355
433 | 1352 1450
434 | 9712 9502
435 | 2307 2209
436 | 5520 5095
437 | 10137 9833
438 | 4583 4634
439 | 4444 4676
440 | 6024 5990
441 | 2481 2671
442 | 9522 9498
443 | 9993 9209
444 | 5687 6004
445 | 420 414
446 | 5365 5480
447 | 834 836
448 | 4767 4745
449 | 2409 2497
450 | 1897 1847
451 | 8698 9047
452 | 4612 4405
453 | 3524 3486
454 | 1156 1173
455 | 6516 5996
456 | 7741 7139
457 | 8546 9331
458 | 2349 2219
459 | 6095 6103
460 | 835 872
461 | 724 666
462 | 5288 5114
463 | 5659 6134
464 | 2847 3042
465 | 9627 9511
466 | 189 189
467 | 1509 1378
468 | 3609 3963
469 | 3802 3926
470 | 134 139
471 | 5689 6206
472 | 9097 9077
473 | 6347 5951
474 | 3007 2835
475 | 4305 3972
476 | 3155 3228
477 | 4130 3764
478 | 3904 3631
479 | 1915 2109
480 | 9014 9897
481 | 8504 8943
482 | 651 708
483 | 8947 8695
484 | 6239 5900
485 | 8311 8054
486 | 1412 1422
487 | 6513 7166
488 | 8244 8159
489 | 8127 8361
490 | 5552 5782
491 | 4068 4325
492 | 1013 935
493 | 10274 9984
494 | 2977 3181
495 | 2751 2876
496 | 10479 9715
497 | 2260 2159
498 | 5603 5520
499 | 3074 3065
500 | 9406 9789
501 | 9416 9939
502 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_50_0:
--------------------------------------------------------------------------------
 1 | 50 341045
 2 | 1906 4912
 3 | 41516 99732
 4 | 23527 56554
 5 | 559 1818
 6 | 45136 108372
 7 | 2625 6750
 8 | 492 1484
 9 | 1086 3072
10 | 5516 13532
11 | 4875 12050
12 | 7570 18440
13 | 4436 10972
14 | 620 1940
15 | 50897 122094
16 | 2129 5558
17 | 4265 10630
18 | 706 2112
19 | 2721 6942
20 | 16494 39888
21 | 29688 71276
22 | 3383 8466
23 | 2181 5662
24 | 96601 231302
25 | 1795 4690
26 | 7512 18324
27 | 1242 3384
28 | 2889 7278
29 | 2133 5566
30 | 103 706
31 | 4446 10992
32 | 11326 27552
33 | 3024 7548
34 | 217 934
35 | 13269 32038
36 | 281 1062
37 | 77174 184848
38 | 952 2604
39 | 15572 37644
40 | 566 1832
41 | 4103 10306
42 | 313 1126
43 | 14393 34886
44 | 1313 3526
45 | 348 1196
46 | 419 1338
47 | 246 992
48 | 445 1390
49 | 23552 56804
50 | 23552 56804
51 | 67 634
52 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_50_1:
--------------------------------------------------------------------------------
 1 | 50 5000
 2 | 995 945
 3 | 259 242
 4 | 258 244
 5 | 279 281
 6 | 576 582
 7 | 126 119
 8 | 280 303
 9 | 859 913
10 | 270 279
11 | 389 408
12 | 927 925
13 | 281 305
14 | 624 662
15 | 961 938
16 | 757 718
17 | 231 250
18 | 838 767
19 | 154 158
20 | 649 595
21 | 277 268
22 | 180 167
23 | 895 957
24 | 23 22
25 | 930 948
26 | 93 102
27 | 61 62
28 | 626 604
29 | 342 349
30 | 262 279
31 | 215 221
32 | 183 203
33 | 958 889
34 | 205 213
35 | 859 835
36 | 171 166
37 | 566 575
38 | 779 758
39 | 704 706
40 | 196 182
41 | 26 28
42 | 726 729
43 | 621 671
44 | 800 864
45 | 580 579
46 | 535 553
47 | 647 632
48 | 168 163
49 | 90 95
50 | 679 745
51 | 440 438
52 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_60_0:
--------------------------------------------------------------------------------
 1 | 60 100000
 2 | 90000 90001
 3 | 89750 89751
 4 | 10001 10002
 5 | 89500 89501
 6 | 10252 10254
 7 | 89250 89251
 8 | 10503 10506
 9 | 89000 89001
10 | 10754 10758
11 | 88750 88751
12 | 11005 11010
13 | 88500 88501
14 | 11256 11262
15 | 88250 88251
16 | 11507 11514
17 | 88000 88001
18 | 11758 11766
19 | 87750 87751
20 | 12009 12018
21 | 87500 87501
22 | 12260 12270
23 | 87250 87251
24 | 12511 12522
25 | 87000 87001
26 | 12762 12774
27 | 86750 86751
28 | 13013 13026
29 | 86500 86501
30 | 13264 13278
31 | 86250 86251
32 | 13515 13530
33 | 86000 86001
34 | 13766 13782
35 | 85750 85751
36 | 14017 14034
37 | 85500 85501
38 | 14268 14286
39 | 85250 85251
40 | 14519 14538
41 | 85000 85001
42 | 14770 14790
43 | 84750 84751
44 | 15021 15042
45 | 84500 84501
46 | 15272 15294
47 | 84250 84251
48 | 15523 15546
49 | 84000 84001
50 | 15774 15798
51 | 83750 83751
52 | 16025 16050
53 | 83500 83501
54 | 16276 16302
55 | 83250 83251
56 | 16527 16554
57 | 83000 83001
58 | 16778 16806
59 | 82750 82751
60 | 17029 17058
61 | 82500 82501
62 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_82_0:
--------------------------------------------------------------------------------
 1 | 82 104723596
 2 | 13211 13211 
 3 | 26422 26422 
 4 | 52844 52844 
 5 | 105688 105688 
 6 | 211376 211376 
 7 | 422752 422752 
 8 | 845504 845504 
 9 | 1691008 1691008 
10 | 3382016 3382016 
11 | 6764032 6764032 
12 | 13528064 13528064 
13 | 27056128 27056128 
14 | 54112256 54112256 
15 | 13212 13212 
16 | 26424 26424 
17 | 52848 52848 
18 | 105696 105696 
19 | 211392 211392 
20 | 422784 422784 
21 | 845568 845568
22 | 1691136 1691136 
23 | 3382272 3382272 
24 | 6764544 6764544 
25 | 13529088 13529088 
26 | 27058176 27058176 
27 | 54116352 54116352 
28 | 39638 39638 
29 | 79276 79276 
30 | 158552 158552
31 | 317104 317104 
32 | 634208 634208 
33 | 1268416 1268416 
34 | 2536832 2536832 
35 | 5073664 5073664 
36 | 10147328 10147328 
37 | 20294656 20294656 
38 | 40589312 40589312 
39 | 81178624 81178624
40 | 52844 52844 
41 | 105688 105688 
42 | 211376 211376 
43 | 422752 422752 
44 | 845504 845504 
45 | 1691008 1691008 
46 | 3382016 3382016 
47 | 6764032 6764032 
48 | 13528064 13528064
49 | 27056128 27056128 
50 | 54112256 54112256 
51 | 66060 66060 
52 | 132120 132120 
53 | 264240 264240 
54 | 528480 528480 
55 | 1056960 1056960 
56 | 2113920 2113920 
57 | 4227840 4227840
58 | 8455680 8455680 
59 | 16911360 16911360 
60 | 33822720 33822720 
61 | 67645440 67645440 
62 | 79268 79268 
63 | 158536 158536 
64 | 317072 317072 
65 | 634144 634144 
66 | 1268288 1268288
67 | 2536576 2536576 
68 | 5073152 5073152 
69 | 10146304 10146304 
70 | 20292608 20292608 
71 | 40585216 40585216 
72 | 81170432 81170432 
73 | 92482 92482 
74 | 184964 184964 
75 | 369928 369928
76 | 739856 739856 
77 | 1479712 1479712 
78 | 2959424 2959424 
79 | 5918848 5918848 
80 | 11837696 11837696 
81 | 23675392 23675392 
82 | 47350784 47350784 
83 | 94701568 94701568
84 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_lecture_dp_1:
--------------------------------------------------------------------------------
1 | 3 9
2 | 5 4
3 | 6 5
4 | 3 2
5 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_lecture_dp_2:
--------------------------------------------------------------------------------
1 | 4 7
2 | 16 2
3 | 19 3
4 | 23 4
5 | 28 5
6 | 
7 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/9. Discrete optimization with RL/knapsack_problem/knapsack/handout.pdf


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/solver.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from collections import namedtuple
 5 | Item = namedtuple("Item", ['index', 'value', 'weight'])
 6 | 
 7 | def solve_it(input_data):
 8 |     # Modify this code to run your optimization algorithm
 9 | 
10 |     # parse the input
11 |     lines = input_data.split('\n')
12 | 
13 |     firstLine = lines[0].split()
14 |     item_count = int(firstLine[0])
15 |     capacity = int(firstLine[1])
16 | 
17 |     items = []
18 | 
19 |     for i in range(1, item_count+1):
20 |         line = lines[i]
21 |         parts = line.split()
22 |         items.append(Item(i-1, int(parts[0]), int(parts[1])))
23 | 
24 |     # a trivial algorithm for filling the knapsack
25 |     # it takes items in-order until the knapsack is full
26 |     value = 0
27 |     weight = 0
28 |     taken = [0]*len(items)
29 | 
30 |     for item in items:
31 |         if weight + item.weight <= capacity:
32 |             taken[item.index] = 1
33 |             value += item.value
34 |             weight += item.weight
35 |     
36 |     # prepare the solution in the specified output format
37 |     output_data = str(value) + ' ' + str(0) + '\n'
38 |     output_data += ' '.join(map(str, taken))
39 |     return output_data
40 | 
41 | 
42 | if __name__ == '__main__':
43 |     import sys
44 |     if len(sys.argv) > 1:
45 |         file_location = sys.argv[1].strip()
46 |         with open(file_location, 'r') as input_data_file:
47 |             input_data = input_data_file.read()
48 |         print(solve_it(input_data))
49 |     else:
50 |         print('This test requires an input file.  Please select one from the data directory. (i.e. python solver.py ./data/ks_4_0)')
51 | 
52 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/solverJava.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | from subprocess import Popen, PIPE
 6 | 
 7 | def solve_it(input_data):
 8 | 
 9 |     # Writes the inputData to a temporay file
10 | 
11 |     tmp_file_name = 'tmp.data'
12 |     tmp_file = open(tmp_file_name, 'w')
13 |     tmp_file.write(input_data)
14 |     tmp_file.close()
15 | 
16 |     # Runs the command: java Solver -file=tmp.data
17 | 
18 |     process = Popen(['java', 'Solver', '-file=' + tmp_file_name], stdout=PIPE, universal_newlines=True)
19 |     (stdout, stderr) = process.communicate()
20 | 
21 |     # removes the temporay file
22 |     os.remove(tmp_file_name)
23 | 
24 |     return stdout.strip()
25 | 
26 | 
27 | import sys
28 | 
29 | if __name__ == '__main__':
30 |     if len(sys.argv) > 1:
31 |         file_location = sys.argv[1].strip()
32 |         with open(file_location, 'r') as input_data_file:
33 |             input_data = input_data_file.read()
34 |         print(solve_it(input_data))
35 |     else:
36 |         print('This test requires an input file.  Please select one from the data directory. (i.e. python solver.py ./data/ks_4_0)')
37 | 
38 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/lessons/README.md:
--------------------------------------------------------------------------------
1 | # Personal notes on Discrete Optimization
2 | 
3 | > These notes are taken during Coursera course on discrete optimization
4 | 
5 | https://www.coursera.org/learn/discrete-optimization/home/welcome


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/lessons/discrete_optimization.md:
--------------------------------------------------------------------------------
 1 | # Discrete Optimization
 2 | 
 3 | 
 4 | - The goal of optimization is to find the optimal or a least a high quality solution in a reasonable amount of time even when we face exponential growth in the number of possible solutions
 5 | 
 6 | 
 7 | ## How to solve an optimization problem ?
 8 | - Formalization of the mathematical model
 9 | - Start with a greedy algorithm
10 | 
11 | 
12 | ## Formalizing an optimization task
13 | **How to model an optimization problem?**<br>
14 | Agreeing on a mathematical form on the problem.
15 | - Choose some decision variables (typically encode the result we are interested in)
16 | - Express the problem constraint in terms of variables (what the solutions to the problem are)
17 | - Express the objective function to be maximized (specifying the quality of a solution)
18 | 
19 | > There can be many ways to model an optimization problem
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/lessons/dynamic_programming.md:
--------------------------------------------------------------------------------
 1 | # Dynamic Programming
 2 | ![](https://caseine.org/pluginfile.php/2558/course/section/269/Capture%20d%E2%80%99%C3%A9cran%202016-05-17%20%C3%A0%2022.15.49.png)
 3 | 
 4 | ## What is dynamic programming ?
 5 | **A widely used optimization technique**
 6 | - for certain classes of problems
 7 | - heavily used in computational biology
 8 | 
 9 | **Basic principle**
10 | - Divide and conquer
11 | - Bottom-up computation
12 | 
13 | 
14 | 
15 | 
16 | 
17 | ## 📚 References
18 | - [Wikipedia homepage](https://en.wikipedia.org/wiki/Dynamic_programming)


--------------------------------------------------------------------------------
/9. Discrete optimization with RL/lessons/knapsack_problem.md:
--------------------------------------------------------------------------------
  1 | # Knapsack Problem
  2 | ![](https://miro.medium.com/max/684/0*3dS6Jw8NzzSD-mn8.jpg)
  3 | 
  4 | 
  5 | ## 📝 Conventions & notations
  6 | - I = {1,2,...,n}
  7 | - O(k,j) denotes an optimal solution to the knapsack problem with capacity k and items [1,...,j]. This is what we want to solve. 
  8 | 
  9 | ## 👜 Modeling the Knapsack Problem
 10 | 
 11 | ### Defining the problem
 12 | - **Variables**
 13 |   - Decision variables
 14 |     - ``xi`` denotes whether the item i is selected in the solution
 15 |   - Other variables
 16 |     - ``wi`` denotes the weight of the item i
 17 |     - ``vi`` denotes the value of the item i
 18 | - **Problem constraint**
 19 |   - Selected item cannot exceed the capacity of the backpack ``sum(wi*xi) <= K`` 
 20 | - **Objective function**
 21 |   - We want to maximize ``sum(vi*xi)``
 22 | 
 23 | 
 24 | ### Number of configurations
 25 | - How many possible configurations of 1 and 0 for ``(x1,x2,...,xn)`` ? -> Search space
 26 | - Not all of them are feasible -> Feasible search space
 27 | - How many are they ? ``2^n`` -> exponential growth -> brute force is not possible for more than a few objects
 28 | 
 29 | 
 30 | ## 🤗 Greedy algorithms
 31 | 
 32 | ### Greedy algorithms to solve the knapsack problem
 33 | 1. Take lighter item first
 34 | 2. Take most valuable item first
 35 | 3. Compute value density ratio (value/weight) and take the most important value
 36 | 
 37 | For one problem, **there are many greedy algorithms**. With no guarantee it's optimal. It really depends on the input. But it's quick to implement, it's often fast to run and it serves as a baseline. 
 38 | 
 39 | ### Advantages
 40 | - Quick to design and implement
 41 | - Can be very fast
 42 | 
 43 | ### Problems
 44 | - No quality guarantee
 45 | - Quality can vary widely on the input
 46 | - Problem feasibility needs to be easy
 47 | 
 48 | 
 49 | 
 50 | 
 51 | ## ⚡ Dynamic Programming
 52 | ### Recurrence relations (Bellmann equations)
 53 | We want to solve O(k,j) by recurrence : 
 54 | - Assume we know how to solve ``O(k,j-1)`` for all k, and we want to solve ``O(k,j)`` by adding one more item : the item ``j``
 55 | - If ``wj <= k`` there are two cases: 
 56 |   - Either we don't select item j and the best solution is then ``O(k,j-1)``
 57 |   - Or we select item j and the best solution is ``vj + O(k-wj,j-1)``
 58 | - Or written mathematically 
 59 | ```
 60 | - O(k,j) = max(O(k,j-1),vj + O(k-wj,j-1)) if wj <=k
 61 | - O(k,j) = O(k,j-1) otherwise
 62 | ```
 63 | - And of course ``O(k,0) = 0`` for all k (there are no items, there is no value)
 64 | 
 65 | ### Recursive function in Python 
 66 | ```python
 67 | # Variables
 68 | w = list(...)
 69 | v = list(...)
 70 | 
 71 | def O(k,j):
 72 |     if (j == 0):
 73 |         return 0
 74 |     elif w[j] <= k:
 75 |         return max([O(k,j-1),v[j] + O(k-w[j],j-1)])
 76 |     else:
 77 |         return O(k,j-1)      
 78 | ```
 79 | How efficient is this approach? Not a lot if we go top down (to compute many values we need to compute again the same values, that's often the case with complex recursive functions). <br>
 80 | That's why Dynamic Programming is all about Bottom-up approach. ###
 81 | 
 82 | ### Bottom-up computation
 83 | - Compute the recursive equations bottom up
 84 |   - Start with zero items
 85 |   - Add one more item, then two ... 
 86 | 
 87 | Often needs to be thought as a tables (capacity x items)
 88 | 
 89 | ![](https://sadakurapati.files.wordpress.com/2013/11/knapsack2.png?w=584)
 90 | 
 91 | - Building the table one by one using the formula
 92 | - Tracing back to find the optimal solution
 93 | 
 94 | ### Efficiency
 95 | - Complexity of the algorithm -> time to fill the table ie O(Kn), we could think it's polynomial not exactly
 96 | - It's not polynomial, but exponential because K is represented in a computer by log(K) bits. So we call this type of algorithms pseudo-polynomials. Because it's only efficient when K is small
 97 | 
 98 | 
 99 | 
100 | ## 🌴 Branch, bound & relaxation
101 | When you do exhaustive search it's basically building a decision tree of 2^n branches. Relaxation methods are to explore the tree without computing all nodes. We iterate two steps: 
102 | - **Branching** (splitting the problem into a number of subproblems like in exhaustive search)
103 | - **Bounding** (finding an optimistic estimate of the best solution to the subproblem, maximization = upper bound & minimization = lower bound)
104 | 
105 | ### How to find an optimization evaluation? How can I relax my problem?
106 | > - We relax a constraint
107 | > - Build the tree and evaluate an optimistic estimate
108 | > - If branching leads to a lower optimisatic estimate, we don't even need to go further in a branch and we can prune it. 
109 | 
110 | *Branching & bounding can be done a lot of different ways, see Search strategies section*
111 | 
112 | ### What can we relax in the knapsack problem?
113 | - The capacity constraint -> take everything in the knapsack
114 | - The selection variable, we can imagine taking a fraction of each item (xi is now a decimal), this is called **linear relaxation** 
115 | 
116 | Linear relaxation for the knapsack algorithm works by : 
117 | - Sorting by value density ratio
118 | - Fill the rest of the knapsack with a fraction of the last item that can partially fit, and you have an optimistic estimate for pruning
119 | 
120 | 
121 | ## 🔍 Search strategies
122 | 
123 | ### Depth-first
124 | Prunes when a node estimation is worse than the best found
125 | - Go deep
126 | - When does it prune? when it finds a new node worse than the found solution
127 | - Is it memory efficient? It can be if we look at a few branches
128 | 
129 | ### Best-first
130 | Select the node with the best estimation
131 | - Go for the best
132 | - When does it prune? when all the nodes are worse than a found solution
133 | - It it memory efficient? If we exaggerate and think of a knapsack with infinite capacity, we will commpute the entire tree, so infinite time and infinite space would be required. When the problem is small, it can be efficient. 
134 | 
135 | 
136 | ### Least discrepancy or limited discrepancy search
137 | Trust a greedy heuristic
138 | - Assume a good heuristic is available
139 |   - It makes very few mistakes
140 |   - Search tree is binary
141 |   - Following the heuristic means branching left and branching right means the heuristic was wrong
142 | - Limited Discrepancy Search (LDS)
143 |   - Avoid mistakes at all costs
144 |   - Explore the search space in increasing order of mistakes
145 |   - Trusting the heuristic less and less
146 | 
147 | We explore the search spaces in waves, and trust the heuristic less and less. <br>Its efficiency really depends on a trade off between space and time. 
148 | 
149 | 
150 | ### And many others search strategies
151 | 
152 | 
153 | 
154 | 
155 | ## 📚 References
156 | - [Wikipedia page on Knapsack problem](https://en.wikipedia.org/wiki/Knapsack_problem)
157 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Reinforcement Learning
  2 | 
  3 | ![](https://cdn-images-1.medium.com/max/1600/1*D7JNcbvhP5UOR6_Ul-WJaw.gif)
  4 | 
  5 | ##### Realizations
  6 | - Old experiments on RL (2016)
  7 | - Solving OpenAI Gym environments (2017-2018)
  8 | - Developing an multi agent Tic Tac Toe environment and solving it with Policy Gradients (May 2017)
  9 | - Using RL to automatically adapt the cooling in a Data Center (August 2017)
 10 | - Controlling Robots via Reinforcement Learning (November 2017)
 11 | - Playing and solving the Chrome Dinosaur Game with Evolution Strategies and PyTorch (January 2018)
 12 | - Delivery optimization using Reinforcement Learning (January 2019)
 13 | - Rubik's Cube optimization (February 2019)
 14 | - Multi-Agents simulations (November 2019)
 15 | 
 16 | 
 17 | ##### Libraries
 18 | - ``rl`` is a simple library to do Reinforcement Learning with Keras, it uses old Keras versions and should be updated
 19 | - ``hyperion`` is a simple multi agent simulation library
 20 | 
 21 | 
 22 | ***
 23 | ### References and inspiration
 24 | ###### RL references
 25 | 
 26 | - [Udemy course on RL](https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python/)  
 27 | - [David Silver course on RL at UCL](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html)
 28 | - [Berkeley course on AI](http://ai.berkeley.edu/lecture_slides.html)
 29 | - [Spinning up course by OpenAI](https://spinningup.openai.com/en/latest/)
 30 | 
 31 | 
 32 | ##### Q Learning references
 33 | - [Q Learning tutorial by Arthur Juliani](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0)
 34 | - [Q Learning tutorial on Keon.io](https://keon.io/deep-q-learning/)
 35 | - [Q Learning tutorial by Udacity](https://github.com/udacity/deep-learning/blob/master/reinforcement/Q-learning-cart.ipynb)
 36 | 
 37 | 
 38 | ##### Deep Q Learning
 39 | - [David Silver's Deep Q Learning course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Resources_files/deep_rl.pdf)
 40 | - [Demystyfing Deep Reinforcement Learning](http://neuro.cs.ut.ee/demystifying-deep-reinforcement-learning/)
 41 | - [Siraj Raval's notebook on Deep Q Learning](https://github.com/llSourcell/deep_q_learning/blob/master/03_PlayingAgent.ipynb)
 42 | 
 43 | ##### Policy Gradient
 44 | - [Deep Reinforcement Learning: Pong from Pixels](http://karpathy.github.io/2016/05/31/rl/) Andrej Karpathy's blog article on RL (always a reference)
 45 | 
 46 | 
 47 | 
 48 | ##### Evolution strategies
 49 | - [Evolution strategies](https://blog.openai.com/evolution-strategies/) - OpenAI
 50 | - [How evolution taught us the “genetic algorithm”](https://blog.sicara.com/was-darwin-a-great-computer-scientist-81ffa1dd72f9)
 51 | - [Making a robot learn how to move, part 1 — Evolutionary algorithms](https://medium.com/towards-data-science/making-a-robot-learn-how-to-move-part-1-evolutionary-algorithms-340f239c9cd2)
 52 | - [Optimize a quadratic function with ES](https://gist.github.com/karpathy/77fbb6a8dac5395f1b73e7a89300318d) - Andrej Karpathy
 53 | - [Evolution modelling with creatures](https://www.youtube.com/watch?v=GOFws_hhZs8)
 54 | - [Genetic biwalkers](http://rednuht.org/genetic_walkers/)
 55 | - [Evolving stable strategies](http://blog.otoro.net/2017/11/12/evolving-stable-strategies/)
 56 | 
 57 | ##### Actor Critic, A2C, ACKTR
 58 | - [A3C tutorial tutorial by Arthur Juliani](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2)
 59 | - [A3C tutorial with Keras and OpenAI](http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html)
 60 | - [A3C explananations and implementations](https://mpatacchiola.github.io/blog/2017/02/11/dissecting-reinforcement-learning-4.html)
 61 | - [ACKTR & A2C](https://blog.openai.com/baselines-acktr-a2c) - by OpenAI
 62 | - [ACKTR & A3C implementation in PyTorch](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr)
 63 | - [Actor Critic model with Keras](https://towardsdatascience.com/reinforcement-learning-w-keras-openai-actor-critic-models-f084612cfd69)
 64 | - [Car Racing solving with A3C](https://fr.scribd.com/document/358019044/Reinforcement-Car-Racing-with-A3C) and [this solution as well](https://web.stanford.edu/class/cs221/2017/restricted/p-final/elibol/final.pdf)
 65 | 
 66 | ##### PPO, TRPO
 67 | - [Proximal Policy Optimization](https://blog.openai.com/openai-baselines-ppo/) - by OpenAI
 68 | - [PPO,TRPO tutorials](https://learningai.io/projects/2017/07/28/ai-gym-workout.html)
 69 | 
 70 | 
 71 | 
 72 | ##### AlphaGo
 73 | - [ELI5 MCTS](https://www.reddit.com/r/explainlikeimfive/comments/4aimqo/eli5_alpha_go_and_its_decision_making_process/)
 74 | - [How AlphaGo works](https://www.tastehit.com/blog/google-deepmind-alphago-how-it-works/)
 75 | - [Original Paper for AlphaGo](http://airesearch.com/wp-content/uploads/2016/01/deepmind-mastering-go.pdf) by David Silver
 76 | 
 77 | 
 78 | ##### Monte Carlo Tree Search
 79 | - [Udacity videos on MCTS](https://www.youtube.com/watch?v=onBYsen2_eA)
 80 | 
 81 | 
 82 | ##### Misc
 83 | - [Learning to optimize with RL](http://bair.berkeley.edu/blog/2017/09/12/learning-to-optimize-with-rl/)
 84 | 
 85 | 
 86 | ##### Environment
 87 | - [Unity Agents](https://blogs.unity3d.com/2017/09/19/introducing-unity-machine-learning-agents/)
 88 | - [SerpentAI](https://github.com/SerpentAI/SerpentAI)
 89 | - [Pybullet](https://docs.google.com/document/d/10sXEhzFRSnvFcl3XxNGhnD4N2SedqwdAvK3dsihxVUA/edit)
 90 | 
 91 | ***
 92 | ### Papers
 93 | 
 94 | - [Discrete Sequential Prediction of Continuous Actions for Deep RL](https://arxiv.org/abs/1705.05035)
 95 | - [Emotion in Reinforcement Learning Agents and Robots: A Survey](https://arxiv.org/abs/1705.05172)
 96 | - [Combating Reinforcement Learning's Sisyphean Curse with Intrinsic Fear](https://arxiv.org/abs/1611.01211)
 97 | - [Curiosity-driven Exploration by Self-supervised Prediction](https://arxiv.org/abs/1705.05363)
 98 | - [End-to-end optimization of goal-driven and visually grounded dialogue systems](https://arxiv.org/abs/1703.05423)
 99 | - [Deep reinforcement learning from human preferences](https://arxiv.org/abs/1706.03741) - OpenAI
100 | - [Programmable Agents](https://arxiv.org/abs/1706.06383) - Deepmind
101 | - [Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments](https://arxiv.org/pdf/1706.02275.pdf) - OpenAI
102 | - [Actor-Critic Reinforcement Learning with Simultaneous Human Control and Feedback](https://arxiv.org/abs/1703.01274)
103 | - [Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295)
104 | - [Hindsight Experience Replay](https://arxiv.org/abs/1707.01495)
105 | - [DARLA: Improving Zero-Shot Transfer in Reinforcement Learning](https://arxiv.org/pdf/1707.08475.pdf)
106 | - [Leveraging Demonstrations for Deep Reinforcement Learning on Robotics Problems with Sparse Rewards](https://arxiv.org/pdf/1707.08817.pdf)
107 | - [Evolution Strategies as a Scalable Alternative to Reinforcement Learning](https://arxiv.org/abs/1703.03864)
108 | - [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/abs/1707.06887)
109 | - [Intrinsically Motivated Goal Exploration Processes with Automatic Curriculum Learning](https://arxiv.org/abs/1708.02190?)
110 | - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf)
111 | - [Value Iteration Networks](https://arxiv.org/pdf/1602.02867.pdf)
112 | - [A deep reinforcement learning chatbot](https://arxiv.org/pdf/1709.02349.pdf) - MILA
113 | - [The Uncertainty Bellman Equation and Exploration](https://arxiv.org/abs/1709.05380)
114 | - [Deep Reinforcement Learning that Matters](https://arxiv.org/abs/1709.06560)
115 | - [Overcoming Exploration in Reinforcement Learning with Demonstrations](https://arxiv.org/abs/1709.10089)
116 | - [Using Simulation and Domain Adaptation to Improve Efficiency of Deep Robotic Grasping](https://arxiv.org/abs/1709.07857)
117 | - [Rainbow: Combining Improvements in Deep Reinforcement Learning](https://arxiv.org/pdf/1710.02298.pdf)
118 | - [Optimizing Long Short-Term Memory Recurrent Neural Networks UsingAnt Colony Optimization to Predict Turbine Engine Vibration](https://arxiv.org/pdf/1710.03753.pdf)
119 | - [Continuous Adaptation via Meta-Learning in Nonstationary and Competitive Environments](https://arxiv.org/pdf/1710.03641.pdf)
120 | - [Emergent Complexity via Multi-Agent Competition](https://arxiv.org/pdf/1710.03748.pdf)
121 | - [A Unified Game-Theoretic Approach to Multiagent Reinforcement Learning](https://arxiv.org/pdf/1711.00832.pdf)
122 | 


--------------------------------------------------------------------------------
/rl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/rl/__init__.py


--------------------------------------------------------------------------------
/rl/agents/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/rl/agents/__init__.py


--------------------------------------------------------------------------------
/rl/agents/actor_critic_agent.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | 
  8 | Started on the 25/08/2017
  9 | 
 10 | Inspiration from https://keon.io/deep-q-learning/
 11 | https://towardsdatascience.com/reinforcement-learning-w-keras-openai-actor-critic-models-f084612cfd69
 12 | 
 13 | theo.alves.da.costa@gmail.com
 14 | https://github.com/theolvs
 15 | ------------------------------------------------------------------------
 16 | """
 17 | 
 18 | 
 19 | 
 20 | import os
 21 | import matplotlib.pyplot as plt
 22 | import pandas as pd
 23 | import numpy as np 
 24 | import sys
 25 | import random
 26 | import time
 27 | import random
 28 | import numpy as np
 29 | 
 30 | from keras.models import Sequential, Model
 31 | from keras.layers import Dense, Dropout, Input
 32 | from keras.layers.merge import Add, Multiply
 33 | from keras.optimizers import Adam
 34 | import keras.backend as K
 35 | import tensorflow as tf
 36 | 
 37 | from rl import utils
 38 | from rl.memory import Memory
 39 | from rl.agents.base_agent import Agent
 40 | 
 41 | 
 42 | 
 43 | class ActorCriticAgent(Agent):
 44 |     def __init__(self,env,sess,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.995,gamma = 0.95,lr = 0.001,tau = 0.125,actor_activation = "linear"):
 45 | 
 46 |         # Main parameters
 47 |         self.env = env
 48 |         self.sess = sess
 49 | 
 50 |         # Other parameters
 51 |         self.memory = Memory()
 52 |         self.epsilon = epsilon
 53 |         self.epsilon_min = epsilon_min
 54 |         self.epsilon_decay = epsilon_decay
 55 |         self.gamma = gamma
 56 |         self.tau = tau
 57 |         self.lr = lr
 58 | 
 59 |         # Models
 60 |         self.initialize_actor_model(actor_activation)
 61 |         self.initialize_critic_model()
 62 | 
 63 | 
 64 |     def initialize_actor_model(self,actor_activation):
 65 |         self.actor_state_input, self.actor_model = self.build_actor_model(actor_activation)
 66 |         _, self.target_actor_model = self.build_actor_model(actor_activation)
 67 | 
 68 |         self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.env.action_space.shape[0]]) # where we will feed de/dC (from critic)
 69 |         
 70 |         actor_model_weights = self.actor_model.trainable_weights
 71 |         self.actor_grads = tf.gradients(self.actor_model.output, actor_model_weights, -self.actor_critic_grad) # dC/dA (from actor)
 72 |         grads = zip(self.actor_grads, actor_model_weights)
 73 |         self.optimize = tf.train.AdamOptimizer(self.lr).apply_gradients(grads)
 74 | 
 75 | 
 76 | 
 77 |     def build_actor_model(self,activation = ""):
 78 |         # Define the layers of the network
 79 |         state_input = Input(shape=self.env.observation_space.shape)
 80 |         h1 = Dense(24, activation='relu')(state_input)
 81 |         h2 = Dense(48, activation='relu')(h1)
 82 |         h3 = Dense(24, activation='relu')(h2)
 83 |         output = Dense(self.env.action_space.shape[0],activation='relu')(h3)
 84 |         
 85 |         # Compute the model
 86 |         model = Model(input=state_input, output=output)
 87 |         model.compile(loss="mse", optimizer=Adam(lr=self.lr))
 88 |         return state_input, model
 89 | 
 90 | 
 91 |     def initialize_critic_model(self):
 92 |         self.critic_state_input, self.critic_action_input, self.critic_model = self.build_critic_model()
 93 |         _, _, self.target_critic_model = self.build_critic_model()
 94 | 
 95 |         self.critic_grads = tf.gradients(self.critic_model.output,self.critic_action_input) # where we calcaulte de/dC for feeding above
 96 |         
 97 |         # Initialize for later gradient calculations
 98 |         self.sess.run(tf.initialize_all_variables())
 99 | 
100 | 
101 | 
102 | 
103 |     def build_critic_model(self):
104 |         state_input = Input(shape=self.env.observation_space.shape)
105 |         state_h1 = Dense(24, activation='relu')(state_input)
106 |         state_h2 = Dense(48)(state_h1)
107 |         
108 |         action_input = Input(shape=self.env.action_space.shape)
109 |         action_h1    = Dense(48)(action_input)
110 |         
111 |         merged    = Add()([state_h2, action_h1])
112 |         merged_h1 = Dense(24, activation='relu')(merged)
113 |         output = Dense(1, activation='relu')(merged_h1)
114 |         model  = Model(input=[state_input,action_input], output=output)
115 |         
116 |         model.compile(loss="mse", optimizer=Adam(lr=self.lr))
117 |         return state_input, action_input, model
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 |     def train(self,batch_size = 32):
125 |         if self.epsilon > self.epsilon_min:
126 |             self.epsilon *= self.epsilon_decay
127 | 
128 |         if len(self.memory.cache) > batch_size:
129 |             batch = random.sample(self.memory.cache, batch_size)
130 |         else:
131 |             batch = self.memory.cache
132 | 
133 |         self._train_actor(batch)
134 |         self._train_critic(batch)
135 | 
136 | 
137 | 
138 | 
139 | 
140 |     def _train_actor(self,batch):
141 |         for state,action,reward,next_state,_ in batch:
142 |             predicted_action = self.actor_model.predict(state)
143 |             grads = self.sess.run(self.critic_grads, feed_dict={
144 |                 self.critic_state_input:  state,
145 |                 self.critic_action_input: predicted_action
146 |             })[0]
147 | 
148 |             self.sess.run(self.optimize, feed_dict={
149 |                 self.actor_state_input: state,
150 |                 self.actor_critic_grad: grads
151 |             })
152 |             
153 | 
154 | 
155 |     def _train_critic(self,batch):
156 |         for state,action,reward,next_state,done in batch:
157 |             if not done:
158 |                 target_action = self.target_actor_model.predict(next_state)
159 |                 future_reward = self.target_critic_model.predict([next_state, target_action])[0][0]
160 |                 reward += self.gamma * future_reward
161 |             self.critic_model.fit([state, action], reward, verbose=0)
162 |         
163 | 
164 | 
165 |     def _update_actor_target(self):
166 |         actor_model_weights  = self.actor_model.get_weights()
167 |         actor_target_weights = self.target_critic_model.get_weights()
168 |         
169 |         for i in range(len(actor_target_weights)):
170 |             actor_target_weights[i] = actor_model_weights[i]
171 |         self.target_critic_model.set_weights(actor_target_weights)
172 | 
173 | 
174 |     def _update_critic_target(self):
175 |         critic_model_weights  = self.critic_model.get_weights()
176 |         critic_target_weights = self.critic_target_model.get_weights()
177 |         
178 |         for i in range(len(critic_target_weights)):
179 |             critic_target_weights[i] = critic_model_weights[i]
180 |         self.critic_target_model.set_weights(critic_target_weights)     
181 | 
182 | 
183 |     def update_target(self):
184 |         self._update_actor_target()
185 |         self._update_critic_target()
186 | 
187 | 
188 | 
189 | 
190 |     def act(self, state):
191 | 
192 | 
193 | 
194 |             
195 |         if np.random.random() < self.epsilon:
196 |             return self.env.action_space.sample()
197 |         return self.actor_model.predict(state)


--------------------------------------------------------------------------------
/rl/agents/base_agent.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*- 
 3 | 
 4 | 
 5 | """--------------------------------------------------------------------
 6 | REINFORCEMENT LEARNING
 7 | 
 8 | Started on the 25/08/2017
 9 | 
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 | 
15 | 
16 | import os
17 | import matplotlib.pyplot as plt
18 | import pandas as pd
19 | import numpy as np 
20 | import sys
21 | import random
22 | import time
23 | import random
24 | import numpy as np
25 | 
26 | 
27 | 
28 | 
29 | 
30 | class Agent(object):
31 |     def __init__(self):
32 |         pass
33 | 
34 | 
35 |     def expand_state_vector(self,state):
36 |         if len(state.shape) == 1 or len(state.shape)==3:
37 |             return np.expand_dims(state,axis = 0)
38 |         else:
39 |             return state
40 | 
41 | 
42 | 
43 |     def remember(self,*args):
44 |         self.memory.save(args)
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/rl/agents/dqn2d_agent.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | 
  8 | Started on the 19/10/2018
  9 | 
 10 | theo.alves.da.costa@gmail.com
 11 | https://github.com/theolvs
 12 | ------------------------------------------------------------------------
 13 | """
 14 | 
 15 | 
 16 | 
 17 | import os
 18 | import matplotlib.pyplot as plt
 19 | import pandas as pd
 20 | import numpy as np 
 21 | import sys
 22 | import random
 23 | import time
 24 | import random
 25 | import numpy as np
 26 | 
 27 | from keras.models import Sequential
 28 | from keras.layers import Dense
 29 | from keras.optimizers import Adam
 30 | 
 31 | from keras.layers import Input, LSTM, Dense, Conv2D, MaxPooling2D, Dropout, Flatten
 32 | from keras.layers import concatenate
 33 | from keras.models import Model
 34 | from keras.utils import plot_model,to_categorical
 35 | 
 36 | from rl import utils
 37 | from rl.memory import Memory
 38 | from rl.agents.base_agent import Agent
 39 | from rl.agents.dqn_agent import DQNAgent
 40 | 
 41 | 
 42 | 
 43 | 
 44 | 
 45 | def create_vision_model(input_shape):
 46 |     input_image = Input(shape=input_shape)
 47 |     conv1 = Conv2D(32,(3,3),padding="same",activation="relu")(input_image)
 48 |     pool1 = MaxPooling2D(pool_size=(2,2))(conv1)
 49 |     drop1 = Dropout(0.25)(pool1)
 50 | 
 51 |     conv2 = Conv2D(64,(3,3),padding="same",activation="relu")(drop1)
 52 |     pool2 = MaxPooling2D(pool_size=(2,2))(conv2)
 53 |     drop2 = Dropout(0.25)(pool2)
 54 | 
 55 |     out = Flatten()(drop2)
 56 | 
 57 |     vision_model = Model(inputs=input_image, outputs=out)
 58 |     return vision_model
 59 | 
 60 | 
 61 | def create_model(input_shape,output_dim):
 62 | 
 63 |     input1 = Input(shape=input_shape)
 64 |     input2 = Input(shape=input_shape)
 65 | 
 66 |     vision_model = create_vision_model(input_shape)
 67 | 
 68 |     out1 = vision_model(input1)
 69 |     out2 = vision_model(input2)
 70 | 
 71 |     concatenated = concatenate([out1,out2])
 72 | 
 73 |     hidden = Dense(128, activation='relu')(concatenated)
 74 |     output = Dense(output_dim, activation='softmax')(hidden)
 75 | 
 76 |     model = Model([input1, input2], output)
 77 | 
 78 |     return model
 79 | 
 80 | 
 81 | 
 82 | 
 83 | 
 84 | class DQN2DAgent(DQNAgent):
 85 | 
 86 | 
 87 | 
 88 |     def build_model(self,states_size,actions_size):
 89 |         model = create_model(states_size,actions_size)
 90 |         model.compile(loss='categorical_crossentropy',
 91 |                       metrics=['accuracy'],
 92 |                       optimizer="adam")
 93 |         return model
 94 | 
 95 | 
 96 | 
 97 |     def train(self,batch_size = 32):
 98 |         if len(self.memory.cache) > batch_size:
 99 |             batch = random.sample(self.memory.cache, batch_size)
100 |         else:
101 |             batch = self.memory.cache
102 | 
103 |         # Unzip batch
104 |         states,actions,rewards,next_states,before_states,dones = zip(*batch)
105 | 
106 |         # Concat states
107 |         states = np.vstack(states)
108 |         next_states = np.vstack(next_states)
109 |         before_states = np.vstack(before_states)
110 | 
111 |         # Compute targets
112 |         targets = self.model.predict([before_states,states])
113 | 
114 |         # Compute new targets
115 |         rewards = np.array(rewards).reshape(-1,1)
116 |         dones = 1-np.array(dones,dtype=np.int32).reshape(-1,1)
117 |         predictions = (self.gamma * np.max(self.model.predict([before_states,states]),axis = 1)).reshape(-1,1)
118 |         new_targets = rewards + dones * predictions
119 |         new_targets = new_targets.astype("float32")
120 | 
121 |         # Correct targets
122 |         actions = to_categorical(np.array(actions).reshape(-1,1),self.actions_size)
123 |         np.place(targets,actions,new_targets)
124 | 
125 |         # Training
126 |         self.model.fit([states,next_states],targets,epochs = 1,verbose = 0)
127 | 
128 |         if self.epsilon > self.epsilon_min:
129 |             self.epsilon *= self.epsilon_decay
130 | 
131 | 
132 | 
133 | 
134 | 
135 |     def act(self,before_state,state):
136 |         before_state = self.expand_state_vector(before_state)
137 |         state = self.expand_state_vector(state)
138 | 
139 | 
140 |         if np.random.rand() > self.epsilon:
141 |             q = self.model.predict([before_state,state])
142 | 
143 |             if self.observation_type == "discrete":
144 |                 a = np.argmax(q[0])
145 |             elif self.observation_type == "continuous":
146 |                 a = np.squeeze(np.clip(q,self.low,self.high))
147 | 
148 |         else:
149 |             if self.observation_type == "discrete":
150 |                 a = np.random.randint(self.actions_size)
151 |             elif self.observation_type == "continuous":
152 |                 a = np.random.uniform(self.low,self.high,self.actions_size)
153 |         return a 
154 | 
155 | 
156 | 


--------------------------------------------------------------------------------
/rl/agents/dqn_agent.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | 
  8 | Started on the 25/08/2017
  9 | 
 10 | Inspiration from https://keon.io/deep-q-learning/
 11 | 
 12 | theo.alves.da.costa@gmail.com
 13 | https://github.com/theolvs
 14 | ------------------------------------------------------------------------
 15 | """
 16 | 
 17 | 
 18 | 
 19 | import os
 20 | import matplotlib.pyplot as plt
 21 | import pandas as pd
 22 | import numpy as np 
 23 | import sys
 24 | import random
 25 | import time
 26 | import random
 27 | import numpy as np
 28 | 
 29 | from keras.models import Sequential
 30 | from keras.layers import Dense
 31 | from keras.optimizers import Adam
 32 | 
 33 | 
 34 | from rl import utils
 35 | from rl.memory import Memory
 36 | from rl.agents.base_agent import Agent
 37 | 
 38 | 
 39 | 
 40 | class DQNAgent(Agent):
 41 |     def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.995,gamma = 0.95,lr = 0.001,low = 0,high = 1,max_memory = 2000,observation_type = "discrete"):
 42 |         assert observation_type in ["discrete","continuous"]
 43 |         self.states_size = states_size
 44 |         self.actions_size = actions_size
 45 |         self.memory = Memory(max_memory = max_memory)
 46 |         self.epsilon = epsilon
 47 |         self.low = low
 48 |         self.high = high
 49 |         self.observation_type = observation_type
 50 |         self.epsilon_min = epsilon_min
 51 |         self.epsilon_decay = epsilon_decay
 52 |         self.gamma = gamma
 53 |         self.lr = lr
 54 |         self.model = self.build_model(states_size,actions_size)
 55 | 
 56 | 
 57 | 
 58 | 
 59 | 
 60 |     def build_model(self,states_size,actions_size):
 61 |         model = Sequential()
 62 |         model.add(Dense(24,input_dim = states_size,activation = "relu"))
 63 |         model.add(Dense(24,activation = "relu"))
 64 |         model.add(Dense(actions_size,activation = "linear"))
 65 |         model.compile(loss='mse',
 66 |                       optimizer=Adam(lr=self.lr))
 67 |         return model
 68 | 
 69 | 
 70 | 
 71 | 
 72 | 
 73 | 
 74 |     def train(self,batch_size = 32):
 75 |         if len(self.memory.cache) > batch_size:
 76 |             batch = random.sample(self.memory.cache, batch_size)
 77 |         else:
 78 |             batch = self.memory.cache
 79 | 
 80 |         for state,action,reward,next_state,done in batch:
 81 |             state = self.expand_state_vector(state)
 82 |             next_state = self.expand_state_vector(next_state)
 83 | 
 84 | 
 85 |             targets = self.model.predict(state)
 86 | 
 87 |             if not done:
 88 |                 target = reward + self.gamma * np.max(self.model.predict(next_state))
 89 |             else:
 90 |                 target = reward
 91 | 
 92 |             targets[0][action] = target
 93 | 
 94 |             self.model.fit(state,targets,epochs = 1,verbose = 0)
 95 | 
 96 | 
 97 |         if self.epsilon > self.epsilon_min:
 98 |             self.epsilon *= self.epsilon_decay
 99 | 
100 | 
101 | 
102 | 
103 | 
104 |     def act(self,state):
105 |         state = self.expand_state_vector(state)
106 | 
107 | 
108 |         if np.random.rand() > self.epsilon:
109 |             q = self.model.predict(state)
110 | 
111 |             if self.observation_type == "discrete":
112 |                 a = np.argmax(q[0])
113 |             elif self.observation_type == "continuous":
114 |                 a = np.squeeze(np.clip(q,self.low,self.high))
115 | 
116 |         else:
117 |             if self.observation_type == "discrete":
118 |                 a = np.random.randint(self.actions_size)
119 |             elif self.observation_type == "continuous":
120 |                 a = np.random.uniform(self.low,self.high,self.actions_size)
121 |         return a 
122 | 
123 | 
124 | 


--------------------------------------------------------------------------------
/rl/agents/q_agent.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*- 
 3 | 
 4 | 
 5 | """--------------------------------------------------------------------
 6 | REINFORCEMENT LEARNING
 7 | 
 8 | Started on the 25/08/2017
 9 | 
10 | 
11 | theo.alves.da.costa@gmail.com
12 | https://github.com/theolvs
13 | ------------------------------------------------------------------------
14 | """
15 | 
16 | 
17 | 
18 | import os
19 | import matplotlib.pyplot as plt
20 | import pandas as pd
21 | import numpy as np 
22 | import sys
23 | import random
24 | import time
25 | import random
26 | import numpy as np
27 | 
28 | 
29 | 
30 | 
31 | from rl import utils
32 | from rl.memory import Memory
33 | from rl.agents.base_agent import Agent
34 | 
35 | 
36 | 
37 | class QAgent(Agent):
38 |     def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.999,gamma = 0.95,lr = 0.8):
39 |         self.states_size = states_size
40 |         self.actions_size = actions_size
41 |         self.epsilon = epsilon
42 |         self.epsilon_min = epsilon_min
43 |         self.epsilon_decay = epsilon_decay
44 |         self.gamma = gamma
45 |         self.lr = lr
46 |         self.Q = self.build_model(states_size,actions_size)
47 | 
48 | 
49 |     def build_model(self,states_size,actions_size):
50 |         Q = np.zeros([states_size,actions_size])
51 |         return Q
52 | 
53 | 
54 |     def train(self,s,a,r,s_next):
55 |         self.Q[s,a] = self.Q[s,a] + self.lr * (r + self.gamma*np.max(self.Q[s_next,a]) - self.Q[s,a])
56 | 
57 |         if self.epsilon > self.epsilon_min:
58 |             self.epsilon *= self.epsilon_decay
59 | 
60 | 
61 |     def act(self,s):
62 | 
63 |         q = self.Q[s,:]
64 | 
65 |         if np.random.rand() > self.epsilon:
66 |             a = np.argmax(q)
67 |         else:
68 |             a = np.random.randint(self.actions_size)
69 | 
70 |         return a
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------
/rl/agents/sarsa_agent.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*- 
 3 | 
 4 | 
 5 | """--------------------------------------------------------------------
 6 | REINFORCEMENT LEARNING
 7 | 
 8 | Started on the 25/08/2017
 9 | 
10 | 
11 | theo.alves.da.costa@gmail.com
12 | https://github.com/theolvs
13 | ------------------------------------------------------------------------
14 | """
15 | 
16 | 
17 | 
18 | import os
19 | import matplotlib.pyplot as plt
20 | import pandas as pd
21 | import numpy as np 
22 | import sys
23 | import random
24 | import time
25 | import random
26 | import numpy as np
27 | 
28 | 
29 | 
30 | 
31 | from rl import utils
32 | from rl.memory import Memory
33 | from rl.agents.base_agent import Agent
34 | 
35 | 
36 | 
37 | class SarsaAgent(Agent):
38 |     def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.999,gamma = 0.95,lr = 0.8):
39 |         self.states_size = states_size
40 |         self.actions_size = actions_size
41 |         self.epsilon = epsilon
42 |         self.epsilon_min = epsilon_min
43 |         self.epsilon_decay = epsilon_decay
44 |         self.gamma = gamma
45 |         self.lr = lr
46 |         self.Q = self.build_model(states_size,actions_size)
47 | 
48 | 
49 | 
50 | 
51 | 
52 |     def build_model(self,states_size,actions_size):
53 |         Q = np.zeros([states_size,actions_size])
54 |         return Q
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 |     def train(self,s,a,r,s_next):
62 |         a_next = self.act(s_next)
63 |         self.Q[s,a] = self.Q[s,a] + self.lr * (r + self.gamma*self.Q[s_next,a_next] - self.Q[s,a])
64 | 
65 |         if self.epsilon > self.epsilon_min:
66 |             self.epsilon *= self.epsilon_decay
67 | 
68 | 
69 | 
70 | 
71 |     def act(self,s):
72 | 
73 |         q = self.Q[s,:]
74 | 
75 |         if np.random.rand() > self.epsilon:
76 |             a = np.argmax(q)
77 |         else:
78 |             a = np.random.randint(self.actions_size)
79 | 
80 |         return a
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/rl/envs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/rl/envs/__init__.py


--------------------------------------------------------------------------------
/rl/envs/data_center_cooling.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | 
  4 | 
  5 | """--------------------------------------------------------------------
  6 | REINFORCEMENT LEARNING
  7 | DATA CENTER COOLING
  8 | 
  9 | Started on the 25/08/2017
 10 | 
 11 | 
 12 | theo.alves.da.costa@gmail.com
 13 | https://github.com/theolvs
 14 | ------------------------------------------------------------------------
 15 | """
 16 | 
 17 | 
 18 | import os
 19 | import matplotlib.pyplot as plt
 20 | import pandas as pd
 21 | import numpy as np 
 22 | import sys
 23 | import random
 24 | import time
 25 | from tqdm import tqdm
 26 | from collections import Counter
 27 | from scipy import stats
 28 | 
 29 | # Deep Learning (Keras, Tensorflow)
 30 | import tensorflow as tf
 31 | from keras.models import Sequential
 32 | from keras.optimizers import SGD,RMSprop, Adam
 33 | from keras.layers import Dense, Dropout, Activation, Flatten
 34 | from keras.layers import MaxPooling2D,ZeroPadding2D,Conv2D
 35 | from keras.utils.np_utils import to_categorical
 36 | 
 37 | 
 38 | # Plotly
 39 | import plotly.graph_objs as go
 40 | from plotly import tools
 41 | 
 42 | np.random.seed(1)
 43 | 
 44 | 
 45 | #===========================================================================================================
 46 | # COOLING CENTER ENVIRONMENT
 47 | #===========================================================================================================
 48 | 
 49 | 
 50 | 
 51 | class DataCenterCooling(object):
 52 |     def __init__(self,levels_activity = 20,levels_cooling = 10,cost_factor = 5,risk_factor = 1.6,keep_cooling = False):
 53 | 
 54 |         self.hour = 0
 55 |         self.cost_factor = cost_factor
 56 |         self.risk_factor = risk_factor
 57 |         self.levels_activity = levels_activity
 58 |         self.levels_cooling = levels_cooling
 59 |         self.define_activity(levels_activity)
 60 |         if not hasattr(self,"cooling") or not keep_cooling:
 61 |             self.define_cooling(levels_cooling)
 62 | 
 63 |         
 64 |     def define_activity(self,levels_activity):
 65 |         # Define the peaks of activity
 66 |         peak_morning = np.random.randint(7,10)
 67 |         peak_evening = np.random.randint(17,22)
 68 | 
 69 |         # Build the distribution
 70 |         x1 = np.array(stats.poisson.pmf(range(24),peak_morning))
 71 |         x2 = np.array(stats.poisson.pmf(range(24),peak_evening))
 72 |         x = x1 + x2
 73 |         x *= (100/0.14)
 74 | 
 75 |         # Discretize the distribution
 76 |         take_closest = lambda j,vector:min(vector,key=lambda x:abs(x-j))
 77 |         percentiles = np.percentile(x,range(0,100,int(100/levels_activity)))
 78 |         assert len(percentiles) == levels_activity
 79 |         x_disc = np.array([take_closest(y,percentiles) for y in x])
 80 | 
 81 |         # Store the variable
 82 |         self.observation_space = percentiles
 83 |         self.activity = np.expand_dims(x_disc,axis = 0)
 84 | 
 85 | 
 86 | 
 87 |     def define_cooling(self,levels_cooling):
 88 |         self.action_space = list([int(100/levels_cooling*i) for i in range(levels_cooling)])
 89 |         assert len(self.action_space) == levels_cooling
 90 | 
 91 |         initial_value = random.choice(self.action_space)
 92 |         self.cooling = np.full((1,24),initial_value)
 93 | 
 94 | 
 95 | 
 96 |     def reset(self):
 97 |         self.__init__(self.levels_activity,self.levels_cooling,self.cost_factor)
 98 |         return self.reset_state()
 99 | 
100 |     def reset_state(self):
101 |         activity = self.activity[0][0]
102 |         activity_state = self.convert_activity_to_state(activity)
103 |         return activity_state
104 | 
105 | 
106 |     def convert_activity_to_state(self,activity):
107 |         state = int(np.where(self.observation_space == activity)[0][0])
108 |         return state
109 | 
110 | 
111 | 
112 |     def render(self,with_plotly = False):
113 | 
114 |         rewards,winnings,losses,failures = self.compute_daily_rewards()
115 | 
116 |         if not with_plotly:
117 |             # Show the activity and cooling
118 |             plt.figure(figsize = (14,5))
119 |             plt.plot(np.squeeze(self.activity),c ="red",label = "activity")
120 |             plt.plot(np.squeeze(self.cooling),c = "blue",label = "cooling")
121 |             plt.legend()
122 |             plt.show()
123 | 
124 |             # Show the rewards
125 |             plt.figure(figsize = (14,5))
126 |             plt.title("Total reward : {}".format(int(np.sum(rewards))))
127 |             plt.plot(rewards,c = "blue",label = "profits")
128 |             plt.plot(losses*(-1),c = "red",label = "costs")
129 |             plt.plot(winnings,c = "green",label = "revenues")
130 |             plt.legend()
131 |             plt.show()
132 |         else:
133 |             data_states = self.render_states_plotly()["data"]
134 |             data_rewards = self.render_rewards_plotly()["data"]
135 |             data_states
136 |             fig = tools.make_subplots(rows=2, cols=1, specs=[[{}], [{}]],
137 |                           shared_xaxes=True, shared_yaxes=False,
138 |                           vertical_spacing=0.1)
139 | 
140 |             for i,trace in enumerate(data_rewards):
141 |                 fig.append_trace(trace, 2, 1)
142 | 
143 |             for i,trace in enumerate(data_states):
144 |                 fig.append_trace(trace, 1, 1)
145 | 
146 |             # print(len(failures))
147 |             # print(len(rewards))
148 | 
149 |             # shapes = [{"type":"line","x0":hour+1,"y0":0,"x1":hour+1,"y1":failure} for hour,failure in enumerate(failures) if failure > 0]
150 |             fig['layout'].update(title="Total reward : {}".format(int(np.sum(rewards))))
151 |             fig['layout']['xaxis'].update(dtick = 1)
152 |             # fig['layout'].update(shapes=shapes)
153 |             return fig
154 | 
155 | 
156 |     def render_states_plotly(self):
157 |         # Create a trace
158 |         x = list(range(24))
159 |         trace_activity = go.Scatter(x = x,y = np.squeeze(self.activity),name = "activity",line = dict(color = "red",width = 2),ysrc = "activity")
160 |         trace_cooling = go.Scatter(x = x,y = np.squeeze(self.cooling),name = "cooling",line = dict(color = "#34aac1",width = 2))
161 | 
162 |         data = [trace_activity,trace_cooling]
163 |         fig = {"data":data}
164 |         return fig
165 | 
166 | 
167 |     def render_rewards_plotly(self):
168 |         rewards,winnings,losses,failures = self.compute_daily_rewards()
169 |         # Create a trace
170 |         x = list(range(24))
171 |         trace_rewards = go.Scatter(x = x,y = np.squeeze(rewards),name = "rewards",line = dict(color = "#34aac1",width = 2),ysrc = "rewards")
172 |         trace_winnings = go.Scatter(x = x,y = np.squeeze(winnings),name = "revenues",line = dict(color = "#10c576",width = 1),mode = "lines+markers")
173 |         trace_losses = go.Scatter(x = x,y = np.squeeze(losses),name = "costs",line = dict(color = "red",width = 1),mode = "lines+markers")
174 | 
175 |         data = [trace_rewards,trace_winnings,trace_losses]
176 |         fig = {"data":data}
177 |         return fig
178 | 
179 | 
180 |         
181 | 
182 | 
183 |     def compute_reward(self,activity,cooling):
184 | 
185 |         # CALCULATING THE WINNINGS
186 |         win = activity
187 | 
188 |         # CALCULATING THE LOSSES
189 |         if cooling >= activity:
190 |             cost = (0 if self.cost_factor < 1.0 else 1)*(cooling)**np.sqrt(self.cost_factor)
191 |             failure = 0
192 |         else:
193 |             difference = (activity-cooling)/(cooling+1)
194 |             default_probability = np.tanh(difference)
195 |             if np.random.rand() > default_probability or self.risk_factor < 1.0:
196 |                 cost = 0
197 |             else:
198 |                 cost = np.random.normal(loc = self.risk_factor,scale = 0.4) * 150
199 | 
200 |             # cost += (cooling * min(1,self.cost_factor))**2
201 |             cost += (0 if self.cost_factor < 1.0 else (1-1/(self.cost_factor+0.1)))*(cooling)
202 | 
203 |             failure = cost
204 | 
205 |         return win,cost,failure
206 | 
207 | 
208 | 
209 | 
210 | 
211 | 
212 |     def compute_daily_rewards(self):
213 |         winnings = []
214 |         losses = []
215 |         rewards = []
216 |         failures = []
217 |         for i in range(24):
218 |             activity = self.activity[0][i]
219 |             cooling = self.cooling[0][i]
220 |             win,loss,failure = self.compute_reward(activity,cooling)
221 |             winnings.append(win)
222 |             losses.append(loss)
223 |             rewards.append(win-loss)
224 |             failures.append(failure)
225 | 
226 |         return np.array(rewards),np.array(winnings),np.array(losses),np.array(failures)
227 | 
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
234 |     def step(self,cooling_action):
235 | 
236 |         # Convert cooling_action to cooling_value
237 |         cooling = self.action_space[cooling_action]
238 | 
239 |         # Update the cooling
240 |         self.cooling[0][self.hour] = cooling
241 | 
242 |         activity = self.activity[0][self.hour]
243 |         win,loss,failure = self.compute_reward(activity,cooling)
244 |         reward = win-loss
245 | 
246 |         self.hour += 1
247 | 
248 |         if int(self.hour) == 24:
249 |             new_state = self.reset_state()
250 |             done = True
251 |         else:
252 |             new_activity = self.activity[0][self.hour]
253 |             new_state = self.convert_activity_to_state(new_activity)
254 |             done = False
255 | 
256 | 
257 |         return new_state,reward,done
258 | 
259 | 
260 |     
261 |     
262 | 
263 | 
264 | 


--------------------------------------------------------------------------------
/rl/memory.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*- 
 3 | 
 4 | 
 5 | """--------------------------------------------------------------------
 6 | REINFORCEMENT LEARNING
 7 | 
 8 | Started on the 25/08/2017
 9 | 
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 | 
15 | 
16 | 
17 | from collections import deque
18 | 
19 | 
20 | 
21 | 
22 | class Memory(object):
23 |     def __init__(self,max_memory = 2000):
24 |         self.cache = deque(maxlen=max_memory)
25 |     
26 |     def save(self,args):
27 |         self.cache.append(args)
28 | 
29 |     def empty_cache(self):
30 |         self.__init__()
31 | 
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/rl/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*- 
 3 | 
 4 | 
 5 | """--------------------------------------------------------------------
 6 | REINFORCEMENT LEARNING
 7 | 
 8 | Started on the 25/08/2017
 9 | 
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 | 
15 | 
16 | 
17 | import os
18 | import matplotlib.pyplot as plt
19 | import pandas as pd
20 | import numpy as np 
21 | import sys
22 | import random
23 | import time
24 | import random
25 | import numpy as np
26 | import pylab
27 | 
28 | 
29 | 
30 | def plot_average_running_rewards(rewards,save = None):
31 |     average_running_rewards = np.cumsum(rewards)/np.array(range(1,len(rewards)+1))
32 |     figure = plt.figure(figsize = (15,4))
33 |     plt.plot(average_running_rewards)
34 | 
35 |     if save is None:
36 |         plt.show()
37 |     else:
38 |         plt.savefig(save)
39 | 
40 | 
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------