├── .gitignore
├── 0. Old
│   ├── 1. Reinforcement Learning - Pickup problem.ipynb
│   ├── 2. Reinforcement Learning - Multi-armed bandits.ipynb
│   └── scripts
│       ├── algorithms.py
│       ├── grid_world.py
│       ├── maze.py
│       ├── multi_armed_bandit.py
│       ├── open_ai_gym.py
│       └── pickup_problem.py
├── 0. Solving Gym environments
│   ├── Agents development - Breakout.ipynb
│   ├── breakout_with_rl.py
│   ├── cartpole_with_deepqlearning.py
│   └── pendulum_with_actorcritic.py
├── 1. Tic Tac Toe
│   ├── 1. Solving Tic Tac Toe with Policy gradients.ipynb
│   └── images
│       ├── game_random_rl_agents.gif
│       ├── game_random_rl_agents2.gif
│       ├── game_random_rl_agents3.gif
│       ├── game_random_rules_agents.gif
│       ├── game_random_rules_agents2.gif
│       ├── game_rules_rl_agents.gif
│       └── game_two_random_agents.gif
├── 2. Data Center Cooling
│   ├── 0. Explaining the Data Center Cooling environment.ipynb
│   ├── 1. Reinforcement Learning - Q Learning.ipynb
│   ├── 2. Reinforcement Learning - Deep-Q-Learning.ipynb
│   ├── README.md
│   └── app.py
├── 3. Robotics
│   ├── Minitaur pybullet environment.ipynb
│   └── minitaur.py
├── 4. Chrome Dino
│   ├── 20180102 - Chrome Dino development.ipynb
│   ├── 20180203 - Genetic algorithms experiments.ipynb
│   ├── README.md
│   ├── dino.py
│   ├── experiments.py
│   └── images
│       ├── capture1.png
│       ├── dino_hardcoded_agent.gif
│       ├── dino_ml_agent1.gif
│       └── dino_ml_agent1_bad.gif
├── 5. Delivery Optimization
│   ├── Optimizing delivery with Reinforcement Learning.ipynb
│   ├── README.md
│   ├── Routing optimization with Deep Reinforcement Learning.ipynb
│   ├── delivery.py
│   ├── env1.png
│   ├── env2.png
│   ├── env3.png
│   ├── training.png
│   ├── training_100_stops.gif
│   ├── training_100_stops_traffic.gif
│   ├── training_10_stops.gif
│   ├── training_500_stops.gif
│   ├── training_500_stops_traffic.gif
│   └── training_50_stops.gif
├── 6. Solving a Rubik's Cube
│   ├── Solving a Rubik's cube with RL.ipynb
│   └── rubik.py
├── 7. Multi-Agents Simulations
│   ├── 20191018 - Sugarscape playground.ipynb
│   ├── 20191112 - Chicken game.ipynb
│   ├── 20200318 - Hyperion dev.ipynb
│   ├── README.md
│   ├── pygame_test.py
│   ├── test.gif
│   └── test2.gif
├── 8. Unity ML agents tests
│   ├── README.md
│   └── rolling_a_ball
│       ├── 20200202 - Rolling a Ball.ipynb
│       └── rollingaball1.png
├── 9. Discrete optimization with RL
│   ├── README.md
│   ├── Reinforcement Learning for knapsack problem.ipynb
│   ├── knapsack_problem
│   │   └── knapsack
│   │       ├── Solver.java
│   │       ├── _coursera
│   │       ├── data
│   │       │   ├── ks_10000_0
│   │       │   ├── ks_1000_0
│   │       │   ├── ks_100_0
│   │       │   ├── ks_100_1
│   │       │   ├── ks_100_2
│   │       │   ├── ks_106_0
│   │       │   ├── ks_19_0
│   │       │   ├── ks_200_0
│   │       │   ├── ks_200_1
│   │       │   ├── ks_300_0
│   │       │   ├── ks_30_0
│   │       │   ├── ks_400_0
│   │       │   ├── ks_40_0
│   │       │   ├── ks_45_0
│   │       │   ├── ks_4_0
│   │       │   ├── ks_500_0
│   │       │   ├── ks_50_0
│   │       │   ├── ks_50_1
│   │       │   ├── ks_60_0
│   │       │   ├── ks_82_0
│   │       │   ├── ks_lecture_dp_1
│   │       │   └── ks_lecture_dp_2
│   │       ├── handout.pdf
│   │       ├── solver.py
│   │       ├── solverJava.py
│   │       └── submit.py
│   └── lessons
│       ├── README.md
│       ├── discrete_optimization.md
│       ├── dynamic_programming.md
│       └── knapsack_problem.md
├── README.md
└── rl
    ├── __init__.py
    ├── agents
    │   ├── __init__.py
    │   ├── actor_critic_agent.py
    │   ├── base_agent.py
    │   ├── dqn2d_agent.py
    │   ├── dqn_agent.py
    │   ├── q_agent.py
    │   └── sarsa_agent.py
    ├── envs
    │   ├── __init__.py
    │   ├── data_center_cooling.py
    │   └── tictactoe.py
    ├── memory.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # Spyder project settings
86 | .spyderproject
87 | .spyproject
88 |
89 | # Rope project settings
90 | .ropeproject
91 |
92 | # mkdocs documentation
93 | /site
94 |
95 | # mypy
96 | .mypy_cache/
97 |
--------------------------------------------------------------------------------
/0. Old/scripts/algorithms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 | First RL script done using Keras and policy gradients
8 |
9 | - Inspired by @steinbrecher script on https://gym.openai.com/evaluations/eval_usjJ7onVTTwrn43wrbBiAv
10 | - Still inspired by Karpathy's work too
11 |
12 | Started on the 30/12/2016
13 |
14 |
15 |
16 | theo.alves.da.costa@gmail.com
17 | https://github.com/theolvs
18 | ------------------------------------------------------------------------
19 | """
20 |
21 |
22 | import numpy as np
23 | # import gym
24 | import os
25 | from keras.models import load_model, Sequential
26 | from keras.layers import Dense, Activation, Dropout, Flatten, Convolution2D, MaxPooling2D
27 | from keras.optimizers import SGD, RMSprop
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 | class Brain():
38 | def __init__(self,env,env_name = "default",H = 500,learning_rate = 0.01,dropout = 0.0,hidden_layers = 1,reload = False,input_dim = 0,output_dim = 0):
39 |
40 | self.env_name = env_name
41 | self.base_path = "C:/Data Science/15. Reinforcement Learning/0. Models/"
42 | file = [x for x in os.listdir(self.base_path) if self.env_name in x]
43 |
44 | self.H = H
45 | self.gamma = 0.5
46 | self.batch_size = 10
47 | self.learning_rate = learning_rate
48 | self.dropout = dropout
49 | self.hidden_layers = hidden_layers
50 |
51 | if input_dim == 0:
52 | try:
53 | self.observation_space = env.observation_space.n
54 | self.observation_to_vectorize = True
55 | except Exception as e:
56 | self.observation_space = env.observation_space.shape[0]
57 | self.observation_to_vectorize = False
58 | else:
59 | self.observation_space = input_dim
60 | self.observation_to_vectorize = False
61 |
62 | if output_dim == 0:
63 | self.action_space = env.action_space.n
64 | else:
65 | self.action_space = output_dim
66 |
67 |
68 | if len(file) == 0 or reload:
69 | print('>> Building a fully connected neural network')
70 | self.episode_number = 0
71 | self.model = self.build_fcc_model_with_regularization(H,input_dim = self.observation_space,output_dim = self.action_space,dropout = self.dropout,hidden_layers = self.hidden_layers)
72 | else:
73 | print('>> Loading the previously trained model')
74 | self.episode_number = int(file[0][file[0].find("(")+1:file[0].find(")")])
75 | self.model = load_model(self.base_path + file[0])
76 |
77 |
78 |
79 | self.inputs,self.actions,self.probas,self.rewards,self.step_rewards = [],[],[],[],[]
80 | self.episode_rewards,self.episode_running_rewards = [],[]
81 | self.reward_sum = 0
82 | self.running_reward = 0
83 |
84 |
85 | def rebuild_model(self):
86 | self.model = self.build_fcc_model_with_regularization(self.H,input_dim = self.observation_space,output_dim = self.action_space,dropout = self.dropout,hidden_layers = self.hidden_layers)
87 |
88 |
89 |
90 | def build_fcc_model(self,H = 500,input_dim = 4,output_dim = 2):
91 | model = Sequential()
92 | model.add(Dense(H, input_dim=input_dim))
93 | model.add(Activation('relu'))
94 | model.add(Dense(H))
95 | model.add(Activation('relu'))
96 |
97 | sgd = SGD(lr=self.learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
98 |
99 | if output_dim <= 2:
100 | model.add(Dense(1))
101 | model.add(Activation('sigmoid'))
102 | model.compile(loss='mse',
103 | optimizer=sgd,
104 | metrics=['accuracy'])
105 | else:
106 | model.add(Dense(output_dim))
107 | model.add(Activation('softmax'))
108 | model.compile(loss='categorical_crossentropy',
109 | optimizer=sgd,
110 | metrics=['accuracy'])
111 |
112 | return model
113 |
114 |
115 |
116 | def build_fcc_model_with_regularization(self,H = 500,input_dim = 4,output_dim = 2,dropout = 0.0,hidden_layers = 1):
117 | model = Sequential()
118 | model.add(Dense(H, input_dim=input_dim,init='uniform'))
119 | model.add(Activation('relu'))
120 | model.add(Dropout(dropout))
121 |
122 | for i in range(hidden_layers):
123 | model.add(Dense(H,init='uniform'))
124 | model.add(Activation('relu'))
125 | model.add(Dropout(dropout))
126 |
127 | sgd = SGD(lr=self.learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
128 |
129 | if output_dim <= 2:
130 | model.add(Dense(1))
131 | model.add(Activation('sigmoid'))
132 | model.compile(loss='mse',
133 | optimizer=sgd,
134 | metrics=['accuracy'])
135 | else:
136 | model.add(Dense(output_dim))
137 | model.add(Activation('softmax'))
138 | model.compile(loss='categorical_crossentropy',
139 | optimizer=sgd,
140 | metrics=['accuracy'])
141 |
142 | return model
143 |
144 |
145 |
146 | def to_input(self,observation):
147 | if self.observation_to_vectorize:
148 | observation = self.vectorize_observation(observation,self.observation_space)
149 | return np.reshape(observation,(1,self.observation_space))
150 |
151 |
152 | def predict(self,observation,possible_moves = []):
153 |
154 | x = self.to_input(observation)
155 |
156 | # getting the probability of action
157 | probas = self.model.predict(x)[0]
158 |
159 |
160 | if len(possible_moves) > 0:
161 | probas += 1e-9
162 | probas *= possible_moves
163 | probas /= np.sum(probas)
164 |
165 | # sampling the correct action
166 | action= self.sample_action(probas)
167 |
168 | return x,action,probas
169 |
170 |
171 | def sample_action(self,probabilities):
172 | if len(probabilities)<=2:
173 | action = 1 if np.random.uniform() < probabilities[0] else 0
174 | else:
175 | action = np.random.choice(len(probabilities),p = np.array(probabilities))
176 |
177 | return action
178 |
179 | def vectorize_action(self,action):
180 | if self.action_space <= 2:
181 | return action
182 | else:
183 | onehot_vector = np.zeros(self.action_space)
184 | onehot_vector[action] = 1
185 | return onehot_vector
186 |
187 | def vectorize_observation(self,value,size):
188 | onehot_vector = np.zeros(size)
189 | onehot_vector[value] = 1
190 | return onehot_vector
191 |
192 |
193 |
194 | def record(self,input = None,action = None,proba = None,reward = None):
195 | if type(input) != type(None):
196 | self.inputs.append(input)
197 |
198 | if type(action) != type(None):
199 | self.actions.append(action)
200 |
201 | if type(proba) != type(None):
202 | self.probas.append(proba)
203 |
204 | if type(reward) != type(None):
205 | self.rewards.append(reward)
206 | self.reward_sum += reward
207 |
208 |
209 |
210 |
211 | def discounting_rewards(self,r,normalization = True):
212 | discounted_r = np.zeros_like(r)
213 | running_add = 0
214 | for t in reversed(range(0, r.size)):
215 | running_add = running_add * self.gamma + r[t]
216 | discounted_r[t] = running_add
217 |
218 | if normalization:
219 | discounted_r = np.subtract(discounted_r,np.mean(discounted_r),casting = "unsafe")
220 | discounted_r = np.divide(discounted_r,np.std(discounted_r),casting = "unsafe")
221 |
222 | return discounted_r
223 |
224 |
225 | def discount_rewards(self,normalization = True):
226 | rewards = np.vstack(self.rewards)
227 | return self.discounting_rewards(rewards,normalization)
228 |
229 |
230 | def record_episode(self):
231 | # self.step_rewards.extend(self.discount_rewards(normalization = True))
232 |
233 | # self.rewards = np.array([self.rewards[-1]]*len(self.rewards))
234 | # self.reward_sum = self.rewards[-1]*100
235 |
236 | self.reward_sum = np.sum(self.rewards)
237 | self.rewards = self.discount_rewards(normalization = False)
238 | self.step_rewards.extend(self.rewards)
239 |
240 |
241 | self.episode_rewards.append(self.reward_sum)
242 | self.running_reward = np.mean(self.episode_rewards)
243 | self.episode_number += 1
244 |
245 | def reset_episode(self):
246 | self.rewards = []
247 | self.reward_sum = 0
248 |
249 | def update_on_batch(self,show = False):
250 | if show: print('... Training on batch of size %s'%self.batch_size)
251 | self.actions = np.vstack(self.actions)
252 | self.probas = np.vstack(self.probas)
253 | self.step_rewards = np.vstack(self.step_rewards)
254 | self.inputs = np.vstack(self.inputs)
255 |
256 | self.targets = self.step_rewards * (self.actions - self.probas) + self.probas
257 | # print(self.targets)
258 |
259 | # TODO: add protection for the max reward
260 |
261 | self.model.train_on_batch(self.inputs,self.targets)
262 |
263 | self.inputs,self.actions,self.probas,self.step_rewards = [],[],[],[]
264 |
265 | def save_model(self):
266 | file = [x for x in os.listdir(self.base_path) if self.env_name in x]
267 | self.model.save(self.base_path+"%s(%s).h5"%(self.env_name,self.episode_number))
268 | if len(file)>0:
269 | os.remove(self.base_path+file[0])
270 | # self.model.save(self.base_path+"%s.h5"%(self.env_name))
271 |
272 |
273 | def build_cnn_model(self,input_dim,output_dim):
274 | model = Sequential()
275 |
276 | model.add(Convolution2D(32, 3, 3, border_mode='same',input_shape=input_dim))
277 | model.add(Activation('relu'))
278 | model.add(Convolution2D(32, 3, 3))
279 | model.add(Activation('relu'))
280 | model.add(MaxPooling2D(pool_size=(2, 2)))
281 | model.add(Dropout(0.25))
282 |
283 | model.add(Convolution2D(64, 3, 3, border_mode='same'))
284 | model.add(Activation('relu'))
285 | model.add(Convolution2D(64, 3, 3))
286 | model.add(Activation('relu'))
287 | model.add(MaxPooling2D(pool_size=(2, 2)))
288 | model.add(Dropout(0.25))
289 |
290 | model.add(Flatten())
291 | model.add(Dense(512))
292 | model.add(Activation('relu'))
293 | model.add(Dropout(0.5))
294 | model.add(Dense(output_dim))
295 | model.add(Activation('softmax'))
296 |
297 | # Let's train the model using RMSprop
298 | model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
299 |
300 | return model
301 |
--------------------------------------------------------------------------------
/0. Old/scripts/grid_world.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 | Grid World
8 |
9 | Started on the 08/08/2017
10 |
11 |
12 | References :
13 | - https://www.youtube.com/watch?v=A5eihauRQvo&t=5s
14 | - https://github.com/llSourcell/q_learning_demo
15 | - http://firsttimeprogrammer.blogspot.fr/2016/09/getting-ai-smarter-with-q-learning.html
16 |
17 |
18 | theo.alves.da.costa@gmail.com
19 | https://github.com/theolvs
20 | ------------------------------------------------------------------------
21 | """
22 |
23 |
24 | import os
25 | import matplotlib.pyplot as plt
26 | import pandas as pd
27 | import numpy as np
28 | import sys
29 | import random
30 | import time
31 |
32 |
33 |
34 |
35 |
36 | #===========================================================================================================
37 | # CELLS DEFINITION
38 | #===========================================================================================================
39 |
40 |
41 | class Cell(object):
42 | def __init__(self,reward = 0,is_terminal = False,is_occupied = False,is_wall = False,is_start = False):
43 | self.reward = reward
44 | self.is_terminal = is_terminal
45 | self.is_occupied = is_occupied
46 | self.is_wall = is_wall
47 | self.is_start = is_start
48 |
49 | def __repr__(self):
50 | if self.is_occupied:
51 | return "x"
52 | else:
53 | return " "
54 |
55 |
56 | def __str__(self):
57 | return self.__repr__()
58 |
59 |
60 |
61 |
62 | class Start(Cell):
63 | def __init__(self):
64 | super().__init__(is_occupied = True,is_start = True)
65 |
66 |
67 |
68 |
69 | class End(Cell):
70 | def __init__(self,reward = 10):
71 | super().__init__(reward = reward,is_terminal = True)
72 |
73 | def __repr__(self):
74 | return "O"
75 |
76 |
77 |
78 | class Hole(Cell):
79 | def __init__(self,reward = -10):
80 | super().__init__(reward = reward,is_terminal = True)
81 |
82 | def __repr__(self):
83 | return "X"
84 |
85 |
86 |
87 | class Wall(Cell):
88 | def __init__(self):
89 | super().__init__(is_wall = True)
90 |
91 | def __repr__(self):
92 | return "#"
93 |
94 |
95 |
96 |
97 | #===========================================================================================================
98 | # GRIDS DEFINITION
99 | #===========================================================================================================
100 |
101 |
102 |
103 |
104 | class Grid(object):
105 | def __init__(self,cells):
106 | self.grid = cells
107 |
108 |
109 | def __repr__(self):
110 | return "\n".join("".join(repr(cell) for cell in row) for row in self.grid)  # assumes a 2D list of cells
111 |
112 |
113 | def __str__(self):
114 | return self.__repr__()
115 |
116 |
--------------------------------------------------------------------------------
/0. Old/scripts/multi_armed_bandit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 | Multi Armed Bandit Problem
8 |
9 | Started on the 14/04/2017
10 |
11 |
12 | theo.alves.da.costa@gmail.com
13 | https://github.com/theolvs
14 | ------------------------------------------------------------------------
15 | """
16 |
17 |
18 | import os
19 | import matplotlib.pyplot as plt
20 | import pandas as pd
21 | import numpy as np
22 | import sys
23 |
24 |
25 | # Deep Learning (Keras, Tensorflow)
26 | import tensorflow as tf
27 | from keras.models import Sequential
28 | from keras.optimizers import SGD,RMSprop, Adam
29 | from keras.layers import Dense, Dropout, Activation, Flatten
30 | from keras.layers import MaxPooling2D,ZeroPadding2D,Conv2D
31 | from keras.utils.np_utils import to_categorical
32 |
33 |
34 |
35 |
36 | #===========================================================================================================
37 | # BANDIT DEFINITION
38 | #===========================================================================================================
39 |
40 |
41 |
42 | class Bandit(object):
43 | def __init__(self,p = None):
44 | '''Simple bandit initialization'''
45 | self.p = p if p is not None else np.random.random()
46 |
47 | def pull(self):
48 | '''Simulate a pull from the bandit
49 |
50 | '''
51 | if np.random.random() < self.p:
52 | return 1
53 | else:
54 | return -1
55 |
56 |
57 |
58 | def create_list_bandits(n = 4,p = None):
59 | if p is None: p = [None]*n
60 | bandits = [Bandit(p = p[i]) for i in range(n)]
61 | return bandits
62 |
63 |
64 |
65 |
66 |
67 | #===========================================================================================================
68 | # NEURAL NETWORK
69 | #===========================================================================================================
70 |
71 |
72 |
73 | def build_fcc_model(H = 100,lr = 0.1,dim = 4):
74 | model = Sequential()
75 | model.add(Dense(H, input_dim=dim))
76 | model.add(Activation('relu'))
77 | model.add(Dense(H))
78 | model.add(Activation('relu'))
79 |
80 | sgd = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
81 |
82 |
83 | model.add(Dense(dim))
84 | model.add(Activation('softmax'))
85 | model.compile(loss='categorical_crossentropy',
86 | optimizer=sgd,
87 | metrics=['accuracy'])
88 |
89 | return model
90 |
91 |
92 | model = build_fcc_model()
93 |
94 |
95 |
96 |
97 |
98 | #===========================================================================================================
99 | # SAMPLING ACTION
100 | #===========================================================================================================
101 |
102 |
103 | def sample_action(probas,epsilon = 0.2):
104 | probas = probas[0]
105 | if np.random.rand() < epsilon:
106 | choice = np.random.randint(0,len(probas))
107 | else:
108 | choice = np.random.choice(range(len(probas)),p = probas)
109 | return choice
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 | #===========================================================================================================
120 | # EPISODE
121 | #===========================================================================================================
122 |
123 |
124 |
125 |
126 | def run_episode(bandits,model,probas = None,train = True,epsilon = 0.2):
127 |
128 | if probas is None:
129 | probas = np.ones((1,len(bandits)))/len(bandits)
130 |
131 | # sampling action
132 | bandit_to_pull = sample_action(probas,epsilon = epsilon)
133 | action = to_categorical(bandit_to_pull,num_classes=probas.shape[1])
134 |
135 | # reward
136 | reward = bandits[bandit_to_pull].pull()
137 |
138 | # feed vectors
139 | X = action
140 | y = (action - probas)*reward
141 |
142 | if train:
143 | model.train_on_batch(X,y)
144 |
145 | # update probabilities
146 | probas = model.predict(X)
147 |
148 | return reward,probas
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 | #===========================================================================================================
157 | # GAME
158 | #===========================================================================================================
159 |
160 |
161 | def run_game(n_episodes = 100,lr = 0.1,n_bandits = 4,p = None,epsilon = 0.2):
162 |
163 | # DEFINE THE BANDITS
164 | bandits = create_list_bandits(n = n_bandits,p = p)
165 | probabilities_to_win = [x.p for x in bandits]
166 | best_bandit = np.argmax(probabilities_to_win)
167 | print(">> Probabilities to win : {} -> Best bandit : {}".format(probabilities_to_win,best_bandit))
168 |
169 | # INITIALIZE THE NEURAL NETWORK
170 | model = build_fcc_model(lr = lr,dim = n_bandits)
171 |
172 | # INITIALIZE BUFFERS
173 | rewards = []
174 | avg_rewards = []
175 | all_probas = np.array([])
176 |
177 | # EPISODES LOOP
178 | for i in range(n_episodes):
179 | print("\r[{}/{}] episodes completed".format(i+1,n_episodes),end = "")
180 |
181 | # Random choice at the first episode
182 | if i == 0:
183 | reward,probas = run_episode(bandits = bandits,model = model,epsilon = epsilon)
184 |
185 | # Updated probabilities at the following episodes
186 | else:
187 | reward,probas = run_episode(bandits = bandits,model = model,probas = probas)
188 |
189 |
190 | # Store the rewards and the probas
191 | rewards.append(reward)
192 | avg_rewards.append(np.mean(rewards))
193 | all_probas = np.append(all_probas,probas)
194 |
195 | print("")
196 |
197 |
198 | # GET THE BEST PREDICTED BANDIT
199 | predicted_bandit = np.argmax(probas)
200 | print(">> Predicted bandit : {} - {}".format(predicted_bandit,"CORRECT !!!" if predicted_bandit == best_bandit else "INCORRECT"))
201 |
202 |
203 | # PLOT THE EVOLUTION OF PROBABILITIES OVER TRAINING
204 | all_probas = all_probas.reshape((n_episodes,n_bandits)).transpose()
205 | plt.figure(figsize = (12,5))
206 | plt.title("Probabilities on Bandit choice - {} episodes - learning rate {}".format(n_episodes,lr))
207 | for i,p in enumerate(list(all_probas)):
208 | plt.plot(p,label = "Bandit {}".format(i),lw = 1)
209 |
210 | plt.plot(avg_rewards,linestyle="-", dashes=(5, 4),color = "black",lw = 0.5,label = "average running reward")
211 | plt.legend()
212 | plt.ylim([-0.2,1])
213 |
214 | plt.show()
215 |
216 |
217 |
218 |
219 |
220 |
--------------------------------------------------------------------------------
/0. Old/scripts/open_ai_gym.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 | First RL script done using Keras and policy gradients
8 |
9 | - Inspired by @steinbrecher script on https://gym.openai.com/evaluations/eval_usjJ7onVTTwrn43wrbBiAv
10 | - Still inspired by Karpathy's work too
11 |
12 | Started on the 30/12/2016
13 |
14 |
15 | https://github.com/rybskej/atari-py
16 | https://sourceforge.net/projects/vcxsrv/
17 |
18 |
19 | Environments that work with this script:
20 | - CartPole-v0
21 | - MountainCar-v0
22 | - Taxi-v1
23 |
24 |
25 | theo.alves.da.costa@gmail.com
26 | https://github.com/theolvs
27 | ------------------------------------------------------------------------
28 | """
29 |
30 |
31 | import numpy as np
32 | import gym
33 | import os
34 | from keras.models import load_model, Sequential
35 | from keras.layers import Dense, Activation
36 | from keras.optimizers import SGD, RMSprop
37 |
38 |
39 |
40 | #-------------------------------------------------------------------------------
41 |
42 |
43 |
44 |
45 |
46 | # def main(n_episodes = 20):
47 | # for i_episode in range(n_episodes):
48 | # observation = env.reset()
49 | # print(observation)
50 | # break
51 | # for t in range(1000):
52 | # if render: env.render
53 | # print(observation)
54 | # action = env.action_space.sample()
55 | # observation, reward, done, info = env.step(action)
56 | # if done:
57 | # print("Episode finished after {} timesteps".format(t+1))
58 | # break
59 |
60 |
61 |
62 |
63 |
64 |
65 | #-------------------------------------------------------------------------------
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 | class Brain():
75 | def __init__(self,env,env_name = "default",H = 500,reload = False):
76 |
77 | self.env_name = env_name
78 | self.base_path = "C:/Users/talvesdacosta/Documents/Perso/Data Science/15. Reinforcement Learning/3. Open AI Gym/models/"
79 | file = [x for x in os.listdir(self.base_path) if self.env_name in x]
80 |
81 | self.H = H
82 | self.gamma = 0.975
83 | self.batch_size = 10
84 |
85 | try:
86 | self.observation_space = env.observation_space.n
87 | self.observation_to_vectorize = True
88 | except Exception as e:
89 | self.observation_space = env.observation_space.shape[0]
90 | self.observation_to_vectorize = False
91 |
92 | self.action_space = env.action_space.n
93 |
94 |
95 | if len(file) == 0 or reload:
96 | print('>> Building a fully connected neural network')
97 | self.episode_number = 0
98 | self.model = self.build_fcc_model(H,input_dim = self.observation_space,output_dim = self.action_space)
99 | else:
100 | print('>> Loading the previously trained model')
101 | self.episode_number = int(file[0][file[0].find("(")+1:file[0].find(")")])
102 | self.model = load_model(self.base_path + file[0])
103 |
104 |
105 |
106 | self.inputs,self.actions,self.probas,self.rewards,self.step_rewards = [],[],[],[],[]
107 | self.episode_rewards,self.episode_running_rewards = [],[]
108 | self.reward_sum = 0
109 | self.running_reward = 0
110 |
111 |
112 |
113 |
114 | def build_fcc_model(self,H = 500,input_dim = 4,output_dim = 2):
115 | model = Sequential()
116 | model.add(Dense(H, input_dim=input_dim))
117 | model.add(Activation('relu'))
118 | model.add(Dense(H))
119 | model.add(Activation('relu'))
120 |
121 | sgd = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)
122 |
123 | if output_dim <= 2:
124 | model.add(Dense(1))
125 | model.add(Activation('sigmoid'))
126 | model.compile(loss='mse',
127 | optimizer=sgd,
128 | metrics=['accuracy'])
129 | else:
130 | model.add(Dense(output_dim))
131 | model.add(Activation('softmax'))
132 | model.compile(loss='categorical_crossentropy',
133 | optimizer=sgd,
134 | metrics=['accuracy'])
135 |
136 | return model
137 |
138 |
139 |
140 | def to_input(self,observation):
141 | if self.observation_to_vectorize:
142 | observation = self.vectorize_observation(observation,self.observation_space)
143 | return np.reshape(observation,(1,self.observation_space))
144 |
145 |
146 | def predict(self,observation):
147 |
148 | x = self.to_input(observation)
149 |
150 | # getting the probability of action
151 | probas = self.model.predict(x)[0]
152 |
153 | # sampling the correct action
154 | action= self.sample_action(probas)
155 |
156 | return x,action,probas
157 |
158 |
159 | def sample_action(self,probabilities):
160 | if len(probabilities)<=2:
161 | action = 1 if np.random.uniform() < probabilities[0] else 0
162 | else:
163 | action = np.random.choice(len(probabilities),p = np.array(probabilities))
164 |
165 | return action
166 |
167 | def vectorize_action(self,action):
168 | if self.action_space <= 2:
169 | return action
170 | else:
171 | onehot_vector = np.zeros(self.action_space)
172 | onehot_vector[action] = 1
173 | return onehot_vector
174 |
175 | def vectorize_observation(self,value,size):
176 | onehot_vector = np.zeros(size)
177 | onehot_vector[value] = 1
178 | return onehot_vector
179 |
180 |
181 |
182 | def record(self,input = None,action = None,proba = None,reward = None):
183 | if type(input) != type(None):
184 | self.inputs.append(input)
185 |
186 | if type(action) != type(None):
187 | self.actions.append(action)
188 |
189 | if type(proba) != type(None):
190 | self.probas.append(proba)
191 |
192 | if type(reward) != type(None):
193 | self.rewards.append(reward)
194 | self.reward_sum += reward
195 |
196 |
197 |
198 |
199 | def discounting_rewards(self,r,normalization = True):
200 | discounted_r = np.zeros_like(r)
201 | running_add = 0
202 | for t in reversed(range(0, r.size)):
203 | running_add = running_add * self.gamma + r[t]
204 | discounted_r[t] = running_add
205 |
206 | if normalization:
207 | discounted_r = np.subtract(discounted_r,np.mean(discounted_r),casting = "unsafe")
208 | discounted_r = np.divide(discounted_r,np.std(discounted_r),casting = "unsafe")
209 |
210 | return discounted_r
211 |
212 |
213 | def discount_rewards(self,normalization = True):
214 | rewards = np.vstack(self.rewards)
215 | return self.discounting_rewards(rewards,normalization)
216 |
217 |
218 | def record_episode(self):
219 | self.step_rewards.extend(self.discount_rewards(normalization = True))
220 | self.episode_rewards.append(self.reward_sum)
221 | self.running_reward = np.mean(self.episode_rewards)
222 | self.episode_number += 1
223 |
224 | def reset_episode(self):
225 | self.rewards = []
226 | self.reward_sum = 0
227 |
228 | def update_on_batch(self):
229 | print('... Training on batch of size %s'%self.batch_size)
230 | self.actions = np.vstack(self.actions)
231 | self.probas = np.vstack(self.probas)
232 | self.step_rewards = np.vstack(self.step_rewards)
233 | self.inputs = np.vstack(self.inputs)
234 |
235 | self.targets = self.step_rewards * (self.actions - self.probas) + self.probas
236 |
237 | # TODO: add protection for the max reward
238 |
239 | self.model.train_on_batch(self.inputs,self.targets)
240 |
241 | self.inputs,self.actions,self.probas,self.step_rewards = [],[],[],[]
242 |
243 | def save_model(self):
244 | file = [x for x in os.listdir(self.base_path) if self.env_name in x]
245 | self.model.save(self.base_path+"%s(%s).h5"%(self.env_name,self.episode_number))
246 | if len(file)>0:
247 | os.remove(self.base_path+file[0])
248 | # self.model.save(self.base_path+"%s.h5"%(self.env_name))
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 | def main(env_name = 'CartPole-v0',n_episodes = 20,render = False,reload = False,n_by_episode = 1000):
261 | env = gym.make(env_name)
262 | brain = Brain(env,env_name = env_name,reload = reload)
263 | # env.monitor.start(brain.base_path+'monitor/%s'%env_name)
264 |
265 |
266 | for i_episode in range(1,n_episodes+1):
267 | observation = env.reset()
268 | for t in range(n_by_episode):
269 | if render: env.render()
270 |
271 | x,action,proba = brain.predict(observation)
272 |
273 | observation, reward, done, info = env.step(action)
274 | action = brain.vectorize_action(action)
275 | brain.record(input = x,action = action,proba = proba,reward = reward)
276 |
277 | if done or t == n_by_episode - 1:
278 | brain.record_episode()
279 | print("Episode {} : total reward was {:0.03f} and running mean {:0.03f}".format(brain.episode_number, brain.reward_sum, brain.running_reward))
280 |
281 |
282 | if i_episode % brain.batch_size == 0:
283 | brain.update_on_batch()
284 |
285 | if i_episode % 100 == 0:
286 | brain.save_model()
287 |
288 |
289 | brain.reset_episode()
290 |
291 | break
292 |
293 | # env.monitor.close()
--------------------------------------------------------------------------------
/0. Solving Gym environments/breakout_with_rl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 |
15 |
16 |
17 |
18 |
19 | # Usual libraries
20 | import os
21 | import matplotlib.pyplot as plt
22 | import pandas as pd
23 | import numpy as np
24 | import sys
25 | import random
26 | import time
27 | from tqdm import tqdm
28 | import random
29 | import gym
30 | import numpy as np
31 |
32 |
33 | # Keras (Deep Learning)
34 | from keras.models import Sequential
35 | from keras.layers import Dense
36 | from keras.optimizers import Adam
37 |
38 |
39 | # Custom RL library
40 | import sys
41 | sys.path.insert(0,'..')
42 |
43 | from rl import utils
44 | from rl.agents.dqn2d_agent import DQN2DAgent
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 | #----------------------------------------------------------------
53 | # CONSTANTS
54 |
55 |
56 | N_EPISODES = 1000
57 | MAX_STEPS = 10000
58 | RENDER = True
59 | RENDER_EVERY = 50
60 | BATCH_SIZE = 256
61 | MAX_MEMORY = MAX_STEPS
62 |
63 |
64 |
65 | #----------------------------------------------------------------
66 | # MAIN LOOP
67 |
68 |
69 | if __name__ == "__main__":
70 |
71 | # Define the gym environment
72 | env = gym.make('Pong-v0')
73 |
74 | # Get the environment's action and observation space
75 | state_size = env.observation_space.shape
76 | action_size = env.action_space.n
77 |
78 | # Create the RL Agent
79 | agent = DQN2DAgent(state_size,action_size,max_memory = MAX_MEMORY)
80 |
81 | # Initialize a list to store the rewards
82 | rewards = []
83 |
84 |
85 |
86 | #---------------------------------------------
87 | # ITERATION OVER EPISODES
88 | for i_episode in range(N_EPISODES):
89 |
90 |
91 |
92 | # Reset the environment
93 | s = env.reset()
94 |
95 |
96 | #-----------------------------------------
97 | # EPISODE RUN
98 | for i_step in range(MAX_STEPS):
99 |
100 | # Render the environment
101 | if RENDER : env.render() #and (i_step % RENDER_EVERY == 0)
102 |
103 | # Store s before
104 | if i_step == 0:
105 | s_before = s
106 |
107 |
108 | # The agent chooses an action given the current state
109 | a = agent.act(s_before,s)
110 |
111 |
112 | # Take the action, get the reward from environment and go to the next state
113 | s_next,r,done,info = env.step(a)
114 |
115 | # print(r)
116 |
117 | # Tweaking the reward to make it negative when we lose
118 | # r = r if not done else -10
119 |
120 | # Remember the important variables
121 | agent.remember(
122 | np.expand_dims(s,axis=0),
123 | a,
124 | r,
125 | np.expand_dims(s_next,axis=0),
126 | np.expand_dims(s_before,axis=0),
127 | done)
128 |
129 | # Go to the next state
130 | s_before = s
131 | s = s_next
132 |
133 | # If the episode is terminated
134 | if done:
135 | print("Episode {}/{} finished after {} timesteps - epsilon : {:.2}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon))
136 | break
137 |
138 |
139 | #-----------------------------------------
140 |
141 | # Store the rewards
142 | rewards.append(i_step)
143 |
144 |
145 | # Training
146 | agent.train(batch_size = BATCH_SIZE)
147 |
148 |
149 |
150 |
151 |
152 | # Plot the average running rewards
153 | utils.plot_average_running_rewards(rewards)
154 |
--------------------------------------------------------------------------------
/0. Solving Gym environments/cartpole_with_deepqlearning.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 |
15 |
16 |
17 |
18 |
19 | # Usual libraries
20 | import os
21 | import matplotlib.pyplot as plt
22 | import pandas as pd
23 | import numpy as np
24 | import sys
25 | import random
26 | import time
27 | from tqdm import tqdm
28 | import random
29 | import gym
30 | import numpy as np
31 |
32 |
33 | # Keras (Deep Learning)
34 | from keras.models import Sequential
35 | from keras.layers import Dense
36 | from keras.optimizers import Adam
37 |
38 |
39 | # Custom RL library
40 | import sys
41 | sys.path.insert(0,'..')
42 |
43 | from rl import utils
44 | from rl.agents.dqn_agent import DQNAgent
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 | #----------------------------------------------------------------
53 | # CONSTANTS
54 |
55 |
56 | N_EPISODES = 1000
57 | MAX_STEPS = 1000
58 | RENDER = True
59 | RENDER_EVERY = 50
60 |
61 |
62 |
63 | #----------------------------------------------------------------
64 | # MAIN LOOP
65 |
66 |
67 | if __name__ == "__main__":
68 |
69 | # Define the gym environment
70 | env = gym.make('CartPole-v1')
71 |
72 | # Get the environment's action and observation space
73 | state_size = env.observation_space.shape[0]
74 | action_size = env.action_space.n
75 |
76 | # Create the RL Agent
77 | agent = DQNAgent(state_size,action_size)
78 |
79 | # Initialize a list to store the rewards
80 | rewards = []
81 |
82 |
83 |
84 |
85 |
86 | #---------------------------------------------
87 | # ITERATION OVER EPISODES
88 | for i_episode in range(N_EPISODES):
89 |
90 |
91 |
92 | # Reset the environment
93 | s = env.reset()
94 |
95 |
96 | #-----------------------------------------
97 | # EPISODE RUN
98 | for i_step in range(MAX_STEPS):
99 |
100 | # Render the environment
101 | if RENDER : env.render() #and (i_step % RENDER_EVERY == 0)
102 |
103 | # The agent chooses an action given the current state
104 | a = agent.act(s)
105 |
106 | # Take the action, get the reward from environment and go to the next state
107 | s_next,r,done,info = env.step(a)
108 |
109 | # Tweaking the reward to make it negative when we lose
110 | r = r if not done else -10
111 |
112 | # Remember the important variables
113 | agent.remember(s,a,r,s_next,done)
114 |
115 | # Go to the next state
116 | s = s_next
117 |
118 | # If the episode is terminated
119 | if done:
120 | print("Episode {}/{} finished after {} timesteps - epsilon : {:.2}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon))
121 | break
122 |
123 |
124 | #-----------------------------------------
125 |
126 | # Store the rewards
127 | rewards.append(i_step)
128 |
129 |
130 | # Training
131 | agent.train()
132 |
133 |
134 |
135 |
136 |
137 | # Plot the average running rewards
138 | utils.plot_average_running_rewards(rewards)
139 |
--------------------------------------------------------------------------------
/0. Solving Gym environments/pendulum_with_actorcritic.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 13/11/2017
9 |
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 |
15 |
16 |
17 |
18 |
19 | # Usual libraries
20 | import os
21 | import matplotlib.pyplot as plt
22 | import pandas as pd
23 | import numpy as np
24 | import sys
25 | import random
26 | import time
27 | from tqdm import tqdm
28 | import random
29 | import gym
30 | import numpy as np
31 |
32 |
33 | # Keras (Deep Learning)
34 | from keras.models import Sequential
35 | from keras.layers import Dense
36 | from keras.optimizers import Adam
37 | import tensorflow as tf
38 | import keras.backend as K
39 |
40 | # Custom RL library
41 | import sys
42 | sys.path.insert(0,'..')
43 |
44 | from rl import utils
45 | from rl.agents.actor_critic_agent import ActorCriticAgent
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 | #----------------------------------------------------------------
54 | # CONSTANTS
55 |
56 |
57 | N_EPISODES = 10000
58 | MAX_STEPS = 500
59 | RENDER = True
60 | RENDER_EVERY = 50
61 |
62 |
63 |
64 | #----------------------------------------------------------------
65 | # MAIN LOOP
66 |
67 |
68 | if __name__ == "__main__":
69 |
70 | # Define the gym environment
71 | sess = tf.Session()
72 | K.set_session(sess)
73 | env = gym.make('Pendulum-v0')
74 |
75 | # Define the agent
76 | agent = ActorCriticAgent(env, sess)
77 |
78 | # Initialize a list to store the rewards
79 | rewards = []
80 |
81 |
82 |
83 |
84 |
85 | #---------------------------------------------
86 | # ITERATION OVER EPISODES
87 | for i_episode in range(N_EPISODES):
88 |
89 |
90 |
91 | # Reset the environment
92 | s = env.reset()
93 |
94 | reward = 0
95 |
96 |
97 | #-----------------------------------------
98 | # EPISODE RUN
99 | for i_step in range(MAX_STEPS):
100 |
101 | # Render the environment
102 | if RENDER : env.render() #and (i_step % RENDER_EVERY == 0)
103 |
104 | # The agent chooses an action given the current state
105 | s = s.reshape((1, env.observation_space.shape[0]))
106 | a = agent.act(s)
107 | a = a.reshape((1, env.action_space.shape[0]))
108 |
109 | # Take the action, get the reward from environment and go to the next state
110 | s_next,r,done,_ = env.step(a)
111 | s_next = s_next.reshape((1, env.observation_space.shape[0]))
112 | reward += r
113 |
114 | # Tweaking the reward to make it negative when we lose
115 |
116 | # Remember the important variables
117 | agent.remember(s,a,r,s_next,done)
118 |
119 | # Go to the next state
120 | s = s_next
121 |
122 | # If the episode is terminated
123 | if done:
124 | print("Episode {}/{} finished after {} timesteps - epsilon : {:.2} - reward : {}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon,reward))
125 | break
126 |
127 |
128 | #-----------------------------------------
129 |
130 | # Store the rewards
131 | rewards.append(i_step)
132 |
133 |
134 | # Training
135 | agent.train()
136 |
137 |
138 |
139 |
140 |
141 | # Plot the average running rewards
142 | utils.plot_average_running_rewards(rewards)
143 |
--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rl_agents.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rl_agents.gif
--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rl_agents2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rl_agents2.gif
--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rl_agents3.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rl_agents3.gif
--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rules_agents.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rules_agents.gif
--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_random_rules_agents2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_random_rules_agents2.gif
--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_rules_rl_agents.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_rules_rl_agents.gif
--------------------------------------------------------------------------------
/1. Tic Tac Toe/images/game_two_random_agents.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/1. Tic Tac Toe/images/game_two_random_agents.gif
--------------------------------------------------------------------------------
/2. Data Center Cooling/README.md:
--------------------------------------------------------------------------------
1 | # Data Center Cooling
2 | 
3 |
4 | Inspired by [DeepMind's work](https://deepmind.com/blog/deepmind-ai-reduces-google-data-centre-cooling-bill-40/)
5 |
6 | This repository holds the development of a business question that can be solved with Reinforcement Learning: cooling data centers.
7 | - The environment, modelled in the fashion of OpenAI Gym environments
8 | - Solving the problem with different RL algorithms (Q-Learning, Deep-Q-Learning, Policy Gradients), as sketched below
9 | - An interactive Dash app to test the environment and the agents
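For reference, here is a minimal sketch of how the environment and a tabular Q agent fit together outside the app, mirroring ``run_episode``/``run_n_episodes`` in ``app.py`` (the episode count, learning rate and discount factor below are simply the defaults used there):

```python
from rl.envs.data_center_cooling import DataCenterCooling
from rl.agents.q_agent import QAgent

env = DataCenterCooling()
agent = QAgent(len(env.observation_space), len(env.action_space), lr=0.8, gamma=0.95)

for episode in range(2000):              # number of training episodes
    s = env.reset()
    for step in range(100):              # cap on steps per episode
        a = agent.act(s)                 # pick an action from the current policy
        s_next, r, done = env.step(a)    # the environment returns (state, reward, done)
        agent.train(s, a, r, s_next)     # update the Q-table
        s = s_next
        if done:
            break

figure = env.render(with_plotly=True)    # Plotly figure, as used by the Dash app
```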
10 |
11 |
12 | ***
13 | ## Data Center Cooling environment
14 |
15 | To try out the app, launch it with ``python app.py`` and go to ``localhost:8050``
16 |
17 |
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/2. Data Center Cooling/app.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | DATA CENTER COOLING APP
7 |
8 | Started on the 22/06/2017
9 |
10 |
11 | https://plot.ly/dash/live-updates
12 | https://plot.ly/dash/getting-started
13 | https://plot.ly/dash/getting-started-part-2
14 | https://plot.ly/dash/gallery/new-york-oil-and-gas/
15 |
16 | theo.alves.da.costa@gmail.com
17 | https://github.com/theolvs
18 | ------------------------------------------------------------------------
19 | """
20 |
21 | # USUAL
22 | import os
23 | import numpy as np
24 | from tqdm import tqdm
25 | from copy import deepcopy
26 |
27 | # DASH IMPORT
28 | import dash
29 | import dash_core_components as dcc
30 | import dash_html_components as html
31 | from dash.dependencies import Input, Output, Event, State
32 | import plotly.graph_objs as go
33 |
34 | import sys
35 | sys.path.append("C:/git/reinforcement-learning/")
36 |
37 |
38 |
39 | #--------------------------------------------------------------------------------
40 | from rl.envs.data_center_cooling import DataCenterCooling
41 | from rl.agents.q_agent import QAgent
42 | from rl.agents.dqn_agent import DQNAgent
43 | from rl.agents.sarsa_agent import SarsaAgent
44 | from rl import utils
45 |
46 |
47 |
48 |
49 | def run_episode(env,agent,max_step = 100,verbose = 1):
50 |
51 | s = env.reset()
52 |
53 | episode_reward = 0
54 |
55 | i = 0
56 | while i < max_step:
57 |
58 | # Choose an action
59 | a = agent.act(s)
60 |
61 | # Take the action, and get the reward from environment
62 | s_next,r,done = env.step(a)
63 |
64 | if verbose: print(s_next,r,done)
65 |
66 | # Update our knowledge in the Q-table
67 | agent.train(s,a,r,s_next)
68 |
69 | # Update the caches
70 | episode_reward += r
71 | s = s_next
72 |
73 | # If the episode is terminated
74 | i += 1
75 | if done:
76 | break
77 |
78 | return env,agent,episode_reward
79 |
80 |
81 |
82 |
83 | def run_n_episodes(env,type_agent = "Q Agent",n_episodes = 2000,lr = 0.8,gamma = 0.95):
84 |
85 | environment = deepcopy(env)
86 |
87 | # Initialize the agent
88 | states_size = len(env.observation_space)
89 | actions_size = len(env.action_space)
90 |
91 | if type_agent == "Q Agent":
92 | print("... Using Q Agent")
93 | agent = QAgent(states_size,actions_size,lr = lr,gamma = gamma)
94 | elif type_agent == "SARSA Agent":
95 | print("... Using SARSA Agent")
96 | agent = SarsaAgent(states_size,actions_size,lr = lr,gamma = gamma)
97 |
98 | # Store the rewards
99 | rewards = []
100 |
101 | # Experience replay
102 | for i in tqdm(range(n_episodes)):
103 |
104 | # Run the episode
105 | environment,agent,episode_reward = run_episode(environment,agent,verbose = 0)
106 | rewards.append(episode_reward)
107 |
108 | return environment,agent,rewards
109 |
110 |
111 | class Clicks(object):
112 | def __init__(self):
113 | self.count = 0
114 |
115 | reset_clicks = Clicks()
116 | train_clicks = Clicks()
117 | env = DataCenterCooling()
118 | np.random.seed()
119 |
120 | #---------------------------------------------------------------------------------
121 | # CREATE THE APP
122 | app = dash.Dash("Data Center Cooling")
123 |
124 |
125 | # # Making the app available offline
126 | offline = False
127 | app.css.config.serve_locally = offline
128 | app.scripts.config.serve_locally = offline
129 |
130 |
131 | style = {
132 | 'font-weight': 'bolder',
133 | 'font-family': 'Product Sans',
134 | }
135 |
136 | container_style = {
137 | "margin":"20px",
138 | }
139 |
140 |
141 |
142 | AGENTS = [{"label":x,"value":x} for x in ["Q Agent","SARSA Agent","Deep-Q-Network Agent","Policy Gradient Agent"]]
143 |
144 | #---------------------------------------------------------------------------------
145 | # LAYOUT
146 | app.layout = html.Div(children=[
147 |
148 |
149 |
150 |
151 |
152 | # HEADER FIRST CONTAINER
153 | html.Div([
154 | html.H2("Data Center Cooling",style = {'color': "rgba(117, 117, 117, 0.95)",**style}),
155 |
156 | html.Div([
157 | html.H4("Environment",style = {'color': "rgba(117, 117, 117, 0.95)",**style}),
158 | html.P("Cooling levels",id = "cooling"),
159 | dcc.Slider(min=10,max=100,step=10,value=10,id = "levels-cooling"),
160 | html.P("Cost factor",id = "cost-factor"),
161 | dcc.Slider(min=0.0,max=5,step=0.1,value=1,id = "levels-cost-factor"),
162 | html.P("Risk factor",id = "risk-factor"),
163 | dcc.Slider(min=0.0,max=5,step=0.1,value=1,id = "levels-risk-factor"),
164 | html.Br(),
165 | html.Button("Reset",id = "reset-env",style = style,n_clicks = 0),
166 | ],style = {"height":"50%"}),
167 |
168 |
169 | html.Div([
170 | html.H4("Agent",style = {'color': "rgba(117, 117, 117, 0.95)",**style}),
171 | dcc.Dropdown(id = "input-agent",options = AGENTS,value = "Q Agent",multi = False),
172 | html.P("N episodes",id = "input-episodes"),
173 | dcc.Slider(min=500,max=10000,step=500,value=5000,id = "n-episodes"),
174 | html.P("Learning rate",id = "input-lr"),
175 | dcc.Slider(min=0.001,max=1.0,step=0.005,value=0.1,id = "lr"),
176 | html.Br(),
177 | html.Button("Train",id = "training",style = style,n_clicks = 0),
178 | ],style = {"height":"50%"}),
179 |
180 |
181 |
182 | ],style={**style,**container_style,'width': '20%',"height":"800px", 'float' : 'left', 'display': 'inline'}, className="container"),
183 |
184 |
185 |
186 |
187 | # ANALYTICS CONTAINER
188 | html.Div([
189 |
190 | dcc.Graph(id='render',animate = False,figure = env.render(with_plotly = True),style = {"height":"100%"}),
191 |
192 |
193 | ],style={**style,**container_style,'width': '55%',"height":"800px", 'float' : 'right', 'display': 'inline'}, className="container"),
194 |
195 |
196 | ])
197 |
198 |
199 |
200 |
201 | #---------------------------------------------------------------------------------
202 | # CALLBACKS
203 |
204 |
205 |
206 | # Callback to stop the streaming
207 | @app.callback(
208 | Output("render","figure"),
209 | [Input('reset-env','n_clicks'),Input('training','n_clicks'),Input('levels-cost-factor','value'),Input('levels-risk-factor','value')],
210 | state = [State('levels-cooling','value'),State('lr','value'),State('n-episodes','value'),State('input-agent','value')]
211 |
212 | )
213 | def render(click_reset,click_training,cost_factor,risk_factor,levels_cooling,lr,n_episodes,type_agent):
214 |
215 |
216 | print("Reset ",click_reset," - ",reset_clicks.count)
217 | print("Train ",click_training," - ",train_clicks.count)
218 |
219 |
220 | if click_reset > reset_clicks.count:
221 | reset_clicks.count = click_reset
222 | env.__init__(levels_cooling = levels_cooling,risk_factor = risk_factor,cost_factor = cost_factor,keep_cooling = True)
223 |
224 | elif click_training > train_clicks.count:
225 | train_clicks.count = click_training
226 | env_temp,agent,rewards = run_n_episodes(env,n_episodes = n_episodes,lr = lr,type_agent = type_agent)
227 | utils.plot_average_running_rewards(rewards,"C:/Users/talvesdacosta/Desktop/results.png")
228 | # os.system("start "+"C:/Users/talvesdacosta/Desktop/results.png")
229 | env.cooling = env_temp.cooling
230 | else:
231 | env.risk_factor = risk_factor
232 | env.cost_factor = cost_factor
233 |
234 |
235 |
236 | return env.render(with_plotly = True)
237 |
238 |
239 |
240 |
241 | @app.callback(
242 | Output("cooling","children"),
243 | [Input('levels-cooling','value')])
244 | def update_cooling(value):
245 | env.levels_cooling = value
246 | env.define_cooling(value)
247 | return "Cooling levels : {}".format(value)
248 |
249 |
250 |
251 | @app.callback(
252 | Output("risk-factor","children"),
253 | [Input('levels-risk-factor','value')])
254 | def update_risk(value):
255 | return "Risk factor : {}".format(value)
256 |
257 |
258 |
259 | @app.callback(
260 | Output("cost-factor","children"),
261 | [Input('levels-cost-factor','value')])
262 | def update_cost(value):
263 | return "Cost factor : {}".format(value)
264 |
265 | @app.callback(
266 | Output("input-episodes","children"),
267 | [Input('n-episodes','value')])
268 | def update_episodes(value):
269 | return "N episodes : {}".format(value)
270 |
271 | @app.callback(
272 | Output("input-lr","children"),
273 | [Input('lr','value')])
274 | def update_lr(value):
275 | return "Learning rate : {}".format(value)
276 |
277 |
278 |
279 |
280 |
281 |
282 | #---------------------------------------------------------------------------------
283 | # ADD EXTERNAL CSS
284 |
285 | external_css = ["https://fonts.googleapis.com/css?family=Product+Sans:400,400i,700,700i",
286 | "https://cdn.rawgit.com/plotly/dash-app-stylesheets/2cc54b8c03f4126569a3440aae611bbef1d7a5dd/stylesheet.css"]
287 |
288 | for css in external_css:
289 | app.css.append_css({"external_url": css})
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 | #---------------------------------------------------------------------------------
298 | # RUN SERVER
299 | if __name__ == '__main__':
300 | app.run_server(debug=True)
301 | np.random.seed()
--------------------------------------------------------------------------------
/3. Robotics/minitaur.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 |
15 |
16 |
17 |
18 |
19 | # Usual libraries
20 | import os
21 | import matplotlib.pyplot as plt
22 | import pandas as pd
23 | import numpy as np
24 | import sys
25 | import random
26 | import time
27 | from tqdm import tqdm
28 | import random
29 | import gym
30 | import numpy as np
31 |
32 |
33 | # Keras (Deep Learning)
34 | from keras.models import Sequential
35 | from keras.layers import Dense
36 | from keras.optimizers import Adam
37 |
38 |
39 | # Custom RL library
40 | import sys
41 | sys.path.insert(0,'..')
42 |
43 | from rl import utils
44 | from rl.agents.dqn_agent import DQNAgent
45 |
46 | import pybullet_envs.bullet.minitaur_gym_env as e
47 |
48 |
49 |
50 |
51 |
52 | #----------------------------------------------------------------
53 | # CONSTANTS
54 |
55 |
56 | N_EPISODES = 1000
57 | MAX_STEPS = 2000
58 | RENDER = True
59 | RENDER_EVERY = 50
60 |
61 |
62 |
63 | #----------------------------------------------------------------
64 | # MAIN LOOP
65 |
66 |
67 | if __name__ == "__main__":
68 |
69 | # Define the gym environment
70 | env = e.MinitaurBulletEnv(render=True)
71 |
72 |     # Get the environment's action and observation space sizes
73 | state_size = env.observation_space.shape[0]
74 | action_size = env.action_space.shape[0]
75 |
76 | # Create the RL Agent
77 | agent = DQNAgent(state_size,action_size,low = -1,high = 1,action_type="continuous")
78 |
79 | # Initialize a list to store the rewards
80 | rewards = []
81 |
82 |
83 |
84 |
85 |
86 | #---------------------------------------------
87 | # ITERATION OVER EPISODES
88 | for i_episode in range(N_EPISODES):
89 |
90 |
91 |
92 | # Reset the environment
93 | s = env.reset()
94 | reward = 0
95 |
96 |
97 | #-----------------------------------------
98 | # EPISODE RUN
99 | for i_step in range(MAX_STEPS):
100 |
101 |             # Render the environment
102 |             if RENDER : env.render() #and (i_step % RENDER_EVERY == 0)
103 | 
104 |             # The agent chooses an action given the current state
105 |             a = agent.act(s)
106 |
107 | # Take the action, get the reward from environment and go to the next state
108 | s_next,r,done,info = env.step(a)
109 | reward += r
110 |
111 | # Remember the important variables
112 | agent.remember(s,a,r,s_next,done)
113 |
114 | # Go to the next state
115 | s = s_next
116 |
117 | # If the episode is terminated
118 | if done:
119 | print("Episode {}/{} finished after {} timesteps - epsilon : {:.2} - reward : {:.2}".format(i_episode+1,N_EPISODES,i_step,agent.epsilon,reward))
120 | break
121 |
122 |
123 | #-----------------------------------------
124 |
125 |         # Store the episode reward
126 |         rewards.append(reward)
127 |
128 |
129 | # Training
130 | agent.train(batch_size = 128)
131 |
132 |
133 |
134 |
135 |
136 | # Plot the average running rewards
137 | utils.plot_average_running_rewards(rewards)
138 |
--------------------------------------------------------------------------------
/4. Chrome Dino/README.md:
--------------------------------------------------------------------------------
1 | # Chrome Dino Project
2 | ## Playing and solving the Chrome Dinosaur Game with Evolution Strategies and PyTorch
3 | 
4 |
5 |
6 | ##### Summary
7 | - Capturing image from the game - **OK**
8 | - Allowing control programmatically - **OK**
9 | - Trying a simple implementation of a rules-based agent with classic CV algorithms - **OK**
10 | - Capturing scores for fitness and reward - **OK**
11 | - Creating the environment for RL - **OK**
12 | - Developing an RL agent that learns via evolution strategies - **OK**
13 | - Different experiments on both the agent and the learning method
14 |
15 |
16 | ##### Ideas
17 | - Taking as input to the neural network:
18 |     - The boundaries of the obstacles in a 1D vector
19 |     - The raw image
20 |     - The processed image
21 | - Initialize the agent with a hard-coded policy
22 | - Combine the RL agent and the rules-based agent
23 | - Try other evolution strategies
24 | - Crossover on the fitness
25 | - Simple ES
26 | - CMA-ES
27 |
28 |
29 | ##### Experiments :
30 | 1. **Genetic algorithm** : a generation of 20 dinos, of which 5 survive and produce 10 offspring; 10 random dinos are added to bring the population back to 20. This did not work at all: after 100 generations the average score was still around 50, i.e. the dinos stopped at the first obstacle. This was tested without mutations. The neural network is a very shallow MLP with one 100-unit hidden layer.
31 | 2. **Genetic algorithm** : a generation of 40 dinos, of which 10 survive and produce 45 offspring, but only 40 are selected at random to recreate the population. Mutations with Gaussian noise were added at this step. Tested with a shallow MLP as well as with a simple logistic regression in PyTorch.
32 | 3. **Genetic algorithm** : a generation of 50 dinos, of which 12 survive and produce 66 offspring, but only 38 are selected at random to recreate the population. The input is now a vector with the x-axis positions of the next 2 obstacles, so I went back to a shallow MLP with the structure ``(2 input features, 50 hidden units, 1 output)`` giving the probability to jump, and used a high mutation factor on the Gaussian noise to get more exploration. The dinosaurs reached a max score of 600 in about 70 generations of 50 dinos (6 hours on my laptop), but they failed when reaching the birds, which were not included in the training.
33 | 4. **Evolution Strategy** : I went back to a simple evolution strategy to focus the training on the dinos with the best behavior. The top 10 or 20% are selected at each generation, then the next generation is created from the fittest by adding Gaussian noise as mutation (a minimal sketch of this loop is given after the capture below). With this strategy the dinosaurs reach a max score of 600 in about 20 generations of 50 dinos. This works better than the previous approach, but it keeps falling into local optima, with dinos jumping all the time to maximize their score.
34 | 5. **Evolution Strategy** : to correct the bad behavior of jumping all the time, I added a penalty when moves are made while there are no obstacles, by counting the number of obstacles passed and the number of moves. The new reward is modelled in the fashion of the Bellman equation, by adding a discounted term to the previous reward. With this correction the "always-jumping" behavior disappears after one generation, and within a few generations the dinos reach a good enough policy. In only 10 generations of 10 dinos (about 10 minutes on my laptop) we easily reach the max score of 600 obtained previously, with a decent average policy. But new issues arise: birds that appear after 600 points and require ducking, speed increasing over time, and long obstacles that require jumping earlier. Here is a screen capture of the game at this stage:
35 | 
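
Below is a minimal numpy sketch of the selection-and-mutation loop used in experiments 4 and 5. It is illustrative only: the real dinos are PyTorch models, and ``play_game`` is a hypothetical stand-in for a rollout of the Chrome Dino game returning the final score.

```python
import numpy as np

def next_generation(population, play_game, top=0.2, sigma=0.1):
    """One generation of the simple evolution strategy:
    keep the fittest weight vectors, then refill the population by adding
    Gaussian noise (mutation) to copies of the survivors."""
    scores = np.array([play_game(w) for w in population])
    order = np.argsort(scores)[::-1]                      # best first
    n_keep = max(1, int(top * len(population)))
    survivors = [population[i] for i in order[:n_keep]]
    children = []
    while len(survivors) + len(children) < len(population):
        parent = survivors[np.random.randint(n_keep)]
        children.append(parent + sigma * np.random.randn(*parent.shape))
    return survivors + children
```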
36 |
37 |
38 |
39 | ##### Misc
40 | - Finding parameter on when to jump
41 | - Logreg/NN on the first and second position of obstacles
42 | - ML + Heuristics model
43 | - Bayesian priors
--------------------------------------------------------------------------------
/4. Chrome Dino/experiments.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | GENETIC ALGORITHMS EXPERIMENTS
7 | Started on the 2018/01/03
8 | theo.alves.da.costa@gmail.com
9 | https://github.com/theolvs
10 | ------------------------------------------------------------------------
11 | """
12 |
13 | from scipy import stats
14 | import seaborn as sns
15 | import os
16 | import matplotlib.pyplot as plt
17 | import pandas as pd
18 | import numpy as np
19 | import sys
20 | import time
21 | from tqdm import tqdm
22 | import itertools
23 |
24 |
25 |
26 |
27 | #=============================================================================================================================
28 | # DISTRIBUTIONS
29 | #=============================================================================================================================
30 |
31 |
32 |
33 |
34 |
35 | class Dist(object):
36 | def __init__(self,mu = None,std = None,label = None):
37 | self.mu = np.random.rand()*20 - 10 if mu is None else mu
38 | self.std = np.random.rand()*10 if std is None else std
39 | self.label = "" if not label else " - "+label
40 | self.func = lambda x : stats.norm.cdf(x,loc = self.mu,scale = self.std)
41 |
42 | def __repr__(self,markdown = False):
43 | return "Norm {1}mu={2}{0}, {0}std={3}{0}{4}".format("$" if markdown else "","$\\" if markdown else "",
44 | round(self.mu,2),round(self.std,2),self.label)
45 |
46 | def plot(self,fill = True):
47 | x = np.linspace(-20, 20, 100)
48 | y = stats.norm.pdf(x,loc = self.mu,scale = self.std)
49 | plt.plot(x,y,label = self.__repr__(markdown = True))
50 | if fill:
51 | plt.fill_between(x, 0, y, alpha=0.4)
52 |
53 |
54 | def __add__(self,other):
55 | mu = np.mean([self.mu,other.mu])
56 | std = np.mean([self.std,other.std])
57 | return Dist(mu,std)
58 |
59 | def mutate(self,alpha = 1):
60 | self.mu = self.mu + 1/(1+np.log(1+alpha)) * np.random.randn()
61 | self.std = max(self.std + 1/(1+np.log(1+alpha)) * np.random.randn(),0.5)
62 |
63 | def fitness(self,x):
64 | return 1 - stats.kstest(x,self.func).statistic
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 | class Population(object):
76 | def __init__(self,distributions = None,n = 100):
77 | if distributions is not None:
78 | self.distributions = distributions
79 | else:
80 | self.distributions = [Dist() for i in range(n)]
81 |
82 | def __getitem__(self,key):
83 | if type(key) == tuple or type(key) == list:
84 | d = []
85 | for i in key:
86 | d.append(self.distributions[i])
87 | return d
88 | else:
89 | return self.distributions[key]
90 |
91 | def __iter__(self):
92 | return iter(self.distributions)
93 |
94 | def __len__(self):
95 | return len(self.distributions)
96 |
97 | def plot(self,title = "Normal distributions",figsize = None):
98 | if figsize:
99 | plt.figure(figsize = figsize)
100 | plt.title(title)
101 | fill = len(self) < 5
102 | for d in self:
103 | d.plot(fill = fill)
104 | plt.legend()
105 | plt.xlabel("x")
106 | plt.show()
107 |
108 | def evaluate(self,x):
109 | fitnesses = [(i,dist.fitness(x)) for i,dist in enumerate(self)]
110 | indices,fitnesses = zip(*sorted(fitnesses,key = lambda x : x[1],reverse = True))
111 | return indices,fitnesses
112 |
113 | def selection(self,x,top = 0.1):
114 | indices,fitnesses = self.evaluate(x)
115 | n = int(top*len(fitnesses))
116 | return indices[:n]
117 |
118 |
119 | def crossover(self,indices):
120 | combinations = list(itertools.combinations(indices,2))
121 | np.random.shuffle(combinations)
122 | combinations = combinations[:len(self)]
123 | new_population = []
124 | for i,j in combinations:
125 | new_population.append(self[i]+self[j])
126 | self.distributions = new_population
127 |
128 | def mutate(self,generation = 1):
129 | for d in self:
130 | d.mutate(generation)
131 |
132 |
133 | def evolve(self,x,top = 0.25,n_generations = 20,last_selection = True):
134 | all_fitnesses = [self.evaluate(x)[1]]
135 |
136 | for generation in tqdm(range(n_generations)):
137 |
138 | indices = self.selection(x,top)
139 | self.crossover(indices)
140 | self.mutate(generation)
141 |
142 | indices,fitnesses = self.evaluate(x)
143 | all_fitnesses.append(fitnesses)
144 |
145 | self._plot_fitnesses(all_fitnesses)
146 |
147 | if last_selection:
148 | indices = self.selection(x,top)
149 |
150 | return Population(self[indices])
151 |
152 |
153 | def _plot_fitnesses(self,fitnesses):
154 | sups = []
155 | infs = []
156 | means = []
157 | for step in fitnesses:
158 | sups.append(np.max(step))
159 | infs.append(np.min(step))
160 | means.append(np.mean(step))
161 |
162 | plt.figure(figsize=(10,6))
163 | plt.plot(means)
164 | plt.fill_between(range(len(means)),sups,infs, alpha = 0.2)
165 | plt.xlabel('# Generation')
166 | plt.ylabel('Fitness')
167 | plt.legend()
168 | plt.show()
169 |
170 |
171 |
172 |
173 |
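#=============================================================================================================================
# USAGE SKETCH (illustrative only, not part of the original experiments)
#=============================================================================================================================
# Fit a population of random normal distributions to a sample drawn from N(3, 2)
# with the selection / crossover / mutation loop defined above:
#
#   x = np.random.randn(1000) * 2 + 3
#   pop = Population(n=100)
#   fittest = pop.evolve(x, top=0.25, n_generations=20)
#   fittest.plot()

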
174 | #=============================================================================================================================
175 | # LOGREG
176 | #=============================================================================================================================
177 |
178 |
179 |
180 | import torch
181 | from torch.autograd import Variable
182 | import torch.nn as nn
183 | import torch.nn.functional as F
184 |
185 |
186 |
187 |
188 | class LogReg(torch.nn.Module):
189 | def __init__(self, n_feature,n_output = 1,alpha = 10e-1):
190 | self.alpha = alpha
191 | self.args = n_feature,n_output
192 | super(LogReg, self).__init__()
193 | self.out = torch.nn.Linear(n_feature,n_output,bias = False) # output layer
194 |
195 | def forward(self, x):
196 | x = Variable(torch.FloatTensor(x))
197 | x = F.sigmoid(self.out(x))
198 | return x
199 |
200 |
201 | def __add__(self,other):
202 | new = LogReg(*self.args)
203 | new.out.weight.data = torch.FloatTensor(0.5 * (self.out.weight.data.numpy() + other.out.weight.data.numpy()))
204 | return new
205 |
206 |
207 | def mutate(self,generation):
208 | out = self.out.weight.data.numpy()
209 | noise_out = self.alpha * np.random.randn(*out.shape)
210 | self.out.weight.data = torch.FloatTensor(self.out.weight.data.numpy() + noise_out)
211 |
212 |
213 | def evaluate(self,x,y):
214 | pred = self.forward(x).data.numpy()
215 | loss_1 = np.sum(np.log(pred + 10e-9)*y.reshape(-1,1))
216 | loss_0 = np.sum(np.log(1-pred + 10e-9)*(1-y).reshape(-1,1))
217 | return loss_1 + loss_0
218 |
219 |
220 | def plot_coefs(self):
221 | plt.figure(figsize = (15,4))
222 | plt.title("Coefficients")
223 | plt.axhline(0,c = "black")
224 | plt.plot(self.out.weight.data.numpy()[0])
225 | plt.xlabel("# Pixel")
226 | plt.show()
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 | class PopulationLogReg(object):
236 | def __init__(self,x,y,regs = None,n = 20,top = 0.25,**kwargs):
237 |
238 | self.x = x
239 | self.y = y
240 | self.kwargs = kwargs
241 |
242 | if regs is None:
243 | self.regs = [LogReg(**kwargs) for i in range(n)]
244 | else:
245 | self.regs = regs
246 |
247 |
248 | def __getitem__(self,key):
249 | if type(key) == tuple or type(key) == list:
250 | d = []
251 | for i in key:
252 | d.append(self.regs[i])
253 | return d
254 | else:
255 | return self.regs[key]
256 |
257 | def __iter__(self):
258 | return iter(self.regs)
259 |
260 | def __len__(self):
261 | return len(self.regs)
262 |
263 |
264 |
265 | def evaluate(self):
266 | fitnesses = [(i,element.evaluate(self.x,self.y)) for i,element in enumerate(self)]
267 | indices,fitnesses = zip(*sorted(fitnesses,key = lambda x : x[1],reverse = True))
268 | return indices,fitnesses
269 |
270 |
271 |
272 | def selection(self,top = 0.5):
273 | indices,fitnesses = self.evaluate()
274 | n = int(top*len(fitnesses))
275 | return indices[:n]
276 |
277 |
278 |
279 | def crossover(self,indices):
280 | combinations = list(itertools.combinations(indices,2))
281 | np.random.shuffle(combinations)
282 | combinations = combinations[:len(self)]
283 | new_population = []
284 | for i,j in combinations:
285 | new_population.append(self[i]+self[j])
286 |
287 | if len(new_population) < len(self):
288 | new_population.extend([LogReg(**self.kwargs) for i in range(len(self)-len(new_population))])
289 | self.regs = new_population
290 |
291 |
292 |
293 | def mutate(self,generation):
294 | for d in self:
295 | d.mutate(generation)
296 |
297 |
298 |
299 | def evolve(self,top = 0.25,n_generations = 20,last_selection = True):
300 | n_fittest = int(top*len(self))
301 | offsprings = len(list(itertools.combinations(range(n_fittest),2)))
302 |         print("- Population size : {}".format(len(self)))
303 | print("- Fittest : {}".format(n_fittest))
304 | print("- Offsprings : {}".format(offsprings))
305 |
306 | all_fitnesses = [self.evaluate()[1]]
307 |
308 | for generation in tqdm(range(n_generations)):
309 |
310 | indices = self.selection(top)
311 | self.crossover(indices)
312 | self.mutate(generation)
313 |
314 | indices,fitnesses = self.evaluate()
315 | all_fitnesses.append(fitnesses)
316 |
317 | self._plot_fitnesses(all_fitnesses)
318 |
319 | if last_selection:
320 | indices = self.selection(top)
321 |
322 | return PopulationLogReg(self.x,self.y,regs = self[indices])
323 |
324 |
325 |
326 | def _plot_fitnesses(self,fitnesses):
327 |
328 | from sklearn.linear_model import LogisticRegression
329 | lr = LogisticRegression()
330 | lr.fit(self.x,self.y)
331 |         pred_bench = lr.predict_proba(self.x)[:,1].reshape(-1,1)  # keep the positive-class probability only
332 |         loss_bench = np.sum(np.log(pred_bench + 10e-9)*self.y.reshape(-1,1)) + np.sum(np.log(1-pred_bench + 10e-9)*(1-self.y).reshape(-1,1))
333 |
334 | sups = []
335 | infs = []
336 | means = []
337 | for step in fitnesses:
338 | sups.append(np.max(step))
339 | infs.append(np.min(step))
340 | means.append(np.mean(step))
341 |
342 | plt.figure(figsize=(10,6))
343 | plt.plot(means)
344 | plt.fill_between(range(len(means)),sups,infs, alpha = 0.2)
345 | plt.axhline(loss_bench)
346 | plt.xlabel('# Generation')
347 | plt.ylabel('Fitness')
348 | plt.legend()
349 | plt.show()
350 |
351 |
352 |
--------------------------------------------------------------------------------
/4. Chrome Dino/images/capture1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/capture1.png
--------------------------------------------------------------------------------
/4. Chrome Dino/images/dino_hardcoded_agent.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/dino_hardcoded_agent.gif
--------------------------------------------------------------------------------
/4. Chrome Dino/images/dino_ml_agent1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/dino_ml_agent1.gif
--------------------------------------------------------------------------------
/4. Chrome Dino/images/dino_ml_agent1_bad.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/4. Chrome Dino/images/dino_ml_agent1_bad.gif
--------------------------------------------------------------------------------
/5. Delivery Optimization/README.md:
--------------------------------------------------------------------------------
1 | # Delivery optimization with Reinforcement Learning
2 | 
3 |
4 | This folder contains experiments to solve a transportation optimization problem using **Reinforcement Learning** algorithms.
5 | It reuses the code of the RL agents previously created in this repo.
6 | 
7 | > The overall goal is to optimize the routing between deliveries via **experience replay**,
8 | > and to be robust to anomalies such as traffic slowing down the vehicles in a zone
9 | 
10 | ##### Preliminary remarks
11 | Such a problem (the Travelling Salesman Problem) has many possible solutions, including brute force and heuristics.
12 | The goal here was to demonstrate the use of Reinforcement Learning, in particular **when the cost function between two points is stochastic**.
13 | It also shows a different kind of resolution, with an algorithm that could be used in a live system and automatically improve over time towards the best strategies.
14 |
15 |
16 | # The environment
17 |
18 | ## Environment implementation
19 |
20 | The whole environment was coded from scratch, with the following parameters (see the instantiation example below):
21 | - Number of stops for delivery
22 | - Traffic zone size (optional)
23 | - Traffic intensity (optional)
24 |
25 | *The conventions used are the same as for OpenAI Gym environments*
26 | *Only numpy and other basic libraries are used here for the environment*
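
For instance, an environment with a traffic zone can be created as follows (the parameter values are illustrative):

```python
from delivery import DeliveryEnvironment

env = DeliveryEnvironment(
    n_stops=100,              # number of delivery stops
    method="traffic_box",     # stochastic rewards with a traffic zone
    box_size=0.2,             # relative size of the traffic zone
    traffic_intensity=5,      # how much the zone slows the vehicle down
)
```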
27 |
28 | ##### Base environment with one trajectory
29 | 
30 |
31 | ##### Base environment with 500 stops
32 | 
33 |
34 | ##### Base environment with traffic zone
35 | 
36 |
37 | ## Rewards
38 | - The reward between two delivery stops is simply the time elapsed for the trip, computed as the Euclidean distance between the two points plus Gaussian noise (see the snippet below)
39 | - If the trajectory between two stops goes through the traffic zone, the time elapsed is increased by a noise term proportional to the distance travelled through the zone and to the traffic intensity parameter
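
As a minimal illustration of the first point (simplified from the actual environment code in ``delivery.py``):

```python
import numpy as np

def time_elapsed(xy, i, j):
    """Time between stops i and j: Euclidean distance plus Gaussian noise.
    `xy` is the (n_stops, 2) array of stop coordinates."""
    return np.linalg.norm(xy[i] - xy[j]) + np.random.randn()
```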
40 |
41 |
42 | # The algorithm
43 |
44 | ## Q-Learning
45 | - A simple **Q-Learning** algorithm already gave interesting results.
46 | - The **reward** used is the opposite of the time elapsed returned by the environment
47 | - An **epsilon-greedy** strategy allows the agent to discover new paths and strategies while exploring (a simplified update step is sketched below)
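
Below is a simplified sketch of the epsilon-greedy Q-Learning step. It is illustrative only and does not reproduce the exact API of the ``QAgent``/``DeliveryQAgent`` classes; ``travel_time`` stands in for the stochastic time returned by the environment.

```python
import numpy as np

n_stops, lr, gamma, epsilon = 10, 0.1, 0.95, 0.1
Q = np.zeros((n_stops, n_stops))          # Q[s, a]: value of travelling from stop s to stop a

def q_learning_step(s, travel_time):
    """Pick the next stop with an epsilon-greedy policy and update the Q-table."""
    if np.random.rand() < epsilon:
        a = np.random.randint(n_stops)    # exploration: random next stop
    else:
        a = int(np.argmax(Q[s]))          # exploitation: best known next stop
    r = -travel_time(s, a)                # reward = opposite of the time elapsed
    Q[s, a] += lr * (r + gamma * np.max(Q[a]) - Q[s, a])
    return a
```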
48 |
49 | ##### Training
50 | Over successive episodes (experience replay), the delivery takes less and less time
51 | 
52 |
53 | ### Results
54 | ##### 50 stops with no traffic
55 | 
56 |
57 |
58 | ##### 100 stops with no traffic
59 | 
60 |
61 | ##### 500 stops with no traffic
62 | 
63 |
64 | ##### 100 stops with intense traffic
65 | 
66 |
67 | ##### 500 stops with intense traffic
68 | 
69 |
70 |
71 | # Next steps
72 | - Test other simple algorithms like SARSA
73 | - Switch from discrete to continuous problems with Deep-Q-Learning (start including continuous observation space) and then DDPG (including continuous action space)
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/5. Delivery Optimization/Routing optimization with Deep Reinforcement Learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Routing optimization using Deep Reinforcement Learning"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "ExecuteTime": {
15 | "end_time": "2019-09-17T17:47:22.119995Z",
16 | "start_time": "2019-09-17T17:47:20.289509Z"
17 | }
18 | },
19 | "outputs": [],
20 | "source": [
21 | "# Base Data Science snippet\n",
22 | "import pandas as pd\n",
23 | "import numpy as np\n",
24 | "import matplotlib.pyplot as plt\n",
25 | "import os\n",
26 | "import time\n",
27 | "from tqdm import tqdm_notebook\n",
28 | "\n",
29 | "%matplotlib inline\n",
30 | "%load_ext autoreload\n",
31 | "%autoreload 2"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": []
40 | }
41 | ],
42 | "metadata": {
43 | "kernelspec": {
44 | "display_name": "Python 3",
45 | "language": "python",
46 | "name": "python3"
47 | },
48 | "language_info": {
49 | "codemirror_mode": {
50 | "name": "ipython",
51 | "version": 3
52 | },
53 | "file_extension": ".py",
54 | "mimetype": "text/x-python",
55 | "name": "python",
56 | "nbconvert_exporter": "python",
57 | "pygments_lexer": "ipython3",
58 | "version": "3.6.4"
59 | },
60 | "toc": {
61 | "base_numbering": 1,
62 | "nav_menu": {},
63 | "number_sections": true,
64 | "sideBar": true,
65 | "skip_h1_title": false,
66 | "title_cell": "Table of Contents",
67 | "title_sidebar": "Contents",
68 | "toc_cell": false,
69 | "toc_position": {},
70 | "toc_section_display": true,
71 | "toc_window_display": false
72 | }
73 | },
74 | "nbformat": 4,
75 | "nbformat_minor": 2
76 | }
77 |
--------------------------------------------------------------------------------
/5. Delivery Optimization/delivery.py:
--------------------------------------------------------------------------------
1 | # Base Data Science snippet
2 | import pandas as pd
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import os
6 | import time
7 | from tqdm import tqdm_notebook
8 | from scipy.spatial.distance import cdist
9 | import imageio
10 | from matplotlib.patches import Rectangle
11 | from matplotlib.collections import PatchCollection
12 |
13 | plt.style.use("seaborn-dark")
14 |
15 | import sys
16 | sys.path.append("../")
17 | from rl.agents.q_agent import QAgent
18 |
19 |
20 |
21 |
22 | class DeliveryEnvironment(object):
23 | def __init__(self,n_stops = 10,max_box = 10,method = "distance",**kwargs):
24 |
25 | print(f"Initialized Delivery Environment with {n_stops} random stops")
26 | print(f"Target metric for optimization is {method}")
27 |
28 | # Initialization
29 | self.n_stops = n_stops
30 | self.action_space = self.n_stops
31 | self.observation_space = self.n_stops
32 | self.max_box = max_box
33 | self.stops = []
34 | self.method = method
35 |
36 | # Generate stops
37 | self._generate_constraints(**kwargs)
38 | self._generate_stops()
39 | self._generate_q_values()
40 | self.render()
41 |
42 | # Initialize first point
43 | self.reset()
44 |
45 |
46 | def _generate_constraints(self,box_size = 0.2,traffic_intensity = 5):
47 |
48 | if self.method == "traffic_box":
49 |
50 | x_left = np.random.rand() * (self.max_box) * (1-box_size)
51 | y_bottom = np.random.rand() * (self.max_box) * (1-box_size)
52 |
53 | x_right = x_left + np.random.rand() * box_size * self.max_box
54 | y_top = y_bottom + np.random.rand() * box_size * self.max_box
55 |
56 | self.box = (x_left,x_right,y_bottom,y_top)
57 | self.traffic_intensity = traffic_intensity
58 |
59 |
60 |
61 | def _generate_stops(self):
62 |
63 | if self.method == "traffic_box":
64 |
65 | points = []
66 | while len(points) < self.n_stops:
67 | x,y = np.random.rand(2)*self.max_box
68 | if not self._is_in_box(x,y,self.box):
69 | points.append((x,y))
70 |
71 | xy = np.array(points)
72 |
73 | else:
74 | # Generate geographical coordinates
75 | xy = np.random.rand(self.n_stops,2)*self.max_box
76 |
77 | self.x = xy[:,0]
78 | self.y = xy[:,1]
79 |
80 |
81 | def _generate_q_values(self,box_size = 0.2):
82 |
83 | # Generate actual Q Values corresponding to time elapsed between two points
84 | if self.method in ["distance","traffic_box"]:
85 | xy = np.column_stack([self.x,self.y])
86 | self.q_stops = cdist(xy,xy)
87 | elif self.method=="time":
88 | self.q_stops = np.random.rand(self.n_stops,self.n_stops)*self.max_box
89 | np.fill_diagonal(self.q_stops,0)
90 | else:
91 | raise Exception("Method not recognized")
92 |
93 |
94 | def render(self,return_img = False):
95 |
96 | fig = plt.figure(figsize=(7,7))
97 | ax = fig.add_subplot(111)
98 | ax.set_title("Delivery Stops")
99 |
100 | # Show stops
101 | ax.scatter(self.x,self.y,c = "red",s = 50)
102 |
103 | # Show START
104 | if len(self.stops)>0:
105 | xy = self._get_xy(initial = True)
106 | xytext = xy[0]+0.1,xy[1]-0.05
107 | ax.annotate("START",xy=xy,xytext=xytext,weight = "bold")
108 |
109 | # Show itinerary
110 | if len(self.stops) > 1:
111 | ax.plot(self.x[self.stops],self.y[self.stops],c = "blue",linewidth=1,linestyle="--")
112 |
113 | # Annotate END
114 | xy = self._get_xy(initial = False)
115 | xytext = xy[0]+0.1,xy[1]-0.05
116 | ax.annotate("END",xy=xy,xytext=xytext,weight = "bold")
117 |
118 |
119 | if hasattr(self,"box"):
120 | left,bottom = self.box[0],self.box[2]
121 | width = self.box[1] - self.box[0]
122 | height = self.box[3] - self.box[2]
123 | rect = Rectangle((left,bottom), width, height)
124 | collection = PatchCollection([rect],facecolor = "red",alpha = 0.2)
125 | ax.add_collection(collection)
126 |
127 |
128 | plt.xticks([])
129 | plt.yticks([])
130 |
131 | if return_img:
132 | # From https://ndres.me/post/matplotlib-animated-gifs-easily/
133 | fig.canvas.draw_idle()
134 | image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8')
135 | image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,))
136 | plt.close()
137 | return image
138 | else:
139 | plt.show()
140 |
141 |
142 |
143 | def reset(self):
144 |
145 | # Stops placeholder
146 | self.stops = []
147 |
148 | # Random first stop
149 | first_stop = np.random.randint(self.n_stops)
150 | self.stops.append(first_stop)
151 |
152 | return first_stop
153 |
154 |
155 | def step(self,destination):
156 |
157 | # Get current state
158 | state = self._get_state()
159 | new_state = destination
160 |
161 | # Get reward for such a move
162 | reward = self._get_reward(state,new_state)
163 |
164 | # Append new_state to stops
165 | self.stops.append(destination)
166 | done = len(self.stops) == self.n_stops
167 |
168 | return new_state,reward,done
169 |
170 |
171 | def _get_state(self):
172 | return self.stops[-1]
173 |
174 |
175 | def _get_xy(self,initial = False):
176 | state = self.stops[0] if initial else self._get_state()
177 | x = self.x[state]
178 | y = self.y[state]
179 | return x,y
180 |
181 |
182 | def _get_reward(self,state,new_state):
183 | base_reward = self.q_stops[state,new_state]
184 |
185 | if self.method == "distance":
186 | return base_reward
187 | elif self.method == "time":
188 | return base_reward + np.random.randn()
189 | elif self.method == "traffic_box":
190 |
191 |             # Additional reward corresponds to slowing down in traffic
192 | xs,ys = self.x[state],self.y[state]
193 | xe,ye = self.x[new_state],self.y[new_state]
194 | intersections = self._calculate_box_intersection(xs,xe,ys,ye,self.box)
195 | if len(intersections) > 0:
196 | i1,i2 = intersections
197 | distance_traffic = np.sqrt((i2[1]-i1[1])**2 + (i2[0]-i1[0])**2)
198 | additional_reward = distance_traffic * self.traffic_intensity * np.random.rand()
199 | else:
200 | additional_reward = np.random.rand()
201 |
202 | return base_reward + additional_reward
203 |
204 |
205 | @staticmethod
206 | def _calculate_point(x1,x2,y1,y2,x = None,y = None):
207 |
208 | if y1 == y2:
209 | return y1
210 | elif x1 == x2:
211 | return x1
212 | else:
213 | a = (y2-y1)/(x2-x1)
214 | b = y2 - a * x2
215 |
216 | if x is None:
217 | x = (y-b)/a
218 | return x
219 | elif y is None:
220 | y = a*x+b
221 | return y
222 | else:
223 | raise Exception("Provide x or y")
224 |
225 |
226 | def _is_in_box(self,x,y,box):
227 | # Get box coordinates
228 | x_left,x_right,y_bottom,y_top = box
229 | return x >= x_left and x <= x_right and y >= y_bottom and y <= y_top
230 |
231 |
232 | def _calculate_box_intersection(self,x1,x2,y1,y2,box):
233 |
234 | # Get box coordinates
235 | x_left,x_right,y_bottom,y_top = box
236 |
237 | # Intersections
238 | intersections = []
239 |
240 | # Top intersection
241 | i_top = self._calculate_point(x1,x2,y1,y2,y=y_top)
242 | if i_top > x_left and i_top < x_right:
243 | intersections.append((i_top,y_top))
244 |
245 | # Bottom intersection
246 | i_bottom = self._calculate_point(x1,x2,y1,y2,y=y_bottom)
247 | if i_bottom > x_left and i_bottom < x_right:
248 | intersections.append((i_bottom,y_bottom))
249 |
250 | # Left intersection
251 | i_left = self._calculate_point(x1,x2,y1,y2,x=x_left)
252 | if i_left > y_bottom and i_left < y_top:
253 | intersections.append((x_left,i_left))
254 |
255 | # Right intersection
256 | i_right = self._calculate_point(x1,x2,y1,y2,x=x_right)
257 | if i_right > y_bottom and i_right < y_top:
258 | intersections.append((x_right,i_right))
259 |
260 | return intersections
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 | def run_episode(env,agent,verbose = 1):
269 |
270 | s = env.reset()
271 | agent.reset_memory()
272 |
273 | max_step = env.n_stops
274 |
275 | episode_reward = 0
276 |
277 | i = 0
278 | while i < max_step:
279 |
280 | # Remember the states
281 | agent.remember_state(s)
282 |
283 | # Choose an action
284 | a = agent.act(s)
285 |
286 | # Take the action, and get the reward from environment
287 | s_next,r,done = env.step(a)
288 |
289 |         # Negate the reward (we want to minimize the travel time)
290 |         r = -1 * r
291 |
292 | if verbose: print(s_next,r,done)
293 |
294 | # Update our knowledge in the Q-table
295 | agent.train(s,a,r,s_next)
296 |
297 | # Update the caches
298 | episode_reward += r
299 | s = s_next
300 |
301 | # If the episode is terminated
302 | i += 1
303 | if done:
304 | break
305 |
306 | return env,agent,episode_reward
307 |
308 |
309 |
310 |
311 |
312 |
313 | class DeliveryQAgent(QAgent):
314 |
315 | def __init__(self,*args,**kwargs):
316 | super().__init__(*args,**kwargs)
317 | self.reset_memory()
318 |
319 | def act(self,s):
320 |
321 | # Get Q Vector
322 | q = np.copy(self.Q[s,:])
323 |
324 | # Avoid already visited states
325 | q[self.states_memory] = -np.inf
326 |
327 | if np.random.rand() > self.epsilon:
328 | a = np.argmax(q)
329 | else:
330 | a = np.random.choice([x for x in range(self.actions_size) if x not in self.states_memory])
331 |
332 | return a
333 |
334 |
335 | def remember_state(self,s):
336 | self.states_memory.append(s)
337 |
338 | def reset_memory(self):
339 | self.states_memory = []
340 |
341 |
342 |
343 | def run_n_episodes(env,agent,name="training.gif",n_episodes=1000,render_each=10,fps=10):
344 |
345 | # Store the rewards
346 | rewards = []
347 | imgs = []
348 |
349 | # Experience replay
350 | for i in tqdm_notebook(range(n_episodes)):
351 |
352 | # Run the episode
353 | env,agent,episode_reward = run_episode(env,agent,verbose = 0)
354 | rewards.append(episode_reward)
355 |
356 | if i % render_each == 0:
357 | img = env.render(return_img = True)
358 | imgs.append(img)
359 |
360 | # Show rewards
361 | plt.figure(figsize = (15,3))
362 | plt.title("Rewards over training")
363 | plt.plot(rewards)
364 | plt.show()
365 |
366 | # Save imgs as gif
367 | imageio.mimsave(name,imgs,fps = fps)
368 |
369 | return env,agent
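
#----------------------------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module).
# The QAgent constructor signature below is an assumption: the class lives in
# rl/agents/q_agent.py, which is not shown here.
#
#   env = DeliveryEnvironment(n_stops=50, method="distance")
#   agent = DeliveryQAgent(env.observation_space, env.action_space)
#   env, agent = run_n_episodes(env, agent, name="training_50_stops.gif", n_episodes=1000)
#----------------------------------------------------------------------------------------------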
--------------------------------------------------------------------------------
/5. Delivery Optimization/env1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/env1.png
--------------------------------------------------------------------------------
/5. Delivery Optimization/env2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/env2.png
--------------------------------------------------------------------------------
/5. Delivery Optimization/env3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/env3.png
--------------------------------------------------------------------------------
/5. Delivery Optimization/training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training.png
--------------------------------------------------------------------------------
/5. Delivery Optimization/training_100_stops.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_100_stops.gif
--------------------------------------------------------------------------------
/5. Delivery Optimization/training_100_stops_traffic.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_100_stops_traffic.gif
--------------------------------------------------------------------------------
/5. Delivery Optimization/training_10_stops.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_10_stops.gif
--------------------------------------------------------------------------------
/5. Delivery Optimization/training_500_stops.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_500_stops.gif
--------------------------------------------------------------------------------
/5. Delivery Optimization/training_500_stops_traffic.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_500_stops_traffic.gif
--------------------------------------------------------------------------------
/5. Delivery Optimization/training_50_stops.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/5. Delivery Optimization/training_50_stops.gif
--------------------------------------------------------------------------------
/6. Solving a Rubik's Cube/rubik.py:
--------------------------------------------------------------------------------
1 | # Base Data Science snippet
2 | import pandas as pd
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import os
6 | import time
7 | from tqdm import tqdm_notebook
8 | from matplotlib.patches import Rectangle
9 | from matplotlib.collections import PatchCollection
10 |
11 | plt.style.use("seaborn-dark")
12 |
13 | # import sys
14 | # sys.path.append("../")
15 | # from rl.agents.q_agent import QAgent
16 |
17 | #----------------------------------------------------------------------------------------------------------------------------
18 | # CONSTANTS
19 | #----------------------------------------------------------------------------------------------------------------------------
20 |
21 | COLORS = ["red","white","orange","yellow","green","blue"]
22 | WIDTH_SQUARE = 0.05
23 | FACES = ["LEFT","FRONT","RIGHT","BACK","TOP","BOTTOM"]
24 |
25 | LEFT_SLICE = np.s_[0,:]
26 | RIGHT_SLICE = np.s_[-1,:]
27 | TOP_SLICE = np.s_[:,0]
28 | BOTTOM_SLICE = np.s_[:,-1]
29 |
30 | FACES_LINK = {
31 | "LEFT":[
32 | ("BACK",RIGHT_SLICE),
33 | ("BOTTOM",LEFT_SLICE),
34 | ("FRONT",LEFT_SLICE),
35 | ("TOP",LEFT_SLICE),
36 | ],
37 | "FRONT":[
38 | ("LEFT",RIGHT_SLICE),
39 | ("BOTTOM",BOTTOM_SLICE),
40 | ("RIGHT",LEFT_SLICE),
41 | ("TOP",TOP_SLICE),
42 | ],
43 | "RIGHT":[
44 | ("TOP",RIGHT_SLICE),
45 | ("FRONT",RIGHT_SLICE),
46 | ("BOTTOM",RIGHT_SLICE),
47 | ("BACK",LEFT_SLICE),
48 | ],
49 | "BACK":[
50 | ("TOP",BOTTOM_SLICE),
51 | ("RIGHT",RIGHT_SLICE),
52 | ("BOTTOM",TOP_SLICE),
53 | ("LEFT",LEFT_SLICE),
54 | ],
55 | "TOP":[
56 | ("LEFT",BOTTOM_SLICE),
57 | ("FRONT",BOTTOM_SLICE),
58 | ("RIGHT",BOTTOM_SLICE),
59 | ("BACK",BOTTOM_SLICE),
60 | ],
61 | "BOTTOM":[
62 | ("BACK",TOP_SLICE),
63 | ("RIGHT",TOP_SLICE),
64 | ("FRONT",TOP_SLICE),
65 | ("LEFT",TOP_SLICE),
66 | ],
67 | }
68 |
69 |
70 |
71 |
72 | #----------------------------------------------------------------------------------------------------------------------------
73 | # RUBIKS CUBE ENVIRONMENT CLASS
74 | #----------------------------------------------------------------------------------------------------------------------------
75 |
76 | class RubiksCube(object):
77 | def __init__(self,shuffle = True):
78 |
79 | print(f"Initialized RubiksCube")
80 | self.data = np.array([[i]*9 for i in range(6)])
81 | self.data = self._to_1D(self.data)
82 |
83 | if shuffle:
84 | np.random.shuffle(self.data)
85 |
86 | @staticmethod
87 | def _to_1D(array):
88 | return np.squeeze(array.reshape(1,-1))
89 |
90 | @staticmethod
91 | def _to_2D(array):
92 | return array.reshape(6,9)
93 |
94 | @staticmethod
95 | def _to_square(face):
96 | return face.reshape(3,3)
97 |
98 | @staticmethod
99 | def _to_array(face):
100 | return face.reshape(9)
101 |
102 |
103 | @staticmethod
104 | def _facestr_to_faceid(face):
105 | """Convert face as string to face ID (between 0 and 5)
106 | """
107 | if isinstance(face,str):
108 | assert face in FACES
109 | face = FACES.index(face)
110 | return face
111 |
112 |
113 | @staticmethod
114 | def _rotate_array(array,clockwise = True):
115 | if clockwise:
116 | return array[1:] + [array[0]]
117 | else:
118 | return [array[-1]] + array[:-1]
119 |
120 |
121 | def get_face(self,face,as_square = True):
122 | """Function to get one face of the Rubik's cube
123 | """
124 |
125 | # Convert face as string to face ID (between 0 and 5)
126 | face = self._facestr_to_faceid(face)
127 |
128 | # Select matching face in the data array
129 | face = self.data[face*9:(face+1)*9]
130 |
131 | # Reshape face data to a square
132 | if as_square:
133 | face = self._to_square(face)
134 |
135 | # Return face data
136 | return face
137 |
138 |
139 |
140 |
141 | def set_face(self,face,array):
142 |
143 | # Convert face as string to face ID (between 0 and 5)
144 | face = self._facestr_to_faceid(face)
145 |
146 | # Reshape array
147 | if array.shape == (3,3):
148 | array = self._to_array(array)
149 |
150 | # Set face
151 | self.data[face*9:(face+1)*9] = array
152 |
153 |
154 |
155 |
156 |
157 | def rotate(self,face,clockwise = True):
158 | """Rotate one face of the Rubik's cube
159 | """
160 | # Convert face as string to face ID (between 0 and 5)
161 | face_id = self._facestr_to_faceid(face)
162 |
163 | # Get face
164 | face_data = self.get_face(face_id)
165 |
166 | # Rotate selected face
167 | sense = -1 if clockwise else 1
168 | face_data = np.rot90(face_data,k=sense)
169 | self.set_face(face,face_data)
170 |
171 | # Get other faces
172 | linked_faces,slices = zip(*FACES_LINK[face])
173 | slices_data = [np.copy(self.get_face(linked_faces[i])[slices[i]]) for i in range(4)]
174 |
175 | # Rotate arrays
176 | slices_data = self._rotate_array(slices_data,clockwise = clockwise)
177 |
178 | # Set new rotated arrays
179 | for i in range(4):
180 | face = linked_faces[i]
181 | face_data = self.get_face(face)
182 | face_data[slices[i]] = slices_data[i]
183 | self.set_face(face,face_data)
184 |
185 |
186 |
187 | def render3D(self):
188 | pass
189 |
190 |
191 | def render(self):
192 |
193 | fig = plt.figure(figsize=(7,7))
194 | ax = fig.add_subplot(111)
195 |
196 | for i in range(4):
197 | face_data = self.data[i*9:(i+1)*9]
198 | face = RubiksFace(face_data)
199 | face.render(ax = ax,init_height = 0.4,init_width = 0.15 + i*3*(WIDTH_SQUARE+0.005))
200 |
201 |
202 | for i in range(4,6):
203 | face_data = self.data[i*9:(i+1)*9]
204 | face = RubiksFace(face_data)
205 | init_height = 0.4 + 3*(WIDTH_SQUARE+0.005) if i == 4 else 0.4 - 3*(WIDTH_SQUARE+0.005)
206 | face.render(ax = ax,init_height = init_height,init_width = 0.15 + 3*(WIDTH_SQUARE+0.005))
207 |
208 | plt.xticks([])
209 | plt.yticks([])
210 | plt.show()
211 |
212 |
213 |
214 |
215 |
216 | class RubiksFace(object):
217 | def __init__(self,array):
218 | if array.shape == (3,3):
219 | self.array = array
220 | else:
221 | assert len(array) == 9
222 | self.array = array.reshape(3,3)
223 |
224 | def render(self,ax = None,init_height = 0,init_width = 0):
225 |
226 | if ax is None:
227 | fig = plt.figure(figsize=(7,7))
228 | ax = fig.add_subplot(111)
229 |
230 |
231 |
232 | for i in range(3):
233 | for j in range(3):
234 |
235 | square = self.array[i,j]
236 | color = COLORS[square]
237 |
238 | rect = Rectangle((init_width + i*WIDTH_SQUARE,init_height + j*WIDTH_SQUARE), WIDTH_SQUARE, WIDTH_SQUARE)
239 | collection = PatchCollection([rect],facecolor = color,alpha = 0.8,edgecolor="black")
240 | ax.add_collection(collection)
241 |
242 |
243 |
244 |
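#----------------------------------------------------------------------------------------------------------------------------
# USAGE SKETCH (illustrative only, not part of the original module)
#----------------------------------------------------------------------------------------------------------------------------

if __name__ == "__main__":
    # Start from a solved cube, rotate the front face clockwise and show the flat 2D rendering
    cube = RubiksCube(shuffle=False)
    cube.rotate("FRONT", clockwise=True)
    cube.render()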
--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/20200318 - Hyperion dev.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Hyperion Library development"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "ExecuteTime": {
15 | "end_time": "2020-03-18T17:16:33.161399Z",
16 | "start_time": "2020-03-18T17:16:31.745503Z"
17 | }
18 | },
19 | "outputs": [],
20 | "source": [
21 | "# Base Data Science snippet\n",
22 | "import pandas as pd\n",
23 | "import numpy as np\n",
24 | "import matplotlib.pyplot as plt\n",
25 | "import os\n",
26 | "import time\n",
27 | "from tqdm import tqdm_notebook\n",
28 | "\n",
29 | "%matplotlib inline\n",
30 | "%load_ext autoreload\n",
31 | "%autoreload 2"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 1,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import sys\n",
41 | "sys.path.append(\"c:/git/reinforcement-learning/\")\n",
42 | "\n",
43 | "from hyperion.grid import *"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "# Playground"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 3,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "import pygame\n",
60 | "\n",
61 | "pygame.init()\n",
62 | "\n",
63 | "ecran = pygame.display.set_mode((300, 200))"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "pygame.quit()"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": []
81 | }
82 | ],
83 | "metadata": {
84 | "kernelspec": {
85 | "display_name": "Python 3",
86 | "language": "python",
87 | "name": "python3"
88 | },
89 | "language_info": {
90 | "codemirror_mode": {
91 | "name": "ipython",
92 | "version": 3
93 | },
94 | "file_extension": ".py",
95 | "mimetype": "text/x-python",
96 | "name": "python",
97 | "nbconvert_exporter": "python",
98 | "pygments_lexer": "ipython3",
99 | "version": "3.7.4"
100 | },
101 | "toc": {
102 | "base_numbering": 1,
103 | "nav_menu": {},
104 | "number_sections": true,
105 | "sideBar": true,
106 | "skip_h1_title": false,
107 | "title_cell": "Table of Contents",
108 | "title_sidebar": "Contents",
109 | "toc_cell": false,
110 | "toc_position": {},
111 | "toc_section_display": true,
112 | "toc_window_display": false
113 | }
114 | },
115 | "nbformat": 4,
116 | "nbformat_minor": 4
117 | }
118 |
--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/README.md:
--------------------------------------------------------------------------------
1 | # Multi-Agents simulation
2 | 
3 |
4 | Simulations involving multiple agents are present everywhere in our daily lives, from large-scale economic policies to epidemiology.
5 | Agent-based modeling is even more effective when merged with modern AI techniques such as Reinforcement Learning.
6 | This folder contains experiments on this topic.
7 |
8 | # Experiments summary
9 | - **October 2019** - First attempts to create a Sugarscape experiment. Developed a framework using DataFrames for accelerated computations, yet there were too many interactions to code from scratch and performance was low
10 | - **December 2019** - Discovered Unity and ML Agents for such simulations
11 | - **March 2020** - Due to the COVID-19 outbreak, I started experiments on multi-agent modeling and social distancing. PyGame is a good candidate for 2D simulations, similar to Unity but in Python. It offers many possibilities, and the O(n²) spatial interactions are really sped up thanks to the colliders embedded in PyGame (a minimal collider check is sketched below). Movement is still feasible with at least 10k agents in my first experiments. The experiments were moved to the [westworld](https://github.com/TheoLvs/westworld) repo.
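
A minimal sketch of the PyGame collider check mentioned above (the same ``collidelistall`` pattern as in ``pygame_test.py``):

```python
import pygame

agent = pygame.Rect(0, 0, 10, 10)                 # (x, y, width, height)
others = [pygame.Rect(5, 5, 10, 10), pygame.Rect(100, 100, 10, 10)]

# Indices of all rectangles colliding with the agent
collisions = agent.collidelistall(others)
print(collisions)  # -> [0]
```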
12 |
13 |
14 | # References
15 | ## Libraries & softwares
16 | - Unity
17 | - NetLogo
18 | - [MESA](https://github.com/projectmesa/mesa) - Python
19 | - [SPADE](https://spade-mas.readthedocs.io/en/latest/readme.html) - Python
20 | - [abcEconomics](https://abce.readthedocs.io/en/master/)
21 | - [GAMA-Platform](https://gama-platform.github.io/)
22 | - [Manim](https://github.com/3b1b/manim) by the great Grant Sanderson
23 | - PyGame
24 |
25 | ## Tutorials
26 | - [Introduction to Agent Based Modeling in Python](https://towardsdatascience.com/introduction-to-mesa-agent-based-modeling-in-python-bcb0596e1c9a)
27 |
28 | ## Inspiration
29 | - https://www.complexity-explorables.org/
30 | - Sugarscape https://www.youtube.com/watch?v=r_It_X7v-1E
31 | - youtube.com/watch?v=uVpN136q7N8
32 | - youtube.com/watch?v=Bot5_DouTWg
33 | - Ant-based modeling
34 |
35 | ## Features to implement
36 | - Set and reload data -> ok
37 | - Animation over the simulation (gif ok, ipywidgets to go)
38 | - Action framework with delayed deferrence
39 | - Metrics storage for each agent
40 | - Set up geographical zones and 2D maps with impossible moves
41 | - Find closest agent method
42 | - Wander method
43 | - Launch simulation until certain time + early stopping
44 | - Circle collider
45 | - Optimizing tutorial towardsdatascience.com/speeding-up-python-code-fast-filtering-and-slow-loops-8e11a09a9c2f for more optimization
46 | - A* algorithm for shortest path
47 | - Heatmaps of navigation presence for retail use cases
--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/pygame_test.py:
--------------------------------------------------------------------------------
1 | """Pygame test for multi agent modeling
2 |
3 | Tutorials
4 | https://zestedesavoir.com/tutoriels/846/pygame-pour-les-zesteurs/1381_a-la-decouverte-de-pygame/creer-une-simple-fenetre-personnalisable/#1-15425_creons-une-fenetre-basique
5 | https://www.pygame.org/docs/ref/rect.html#pygame.Rect.move_ip
6 | https://stackoverflow.com/questions/32061507/moving-a-rectangle-in-pygame
7 |
8 |
9 | Ideas:
10 | - Add circles
11 | - Pathfinding algorithm
12 | - Obstacles
13 | - Colliders
14 | - Clicking to add agent or wall
15 | - Grid
16 | - AutoMaze
17 | - Raytracing
18 | - Change Icon
19 | - Heatmaps of where agents were located (for retail purposes)
20 |
21 | Projects:
22 | - Epidemiology
23 | - See MESA or NetLogo examples
24 | - Bunny & Rabbits
25 | """
26 |
27 | import numpy as np
28 | import pygame
29 | import time
30 | import uuid
31 |
32 | # import os
33 | # os.environ['SDL_VIDEO_WINDOW_POS'] = "%d,%d" % (320,240)
34 |
35 | pygame.init()
36 | pygame.display.set_caption("Multi Agent Modeling Environment")
37 | # ecran = pygame.display.set_mode((0, 0), pygame.FULLSCREEN)
38 |
39 | screen = pygame.display.set_mode((1000, 600))
40 |
41 | simulation_on = True
42 | # time.sleep(5)
43 |
44 | background_colour = (0, 0, 0)
45 |
46 |
47 |
48 |
49 |
50 |
51 | class RectangleAgent:
52 |
53 | def __init__(self,width,height,x,y,screen = None):
54 | # Rect left top width height
55 |
56 | self.screen = screen
57 | self.fig = pygame.rect.Rect((x,y,width,height))
58 | # print(f"Initialized rect at {self.pos}")
59 |
60 | self.change_direction()
61 |
62 | self.agent_id = str(uuid.uuid1())
63 |
64 |
65 | @property
66 | def pos(self):
67 | return self.fig.x,self.fig.y,self.fig.width,self.fig.height
68 |
69 | def move_at(self,x,y):
70 |         self.fig.x = x
71 |         self.fig.y = y
72 |
73 |
74 | def wander(self,dl):
75 |
76 | self.move(angle = self.direction_angle,dl = dl)
77 |
78 |
79 | def change_direction(self):
80 | self.direction_angle = np.random.uniform(0,2*np.pi)
81 |
82 |
83 | def move_towards(self):
84 | pass
85 |
86 |
87 | def collides(self,agents):
88 |
89 | if len(agents) == 0:
90 | collisions = []
91 | else:
92 | other_agents = [agent.fig for agent in agents if agent.agent_id != self.agent_id]
93 | collisions = self.fig.collidelistall(other_agents)
94 |
95 | if len(collisions) > 0:
96 | return True,collisions
97 | else:
98 | return False,collisions
99 |
100 |
101 | def if_collides(self,agents):
102 |
103 | is_collision,collisions = self.collides(agents)
104 |
105 | if is_collision:
106 | self.direction_angle += np.pi
107 |
108 |
109 |
110 | def move(self,dx = 0,dy = 0,angle = None,dl = None,colliders = None):
111 |
112 | if angle is not None:
113 | assert dl is not None
114 |
115 | # Compute delta directions with basic trigonometry
116 | dx = dl * np.cos(angle)
117 | dy = dl * np.sin(angle)
118 | self.move(dx = dx,dy = dy)
119 |
120 | else:
121 |
122 | screen_width = self.screen.get_width()
123 | screen_height = self.screen.get_height()
124 |
125 | old_x = self.fig.x
126 | old_y = self.fig.y
127 |
128 | if self.fig.x + dx > screen_width:
129 | self.fig.x = 0
130 | elif self.fig.x + dx < 0:
131 | self.fig.x = screen_width
132 | else:
133 | self.fig.x = self.fig.x + dx
134 |
135 | if self.fig.y + dy > screen_height:
136 | self.fig.y = 0
137 | elif self.fig.y + dy < 0:
138 | self.fig.y = screen_height
139 | else:
140 | self.fig.y = self.fig.y + dy
141 |
142 | if colliders is not None:
143 |                 if self.collides(colliders)[0]:  # collides() returns (is_collision, indices)
144 | self.fig.x = old_x
145 | self.fig.y = old_y
146 |
147 |
148 | # print(f"Position at {self.fig.x},{self.fig.y}")
149 |
150 |
151 | def render(self,color = (180,20,150)):
152 | pygame.draw.rect(self.screen,color,self.pos)
153 | # pygame.draw.circle(self.screen,color,(self.fig.x,self.fig.y),10)
154 | # pass
155 |
156 |
157 |
158 |
159 | class Obstacle:
160 | def __init__(self,width,height,x,y,screen = None):
161 | # Rect left top width height
162 |
163 | self.screen = screen
164 | self.fig = pygame.rect.Rect((x,y,width,height))
165 | # print(f"Initialized rect at {self.pos}")
166 | self.agent_id = str(uuid.uuid1())
167 |
168 |
169 | def render(self,color = (10,150,10)):
170 | pygame.draw.rect(self.screen,color,self.pos)
171 |
172 |
173 | @property
174 | def pos(self):
175 | return self.fig.x,self.fig.y,self.fig.width,self.fig.height
176 |
177 |
178 |
179 | size = 10
180 | n_rects = 500
181 |
182 | rects = []
183 |
184 | for i in range(n_rects):
185 | new_rect = RectangleAgent(
186 | size,size,
187 | np.random.uniform(0,screen.get_width()),
188 | np.random.uniform(0,screen.get_height()),
189 | screen,
190 | )
191 |
192 | rects.append(new_rect)
193 |
194 |
195 |
196 |
197 | i = 0
198 | stop = 1000
199 |
200 | obstacles = [
201 | Obstacle(200,200,300,300,screen)
202 | ]
203 |
204 |
205 | while simulation_on:
206 | screen.fill(background_colour)
207 |
208 | for rect in rects:
209 | rect.wander(size)
210 | rect.if_collides(rects + obstacles)
211 |
212 | for rect in rects + obstacles:
213 | rect.render()
214 |
215 | for event in pygame.event.get():
216 | if event.type == pygame.KEYDOWN:
217 | simulation_on = False
218 |
219 | elif event.type == pygame.MOUSEBUTTONUP:
220 | new_x,new_y = pygame.mouse.get_pos()
221 | # new_rect = RectangleAgent(size,size,new_x,new_y,screen)
222 | # rects.append(new_rect)
223 |
224 | new_obs = Obstacle(20,20,new_x,new_y,screen)
225 | obstacles.append(new_obs)
226 |
227 |
228 |
229 | pygame.display.update()
230 | # pygame.display.flip()
231 |
232 | time.sleep(0.05)
233 |
234 |
235 | if i == stop:
236 | simulation_on = False
237 | else:
238 | i+=1
239 |
240 |
241 | pygame.quit()
--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/test.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/7. Multi-Agents Simulations/test.gif
--------------------------------------------------------------------------------
/7. Multi-Agents Simulations/test2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/7. Multi-Agents Simulations/test2.gif
--------------------------------------------------------------------------------
/8. Unity ML agents tests/README.md:
--------------------------------------------------------------------------------
1 | # Unity ML Agents test
2 |
3 | > I've been creating environments directly in Python for a few years now, yet I've been facing lots of limitations due to the nature of Python.
4 | > In 2020, the best option is probably Unity ML Agents.
5 | > This repo will hold experiments on custom Unity environments/games, and Reinforcement Learning attempts to solve them
6 |
7 | ## Rolling a ball (January 2020)
8 | 
9 | > My first experiment is a simple game about rolling a ball affected by gravity, trying to catch all 10 pickups randomly placed in the environment. Movement is directly affected by inertia. To create the same env, follow Unity's official tutorial: https://learn.unity.com/project/roll-a-ball-tutorial
10 |
11 |
12 |
13 | ## References ✨
14 | ### To learn about Unity
15 | - YouTube holds great resources such as the Brackeys, Sebastian Lague or Jason Weimann channels. Huge thanks to those videos for teaching Unity in such an entertaining way.
16 | - Unity official tutorials are great as well.
17 |
18 |
19 | ### To learn about Unity ML Agents
20 | - Documentation at https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Readme.md
21 | - Creating custom environments https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Learning-Environment-Create-New.md
22 | - Overview of how UML works https://github.com/Unity-Technologies/ml-agents/blob/master/docs/ML-Agents-Overview.md
23 | - [This great video](https://www.youtube.com/watch?v=x2RBxmooh8w)
24 |
25 |
26 |
27 | ## Installing ML Agents
28 | Follow [tutorial at this link](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md)
29 |
30 | - Install Python wrapper with pip
31 | ```
32 | pip install mlagents
33 | ```
34 | - Clone ML Agents repo
35 | ```
36 | git clone --branch latest_release https://github.com/Unity-Technologies/ml-agents.git
37 | ```
38 | - Install Barracuda
39 | - Copy the ML-Agents folder from the cloned repo (under ``UnitySDK/Assets``) into your project's Assets folder
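40 |
41 | ## Driving the environment from Python
42 | Once the scene is playing in the Editor, the Python wrapper can connect to it. Below is a minimal sketch of a random-action loop, assuming the same low-level `mlagents_envs` API used in the notebook of this folder (`get_agent_groups` / `get_step_result`, which later ML-Agents releases changed) and a continuous action space:
43 | ```python
44 | import numpy as np
45 | from mlagents_envs.environment import UnityEnvironment
46 |
47 | # Starts listening on the port; press Play in the Unity Editor to connect
48 | env = UnityEnvironment(base_port=5004)
49 | env.reset()
50 |
51 | # The group name depends on the Behavior name set in the Unity scene
52 | group_name = env.get_agent_groups()[0]
53 | group_spec = env.get_agent_group_spec(group_name)
54 |
55 | for _ in range(100):
56 |     step_result = env.get_step_result(group_name)
57 |     # One row of random continuous actions per agent
58 |     action = np.random.randn(step_result.n_agents(), group_spec.action_size)
59 |     env.set_actions(group_name, action)
60 |     env.step()
61 |
62 | env.close()
63 | ```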
--------------------------------------------------------------------------------
/8. Unity ML agents tests/rolling_a_ball/20200202 - Rolling a Ball.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Rolling a Ball"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | ""
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "# Interaction test"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "> This comes from the getting started tutorial applied to the 3D Ball Agent"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 1,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "Python version:\n",
41 | "3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 10:22:32) [MSC v.1900 64 bit (AMD64)]\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "import matplotlib.pyplot as plt\n",
47 | "import numpy as np\n",
48 | "import sys\n",
49 | "\n",
50 | "from mlagents_envs.environment import UnityEnvironment\n",
51 | "from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel\n",
52 | "\n",
53 | "%matplotlib inline\n",
54 | "\n",
55 | "print(\"Python version:\")\n",
56 | "print(sys.version)\n",
57 | "\n",
58 | "# check Python version\n",
59 | "if (sys.version_info[0] < 3):\n",
60 | " raise Exception(\"ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3\")"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 3,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "name": "stderr",
70 | "output_type": "stream",
71 | "text": [
72 | "INFO:mlagents_envs:Listening on port 5004. Start training by pressing the Play button in the Unity Editor.\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "engine_configuration_channel = EngineConfigurationChannel()\n",
78 | "env = UnityEnvironment(base_port = 5004)#, file_name=env_name, side_channels = [engine_configuration_channel])"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "name": "stderr",
88 | "output_type": "stream",
89 | "text": [
90 | "INFO:mlagents_envs:Connected new brain:\n",
91 | "3DBall?team=0\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "#Reset the environment\n",
97 | "env.reset()\n",
98 | "\n",
99 | "# Set the default brain to work with\n",
100 | "group_name = env.get_agent_groups()[0]\n",
101 | "group_spec = env.get_agent_group_spec(group_name)"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 7,
107 | "metadata": {},
108 | "outputs": [
109 | {
110 | "name": "stdout",
111 | "output_type": "stream",
112 | "text": [
113 | "Number of observations : 1\n",
114 | "Agent state looks like: \n",
115 | "[[-1.4673042e-02 -1.4683060e-02 -5.2082062e-01 4.0000000e+00\n",
116 | " -7.9952097e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
117 | " [-2.6140258e-02 3.4010161e-02 -4.5768166e-01 4.0000000e+00\n",
118 | " -5.5027008e-03 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
119 | " [ 6.3632242e-02 3.7996579e-02 -1.1360741e+00 4.0000000e+00\n",
120 | " -4.1505909e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
121 | " [-4.6871606e-02 -3.9161425e-02 -6.1104012e-01 4.0000000e+00\n",
122 | " 5.6867313e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
123 | " [ 3.8746696e-02 7.7085062e-03 1.1423024e+00 4.0000000e+00\n",
124 | " -1.4589405e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
125 | " [ 4.8017994e-02 -7.4483551e-02 -5.7353783e-01 4.0000000e+00\n",
126 | " -3.8447380e-03 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
127 | " [ 3.9585244e-02 -8.3357669e-02 -9.4123268e-01 4.0000000e+00\n",
128 | " -7.9583311e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
129 | " [ 8.0520153e-02 -2.9333552e-02 1.7612720e-01 4.0000000e+00\n",
130 | " 5.6848335e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
131 | " [ 8.3218820e-02 -7.4690364e-02 1.4817381e+00 4.0000000e+00\n",
132 | " 4.3329239e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
133 | " [ 5.2080988e-03 4.5170397e-03 1.4738545e+00 4.0000000e+00\n",
134 | " 6.0955667e-01 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
135 | " [-4.5549396e-02 1.7029690e-02 -1.4121037e+00 4.0000000e+00\n",
136 | " -1.0720904e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]\n",
137 | " [ 5.7741486e-02 8.4876612e-02 5.8971786e-01 4.0000000e+00\n",
138 | " -7.8450203e-02 0.0000000e+00 0.0000000e+00 0.0000000e+00]]\n",
139 | "Agent state looks like: \n",
140 | "[-0.01467304 -0.01468306 -0.5208206 4. -0.79952097 0.\n",
141 | " 0. 0. ]\n",
142 | "Is there a visual observation ? False\n"
143 | ]
144 | }
145 | ],
146 | "source": [
147 | "# Get the state of the agents\n",
148 | "step_result = env.get_step_result(group_name)\n",
149 | "\n",
150 | "# Examine the number of observations per Agent\n",
151 | "print(\"Number of observations : \", len(group_spec.observation_shapes))\n",
152 | "\n",
153 | "# Examine the state space for the first observation for all agents\n",
154 | "print(\"Agent state looks like: \\n{}\".format(step_result.obs[0]))\n",
155 | "\n",
156 | "# Examine the state space for the first observation for the first agent\n",
157 | "print(\"Agent state looks like: \\n{}\".format(step_result.obs[0][0]))\n",
158 | "\n",
159 | "# Is there a visual observation ?\n",
160 | "vis_obs = any([len(shape) == 3 for shape in group_spec.observation_shapes])\n",
161 | "print(\"Is there a visual observation ?\", vis_obs)\n",
162 | "\n",
163 | "# Examine the visual observations\n",
164 | "if vis_obs:\n",
165 | " vis_obs_index = next(i for i,v in enumerate(group_spec.observation_shapes) if len(v) == 3)\n",
166 | " print(\"Agent visual observation look like:\")\n",
167 | " obs = step_result.obs[vis_obs_index]\n",
168 | " plt.imshow(obs[0,:,:,:])"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 9,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "Total reward this episode: 1.1000000312924385\n",
181 | "Total reward this episode: 0.6000000238418579\n",
182 | "Total reward this episode: 0.6000000238418579\n",
183 | "Total reward this episode: 2.300000049173832\n",
184 | "Total reward this episode: 1.1000000312924385\n",
185 | "Total reward this episode: 2.0000000447034836\n",
186 | "Total reward this episode: 1.1000000312924385\n",
187 | "Total reward this episode: 0.6000000238418579\n",
188 | "Total reward this episode: 1.4901161193847656e-08\n",
189 | "Total reward this episode: 1.2000000327825546\n"
190 | ]
191 | }
192 | ],
193 | "source": [
194 | "for episode in range(10):\n",
195 | " env.reset()\n",
196 | " step_result = env.get_step_result(group_name)\n",
197 | " done = False\n",
198 | " episode_rewards = 0\n",
199 | " while not done:\n",
200 | " action_size = group_spec.action_size\n",
201 | " if group_spec.is_action_continuous():\n",
202 | " action = np.random.randn(step_result.n_agents(), group_spec.action_size)\n",
203 | " \n",
204 | " if group_spec.is_action_discrete():\n",
205 | " branch_size = group_spec.discrete_action_branches\n",
206 | " action = np.column_stack([np.random.randint(0, branch_size[i], size=(step_result.n_agents())) for i in range(len(branch_size))])\n",
207 | " env.set_actions(group_name, action)\n",
208 | " env.step()\n",
209 | " step_result = env.get_step_result(group_name)\n",
210 | " episode_rewards += step_result.reward[0]\n",
211 | " done = step_result.done[0]\n",
212 | " print(\"Total reward this episode: {}\".format(episode_rewards))"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "# Interaction test with custom environment"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "name": "stderr",
229 | "output_type": "stream",
230 | "text": [
231 | "INFO:mlagents_envs:Listening on port 5004. Start training by pressing the Play button in the Unity Editor.\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 | "import matplotlib.pyplot as plt\n",
237 | "import numpy as np\n",
238 | "import sys\n",
239 | "\n",
240 | "from mlagents_envs.environment import UnityEnvironment\n",
241 | "from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel\n",
242 | "\n",
243 | "engine_configuration_channel = EngineConfigurationChannel()\n",
244 | "env = UnityEnvironment(base_port = 5004, side_channels = [engine_configuration_channel])"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": []
253 | }
254 | ],
255 | "metadata": {
256 | "kernelspec": {
257 | "display_name": "Python 3",
258 | "language": "python",
259 | "name": "python3"
260 | },
261 | "language_info": {
262 | "codemirror_mode": {
263 | "name": "ipython",
264 | "version": 3
265 | },
266 | "file_extension": ".py",
267 | "mimetype": "text/x-python",
268 | "name": "python",
269 | "nbconvert_exporter": "python",
270 | "pygments_lexer": "ipython3",
271 | "version": "3.6.4"
272 | }
273 | },
274 | "nbformat": 4,
275 | "nbformat_minor": 2
276 | }
277 |
--------------------------------------------------------------------------------
/8. Unity ML agents tests/rolling_a_ball/rollingaball1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/8. Unity ML agents tests/rolling_a_ball/rollingaball1.png
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/README.md:
--------------------------------------------------------------------------------
1 | # Discrete Optimization with RL
2 |
3 | > Comparison between classical techniques and RL in discrete optimization.
4 | > These experiments are run alongside the MOOC on Discrete Optimization by the University of Melbourne.
5 |
6 |
7 | ## Folder structure
8 | ```
9 | - lessons - personal notes on Discrete Optimization, mostly from the Coursera MOOC
10 | - knapsack_problem - experiments on the knapsack problem, from classical optimization to RL
11 | ```
12 |
13 |
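14 | ## Classical baseline
15 | As a reference point for the "classical" side of the comparison, here is a minimal dynamic-programming sketch for the 0/1 knapsack (an illustration only, not the solver used in `knapsack_problem`). Instances in `knapsack_problem/knapsack/data` start with a line `n capacity`, followed by one `value weight` pair per item:
16 | ```python
17 | def knapsack_dp(values, weights, capacity):
18 |     """Exact 0/1 knapsack value via dynamic programming, O(n * capacity) time."""
19 |     best = [0] * (capacity + 1)  # best[c] = best value with capacity c using items seen so far
20 |     for value, weight in zip(values, weights):
21 |         # iterate capacities downwards so each item is taken at most once
22 |         for c in range(capacity, weight - 1, -1):
23 |             best[c] = max(best[c], best[c - weight] + value)
24 |     return best[capacity]
25 |
26 | # Small instance ks_4_0 (4 items, capacity 11): optimal value is 19
27 | print(knapsack_dp([8, 10, 15, 4], [4, 5, 8, 3], 11))
28 | ```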
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/Solver.java:
--------------------------------------------------------------------------------
1 | import java.io.*;
2 | import java.util.List;
3 | import java.util.ArrayList;
4 |
5 | /**
6 |  * The class Solver is an implementation of a greedy algorithm to solve the knapsack problem.
7 | *
8 | */
9 | public class Solver {
10 |
11 | /**
12 | * The main class
13 | */
14 | public static void main(String[] args) {
15 | try {
16 | solve(args);
17 | } catch (IOException e) {
18 | e.printStackTrace();
19 | }
20 | }
21 |
22 | /**
23 | * Read the instance, solve it, and print the solution in the standard output
24 | */
25 | public static void solve(String[] args) throws IOException {
26 | String fileName = null;
27 |
28 | // get the temp file name
29 | for(String arg : args){
30 | if(arg.startsWith("-file=")){
31 | fileName = arg.substring(6);
32 | }
33 | }
34 | if(fileName == null)
35 | return;
36 |
37 | // read the lines out of the file
38 | List<String> lines = new ArrayList<String>();
39 |
40 | BufferedReader input = new BufferedReader(new FileReader(fileName));
41 | try {
42 | String line = null;
43 | while (( line = input.readLine()) != null){
44 | lines.add(line);
45 | }
46 | }
47 | finally {
48 | input.close();
49 | }
50 |
51 |
52 | // parse the data in the file
53 | String[] firstLine = lines.get(0).split("\\s+");
54 | int items = Integer.parseInt(firstLine[0]);
55 | int capacity = Integer.parseInt(firstLine[1]);
56 |
57 | int[] values = new int[items];
58 | int[] weights = new int[items];
59 |
60 | for(int i=1; i < items+1; i++){
61 | String line = lines.get(i);
62 | String[] parts = line.split("\\s+");
63 |
64 | values[i-1] = Integer.parseInt(parts[0]);
65 | weights[i-1] = Integer.parseInt(parts[1]);
66 | }
67 |
68 | // a trivial greedy algorithm for filling the knapsack
69 | // it takes items in-order until the knapsack is full
70 | int value = 0;
71 | int weight = 0;
72 | int[] taken = new int[items];
73 |
74 | for(int i=0; i < items; i++){
75 | if(weight + weights[i] <= capacity){
76 | taken[i] = 1;
77 | value += values[i];
78 | weight += weights[i];
79 | } else {
80 | taken[i] = 0;
81 | }
82 | }
83 |
84 | // prepare the solution in the specified output format
85 | System.out.println(value+" 0");
86 | for(int i=0; i < items; i++){
87 | System.out.print(taken[i]+" ");
88 | }
89 | System.out.println("");
90 | }
91 | }
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/_coursera:
--------------------------------------------------------------------------------
1 | _le-pVv_EeasJA5dVmWj2w
2 | Knapsack
3 | awPVV, ./data/ks_30_0, solver.py, Knapsack Problem 1
4 | hHYWS, ./data/ks_50_0, solver.py, Knapsack Problem 2
5 | JwWnx, ./data/ks_200_0, solver.py, Knapsack Problem 3
6 | Z2tMt, ./data/ks_400_0, solver.py, Knapsack Problem 4
7 | PUIxa, ./data/ks_1000_0, solver.py, Knapsack Problem 5
8 | AKXWc, ./data/ks_10000_0, solver.py, Knapsack Problem 6
9 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_100_0:
--------------------------------------------------------------------------------
1 | 100 100000
2 | 90000 90001
3 | 89750 89751
4 | 10001 10002
5 | 89500 89501
6 | 10252 10254
7 | 89250 89251
8 | 10503 10506
9 | 89000 89001
10 | 10754 10758
11 | 88750 88751
12 | 11005 11010
13 | 88500 88501
14 | 11256 11262
15 | 88250 88251
16 | 11507 11514
17 | 88000 88001
18 | 11758 11766
19 | 87750 87751
20 | 12009 12018
21 | 87500 87501
22 | 12260 12270
23 | 87250 87251
24 | 12511 12522
25 | 87000 87001
26 | 12762 12774
27 | 86750 86751
28 | 13013 13026
29 | 86500 86501
30 | 13264 13278
31 | 86250 86251
32 | 13515 13530
33 | 86000 86001
34 | 13766 13782
35 | 85750 85751
36 | 14017 14034
37 | 85500 85501
38 | 14268 14286
39 | 85250 85251
40 | 14519 14538
41 | 85000 85001
42 | 14770 14790
43 | 84750 84751
44 | 15021 15042
45 | 84500 84501
46 | 15272 15294
47 | 84250 84251
48 | 15523 15546
49 | 84000 84001
50 | 15774 15798
51 | 83750 83751
52 | 16025 16050
53 | 83500 83501
54 | 16276 16302
55 | 83250 83251
56 | 16527 16554
57 | 83000 83001
58 | 16778 16806
59 | 82750 82751
60 | 17029 17058
61 | 82500 82501
62 | 17280 17310
63 | 82250 82251
64 | 17531 17562
65 | 82000 82001
66 | 17782 17814
67 | 81750 81751
68 | 18033 18066
69 | 81500 81501
70 | 18284 18318
71 | 81250 81251
72 | 18535 18570
73 | 81000 81001
74 | 18786 18822
75 | 80750 80751
76 | 19037 19074
77 | 80500 80501
78 | 19288 19326
79 | 80250 80251
80 | 19539 19578
81 | 80000 80001
82 | 19790 19830
83 | 79750 79751
84 | 20041 20082
85 | 79500 79501
86 | 20292 20334
87 | 79250 79251
88 | 20543 20586
89 | 79000 79001
90 | 20794 20838
91 | 78750 78751
92 | 21045 21090
93 | 78500 78501
94 | 21296 21342
95 | 78250 78251
96 | 21547 21594
97 | 78000 78001
98 | 21798 21846
99 | 77750 77751
100 | 22049 22098
101 | 77500 77501
102 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_100_1:
--------------------------------------------------------------------------------
1 | 100 3190802
2 | 1491 3882
3 | 399 1298
4 | 77 654
5 | 969 2638
6 | 8485 20670
7 | 55 610
8 | 1904 4908
9 | 703 2106
10 | 657 2014
11 | 932 2564
12 | 1201 3302
13 | 1697 4494
14 | 462 1424
15 | 1201 3302
16 | 111632 267364
17 | 9044 21988
18 | 147380 352660
19 | 31852 76604
20 | 9044 21988
21 | 9300 22700
22 | 8660 21020
23 | 174684 418068
24 | 19844 47788
25 | 9044 21988
26 | 1635 4370
27 | 62788 150476
28 | 6932 16964
29 | 6308 15516
30 | 50 600
31 | 4600 11300
32 | 565204 1351508
33 | 7463 18226
34 | 2988 7476
35 | 9044 21988
36 | 9044 21988
37 | 4040 9980
38 | 137732 329764
39 | 7150 17400
40 | 9300 22700
41 | 177 854
42 | 372 1244
43 | 499 1498
44 | 15108 36516
45 | 11108 26916
46 | 2468 6236
47 | 1133 3166
48 | 1490 3880
49 | 865 2430
50 | 2468 6236
51 | 2468 6236
52 | 5974 14648
53 | 5972 14644
54 | 9532 23164
55 | 1872 4844
56 | 3964 9828
57 | 2799 7098
58 | 527708 1261916
59 | 7212 17724
60 | 3002 7504
61 | 21004 50708
62 | 47728 114556
63 | 565204 1351508
64 | 100600 240900
65 | 118920 284740
66 | 2822 7144
67 | 612 1924
68 | 6324 15548
69 | 9508 23116
70 | 9268 22636
71 | 11636 28172
72 | 210708 504116
73 | 2176944 5204588
74 | 930 2560
75 | 4481 11062
76 | 50 600
77 | 112 724
78 | 14434 34968
79 | 0 500
80 | 248 996
81 | 48 596
82 | 820 2340
83 | 278 1056
84 | 643 1986
85 | 1413 3726
86 | 1408 3716
87 | 0 500
88 | 2581 6662
89 | 287 1074
90 | 2040 5180
91 | 289 1078
92 | 1380 3660
93 | 372 1244
94 | 0 500
95 | 472 1444
96 | 360 1220
97 | 0 500
98 | 622 1944
99 | 3504 8708
100 | 5924 14548
101 | 2784 7068
102 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_100_2:
--------------------------------------------------------------------------------
1 | 100 10000
2 | 339 342
3 | 1629 1514
4 | 697 696
5 | 1299 1433
6 | 1613 1762
7 | 36 40
8 | 1737 1635
9 | 473 442
10 | 1859 1899
11 | 2055 1960
12 | 362 378
13 | 1104 1177
14 | 1880 1970
15 | 1349 1434
16 | 1545 1691
17 | 132 139
18 | 341 371
19 | 1430 1350
20 | 1878 1775
21 | 1870 1980
22 | 1536 1651
23 | 818 814
24 | 289 282
25 | 1690 1573
26 | 1437 1587
27 | 310 302
28 | 53 56
29 | 720 726
30 | 1707 1820
31 | 258 269
32 | 1842 1680
33 | 757 842
34 | 1642 1730
35 | 1149 1243
36 | 1970 1794
37 | 749 775
38 | 1904 1810
39 | 2 3
40 | 967 970
41 | 1310 1261
42 | 1004 997
43 | 1295 1192
44 | 1056 1036
45 | 51 52
46 | 1320 1453
47 | 1580 1673
48 | 480 440
49 | 604 624
50 | 1766 1813
51 | 1198 1326
52 | 1762 1637
53 | 2046 1902
54 | 315 323
55 | 714 746
56 | 434 471
57 | 1461 1366
58 | 1652 1511
59 | 1876 1785
60 | 906 1002
61 | 1483 1560
62 | 1355 1403
63 | 510 513
64 | 2114 1958
65 | 1479 1505
66 | 1618 1538
67 | 1472 1378
68 | 310 315
69 | 1478 1493
70 | 970 1066
71 | 43 40
72 | 1231 1172
73 | 1792 1972
74 | 870 956
75 | 1484 1541
76 | 1049 1014
77 | 56 55
78 | 814 793
79 | 978 985
80 | 1215 1311
81 | 720 737
82 | 210 204
83 | 460 492
84 | 1798 1961
85 | 1944 1952
86 | 208 204
87 | 1836 1872
88 | 882 806
89 | 239 234
90 | 141 136
91 | 49 49
92 | 1352 1363
93 | 915 883
94 | 1318 1259
95 | 72 70
96 | 937 886
97 | 1783 1843
98 | 1253 1319
99 | 1268 1375
100 | 1144 1234
101 | 878 818
102 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_106_0:
--------------------------------------------------------------------------------
1 | 106 106925262
2 | 45276 45276
3 | 90552 90552
4 | 181104 181104
5 | 362208 362208
6 | 724416 724416
7 | 1448832 1448832
8 | 2897664 2897664
9 | 5795328 5795328
10 | 11590656 11590656
11 | 23181312 23181312
12 | 46362624 46362624
13 | 92725248 92725248
14 | 70778 70778
15 | 141556 141556
16 | 283112 283112
17 | 566224 566224
18 | 1132448 1132448
19 | 2264896 2264896
20 | 4529792 4529792
21 | 9059584 9059584
22 | 18119168 18119168
23 | 36238336 36238336
24 | 72476672 72476672
25 | 86911 86911
26 | 173822 173822
27 | 347644 347644
28 | 695288 695288
29 | 1390576 1390576
30 | 2781152 2781152
31 | 5562304 5562304
32 | 11124608 11124608
33 | 22249216 22249216
34 | 44498432 44498432
35 | 88996864 88996864
36 | 92634 92634
37 | 185268 185268
38 | 370536 370536
39 | 741072 741072
40 | 1482144 1482144
41 | 2964288 2964288
42 | 5928576 5928576
43 | 11857152 11857152
44 | 23714304 23714304
45 | 47428608 47428608
46 | 94857216 94857216
47 | 97839 97839
48 | 195678 195678
49 | 391356 391356
50 | 782712 782712
51 | 1565424 1565424
52 | 3130848 3130848
53 | 6261696 6261696
54 | 12523392 12523392
55 | 25046784 25046784
56 | 50093568 50093568
57 | 100187136 100187136
58 | 125941 125941
59 | 251882 251882
60 | 503764 503764
61 | 1007528 1007528
62 | 2015056 2015056
63 | 4030112 4030112
64 | 8060224 8060224
65 | 16120448 16120448
66 | 32240896 32240896
67 | 64481792 64481792
68 | 134269 134269
69 | 268538 268538
70 | 537076 537076
71 | 1074152 1074152
72 | 2148304 2148304
73 | 4296608 4296608
74 | 8593216 8593216
75 | 17186432 17186432
76 | 34372864 34372864
77 | 68745728 68745728
78 | 141033 141033
79 | 282066 282066
80 | 564132 564132
81 | 1128264 1128264
82 | 2256528 2256528
83 | 4513056 4513056
84 | 9026112 9026112
85 | 18052224 18052224
86 | 36104448 36104448
87 | 72208896 72208896
88 | 147279 147279
89 | 294558 294558
90 | 589116 589116
91 | 1178232 1178232
92 | 2356464 2356464
93 | 4712928 4712928
94 | 9425856 9425856
95 | 18851712 18851712
96 | 37703424 37703424
97 | 75406848 75406848
98 | 153525 153525
99 | 307050 307050
100 | 614100 614100
101 | 1228200 1228200
102 | 2456400 2456400
103 | 4912800 4912800
104 | 9825600 9825600
105 | 19651200 19651200
106 | 39302400 39302400
107 | 78604800 78604800
108 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_19_0:
--------------------------------------------------------------------------------
1 | 19 31181
2 | 1945 4990
3 | 321 1142
4 | 2945 7390
5 | 4136 10372
6 | 1107 3114
7 | 1022 2744
8 | 1101 3102
9 | 2890 7280
10 | 962 2624
11 | 1060 3020
12 | 805 2310
13 | 689 2078
14 | 1513 3926
15 | 3878 9656
16 | 13504 32708
17 | 1865 4830
18 | 667 2034
19 | 1833 4766
20 | 16553 40006
21 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_200_0:
--------------------------------------------------------------------------------
1 | 200 100000
2 | 90001 90000
3 | 89751 89750
4 | 10002 10001
5 | 89501 89500
6 | 10254 10252
7 | 89251 89250
8 | 10506 10503
9 | 89001 89000
10 | 10758 10754
11 | 88751 88750
12 | 11010 11005
13 | 88501 88500
14 | 11262 11256
15 | 88251 88250
16 | 11514 11507
17 | 88001 88000
18 | 11766 11758
19 | 87751 87750
20 | 12018 12009
21 | 87501 87500
22 | 12270 12260
23 | 87251 87250
24 | 12522 12511
25 | 87001 87000
26 | 12774 12762
27 | 86751 86750
28 | 13026 13013
29 | 86501 86500
30 | 13278 13264
31 | 86251 86250
32 | 13530 13515
33 | 86001 86000
34 | 13782 13766
35 | 85751 85750
36 | 14034 14017
37 | 85501 85500
38 | 14286 14268
39 | 85251 85250
40 | 14538 14519
41 | 85001 85000
42 | 14790 14770
43 | 84751 84750
44 | 15042 15021
45 | 84501 84500
46 | 15294 15272
47 | 84251 84250
48 | 15546 15523
49 | 84001 84000
50 | 15798 15774
51 | 83751 83750
52 | 16050 16025
53 | 83501 83500
54 | 16302 16276
55 | 83251 83250
56 | 16554 16527
57 | 83001 83000
58 | 16806 16778
59 | 82751 82750
60 | 17058 17029
61 | 82501 82500
62 | 17310 17280
63 | 82251 82250
64 | 17562 17531
65 | 82001 82000
66 | 17814 17782
67 | 81751 81750
68 | 18066 18033
69 | 81501 81500
70 | 18318 18284
71 | 81251 81250
72 | 18570 18535
73 | 81001 81000
74 | 18822 18786
75 | 80751 80750
76 | 19074 19037
77 | 80501 80500
78 | 19326 19288
79 | 80251 80250
80 | 19578 19539
81 | 80001 80000
82 | 19830 19790
83 | 79751 79750
84 | 20082 20041
85 | 79501 79500
86 | 20334 20292
87 | 79251 79250
88 | 20586 20543
89 | 79001 79000
90 | 20838 20794
91 | 78751 78750
92 | 21090 21045
93 | 78501 78500
94 | 21342 21296
95 | 78251 78250
96 | 21594 21547
97 | 78001 78000
98 | 21846 21798
99 | 77751 77750
100 | 22098 22049
101 | 77501 77500
102 | 22350 22300
103 | 77251 77250
104 | 22602 22551
105 | 77001 77000
106 | 22854 22802
107 | 76751 76750
108 | 23106 23053
109 | 76501 76500
110 | 23358 23304
111 | 76251 76250
112 | 23610 23555
113 | 76001 76000
114 | 23862 23806
115 | 75751 75750
116 | 24114 24057
117 | 75501 75500
118 | 24366 24308
119 | 75251 75250
120 | 24618 24559
121 | 75001 75000
122 | 24870 24810
123 | 74751 74750
124 | 25122 25061
125 | 74501 74500
126 | 25374 25312
127 | 74251 74250
128 | 25626 25563
129 | 74001 74000
130 | 25878 25814
131 | 73751 73750
132 | 26130 26065
133 | 73501 73500
134 | 26382 26316
135 | 73251 73250
136 | 26634 26567
137 | 73001 73000
138 | 26886 26818
139 | 72751 72750
140 | 27138 27069
141 | 72501 72500
142 | 27390 27320
143 | 72251 72250
144 | 27642 27571
145 | 72001 72000
146 | 27894 27822
147 | 71751 71750
148 | 28146 28073
149 | 71501 71500
150 | 28398 28324
151 | 71251 71250
152 | 28650 28575
153 | 71001 71000
154 | 28902 28826
155 | 70751 70750
156 | 29154 29077
157 | 70501 70500
158 | 29406 29328
159 | 70251 70250
160 | 29658 29579
161 | 70001 70000
162 | 29910 29830
163 | 69751 69750
164 | 30162 30081
165 | 69501 69500
166 | 30414 30332
167 | 69251 69250
168 | 30666 30583
169 | 69001 69000
170 | 30918 30834
171 | 68751 68750
172 | 31170 31085
173 | 68501 68500
174 | 31422 31336
175 | 68251 68250
176 | 31674 31587
177 | 68001 68000
178 | 31926 31838
179 | 67751 67750
180 | 32178 32089
181 | 67501 67500
182 | 32430 32340
183 | 67251 67250
184 | 32682 32591
185 | 67001 67000
186 | 32934 32842
187 | 66751 66750
188 | 33186 33093
189 | 66501 66500
190 | 33438 33344
191 | 66251 66250
192 | 33690 33595
193 | 66001 66000
194 | 33942 33846
195 | 65751 65750
196 | 34194 34097
197 | 65501 65500
198 | 34446 34348
199 | 65251 65250
200 | 34698 34599
201 | 68451 68450
202 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_200_1:
--------------------------------------------------------------------------------
1 | 200 2640230
2 | 31860 76620
3 | 11884 28868
4 | 10492 25484
5 | 901 2502
6 | 43580 104660
7 | 9004 21908
8 | 6700 16500
9 | 29940 71980
10 | 7484 18268
11 | 5932 14564
12 | 7900 19300
13 | 6564 16028
14 | 6596 16092
15 | 8172 19844
16 | 5324 13148
17 | 8436 20572
18 | 7332 17964
19 | 6972 17044
20 | 7668 18636
21 | 6524 15948
22 | 6244 15388
23 | 635 1970
24 | 5396 13292
25 | 13596 32892
26 | 51188 122676
27 | 13684 33068
28 | 8596 20892
29 | 156840 375380
30 | 7900 19300
31 | 6460 15820
32 | 14132 34164
33 | 4980 12260
34 | 5216 12932
35 | 6276 15452
36 | 701 2102
37 | 3084 7868
38 | 6924 16948
39 | 5500 13500
40 | 3148 7996
41 | 47844 114788
42 | 226844 542788
43 | 25748 61996
44 | 7012 17124
45 | 3440 8580
46 | 15580 37660
47 | 314 1128
48 | 2852 7204
49 | 15500 37500
50 | 9348 22796
51 | 17768 42836
52 | 16396 39692
53 | 16540 39980
54 | 395124 944948
55 | 10196 24692
56 | 6652 16204
57 | 4848 11996
58 | 74372 178244
59 | 4556 11212
60 | 4900 12100
61 | 3508 8716
62 | 3820 9540
63 | 5460 13420
64 | 16564 40028
65 | 3896 9692
66 | 3832 9564
67 | 9012 21924
68 | 4428 10956
69 | 57796 138492
70 | 12052 29204
71 | 7052 17204
72 | 85864 205628
73 | 5068 12436
74 | 10484 25468
75 | 4516 11132
76 | 3620 9140
77 | 18052 43604
78 | 21 542
79 | 15804 38108
80 | 19020 45940
81 | 170844 408788
82 | 3732 9364
83 | 2920 7340
84 | 4120 10340
85 | 6828 16756
86 | 26252 63204
87 | 11676 28252
88 | 19916 47932
89 | 65488 156876
90 | 7172 17644
91 | 3772 9444
92 | 132868 318036
93 | 8332 20364
94 | 5308 13116
95 | 3780 9460
96 | 5208 12916
97 | 56788 136076
98 | 7172 17644
99 | 7868 19236
100 | 31412 75524
101 | 9252 22604
102 | 12276 29652
103 | 3712 9324
104 | 4516 11132
105 | 105876 253452
106 | 20084 48468
107 | 11492 27884
108 | 49092 117684
109 | 83452 199804
110 | 71372 171044
111 | 66572 159644
112 | 25268 60836
113 | 64292 154084
114 | 21228 51156
115 | 16812 40524
116 | 19260 46420
117 | 7740 18980
118 | 5632 13964
119 | 3256 8212
120 | 15580 37660
121 | 4824 11948
122 | 59700 143100
123 | 14500 35100
124 | 7208 17716
125 | 6028 14756
126 | 75716 181332
127 | 22364 53828
128 | 7636 18572
129 | 6444 15788
130 | 5192 12884
131 | 7388 18076
132 | 33156 79612
133 | 3032 7564
134 | 6628 16156
135 | 7036 17172
136 | 3200 8100
137 | 7300 17900
138 | 4452 11004
139 | 26364 63428
140 | 14036 33972
141 | 16932 40964
142 | 5788 14276
143 | 70476 168852
144 | 4552 11204
145 | 33980 81660
146 | 19300 46500
147 | 39628 95156
148 | 4484 11068
149 | 55044 131988
150 | 574 1848
151 | 29644 71188
152 | 9460 23020
153 | 106284 254468
154 | 304 1108
155 | 3580 8860
156 | 6308 15516
157 | 10492 25484
158 | 12820 31140
159 | 14436 34972
160 | 5044 12388
161 | 1155 3210
162 | 12468 30236
163 | 4380 10860
164 | 9876 24052
165 | 8752 21404
166 | 8676 21052
167 | 42848 102796
168 | 22844 54988
169 | 6244 15388
170 | 314 1128
171 | 314 1128
172 | 314 1128
173 | 314 1128
174 | 314 1128
175 | 314 1128
176 | 387480 926660
177 | 314 1128
178 | 314 1128
179 | 314 1128
180 | 314 1128
181 | 314 1128
182 | 15996 38692
183 | 8372 20444
184 | 65488 156876
185 | 304 1108
186 | 4756 11812
187 | 5012 12324
188 | 304 1108
189 | 314 1128
190 | 314 1128
191 | 314 1128
192 | 314 1128
193 | 314 1128
194 | 314 1128
195 | 314 1128
196 | 304 1108
197 | 1208 3316
198 | 47728 114556
199 | 314 1128
200 | 314 1128
201 | 314 1128
202 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_300_0:
--------------------------------------------------------------------------------
1 | 300 4040184
2 | 31860 76620
3 | 11884 28868
4 | 10492 25484
5 | 901 2502
6 | 43580 104660
7 | 9004 21908
8 | 6700 16500
9 | 29940 71980
10 | 7484 18268
11 | 5932 14564
12 | 7900 19300
13 | 6564 16028
14 | 6596 16092
15 | 8172 19844
16 | 5324 13148
17 | 8436 20572
18 | 7332 17964
19 | 6972 17044
20 | 7668 18636
21 | 6524 15948
22 | 6244 15388
23 | 635 1970
24 | 5396 13292
25 | 13596 32892
26 | 51188 122676
27 | 13684 33068
28 | 8596 20892
29 | 156840 375380
30 | 7900 19300
31 | 6460 15820
32 | 14132 34164
33 | 4980 12260
34 | 5216 12932
35 | 6276 15452
36 | 701 2102
37 | 3084 7868
38 | 6924 16948
39 | 5500 13500
40 | 3148 7996
41 | 47844 114788
42 | 226844 542788
43 | 25748 61996
44 | 7012 17124
45 | 3440 8580
46 | 15580 37660
47 | 314 1128
48 | 2852 7204
49 | 15500 37500
50 | 9348 22796
51 | 17768 42836
52 | 16396 39692
53 | 16540 39980
54 | 395124 944948
55 | 10196 24692
56 | 6652 16204
57 | 4848 11996
58 | 74372 178244
59 | 4556 11212
60 | 4900 12100
61 | 3508 8716
62 | 3820 9540
63 | 5460 13420
64 | 16564 40028
65 | 3896 9692
66 | 3832 9564
67 | 9012 21924
68 | 4428 10956
69 | 57796 138492
70 | 12052 29204
71 | 7052 17204
72 | 85864 205628
73 | 5068 12436
74 | 10484 25468
75 | 4516 11132
76 | 3620 9140
77 | 18052 43604
78 | 21 542
79 | 15804 38108
80 | 19020 45940
81 | 170844 408788
82 | 3732 9364
83 | 2920 7340
84 | 4120 10340
85 | 6828 16756
86 | 26252 63204
87 | 11676 28252
88 | 19916 47932
89 | 65488 156876
90 | 7172 17644
91 | 3772 9444
92 | 132868 318036
93 | 8332 20364
94 | 5308 13116
95 | 3780 9460
96 | 5208 12916
97 | 56788 136076
98 | 7172 17644
99 | 7868 19236
100 | 31412 75524
101 | 9252 22604
102 | 12276 29652
103 | 3712 9324
104 | 4516 11132
105 | 105876 253452
106 | 20084 48468
107 | 11492 27884
108 | 49092 117684
109 | 83452 199804
110 | 71372 171044
111 | 66572 159644
112 | 25268 60836
113 | 64292 154084
114 | 21228 51156
115 | 16812 40524
116 | 19260 46420
117 | 7740 18980
118 | 5632 13964
119 | 3256 8212
120 | 15580 37660
121 | 4824 11948
122 | 59700 143100
123 | 14500 35100
124 | 7208 17716
125 | 6028 14756
126 | 75716 181332
127 | 22364 53828
128 | 7636 18572
129 | 6444 15788
130 | 5192 12884
131 | 7388 18076
132 | 33156 79612
133 | 3032 7564
134 | 6628 16156
135 | 7036 17172
136 | 3200 8100
137 | 7300 17900
138 | 4452 11004
139 | 26364 63428
140 | 14036 33972
141 | 16932 40964
142 | 5788 14276
143 | 70476 168852
144 | 4552 11204
145 | 33980 81660
146 | 19300 46500
147 | 39628 95156
148 | 4484 11068
149 | 55044 131988
150 | 574 1848
151 | 29644 71188
152 | 9460 23020
153 | 106284 254468
154 | 304 1108
155 | 3580 8860
156 | 6308 15516
157 | 10492 25484
158 | 12820 31140
159 | 14436 34972
160 | 5044 12388
161 | 1155 3210
162 | 12468 30236
163 | 4380 10860
164 | 9876 24052
165 | 8752 21404
166 | 8676 21052
167 | 42848 102796
168 | 22844 54988
169 | 6244 15388
170 | 314 1128
171 | 314 1128
172 | 314 1128
173 | 314 1128
174 | 314 1128
175 | 314 1128
176 | 387480 926660
177 | 314 1128
178 | 314 1128
179 | 314 1128
180 | 314 1128
181 | 314 1128
182 | 15996 38692
183 | 8372 20444
184 | 65488 156876
185 | 304 1108
186 | 4756 11812
187 | 5012 12324
188 | 304 1108
189 | 314 1128
190 | 314 1128
191 | 314 1128
192 | 314 1128
193 | 314 1128
194 | 314 1128
195 | 314 1128
196 | 304 1108
197 | 1208 3316
198 | 47728 114556
199 | 314 1128
200 | 314 1128
201 | 314 1128
202 | 314 1128
203 | 314 1128
204 | 314 1128
205 | 104036 249172
206 | 5248 12996
207 | 312 1124
208 | 24468 58836
209 | 7716 18932
210 | 30180 72460
211 | 4824 11948
212 | 1120 3140
213 | 11496 27892
214 | 4916 12132
215 | 14428 34956
216 | 24948 59996
217 | 41100 98700
218 | 28692 69084
219 | 826 2352
220 | 3073 7846
221 | 7684 18868
222 | 5604 13708
223 | 17188 41476
224 | 34828 83756
225 | 7540 18380
226 | 8004 19508
227 | 2648 6796
228 | 5124 12748
229 | 3096 7892
230 | 166516 398532
231 | 13756 33212
232 | 9980 24260
233 | 15980 38660
234 | 9056 22012
235 | 5052 12404
236 | 8212 20124
237 | 11164 27028
238 | 13036 31572
239 | 23596 56892
240 | 2028 5156
241 | 7584 18468
242 | 5772 14244
243 | 4124 10348
244 | 5368 13236
245 | 4364 10828
246 | 5604 13708
247 | 8500 20700
248 | 7676 18652
249 | 8636 20972
250 | 4588 11276
251 | 4152 10404
252 | 4860 12020
253 | 5484 13468
254 | 8636 20972
255 | 5140 12780
256 | 236380 565460
257 | 116500 278900
258 | 36480 87660
259 | 16968 41036
260 | 5232 12964
261 | 13280 32060
262 | 138032 330364
263 | 9044 21988
264 | 22028 53156
265 | 4632 11564
266 | 13196 31892
267 | 65404 156708
268 | 28940 69580
269 | 865 2430
270 | 45988 110276
271 | 670 2040
272 | 4820 11940
273 | 41356 99212
274 | 39844 95588
275 | 897 2494
276 | 4028 9956
277 | 7924 19348
278 | 47756 114612
279 | 47036 112772
280 | 25908 62316
281 | 4516 11132
282 | 29460 70820
283 | 7964 19428
284 | 16964 41028
285 | 22196 53492
286 | 68140 163380
287 | 80924 193948
288 | 63700 152700
289 | 20860 50220
290 | 1682 4464
291 | 16804 40508
292 | 3195 8090
293 | 60348 144596
294 | 1901 4902
295 | 67468 161636
296 | 4772 11844
297 | 11196 27092
298 | 25836 62172
299 | 49676 119252
300 | 6188 15276
301 | 15588 37676
302 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_30_0:
--------------------------------------------------------------------------------
1 | 30 100000
2 | 90000 90001
3 | 89750 89751
4 | 10001 10002
5 | 89500 89501
6 | 10252 10254
7 | 89250 89251
8 | 10503 10506
9 | 89000 89001
10 | 10754 10758
11 | 88750 88751
12 | 11005 11010
13 | 88500 88501
14 | 11256 11262
15 | 88250 88251
16 | 11507 11514
17 | 88000 88001
18 | 11758 11766
19 | 87750 87751
20 | 12009 12018
21 | 87500 87501
22 | 12260 12270
23 | 87250 87251
24 | 12511 12522
25 | 87000 87001
26 | 12762 12774
27 | 86750 86751
28 | 13013 13026
29 | 86500 86501
30 | 13264 13278
31 | 86250 86251
32 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_400_0:
--------------------------------------------------------------------------------
1 | 400 9486367
2 | 31860 76620
3 | 11884 28868
4 | 10492 25484
5 | 901 2502
6 | 43580 104660
7 | 9004 21908
8 | 6700 16500
9 | 29940 71980
10 | 7484 18268
11 | 5932 14564
12 | 7900 19300
13 | 6564 16028
14 | 6596 16092
15 | 8172 19844
16 | 5324 13148
17 | 8436 20572
18 | 7332 17964
19 | 6972 17044
20 | 7668 18636
21 | 6524 15948
22 | 6244 15388
23 | 635 1970
24 | 5396 13292
25 | 13596 32892
26 | 51188 122676
27 | 13684 33068
28 | 8596 20892
29 | 156840 375380
30 | 7900 19300
31 | 6460 15820
32 | 14132 34164
33 | 4980 12260
34 | 5216 12932
35 | 6276 15452
36 | 701 2102
37 | 3084 7868
38 | 6924 16948
39 | 5500 13500
40 | 3148 7996
41 | 47844 114788
42 | 226844 542788
43 | 25748 61996
44 | 7012 17124
45 | 3440 8580
46 | 15580 37660
47 | 314 1128
48 | 2852 7204
49 | 15500 37500
50 | 9348 22796
51 | 17768 42836
52 | 16396 39692
53 | 16540 39980
54 | 395124 944948
55 | 10196 24692
56 | 6652 16204
57 | 4848 11996
58 | 74372 178244
59 | 4556 11212
60 | 4900 12100
61 | 3508 8716
62 | 3820 9540
63 | 5460 13420
64 | 16564 40028
65 | 3896 9692
66 | 3832 9564
67 | 9012 21924
68 | 4428 10956
69 | 57796 138492
70 | 12052 29204
71 | 7052 17204
72 | 85864 205628
73 | 5068 12436
74 | 10484 25468
75 | 4516 11132
76 | 3620 9140
77 | 18052 43604
78 | 21 542
79 | 15804 38108
80 | 19020 45940
81 | 170844 408788
82 | 3732 9364
83 | 2920 7340
84 | 4120 10340
85 | 6828 16756
86 | 26252 63204
87 | 11676 28252
88 | 19916 47932
89 | 65488 156876
90 | 7172 17644
91 | 3772 9444
92 | 132868 318036
93 | 8332 20364
94 | 5308 13116
95 | 3780 9460
96 | 5208 12916
97 | 56788 136076
98 | 7172 17644
99 | 7868 19236
100 | 31412 75524
101 | 9252 22604
102 | 12276 29652
103 | 3712 9324
104 | 4516 11132
105 | 105876 253452
106 | 20084 48468
107 | 11492 27884
108 | 49092 117684
109 | 83452 199804
110 | 71372 171044
111 | 66572 159644
112 | 25268 60836
113 | 64292 154084
114 | 21228 51156
115 | 16812 40524
116 | 19260 46420
117 | 7740 18980
118 | 5632 13964
119 | 3256 8212
120 | 15580 37660
121 | 4824 11948
122 | 59700 143100
123 | 14500 35100
124 | 7208 17716
125 | 6028 14756
126 | 75716 181332
127 | 22364 53828
128 | 7636 18572
129 | 6444 15788
130 | 5192 12884
131 | 7388 18076
132 | 33156 79612
133 | 3032 7564
134 | 6628 16156
135 | 7036 17172
136 | 3200 8100
137 | 7300 17900
138 | 4452 11004
139 | 26364 63428
140 | 14036 33972
141 | 16932 40964
142 | 5788 14276
143 | 70476 168852
144 | 4552 11204
145 | 33980 81660
146 | 19300 46500
147 | 39628 95156
148 | 4484 11068
149 | 55044 131988
150 | 574 1848
151 | 29644 71188
152 | 9460 23020
153 | 106284 254468
154 | 304 1108
155 | 3580 8860
156 | 6308 15516
157 | 10492 25484
158 | 12820 31140
159 | 14436 34972
160 | 5044 12388
161 | 1155 3210
162 | 12468 30236
163 | 4380 10860
164 | 9876 24052
165 | 8752 21404
166 | 8676 21052
167 | 42848 102796
168 | 22844 54988
169 | 6244 15388
170 | 314 1128
171 | 314 1128
172 | 314 1128
173 | 314 1128
174 | 314 1128
175 | 314 1128
176 | 387480 926660
177 | 314 1128
178 | 314 1128
179 | 314 1128
180 | 314 1128
181 | 314 1128
182 | 15996 38692
183 | 8372 20444
184 | 65488 156876
185 | 304 1108
186 | 4756 11812
187 | 5012 12324
188 | 304 1108
189 | 314 1128
190 | 314 1128
191 | 314 1128
192 | 314 1128
193 | 314 1128
194 | 314 1128
195 | 314 1128
196 | 304 1108
197 | 1208 3316
198 | 47728 114556
199 | 314 1128
200 | 314 1128
201 | 314 1128
202 | 314 1128
203 | 314 1128
204 | 314 1128
205 | 104036 249172
206 | 5248 12996
207 | 312 1124
208 | 24468 58836
209 | 7716 18932
210 | 30180 72460
211 | 4824 11948
212 | 1120 3140
213 | 11496 27892
214 | 4916 12132
215 | 14428 34956
216 | 24948 59996
217 | 41100 98700
218 | 28692 69084
219 | 826 2352
220 | 3073 7846
221 | 7684 18868
222 | 5604 13708
223 | 17188 41476
224 | 34828 83756
225 | 7540 18380
226 | 8004 19508
227 | 2648 6796
228 | 5124 12748
229 | 3096 7892
230 | 166516 398532
231 | 13756 33212
232 | 9980 24260
233 | 15980 38660
234 | 9056 22012
235 | 5052 12404
236 | 8212 20124
237 | 11164 27028
238 | 13036 31572
239 | 23596 56892
240 | 2028 5156
241 | 7584 18468
242 | 5772 14244
243 | 4124 10348
244 | 5368 13236
245 | 4364 10828
246 | 5604 13708
247 | 8500 20700
248 | 7676 18652
249 | 8636 20972
250 | 4588 11276
251 | 4152 10404
252 | 4860 12020
253 | 5484 13468
254 | 8636 20972
255 | 5140 12780
256 | 236380 565460
257 | 116500 278900
258 | 36480 87660
259 | 16968 41036
260 | 5232 12964
261 | 13280 32060
262 | 138032 330364
263 | 9044 21988
264 | 22028 53156
265 | 4632 11564
266 | 13196 31892
267 | 65404 156708
268 | 28940 69580
269 | 865 2430
270 | 45988 110276
271 | 670 2040
272 | 4820 11940
273 | 41356 99212
274 | 39844 95588
275 | 897 2494
276 | 4028 9956
277 | 7924 19348
278 | 47756 114612
279 | 47036 112772
280 | 25908 62316
281 | 4516 11132
282 | 29460 70820
283 | 7964 19428
284 | 16964 41028
285 | 22196 53492
286 | 68140 163380
287 | 80924 193948
288 | 63700 152700
289 | 20860 50220
290 | 1682 4464
291 | 16804 40508
292 | 3195 8090
293 | 60348 144596
294 | 1901 4902
295 | 67468 161636
296 | 4772 11844
297 | 11196 27092
298 | 25836 62172
299 | 49676 119252
300 | 6188 15276
301 | 15588 37676
302 | 4412 10924
303 | 26564 63828
304 | 16412 39724
305 | 8108 19716
306 | 6084 14868
307 | 9884 24068
308 | 4224 10548
309 | 14660 35420
310 | 25708 61916
311 | 39228 94156
312 | 40748 97796
313 | 40748 97796
314 | 64276 154052
315 | 114356 273812
316 | 14724 35548
317 | 4540 11180
318 | 11612 28124
319 | 4972 12244
320 | 10060 24420
321 | 14548 35196
322 | 3136 7972
323 | 9132 22164
324 | 5752 14204
325 | 10100 24500
326 | 12172 29444
327 | 24428 58756
328 | 3336 8372
329 | 4356 10812
330 | 8652 21004
331 | 14492 35084
332 | 8796 21492
333 | 6408 15716
334 | 6056 14812
335 | 10124 24548
336 | 387480 926660
337 | 18188 43876
338 | 7732 18964
339 | 9492 23084
340 | 7300 17900
341 | 10052 24404
342 | 19604 47308
343 | 6644 16188
344 | 107364 257028
345 | 91812 219924
346 | 4620 11540
347 | 42848 102796
348 | 33268 79836
349 | 13260 32020
350 | 6564 16028
351 | 6524 15948
352 | 13596 32892
353 | 13596 32892
354 | 47844 114788
355 | 226844 542788
356 | 226844 542788
357 | 226844 542788
358 | 226844 542788
359 | 85864 205628
360 | 170844 408788
361 | 56788 136076
362 | 6628 16156
363 | 10492 25484
364 | 104036 249172
365 | 14428 34956
366 | 14428 34956
367 | 22028 53156
368 | 22028 53156
369 | 22028 53156
370 | 25836 62172
371 | 11612 28124
372 | 11612 28124
373 | 11612 28124
374 | 85872 205644
375 | 1377 3654
376 | 1365820 3265540
377 | 562272 1344644
378 | 1445900 3457100
379 | 501060 1198220
380 | 106224 254348
381 | 492496 1177692
382 | 387824 927548
383 | 151320 362140
384 | 109924 263148
385 | 105696 253092
386 | 96404 230908
387 | 107732 257964
388 | 42140 101180
389 | 102896 246292
390 | 4036 9972
391 | 19616 47332
392 | 100948 241796
393 | 1417728 3389756
394 | 62604 150108
395 | 491820 1176140
396 | 33740 80980
397 | 25216 60732
398 | 111716 267532
399 | 400156 957012
400 | 108800 260500
401 | 1211040 2895580
402 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_40_0:
--------------------------------------------------------------------------------
1 | 40 100000
2 | 90001 90000
3 | 89751 89750
4 | 10002 10001
5 | 89501 89500
6 | 10254 10252
7 | 89251 89250
8 | 10506 10503
9 | 89001 89000
10 | 10758 10754
11 | 88751 88750
12 | 11010 11005
13 | 88501 88500
14 | 11262 11256
15 | 88251 88250
16 | 11514 11507
17 | 88001 88000
18 | 11766 11758
19 | 87751 87750
20 | 12018 12009
21 | 87501 87500
22 | 12270 12260
23 | 87251 87250
24 | 12522 12511
25 | 87001 87000
26 | 12774 12762
27 | 86751 86750
28 | 13026 13013
29 | 86501 86500
30 | 13278 13264
31 | 86251 86250
32 | 13530 13515
33 | 86001 86000
34 | 13782 13766
35 | 85751 85750
36 | 14034 14017
37 | 85501 85500
38 | 14286 14268
39 | 85251 85250
40 | 14538 14519
41 | 86131 86130
42 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_45_0:
--------------------------------------------------------------------------------
1 | 45 58181
2 | 1945 4990
3 | 321 1142
4 | 2945 7390
5 | 4136 10372
6 | 1107 3114
7 | 1022 2744
8 | 1101 3102
9 | 2890 7280
10 | 47019 112738
11 | 1530 3960
12 | 3432 8564
13 | 2165 5630
14 | 1703 4506
15 | 1106 3112
16 | 370 1240
17 | 657 2014
18 | 962 2624
19 | 1060 3020
20 | 805 2310
21 | 689 2078
22 | 1513 3926
23 | 3878 9656
24 | 13504 32708
25 | 1865 4830
26 | 667 2034
27 | 1833 4766
28 | 16553 40006
29 | 1261 3422
30 | 2593 6686
31 | 1170 3240
32 | 794 2288
33 | 671 2042
34 | 7421 18142
35 | 6009 14718
36 | 1767 4634
37 | 2622 6744
38 | 831 2362
39 | 701 2102
40 | 5222 12944
41 | 3086 7872
42 | 900 2500
43 | 3121 7942
44 | 1029 2958
45 | 52555 126010
46 | 389 1278
47 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_4_0:
--------------------------------------------------------------------------------
1 | 4 11
2 | 8 4
3 | 10 5
4 | 15 8
5 | 4 3
6 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_500_0:
--------------------------------------------------------------------------------
1 | 500 50000
2 | 384 412
3 | 7060 7285
4 | 8475 8103
5 | 5028 4876
6 | 9741 9369
7 | 3360 3538
8 | 1426 1394
9 | 2084 2204
10 | 4865 5362
11 | 1885 1779
12 | 8191 8376
13 | 6296 6460
14 | 3292 3193
15 | 10227 9957
16 | 5744 5513
17 | 2163 2365
18 | 10738 9786
19 | 5099 4865
20 | 9193 9406
21 | 7777 7455
22 | 8538 8090
23 | 9597 9224
24 | 1275 1257
25 | 6317 5831
26 | 7598 7177
27 | 2241 2297
28 | 1398 1271
29 | 4083 4216
30 | 6033 5634
31 | 1694 1560
32 | 7563 6878
33 | 12 12
34 | 7406 6872
35 | 7679 7142
36 | 6619 6945
37 | 9222 8778
38 | 1869 1785
39 | 6809 7485
40 | 4961 5033
41 | 2616 2719
42 | 6406 6156
43 | 1703 1826
44 | 6415 6795
45 | 4898 4790
46 | 7601 7620
47 | 2145 1971
48 | 6559 6310
49 | 1691 1874
50 | 8734 8092
51 | 9570 9321
52 | 7649 7955
53 | 0 1
54 | 5652 5146
55 | 475 517
56 | 8789 8341
57 | 1366 1400
58 | 3325 3230
59 | 5487 5443
60 | 7316 7097
61 | 10232 9979
62 | 1788 1873
63 | 9179 9259
64 | 3790 3940
65 | 7820 8611
66 | 4462 4552
67 | 832 893
68 | 6798 7209
69 | 5467 5319
70 | 5573 6065
71 | 5489 5010
72 | 8246 8770
73 | 2815 2918
74 | 8766 8355
75 | 7043 7760
76 | 8834 8052
77 | 8549 8969
78 | 6511 6415
79 | 9253 9812
80 | 831 861
81 | 4587 4755
82 | 202 210
83 | 1022 950
84 | 867 823
85 | 1989 2194
86 | 2813 2594
87 | 1711 1642
88 | 9343 9828
89 | 1840 2029
90 | 2772 2575
91 | 6035 5564
92 | 8815 9345
93 | 9329 8485
94 | 354 353
95 | 3488 3792
96 | 2701 2645
97 | 102 102
98 | 3711 4046
99 | 10505 9897
100 | 8471 9201
101 | 3406 3157
102 | 10171 9442
103 | 6862 7425
104 | 3747 3887
105 | 7132 7137
106 | 7386 7590
107 | 3073 3179
108 | 7566 8244
109 | 2269 2467
110 | 7134 7291
111 | 7750 7078
112 | 8126 8991
113 | 1803 1824
114 | 8229 8894
115 | 9725 9514
116 | 1468 1498
117 | 844 771
118 | 2939 2868
119 | 7538 7210
120 | 380 406
121 | 10182 9845
122 | 176 188
123 | 8874 8977
124 | 5461 5808
125 | 7833 7831
126 | 9668 9122
127 | 3381 3255
128 | 8534 7808
129 | 10002 9684
130 | 8881 9703
131 | 3503 3884
132 | 2774 2742
133 | 6546 6754
134 | 3368 3227
135 | 2269 2521
136 | 3229 3149
137 | 6703 6895
138 | 9740 9718
139 | 1660 1779
140 | 4724 4906
141 | 10161 9765
142 | 2460 2712
143 | 1221 1161
144 | 893 956
145 | 3922 3736
146 | 3837 3854
147 | 4564 4211
148 | 6844 7195
149 | 7300 7204
150 | 550 509
151 | 3347 3315
152 | 8141 8090
153 | 7173 7121
154 | 1386 1366
155 | 2216 2053
156 | 4182 4310
157 | 6496 6753
158 | 7540 7923
159 | 6576 7072
160 | 745 774
161 | 10510 9710
162 | 5294 5494
163 | 6752 6259
164 | 3818 4235
165 | 6704 6462
166 | 212 222
167 | 6247 5995
168 | 7948 8543
169 | 2763 2688
170 | 5698 5186
171 | 2307 2186
172 | 7426 7303
173 | 5292 5134
174 | 9295 8645
175 | 2578 2430
176 | 6097 5571
177 | 2925 3243
178 | 1223 1123
179 | 8720 8978
180 | 4240 4139
181 | 4344 4244
182 | 6250 6864
183 | 6547 7189
184 | 4989 4641
185 | 732 753
186 | 4440 4445
187 | 7861 8726
188 | 147 147
189 | 3066 3394
190 | 5265 5044
191 | 6723 7050
192 | 7443 7655
193 | 6062 6387
194 | 3793 3529
195 | 6167 6689
196 | 1965 1918
197 | 1479 1530
198 | 7177 7624
199 | 3624 3782
200 | 6602 7203
201 | 9195 9398
202 | 8667 8091
203 | 4802 4637
204 | 3317 3035
205 | 10496 9631
206 | 2441 2467
207 | 8759 7973
208 | 320 325
209 | 3459 3770
210 | 4805 4396
211 | 6153 5990
212 | 5076 5513
213 | 6003 6084
214 | 2143 2027
215 | 2915 3169
216 | 6150 6074
217 | 5077 4948
218 | 3335 3361
219 | 8400 8116
220 | 9711 9158
221 | 1375 1467
222 | 6421 6150
223 | 8784 8277
224 | 3085 2946
225 | 247 228
226 | 6182 6208
227 | 7543 7284
228 | 2056 2048
229 | 1198 1190
230 | 4033 4380
231 | 2527 2603
232 | 4158 4618
233 | 2552 2607
234 | 668 609
235 | 7843 8591
236 | 3986 3670
237 | 8463 8184
238 | 6382 6242
239 | 3103 3422
240 | 397 385
241 | 10619 9845
242 | 8138 8106
243 | 8370 8192
244 | 4321 3974
245 | 4514 4964
246 | 4041 4063
247 | 6558 6871
248 | 397 438
249 | 1943 2122
250 | 319 305
251 | 8557 8465
252 | 10517 9695
253 | 7573 8139
254 | 9981 9433
255 | 8833 8354
256 | 5854 5944
257 | 3796 3761
258 | 2043 2109
259 | 7288 7949
260 | 7280 7744
261 | 2163 2065
262 | 2469 2264
263 | 5532 5066
264 | 2318 2387
265 | 7179 6779
266 | 8381 9284
267 | 5665 5694
268 | 3544 3303
269 | 3108 2872
270 | 3050 2801
271 | 7307 6760
272 | 528 536
273 | 8598 8444
274 | 1282 1404
275 | 1912 1919
276 | 6096 6018
277 | 2305 2211
278 | 3787 3723
279 | 7142 6631
280 | 950 965
281 | 7389 7413
282 | 2823 2941
283 | 2097 1979
284 | 7066 6576
285 | 3447 3779
286 | 2727 2493
287 | 7624 8353
288 | 764 776
289 | 4578 4617
290 | 2503 2653
291 | 7276 7099
292 | 6643 6991
293 | 2786 2972
294 | 2422 2349
295 | 6811 6498
296 | 5584 5951
297 | 10727 9755
298 | 3882 3987
299 | 9566 9211
300 | 4396 4126
301 | 8930 8192
302 | 831 849
303 | 4712 4675
304 | 657 602
305 | 2738 3006
306 | 6995 6708
307 | 5598 5844
308 | 8939 9020
309 | 6861 6674
310 | 9795 9952
311 | 2090 2208
312 | 4661 4726
313 | 3258 3155
314 | 6520 6999
315 | 3040 3298
316 | 7137 6758
317 | 8379 8963
318 | 7682 7553
319 | 5225 5634
320 | 5653 5459
321 | 6605 6957
322 | 8226 7939
323 | 7947 8831
324 | 6663 6956
325 | 9263 8743
326 | 8527 7914
327 | 110 116
328 | 486 526
329 | 916 863
330 | 6285 6030
331 | 8658 8005
332 | 9627 9516
333 | 777 752
334 | 5208 5569
335 | 7641 7249
336 | 2961 2726
337 | 255 252
338 | 6656 6447
339 | 10101 9887
340 | 124 133
341 | 8303 7584
342 | 7576 8318
343 | 2428 2643
344 | 4008 4090
345 | 2645 2517
346 | 756 717
347 | 3980 4407
348 | 2950 3236
349 | 9529 9690
350 | 3644 3814
351 | 260 276
352 | 7840 8345
353 | 4601 4493
354 | 7423 7117
355 | 1692 1817
356 | 6957 7465
357 | 2923 3073
358 | 1677 1792
359 | 1138 1088
360 | 5317 5247
361 | 9705 9127
362 | 840 838
363 | 1209 1309
364 | 2481 2369
365 | 7686 8119
366 | 6022 5554
367 | 8029 8016
368 | 5418 5101
369 | 646 613
370 | 9511 8848
371 | 2350 2335
372 | 2544 2444
373 | 6819 7518
374 | 1055 1044
375 | 7563 7599
376 | 4530 4369
377 | 2249 2154
378 | 2244 2095
379 | 2976 3034
380 | 6533 6184
381 | 1518 1625
382 | 2484 2603
383 | 6100 6072
384 | 6326 6297
385 | 7341 7384
386 | 8751 8748
387 | 7195 7352
388 | 2487 2548
389 | 6846 7003
390 | 1049 1102
391 | 3670 3525
392 | 2538 2691
393 | 5378 5906
394 | 1530 1403
395 | 8675 8179
396 | 5411 5421
397 | 308 342
398 | 8138 8884
399 | 3751 4000
400 | 5392 5535
401 | 8288 7690
402 | 3425 3797
403 | 6599 6118
404 | 1855 2050
405 | 8516 8028
406 | 5331 5379
407 | 8180 7989
408 | 708 746
409 | 1217 1315
410 | 5753 5983
411 | 2918 3035
412 | 8370 8675
413 | 9502 9840
414 | 10584 9793
415 | 6538 6077
416 | 3678 3780
417 | 5013 5327
418 | 8374 8415
419 | 2038 1965
420 | 6129 5741
421 | 6622 6292
422 | 7569 7366
423 | 942 963
424 | 1259 1194
425 | 4277 3984
426 | 1121 1021
427 | 6333 5974
428 | 8989 9647
429 | 9265 8860
430 | 8344 8231
431 | 3112 3138
432 | 3347 3355
433 | 1352 1450
434 | 9712 9502
435 | 2307 2209
436 | 5520 5095
437 | 10137 9833
438 | 4583 4634
439 | 4444 4676
440 | 6024 5990
441 | 2481 2671
442 | 9522 9498
443 | 9993 9209
444 | 5687 6004
445 | 420 414
446 | 5365 5480
447 | 834 836
448 | 4767 4745
449 | 2409 2497
450 | 1897 1847
451 | 8698 9047
452 | 4612 4405
453 | 3524 3486
454 | 1156 1173
455 | 6516 5996
456 | 7741 7139
457 | 8546 9331
458 | 2349 2219
459 | 6095 6103
460 | 835 872
461 | 724 666
462 | 5288 5114
463 | 5659 6134
464 | 2847 3042
465 | 9627 9511
466 | 189 189
467 | 1509 1378
468 | 3609 3963
469 | 3802 3926
470 | 134 139
471 | 5689 6206
472 | 9097 9077
473 | 6347 5951
474 | 3007 2835
475 | 4305 3972
476 | 3155 3228
477 | 4130 3764
478 | 3904 3631
479 | 1915 2109
480 | 9014 9897
481 | 8504 8943
482 | 651 708
483 | 8947 8695
484 | 6239 5900
485 | 8311 8054
486 | 1412 1422
487 | 6513 7166
488 | 8244 8159
489 | 8127 8361
490 | 5552 5782
491 | 4068 4325
492 | 1013 935
493 | 10274 9984
494 | 2977 3181
495 | 2751 2876
496 | 10479 9715
497 | 2260 2159
498 | 5603 5520
499 | 3074 3065
500 | 9406 9789
501 | 9416 9939
502 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_50_0:
--------------------------------------------------------------------------------
1 | 50 341045
2 | 1906 4912
3 | 41516 99732
4 | 23527 56554
5 | 559 1818
6 | 45136 108372
7 | 2625 6750
8 | 492 1484
9 | 1086 3072
10 | 5516 13532
11 | 4875 12050
12 | 7570 18440
13 | 4436 10972
14 | 620 1940
15 | 50897 122094
16 | 2129 5558
17 | 4265 10630
18 | 706 2112
19 | 2721 6942
20 | 16494 39888
21 | 29688 71276
22 | 3383 8466
23 | 2181 5662
24 | 96601 231302
25 | 1795 4690
26 | 7512 18324
27 | 1242 3384
28 | 2889 7278
29 | 2133 5566
30 | 103 706
31 | 4446 10992
32 | 11326 27552
33 | 3024 7548
34 | 217 934
35 | 13269 32038
36 | 281 1062
37 | 77174 184848
38 | 952 2604
39 | 15572 37644
40 | 566 1832
41 | 4103 10306
42 | 313 1126
43 | 14393 34886
44 | 1313 3526
45 | 348 1196
46 | 419 1338
47 | 246 992
48 | 445 1390
49 | 23552 56804
50 | 23552 56804
51 | 67 634
52 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_50_1:
--------------------------------------------------------------------------------
1 | 50 5000
2 | 995 945
3 | 259 242
4 | 258 244
5 | 279 281
6 | 576 582
7 | 126 119
8 | 280 303
9 | 859 913
10 | 270 279
11 | 389 408
12 | 927 925
13 | 281 305
14 | 624 662
15 | 961 938
16 | 757 718
17 | 231 250
18 | 838 767
19 | 154 158
20 | 649 595
21 | 277 268
22 | 180 167
23 | 895 957
24 | 23 22
25 | 930 948
26 | 93 102
27 | 61 62
28 | 626 604
29 | 342 349
30 | 262 279
31 | 215 221
32 | 183 203
33 | 958 889
34 | 205 213
35 | 859 835
36 | 171 166
37 | 566 575
38 | 779 758
39 | 704 706
40 | 196 182
41 | 26 28
42 | 726 729
43 | 621 671
44 | 800 864
45 | 580 579
46 | 535 553
47 | 647 632
48 | 168 163
49 | 90 95
50 | 679 745
51 | 440 438
52 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_60_0:
--------------------------------------------------------------------------------
1 | 60 100000
2 | 90000 90001
3 | 89750 89751
4 | 10001 10002
5 | 89500 89501
6 | 10252 10254
7 | 89250 89251
8 | 10503 10506
9 | 89000 89001
10 | 10754 10758
11 | 88750 88751
12 | 11005 11010
13 | 88500 88501
14 | 11256 11262
15 | 88250 88251
16 | 11507 11514
17 | 88000 88001
18 | 11758 11766
19 | 87750 87751
20 | 12009 12018
21 | 87500 87501
22 | 12260 12270
23 | 87250 87251
24 | 12511 12522
25 | 87000 87001
26 | 12762 12774
27 | 86750 86751
28 | 13013 13026
29 | 86500 86501
30 | 13264 13278
31 | 86250 86251
32 | 13515 13530
33 | 86000 86001
34 | 13766 13782
35 | 85750 85751
36 | 14017 14034
37 | 85500 85501
38 | 14268 14286
39 | 85250 85251
40 | 14519 14538
41 | 85000 85001
42 | 14770 14790
43 | 84750 84751
44 | 15021 15042
45 | 84500 84501
46 | 15272 15294
47 | 84250 84251
48 | 15523 15546
49 | 84000 84001
50 | 15774 15798
51 | 83750 83751
52 | 16025 16050
53 | 83500 83501
54 | 16276 16302
55 | 83250 83251
56 | 16527 16554
57 | 83000 83001
58 | 16778 16806
59 | 82750 82751
60 | 17029 17058
61 | 82500 82501
62 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_82_0:
--------------------------------------------------------------------------------
1 | 82 104723596
2 | 13211 13211
3 | 26422 26422
4 | 52844 52844
5 | 105688 105688
6 | 211376 211376
7 | 422752 422752
8 | 845504 845504
9 | 1691008 1691008
10 | 3382016 3382016
11 | 6764032 6764032
12 | 13528064 13528064
13 | 27056128 27056128
14 | 54112256 54112256
15 | 13212 13212
16 | 26424 26424
17 | 52848 52848
18 | 105696 105696
19 | 211392 211392
20 | 422784 422784
21 | 845568 845568
22 | 1691136 1691136
23 | 3382272 3382272
24 | 6764544 6764544
25 | 13529088 13529088
26 | 27058176 27058176
27 | 54116352 54116352
28 | 39638 39638
29 | 79276 79276
30 | 158552 158552
31 | 317104 317104
32 | 634208 634208
33 | 1268416 1268416
34 | 2536832 2536832
35 | 5073664 5073664
36 | 10147328 10147328
37 | 20294656 20294656
38 | 40589312 40589312
39 | 81178624 81178624
40 | 52844 52844
41 | 105688 105688
42 | 211376 211376
43 | 422752 422752
44 | 845504 845504
45 | 1691008 1691008
46 | 3382016 3382016
47 | 6764032 6764032
48 | 13528064 13528064
49 | 27056128 27056128
50 | 54112256 54112256
51 | 66060 66060
52 | 132120 132120
53 | 264240 264240
54 | 528480 528480
55 | 1056960 1056960
56 | 2113920 2113920
57 | 4227840 4227840
58 | 8455680 8455680
59 | 16911360 16911360
60 | 33822720 33822720
61 | 67645440 67645440
62 | 79268 79268
63 | 158536 158536
64 | 317072 317072
65 | 634144 634144
66 | 1268288 1268288
67 | 2536576 2536576
68 | 5073152 5073152
69 | 10146304 10146304
70 | 20292608 20292608
71 | 40585216 40585216
72 | 81170432 81170432
73 | 92482 92482
74 | 184964 184964
75 | 369928 369928
76 | 739856 739856
77 | 1479712 1479712
78 | 2959424 2959424
79 | 5918848 5918848
80 | 11837696 11837696
81 | 23675392 23675392
82 | 47350784 47350784
83 | 94701568 94701568
84 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_lecture_dp_1:
--------------------------------------------------------------------------------
1 | 3 9
2 | 5 4
3 | 6 5
4 | 3 2
5 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/data/ks_lecture_dp_2:
--------------------------------------------------------------------------------
1 | 4 7
2 | 16 2
3 | 19 3
4 | 23 4
5 | 28 5
6 |
7 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/9. Discrete optimization with RL/knapsack_problem/knapsack/handout.pdf
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/solver.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 |
4 | from collections import namedtuple
5 | Item = namedtuple("Item", ['index', 'value', 'weight'])
6 |
7 | def solve_it(input_data):
8 | # Modify this code to run your optimization algorithm
9 |
10 | # parse the input
11 | lines = input_data.split('\n')
12 |
13 | firstLine = lines[0].split()
14 | item_count = int(firstLine[0])
15 | capacity = int(firstLine[1])
16 |
17 | items = []
18 |
19 | for i in range(1, item_count+1):
20 | line = lines[i]
21 | parts = line.split()
22 | items.append(Item(i-1, int(parts[0]), int(parts[1])))
23 |
24 | # a trivial algorithm for filling the knapsack
25 | # it takes items in-order until the knapsack is full
26 | value = 0
27 | weight = 0
28 | taken = [0]*len(items)
29 |
30 | for item in items:
31 | if weight + item.weight <= capacity:
32 | taken[item.index] = 1
33 | value += item.value
34 | weight += item.weight
35 |
36 | # prepare the solution in the specified output format
37 | output_data = str(value) + ' ' + str(0) + '\n'
38 | output_data += ' '.join(map(str, taken))
39 | return output_data
40 |
41 |
42 | if __name__ == '__main__':
43 | import sys
44 | if len(sys.argv) > 1:
45 | file_location = sys.argv[1].strip()
46 | with open(file_location, 'r') as input_data_file:
47 | input_data = input_data_file.read()
48 | print(solve_it(input_data))
49 | else:
50 | print('This test requires an input file. Please select one from the data directory. (i.e. python solver.py ./data/ks_4_0)')
51 |
52 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/knapsack_problem/knapsack/solverJava.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | from subprocess import Popen, PIPE
6 |
7 | def solve_it(input_data):
8 |
9 |     # Writes the input data to a temporary file
10 |
11 | tmp_file_name = 'tmp.data'
12 | tmp_file = open(tmp_file_name, 'w')
13 | tmp_file.write(input_data)
14 | tmp_file.close()
15 |
16 | # Runs the command: java Solver -file=tmp.data
17 |
18 | process = Popen(['java', 'Solver', '-file=' + tmp_file_name], stdout=PIPE, universal_newlines=True)
19 | (stdout, stderr) = process.communicate()
20 |
21 |     # Removes the temporary file
22 | os.remove(tmp_file_name)
23 |
24 | return stdout.strip()
25 |
26 |
27 | import sys
28 |
29 | if __name__ == '__main__':
30 | if len(sys.argv) > 1:
31 | file_location = sys.argv[1].strip()
32 | with open(file_location, 'r') as input_data_file:
33 | input_data = input_data_file.read()
34 | print(solve_it(input_data))
35 | else:
36 | print('This test requires an input file. Please select one from the data directory. (i.e. python solver.py ./data/ks_4_0)')
37 |
38 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/lessons/README.md:
--------------------------------------------------------------------------------
1 | # Personal notes on Discrete Optimization
2 |
3 | > These notes were taken during the Coursera course on Discrete Optimization
4 |
5 | https://www.coursera.org/learn/discrete-optimization/home/welcome
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/lessons/discrete_optimization.md:
--------------------------------------------------------------------------------
1 | # Discrete Optimization
2 |
3 |
4 | - The goal of optimization is to find the optimal, or at least a high-quality, solution in a reasonable amount of time, even when the number of possible solutions grows exponentially
5 |
6 |
7 | ## How to solve an optimization problem?
8 | - Formalization of the mathematical model
9 | - Start with a greedy algorithm
10 |
11 |
12 | ## Formalizing an optimization task
13 | **How to model an optimization problem?**
14 | Agreeing on a mathematical formulation of the problem.
15 | - Choose some decision variables (typically encode the result we are interested in)
16 | - Express the problem constraints in terms of these variables (what the feasible solutions to the problem are)
17 | - Express the objective function to be maximized (specifying the quality of a solution)
18 |
19 | > There can be many ways to model an optimization problem
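As an illustration (not from the course, just instantiating the three steps above on the knapsack problem covered later in these lessons), a possible model could look like:

```
Decision variables : x1,...,xn in {0,1}      (xi = 1 if item i is selected)
Constraint         : sum(wi * xi) <= K       (the selected items must fit in the knapsack of capacity K)
Objective          : maximize sum(vi * xi)   (take the most valuable feasible selection)
```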
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/lessons/dynamic_programming.md:
--------------------------------------------------------------------------------
1 | # Dynamic Programming
2 | 
3 |
4 | ## What is dynamic programming ?
5 | **A widely used optimization technique**
6 | - for certain classes of problems
7 | - heavily used in computational biology
8 |
9 | **Basic principle**
10 | - Divide and conquer
11 | - Bottom-up computation
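A minimal illustration of the bottom-up idea (a toy example, not from the course): computing Fibonacci numbers by filling a table from the smallest subproblems up instead of recursing from the top.

```python
def fib_bottom_up(n):
    """Compute the n-th Fibonacci number by filling a table bottom-up."""
    if n < 2:
        return n
    table = [0, 1]
    for i in range(2, n + 1):
        # Each entry only depends on already-computed smaller subproblems
        table.append(table[i - 1] + table[i - 2])
    return table[n]

print(fib_bottom_up(10))  # 55
```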
12 |
13 |
14 |
15 |
16 |
17 | ## 📚 References
18 | - [Wikipedia homepage](https://en.wikipedia.org/wiki/Dynamic_programming)
--------------------------------------------------------------------------------
/9. Discrete optimization with RL/lessons/knapsack_problem.md:
--------------------------------------------------------------------------------
1 | # Knapsack Problem
2 | 
3 |
4 |
5 | ## 📝 Conventions & notations
6 | - I = {1,2,...,n}
7 | - O(k,j) denotes the value of an optimal solution to the knapsack problem with capacity k and items [1,...,j]. This is what we want to compute.
8 |
9 | ## 👜 Modeling the Knapsack Problem
10 |
11 | ### Defining the problem
12 | - **Variables**
13 | - Decision variables
14 | - ``xi`` denotes whether the item i is selected in the solution
15 | - Other variables
16 | - ``wi`` denotes the weight of the item i
17 | - ``vi`` denotes the value of the item i
18 | - **Problem constraint**
19 | - The selected items cannot exceed the capacity of the knapsack: ``sum(wi*xi) <= K``
20 | - **Objective function**
21 | - We want to maximize ``sum(vi*xi)``
22 |
23 |
24 | ### Number of configurations
25 | - How many possible configurations of 1 and 0 for ``(x1,x2,...,xn)`` ? -> Search space
26 | - Not all of them are feasible -> Feasible search space
27 | - How many are there? ``2^n`` -> exponential growth -> brute force is not possible for more than a few dozen items (see the sketch below)
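For intuition, a quick brute-force sketch that enumerates all ``2^n`` configurations, assuming items are given as (value, weight) pairs as in the ``data`` files of this folder; it is only feasible for very small instances:

```python
from itertools import product

def brute_force(items, capacity):
    """Enumerate all 2^n selections and keep the best feasible one."""
    best = 0
    for selection in product([0, 1], repeat=len(items)):
        weight = sum(w for x, (v, w) in zip(selection, items) if x)
        value = sum(v for x, (v, w) in zip(selection, items) if x)
        if weight <= capacity:
            best = max(best, value)
    return best

# ks_lecture_dp_1 instance: capacity 9, items given as (value, weight)
print(brute_force([(5, 4), (6, 5), (3, 2)], 9))  # 11
```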
28 |
29 |
30 | ## 🤗 Greedy algorithms
31 |
32 | ### Greedy algorithms to solve the knapsack problem
33 | 1. Take the lightest items first
34 | 2. Take the most valuable items first
35 | 3. Compute the value density ratio (value/weight) and take the items with the highest ratio first
36 |
37 | For one problem, **there are many greedy algorithms**, with no guarantee of optimality: the quality really depends on the input. But a greedy algorithm is quick to implement, often fast to run, and serves as a baseline (a minimal sketch follows below).
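A minimal sketch of the third strategy (value density), again assuming (value, weight) items; on the small ``ks_lecture_dp_1`` instance it is not optimal, which illustrates the lack of guarantee:

```python
def greedy_by_density(items, capacity):
    """Take items by decreasing value/weight ratio while they still fit."""
    value, weight, taken = 0, 0, []
    for v, w in sorted(items, key=lambda it: it[0] / it[1], reverse=True):
        if weight + w <= capacity:
            taken.append((v, w))
            value, weight = value + v, weight + w
    return value, taken

print(greedy_by_density([(5, 4), (6, 5), (3, 2)], 9))  # (8, [(3, 2), (5, 4)]) while the optimum is 11
```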
38 |
39 | ### Advantages
40 | - Quick to design and implement
41 | - Can be very fast
42 |
43 | ### Problems
44 | - No quality guarantee
45 | - Quality can vary widely depending on the input
46 | - Building a feasible solution needs to be easy
47 |
48 |
49 |
50 |
51 | ## ⚡ Dynamic Programming
52 | ### Recurrence relations (Bellman equations)
53 | We want to solve O(k,j) by recurrence:
54 | - Assume we know how to solve ``O(k,j-1)`` for all k, and we want to solve ``O(k,j)`` by adding one more item: the item ``j``
55 | - If ``wj <= k`` there are two cases:
56 | - Either we don't select item j and the best solution is then ``O(k,j-1)``
57 | - Or we select item j and the best solution is ``vj + O(k-wj,j-1)``
58 | - Or written mathematically
59 | ```
60 | - O(k,j) = max(O(k,j-1),vj + O(k-wj,j-1)) if wj <=k
61 | - O(k,j) = O(k,j-1) otherwise
62 | ```
63 | - And of course ``O(k,0) = 0`` for all k (there are no items, there is no value)
64 |
65 | ### Recursive function in Python
66 | ```python
67 | # Example data from ks_lecture_dp_1, 1-indexed so that w[j] and v[j] match the O(k,j) notation
68 | w = [None, 4, 5, 2]   # weights
69 | v = [None, 5, 6, 3]   # values
70 |
71 | def O(k,j):
72 |     if j == 0:
73 |         return 0
74 |     elif w[j] <= k:
75 |         return max(O(k,j-1), v[j] + O(k-w[j],j-1))
76 |     else:
77 |         return O(k,j-1)
78 | ```
79 | How efficient is this approach? Not very if we go top-down: the same subproblems get recomputed over and over, which is often the case with naive recursive functions.
80 | That's why dynamic programming is all about the bottom-up approach.
81 |
82 | ### Bottom-up computation
83 | - Compute the recursive equations bottom up
84 | - Start with zero items
85 | - Add one more item, then two ...
86 |
87 | It often helps to think of it as a table (capacity x items)
88 |
89 | 
90 |
91 | - Building the table one by one using the formula
92 | - Tracing back to find the optimal solution
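A minimal sketch of this table-filling and trace-back, under the same (value, weight) assumptions; ``table[k][j]`` plays the role of O(k,j):

```python
def knapsack_bottom_up(items, capacity):
    """Fill the (capacity+1) x (n+1) table with the recurrence, then trace back the choices."""
    n = len(items)
    table = [[0] * (n + 1) for _ in range(capacity + 1)]
    for j, (v, w) in enumerate(items, start=1):
        for k in range(capacity + 1):
            if w <= k:
                table[k][j] = max(table[k][j - 1], v + table[k - w][j - 1])
            else:
                table[k][j] = table[k][j - 1]
    # Trace back: item j was taken iff it changed the optimal value
    taken, k = [0] * n, capacity
    for j in range(n, 0, -1):
        if table[k][j] != table[k][j - 1]:
            taken[j - 1] = 1
            k -= items[j - 1][1]
    return table[capacity][n], taken

print(knapsack_bottom_up([(5, 4), (6, 5), (3, 2)], 9))  # (11, [1, 1, 0])
```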
93 |
94 | ### Efficiency
95 | - Complexity of the algorithm -> the time to fill the table, i.e. O(Kn). We could think it's polynomial, but not exactly
96 | - It's actually exponential in the input size, because K is represented in a computer with log(K) bits. Such algorithms are called pseudo-polynomial: they are only efficient when K is small
97 |
98 |
99 |
100 | ## 🌴 Branch, bound & relaxation
101 | Exhaustive search basically builds a decision tree with 2^n branches. Branch & bound explores this tree without computing all of its nodes, using relaxation to bound what a branch can achieve. We iterate two steps:
102 | - **Branching** (splitting the problem into a number of subproblems like in exhaustive search)
103 | - **Bounding** (finding an optimistic estimate of the best solution to the subproblem, maximization = upper bound & minimization = lower bound)
104 |
105 | ### How to find an optimistic estimate? How can I relax my problem?
106 | > - We relax a constraint
107 | > - Build the tree and evaluate an optimistic estimate
108 | > - If a branch's optimistic estimate is lower than the best solution found so far, we don't even need to go further in that branch and we can prune it.
109 |
110 | *Branching & bounding can be done in a lot of different ways, see the Search strategies section*
111 |
112 | ### What can we relax in the knapsack problem?
113 | - The capacity constraint -> take everything in the knapsack
114 | - The selection variables: we can imagine taking a fraction of each item (xi can now take any value between 0 and 1), this is called **linear relaxation**
115 |
116 | Linear relaxation for the knapsack problem works by:
117 | - Sorting the items by value density ratio and taking them greedily, in that order, as long as they fit
118 | - Filling the rest of the knapsack with a fraction of the next item that only partially fits; the resulting value is an optimistic estimate that can be used for pruning (see the sketch below)
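A minimal sketch of this optimistic estimate (same assumptions on the item format); because fractions are allowed, the result is always at least as good as the best 0/1 solution:

```python
def linear_relaxation_bound(items, capacity):
    """Upper bound: fill by decreasing value density, allowing a fraction of the last item."""
    bound, remaining = 0.0, capacity
    for v, w in sorted(items, key=lambda it: it[0] / it[1], reverse=True):
        if w <= remaining:
            bound, remaining = bound + v, remaining - w
        else:
            bound += v * remaining / w  # only a fraction of this item fits
            break
    return bound

print(linear_relaxation_bound([(5, 4), (6, 5), (3, 2)], 9))  # 11.6, while the true optimum is 11
```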
119 |
120 |
121 | ## 🔍 Search strategies
122 |
123 | ### Depth-first
124 | Prunes a node when its optimistic estimate is worse than the best solution found so far
125 | - Go deep
126 | - When does it prune? When a node's optimistic estimate is worse than the best solution found so far
127 | - Is it memory efficient? Yes, since only the current branch needs to be kept in memory (see the sketch below)
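A minimal depth-first branch & bound sketch combining the ideas above (same (value, weight) assumptions): at each node we either take or skip the next item, and we prune a branch when its optimistic linear-relaxation estimate cannot beat the best value found so far.

```python
def branch_and_bound(items, capacity):
    """Depth-first search over take/skip decisions, pruned with a linear relaxation bound."""
    # Sort once by value density so the relaxation bound is cheap to compute at every node
    order = sorted(items, key=lambda it: it[0] / it[1], reverse=True)
    best = [0]

    def bound(j, remaining):
        # Optimistic estimate for the remaining items order[j:], allowing a fractional last item
        estimate = 0.0
        for v, w in order[j:]:
            if w <= remaining:
                estimate, remaining = estimate + v, remaining - w
            else:
                return estimate + v * remaining / w
        return estimate

    def dfs(j, value, remaining):
        best[0] = max(best[0], value)
        if j == len(order) or value + bound(j, remaining) <= best[0]:
            return  # prune: even the optimistic estimate cannot beat the best solution found
        v, w = order[j]
        if w <= remaining:
            dfs(j + 1, value + v, remaining - w)  # branch: take item j
        dfs(j + 1, value, remaining)              # branch: skip item j

    dfs(0, 0, capacity)
    return best[0]

print(branch_and_bound([(5, 4), (6, 5), (3, 2)], 9))  # 11
```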
128 |
129 | ### Best-first
130 | Select the node with the best estimation
131 | - Go for the best
132 | - When does it prune? When all the remaining nodes are worse than a found solution
133 | - Is it memory efficient? Not in the worst case: if we exaggerate and think of a knapsack with infinite capacity, we would compute and store the entire tree, so infinite time and space would be required. When the problem is small, it can be efficient.
134 |
135 |
136 | ### Least discrepancy or limited discrepancy search
137 | Trust a greedy heuristic
138 | - Assume a good heuristic is available
139 | - It makes very few mistakes
140 | - Search tree is binary
141 | - Following the heuristic means branching left; branching right means the heuristic was wrong
142 | - Limited Discrepancy Search (LDS)
143 | - Avoid mistakes at all costs
144 | - Explore the search space in increasing order of mistakes
145 | - Trusting the heuristic less and less
146 |
147 | We explore the search space in waves, trusting the heuristic less and less. Its efficiency really depends on a trade-off between space and time.
148 |
149 |
150 | ### And many other search strategies
151 |
152 |
153 |
154 |
155 | ## 📚 References
156 | - [Wikipedia page on Knapsack problem](https://en.wikipedia.org/wiki/Knapsack_problem)
157 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning
2 |
3 | 
4 |
5 | ##### Realizations
6 | - Old experiments on RL (2016)
7 | - Solving OpenAI Gym environments (2017-2018)
8 | - Developing a multi-agent Tic Tac Toe environment and solving it with Policy Gradients (May 2017)
9 | - Using RL to automatically adapt the cooling in a Data Center (August 2017)
10 | - Controlling Robots via Reinforcement Learning (November 2017)
11 | - Playing and solving the Chrome Dinosaur Game with Evolution Strategies and PyTorch (January 2018)
12 | - Delivery optimization using Reinforcement Learning (January 2019)
13 | - Rubik's Cube optimization (February 2019)
14 | - Multi-Agents simulations (November 2019)
15 |
16 |
17 | ##### Libraries
18 | - ``rl`` is a simple library to do Reinforcement Learning with Keras; it relies on old Keras versions and should be updated
19 | - ``hyperion`` is a simple multi-agent simulation library
20 |
21 |
22 | ***
23 | ### References and inspiration
24 | ###### RL references
25 |
26 | - [Udemy course on RL](https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python/)
27 | - [David Silver course on RL at UCL](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html)
28 | - [Berkeley course on AI](http://ai.berkeley.edu/lecture_slides.html)
29 | - [Spinning up course by OpenAI](https://spinningup.openai.com/en/latest/)
30 |
31 |
32 | ##### Q Learning references
33 | - [Q Learning tutorial by Arthur Juliani](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0)
34 | - [Q Learning tutorial on Keon.io](https://keon.io/deep-q-learning/)
35 | - [Q Learning tutorial by Udacity](https://github.com/udacity/deep-learning/blob/master/reinforcement/Q-learning-cart.ipynb)
36 |
37 |
38 | ##### Deep Q Learning
39 | - [David Silver's Deep Q Learning course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Resources_files/deep_rl.pdf)
40 | - [Demystifying Deep Reinforcement Learning](http://neuro.cs.ut.ee/demystifying-deep-reinforcement-learning/)
41 | - [Siraj Raval's notebook on Deep Q Learning](https://github.com/llSourcell/deep_q_learning/blob/master/03_PlayingAgent.ipynb)
42 |
43 | ##### Policy Gradient
44 | - [Deep Reinforcement Learning: Pong from Pixels](http://karpathy.github.io/2016/05/31/rl/) Andrej Karpathy's blog article on RL (always a reference)
45 |
46 |
47 |
48 | ##### Evolution strategies
49 | - [Evolution strategies](https://blog.openai.com/evolution-strategies/) - OpenAI
50 | - [How evolution taught us the “genetic algorithm”](https://blog.sicara.com/was-darwin-a-great-computer-scientist-81ffa1dd72f9)
51 | - [Making a robot learn how to move, part 1 — Evolutionary algorithms](https://medium.com/towards-data-science/making-a-robot-learn-how-to-move-part-1-evolutionary-algorithms-340f239c9cd2)
52 | - [Optimize a quadratic function with ES](https://gist.github.com/karpathy/77fbb6a8dac5395f1b73e7a89300318d) - Andrej Karpathy
53 | - [Evolution modelling with creatures](https://www.youtube.com/watch?v=GOFws_hhZs8)
54 | - [Genetic biped walkers](http://rednuht.org/genetic_walkers/)
55 | - [Evolving stable strategies](http://blog.otoro.net/2017/11/12/evolving-stable-strategies/)
56 |
57 | ##### Actor Critic, A2C, ACKTR
58 | - [A3C tutorial by Arthur Juliani](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2)
59 | - [A3C tutorial with Keras and OpenAI](http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html)
60 | - [A3C explanations and implementations](https://mpatacchiola.github.io/blog/2017/02/11/dissecting-reinforcement-learning-4.html)
61 | - [ACKTR & A2C](https://blog.openai.com/baselines-acktr-a2c) - by OpenAI
62 | - [ACKTR & A3C implementation in PyTorch](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr)
63 | - [Actor Critic model with Keras](https://towardsdatascience.com/reinforcement-learning-w-keras-openai-actor-critic-models-f084612cfd69)
64 | - [Car Racing solving with A3C](https://fr.scribd.com/document/358019044/Reinforcement-Car-Racing-with-A3C) and [this solution as well](https://web.stanford.edu/class/cs221/2017/restricted/p-final/elibol/final.pdf)
65 |
66 | ##### PPO, TRPO
67 | - [Proximal Policy Optimization](https://blog.openai.com/openai-baselines-ppo/) - by OpenAI
68 | - [PPO,TRPO tutorials](https://learningai.io/projects/2017/07/28/ai-gym-workout.html)
69 |
70 |
71 |
72 | ##### AlphaGo
73 | - [ELI5 MCTS](https://www.reddit.com/r/explainlikeimfive/comments/4aimqo/eli5_alpha_go_and_its_decision_making_process/)
74 | - [How AlphaGo works](https://www.tastehit.com/blog/google-deepmind-alphago-how-it-works/)
75 | - [Original Paper for AlphaGo](http://airesearch.com/wp-content/uploads/2016/01/deepmind-mastering-go.pdf) by David Silver
76 |
77 |
78 | ##### Monte Carlo Tree Search
79 | - [Udacity videos on MCTS](https://www.youtube.com/watch?v=onBYsen2_eA)
80 |
81 |
82 | ##### Misc
83 | - [Learning to optimize with RL](http://bair.berkeley.edu/blog/2017/09/12/learning-to-optimize-with-rl/)
84 |
85 |
86 | ##### Environment
87 | - [Unity Agents](https://blogs.unity3d.com/2017/09/19/introducing-unity-machine-learning-agents/)
88 | - [SerpentAI](https://github.com/SerpentAI/SerpentAI)
89 | - [Pybullet](https://docs.google.com/document/d/10sXEhzFRSnvFcl3XxNGhnD4N2SedqwdAvK3dsihxVUA/edit)
90 |
91 | ***
92 | ### Papers
93 |
94 | - [Discrete Sequential Prediction of Continuous Actions for Deep RL](https://arxiv.org/abs/1705.05035)
95 | - [Emotion in Reinforcement Learning Agents and Robots: A Survey](https://arxiv.org/abs/1705.05172)
96 | - [Combating Reinforcement Learning's Sisyphean Curse with Intrinsic Fear](https://arxiv.org/abs/1611.01211)
97 | - [Curiosity-driven Exploration by Self-supervised Prediction](https://arxiv.org/abs/1705.05363)
98 | - [End-to-end optimization of goal-driven and visually grounded dialogue systems](https://arxiv.org/abs/1703.05423)
99 | - [Deep reinforcement learning from human preferences](https://arxiv.org/abs/1706.03741) - OpenAI
100 | - [Programmable Agents](https://arxiv.org/abs/1706.06383) - Deepmind
101 | - [Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments](https://arxiv.org/pdf/1706.02275.pdf) - OpenAI
102 | - [Actor-Critic Reinforcement Learning with Simultaneous Human Control and Feedback](https://arxiv.org/abs/1703.01274)
103 | - [Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295)
104 | - [Hindsight Experience Replay](https://arxiv.org/abs/1707.01495)
105 | - [DARLA: Improving Zero-Shot Transfer in Reinforcement Learning](https://arxiv.org/pdf/1707.08475.pdf)
106 | - [Leveraging Demonstrations for Deep Reinforcement Learning on Robotics Problems with Sparse Rewards](https://arxiv.org/pdf/1707.08817.pdf)
107 | - [Evolution Strategies as a Scalable Alternative to Reinforcement Learning](https://arxiv.org/abs/1703.03864)
108 | - [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/abs/1707.06887)
109 | - [Intrinsically Motivated Goal Exploration Processes with Automatic Curriculum Learning](https://arxiv.org/abs/1708.02190?)
110 | - [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf)
111 | - [Value Iteration Networks](https://arxiv.org/pdf/1602.02867.pdf)
112 | - [A deep reinforcement learning chatbot](https://arxiv.org/pdf/1709.02349.pdf) - MILA
113 | - [The Uncertainty Bellman Equation and Exploration](https://arxiv.org/abs/1709.05380)
114 | - [Deep Reinforcement Learning that Matters](https://arxiv.org/abs/1709.06560)
115 | - [Overcoming Exploration in Reinforcement Learning with Demonstrations](https://arxiv.org/abs/1709.10089)
116 | - [Using Simulation and Domain Adaptation to Improve Efficiency of Deep Robotic Grasping](https://arxiv.org/abs/1709.07857)
117 | - [Rainbow: Combining Improvements in Deep Reinforcement Learning](https://arxiv.org/pdf/1710.02298.pdf)
118 | - [Optimizing Long Short-Term Memory Recurrent Neural Networks Using Ant Colony Optimization to Predict Turbine Engine Vibration](https://arxiv.org/pdf/1710.03753.pdf)
119 | - [Continuous Adaptation via Meta-Learning in Nonstationary and Competitive Environments](https://arxiv.org/pdf/1710.03641.pdf)
120 | - [Emergent Complexity via Multi-Agent Competition](https://arxiv.org/pdf/1710.03748.pdf)
121 | - [A Unified Game-Theoretic Approach to Multiagent Reinforcement Learning](https://arxiv.org/pdf/1711.00832.pdf)
122 |
--------------------------------------------------------------------------------
/rl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/rl/__init__.py
--------------------------------------------------------------------------------
/rl/agents/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/rl/agents/__init__.py
--------------------------------------------------------------------------------
/rl/agents/actor_critic_agent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 | Inspiration from https://keon.io/deep-q-learning/
11 | https://towardsdatascience.com/reinforcement-learning-w-keras-openai-actor-critic-models-f084612cfd69
12 |
13 | theo.alves.da.costa@gmail.com
14 | https://github.com/theolvs
15 | ------------------------------------------------------------------------
16 | """
17 |
18 |
19 |
20 | import os
21 | import matplotlib.pyplot as plt
22 | import pandas as pd
23 | import numpy as np
24 | import sys
25 | import random
26 | import time
27 | import random
28 | import numpy as np
29 |
30 | from keras.models import Sequential, Model
31 | from keras.layers import Dense, Dropout, Input
32 | from keras.layers.merge import Add, Multiply
33 | from keras.optimizers import Adam
34 | import keras.backend as K
35 | import tensorflow as tf
36 |
37 | from rl import utils
38 | from rl.memory import Memory
39 | from rl.agents.base_agent import Agent
40 |
41 |
42 |
43 | class ActorCriticAgent(Agent):
44 | def __init__(self,env,sess,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.995,gamma = 0.95,lr = 0.001,tau = 0.125,actor_activation = "linear"):
45 |
46 | # Main parameters
47 | self.env = env
48 | self.sess = sess
49 |
50 | # Other parameters
51 | self.memory = Memory()
52 | self.epsilon = epsilon
53 | self.epsilon_min = epsilon_min
54 | self.epsilon_decay = epsilon_decay
55 | self.gamma = gamma
56 | self.tau = tau
57 | self.lr = lr
58 |
59 | # Models
60 | self.initialize_actor_model(actor_activation)
61 | self.initialize_critic_model()
62 |
63 |
64 | def initialize_actor_model(self,actor_activation):
65 | self.actor_state_input, self.actor_model = self.build_actor_model(actor_activation)
66 | _, self.target_actor_model = self.build_actor_model(actor_activation)
67 |
68 | self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.env.action_space.shape[0]]) # where we will feed de/dC (from critic)
69 |
70 | actor_model_weights = self.actor_model.trainable_weights
71 | self.actor_grads = tf.gradients(self.actor_model.output, actor_model_weights, -self.actor_critic_grad) # dC/dA (from actor)
72 | grads = zip(self.actor_grads, actor_model_weights)
73 | self.optimize = tf.train.AdamOptimizer(self.lr).apply_gradients(grads)
74 |
75 |
76 |
77 |     def build_actor_model(self,activation = "linear"):
78 | # Define the layers of the network
79 | state_input = Input(shape=self.env.observation_space.shape)
80 | h1 = Dense(24, activation='relu')(state_input)
81 | h2 = Dense(48, activation='relu')(h1)
82 | h3 = Dense(24, activation='relu')(h2)
83 |         output = Dense(self.env.action_space.shape[0],activation=activation)(h3)
84 |
85 | # Compute the model
86 | model = Model(input=state_input, output=output)
87 | model.compile(loss="mse", optimizer=Adam(lr=self.lr))
88 | return state_input, model
89 |
90 |
91 | def initialize_critic_model(self):
92 | self.critic_state_input, self.critic_action_input, self.critic_model = self.build_critic_model()
93 | _, _, self.target_critic_model = self.build_critic_model()
94 |
95 |         self.critic_grads = tf.gradients(self.critic_model.output,self.critic_action_input) # where we calculate de/dC for feeding above
96 |
97 | # Initialize for later gradient calculations
98 | self.sess.run(tf.initialize_all_variables())
99 |
100 |
101 |
102 |
103 | def build_critic_model(self):
104 | state_input = Input(shape=self.env.observation_space.shape)
105 | state_h1 = Dense(24, activation='relu')(state_input)
106 | state_h2 = Dense(48)(state_h1)
107 |
108 | action_input = Input(shape=self.env.action_space.shape)
109 | action_h1 = Dense(48)(action_input)
110 |
111 | merged = Add()([state_h2, action_h1])
112 | merged_h1 = Dense(24, activation='relu')(merged)
113 | output = Dense(1, activation='relu')(merged_h1)
114 | model = Model(input=[state_input,action_input], output=output)
115 |
116 | model.compile(loss="mse", optimizer=Adam(lr=self.lr))
117 | return state_input, action_input, model
118 |
119 |
120 |
121 |
122 |
123 |
124 | def train(self,batch_size = 32):
125 | if self.epsilon > self.epsilon_min:
126 | self.epsilon *= self.epsilon_decay
127 |
128 | if len(self.memory.cache) > batch_size:
129 | batch = random.sample(self.memory.cache, batch_size)
130 | else:
131 | batch = self.memory.cache
132 |
133 | self._train_actor(batch)
134 | self._train_critic(batch)
135 |
136 |
137 |
138 |
139 |
140 | def _train_actor(self,batch):
141 | for state,action,reward,next_state,_ in batch:
142 | predicted_action = self.actor_model.predict(state)
143 | grads = self.sess.run(self.critic_grads, feed_dict={
144 | self.critic_state_input: state,
145 | self.critic_action_input: predicted_action
146 | })[0]
147 |
148 | self.sess.run(self.optimize, feed_dict={
149 | self.actor_state_input: state,
150 | self.actor_critic_grad: grads
151 | })
152 |
153 |
154 |
155 | def _train_critic(self,batch):
156 | for state,action,reward,next_state,done in batch:
157 | if not done:
158 | target_action = self.target_actor_model.predict(next_state)
159 | future_reward = self.target_critic_model.predict([next_state, target_action])[0][0]
160 | reward += self.gamma * future_reward
161 | self.critic_model.fit([state, action], reward, verbose=0)
162 |
163 |
164 |
165 | def _update_actor_target(self):
166 | actor_model_weights = self.actor_model.get_weights()
167 |         actor_target_weights = self.target_actor_model.get_weights()
168 |
169 | for i in range(len(actor_target_weights)):
170 | actor_target_weights[i] = actor_model_weights[i]
171 |         self.target_actor_model.set_weights(actor_target_weights)
172 |
173 |
174 | def _update_critic_target(self):
175 | critic_model_weights = self.critic_model.get_weights()
176 |         critic_target_weights = self.target_critic_model.get_weights()
177 |
178 | for i in range(len(critic_target_weights)):
179 | critic_target_weights[i] = critic_model_weights[i]
180 |         self.target_critic_model.set_weights(critic_target_weights)
181 |
182 |
183 | def update_target(self):
184 | self._update_actor_target()
185 | self._update_critic_target()
186 |
187 |
188 |
189 |
190 | def act(self, state):
191 |
192 |
193 |
194 |
195 | if np.random.random() < self.epsilon:
196 | return self.env.action_space.sample()
197 | return self.actor_model.predict(state)
--------------------------------------------------------------------------------
/rl/agents/base_agent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 |
15 |
16 | import os
17 | import matplotlib.pyplot as plt
18 | import pandas as pd
19 | import numpy as np
20 | import sys
21 | import random
22 | import time
23 | import random
24 | import numpy as np
25 |
26 |
27 |
28 |
29 |
30 | class Agent(object):
31 | def __init__(self):
32 | pass
33 |
34 |
35 | def expand_state_vector(self,state):
36 | if len(state.shape) == 1 or len(state.shape)==3:
37 | return np.expand_dims(state,axis = 0)
38 | else:
39 | return state
40 |
41 |
42 |
43 | def remember(self,*args):
44 | self.memory.save(args)
45 |
46 |
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/rl/agents/dqn2d_agent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 19/10/2018
9 |
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 |
15 |
16 |
17 | import os
18 | import matplotlib.pyplot as plt
19 | import pandas as pd
20 | import numpy as np
21 | import sys
22 | import random
23 | import time
24 | import random
25 | import numpy as np
26 |
27 | from keras.models import Sequential
28 | from keras.layers import Dense
29 | from keras.optimizers import Adam
30 |
31 | from keras.layers import Input, LSTM, Dense, Conv2D, MaxPooling2D, Dropout, Flatten
32 | from keras.layers import concatenate
33 | from keras.models import Model
34 | from keras.utils import plot_model,to_categorical
35 |
36 | from rl import utils
37 | from rl.memory import Memory
38 | from rl.agents.base_agent import Agent
39 | from rl.agents.dqn_agent import DQNAgent
40 |
41 |
42 |
43 |
44 |
45 | def create_vision_model(input_shape):
46 | input_image = Input(shape=input_shape)
47 | conv1 = Conv2D(32,(3,3),padding="same",activation="relu")(input_image)
48 | pool1 = MaxPooling2D(pool_size=(2,2))(conv1)
49 | drop1 = Dropout(0.25)(pool1)
50 |
51 | conv2 = Conv2D(64,(3,3),padding="same",activation="relu")(drop1)
52 | pool2 = MaxPooling2D(pool_size=(2,2))(conv2)
53 | drop2 = Dropout(0.25)(pool2)
54 |
55 | out = Flatten()(drop2)
56 |
57 | vision_model = Model(inputs=input_image, outputs=out)
58 | return vision_model
59 |
60 |
61 | def create_model(input_shape,output_dim):
62 |
63 | input1 = Input(shape=input_shape)
64 | input2 = Input(shape=input_shape)
65 |
66 | vision_model = create_vision_model(input_shape)
67 |
68 | out1 = vision_model(input1)
69 | out2 = vision_model(input2)
70 |
71 | concatenated = concatenate([out1,out2])
72 |
73 | hidden = Dense(128, activation='relu')(concatenated)
74 | output = Dense(output_dim, activation='softmax')(hidden)
75 |
76 | model = Model([input1, input2], output)
77 |
78 | return model
79 |
80 |
81 |
82 |
83 |
84 | class DQN2DAgent(DQNAgent):
85 |
86 |
87 |
88 | def build_model(self,states_size,actions_size):
89 | model = create_model(states_size,actions_size)
90 | model.compile(loss='categorical_crossentropy',
91 | metrics=['accuracy'],
92 | optimizer="adam")
93 | return model
94 |
95 |
96 |
97 | def train(self,batch_size = 32):
98 | if len(self.memory.cache) > batch_size:
99 | batch = random.sample(self.memory.cache, batch_size)
100 | else:
101 | batch = self.memory.cache
102 |
103 | # Unzip batch
104 | states,actions,rewards,next_states,before_states,dones = zip(*batch)
105 |
106 | # Concat states
107 | states = np.vstack(states)
108 | next_states = np.vstack(next_states)
109 | before_states = np.vstack(before_states)
110 |
111 | # Compute targets
112 | targets = self.model.predict([before_states,states])
113 |
114 | # Compute new targets
115 | rewards = np.array(rewards).reshape(-1,1)
116 | dones = 1-np.array(dones,dtype=np.int32).reshape(-1,1)
117 |         predictions = (self.gamma * np.max(self.model.predict([states,next_states]),axis = 1)).reshape(-1,1)
118 | new_targets = rewards + dones * predictions
119 | new_targets = new_targets.astype("float32")
120 |
121 | # Correct targets
122 | actions = to_categorical(np.array(actions).reshape(-1,1),self.actions_size)
123 | np.place(targets,actions,new_targets)
124 |
125 | # Training
126 |         self.model.fit([before_states,states],targets,epochs = 1,verbose = 0)
127 |
128 | if self.epsilon > self.epsilon_min:
129 | self.epsilon *= self.epsilon_decay
130 |
131 |
132 |
133 |
134 |
135 | def act(self,before_state,state):
136 | before_state = self.expand_state_vector(before_state)
137 | state = self.expand_state_vector(state)
138 |
139 |
140 | if np.random.rand() > self.epsilon:
141 | q = self.model.predict([before_state,state])
142 |
143 | if self.observation_type == "discrete":
144 | a = np.argmax(q[0])
145 | elif self.observation_type == "continuous":
146 | a = np.squeeze(np.clip(q,self.low,self.high))
147 |
148 | else:
149 | if self.observation_type == "discrete":
150 | a = np.random.randint(self.actions_size)
151 | elif self.observation_type == "continuous":
152 | a = np.random.uniform(self.low,self.high,self.actions_size)
153 | return a
154 |
155 |
156 |
--------------------------------------------------------------------------------
/rl/agents/dqn_agent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 | Inspiration from https://keon.io/deep-q-learning/
11 |
12 | theo.alves.da.costa@gmail.com
13 | https://github.com/theolvs
14 | ------------------------------------------------------------------------
15 | """
16 |
17 |
18 |
19 | import os
20 | import matplotlib.pyplot as plt
21 | import pandas as pd
22 | import numpy as np
23 | import sys
24 | import random
25 | import time
26 | import random
27 | import numpy as np
28 |
29 | from keras.models import Sequential
30 | from keras.layers import Dense
31 | from keras.optimizers import Adam
32 |
33 |
34 | from rl import utils
35 | from rl.memory import Memory
36 | from rl.agents.base_agent import Agent
37 |
38 |
39 |
40 | class DQNAgent(Agent):
41 | def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.995,gamma = 0.95,lr = 0.001,low = 0,high = 1,max_memory = 2000,observation_type = "discrete"):
42 | assert observation_type in ["discrete","continuous"]
43 | self.states_size = states_size
44 | self.actions_size = actions_size
45 | self.memory = Memory(max_memory = max_memory)
46 | self.epsilon = epsilon
47 | self.low = low
48 | self.high = high
49 | self.observation_type = observation_type
50 | self.epsilon_min = epsilon_min
51 | self.epsilon_decay = epsilon_decay
52 | self.gamma = gamma
53 | self.lr = lr
54 | self.model = self.build_model(states_size,actions_size)
55 |
56 |
57 |
58 |
59 |
60 | def build_model(self,states_size,actions_size):
61 | model = Sequential()
62 | model.add(Dense(24,input_dim = states_size,activation = "relu"))
63 | model.add(Dense(24,activation = "relu"))
64 | model.add(Dense(actions_size,activation = "linear"))
65 | model.compile(loss='mse',
66 | optimizer=Adam(lr=self.lr))
67 | return model
68 |
69 |
70 |
71 |
72 |
73 |
74 | def train(self,batch_size = 32):
75 | if len(self.memory.cache) > batch_size:
76 | batch = random.sample(self.memory.cache, batch_size)
77 | else:
78 | batch = self.memory.cache
79 |
80 | for state,action,reward,next_state,done in batch:
81 | state = self.expand_state_vector(state)
82 | next_state = self.expand_state_vector(next_state)
83 |
84 |
85 | targets = self.model.predict(state)
86 |
87 | if not done:
88 | target = reward + self.gamma * np.max(self.model.predict(next_state))
89 | else:
90 | target = reward
91 |
92 | targets[0][action] = target
93 |
94 | self.model.fit(state,targets,epochs = 1,verbose = 0)
95 |
96 |
97 | if self.epsilon > self.epsilon_min:
98 | self.epsilon *= self.epsilon_decay
99 |
100 |
101 |
102 |
103 |
104 | def act(self,state):
105 | state = self.expand_state_vector(state)
106 |
107 |
108 | if np.random.rand() > self.epsilon:
109 | q = self.model.predict(state)
110 |
111 | if self.observation_type == "discrete":
112 | a = np.argmax(q[0])
113 | elif self.observation_type == "continuous":
114 | a = np.squeeze(np.clip(q,self.low,self.high))
115 |
116 | else:
117 | if self.observation_type == "discrete":
118 | a = np.random.randint(self.actions_size)
119 | elif self.observation_type == "continuous":
120 | a = np.random.uniform(self.low,self.high,self.actions_size)
121 | return a
122 |
123 |
124 |
--------------------------------------------------------------------------------
/rl/agents/q_agent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 |
11 | theo.alves.da.costa@gmail.com
12 | https://github.com/theolvs
13 | ------------------------------------------------------------------------
14 | """
15 |
16 |
17 |
18 | import os
19 | import matplotlib.pyplot as plt
20 | import pandas as pd
21 | import numpy as np
22 | import sys
23 | import random
24 | import time
25 | import random
26 | import numpy as np
27 |
28 |
29 |
30 |
31 | from rl import utils
32 | from rl.memory import Memory
33 | from rl.agents.base_agent import Agent
34 |
35 |
36 |
37 | class QAgent(Agent):
38 | def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.999,gamma = 0.95,lr = 0.8):
39 | self.states_size = states_size
40 | self.actions_size = actions_size
41 | self.epsilon = epsilon
42 | self.epsilon_min = epsilon_min
43 | self.epsilon_decay = epsilon_decay
44 | self.gamma = gamma
45 | self.lr = lr
46 | self.Q = self.build_model(states_size,actions_size)
47 |
48 |
49 | def build_model(self,states_size,actions_size):
50 | Q = np.zeros([states_size,actions_size])
51 | return Q
52 |
53 |
54 | def train(self,s,a,r,s_next):
55 |         self.Q[s,a] = self.Q[s,a] + self.lr * (r + self.gamma*np.max(self.Q[s_next,:]) - self.Q[s,a])
56 |
57 | if self.epsilon > self.epsilon_min:
58 | self.epsilon *= self.epsilon_decay
59 |
60 |
61 | def act(self,s):
62 |
63 | q = self.Q[s,:]
64 |
65 | if np.random.rand() > self.epsilon:
66 | a = np.argmax(q)
67 | else:
68 | a = np.random.randint(self.actions_size)
69 |
70 | return a
71 |
72 |
73 |
--------------------------------------------------------------------------------
/rl/agents/sarsa_agent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 |
11 | theo.alves.da.costa@gmail.com
12 | https://github.com/theolvs
13 | ------------------------------------------------------------------------
14 | """
15 |
16 |
17 |
18 | import os
19 | import matplotlib.pyplot as plt
20 | import pandas as pd
21 | import numpy as np
22 | import sys
23 | import random
24 | import time
25 | import random
26 | import numpy as np
27 |
28 |
29 |
30 |
31 | from rl import utils
32 | from rl.memory import Memory
33 | from rl.agents.base_agent import Agent
34 |
35 |
36 |
37 | class SarsaAgent(Agent):
38 | def __init__(self,states_size,actions_size,epsilon = 1.0,epsilon_min = 0.01,epsilon_decay = 0.999,gamma = 0.95,lr = 0.8):
39 | self.states_size = states_size
40 | self.actions_size = actions_size
41 | self.epsilon = epsilon
42 | self.epsilon_min = epsilon_min
43 | self.epsilon_decay = epsilon_decay
44 | self.gamma = gamma
45 | self.lr = lr
46 | self.Q = self.build_model(states_size,actions_size)
47 |
48 |
49 |
50 |
51 |
52 | def build_model(self,states_size,actions_size):
53 | Q = np.zeros([states_size,actions_size])
54 | return Q
55 |
56 |
57 |
58 |
59 |
60 |
61 | def train(self,s,a,r,s_next):
62 | a_next = self.act(s_next)
63 | self.Q[s,a] = self.Q[s,a] + self.lr * (r + self.gamma*self.Q[s_next,a_next] - self.Q[s,a])
64 |
65 | if self.epsilon > self.epsilon_min:
66 | self.epsilon *= self.epsilon_decay
67 |
68 |
69 |
70 |
71 | def act(self,s):
72 |
73 | q = self.Q[s,:]
74 |
75 | if np.random.rand() > self.epsilon:
76 | a = np.argmax(q)
77 | else:
78 | a = np.random.randint(self.actions_size)
79 |
80 | return a
81 |
82 |
83 |
--------------------------------------------------------------------------------
/rl/envs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TheoLvs/reinforcement-learning/c54f732d25c198b4daa3deccb4684bc847131cf2/rl/envs/__init__.py
--------------------------------------------------------------------------------
/rl/envs/data_center_cooling.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 | DATA CENTER COOLING
8 |
9 | Started on the 25/08/2017
10 |
11 |
12 | theo.alves.da.costa@gmail.com
13 | https://github.com/theolvs
14 | ------------------------------------------------------------------------
15 | """
16 |
17 |
18 | import os
19 | import matplotlib.pyplot as plt
20 | import pandas as pd
21 | import numpy as np
22 | import sys
23 | import random
24 | import time
25 | from tqdm import tqdm
26 | from collections import Counter
27 | from scipy import stats
28 |
29 | # Deep Learning (Keras, Tensorflow)
30 | import tensorflow as tf
31 | from keras.models import Sequential
32 | from keras.optimizers import SGD,RMSprop, Adam
33 | from keras.layers import Dense, Dropout, Activation, Flatten
34 | from keras.layers import MaxPooling2D,ZeroPadding2D,Conv2D
35 | from keras.utils.np_utils import to_categorical
36 |
37 |
38 | # Plotly
39 | import plotly.graph_objs as go
40 | from plotly import tools
41 |
42 | np.random.seed(1)
43 |
44 |
45 | #===========================================================================================================
46 | # COOLING CENTER ENVIRONMENT
47 | #===========================================================================================================
48 |
49 |
50 |
51 | class DataCenterCooling(object):
52 | def __init__(self,levels_activity = 20,levels_cooling = 10,cost_factor = 5,risk_factor = 1.6,keep_cooling = False):
53 |
54 | self.hour = 0
55 | self.cost_factor = cost_factor
56 | self.risk_factor = risk_factor
57 | self.levels_activity = levels_activity
58 | self.levels_cooling = levels_cooling
59 | self.define_activity(levels_activity)
60 | if not hasattr(self,"cooling") or not keep_cooling:
61 | self.define_cooling(levels_cooling)
62 |
63 |
64 | def define_activity(self,levels_activity):
65 | # Define the peaks of activity
66 | peak_morning = np.random.randint(7,10)
67 | peak_evening = np.random.randint(17,22)
68 |
69 | # Build the distribution
70 | x1 = np.array(stats.poisson.pmf(range(24),peak_morning))
71 | x2 = np.array(stats.poisson.pmf(range(24),peak_evening))
72 | x = x1 + x2
73 | x *= (100/0.14)
74 |
75 | # Discretize the distribution
76 | take_closest = lambda j,vector:min(vector,key=lambda x:abs(x-j))
77 | percentiles = np.percentile(x,range(0,100,int(100/levels_activity)))
78 | assert len(percentiles) == levels_activity
79 | x_disc = np.array([take_closest(y,percentiles) for y in x])
80 |
81 | # Store the variable
82 | self.observation_space = percentiles
83 | self.activity = np.expand_dims(x_disc,axis = 0)
84 |
85 |
86 |
87 | def define_cooling(self,levels_cooling):
88 | self.action_space = list([int(100/levels_cooling*i) for i in range(levels_cooling)])
89 | assert len(self.action_space) == levels_cooling
90 |
91 | initial_value = random.choice(self.action_space)
92 | self.cooling = np.full((1,24),initial_value)
93 |
94 |
95 |
96 | def reset(self):
97 |         self.__init__(self.levels_activity,self.levels_cooling,self.cost_factor,self.risk_factor)
98 | return self.reset_state()
99 |
100 | def reset_state(self):
101 | activity = self.activity[0][0]
102 | activity_state = self.convert_activity_to_state(activity)
103 | return activity_state
104 |
105 |
106 | def convert_activity_to_state(self,activity):
107 | state = int(np.where(self.observation_space == activity)[0][0])
108 | return state
109 |
110 |
111 |
112 | def render(self,with_plotly = False):
113 |
114 | rewards,winnings,losses,failures = self.compute_daily_rewards()
115 |
116 | if not with_plotly:
117 | # Show the activity and cooling
118 | plt.figure(figsize = (14,5))
119 | plt.plot(np.squeeze(self.activity),c ="red",label = "activity")
120 | plt.plot(np.squeeze(self.cooling),c = "blue",label = "cooling")
121 | plt.legend()
122 | plt.show()
123 |
124 | # Show the rewards
125 | plt.figure(figsize = (14,5))
126 | plt.title("Total reward : {}".format(int(np.sum(rewards))))
127 | plt.plot(rewards,c = "blue",label = "profits")
128 | plt.plot(losses*(-1),c = "red",label = "costs")
129 | plt.plot(winnings,c = "green",label = "revenues")
130 | plt.legend()
131 | plt.show()
132 | else:
133 | data_states = self.render_states_plotly()["data"]
134 | data_rewards = self.render_rewards_plotly()["data"]
135 | data_states
136 | fig = tools.make_subplots(rows=2, cols=1, specs=[[{}], [{}]],
137 | shared_xaxes=True, shared_yaxes=False,
138 | vertical_spacing=0.1)
139 |
140 | for i,trace in enumerate(data_rewards):
141 | fig.append_trace(trace, 2, 1)
142 |
143 | for i,trace in enumerate(data_states):
144 | fig.append_trace(trace, 1, 1)
145 |
146 | # print(len(failures))
147 | # print(len(rewards))
148 |
149 | # shapes = [{"type":"line","x0":hour+1,"y0":0,"x1":hour+1,"y1":failure} for hour,failure in enumerate(failures) if failure > 0]
150 | fig['layout'].update(title="Total reward : {}".format(int(np.sum(rewards))))
151 | fig['layout']['xaxis'].update(dtick = 1)
152 | # fig['layout'].update(shapes=shapes)
153 | return fig
154 |
155 |
156 | def render_states_plotly(self):
157 | # Create a trace
158 | x = list(range(24))
159 | trace_activity = go.Scatter(x = x,y = np.squeeze(self.activity),name = "activity",line = dict(color = "red",width = 2),ysrc = "activity")
160 | trace_cooling = go.Scatter(x = x,y = np.squeeze(self.cooling),name = "cooling",line = dict(color = "#34aac1",width = 2))
161 |
162 | data = [trace_activity,trace_cooling]
163 | fig = {"data":data}
164 | return fig
165 |
166 |
167 | def render_rewards_plotly(self):
168 | rewards,winnings,losses,failures = self.compute_daily_rewards()
169 | # Create a trace
170 | x = list(range(24))
171 | trace_rewards = go.Scatter(x = x,y = np.squeeze(rewards),name = "rewards",line = dict(color = "#34aac1",width = 2),ysrc = "rewards")
172 | trace_winnings = go.Scatter(x = x,y = np.squeeze(winnings),name = "revenues",line = dict(color = "#10c576",width = 1),mode = "lines+markers")
173 | trace_losses = go.Scatter(x = x,y = np.squeeze(losses),name = "costs",line = dict(color = "red",width = 1),mode = "lines+markers")
174 |
175 | data = [trace_rewards,trace_winnings,trace_losses]
176 | fig = {"data":data}
177 | return fig
178 |
179 |
180 |
181 |
182 |
183 | def compute_reward(self,activity,cooling):
184 |
185 | # CALCULATING THE WINNINGS
186 | win = activity
187 |
188 | # CALCULATING THE LOSSES
189 | if cooling >= activity:
190 | cost = (0 if self.cost_factor < 1.0 else 1)*(cooling)**np.sqrt(self.cost_factor)
191 | failure = 0
192 | else:
193 | difference = (activity-cooling)/(cooling+1)
194 | default_probability = np.tanh(difference)
195 | if np.random.rand() > default_probability or self.risk_factor < 1.0:
196 | cost = 0
197 | else:
198 | cost = np.random.normal(loc = self.risk_factor,scale = 0.4) * 150
199 |
200 | # cost += (cooling * min(1,self.cost_factor))**2
201 | cost += (0 if self.cost_factor < 1.0 else (1-1/(self.cost_factor+0.1)))*(cooling)
202 |
203 | failure = cost
204 |
205 | return win,cost,failure
206 |
207 |
208 |
209 |
210 |
211 |
212 | def compute_daily_rewards(self):
213 | winnings = []
214 | losses = []
215 | rewards = []
216 | failures = []
217 | for i in range(24):
218 | activity = self.activity[0][i]
219 | cooling = self.cooling[0][i]
220 | win,loss,failure = self.compute_reward(activity,cooling)
221 | winnings.append(win)
222 | losses.append(loss)
223 | rewards.append(win-loss)
224 | failures.append(failure)
225 |
226 | return np.array(rewards),np.array(winnings),np.array(losses),np.array(failures)
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 | def step(self,cooling_action):
235 |
236 | # Convert cooling_action to cooling_value
237 | cooling = self.action_space[cooling_action]
238 |
239 | # Update the cooling
240 | self.cooling[0][self.hour] = cooling
241 |
242 | activity = self.activity[0][self.hour]
243 | win,loss,failure = self.compute_reward(activity,cooling)
244 | reward = win-loss
245 |
246 | self.hour += 1
247 |
248 | if int(self.hour) == 24:
249 | new_state = self.reset_state()
250 | done = True
251 | else:
252 | new_activity = self.activity[0][self.hour]
253 | new_state = self.convert_activity_to_state(new_activity)
254 | done = False
255 |
256 |
257 | return new_state,reward,done
258 |
259 |
260 |
261 |
262 |
263 |
264 |
--------------------------------------------------------------------------------
/rl/memory.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 |
15 |
16 |
17 | from collections import deque
18 |
19 |
20 |
21 |
22 | class Memory(object):
23 | def __init__(self,max_memory = 2000):
24 | self.cache = deque(maxlen=max_memory)
25 |
26 | def save(self,args):
27 | self.cache.append(args)
28 |
29 | def empty_cache(self):
30 | self.__init__()
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/rl/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """--------------------------------------------------------------------
6 | REINFORCEMENT LEARNING
7 |
8 | Started on the 25/08/2017
9 |
10 | theo.alves.da.costa@gmail.com
11 | https://github.com/theolvs
12 | ------------------------------------------------------------------------
13 | """
14 |
15 |
16 |
17 | import os
18 | import matplotlib.pyplot as plt
19 | import pandas as pd
20 | import numpy as np
21 | import sys
22 | import random
23 | import time
24 | import random
25 | import numpy as np
26 | import pylab
27 |
28 |
29 |
30 | def plot_average_running_rewards(rewards,save = None):
31 | average_running_rewards = np.cumsum(rewards)/np.array(range(1,len(rewards)+1))
32 | figure = plt.figure(figsize = (15,4))
33 | plt.plot(average_running_rewards)
34 |
35 | if save is None:
36 | plt.show()
37 | else:
38 | plt.savefig(save)
39 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------