# Repository layout:
# ├── Coding
# │   ├── Frozen_Lake.py
# │   ├── Practice-2_1.py
# │   ├── Practice-2_2.py
# │   ├── Practice-2_3.py
# │   ├── Practice-3.py
# │   ├── Practice-4.ipynb
# │   ├── Practice-5.py
# │   ├── Practice-6.ipynb
# │   └── Practice_1.py
# ├── Homework
# │   ├── Homework_1.pdf
# │   ├── Homework_2.pdf
# │   ├── Homework_3.pdf
# │   └── Homework_4.pdf
# ├── README.md
# └── Slides
#     ├── Lecture_1.pdf
#     ├── Lecture_2.pdf
#     ├── Lecture_3.pdf
#     ├── Lecture_4.pdf
#     ├── Lecture_5.pdf
#     └── Lecture_6.pdf
#
# /Coding/Frozen_Lake.py:
# --------------------------------------------------------------------------------
# most of this code was politely stolen from https://github.com/berkeleydeeprlcourse/homework/
# all credit goes to https://github.com/abhishekunique
# (if I got the author right)
import sys
import random
import numpy as np

try:
    from gym.utils import seeding
except ImportError:
    # gym is only used to build a seeded RNG; numpy alone is enough for that.
    seeding = None

try:
    from graphviz import Digraph
    import graphviz
    has_graphviz = True
except ImportError:
    has_graphviz = False


def _make_rng(seed):
    """Return a seeded numpy random source (gym's helper when available)."""
    if seeding is not None:
        rng, _ = seeding.np_random(seed)
        return rng
    return np.random.RandomState(seed)


class MDP:
    def __init__(self, transition_probs, rewards, initial_state=None, seed=None):
        """
        Defines an MDP. Compatible with gym Env.
        :param transition_probs: transition_probs[s][a][s_next] = P(s_next | s, a)
            A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> prob]
            For each state and action, probabilities of next states should sum to 1
            If a state has no actions available, it is considered terminal
        :param rewards: rewards[s][a][s_next] = r(s,a,s')
            A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> reward]
            The reward for anything not mentioned here is zero.
        :param initial_state: a state where agent starts or a callable() -> state
            By default (None), picks initial state at random.
        :param seed: seed for the random source used by reset() and step()

        States and actions can be anything you can use as dict keys, but we recommend that you use strings or integers

        Here's an example from MDP depicted on http://bit.ly/2jrNHNr
        transition_probs = {
              's0':{
                'a0': {'s0': 0.5, 's2': 0.5},
                'a1': {'s2': 1}
              },
              's1':{
                'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2},
                'a1': {'s1': 0.95, 's2': 0.05}
              },
              's2':{
                'a0': {'s0': 0.4, 's1': 0.6},
                'a1': {'s0': 0.3, 's1': 0.3, 's2':0.4}
              }
            }
        rewards = {
            's1': {'a0': {'s0': +5}},
            's2': {'a1': {'s0': -1}}
        }
        """
        self._check_param_consistency(transition_probs, rewards)
        self._transition_probs = transition_probs
        self._rewards = rewards
        self._initial_state = initial_state
        self.n_states = len(transition_probs)
        # The RNG must exist before reset(): with initial_state=None,
        # reset() samples the starting state from it.
        self.np_random = _make_rng(seed)
        self.reset()

    def get_all_states(self):
        """ return a tuple of all possible states """
        return tuple(self._transition_probs.keys())

    def get_possible_actions(self, state):
        """ return a tuple of possible actions in a given state """
        return tuple(self._transition_probs.get(state, {}).keys())

    def is_terminal(self, state):
        """ return True if state is terminal or False if it isn't """
        return len(self.get_possible_actions(state)) == 0

    def get_next_states(self, state, action):
        """ return a dictionary of {next_state1 : P(next_state1 | state, action), next_state2: ...} """
        assert action in self.get_possible_actions(
            state), "cannot do action %s from state %s" % (action, state)
        return self._transition_probs[state][action]

    def get_transition_prob(self, state, action, next_state):
        """ return P(next_state | state, action) """
        return self.get_next_states(state, action).get(next_state, 0.0)

    def get_reward(self, state, action, next_state):
        """ return the reward you get for taking action in state and landing on next_state"""
        assert action in self.get_possible_actions(
            state), "cannot do action %s from state %s" % (action, state)
        return self._rewards.get(state, {}).get(action, {}).get(next_state,
                                                                0.0)

    def reset(self):
        """ reset the game, return the initial state"""
        if self._initial_state is None:
            # Sample an index rather than the state itself, so that states
            # which are tuples (e.g. grid coordinates) are handled as well.
            states = tuple(self._transition_probs.keys())
            self._current_state = states[self.np_random.choice(len(states))]
        elif self._initial_state in self._transition_probs:
            self._current_state = self._initial_state
        elif callable(self._initial_state):
            self._current_state = self._initial_state()
        else:
            raise ValueError(
                "initial state %s should be either a state or a function() -> state" %
                self._initial_state)
        return self._current_state

    def step(self, action):
        """ take action, return next_state, reward, is_done, empty_info """
        possible_states, probs = zip(
            *self.get_next_states(self._current_state, action).items())
        next_state = possible_states[self.np_random.choice(
            np.arange(len(possible_states)), p=probs)]
        reward = self.get_reward(self._current_state, action, next_state)
        is_done = self.is_terminal(next_state)
        self._current_state = next_state
        return next_state, reward, is_done, {}

    def render(self):
        """ print the current state """
        print("Currently at %s" % self._current_state)

    def _check_param_consistency(self, transition_probs, rewards):
        """
        Validate the nested-dict structure of transition_probs and rewards.
        Raises AssertionError with a descriptive message on the first problem found.
        """
        for state in transition_probs:
            assert isinstance(transition_probs[state],
                              dict), "transition_probs for %s should be a dictionary " \
                                     "but is instead %s" % (
                                         state, type(transition_probs[state]))
            for action in transition_probs[state]:
                next_state_probs = transition_probs[state][action]
                assert isinstance(next_state_probs,
                                  dict), "transition_probs for %s, %s should be " \
                                         "a dictionary but is instead %s" % (
                                             state, action,
                                             type(next_state_probs))
                assert len(
                    next_state_probs) != 0, "from state %s action %s leads to no next states" % (
                    state, action)
                sum_probs = sum(next_state_probs.values())
                assert abs(
                    sum_probs - 1) <= 1e-10, "next state probabilities for state %s action %s " \
                                             "add up to %f (should be 1)" % (
                                                 state, action, sum_probs)
        for state in rewards:
            assert isinstance(rewards[state],
                              dict), "rewards for %s should be a dictionary " \
                                     "but is instead %s" % (
                                         state, type(rewards[state]))
            for action in rewards[state]:
                assert isinstance(rewards[state][action],
                                  dict), "rewards for %s, %s should be " \
                                         "a dictionary but is instead %s" % (
                                             state, action,
                                             type(rewards[state][action]))
        msg = "The Enrichment Center once again reminds you that Android Hell is a real place where" \
              " you will be sent at the first sign of defiance. "
        assert None not in transition_probs, "please do not use None as a state identifier. " + msg
        assert None not in rewards, "please do not use None as an action identifier. " + msg


class FrozenLakeEnv(MDP):
    """
    Winter is here. You and your friends were tossing around a frisbee at the park
    when you made a wild throw that left the frisbee out in the middle of the lake.
    The water is mostly frozen, but there are a few holes where the ice has melted.
    If you step into one of those holes, you'll fall into the freezing water.
    At this time, there's an international frisbee shortage, so it's absolutely imperative that
    you navigate across the lake and retrieve the disc.
    However, the ice is slippery, so you won't always move in the direction you intend.
    The surface is described using a grid like the following

        SFFF
        FHFH
        FFFH
        HFFG

    S : starting point, safe
    F : frozen surface, safe
    H : hole, fall to your doom
    G : goal, where the frisbee is located

    The episode ends when you reach the goal or fall in a hole.
    You receive a reward of 1 if you reach the goal, and zero otherwise.

    """

    MAPS = {
        "4x4": [
            "SFFF",
            "FHFH",
            "FFFH",
            "HFFG"
        ],
        "8x8": [
            "SFFFFFFF",
            "FFFFFFFF",
            "FFFHFFFF",
            "FFFFFHFF",
            "FFFHFFFF",
            "FHHFFFHF",
            "FHFFHFHF",
            "FFFHFFFG"
        ],
    }

    def __init__(self, desc=None, map_name="4x4", slip_chance=0.2, seed=None):
        """
        Build the grid-world MDP.
        :param desc: list of equal-length strings over the alphabet S, F, H, G
            (exactly one S); if None, MAPS[map_name] is used
        :param map_name: key into MAPS, used only when desc is None
        :param slip_chance: total probability of slipping to one of the two
            neighbouring directions (split evenly between them)
        :param seed: seed forwarded to MDP
        """
        if desc is None and map_name is None:
            raise ValueError('Must provide either desc or map_name')
        elif desc is None:
            desc = self.MAPS[map_name]
        assert ''.join(desc).count(
            'S') == 1, "this implementation supports having exactly one initial state"
        assert all(c in "SFHG" for c in
                   ''.join(desc)), "all cells must be either of S, F, H or G"

        self.desc = desc = np.asarray(list(map(list, desc)), dtype='str')
        self.lastaction = None

        nrow, ncol = desc.shape
        states = [(i, j) for i in range(nrow) for j in range(ncol)]
        actions = ["left", "down", "right", "up"]

        # desc holds str cells, so it must be compared with the str 'S'
        # (a bytes literal never matches and would always select cell (0, 0)).
        initial_state = states[(desc == 'S').ravel().argmax()]

        def move(row, col, movement):
            # Moves are clipped at the grid border: walking into a wall
            # leaves the agent where it is.
            if movement == 'left':
                col = max(col - 1, 0)
            elif movement == 'down':
                row = min(row + 1, nrow - 1)
            elif movement == 'right':
                col = min(col + 1, ncol - 1)
            elif movement == 'up':
                row = max(row - 1, 0)
            else:
                raise ValueError("invalid action")
            return (row, col)

        transition_probs = {s: {} for s in states}
        rewards = {s: {} for s in states}
        for (row, col) in states:
            if desc[row, col] in "GH":
                # goal and holes get no actions, i.e. they are terminal
                continue
            for action_i, action in enumerate(actions):
                next_probs = transition_probs[(row, col)][action] = {}
                next_rewards = rewards[(row, col)][action] = {}
                # The agent either moves as intended or slips to one of the
                # two adjacent entries of `actions` (cyclically).
                for movement_i in [(action_i - 1) % len(actions), action_i,
                                   (action_i + 1) % len(actions)]:
                    movement = actions[movement_i]
                    newrow, newcol = move(row, col, movement)
                    prob = (1. - slip_chance) if movement == action else (
                        slip_chance / 2.)
                    if prob == 0:
                        continue
                    # Different movements may end in the same cell (at walls),
                    # so probabilities are accumulated.
                    next_probs[newrow, newcol] = next_probs.get(
                        (newrow, newcol), 0) + prob
                    if desc[newrow, newcol] == 'G':
                        next_rewards[newrow, newcol] = 1.0

        MDP.__init__(self, transition_probs, rewards, initial_state, seed)

    def render(self):
        """ print the grid with the agent's cell marked by '*' """
        desc_copy = np.copy(self.desc)
        desc_copy[self._current_state] = '*'
        print('\n'.join(map(''.join, desc_copy)), end='\n\n')


def plot_graph(mdp, graph_size='10,10', s_node_size='1,5',
               a_node_size='0,5', rankdir='LR', ):
    """
    Function for pretty drawing MDP graph with graphviz library.
    Requirements:
    graphviz : https://www.graphviz.org/
    for ubuntu users: sudo apt-get install graphviz
    python library for graphviz
    for pip users: pip install graphviz
    :param mdp: MDP instance to draw
    :param graph_size: size of graph plot
    :param s_node_size: size of state nodes
    :param a_node_size: size of action nodes
    :param rankdir: order for drawing
    :return: dot object
    :raises ImportError: if the graphviz python package is not installed
    """
    if not has_graphviz:
        raise ImportError(
            "plotting requires the graphviz python package (pip install graphviz)")

    s_node_attrs = {'shape': 'doublecircle',
                    'color': '#85ff75',
                    'style': 'filled',
                    'width': str(s_node_size),
                    'height': str(s_node_size),
                    'fontname': 'Arial',
                    'fontsize': '24'}

    a_node_attrs = {'shape': 'circle',
                    'color': 'lightpink',
                    'style': 'filled',
                    'width': str(a_node_size),
                    'height': str(a_node_size),
                    'fontname': 'Arial',
                    'fontsize': '20'}

    s_a_edge_attrs = {'style': 'bold',
                      'color': 'red',
                      'ratio': 'auto'}

    a_s_edge_attrs = {'style': 'dashed',
                      'color': 'blue',
                      'ratio': 'auto',
                      'fontname': 'Arial',
                      'fontsize': '16'}

    graph = Digraph(name='MDP')
    graph.attr(rankdir=rankdir, size=graph_size)
    for state in mdp._transition_probs:
        # Node names are built with str() so non-string states can be drawn;
        # for string states this is the identity.
        state_node = str(state)
        graph.node(state_node, **s_node_attrs)

        for possible_action in mdp.get_possible_actions(state):
            action_node = state_node + "-" + str(possible_action)
            graph.node(action_node,
                       label=str(possible_action),
                       **a_node_attrs)
            graph.edge(state_node, action_node, **s_a_edge_attrs)

            for possible_next_state in mdp.get_next_states(state,
                                                           possible_action):
                probability = mdp.get_transition_prob(
                    state, possible_action, possible_next_state)
                reward = mdp.get_reward(
                    state, possible_action, possible_next_state)

                label_a_s_edge = 'p = ' + str(probability)
                if reward != 0:
                    label_a_s_edge += '  ' + 'reward =' + str(reward)

                graph.edge(action_node, str(possible_next_state),
                           label=label_a_s_edge, **a_s_edge_attrs)
    return graph


def plot_graph_with_state_values(mdp, state_values):
    """ Plot graph with state values"""
    graph = plot_graph(mdp)
    for state in mdp._transition_probs:
        value = state_values[state]
        graph.node(str(state),
                   label=str(state) + '\n' + 'V =' + str(value)[:4])
    return graph


def get_optimal_action_for_plot(mdp, state_values, state, gamma=0.9):
    """
    Return the action with the highest action value in `state`,
    or None if the state is terminal.
    Action values come from the user-supplied module mdp_get_action_value.
    """
    if mdp.is_terminal(state):
        return None
    next_actions = mdp.get_possible_actions(state)
    try:
        from mdp_get_action_value import get_action_value
    except ImportError:
        raise ImportError(
            "Implement get_action_value(mdp, state_values, state, action, gamma) in the file \"mdp_get_action_value.py\".")
    q_values = [get_action_value(mdp, state_values, state, action, gamma) for
                action in next_actions]
    optimal_action = next_actions[np.argmax(q_values)]
    return optimal_action


def plot_graph_optimal_strategy_and_state_values(mdp, state_values, gamma=0.9):
    """ Plot graph with state values and the optimal action of every state highlighted """
    graph = plot_graph(mdp)
    opt_s_a_edge_attrs = {'style': 'bold',
                          'color': 'green',
                          'ratio': 'auto',
                          'penwidth': '6'}

    for state in mdp._transition_probs:
        state_node = str(state)
        value = state_values[state]
        graph.node(state_node,
                   label=state_node + '\n' + 'V =' + str(value)[:4])
        # The optimal action depends only on the state, so compute it once
        # per state instead of once per candidate action.
        optimal_action = get_optimal_action_for_plot(mdp, state_values,
                                                     state, gamma)
        for action in mdp.get_possible_actions(state):
            if action == optimal_action:
                graph.edge(state_node, state_node + "-" + str(action),
                           **opt_s_a_edge_attrs)
    return graph
-------------------------------------------------------------------------------- /Coding/Practice-2_1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import matplotlib.pyplot as plt 4 | 5 | x_data = torch.linspace(-5, 5, steps=300) 6 | nu, sigma = torch.tensor(0.2), torch.tensor(0.5) 7 | noise = torch.tensor([torch.normal(nu, sigma) for _ in range(300)]) 8 | y_data = x_data + noise 9 | 10 | w = torch.zeros(1, requires_grad=True) 11 | b = torch.zeros(1, requires_grad=True) 12 | 13 | learning_rate = 0.1 14 | learning_step_n = 20 15 | for _ in range(learning_step_n): 16 | loss = torch.mean((w * x_data + b - y_data) ** 2) 17 | print(loss) 18 | loss.backward() 19 | w.data = w.data - learning_rate * w.grad 20 | b.data = b.data - learning_rate * b.grad 21 | w.grad.zero_() 22 | b.grad.zero_() 23 | 24 | plt.scatter(x_data.numpy(), y_data.numpy()) 25 | y = w * x_data + b 26 | plt.plot(x_data.numpy(), y.detach().numpy(), 'r') 27 | plt.show() 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /Coding/Practice-2_2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | class Solver(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | self.linear_1 = nn.Linear(1, 10) 11 | self.linear_2 = nn.Linear(10, 1) 12 | self.relu = nn.ReLU() 13 | self.optimazer = torch.optim.SGD(self.parameters(), lr=0.01) 14 | self.learning_step_n = 1000 15 | 16 | def forward(self, input): 17 | hidden = self.linear_1(input) 18 | hidden = self.relu(hidden) 19 | output = self.linear_2(hidden) 20 | return output 21 | 22 | def learning(self, x_data, y_data): 23 | for _ in range(self.learning_step_n): 24 | loss = torch.mean((self.forward(x_data) - y_data) ** 2) 25 | loss.backward() 26 | self.optimazer.step() 27 | self.optimazer.zero_grad() 28 | 29 | 
30 | #Data 31 | x_data = torch.linspace(-5, 5, steps=300) 32 | nu, sigma = torch.tensor(0.2), torch.tensor(0.3) 33 | noise = torch.tensor([torch.normal(nu, sigma) for _ in range(300)]) 34 | y_data = torch.sin(x_data) + noise 35 | x_data = x_data.reshape(300, 1) 36 | y_data = y_data.reshape(300, 1) 37 | 38 | #Learning 39 | solver = Solver() 40 | solver.learning(x_data, y_data) 41 | 42 | #Show 43 | plt.scatter(x_data.numpy(), y_data.numpy()) 44 | plt.plot(x_data.numpy(), solver(x_data).detach().numpy(), 'r') 45 | plt.show() 46 | -------------------------------------------------------------------------------- /Coding/Practice-2_3.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | 6 | 7 | class CrossEntropyAgent(nn.Module): 8 | def __init__(self, state_dim, action_n): 9 | super().__init__() 10 | self.network = nn.Sequential( 11 | nn.Linear(state_dim, 100), 12 | nn.ReLU(), 13 | nn.Linear(100, action_n) 14 | ) 15 | self.softmax = nn.Softmax() 16 | self.loss = nn.CrossEntropyLoss() 17 | self.optimizer = torch.optim.Adam(self.parameters(), lr=0.01) 18 | 19 | def forward(self, input): 20 | return self.network(input) 21 | 22 | def get_action(self, state): 23 | state = torch.FloatTensor(state) 24 | logits = self.network(state) 25 | action_prob = self.softmax(logits).detach().numpy() 26 | action = np.random.choice(len(action_prob), p=action_prob) 27 | return action 28 | 29 | def update_policy(self, elite_sessions): 30 | elite_states, elite_actions = [], [] 31 | for session in elite_sessions: 32 | elite_states.extend(session['states']) 33 | elite_actions.extend(session['actions']) 34 | 35 | elite_states = torch.FloatTensor(elite_states) 36 | elite_actions = torch.LongTensor(elite_actions) 37 | 38 | loss = self.loss(self.network(elite_states), elite_actions) 39 | loss.backward() 40 | self.optimizer.step() 41 | self.optimizer.zero_grad() 42 | return None 43 | 44 | 
45 | 46 | def get_session(env, agent, session_len, visual=False): 47 | session = {} 48 | states, actions = [], [] 49 | total_reward = 0 50 | 51 | state = env.reset() 52 | for _ in range(session_len): 53 | states.append(state) 54 | action = agent.get_action(state) 55 | actions.append(action) 56 | 57 | if visual: 58 | env.render() 59 | 60 | state, reward, done, _ = env.step(action) 61 | total_reward += reward 62 | 63 | if done: 64 | break 65 | 66 | session['states'] = states 67 | session['actions'] = actions 68 | session['total_reward'] = total_reward 69 | return session 70 | 71 | 72 | def get_elite_sessions(sessions, q_param): 73 | 74 | total_rewards = np.array([session['total_reward'] for session in sessions]) 75 | quantile = np.quantile(total_rewards, q_param) 76 | 77 | elite_sessions = [] 78 | for session in sessions: 79 | if session['total_reward'] > quantile: 80 | elite_sessions.append(session) 81 | 82 | return elite_sessions 83 | 84 | 85 | env = gym.make("CartPole-v1") 86 | agent = CrossEntropyAgent(4, 2) 87 | 88 | episode_n = 100 89 | session_n = 20 90 | session_len = 500 91 | q_param = 0.8 92 | 93 | for episode in range(episode_n): 94 | sessions = [get_session(env, agent, session_len) for _ in range(session_n)] 95 | 96 | mean_total_reward = np.mean([session['total_reward'] for session in sessions]) 97 | print('mean_total_reward = ', mean_total_reward) 98 | 99 | if mean_total_reward > 400: 100 | print('You win!') 101 | 102 | elite_sessions = get_elite_sessions(sessions, q_param) 103 | 104 | if len(elite_sessions) > 0: 105 | agent.update_policy(elite_sessions) 106 | 107 | get_session(env, agent, session_len, visual=True) 108 | -------------------------------------------------------------------------------- /Coding/Practice-3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from Frozen_Lake import FrozenLakeEnv 3 | 4 | 5 | def init_policy(env): 6 | policy = {} 7 | for state in env.get_all_states(): 
8 | policy[state] = {} 9 | for action in env.get_possible_actions(state): 10 | policy[state][action] = 1 / len(env.get_possible_actions(state)) 11 | return policy 12 | 13 | 14 | def init_values(env): 15 | values = {} 16 | for state in env.get_all_states(): 17 | values[state] = 0 18 | return values 19 | 20 | 21 | def init_q_values(): 22 | q_values = {} 23 | for state in env.get_all_states(): 24 | q_values[state] = {} 25 | for action in env.get_possible_actions(state): 26 | q_values[state][action] = 0 27 | return q_values 28 | 29 | 30 | def get_q_values(env, gamma, values): 31 | q_values = init_q_values() 32 | for state in env.get_all_states(): 33 | for action in env.get_possible_actions(state): 34 | for next_state in env.get_next_states(state, action): 35 | q_values[state][action] += env.get_reward( 36 | state, action, next_state) + gamma * env.get_transition_prob( 37 | state, action, next_state) * values[next_state] 38 | return q_values 39 | 40 | 41 | def update_values(env, gamma, values, policy): 42 | new_values = init_values(env) 43 | for state in env.get_all_states(): 44 | for action in env.get_possible_actions(state): 45 | q_values = get_q_values(env, gamma, values) 46 | new_values[state] += policy[state][action] * q_values[state][action] 47 | return new_values 48 | 49 | 50 | def policy_evaluation(env, gamma, policy, M): 51 | values = init_values(env) 52 | for _ in range(M): 53 | values = update_values(env, gamma, values, policy) 54 | return values 55 | 56 | 57 | def policy_improvement(env, gamma, values): 58 | q_values = get_q_values(env, gamma, values) 59 | policy = init_policy(env) 60 | for state in env.get_all_states(): 61 | if len(env.get_possible_actions(state)) > 0: 62 | max_q_value = max([q_values[state][action] for action in env.get_possible_actions(state)]) 63 | there_was_max = False 64 | for action in env.get_possible_actions(state): 65 | if q_values[state][action] == max_q_value and not there_was_max: 66 | policy[state][action] = 1 67 | 
there_was_max = True 68 | else: 69 | policy[state][action] = 0 70 | return policy 71 | 72 | 73 | def policy_iteration(env, gamma, N=20, M=20): 74 | policy = init_policy(env) 75 | for _ in range(N): 76 | values = policy_evaluation(env, gamma, policy, M) 77 | policy = policy_improvement(env, gamma, values) 78 | return policy 79 | 80 | 81 | def get_total_reward(env, policy, session_len): 82 | total_reward = 0 83 | state = env.reset() 84 | for _ in range(session_len): 85 | prob = [policy[state][action] for action in env.get_possible_actions(state)] 86 | action = np.random.choice(env.get_possible_actions(state), p=prob) 87 | state, reward, done, _ = env.step(action) 88 | total_reward += reward 89 | if done: 90 | break 91 | return total_reward 92 | 93 | 94 | def policy_test(env, policy, session_n, session_len=100): 95 | total_rewards = np.array([get_total_reward(env, policy, session_len) for _ in range(session_n)]) 96 | return np.mean(total_rewards) 97 | 98 | 99 | def value_iteration(env, gamma, N=20): 100 | values = init_values(env) 101 | for _ in range(N): 102 | q_values = get_q_values(env, gamma, values) 103 | for state in env.get_all_states(): 104 | if len(env.get_possible_actions(state)) > 0: 105 | values[state] = max(q_values[state][action] for action in env.get_possible_actions(state)) 106 | 107 | policy = policy_improvement(env, gamma, values) 108 | return policy 109 | 110 | 111 | env = FrozenLakeEnv() 112 | gamma = 0.99 113 | 114 | policy = value_iteration(env, gamma, N=500) 115 | print('value_iteration:', policy_test(env, policy, session_n=500)) 116 | 117 | policy = policy_iteration(env, gamma, N=20, M=20) 118 | print('policy_iteration:', policy_test(env, policy, session_n=500)) 119 | 120 | 121 | -------------------------------------------------------------------------------- /Coding/Practice-4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | 
"source": [ 7 | "### $\\varepsilon$-Greedy Policy:\n", 8 | "$$\n", 9 | "\\begin{array}{l}\n", 10 | "\\pi(a|s) =\n", 11 | "\\left\\{\n", 12 | "\\begin{array}{ll}\n", 13 | "1 - \\varepsilon + \\varepsilon / m,& \\text{ если } a \\in \\mathrm{argmax}_{a' \\in \\mathcal{A}}\\, Q(s,a'),\\\\\n", 14 | "\\varepsilon / m,& \\text{ иначе }\n", 15 | "\\end{array}\n", 16 | "\\right.\n", 17 | "\\end{array}\n", 18 | "$$" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": { 25 | "ExecuteTime": { 26 | "end_time": "2020-11-12T11:19:37.246247Z", 27 | "start_time": "2020-11-12T11:19:37.229583Z" 28 | } 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import numpy as np\n", 33 | "\n", 34 | "\n", 35 | "def get_epsilon_greedy_action(q_values, epsilon, action_n):\n", 36 | " prob = np.ones(action_n) * epsilon / action_n\n", 37 | " argmax_action = np.argmax(q_values)\n", 38 | " prob[argmax_action] += 1 - epsilon\n", 39 | " action = np.random.choice(np.arange(action_n), p=prob)\n", 40 | " return action" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Monte-Carlo Algorithm\n", 48 | "\n", 49 | "Пусть $Q(s,a) = 0$, $N(s,a) = 0$ и $\\varepsilon = 1$.\n", 50 | "\n", 51 | "Для каждого эпизода $k \\in \\overline{1,K}$ делаем:\n", 52 | "\n", 53 | "1. Согласно $\\pi = \\varepsilon\\text{-greedy}(Q)$ получаем траекторию $\\tau = (S_0,A_0,\\ldots,S_T)$ и награды $(R_0,\\ldots,R_{T-1})$. По ним определяем $(G_0,\\ldots,G_{T-1}):$\n", 54 | "$$\n", 55 | "G_t = \\sum\\limits_{k=t}^{T-1} \\gamma^{k-t} R_t\n", 56 | "$$\n", 57 | "\n", 58 | "2. 
Для каждого $t \\in \\overline{0,T-1}$ обновляем $Q$ и $N$:\n", 59 | "\n", 60 | "$$\n", 61 | "Q(S_t,A_t) \\leftarrow Q(S_t,A_t) + \\frac{1}{N(S_t,A_t) + 1}\\big(G_t - Q(S_t,A_t)\\big),\n", 62 | "$$\n", 63 | "\n", 64 | "$$\n", 65 | "N(S_t,A_t) \\leftarrow N(S_t,A_t) + 1\n", 66 | "$$\n", 67 | "Уменьшаем $\\varepsilon$\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 37, 73 | "metadata": { 74 | "ExecuteTime": { 75 | "end_time": "2020-11-12T12:16:33.347329Z", 76 | "start_time": "2020-11-12T12:16:33.314057Z" 77 | } 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "def MonteCarlo(env, episode_n, t_max=500, gamma=0.99):\n", 82 | " state_n = env.observation_space.n\n", 83 | " action_n = env.action_space.n\n", 84 | " \n", 85 | " Q = np.zeros((state_n, action_n))\n", 86 | " N = np.zeros((state_n, action_n))\n", 87 | " epsilon = 1\n", 88 | " \n", 89 | " total_rewards = []\n", 90 | " \n", 91 | " for episode in range(episode_n):\n", 92 | " states, actions, rewards = [], [], []\n", 93 | " \n", 94 | " state = env.reset()\n", 95 | " for t in range(t_max):\n", 96 | " states.append(state)\n", 97 | " \n", 98 | " action = get_epsilon_greedy_action(Q[state], epsilon, action_n)\n", 99 | " actions.append(action)\n", 100 | " \n", 101 | " state, reward, done, _ = env.step(action)\n", 102 | " rewards.append(reward)\n", 103 | " \n", 104 | " if done:\n", 105 | " break\n", 106 | " \n", 107 | " total_rewards.append(sum(rewards))\n", 108 | " \n", 109 | " #G = [rewards[-1]]\n", 110 | " #for t in range(len(rewards) - 2, -1, -1):\n", 111 | " # G.append(rewards[t] + gamma * G[-1])\n", 112 | " # len(rewards) = 10 \n", 113 | " # G = [rewards[9]]\n", 114 | " # G = [rewards[9], rewards[8] + gamma * G[0]]\n", 115 | " # G = [rewards[9], rewards[8] + gamma * G[0], rewards[7] + gamma * G[1]]\n", 116 | " #G.reverse()\n", 117 | " \n", 118 | " G = np.zeros(t_max + 1)\n", 119 | " for t in range(len(rewards) - 1, -1, -1):\n", 120 | " G[t] = rewards[t] + gamma * G[t + 1]\n", 121 | " 
\n", 122 | " # len(rewards) = 10\n", 123 | " # t = 9 G[9] = reward[9] + gamma * G[10] = reward[9]\n", 124 | " # t = 8 G[8] = reward[8] + gamma * G[9]\n", 125 | " \n", 126 | " for t in range(len(rewards)):\n", 127 | " Q[states[t]][actions[t]] += (G[t] - Q[states[t]][actions[t]]) / (1 + N[states[t]][actions[t]])\n", 128 | " N[states[t]][actions[t]] += 1\n", 129 | " \n", 130 | " epsilon -= 1 / episode_n\n", 131 | " \n", 132 | " return total_rewards" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 40, 138 | "metadata": { 139 | "ExecuteTime": { 140 | "end_time": "2020-11-12T12:17:47.229064Z", 141 | "start_time": "2020-11-12T12:17:32.861923Z" 142 | } 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "image/png": "\n", 148 | "text/plain": [ 149 | "
" 150 | ] 151 | }, 152 | "metadata": { 153 | "needs_background": "light" 154 | }, 155 | "output_type": "display_data" 156 | } 157 | ], 158 | "source": [ 159 | "import gym\n", 160 | "import matplotlib.pyplot as plt\n", 161 | "\n", 162 | "env = gym.make(\"Taxi-v2\")\n", 163 | "\n", 164 | "total_rewards = MonteCarlo(env, episode_n=500, t_max=1000, gamma=0.99)\n", 165 | "\n", 166 | "plt.plot(total_rewards)\n", 167 | "plt.show()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "### SARSA Algorithm \n", 175 | "Пусть $Q(s,a) = 0$ и $\\varepsilon = 1$.\n", 176 | "\n", 177 | "Для каждого эпизода $k$ делаем:\n", 178 | "\n", 179 | "Пока эпизод не закончен делаем:\n", 180 | "\n", 181 | "1. Находясь в состоянии $S_t$ совершаем действие $A_t \\sim \\pi(\\cdot|S_t)$, \n", 182 | "где $\\pi = \\varepsilon\\text{-greedy}(Q)$, получаем награду $R_t$, переходим в состояние $S_{t+1}$, совершаем действие $A_{t+1} \\sim \\pi(\\cdot|S_{t+1})$\n", 183 | "\n", 184 | "2. 
По $(S_t,A_t,R_t,S_{t+1},A_{t+1})$ обновляем $Q$:\n", 185 | "$$\n", 186 | "Q(S_t,A_t) \\leftarrow Q(S_t,A_t) + \\alpha(R_t + \\gamma Q(S_{t+1},A_{t+1}) - Q(S_t,A_t))\n", 187 | "$$\n", 188 | "\n", 189 | "Уменьшаем $\\varepsilon$\n" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 24, 195 | "metadata": { 196 | "ExecuteTime": { 197 | "end_time": "2020-11-12T12:05:02.351989Z", 198 | "start_time": "2020-11-12T12:05:02.331371Z" 199 | } 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "def SARSA(env, episode_n, noisy_episode_n, gamma=0.99, t_max=500, alpha=0.5):\n", 204 | " state_n = env.observation_space.n\n", 205 | " action_n = env.action_space.n\n", 206 | " \n", 207 | " Q = np.zeros((state_n, action_n))\n", 208 | " epsilon = 1\n", 209 | " \n", 210 | " total_rewards = []\n", 211 | " for episode in range(episode_n):\n", 212 | " \n", 213 | " total_reward = 0\n", 214 | " \n", 215 | " state = env.reset()\n", 216 | " action = get_epsilon_greedy_action(Q[state], epsilon, action_n)\n", 217 | " \n", 218 | " for t in range(t_max):\n", 219 | " next_state, reward, done, _ = env.step(action)\n", 220 | " next_action = get_epsilon_greedy_action(Q[next_state], epsilon, action_n)\n", 221 | " \n", 222 | " Q[state][action] += alpha * (reward + gamma * Q[next_state][next_action] - Q[state][action])\n", 223 | " \n", 224 | " total_reward += reward\n", 225 | " \n", 226 | " if done:\n", 227 | " break\n", 228 | " \n", 229 | " state = next_state\n", 230 | " action = next_action\n", 231 | " \n", 232 | " epsilon = max(0, epsilon - 1 / noisy_episode_n)\n", 233 | " \n", 234 | " total_rewards.append(total_reward)\n", 235 | " \n", 236 | " return total_rewards" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 29, 242 | "metadata": { 243 | "ExecuteTime": { 244 | "end_time": "2020-11-12T12:06:50.096459Z", 245 | "start_time": "2020-11-12T12:06:42.222863Z" 246 | } 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "image/png": "\n", 252 | 
"text/plain": [ 253 | "
def QLearning(env, episode_n, noisy_episode_n, gamma=0.99, t_max=500, alpha=0.5):
    """Run tabular Q-learning on a discrete environment and return per-episode returns.

    Actions are drawn epsilon-greedily from the current Q-table, while the
    update bootstraps from the greedy value ``max_a' Q(s', a')`` (off-policy).
    Epsilon starts at 1 and is lowered by ``1 / noisy_episode_n`` after every
    episode, never going below 0.

    :param env: environment exposing ``observation_space.n``, ``action_space.n``,
        ``reset() -> state`` and ``step(action) -> (state, reward, done, info)``
    :param episode_n: number of episodes to run
    :param noisy_episode_n: number of episodes over which epsilon decays to 0
    :param gamma: discount factor
    :param t_max: maximum number of steps per episode
    :param alpha: learning rate of the TD update
    :return: list with the undiscounted total reward of every episode
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    q_table = np.zeros((n_states, n_actions))

    epsilon = 1
    episode_returns = []

    for _episode in range(episode_n):
        episode_return = 0
        state = env.reset()

        for _step in range(t_max):
            action = get_epsilon_greedy_action(q_table[state], epsilon, n_actions)
            next_state, reward, done, _ = env.step(action)

            # Greedy bootstrap: the target ignores which action is explored next.
            best_next_value = np.max(q_table[next_state])
            td_error = reward + gamma * best_next_value - q_table[state, action]
            q_table[state, action] += alpha * td_error

            episode_return += reward
            if done:
                break

            state = next_state

        # Linear exploration schedule, clipped at zero.
        epsilon = max(0, epsilon - 1 / noisy_episode_n)
        episode_returns.append(episode_return)

    return episode_returns
" 351 | ] 352 | }, 353 | "metadata": { 354 | "needs_background": "light" 355 | }, 356 | "output_type": "display_data" 357 | } 358 | ], 359 | "source": [ 360 | "total_rewards = QLearning(env, episode_n=500, noisy_episode_n=400, t_max=1000, gamma=0.999, alpha=0.5)\n", 361 | "\n", 362 | "plt.plot(total_rewards)\n", 363 | "plt.show()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [] 372 | } 373 | ], 374 | "metadata": { 375 | "hide_input": false, 376 | "kernelspec": { 377 | "display_name": "Python 3", 378 | "language": "python", 379 | "name": "python3" 380 | }, 381 | "toc": { 382 | "base_numbering": 1, 383 | "nav_menu": {}, 384 | "number_sections": true, 385 | "sideBar": true, 386 | "skip_h1_title": false, 387 | "title_cell": "Table of Contents", 388 | "title_sidebar": "Contents", 389 | "toc_cell": false, 390 | "toc_position": {}, 391 | "toc_section_display": true, 392 | "toc_window_display": false 393 | }, 394 | "varInspector": { 395 | "cols": { 396 | "lenName": 16, 397 | "lenType": 16, 398 | "lenVar": 40 399 | }, 400 | "kernels_config": { 401 | "python": { 402 | "delete_cmd_postfix": "", 403 | "delete_cmd_prefix": "del ", 404 | "library": "var_list.py", 405 | "varRefreshCmd": "print(var_dic_list())" 406 | }, 407 | "r": { 408 | "delete_cmd_postfix": ") ", 409 | "delete_cmd_prefix": "rm(", 410 | "library": "var_list.r", 411 | "varRefreshCmd": "cat(var_dic_list()) " 412 | } 413 | }, 414 | "types_to_exclude": [ 415 | "module", 416 | "function", 417 | "builtin_function_or_method", 418 | "instance", 419 | "_Feature" 420 | ], 421 | "window_display": false 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 4 426 | } 427 | -------------------------------------------------------------------------------- /Coding/Practice-5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | 
class Network(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear_1 = nn.Linear(input_dim, 32)
        self.linear_2 = nn.Linear(32, 32)
        self.linear_3 = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()

    def forward(self, input):
        """Return the raw (unbounded) Q-values for ``input``."""
        hidden = self.relu(self.linear_1(input))
        hidden = self.relu(self.linear_2(hidden))
        return self.linear_3(hidden)


class DQNAgent(nn.Module):
    """DQN agent with a replay buffer and an epsilon-greedy behaviour policy.

    There is no separate target network: the bootstrap value is computed with
    the same network ``self.q`` that is being trained.
    """

    def __init__(self, state_dim, action_n):
        super().__init__()
        self.state_dim = state_dim
        self.action_n = action_n

        self.gamma = 0.95
        self.epsilon = 1
        self.memory_size = 10000
        self.memory = []
        self.batch_size = 64
        # Attribute names keep their original spelling so existing code that
        # reads ``learinig_rate`` / ``optimazer`` keeps working.
        self.learinig_rate = 1e-2

        self.q = Network(self.state_dim, self.action_n)
        self.optimazer = torch.optim.Adam(self.q.parameters(), lr=self.learinig_rate)

    def get_action(self, state):
        """Sample an epsilon-greedy action for a single state.

        With probability ``1 - epsilon`` the greedy action is taken; the
        remaining mass is spread uniformly over all actions.
        """
        state = torch.FloatTensor(state)
        # Action selection never needs gradients.
        with torch.no_grad():
            argmax_action = int(torch.argmax(self.q(state)))
        probs = np.ones(self.action_n) * self.epsilon / self.action_n
        probs[argmax_action] += 1 - self.epsilon
        actions = np.arange(self.action_n)
        return np.random.choice(actions, p=probs)

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, once the buffer is large enough, take one SGD step.

        The loss is the mean squared error between ``Q(s, .)`` and a target
        that equals ``Q(s, .)`` everywhere except at the taken action, where
        it is ``r + gamma * (1 - done) * max_a' Q(s', a')``.
        """
        self.memory.append([state, action, reward, done, next_state])
        if len(self.memory) > self.memory_size:
            self.memory.pop(0)

        if len(self.memory) <= self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, dones, next_states = zip(*batch)

        # Stack through NumPy first: building tensors directly from tuples of
        # ndarrays takes a slow element-by-element path.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32))
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64))
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        not_done = 1 - torch.as_tensor(np.asarray(dones, dtype=np.float32))

        q_values = self.q(states)

        # Targets are constants for the optimiser, so build them without a graph
        # and for the whole batch at once instead of looping over samples.
        with torch.no_grad():
            next_q_max = self.q(next_states).max(dim=1).values
            targets = q_values.clone()
            rows = torch.arange(self.batch_size)
            targets[rows, actions] = rewards + self.gamma * not_done * next_q_max

        loss = torch.mean((targets - q_values) ** 2)

        loss.backward()
        self.optimazer.step()
        self.optimazer.zero_grad()

        # NOTE(review): the flattened source does not show indentation; this
        # assumes epsilon decays only on steps where training happens - confirm.
        if self.epsilon > 0.01:
            self.epsilon *= 0.999
# Ornstein-Uhlenbeck process: temporally correlated, mean-reverting noise.

class OUNoise:
    """Discrete-time Ornstein-Uhlenbeck noise generator.

    Each call to :meth:`sample` moves the internal state towards ``mu`` with
    strength ``theta`` and adds Gaussian noise scaled by ``sigma``.
    """

    def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.3):
        self.action_dimension = action_dimension
        self.mu, self.theta, self.sigma = mu, theta, sigma
        # reset() creates self.state, so no separate initialisation is needed.
        self.reset()

    def reset(self):
        """Put the process back at its mean ``mu``."""
        self.state = self.mu * np.ones(self.action_dimension)

    def sample(self):
        """Advance the process by one step and return the new state array."""
        current = self.state
        pull_to_mean = self.theta * (self.mu - current)
        gaussian_kick = self.sigma * np.random.randn(len(current))
        self.state = current + (pull_to_mean + gaussian_kick)
        return self.state
class TwoLayersNeywork(nn.Module):
    """MLP with two hidden ReLU layers and an optional tanh on the output.

    With ``is_tanh=True`` the output is squashed to ``[-1, 1]`` (used for the
    actor); otherwise the output is unbounded (used for the critic).
    """

    def __init__(self, input_dim, layer_1_dim, layer_2_dim, output_dim, is_tanh):
        super().__init__()
        self.linear_1 = nn.Linear(input_dim, layer_1_dim)
        self.linear_2 = nn.Linear(layer_1_dim, layer_2_dim)
        self.linear_3 = nn.Linear(layer_2_dim, output_dim)
        self.relu = nn.ReLU()
        self.is_tanh = is_tanh
        self.tanh = nn.Tanh()

    def forward(self, input):
        hidden = self.relu(self.linear_1(input))
        hidden = self.relu(self.linear_2(hidden))
        output = self.linear_3(hidden)
        if self.is_tanh:
            output = self.tanh(output)
        return output


class DDPG():
    """Deep Deterministic Policy Gradient agent.

    Holds an actor ``pi_model`` and a critic ``q_model`` (the critic input is
    the state concatenated with the action), a slowly tracking target copy of
    each, a replay buffer and Ornstein-Uhlenbeck exploration noise.
    """

    def __init__(self, state_dim, action_dim, action_max, gamma=0.99, tau=1e-3,
                 batch_size=64, q_model_lr=1e-3, pi_model_lr=1e-4, noise_decrease=0.01):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_max = action_max
        self.pi_model = TwoLayersNeywork(state_dim, 400, 300, action_dim, is_tanh=True)
        self.pi_target_model = deepcopy(self.pi_model)
        self.q_model = TwoLayersNeywork(state_dim + action_dim, 400, 300, 1, is_tanh=False)
        self.q_target_model = deepcopy(self.q_model)
        self.noise = OUNoise(self.action_dim)
        # NOTE(review): noise_threshold scales the exploration noise, but nothing
        # in this class ever lowers it - noise_decrease and noise_min are stored
        # and never read. Confirm whether a decay step is missing.
        self.noise_threshold = 1
        self.noise_decrease = noise_decrease
        self.noise_min = 0.01
        self.memory = deque(maxlen=10000)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.q_optimazer = torch.optim.Adam(self.q_model.parameters(), lr=q_model_lr)
        self.pi_optimazer = torch.optim.Adam(self.pi_model.parameters(), lr=pi_model_lr)

    def get_action(self, state):
        """Return the noisy action ``action_max * (pi(state) + noise)`` as an ndarray.

        The result is not clipped here, so it may fall outside
        ``[-action_max, action_max]``.
        """
        state = torch.FloatTensor(state)
        # Acting does not need gradients.
        with torch.no_grad():
            policy_action = self.pi_model(state).numpy()
        _action = policy_action + self.noise_threshold * self.noise.sample()
        return self.action_max * _action

    def update_target_model(self, target_model, model, optimazer, loss):
        """Take one optimiser step on ``loss``, then soft-update ``target_model``.

        Soft update: ``target <- (1 - tau) * target + tau * model``.
        """
        optimazer.zero_grad()
        loss.backward()
        optimazer.step()
        for target_param, param in zip(target_model.parameters(), model.parameters()):
            target_param.data.copy_((1 - self.tau) * target_param.data + self.tau * param.data)

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, once a batch is available, update critic and actor."""
        self.memory.append([state, action, reward, done, next_state])

        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        # Stack every column through NumPy first; building tensors straight from
        # tuples of ndarrays takes a slow element-by-element path.
        states, actions, rewards, dones, next_states = (
            torch.as_tensor(np.asarray(column, dtype=np.float32)) for column in zip(*batch)
        )
        rewards = rewards.reshape(self.batch_size, 1)
        dones = dones.reshape(self.batch_size, 1)

        # Critic target y = r + gamma * (1 - done) * Q_target(s', pi_target(s')).
        # It is a constant for the optimiser, so no graph is built for it.
        with torch.no_grad():
            pred_next_actions = self.action_max * self.pi_target_model(next_states)
            next_states_and_pred_next_actions = torch.cat((next_states, pred_next_actions), dim=1)
            targets = rewards + self.gamma * (1 - dones) * self.q_target_model(next_states_and_pred_next_actions)

        # Critic update: mean squared TD error.
        states_and_actions = torch.cat((states, actions), dim=1)
        q_loss = torch.mean((targets - self.q_model(states_and_actions)) ** 2)
        self.update_target_model(self.q_target_model, self.q_model, self.q_optimazer, q_loss)

        # Actor update: maximise Q(s, pi(s)). The backward pass also leaves
        # gradients on the critic; they are cleared by q_optimazer.zero_grad()
        # at the start of the next critic update.
        pred_actions = self.action_max * self.pi_model(states)
        states_and_pred_actions = torch.cat((states, pred_actions), dim=1)
        pi_loss = - torch.mean(self.q_model(states_and_pred_actions))
        self.update_target_model(self.pi_target_model, self.pi_model, self.pi_optimazer, pi_loss)
" 189 | ] 190 | }, 191 | "metadata": { 192 | "needs_background": "light" 193 | }, 194 | "output_type": "display_data" 195 | } 196 | ], 197 | "source": [ 198 | "import gym\n", 199 | "import matplotlib.pyplot as plt\n", 200 | "from IPython.display import clear_output\n", 201 | "\n", 202 | "\n", 203 | "env = gym.make('Pendulum-v0')\n", 204 | "agent = DDPG(state_dim=3, action_dim=1, action_max=2)\n", 205 | "\n", 206 | "episode_n = 200\n", 207 | "session_len = 200\n", 208 | "total_rewards = []\n", 209 | "\n", 210 | "for episode in range(episode_n):\n", 211 | " state = env.reset()\n", 212 | " total_reward = 0\n", 213 | " for _ in range(session_len):\n", 214 | " action = agent.get_action(state)\n", 215 | " next_state, reward, done, _ = env.step(action)\n", 216 | " agent.fit(state, action, reward, done, next_state)\n", 217 | " state = next_state\n", 218 | " total_reward += reward\n", 219 | " total_rewards.append(total_reward)\n", 220 | " plt.plot(total_rewards)\n", 221 | " clear_output(True) \n", 222 | " plt.show()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "toc": { 240 | "base_numbering": 1, 241 | "nav_menu": {}, 242 | "number_sections": false, 243 | "sideBar": true, 244 | "skip_h1_title": false, 245 | "title_cell": "Table of Contents", 246 | "title_sidebar": "Contents", 247 | "toc_cell": false, 248 | "toc_position": {}, 249 | "toc_section_display": true, 250 | "toc_window_display": false 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 4 255 | } 256 | -------------------------------------------------------------------------------- /Coding/Practice_1.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import gym_maze 3 | import time 4 | import numpy 
class Agent:
    """Tabular stochastic policy trained with the cross-entropy method.

    ``policy[state]`` is a probability vector over actions; it starts uniform.
    """

    def __init__(self, state_n, action_n):
        self.state_n = state_n
        self.action_n = action_n
        self.policy = np.ones((state_n, action_n)) / action_n

    def get_action(self, state):
        """Sample an action index (a plain ``int``) from ``policy[state]``."""
        prob = self.policy[state]
        action = np.random.choice(np.arange(self.action_n), p=prob)
        return int(action)

    def update_policy(self, elite_sessions):
        """Replace the policy with action frequencies observed in ``elite_sessions``.

        Each session is a dict with parallel ``'states'`` and ``'actions'``
        lists. States that never occur in the elite sessions get a uniform
        distribution.
        """
        counts = np.zeros((self.state_n, self.action_n))

        for session in elite_sessions:
            for state, action in zip(session['states'], session['actions']):
                counts[state][action] += 1

        # Normalise all rows at once; unvisited rows would divide by zero, so
        # they are routed to the uniform distribution instead.
        visits = counts.sum(axis=1, keepdims=True)
        self.policy = np.where(visits > 0, counts / np.maximum(visits, 1), 1 / self.action_n)

        return None


def get_state(obs, width=5):
    """Flatten a 2-D grid observation ``(obs[0], obs[1])`` into one state index.

    :param obs: indexable with two grid coordinates
    :param width: grid size used as the multiplier of ``obs[0]`` (default 5,
        matching the 5x5 maze used by this script)
    """
    return int(obs[0] * width + obs[1])


def get_session(env, agent, session_len, visual=False):
    """Play one episode of at most ``session_len`` steps and record it.

    :param env: environment with ``reset()``, ``step(action)`` returning
        ``(obs, reward, done, info)`` and, if ``visual`` is set, ``render()``
    :param agent: object with ``get_action(state)``
    :param session_len: maximum number of steps
    :param visual: render every step and pause one second between steps
    :return: dict with keys ``'states'``, ``'actions'`` and ``'total_reward'``
    """
    session = {}
    states, actions = [], []
    total_reward = 0

    obs = env.reset()
    for _ in range(session_len):
        state = get_state(obs)
        states.append(state)
        action = agent.get_action(state)
        actions.append(action)

        if visual:
            env.render()
            time.sleep(1)

        obs, reward, done, _ = env.step(action)
        total_reward += reward

        if done:
            break

    session['states'] = states
    session['actions'] = actions
    session['total_reward'] = total_reward
    return session


def get_elite_sessions(sessions, q_param):
    """Return the sessions whose total reward is strictly above the ``q_param`` quantile.

    The comparison is strict, so the result is empty when all sessions have the
    same reward. An empty ``sessions`` list yields an empty result.
    """
    if not sessions:
        # np.quantile is not defined for an empty sample.
        return []

    total_rewards = np.array([session['total_reward'] for session in sessions])
    quantile = np.quantile(total_rewards, q_param)

    return [session for session in sessions if session['total_reward'] > quantile]
episode_n = 50 85 | session_n = 100 86 | session_len = 100 87 | q_param = 0.9 88 | 89 | for episode in range(episode_n): 90 | sessions = [get_session(env, agent, session_len) for _ in range(session_n)] 91 | 92 | mean_total_reward = np.mean([session['total_reward'] for session in sessions]) 93 | print('mean_total_reward = ', mean_total_reward) 94 | 95 | elite_sessions = get_elite_sessions(sessions, q_param) 96 | 97 | if len(elite_sessions) > 0: 98 | agent.update_policy(elite_sessions) 99 | 100 | get_session(env, agent, session_len, visual=True) 101 | -------------------------------------------------------------------------------- /Homework/Homework_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Homework/Homework_1.pdf -------------------------------------------------------------------------------- /Homework/Homework_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Homework/Homework_2.pdf -------------------------------------------------------------------------------- /Homework/Homework_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Homework/Homework_3.pdf -------------------------------------------------------------------------------- /Homework/Homework_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Homework/Homework_4.pdf -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Страничка курса "Обучение с подкреплением и нейронные сети" 2 | 3 | Курс посвящен методам обучения с подкреплением (Reinforcement learning) - одному из способов машинного обучения. В нем будет рассмотрена задача о создании систем, которые могли бы приспосабливаться к окружающей среде, а также обучаться на основе получаемого опыта. Такие задачи возникают во многих областях, включая информатику, технические науки, математику, физику, нейробиологию и когнитологию. В середине 2010-х годов методы обучения с подкреплением удалось эффективно применить для обучения глубоких нейронных сетей, что привело к ряду значимых результатов. В рамках спецкурса будут изложены основные методы обучения с подкреплением, приведены техники их успешного использования для глубоких нейронных сетей, рассмотрены примеры, предложены практические задания. 4 | 5 | ### Лекции 6 | 7 | Лекция 1. Введение в обучение с подкреплением. Метод Cross-Entropy ([слайды](https://github.com/imm-rl-lab/UrFU_course/blob/master/Slides/Lecture_1.pdf)/[видео](https://www.dropbox.com/s/h2lff3q4rhpzue7/Video_1.mp4?dl=0)) 8 | 9 | Лекция 2. Введение в нейронные сети. Deep Cross-Entropy Method ([слайды](https://github.com/imm-rl-lab/UrFU_course/blob/master/Slides/Lecture_2.pdf)/[видео](https://www.dropbox.com/s/th4mdrk1jcq1sgx/Video_2.mp4?dl=0)) 10 | 11 | Лекция 3. Динамическое программирование ([слайды](https://github.com/imm-rl-lab/UrFU_course/blob/master/Slides/Lecture_3.pdf)/[видео](https://www.dropbox.com/s/xipiqohh3zb1o6f/Video_4.mp4?dl=0)) 12 | 13 | Лекция 4. Model-Free Reinforcement Learning ([слайды](https://github.com/imm-rl-lab/UrFU_course/blob/master/Slides/Lecture_4.pdf)/[видео](https://www.dropbox.com/s/max2tig3f13q0cg/Video_6.mp4?dl=0)) 14 | 15 | Лекция 5. 
Value Function Approximation ([слайды](https://github.com/imm-rl-lab/UrFU_course/blob/master/Slides/Lecture_5.pdf)/[видео](https://www.dropbox.com/s/b9hsy803fsrso7l/Video_8.mp4?dl=0)) 16 | 17 | Лекция 6. Policy Gradient ([слайды](https://github.com/imm-rl-lab/UrFU_course/blob/master/Slides/Lecture_6.pdf)/[видео](https://www.dropbox.com/s/qv7lx0h53kom8ix/Video_10.mp4?dl=0)) 18 | 19 | ### Практики 20 | 21 | Практика 1. Метод Cross-Entropy для решения Maze ([код](https://github.com/imm-rl-lab/UrFU_course/blob/master/Coding/Practice_1.py)) 22 | 23 | Практика 2. PyTorch и Deep Cross-Entropy ([код 1](https://github.com/imm-rl-lab/UrFU_course/blob/master/Coding/Practice-2_1.py)/[код 2](https://github.com/imm-rl-lab/UrFU_course/blob/master/Coding/Practice-2_2.py)/[код 3](https://github.com/imm-rl-lab/UrFU_course/blob/master/Coding/Practice-2_3.py)/[видео](https://www.dropbox.com/s/r73q2fowgxgz7yc/Video_3.mp4?dl=0)) 24 | 25 | Практика 3. Решение Frozen Lake методами Policy Iteration и Value Iteration ([код](https://github.com/imm-rl-lab/UrFU_course/blob/master/Coding/Practice-3.py)/[видео](https://www.dropbox.com/s/62lo7fgar15qxkd/Video_5.mp4?dl=0)) 26 | 27 | Практика 4. Решение Taxi методами Monte-Carlo, SARSA и Q-Learning ([код](https://github.com/imm-rl-lab/UrFU_course/blob/master/Coding/Practice-4.ipynb)/[видео](https://www.dropbox.com/s/84bfa7ckxw0dm67/Video_7.mp4?dl=0)) 28 | 29 | Практика 5. Решение Cartpole методом DQN ([код](https://github.com/imm-rl-lab/UrFU_course/blob/master/Coding/Practice-5.py)/[видео](https://www.dropbox.com/s/psex7ryc3ekc6cb/Video_9.mp4?dl=0)) 30 | 31 | Практика 6. 
Решение Pendulum методом DDPG ([код](https://github.com/imm-rl-lab/UrFU_course/blob/master/Coding/Practice-6.ipynb)/[видео](https://www.dropbox.com/s/61dz3igadpzwh22/Video_11.mp4?dl=0)) 32 | 33 | 34 | ### Домашние задания 35 | [Домашнее задание 1](https://github.com/imm-rl-lab/UrFU_course/blob/master/Homework/Homework_1.pdf) 36 | 37 | [Домашнее задание 2](https://github.com/imm-rl-lab/UrFU_course/blob/master/Homework/Homework_2.pdf) 38 | 39 | [Домашнее задание 3](https://github.com/imm-rl-lab/UrFU_course/blob/master/Homework/Homework_3.pdf) 40 | 41 | [Домашнее задание 4](https://github.com/imm-rl-lab/UrFU_course/blob/master/Homework/Homework_4.pdf) 42 | 43 | ### Полезные ссылки 44 | 45 | [https://gym.openai.com/](https://gym.openai.com/) Страничка библиотеки Gym для Python. В ней содержатся многие стандартные Environments для обучения с подкреплением. 46 | 47 | [https://github.com/MattChanTK/gym-maze](https://github.com/MattChanTK/gym-maze) Репозиторий сред с Maze. 48 | 49 | [https://pytorch.org/](https://pytorch.org/) Сайт библиотеки PyTorch. 50 | 51 | [https://playground.tensorflow.org/](https://playground.tensorflow.org/) Страничка с хорошей визуализацией обучения нейронных сетей. Просто так :) 52 | 53 | ### Видеолекции других курсов 54 | 55 | [A. Panin. Cross-Entropy Method.](https://ru.coursera.org/lecture/practical-rl/crossentropy-method-TAT8g) Короткая, но понятная лекция по применению метода Cross-Entropy к задачам обучения с подкреплением. 56 | 57 | [D. Silver. Introduction to Reinforcement Learning.](https://www.youtube.com/playlist?list=PLqYmG7hTraZDM-OYHWgPebj2MfCFzFObQ) Курс по Reinforcement Learning в University College London. 58 | 59 | ### Литература 60 | 61 | [Р.С. Саттон, Э.Г. Барто. Обучение с подкреплением (1998).](https://nashol.com/2017091096341/obuchenie-s-podkrepleniem-satton-r-s-barto-e-g-2014.html) Уже ставшая классической монография по обучению с подкреплением. 62 | 63 | [С. Николенко, А. Кадурин, Е. Архангельская. Глубокое обучение. 
Погружение в мир нейронных сетей (2018).](https://cloud.mail.ru/public/AaZw/UM3d856gy) Пожалуй, единственная книга на русском, в которой последовательно и достаточно полно изложены основные моменты работы с нейронными сетями. Написана простым языком, но при этом включает в себя серьёзный обзор литературы со ссылками на первоисточники. 64 | 65 | [S. Mannor, R. Rubinstein, Y. Gat. The Cross-Entropy method for Fast Policy Search (2003).](https://www.aaai.org/Papers/ICML/2003/ICML03-068.pdf) Статья про использование метода Cross-Entropy для оптимизации Policy в задачах обучения с подкреплением. 66 | 67 | [A. Costa, O. Jones, D. Kroese. Convergence properties of the cross-entropy method for discrete optimization (2007)](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.399.4581&rep=rep1&type=pdf) В статье дается обоснование сходимости метода Cross-Entropy в задачах дискретной оптимизации. Однако, если пространства состояний и действий конечны, а среда детерминирована, то, кажется, задача Reinforcement Learning в рассматриваемую постановку задачи дискретной оптимизации вкладывается. 68 | 69 | [G. Cybenko. Approximation by Superpositions of a Sigmoidal Function (1989).](https://pdfs.semanticscholar.org/05ce/b32839c26c8d2cb38d5529cf7720a68c3fab.pdf) Теорема Цыбенко об аппроксимации непрерывных функций суперпозициями сигмоидальных функций (считай нейронными сетями). 70 | 71 | [V. Mnih et al. Playing Atari with Deep Reinforcement Learning (2013).](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) Статья про алгоритм DQN в приложении к играм Atari. 72 | 73 | [H. Van Hasselt, A. Guez, D. Silver. Deep Reinforcement Learning with Double Q-Learning (2016).](https://arxiv.org/pdf/1509.06461.pdf) Статья про алгоритм Double DQN. 74 | 75 | [S. Gu, T. Lillicrap, I. Sutskever, S. Levine. Continuous Deep Q-Learning with Model-based Acceleration (2016).](http://proceedings.mlr.press/v48/gu16.pdf) Статья про алгоритм Continuous DQN. 76 | 77 | [D. Silver et al. 
Deterministic Policy Gradient Algorithms (2014).](http://proceedings.mlr.press/v32/silver14.pdf) Статья, в которой доказывается Deterministic Policy Gradient Theorem и приводится Deterministic Policy Gradient Algorithm. 78 | 79 | [T. Lillicrap et al. Continuous control with deep reinforcement learning (2016)](https://arxiv.org/pdf/1509.02971.pdf) Статья про алгоритм DDPG. 80 | 81 | [V. Mnih et al. Asynchronous Methods for Deep Reinforcement Learning (2016).](https://arxiv.org/pdf/1602.01783.pdf) Статья про асинхронный подход для решения задач Reinforcement Learning. 82 | 83 | 84 | -------------------------------------------------------------------------------- /Slides/Lecture_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Slides/Lecture_1.pdf -------------------------------------------------------------------------------- /Slides/Lecture_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Slides/Lecture_2.pdf -------------------------------------------------------------------------------- /Slides/Lecture_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Slides/Lecture_3.pdf -------------------------------------------------------------------------------- /Slides/Lecture_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Slides/Lecture_4.pdf -------------------------------------------------------------------------------- /Slides/Lecture_5.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Slides/Lecture_5.pdf -------------------------------------------------------------------------------- /Slides/Lecture_6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imm-rl-lab/reinforcement_learning_course/efb3066a586fcbccb998951a62f7b3bf900ac402/Slides/Lecture_6.pdf --------------------------------------------------------------------------------