├── CNN.py ├── Kaggle └── venus-volcanoes.ipynb ├── README.md ├── ReinforcementLearning ├── CombinedExperienceReplay │ ├── dqn_torch.py │ ├── main.py │ ├── memory.py │ ├── memory_solution.py │ └── plot.py ├── DeepQLearning │ ├── archive │ │ ├── dqn_keras.py │ │ ├── dqn_tf.py │ │ ├── frame_stack_test.py │ │ ├── main_keras_dqn_pong.py │ │ ├── main_tf_dqn_lunar_lander.py │ │ ├── main_torch_dqn_lunar_lander.py │ │ ├── main_torch_dqn_space_invaders.py │ │ ├── q_eval.h5 │ │ ├── q_next.h5 │ │ ├── simple_dqn_tf.py │ │ ├── simple_dqn_torch.py │ │ └── torch_deep_q_model.py │ ├── ddqn_keras.py │ ├── dueling_ddqn_tf2.py │ ├── dueling_ddqn_torch.py │ ├── dueling_dqn_keras.py │ ├── dueling_dqn_torch.py │ ├── main_keras_ddqn_lunar_lander.py │ ├── main_keras_dqn_lunar_lander.py │ ├── main_keras_dueling_dqn_lunar_lander.py │ ├── main_tf2_dqn_lunar_lander.py │ ├── main_tf2_dueling_ddqn_lunar_lander.py │ ├── main_tf_dqn_breakout.py │ ├── main_torch_dqn_lunar_lander_2020.py │ ├── main_torch_dueling_ddqn_lunar_lander.py │ ├── main_torch_dueling_dqn_lunar_lander.py │ ├── simple_dqn_keras.py │ ├── simple_dqn_tf2.py │ ├── simple_dqn_torch_2020.py │ └── utils.py ├── Fundamentals │ ├── acrobot.py │ ├── blackJack-no-es.py │ ├── blackJack-off-policy.py │ ├── cartpole_qlearning.py │ ├── doubleQLearning.py │ ├── dynamic_programming.py │ ├── gridworld.py │ ├── mountaincar.png │ ├── mountaincar.py │ ├── n_step_sarsa.py │ └── sarsa.py ├── ICM │ ├── A3C_CartPole_no_rewards.png │ ├── ICM_CartPole_no_rewards.png │ ├── actor_critic.py │ ├── icm.py │ ├── main.py │ ├── memory.py │ ├── parallel_env.py │ ├── shared_adam.py │ ├── utils.py │ └── worker.py └── PolicyGradient │ ├── A3C │ └── pytorch │ │ └── a3c.py │ ├── DDPG │ ├── pytorch │ │ └── lunar-lander │ │ │ ├── Torch-LunarLander-alpha000025-beta00025-400-300.png │ │ │ ├── ddpg_torch.py │ │ │ ├── main_torch.py │ │ │ └── utils.py │ ├── tensorflow │ │ ├── pendulum │ │ │ ├── ddpg_orig_tf.py │ │ │ ├── ddpg_tf.py │ │ │ ├── main_tf.py │ │ │ └── utils.py │ │ └── walker2d │ │ │ ├── ddpg_orig_tf.py │ │ │ ├── main_tf.py │ │ │ └── tmp │ │ │ └── ddpg_best_3 │ │ │ ├── Actor_ddpg.ckpt.data-00000-of-00001 │ │ │ ├── Actor_ddpg.ckpt.index │ │ │ ├── Actor_ddpg.ckpt.meta │ │ │ ├── Critic_ddpg.ckpt.data-00000-of-00001 │ │ │ ├── Critic_ddpg.ckpt.index │ │ │ ├── Critic_ddpg.ckpt.meta │ │ │ ├── TargetActor_ddpg.ckpt.data-00000-of-00001 │ │ │ ├── TargetActor_ddpg.ckpt.index │ │ │ ├── TargetActor_ddpg.ckpt.meta │ │ │ ├── TargetCritic_ddpg.ckpt.data-00000-of-00001 │ │ │ ├── TargetCritic_ddpg.ckpt.index │ │ │ └── TargetCritic_ddpg.ckpt.meta │ └── tensorflow2 │ │ └── pendulum │ │ ├── buffer.py │ │ ├── ddpg_tf2.py │ │ ├── main_ddpg.py │ │ ├── networks.py │ │ ├── pendulum.png │ │ └── utils.py │ ├── PPO │ ├── tf2 │ │ ├── agent.py │ │ ├── main.py │ │ ├── memory.py │ │ ├── networks.py │ │ └── utils.py │ └── torch │ │ ├── Slides.pdf │ │ ├── cartpole.png │ │ ├── main.py │ │ ├── ppo_torch.py │ │ └── utils.py │ ├── SAC │ ├── buffer.py │ ├── main_sac.py │ ├── networks.py │ ├── sac_torch.py │ ├── tf2 │ │ ├── Slides.pdf │ │ ├── buffer.py │ │ ├── main_sac.py │ │ ├── networks.py │ │ ├── plots │ │ │ └── inverted_pendulum.png │ │ ├── sac_tf2.py │ │ └── utils.py │ └── utils.py │ ├── TD3 │ ├── main.py │ ├── td3_torch.py │ ├── tf2 │ │ ├── main.py │ │ ├── plots │ │ │ └── walker_1500_games.png │ │ ├── td3_tf2.py │ │ └── utils.py │ └── utils.py │ ├── actor_critic │ ├── actor_critic_continuous.py │ ├── actor_critic_keras.py │ ├── actor_critic_replay_torch.py │ ├── cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc.png │ 
├── continuous_mountain_car_actor_critic.py │ ├── discrete_cartpole.py │ ├── main_keras_actor_critic_lunar_lander.py │ ├── main_torch_actor_critic_replay_lunar_lander.py │ ├── mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png │ ├── tensorflow2 │ │ ├── actor_critic.py │ │ ├── cartpole.png │ │ ├── main.py │ │ ├── networks.py │ │ └── utils.py │ ├── torch_actor_critic_discrete.py │ ├── torch_discrete_lunar_lander.py │ └── utils.py │ └── reinforce │ ├── main_keras_reinforce_lunar_lander.py │ ├── main_tf_reinforce_lunar_lander.py │ ├── main_tf_reinforce_space_invaders.py │ ├── main_torch_reinforce_lunar_lander.py │ ├── reinforce_cnn_tf.py │ ├── reinforce_keras.py │ ├── reinforce_tf.py │ ├── reinforce_torch.py │ ├── space-invaders-alpha001-gamma0p9-decay0p99-newGcalc.png │ ├── tensorflow2 │ ├── lunar-lander-tf2-256x256-alpha0005-2000games.png │ ├── main.py │ ├── networks.py │ └── reinforce_tf2.py │ └── utils.py ├── basic_encryption ├── caesar.py ├── common.py ├── one_time_pad.py └── vignere.py ├── cmdline.py ├── giveaway_scrubbed.py ├── giveaway_scrubbed_3-23.py ├── giveaway_scrubbed_9-22.py ├── modular_cnn.py ├── simple_cnn_mnist.py ├── simple_nn_mnist.py ├── tf_embeddings.py ├── tf_sentiment.py ├── tf_text_gen.py └── threaded.py /CNN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import struct 3 | import tensorflow as tf 4 | 5 | def load_data(): 6 | with open('train-labels.idx1-ubyte', 'rb') as labels: 7 | magic, n = struct.unpack('>II', labels.read(8)) 8 | train_labels = np.fromfile(labels, dtype=np.uint8) 9 | with open('train-images.idx3-ubyte', 'rb') as imgs: 10 | magic, num, nrows, ncols = struct.unpack('>IIII', imgs.read(16)) 11 | train_images = np.fromfile(imgs, dtype=np.uint8).reshape(num,784) 12 | with open('t10k-labels.idx1-ubyte', 'rb') as labels: 13 | magic, n = struct.unpack('>II', labels.read(8)) 14 | test_labels = np.fromfile(labels, dtype=np.uint8) 15 | with open('t10k-images.idx3-ubyte', 'rb') as imgs: 16 | magic, num, nrows, ncols = struct.unpack('>IIII', imgs.read(16)) 17 | test_images = np.fromfile(imgs, np.uint8).reshape(num,784) 18 | return train_images, train_labels, test_images, test_labels 19 | 20 | def cnn_model_fn(features, labels, mode): 21 | input_layer = tf.cast(tf.reshape(features['x'], [-1, 28, 28, 1]), tf.float16) 22 | 23 | conv1 = tf.layers.conv2d(inputs=input_layer, 24 | filters=16, 25 | kernel_size=[5,5], 26 | padding='same', 27 | activation=tf.nn.relu) 28 | 29 | pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2,2], strides=2) 30 | 31 | conv2 = tf.layers.conv2d(inputs=pool1, 32 | filters=32, 33 | kernel_size=[5,5], 34 | padding='same', 35 | activation=tf.nn.relu) 36 | 37 | pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size = [2,2], strides=2) 38 | 39 | pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 32]) 40 | 41 | dense = tf.layers.dense(inputs=pool2_flat, units=128, activation=tf.nn.relu) 42 | logits = tf.layers.dense(inputs=dense, units=10) 43 | 44 | predictions = { 45 | 'classes': tf.argmax(input=logits, axis=1), 46 | 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') 47 | } 48 | 49 | if mode == tf.estimator.ModeKeys.PREDICT: 50 | return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) 51 | 52 | onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10) 53 | 54 | loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits) 55 | 56 | if mode == tf.estimator.ModeKeys.TRAIN: 57 | optimizer = 
tf.train.GradientDescentOptimizer(learning_rate=0.001) 58 | train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) 59 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) 60 | 61 | if mode == tf.estimator.ModeKeys.EVAL: 62 | eval_metric_ops = { 'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions['classes'])} 63 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) 64 | 65 | if __name__ == '__main__': 66 | training_data, training_labels, testing_data, testing_labels = load_data() 67 | num_epochs = 10 68 | 69 | classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, 70 | model_dir='tmp/') 71 | 72 | input_fn = tf.estimator.inputs.numpy_input_fn( 73 | x={"x": training_data}, 74 | y=training_labels, 75 | batch_size=32, 76 | num_epochs=None, 77 | shuffle=True) 78 | 79 | for i in range(num_epochs): 80 | classifier.train(input_fn=input_fn, steps=1000) 81 | 82 | eval_input_fn = tf.estimator.inputs.numpy_input_fn( 83 | x={'x': testing_data}, 84 | y=testing_labels, 85 | shuffle=False) 86 | 87 | eval_results = classifier.evaluate(input_fn=eval_input_fn) 88 | print('these are the results of my evaluations') 89 | print(eval_results) 90 | 91 | pred_input_fn = tf.estimator.inputs.numpy_input_fn( 92 | x={'x': testing_data}, 93 | y=testing_labels, 94 | num_epochs=1, 95 | shuffle=False) 96 | 97 | pred_results = classifier.predict(input_fn=pred_input_fn) 98 | predicted_classes = [p['classes'] for p in pred_results] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Youtube-Code-Repository 2 | Repository for all the code from my youtube channel 3 | You can find me at https://youtube.com/MachineLearningWithPhil
4 | 5 |

## Kaggle/Venus-Volcanoes

6 | 7 | My crude implementation of a convolutional neural network to perform image classification on data gathered
8 | by the Magellan spacecraft. The data is horribly skewed, as most images do not contain a volcano.
9 | This means we'll have to do some creative data engineering for our model training.
10 | Please note that in the test set, 84.1% of the data is "no volcano", and our model returns
11 | an accuracy of around 88%, which beats the 84.1% a model that outputs straight 0s (always predicting "no volcano") would achieve.
12 | 13 | You can check out the video for this at https://youtu.be/Ki-xOKydQrY
14 | You can find the data for this project at https://www.kaggle.com/fmena14/volcanoesvenus/home 15 |
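For context on that comparison, here is a minimal sketch of computing the majority-class baseline on an imbalanced test set (the label arrays are made up for illustration, not the actual Kaggle split):

```python
import numpy as np

# Hypothetical test labels: 0 = no volcano, 1 = volcano (illustrative counts only).
y_true = np.array([0] * 841 + [1] * 159)
y_pred = np.zeros_like(y_true)             # a "model" that always predicts no volcano

baseline_acc = np.mean(y_pred == y_true)   # ~0.841, the score any useful model must beat
print(f'majority-class baseline accuracy: {baseline_acc:.3f}')
```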

## ReinforcementLearning/DeepQLearning

16 | 17 | My implementation of the Deep Q learning algorithm in PyTorch. Here we teach the algorithm to play the game of Space Invaders. I haven't had enough time to train this model yet, as it takes quite some time even on my 1080Ti / i7 7820k @ 4.4 GHz. I'll train 18 | it longer and provide a video on how well it does at a later time. 19 | 20 | The blog post explaining how Deep Q learning works can be found at http://www.neuralnet.ai/coding-a-deep-q-network-in-pytorch/
21 | The video for this code is at https://www.youtube.com/watch?v=RfNxXlO6BiA&t=2s 22 | 23 | 24 | 25 |
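For quick reference, a minimal sketch of the temporal-difference target that Deep Q learning bootstraps from (tensor values and names are illustrative, not taken from the scripts in this repo):

```python
import torch as T

gamma = 0.99
q_pred = T.tensor([1.2, 0.3, -0.5, 2.0])   # Q(s, a) for a batch of 4 sampled transitions
q_next = T.rand(4, 6)                      # target-network Q values for the next states
rewards = T.tensor([0.0, 1.0, -1.0, 0.0])
dones = T.tensor([False, False, True, False])

q_next_max = q_next.max(dim=1).values
q_next_max[dones] = 0.0                    # no bootstrapping from terminal states
q_target = rewards + gamma * q_next_max    # y = r + gamma * max_a' Q_target(s', a')
loss = T.nn.functional.mse_loss(q_pred, q_target)
```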

## CNN.py

26 | 27 | Simple implementation of a convolutional neural network in TensorFlow, version 1.5.
28 | A video tutorial on this code can be found at https://youtu.be/azFyHS0odcM
29 | It achieves an accuracy of 98% after 10 epochs of training.
30 | Requires data from http://yann.lecun.com/exdb/mnist/
31 | 32 |

## ReinforcementLearning/blackJack-no-es.py

33 | 34 | Implementation of Monte Carlo control without exploring starts in the blackjack environment from the OpenAI gym.
35 | Video tutorial on this code can be found at https://youtu.be/e8ofon3sg8E
36 | The algorithm trains for 1,000,000 games and produces a win rate of around 42%, a loss rate of 52%, and a draw rate of 6%.
37 | 38 |
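For reference, a minimal sketch of the idea: first-visit Monte Carlo estimation of Q from complete episodes, with an epsilon-greedy policy taking the place of exploring starts (the toy episode and all names are illustrative, not taken from the script):

```python
import numpy as np

gamma, eps = 1.0, 0.05
Q, returns = {}, {}

# One toy blackjack-style episode of (state, action, reward) tuples.
episode = [((15, 6, False), 1, 0.0), ((19, 6, False), 0, 1.0)]

# First-visit Monte Carlo update of Q(s, a) from the episode's returns.
G = 0.0
for t, (s, a, r) in reversed(list(enumerate(episode))):
    G = gamma * G + r
    if (s, a) not in [(x[0], x[1]) for x in episode[:t]]:   # first visit only
        returns.setdefault((s, a), []).append(G)
        Q[(s, a)] = np.mean(returns[(s, a)])

# Epsilon-greedy improvement replaces the exploring-starts assumption.
def choose_action(s, n_actions=2):
    greedy = max(range(n_actions), key=lambda a: Q.get((s, a), 0.0))
    return np.random.randint(n_actions) if np.random.random() < eps else greedy
```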

## ReinforcementLearning/blackJack-off-policy.py

39 | 40 | Implementation of off policy Monte Carlo control in the blackjack environment from the OpenAI gym.
41 | Video tutorial on this code can be found at https://youtu.be/TvO0Sa-6UVc
42 | The algorithm trains for 1,000,000 games and produces a win rate of around 29%, a loss rate of 66%, and a draw rate of 5%.
43 | 44 |
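For reference, a minimal sketch of the weighted importance-sampling update that off-policy Monte Carlo control is built around (the toy episode, table sizes, and names are illustrative, not taken from the script):

```python
import numpy as np

gamma = 1.0
Q = np.zeros((10, 2))      # toy state-action value table
C = np.zeros((10, 2))      # cumulative importance-sampling weights

# One toy episode of (state, action, reward) generated by the behaviour policy.
episode = [(3, 1, 0.0), (7, 0, 1.0)]
b_prob = 0.5               # behaviour policy: uniform random over 2 actions

G, W = 0.0, 1.0
for s, a, r in reversed(episode):
    G = gamma * G + r
    C[s, a] += W
    Q[s, a] += (W / C[s, a]) * (G - Q[s, a])   # weighted importance-sampling update
    if a != np.argmax(Q[s]):                   # target policy is greedy w.r.t. Q
        break                                  # the remaining weight would be zero
    W *= 1.0 / b_prob                          # pi(a|s) = 1 for the greedy action
```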

## ReinforcementLearning/cartpole_qlearning.py

45 | 46 | Implementation of the Q learning algorithm for the cart pole problem. The code is based on the course by the Lazy Programmer,
47 | which you can find here
48 | Video tutorial on this code can be found at https://youtu.be/ViwBAK8Hd7Q
49 | 50 |
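Independent of the cart pole discretization details, the core of the method is the one-step Q-learning update; a minimal sketch with illustrative names:

```python
import numpy as np

alpha, gamma = 0.1, 0.99
Q = np.zeros((10, 2))      # toy table: 10 discretized states, 2 actions

def q_learning_update(s, a, r, s_next, done):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    target = r + gamma * np.max(Q[s_next]) * (not done)
    Q[s, a] += alpha * (target - Q[s, a])

q_learning_update(s=3, a=1, r=1.0, s_next=4, done=False)
```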

## ReinforcementLearning/doubleQLearning.py

51 | 52 | Implementation of the double Q learning algorithm in the cart pole environment. This is based on my course on
53 | reinforcement learning, which you can find at this repo
54 | Video tutorial on this code can be found at https://youtu.be/Q99bEPStnxk
55 | 56 |
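For reference, a minimal sketch of the double Q-learning update rule (table sizes and names are illustrative, not taken from the script):

```python
import numpy as np

alpha, gamma = 0.1, 0.99
Q1, Q2 = np.zeros((10, 2)), np.zeros((10, 2))   # two independent toy value tables

def double_q_update(s, a, r, s_next, done):
    # Randomly pick which table to update; the other table evaluates the greedy
    # action, which removes the maximization bias of plain Q-learning.
    if np.random.random() < 0.5:
        a_star = np.argmax(Q1[s_next])
        target = r + gamma * Q2[s_next, a_star] * (not done)
        Q1[s, a] += alpha * (target - Q1[s, a])
    else:
        a_star = np.argmax(Q2[s_next])
        target = r + gamma * Q1[s_next, a_star] * (not done)
        Q2[s, a] += alpha * (target - Q2[s, a])

double_q_update(s=3, a=0, r=1.0, s_next=4, done=False)
```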

## ReinforcementLearning/sarsa.py

57 | 58 | Implementation of the SARSA algorithm in the cart pole environment. This is based on my course on reinforcement learning, 59 | which can be found here
60 | Video tutorial on this code can be found at https://youtu.be/P9XezMuPfLE
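For reference, a minimal sketch of the on-policy SARSA update, which differs from Q-learning by bootstrapping from the action actually taken in the next state (names are illustrative, not taken from the script):

```python
import numpy as np

alpha, gamma = 0.1, 0.99
Q = np.zeros((10, 2))      # toy table: 10 discretized states, 2 actions

def sarsa_update(s, a, r, s_next, a_next, done):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
    target = r + gamma * Q[s_next, a_next] * (not done)
    Q[s, a] += alpha * (target - Q[s, a])

sarsa_update(s=3, a=1, r=1.0, s_next=4, a_next=0, done=False)
```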
61 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/dqn_torch.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | from memory import ReplayMemory 7 | 8 | 9 | class DeepQNetwork(nn.Module): 10 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions): 11 | super(DeepQNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.n_actions = n_actions 16 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 17 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 18 | self.fc3 = nn.Linear(self.fc2_dims, self.n_actions) 19 | 20 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 21 | self.loss = nn.MSELoss() 22 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 23 | self.to(self.device) 24 | 25 | def forward(self, state): 26 | x = F.relu(self.fc1(state)) 27 | x = F.relu(self.fc2(x)) 28 | actions = self.fc3(x) 29 | 30 | return actions 31 | 32 | 33 | class Agent: 34 | def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, 35 | combined=False, max_mem_size=100000, eps_end=0.05, 36 | eps_dec=5e-4): 37 | self.gamma = gamma 38 | self.epsilon = epsilon 39 | self.eps_min = eps_end 40 | self.eps_dec = eps_dec 41 | self.lr = lr 42 | self.action_space = [i for i in range(n_actions)] 43 | self.batch_size = batch_size 44 | self.memory = ReplayMemory(input_dims, max_mem_size, 45 | batch_size, combined) 46 | self.iter_cntr = 0 47 | self.replace_target = 100 48 | 49 | self.Q_eval = DeepQNetwork(lr, n_actions=n_actions, 50 | input_dims=input_dims, 51 | fc1_dims=256, fc2_dims=256) 52 | self.Q_next = DeepQNetwork(lr, n_actions=n_actions, 53 | input_dims=input_dims, 54 | fc1_dims=256, fc2_dims=256) 55 | 56 | def choose_action(self, observation): 57 | if np.random.random() > self.epsilon: 58 | state = T.tensor([observation]).to(self.Q_eval.device) 59 | actions = self.Q_eval.forward(state) 60 | action = T.argmax(actions).item() 61 | else: 62 | action = np.random.choice(self.action_space) 63 | 64 | return action 65 | 66 | def learn(self): 67 | if not self.memory.is_sufficient(): 68 | return 69 | 70 | self.Q_eval.optimizer.zero_grad() 71 | batch_index = np.arange(self.batch_size, dtype=np.int32) 72 | states, actions, rewards, new_states, dones = \ 73 | self.memory.sample_memory() 74 | states = T.tensor(states).to(self.Q_eval.device) 75 | new_states = T.tensor(new_states).to(self.Q_eval.device) 76 | rewards = T.tensor(rewards).to(self.Q_eval.device) 77 | dones = T.tensor(dones).to(self.Q_eval.device) 78 | q_eval = self.Q_eval.forward(states)[batch_index, actions] 79 | q_next = self.Q_eval.forward(new_states) 80 | q_next[dones] = 0.0 81 | q_target = rewards + self.gamma*T.max(q_next, dim=1)[0] 82 | 83 | loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device) 84 | loss.backward() 85 | self.Q_eval.optimizer.step() 86 | 87 | self.iter_cntr += 1 88 | self.epsilon = self.epsilon - self.eps_dec \ 89 | if self.epsilon > self.eps_min else self.eps_min 90 | 91 | if self.iter_cntr % self.replace_target == 0: 92 | self.Q_next.load_state_dict(self.Q_eval.state_dict()) 93 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/main.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from dqn_torch import Agent 4 | import numpy as np 5 | 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='') 9 | parser.add_argument('-bs', type=int, default=1000) 10 | parser.add_argument('-cer', type=bool, default=False) 11 | # if you supply it, then true 12 | args = parser.parse_args() 13 | 14 | env = gym.make('LunarLander-v2') 15 | combined = args.cer 16 | buffer_size = args.bs 17 | 18 | agent = Agent(gamma=0.99, epsilon=0.1, batch_size=64, n_actions=4, 19 | eps_end=0.1, input_dims=[8], lr=0.001, 20 | max_mem_size=buffer_size, combined=combined) 21 | 22 | scores = [] 23 | n_games = 500 24 | for i in range(n_games): 25 | score = 0 26 | done = False 27 | observation = env.reset() 28 | while not done: 29 | action = agent.choose_action(observation) 30 | observation_, reward, done, info = env.step(action) 31 | score += reward 32 | agent.memory.store_transition(observation, action, reward, 33 | observation_, done) 34 | agent.learn() 35 | observation = observation_ 36 | scores.append(score) 37 | 38 | avg_score = np.mean(scores[-100:]) 39 | 40 | print('combined {} episode {} score {:.0f} avg score {:.0f} eps {:.2f}' 41 | .format(combined, i, score, avg_score, agent.epsilon)) 42 | 43 | if combined: 44 | fname = 'CER_const_eps_' + str(buffer_size) + '.npy' 45 | else: 46 | fname = 'VER_const_eps_' + str(buffer_size) + '.npy' 47 | np.save(fname, np.array(scores)) 48 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ReplayMemory: 5 | def __init__(self, input_dims, max_mem, batch_size, combined=False): 6 | pass 7 | 8 | def store_transition(self, state, action, reward, state_, terminal): 9 | pass 10 | 11 | def sample_memory(self): 12 | pass 13 | 14 | def is_sufficient(self): 15 | pass 16 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/memory_solution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ReplayMemory: 5 | def __init__(self, input_dims, max_mem, batch_size, combined=False): 6 | self.mem_size = max_mem 7 | self.batch_size = batch_size 8 | self.mem_cntr = 0 9 | self.combined = combined 10 | self.state_memory = np.zeros((self.mem_size, *input_dims), 11 | dtype=np.float32) 12 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 13 | dtype=np.float32) 14 | self.action_memory = np.zeros(self.mem_size, dtype=np.int32) 15 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 16 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 17 | 18 | def store_transition(self, state, action, reward, state_, terminal): 19 | index = self.mem_cntr % self.mem_size 20 | self.state_memory[index] = state 21 | self.action_memory[index] = action 22 | self.reward_memory[index] = reward 23 | self.new_state_memory[index] = state_ 24 | self.terminal_memory[index] = terminal 25 | 26 | self.mem_cntr += 1 27 | 28 | def sample_memory(self): 29 | offset = 1 if self.combined else 0 30 | max_mem = min(self.mem_cntr, self.mem_size) - offset 31 | batch = np.random.choice(max_mem, self.batch_size-offset, 32 | replace=False) 33 | states = self.state_memory[batch] 34 | new_states = 
self.new_state_memory[batch] 35 | actions = self.action_memory[batch] 36 | rewards = self.reward_memory[batch] 37 | terminals = self.terminal_memory[batch] 38 | 39 | if self.combined: 40 | index = self.mem_cntr % self.mem_size - 1 41 | last_action = self.action_memory[index] 42 | last_state = self.state_memory[index] 43 | last_new_state = self.new_state_memory[index] 44 | last_reward = self.reward_memory[index] 45 | last_terminal = self.terminal_memory[index] 46 | 47 | actions = np.append(self.action_memory[batch], last_action) 48 | states = np.vstack((self.state_memory[batch], last_state)) 49 | new_states = np.vstack((self.new_state_memory[batch], 50 | last_new_state)) 51 | rewards = np.append(self.reward_memory[batch], last_reward) 52 | terminals = np.append(self.terminal_memory[batch], last_terminal) 53 | 54 | return states, actions, rewards, new_states, terminals 55 | 56 | def is_sufficient(self): 57 | return self.mem_cntr > self.batch_size 58 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | cer_1k = np.load('CER_const_eps_1000.npy') 4 | cer_10k = np.load('CER_const_eps_10000.npy') 5 | cer_100k = np.load('CER_const_eps_100000.npy') 6 | 7 | ver_1k = np.load('VER_const_eps_1000.npy') 8 | ver_10k = np.load('VER_const_eps_10000.npy') 9 | ver_100k = np.load('VER_const_eps_100000.npy') 10 | 11 | running_cer1k_avg = np.zeros(len(cer_1k)) 12 | running_cer10k_avg = np.zeros(len(cer_10k)) 13 | running_cer100k_avg = np.zeros(len(cer_100k)) 14 | running_ver1k_avg = np.zeros(len(ver_1k)) 15 | running_ver10k_avg = np.zeros(len(ver_10k)) 16 | running_ver100k_avg = np.zeros(len(ver_100k)) 17 | 18 | for i in range(len(cer_1k)): 19 | running_cer1k_avg[i] = np.mean(cer_1k[max(0, i-100):(i+1)]) 20 | running_cer10k_avg[i] = np.mean(cer_10k[max(0, i-100):(i+1)]) 21 | running_cer100k_avg[i] = np.mean(cer_100k[max(0, i-100):(i+1)]) 22 | running_ver1k_avg[i] = np.mean(ver_1k[max(0, i-100):(i+1)]) 23 | running_ver10k_avg[i] = np.mean(ver_10k[max(0, i-100):(i+1)]) 24 | running_ver100k_avg[i] = np.mean(ver_100k[max(0, i-100):(i+1)]) 25 | 26 | 27 | x_axis = np.arange(len(cer_1k)) 28 | plt.plot(x_axis, running_cer1k_avg, 'r--', label='CER (1,000)') 29 | plt.plot(x_axis, running_ver1k_avg, 'b--', label='VER (1,000)') 30 | plt.xlabel('Episode') 31 | plt.ylabel('Avg Score') 32 | plt.legend(loc='lower right') 33 | plt.savefig('CER_vs_VER_1000_const_eps.png') 34 | plt.close() 35 | 36 | x_axis = np.arange(len(cer_10k)) 37 | plt.plot(x_axis, running_cer10k_avg, 'r--', label='CER (10,000)') 38 | plt.plot(x_axis, running_ver10k_avg, 'b--', label='VER (10,000)') 39 | plt.xlabel('Episode') 40 | plt.ylabel('Avg Score') 41 | plt.legend(loc='lower right') 42 | plt.savefig('CER_vs_VER_10000_const_eps.png') 43 | plt.close() 44 | 45 | x_axis = np.arange(len(cer_100k)) 46 | plt.plot(x_axis, running_cer100k_avg, 'r--', label='CER (100,000)') 47 | plt.plot(x_axis, running_ver100k_avg, 'b--', label='VER (100,000)') 48 | plt.xlabel('Episode') 49 | plt.ylabel('Avg Score') 50 | plt.legend(loc='lower right') 51 | plt.savefig('CER_vs_VER_100000_const_eps.png') 52 | plt.close() 53 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/frame_stack_test.py: -------------------------------------------------------------------------------- 1 | 
import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | ACTION_DICT = {0: 'NOOP', 1: 'FIRE', 2:'RIGHT', 3:'LEFT'} 6 | 7 | def preprocess(observation): 8 | observation = observation / 255 9 | return np.mean(observation[30:,:], axis=2).reshape(180,160) 10 | 11 | def stack_frames(stacked_frames, frame, stack_size, actions, action): 12 | if stacked_frames is None: 13 | stacked_frames = np.zeros((*frame.shape, stack_size)) 14 | actions = np.zeros(stack_size) 15 | for idx in range(stack_size): 16 | stacked_frames[:,:,idx] = frame 17 | else: 18 | stacked_frames[:,:,0:stack_size-1] = stacked_frames[:,:,1:] 19 | stacked_frames[:,:,stack_size-1] = frame 20 | actions[0:stack_size-1] = actions[1:] 21 | actions[stack_size-1] = action 22 | fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4) 23 | 24 | ax1.imshow(stacked_frames[:,:,0]) 25 | ax1.set_title(ACTION_DICT[actions[0]]) 26 | ax2.imshow(stacked_frames[:,:,1]) 27 | ax2.set_title(ACTION_DICT[actions[1]]) 28 | ax3.imshow(stacked_frames[:,:,2]) 29 | ax3.set_title(ACTION_DICT[actions[2]]) 30 | ax4.imshow(stacked_frames[:,:,3]) 31 | ax4.set_title(ACTION_DICT[actions[3]]) 32 | plt.show() 33 | 34 | return actions, stacked_frames 35 | 36 | if __name__ == '__main__': 37 | env = gym.make('Breakout-v0') 38 | stack_size = 4 39 | 40 | for i in range(10): 41 | done = False 42 | observation = env.reset() 43 | observation = preprocess(observation) 44 | stacked_frames = None 45 | actions=None 46 | actions, stacked_frames = stack_frames(stacked_frames, observation, 47 | stack_size, actions, 0) 48 | while not done: 49 | action = env.action_space.sample() 50 | observation_, reward, done, info = env.step(action) 51 | actions, stacked_frames_ = stack_frames(stacked_frames, 52 | preprocess(observation_), stack_size, 53 | actions, action) 54 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/main_keras_dqn_pong.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from dqn_keras import Agent 3 | from utils import plotLearning, make_env 4 | 5 | if __name__ == '__main__': 6 | env = make_env('PongNoFrameskip-v4') 7 | 8 | num_games = 500 9 | load_checkpoint = False 10 | best_score = -21 11 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0001, 12 | input_dims=(4,80,80), n_actions=6, mem_size=25000, 13 | eps_min=0.02, batch_size=32, replace=1000, eps_dec=1e-5) 14 | 15 | if load_checkpoint: 16 | agent.load_models() 17 | 18 | filename = 'PongNoFrameskip-v4.png' 19 | 20 | scores, eps_history = [], [] 21 | n_steps = 0 22 | 23 | for i in range(num_games): 24 | done = False 25 | observation = env.reset() 26 | score = 0 27 | while not done: 28 | action = agent.choose_action(observation) 29 | observation_, reward, done, info = env.step(action) 30 | n_steps += 1 31 | score += reward 32 | if not load_checkpoint: 33 | agent.store_transition(observation, action, 34 | reward, observation_, int(done)) 35 | agent.learn() 36 | else: 37 | env.render() 38 | observation = observation_ 39 | 40 | scores.append(score) 41 | 42 | avg_score = np.mean(scores[-100:]) 43 | print('episode: ', i,'score: ', score, 44 | ' average score %.3f' % avg_score, 45 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 46 | if avg_score > best_score: 47 | agent.save_models() 48 | print('avg score %.2f better than best score %.2f, saving model' % ( 49 | avg_score, best_score)) 50 | best_score = avg_score 51 | 52 | eps_history.append(agent.epsilon) 53 | 54 | x = [i+1 for 
i in range(num_games)] 55 | plot_learning_curve(x, scores, eps_history, filename) 56 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/main_tf_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from simple_dqn_tf import DeepQNetwork, Agent 3 | from utils import plotLearning 4 | import numpy as np 5 | from gym import wrappers 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | if __name__ == '__main__': 10 | env = gym.make('LunarLander-v2') 11 | lr = 0.0005 12 | n_games = 500 13 | 14 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=lr, input_dims=[8], 15 | n_actions=4, mem_size=1000000, n_games=n_games, 16 | batch_size=64) 17 | 18 | #load_checkpoint = True 19 | #if load_checkpoint: 20 | # agent.load_models() 21 | 22 | alpha = 'alpha' + str(lr)#.split('.')[1] 23 | 24 | filename = '0-lunar-lander-256x256-' + alpha + '-bs64-adam-faster_decay.png' 25 | scores = [] 26 | eps_history = [] 27 | 28 | score = 0 29 | env = wrappers.Monitor(env, "tmp/lunar-lander-4", 30 | video_callable=lambda episode_id: True, force=True) 31 | 32 | for i in range(n_games): 33 | done = False 34 | if i % 10 == 0 and i > 0: 35 | avg_score = np.mean(scores[max(0, i-10):(i+1)]) 36 | print('episode: ', i,'score: ', score, 37 | ' average score %.3f' % avg_score, 38 | 'epsilon %.3f' % agent.epsilon) 39 | #agent.save_models() 40 | else: 41 | print('episode: ', i,'score: ', score) 42 | 43 | observation = env.reset() 44 | score = 0 45 | while not done: 46 | action = agent.choose_action(observation) 47 | observation_, reward, done, info = env.step(action) 48 | score += reward 49 | agent.store_transition(observation, action, 50 | reward, observation_, int(done)) 51 | observation = observation_ 52 | agent.learn() 53 | 54 | eps_history.append(agent.epsilon) 55 | scores.append(score) 56 | 57 | x = [i+1 for i in range(n_games)] 58 | plotLearning(x, scores, eps_history, filename) 59 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/main_torch_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from simple_dqn_torch import DeepQNetwork, Agent 3 | from utils import plotLearning 4 | import numpy as np 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | env = gym.make('LunarLander-v2') 9 | brain = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4, 10 | input_dims=[8], alpha=0.003) 11 | 12 | scores = [] 13 | eps_history = [] 14 | num_games = 500 15 | score = 0 16 | # uncomment the line below to record every episode. 
17 | #env = wrappers.Monitor(env, "tmp/space-invaders-1", 18 | #video_callable=lambda episode_id: True, force=True) 19 | for i in range(num_games): 20 | if i % 10 == 0 and i > 0: 21 | avg_score = np.mean(scores[max(0, i-10):(i+1)]) 22 | print('episode: ', i,'score: ', score, 23 | ' average score %.3f' % avg_score, 24 | 'epsilon %.3f' % brain.EPSILON) 25 | else: 26 | print('episode: ', i,'score: ', score) 27 | eps_history.append(brain.EPSILON) 28 | done = False 29 | observation = env.reset() 30 | score = 0 31 | while not done: 32 | action = brain.chooseAction(observation) 33 | observation_, reward, done, info = env.step(action) 34 | score += reward 35 | brain.storeTransition(observation, action, reward, observation_, 36 | done) 37 | observation = observation_ 38 | brain.learn() 39 | 40 | scores.append(score) 41 | 42 | x = [i+1 for i in range(num_games)] 43 | filename = str(num_games) + 'Games' + 'Gamma' + str(brain.GAMMA) + \ 44 | 'Alpha' + str(brain.ALPHA) + 'Memory' + \ 45 | str(brain.Q_eval.fc1_dims) + '-' + str(brain.Q_eval.fc2_dims) +'.png' 46 | plotLearning(x, scores, eps_history, filename) 47 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/main_torch_dqn_space_invaders.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from torch_deep_q_model import DeepQNetwork, Agent 3 | from utils import plotLearning 4 | import numpy as np 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | env = gym.make('SpaceInvaders-v0') 9 | brain = Agent(gamma=0.95, epsilon=1.0, 10 | alpha=0.003, maxMemorySize=5000, 11 | replace=None) 12 | while brain.memCntr < brain.memSize: 13 | observation = env.reset() 14 | done = False 15 | while not done: 16 | # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire 17 | action = env.action_space.sample() 18 | observation_, reward, done, info = env.step(action) 19 | if done and info['ale.lives'] == 0: 20 | reward = -100 21 | brain.storeTransition(np.mean(observation[15:200,30:125], axis=2), action, reward, 22 | np.mean(observation_[15:200,30:125], axis=2)) 23 | observation = observation_ 24 | print('done initializing memory') 25 | 26 | scores = [] 27 | epsHistory = [] 28 | numGames = 50 29 | batch_size=32 30 | # uncomment the line below to record every episode. 
31 | env = wrappers.Monitor(env, "tmp/space-invaders-1", video_callable=lambda episode_id: True, force=True) 32 | for i in range(numGames): 33 | print('starting game ', i+1, 'epsilon: %.4f' % brain.EPSILON) 34 | epsHistory.append(brain.EPSILON) 35 | done = False 36 | observation = env.reset() 37 | frames = [np.sum(observation[15:200,30:125], axis=2)] 38 | score = 0 39 | lastAction = 0 40 | while not done: 41 | if len(frames) == 3: 42 | action = brain.chooseAction(frames) 43 | frames = [] 44 | else: 45 | action = lastAction 46 | observation_, reward, done, info = env.step(action) 47 | score += reward 48 | frames.append(np.sum(observation_[15:200,30:125], axis=2)) 49 | if done and info['ale.lives'] == 0: 50 | reward = -100 51 | brain.storeTransition(np.mean(observation[15:200,30:125], axis=2), action, reward, 52 | np.mean(observation_[15:200,30:125], axis=2)) 53 | observation = observation_ 54 | brain.learn(batch_size) 55 | lastAction = action 56 | #env.render( 57 | scores.append(score) 58 | print('score:',score) 59 | x = [i+1 for i in range(numGames)] 60 | fileName = str(numGames) + 'Games' + 'Gamma' + str(brain.GAMMA) + \ 61 | 'Alpha' + str(brain.ALPHA) + 'Memory' + str(brain.memSize)+ '.png' 62 | plotLearning(x, scores, epsHistory, fileName) 63 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/q_eval.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/DeepQLearning/archive/q_eval.h5 -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/q_next.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/DeepQLearning/archive/q_next.h5 -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/torch_deep_q_model.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | 7 | class DeepQNetwork(nn.Module): 8 | def __init__(self, ALPHA): 9 | super(DeepQNetwork, self).__init__() 10 | #self.conv1 = nn.Conv2d(3, 32, 8, stride=4, padding=1) 11 | self.conv1 = nn.Conv2d(1, 32, 8, stride=4, padding=1) 12 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 13 | self.conv3 = nn.Conv2d(64, 128, 3) 14 | #self.fc1 = nn.Linear(128*23*16, 512) 15 | self.fc1 = nn.Linear(128*19*8, 512) 16 | self.fc2 = nn.Linear(512, 6) 17 | #self.optimizer = optim.SGD(self.parameters(), lr=self.ALPHA, momentum=0.9) 18 | self.optimizer = optim.RMSprop(self.parameters(), lr=ALPHA) 19 | self.loss = nn.MSELoss() 20 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 21 | self.to(self.device) 22 | 23 | def forward(self, observation): 24 | observation = T.Tensor(observation).to(self.device) 25 | #observation = observation.view(-1, 3, 210, 160).to(self.device) 26 | observation = observation.view(-1, 1, 185, 95) 27 | observation = F.relu(self.conv1(observation)) 28 | observation = F.relu(self.conv2(observation)) 29 | observation = F.relu(self.conv3(observation)) 30 | #observation = observation.view(-1, 128*23*16).to(self.device) 
31 | observation = observation.view(-1, 128*19*8) 32 | observation = F.relu(self.fc1(observation)) 33 | actions = self.fc2(observation) 34 | return actions 35 | 36 | class Agent(object): 37 | def __init__(self, gamma, epsilon, alpha, 38 | maxMemorySize, epsEnd=0.05, 39 | replace=10000, actionSpace=[0,1,2,3,4,5]): 40 | self.GAMMA = gamma 41 | self.EPSILON = epsilon 42 | self.EPS_END = epsEnd 43 | self.ALPHA = alpha 44 | self.actionSpace = actionSpace 45 | self.memSize = maxMemorySize 46 | self.steps = 0 47 | self.learn_step_counter = 0 48 | self.memory = [] 49 | self.memCntr = 0 50 | self.replace_target_cnt = replace 51 | self.Q_eval = DeepQNetwork(alpha) 52 | self.Q_next = DeepQNetwork(alpha) 53 | 54 | def storeTransition(self, state, action, reward, state_): 55 | if self.memCntr < self.memSize: 56 | self.memory.append([state, action, reward, state_]) 57 | else: 58 | self.memory[self.memCntr%self.memSize] = [state, action, reward, state_] 59 | self.memCntr += 1 60 | 61 | def chooseAction(self, observation): 62 | rand = np.random.random() 63 | actions = self.Q_eval.forward(observation) 64 | if rand < 1 - self.EPSILON: 65 | action = T.argmax(actions[1]).item() 66 | else: 67 | action = np.random.choice(self.actionSpace) 68 | self.steps += 1 69 | return action 70 | 71 | def learn(self, batch_size): 72 | self.Q_eval.optimizer.zero_grad() 73 | if self.replace_target_cnt is not None and \ 74 | self.learn_step_counter % self.replace_target_cnt == 0: 75 | self.Q_next.load_state_dict(self.Q_eval.state_dict()) 76 | 77 | if self.memCntr+batch_size < self.memSize: 78 | memStart = int(np.random.choice(range(self.memCntr))) 79 | else: 80 | memStart = int(np.random.choice(range(self.memSize-batch_size-1))) 81 | miniBatch=self.memory[memStart:memStart+batch_size] 82 | memory = np.array(miniBatch) 83 | 84 | # convert to list because memory is an array of numpy objects 85 | Qpred = self.Q_eval.forward(list(memory[:,0][:])).to(self.Q_eval.device) 86 | Qnext = self.Q_next.forward(list(memory[:,3][:])).to(self.Q_eval.device) 87 | 88 | maxA = T.argmax(Qnext, dim=1).to(self.Q_eval.device) 89 | rewards = T.Tensor(list(memory[:,2])).to(self.Q_eval.device) 90 | Qtarget = Qpred.clone() 91 | indices = np.arange(batch_size) 92 | Qtarget[indices,maxA] = rewards + self.GAMMA*T.max(Qnext[1]) 93 | 94 | if self.steps > 500: 95 | if self.EPSILON - 1e-4 > self.EPS_END: 96 | self.EPSILON -= 1e-4 97 | else: 98 | self.EPSILON = self.EPS_END 99 | 100 | #Qpred.requires_grad_() 101 | loss = self.Q_eval.loss(Qtarget, Qpred).to(self.Q_eval.device) 102 | loss.backward() 103 | self.Q_eval.optimizer.step() 104 | self.learn_step_counter += 1 105 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_keras_ddqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import os 2 | # for keras the CUDA commands must come before importing the keras libraries 3 | os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 4 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 5 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 6 | import gym 7 | from gym import wrappers 8 | import numpy as np 9 | from ddqn_keras import DDQNAgent 10 | from utils import plotLearning 11 | 12 | if __name__ == '__main__': 13 | env = gym.make('LunarLander-v2') 14 | ddqn_agent = DDQNAgent(alpha=0.0005, gamma=0.99, n_actions=4, epsilon=1.0, 15 | batch_size=64, input_dims=8) 16 | n_games = 500 17 | #ddqn_agent.load_model() 18 | ddqn_scores = [] 19 | eps_history = [] 20 | #env 
= wrappers.Monitor(env, "tmp/lunar-lander-ddqn-2", 21 | # video_callable=lambda episode_id: True, force=True) 22 | 23 | for i in range(n_games): 24 | done = False 25 | score = 0 26 | observation = env.reset() 27 | while not done: 28 | action = ddqn_agent.choose_action(observation) 29 | observation_, reward, done, info = env.step(action) 30 | score += reward 31 | ddqn_agent.remember(observation, action, reward, observation_, int(done)) 32 | observation = observation_ 33 | ddqn_agent.learn() 34 | eps_history.append(ddqn_agent.epsilon) 35 | 36 | ddqn_scores.append(score) 37 | 38 | avg_score = np.mean(ddqn_scores[max(0, i-100):(i+1)]) 39 | print('episode: ', i,'score: %.2f' % score, 40 | ' average score %.2f' % avg_score) 41 | 42 | if i % 10 == 0 and i > 0: 43 | ddqn_agent.save_model() 44 | 45 | filename = 'lunarlander-ddqn.png' 46 | 47 | x = [i+1 for i in range(n_games)] 48 | plotLearning(x, ddqn_scores, eps_history, filename) 49 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_keras_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | from simple_dqn_keras import Agent 2 | import numpy as np 3 | import gym 4 | from utils import plotLearning 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | env = gym.make('LunarLander-v2') 9 | lr = 0.0005 10 | n_games = 500 11 | agent = Agent(gamma=0.99, epsilon=0.0, alpha=lr, input_dims=8, 12 | n_actions=4, mem_size=1000000, batch_size=64, epsilon_end=0.0) 13 | 14 | agent.load_model() 15 | scores = [] 16 | eps_history = [] 17 | 18 | #env = wrappers.Monitor(env, "tmp/lunar-lander-6", 19 | # video_callable=lambda episode_id: True, force=True) 20 | 21 | for i in range(n_games): 22 | done = False 23 | score = 0 24 | observation = env.reset() 25 | while not done: 26 | action = agent.choose_action(observation) 27 | observation_, reward, done, info = env.step(action) 28 | score += reward 29 | agent.remember(observation, action, reward, observation_, int(done)) 30 | observation = observation_ 31 | agent.learn() 32 | 33 | eps_history.append(agent.epsilon) 34 | scores.append(score) 35 | 36 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 37 | print('episode: ', i,'score: %.2f' % score, 38 | ' average score %.2f' % avg_score) 39 | 40 | if i % 10 == 0 and i > 0: 41 | agent.save_model() 42 | 43 | filename = 'lunarlander.png' 44 | 45 | x = [i+1 for i in range(n_games)] 46 | plotLearning(x, scores, eps_history, filename) 47 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_keras_dueling_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | from dueling_dqn_keras import Agent 2 | import numpy as np 3 | import gym 4 | from utils import plotLearning 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | n_games = 400 9 | agent = Agent(gamma=0.99, epsilon=1, lr=1e-3, input_dims=[8], 10 | epsilon_dec=1e-3, mem_size=100000, batch_size=64, eps_end=0.01, 11 | fc1_dims=128, fc2_dims=128, replace=100, n_actions=4) 12 | 13 | scores, eps_history = [], [] 14 | 15 | for i in range(n_games): 16 | done = False 17 | score = 0 18 | observation = env.reset() 19 | while not done: 20 | action = agent.choose_action(observation) 21 | observation_, reward, done, info = env.step(action) 22 | score += reward 23 | agent.store_transition(observation, action, reward, observation_, done) 24 | observation = 
observation_ 25 | agent.learn() 26 | eps_history.append(agent.epsilon) 27 | scores.append(score) 28 | 29 | avg_score = np.mean(scores[-100:]) 30 | print('episode ', i, 'score %.1f' % score, 31 | 'average score %.1f' % avg_score, 32 | 'epsilon %.2f' % agent.epsilon) 33 | 34 | filename='keras_lunar_lander.png' 35 | x = [i+1 for i in range(n_games)] 36 | plotLearning(x, scores, eps_history, filename) 37 | 38 | 39 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_tf2_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | from simple_dqn_tf2 import Agent 2 | import numpy as np 3 | import gym 4 | from utils import plotLearning 5 | import tensorflow as tf 6 | 7 | if __name__ == '__main__': 8 | tf.compat.v1.disable_eager_execution() 9 | env = gym.make('LunarLander-v2') 10 | lr = 0.001 11 | n_games = 500 12 | agent = Agent(gamma=0.99, epsilon=1.0, lr=lr, 13 | input_dims=env.observation_space.shape, 14 | n_actions=env.action_space.n, mem_size=1000000, batch_size=64, 15 | epsilon_end=0.01) 16 | scores = [] 17 | eps_history = [] 18 | 19 | for i in range(n_games): 20 | done = False 21 | score = 0 22 | observation = env.reset() 23 | while not done: 24 | action = agent.choose_action(observation) 25 | observation_, reward, done, info = env.step(action) 26 | score += reward 27 | agent.store_transition(observation, action, reward, observation_, done) 28 | observation = observation_ 29 | agent.learn() 30 | eps_history.append(agent.epsilon) 31 | scores.append(score) 32 | 33 | avg_score = np.mean(scores[-100:]) 34 | print('episode: ', i, 'score %.2f' % score, 35 | 'average_score %.2f' % avg_score, 36 | 'epsilon %.2f' % agent.epsilon) 37 | 38 | filename = 'lunarlander_tf2.png' 39 | x = [i+1 for i in range(n_games)] 40 | plotLearning(x, scores, eps_history, filename) 41 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_tf2_dueling_ddqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from dueling_ddqn_tf2 import Agent 4 | from utils import plotLearning 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | agent = Agent(lr=0.0005, gamma=0.99, n_actions=4, epsilon=1.0, 9 | batch_size=64, input_dims=[8]) 10 | n_games = 500 11 | ddqn_scores = [] 12 | eps_history = [] 13 | 14 | for i in range(n_games): 15 | done = False 16 | score = 0 17 | observation = env.reset() 18 | while not done: 19 | action = agent.choose_action(observation) 20 | observation_, reward, done, info = env.step(action) 21 | score += reward 22 | agent.store_transition(observation, action, reward, observation_, done) 23 | observation = observation_ 24 | agent.learn() 25 | eps_history.append(ddqn_agent.epsilon) 26 | 27 | ddqn_scores.append(score) 28 | 29 | avg_score = np.mean(scores[-100:]) 30 | print('episode: ', i,'score: %.2f' % score, 31 | ' average score %.2f' % avg_score) 32 | 33 | filename = 'lunarlander-dueling_ddqn.png' 34 | 35 | x = [i+1 for i in range(n_games)] 36 | plotLearning(x, ddqn_scores, eps_history, filename) 37 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_tf_dqn_breakout.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | from dqn_tf import DeepQNetwork, Agent 4 | from utils 
import plotLearning 5 | import numpy as np 6 | from gym import wrappers 7 | import matplotlib.pyplot as plt 8 | 9 | def preprocess(observation): 10 | observation = observation / 255 11 | return np.mean(observation[30:,:], axis=2).reshape(180,160,1) 12 | 13 | def stack_frames(stacked_frames, frame, buffer_size): 14 | if stacked_frames is None: 15 | stacked_frames = np.zeros((buffer_size, *frame.shape)) 16 | for idx, _ in enumerate(stacked_frames): 17 | stacked_frames[idx,:] = frame 18 | else: 19 | stacked_frames[0:buffer_size-1,:] = stacked_frames[1:,:] 20 | stacked_frames[buffer_size-1, :] = frame 21 | 22 | stacked_frames = stacked_frames.reshape(1, *frame.shape[0:2], buffer_size) 23 | 24 | return stacked_frames 25 | 26 | 27 | if __name__ == '__main__': 28 | #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 29 | #os.environ["CUDA_VISIBLE_DEVICES"] = "0" 30 | #os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 31 | 32 | env = gym.make('Breakout-v0') 33 | load_checkpoint = False 34 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.000025, input_dims=(180,160,4), 35 | n_actions=3, mem_size=25000, batch_size=64) 36 | if load_checkpoint: 37 | agent.load_models() 38 | filename = 'breakout-alpha0p000025-gamma0p9-only-one-fc-2.png' 39 | scores = [] 40 | eps_history = [] 41 | numGames = 50000 42 | stack_size = 4 43 | score = 0 44 | # uncomment the line below to record every episode. 45 | #env = wrappers.Monitor(env, "tmp/breakout-0", 46 | # video_callable=lambda episode_id: True, force=True) 47 | """ 48 | print("Loading up the agent's memory with random gameplay") 49 | 50 | while agent.mem_cntr < 25000: 51 | done = False 52 | observation = env.reset() 53 | observation = preprocess(observation) 54 | stacked_frames = None 55 | observation = stack_frames(stacked_frames, observation, stack_size) 56 | while not done: 57 | action = np.random.choice([0, 1, 2]) 58 | action += 1 59 | observation_, reward, done, info = env.step(action) 60 | observation_ = stack_frames(stacked_frames, 61 | preprocess(observation_), stack_size) 62 | action -= 1 63 | agent.store_transition(observation, action, 64 | reward, observation_, int(done)) 65 | observation = observation_ 66 | print("Done with random gameplay. 
Game on.") 67 | """ 68 | n_steps = 0 69 | for i in range(numGames): 70 | done = False 71 | #if i % 100 == 0 and i > 0: 72 | # x = [j+1 for j in range(i)] 73 | 74 | # plotLearning(x, scores, eps_history, filename) 75 | observation = env.reset() 76 | observation = preprocess(observation) 77 | stacked_frames = None 78 | observation = stack_frames(stacked_frames, observation, stack_size) 79 | score = 0 80 | while not done: 81 | action = agent.choose_action(observation) 82 | action += 1 83 | observation_, reward, done, info = env.step(action) 84 | n_steps += 1 85 | observation_ = stack_frames(stacked_frames, 86 | preprocess(observation_), stack_size) 87 | score += reward 88 | action -= 1 89 | agent.store_transition(observation, action, 90 | reward, observation_, int(done)) 91 | observation = observation_ 92 | if n_steps % 4 == 0: 93 | agent.learn() 94 | if i % 12 == 0 and i > 0: 95 | avg_score = np.mean(scores[max(0, i-12):(i+1)]) 96 | print('episode: ', i,'score: ', score, 97 | ' average score %.3f' % avg_score, 98 | 'epsilon %.3f' % agent.epsilon) 99 | agent.save_models() 100 | else: 101 | print('episode: ', i,'score: ', score) 102 | eps_history.append(agent.epsilon) 103 | scores.append(score) 104 | x = [i+1 for i in range(numGames)] 105 | plotLearning(x, scores, eps_history, filename) 106 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_torch_dqn_lunar_lander_2020.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from simple_dqn_torch_2020 import Agent 3 | from utils import plotLearning 4 | import numpy as np 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4, eps_end=0.01, 9 | input_dims=[8], lr=0.001) 10 | scores, eps_history = [], [] 11 | n_games = 500 12 | 13 | for i in range(n_games): 14 | score = 0 15 | done = False 16 | observation = env.reset() 17 | while not done: 18 | action = agent.choose_action(observation) 19 | observation_, reward, done, info = env.step(action) 20 | score += reward 21 | agent.store_transition(observation, action, reward, 22 | observation_, done) 23 | agent.learn() 24 | observation = observation_ 25 | scores.append(score) 26 | eps_history.append(agent.epsilon) 27 | 28 | avg_score = np.mean(scores[-100:]) 29 | 30 | print('episode ', i, 'score %.2f' % score, 31 | 'average score %.2f' % avg_score, 32 | 'epsilon %.2f' % agent.epsilon) 33 | x = [i+1 for i in range(n_games)] 34 | filename = 'lunar_lander.png' 35 | plotLearning(x, scores, eps_history, filename) 36 | 37 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_torch_dueling_ddqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from dueling_ddqn_torch import Agent 4 | from utils import plotLearning 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | num_games = 250 9 | load_checkpoint = False 10 | 11 | agent = Agent(gamma=0.99, epsilon=1.0, lr=5e-4, 12 | input_dims=[8], n_actions=4, mem_size=100000, eps_min=0.01, 13 | batch_size=64, eps_dec=1e-3, replace=100) 14 | 15 | if load_checkpoint: 16 | agent.load_models() 17 | 18 | filename = 'LunarLander-Dueling-DDQN-512-Adam-lr0005-replace100.png' 19 | scores = [] 20 | eps_history = [] 21 | n_steps = 0 22 | 23 | for i in range(num_games): 24 | done = False 25 | 
observation = env.reset() 26 | score = 0 27 | 28 | while not done: 29 | action = agent.choose_action(observation) 30 | observation_, reward, done, info = env.step(action) 31 | score += reward 32 | agent.store_transition(observation, action, 33 | reward, observation_, int(done)) 34 | agent.learn() 35 | 36 | observation = observation_ 37 | 38 | scores.append(score) 39 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 40 | print('episode: ', i,'score %.1f ' % score, 41 | ' average score %.1f' % avg_score, 42 | 'epsilon %.2f' % agent.epsilon) 43 | if i > 0 and i % 10 == 0: 44 | agent.save_models() 45 | 46 | eps_history.append(agent.epsilon) 47 | 48 | x = [i+1 for i in range(num_games)] 49 | plotLearning(x, scores, eps_history, filename) 50 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_torch_dueling_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym, time 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from dueling_dqn_torch import Agent 5 | from utils import plotLearning 6 | 7 | if __name__ == '__main__': 8 | env = gym.make('LunarLander-v2') 9 | num_games = 1000 10 | load_checkpoint = False 11 | 12 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=5e-4, 13 | input_dims=[8], n_actions=4, mem_size=100000, eps_min=0.01, 14 | batch_size=64, eps_dec=1e-3, replace=100) 15 | 16 | if load_checkpoint: 17 | agent.load_models() 18 | 19 | filename = 'LunarLander-Dueling-128-128-Adam-lr0005-replace100.png' 20 | scores = [] 21 | eps_history = [] 22 | n_steps = 0 23 | 24 | for i in range(num_games): 25 | done = False 26 | observation = env.reset() 27 | score = 0 28 | 29 | while not done: 30 | action = agent.choose_action(observation) 31 | observation_, reward, done, info = env.step(action) 32 | n_steps += 1 33 | score += reward 34 | agent.store_transition(observation, action, 35 | reward, observation_, int(done)) 36 | agent.learn() 37 | 38 | observation = observation_ 39 | 40 | 41 | scores.append(score) 42 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 43 | print('episode: ', i,'score %.1f ' % score, 44 | ' average score %.1f' % avg_score, 45 | 'epsilon %.2f' % agent.epsilon) 46 | #if i > 0 and i % 10 == 0: 47 | # agent.save_models() 48 | 49 | eps_history.append(agent.epsilon) 50 | 51 | x = [i+1 for i in range(num_games)] 52 | plotLearning(x, scores, eps_history, filename) 53 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/simple_dqn_keras.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Dense, Activation 2 | from keras.models import Sequential, load_model 3 | from keras.optimizers import Adam 4 | import numpy as np 5 | 6 | class ReplayBuffer(object): 7 | def __init__(self, max_size, input_shape, n_actions, discrete=False): 8 | self.mem_size = max_size 9 | self.mem_cntr = 0 10 | self.discrete = discrete 11 | self.state_memory = np.zeros((self.mem_size, input_shape)) 12 | self.new_state_memory = np.zeros((self.mem_size, input_shape)) 13 | dtype = np.int8 if self.discrete else np.float32 14 | self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype) 15 | self.reward_memory = np.zeros(self.mem_size) 16 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32) 17 | 18 | def store_transition(self, state, action, reward, state_, done): 19 | index = self.mem_cntr % self.mem_size 20 | 
self.state_memory[index] = state 21 | self.new_state_memory[index] = state_ 22 | # store one hot encoding of actions, if appropriate 23 | if self.discrete: 24 | actions = np.zeros(self.action_memory.shape[1]) 25 | actions[action] = 1.0 26 | self.action_memory[index] = actions 27 | else: 28 | self.action_memory[index] = action 29 | self.reward_memory[index] = reward 30 | self.terminal_memory[index] = 1 - done 31 | self.mem_cntr += 1 32 | 33 | def sample_buffer(self, batch_size): 34 | max_mem = min(self.mem_cntr, self.mem_size) 35 | batch = np.random.choice(max_mem, batch_size) 36 | 37 | states = self.state_memory[batch] 38 | actions = self.action_memory[batch] 39 | rewards = self.reward_memory[batch] 40 | states_ = self.new_state_memory[batch] 41 | terminal = self.terminal_memory[batch] 42 | 43 | return states, actions, rewards, states_, terminal 44 | 45 | def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims): 46 | model = Sequential([ 47 | Dense(fc1_dims, input_shape=(input_dims,)), 48 | Activation('relu'), 49 | Dense(fc2_dims), 50 | Activation('relu'), 51 | Dense(n_actions)]) 52 | 53 | model.compile(optimizer=Adam(lr=lr), loss='mse') 54 | 55 | return model 56 | 57 | class Agent(object): 58 | def __init__(self, alpha, gamma, n_actions, epsilon, batch_size, 59 | input_dims, epsilon_dec=0.996, epsilon_end=0.01, 60 | mem_size=1000000, fname='dqn_model.h5'): 61 | self.action_space = [i for i in range(n_actions)] 62 | self.gamma = gamma 63 | self.epsilon = epsilon 64 | self.epsilon_dec = epsilon_dec 65 | self.epsilon_min = epsilon_end 66 | self.batch_size = batch_size 67 | self.model_file = fname 68 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions, 69 | discrete=True) 70 | self.q_eval = build_dqn(alpha, n_actions, input_dims, 256, 256) 71 | 72 | def remember(self, state, action, reward, new_state, done): 73 | self.memory.store_transition(state, action, reward, new_state, done) 74 | 75 | def choose_action(self, state): 76 | state = state[np.newaxis, :] 77 | rand = np.random.random() 78 | if rand < self.epsilon: 79 | action = np.random.choice(self.action_space) 80 | else: 81 | actions = self.q_eval.predict(state) 82 | action = np.argmax(actions) 83 | 84 | return action 85 | 86 | def learn(self): 87 | if self.memory.mem_cntr > self.batch_size: 88 | state, action, reward, new_state, done = \ 89 | self.memory.sample_buffer(self.batch_size) 90 | 91 | action_values = np.array(self.action_space, dtype=np.int8) 92 | action_indices = np.dot(action, action_values) 93 | 94 | q_eval = self.q_eval.predict(state) 95 | 96 | q_next = self.q_eval.predict(new_state) 97 | 98 | q_target = q_eval.copy() 99 | 100 | batch_index = np.arange(self.batch_size, dtype=np.int32) 101 | 102 | q_target[batch_index, action_indices] = reward + \ 103 | self.gamma*np.max(q_next, axis=1)*done 104 | 105 | _ = self.q_eval.fit(state, q_target, verbose=0) 106 | 107 | self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon > \ 108 | self.epsilon_min else self.epsilon_min 109 | 110 | def save_model(self): 111 | self.q_eval.save(self.model_file) 112 | 113 | def load_model(self): 114 | self.q_eval = load_model(self.model_file) 115 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/simple_dqn_tf2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | from tensorflow.keras.optimizers import Adam 5 | from tensorflow.keras.models import 
load_model 6 | 7 | class ReplayBuffer(): 8 | def __init__(self, max_size, input_dims): 9 | self.mem_size = max_size 10 | self.mem_cntr = 0 11 | 12 | self.state_memory = np.zeros((self.mem_size, *input_dims), 13 | dtype=np.float32) 14 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 15 | dtype=np.float32) 16 | self.action_memory = np.zeros(self.mem_size, dtype=np.int32) 17 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 18 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32) 19 | 20 | def store_transition(self, state, action, reward, state_, done): 21 | index = self.mem_cntr % self.mem_size 22 | self.state_memory[index] = state 23 | self.new_state_memory[index] = state_ 24 | self.reward_memory[index] = reward 25 | self.action_memory[index] = action 26 | self.terminal_memory[index] = 1 - int(done) 27 | self.mem_cntr += 1 28 | 29 | def sample_buffer(self, batch_size): 30 | max_mem = min(self.mem_cntr, self.mem_size) 31 | batch = np.random.choice(max_mem, batch_size, replace=False) 32 | 33 | states = self.state_memory[batch] 34 | states_ = self.new_state_memory[batch] 35 | rewards = self.reward_memory[batch] 36 | actions = self.action_memory[batch] 37 | terminal = self.terminal_memory[batch] 38 | 39 | return states, actions, rewards, states_, terminal 40 | 41 | def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims): 42 | model = keras.Sequential([ 43 | keras.layers.Dense(fc1_dims, activation='relu'), 44 | keras.layers.Dense(fc2_dims, activation='relu'), 45 | keras.layers.Dense(n_actions, activation=None)]) 46 | model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error') 47 | 48 | return model 49 | 50 | class Agent(): 51 | def __init__(self, lr, gamma, n_actions, epsilon, batch_size, 52 | input_dims, epsilon_dec=1e-3, epsilon_end=0.01, 53 | mem_size=1000000, fname='dqn_model.h5'): 54 | self.action_space = [i for i in range(n_actions)] 55 | self.gamma = gamma 56 | self.epsilon = epsilon 57 | self.eps_dec = epsilon_dec 58 | self.eps_min = epsilon_end 59 | self.batch_size = batch_size 60 | self.model_file = fname 61 | self.memory = ReplayBuffer(mem_size, input_dims) 62 | self.q_eval = build_dqn(lr, n_actions, input_dims, 256, 256) 63 | 64 | def store_transition(self, state, action, reward, new_state, done): 65 | self.memory.store_transition(state, action, reward, new_state, done) 66 | 67 | def choose_action(self, observation): 68 | if np.random.random() < self.epsilon: 69 | action = np.random.choice(self.action_space) 70 | else: 71 | state = np.array([observation]) 72 | actions = self.q_eval.predict(state) 73 | 74 | action = np.argmax(actions) 75 | 76 | return action 77 | 78 | def learn(self): 79 | if self.memory.mem_cntr < self.batch_size: 80 | return 81 | 82 | states, actions, rewards, states_, dones = \ 83 | self.memory.sample_buffer(self.batch_size) 84 | 85 | q_eval = self.q_eval.predict(states) 86 | q_next = self.q_eval.predict(states_) 87 | 88 | 89 | q_target = np.copy(q_eval) 90 | batch_index = np.arange(self.batch_size, dtype=np.int32) 91 | 92 | q_target[batch_index, actions] = rewards + \ 93 | self.gamma * np.max(q_next, axis=1)*dones 94 | 95 | 96 | self.q_eval.train_on_batch(states, q_target) 97 | 98 | self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \ 99 | self.eps_min else self.eps_min 100 | 101 | def save_model(self): 102 | self.q_eval.save(self.model_file) 103 | 104 | 105 | def load_model(self): 106 | self.q_eval = load_model(self.model_file) 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 
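# A minimal driver sketch for the Agent defined above. Illustrative only: the
# repository's actual entry point for this agent is main_tf2_dqn_lunar_lander.py,
# and the environment name, hyperparameters, and old gym 4-tuple step API used
# here are assumptions, not a definitive training script.
if __name__ == '__main__':
    import gym

    env = gym.make('LunarLander-v2')
    agent = Agent(lr=1e-3, gamma=0.99, n_actions=4, epsilon=1.0,
                  batch_size=64, input_dims=[8])
    scores = []
    for i in range(500):
        observation = env.reset()
        done, score = False, 0
        while not done:
            # choose an action, step the env, store the transition, then learn
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            observation = observation_
            agent.learn()
        scores.append(score)
        print('episode', i, 'score %.1f' % score,
              'avg(100) %.1f' % np.mean(scores[-100:]),
              'epsilon %.3f' % agent.epsilon)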
-------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/simple_dqn_torch_2020.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims, 10 | n_actions): 11 | super(DeepQNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.n_actions = n_actions 16 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 17 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 18 | self.fc3 = nn.Linear(self.fc2_dims, self.n_actions) 19 | 20 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 21 | self.loss = nn.MSELoss() 22 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 23 | self.to(self.device) 24 | 25 | def forward(self, state): 26 | x = F.relu(self.fc1(state)) 27 | x = F.relu(self.fc2(x)) 28 | actions = self.fc3(x) 29 | 30 | return actions 31 | 32 | 33 | class Agent: 34 | def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, 35 | max_mem_size=100000, eps_end=0.05, eps_dec=5e-4): 36 | self.gamma = gamma 37 | self.epsilon = epsilon 38 | self.eps_min = eps_end 39 | self.eps_dec = eps_dec 40 | self.lr = lr 41 | self.action_space = [i for i in range(n_actions)] 42 | self.mem_size = max_mem_size 43 | self.batch_size = batch_size 44 | self.mem_cntr = 0 45 | self.iter_cntr = 0 46 | self.replace_target = 100 47 | 48 | self.Q_eval = DeepQNetwork(lr, n_actions=n_actions, 49 | input_dims=input_dims, 50 | fc1_dims=256, fc2_dims=256) 51 | self.state_memory = np.zeros((self.mem_size, *input_dims), 52 | dtype=np.float32) 53 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 54 | dtype=np.float32) 55 | self.action_memory = np.zeros(self.mem_size, dtype=np.int32) 56 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 57 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 58 | 59 | def store_transition(self, state, action, reward, state_, terminal): 60 | index = self.mem_cntr % self.mem_size 61 | self.state_memory[index] = state 62 | self.new_state_memory[index] = state_ 63 | self.reward_memory[index] = reward 64 | self.action_memory[index] = action 65 | self.terminal_memory[index] = terminal 66 | 67 | self.mem_cntr += 1 68 | 69 | def choose_action(self, observation): 70 | if np.random.random() > self.epsilon: 71 | state = T.tensor([observation]).to(self.Q_eval.device) 72 | actions = self.Q_eval.forward(state) 73 | action = T.argmax(actions).item() 74 | else: 75 | action = np.random.choice(self.action_space) 76 | 77 | return action 78 | 79 | def learn(self): 80 | if self.mem_cntr < self.batch_size: 81 | return 82 | 83 | self.Q_eval.optimizer.zero_grad() 84 | 85 | max_mem = min(self.mem_cntr, self.mem_size) 86 | 87 | batch = np.random.choice(max_mem, self.batch_size, replace=False) 88 | batch_index = np.arange(self.batch_size, dtype=np.int32) 89 | 90 | state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device) 91 | new_state_batch = T.tensor( 92 | self.new_state_memory[batch]).to(self.Q_eval.device) 93 | action_batch = self.action_memory[batch] 94 | reward_batch = T.tensor( 95 | self.reward_memory[batch]).to(self.Q_eval.device) 96 | terminal_batch = T.tensor( 97 | self.terminal_memory[batch]).to(self.Q_eval.device) 98 | 99 | 
q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch] 100 | q_next = self.Q_eval.forward(new_state_batch) 101 | q_next[terminal_batch] = 0.0 102 | 103 | q_target = reward_batch + self.gamma*T.max(q_next, dim=1)[0] 104 | 105 | loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device) 106 | loss.backward() 107 | self.Q_eval.optimizer.step() 108 | 109 | self.iter_cntr += 1 110 | self.epsilon = self.epsilon - self.eps_dec \ 111 | if self.epsilon > self.eps_min else self.eps_min 112 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import gym 4 | 5 | def plotLearning(x, scores, epsilons, filename, lines=None): 6 | fig=plt.figure() 7 | ax=fig.add_subplot(111, label="1") 8 | ax2=fig.add_subplot(111, label="2", frame_on=False) 9 | 10 | ax.plot(x, epsilons, color="C0") 11 | ax.set_xlabel("Game", color="C0") 12 | ax.set_ylabel("Epsilon", color="C0") 13 | ax.tick_params(axis='x', colors="C0") 14 | ax.tick_params(axis='y', colors="C0") 15 | 16 | N = len(scores) 17 | running_avg = np.empty(N) 18 | for t in range(N): 19 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 20 | 21 | ax2.scatter(x, running_avg, color="C1") 22 | #ax2.xaxis.tick_top() 23 | ax2.axes.get_xaxis().set_visible(False) 24 | ax2.yaxis.tick_right() 25 | #ax2.set_xlabel('x label 2', color="C1") 26 | ax2.set_ylabel('Score', color="C1") 27 | #ax2.xaxis.set_label_position('top') 28 | ax2.yaxis.set_label_position('right') 29 | #ax2.tick_params(axis='x', colors="C1") 30 | ax2.tick_params(axis='y', colors="C1") 31 | 32 | if lines is not None: 33 | for line in lines: 34 | plt.axvline(x=line) 35 | 36 | plt.savefig(filename) 37 | 38 | class SkipEnv(gym.Wrapper): 39 | def __init__(self, env=None, skip=4): 40 | super(SkipEnv, self).__init__(env) 41 | self._skip = skip 42 | 43 | def step(self, action): 44 | t_reward = 0.0 45 | done = False 46 | for _ in range(self._skip): 47 | obs, reward, done, info = self.env.step(action) 48 | t_reward += reward 49 | if done: 50 | break 51 | return obs, t_reward, done, info 52 | 53 | def reset(self): 54 | self._obs_buffer = [] 55 | obs = self.env.reset() 56 | self._obs_buffer.append(obs) 57 | return obs 58 | 59 | class PreProcessFrame(gym.ObservationWrapper): 60 | def __init__(self, env=None): 61 | super(PreProcessFrame, self).__init__(env) 62 | self.observation_space = gym.spaces.Box(low=0, high=255, 63 | shape=(80,80,1), dtype=np.uint8) 64 | def observation(self, obs): 65 | return PreProcessFrame.process(obs) 66 | 67 | @staticmethod 68 | def process(frame): 69 | 70 | new_frame = np.reshape(frame, frame.shape).astype(np.float32) 71 | 72 | new_frame = 0.299*new_frame[:,:,0] + 0.587*new_frame[:,:,1] + \ 73 | 0.114*new_frame[:,:,2] 74 | 75 | new_frame = new_frame[35:195:2, ::2].reshape(80,80,1) 76 | 77 | return new_frame.astype(np.uint8) 78 | 79 | class MoveImgChannel(gym.ObservationWrapper): 80 | def __init__(self, env): 81 | super(MoveImgChannel, self).__init__(env) 82 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 83 | shape=(self.observation_space.shape[-1], 84 | self.observation_space.shape[0], 85 | self.observation_space.shape[1]), 86 | dtype=np.float32) 87 | 88 | def observation(self, observation): 89 | return np.moveaxis(observation, 2, 0) 90 | 91 | class ScaleFrame(gym.ObservationWrapper): 92 | def observation(self, obs): 93 | return 
np.array(obs).astype(np.float32) / 255.0 94 | 95 | class BufferWrapper(gym.ObservationWrapper): 96 | def __init__(self, env, n_steps): 97 | super(BufferWrapper, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(n_steps, axis=0), 100 | env.observation_space.high.repeat(n_steps, axis=0), 101 | dtype=np.float32) 102 | 103 | def reset(self): 104 | self.buffer = np.zeros_like(self.observation_space.low, dtype=np.float32) 105 | return self.observation(self.env.reset()) 106 | 107 | def observation(self, observation): 108 | self.buffer[:-1] = self.buffer[1:] 109 | self.buffer[-1] = observation 110 | return self.buffer 111 | 112 | def make_env(env_name): 113 | env = gym.make(env_name) 114 | env = SkipEnv(env) 115 | env = PreProcessFrame(env) 116 | env = MoveImgChannel(env) 117 | env = BufferWrapper(env, 4) 118 | return ScaleFrame(env) 119 | -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/acrobot.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from gym import wrappers 5 | import pickle 6 | 7 | theta_space = np.linspace(-1, 1, 10) 8 | theta_dot_space = np.linspace(-10, 10, 10) 9 | 10 | def get_state(observation): 11 | cos_theta1, sin_theta1, cos_theta2, sin_theta2, theta1_dot, theta2_dot = \ 12 | observation 13 | c_th1 = int(np.digitize(cos_theta1, theta_space)) 14 | s_th1 = int(np.digitize(sin_theta1, theta_space)) 15 | c_th2 = int(np.digitize(cos_theta2, theta_space)) 16 | s_th2 = int(np.digitize(sin_theta2, theta_space)) 17 | th1_dot = int(np.digitize(theta1_dot, theta_dot_space)) 18 | th2_dot = int(np.digitize(theta2_dot, theta_dot_space)) 19 | 20 | return (c_th1, s_th2, c_th2, s_th2, th1_dot, th2_dot) 21 | 22 | def maxAction(Q, state, actions=[0, 1, 2]): 23 | values = np.array([Q[state,a] for a in actions]) 24 | action = np.argmax(values) 25 | 26 | return action 27 | 28 | if __name__ == '__main__': 29 | env = gym.make('Acrobot-v1') 30 | n_games = 100 31 | alpha = 0.1 32 | gamma = 0.99 33 | eps = 0 34 | 35 | action_space = [0, 1, 2] 36 | 37 | states = [] 38 | for c1 in range(11): 39 | for s1 in range(11): 40 | for c2 in range(11): 41 | for s2 in range(11): 42 | for dot1 in range(11): 43 | for dot2 in range(11): 44 | states.append((c1, s1, c2, s2, dot1, dot2)) 45 | """ 46 | Q = {} 47 | for state in states: 48 | for action in action_space: 49 | Q[state, action] = 0 50 | """ 51 | pickle_in = open('acrobot.pkl', 'rb') 52 | Q = pickle.load(pickle_in) 53 | env = wrappers.Monitor(env, "tmp/acrobot", video_callable=lambda episode_id: True, force=True) 54 | eps_rewards = 0 55 | total_rewards = np.zeros(n_games) 56 | for i in range(n_games): 57 | if i % 1 == 0: 58 | print('episode ', i, 'score ', eps_rewards, 'eps', eps) 59 | observation = env.reset() 60 | state = get_state(observation) 61 | done = False 62 | action = env.action_space.sample() if np.random.random() < eps else \ 63 | maxAction(Q, state) 64 | eps_rewards = 0 65 | while not done: 66 | """ 67 | print(observation) 68 | action = env.action_space.sample() 69 | """ 70 | observation_, reward, done, info = env.step(action) 71 | state_ = get_state(observation_) 72 | action_ = maxAction(Q, state_) 73 | eps_rewards += reward 74 | Q[state, action] = Q[state,action] + \ 75 | alpha*(reward + gamma*Q[state_,action_] - Q[state,action]) 76 | state = state_ 77 | action = action_ 78 | total_rewards[i] = eps_rewards 79 | eps = 
eps - 2 / n_games if eps > 0.01 else 0.01 80 | 81 | mean_rewards = np.zeros(n_games) 82 | for t in range(n_games): 83 | mean_rewards[t] = np.mean(total_rewards[max(0, t-50):(t+1)]) 84 | plt.plot(mean_rewards) 85 | plt.show() 86 | 87 | f = open("acrobot.pkl","wb") 88 | pickle.dump(Q,f) 89 | f.close() 90 | -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/blackJack-no-es.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | if __name__ == '__main__': 6 | env = gym.make('Blackjack-v0') 7 | EPS = 0.05 8 | GAMMA = 1.0 9 | 10 | Q = {} 11 | agentSumSpace = [i for i in range(4, 22)] 12 | dealerShowCardSpace = [i+1 for i in range(10)] 13 | agentAceSpace = [False, True] 14 | actionSpace = [0, 1] # stick or hit 15 | 16 | stateSpace = [] 17 | returns = {} 18 | pairsVisited = {} 19 | for total in agentSumSpace: 20 | for card in dealerShowCardSpace: 21 | for ace in agentAceSpace: 22 | for action in actionSpace: 23 | Q[((total, card, ace), action)] = 0 24 | returns[((total, card, ace), action)] = 0 25 | pairsVisited[((total, card, ace), action)] = 0 26 | stateSpace.append((total, card, ace)) 27 | 28 | policy = {} 29 | for state in stateSpace: 30 | policy[state] = np.random.choice(actionSpace) 31 | 32 | numEpisodes = 1000000 33 | for i in range(numEpisodes): 34 | statesActionsReturns = [] 35 | memory = [] 36 | if i % 100000 == 0: 37 | print('starting episode', i) 38 | observation = env.reset() 39 | done = False 40 | while not done: 41 | action = policy[observation] 42 | observation_, reward, done, info = env.step(action) 43 | memory.append((observation[0], observation[1], observation[2], action, reward)) 44 | observation = observation_ 45 | memory.append((observation[0], observation[1], observation[2], action, reward)) 46 | 47 | G = 0 48 | last = True 49 | for playerSum, dealerCard, usableAce, action, reward in reversed(memory): 50 | if last: 51 | last = False 52 | else: 53 | statesActionsReturns.append((playerSum, dealerCard, usableAce, action, G)) 54 | G = GAMMA*G + reward 55 | 56 | statesActionsReturns.reverse() 57 | statesActionsVisited = [] 58 | 59 | for playerSum, dealerCard, usableAce, action, G in statesActionsReturns: 60 | sa = ((playerSum, dealerCard, usableAce), action) 61 | if sa not in statesActionsVisited: 62 | pairsVisited[sa] += 1 63 | # incremental implementation 64 | # new estimate = 1 / N * [sample - old estimate] 65 | returns[(sa)] += (1 / pairsVisited[(sa)])*(G-returns[(sa)]) 66 | Q[sa] = returns[sa] 67 | rand = np.random.random() 68 | if rand < 1 - EPS: 69 | state = (playerSum, dealerCard, usableAce) 70 | values = np.array([Q[(state, a)] for a in actionSpace ]) 71 | best = np.random.choice(np.where(values==values.max())[0]) 72 | policy[state] = actionSpace[best] 73 | else: 74 | policy[state] = np.random.choice(actionSpace) 75 | statesActionsVisited.append(sa) 76 | if EPS - 1e-7 > 0: 77 | EPS -= 1e-7 78 | else: 79 | EPS = 0 80 | 81 | numEpisodes = 1000 82 | rewards = np.zeros(numEpisodes) 83 | totalReward = 0 84 | wins = 0 85 | losses = 0 86 | draws = 0 87 | print('getting ready to test policy') 88 | for i in range(numEpisodes): 89 | observation = env.reset() 90 | done = False 91 | while not done: 92 | action = policy[observation] 93 | observation_, reward, done, info = env.step(action) 94 | observation = observation_ 95 | totalReward += reward 96 | rewards[i] = totalReward 97 | 98 | if reward >= 1: 99 | 
wins += 1 100 | elif reward == 0: 101 | draws += 1 102 | elif reward == -1: 103 | losses += 1 104 | 105 | wins /= numEpisodes 106 | losses /= numEpisodes 107 | draws /= numEpisodes 108 | print('win rate', wins, 'loss rate', losses, 'draw rate', draws) 109 | plt.plot(rewards) 110 | plt.show() -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/blackJack-off-policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | if __name__ == '__main__': 6 | env = gym.make('Blackjack-v0') 7 | EPS = 0.05 8 | GAMMA = 1.0 9 | 10 | agentSumSpace = [i for i in range(4, 22)] 11 | dealerShowCardSpace = [i+1 for i in range(10)] 12 | agentAceSpace = [False, True] 13 | actionSpace = [0, 1] # stick or hit 14 | stateSpace = [] 15 | 16 | Q = {} 17 | C = {} 18 | for total in agentSumSpace: 19 | for card in dealerShowCardSpace: 20 | for ace in agentAceSpace: 21 | for action in actionSpace: 22 | Q[((total, card, ace), action)] = 0 23 | C[((total, card, ace), action)] = 0 24 | stateSpace.append((total, card, ace)) 25 | 26 | targetPolicy = {} 27 | for state in stateSpace: 28 | values = np.array([Q[(state, a)] for a in actionSpace ]) 29 | best = np.random.choice(np.where(values==values.max())[0]) 30 | targetPolicy[state] = actionSpace[best] 31 | 32 | numEpisodes = 1000000 33 | for i in range(numEpisodes): 34 | memory = [] 35 | if i % 100000 == 0: 36 | print('starting episode', i) 37 | behaviorPolicy = {} 38 | for state in stateSpace: 39 | rand = np.random.random() 40 | if rand < 1 - EPS: 41 | behaviorPolicy[state] = [targetPolicy[state]] 42 | else: 43 | behaviorPolicy[state] = actionSpace 44 | observation = env.reset() 45 | done = False 46 | while not done: 47 | action = np.random.choice(behaviorPolicy[observation]) 48 | observation_, reward, done, info = env.step(action) 49 | memory.append((observation[0], observation[1], observation[2], action, reward)) 50 | observation = observation_ 51 | memory.append((observation[0], observation[1], observation[2], action, reward)) 52 | 53 | G = 0 54 | W = 1 55 | last = True 56 | for playerSum, dealerCard, usableAce, action, reward in reversed(memory): 57 | sa = ((playerSum, dealerCard, usableAce), action) 58 | if last: 59 | last = False 60 | else: 61 | C[sa] += W 62 | Q[sa] += (W / C[sa])*(G-Q[sa]) 63 | values = np.array([Q[(state, a)] for a in actionSpace ]) 64 | best = np.random.choice(np.where(values==values.max())[0]) 65 | targetPolicy[state] = actionSpace[best] 66 | if action != targetPolicy[state]: 67 | break 68 | if len(behaviorPolicy[state]) == 1: 69 | prob = 1 - EPS 70 | else: 71 | prob = EPS / len(behaviorPolicy[state]) 72 | W *= 1/prob 73 | G = GAMMA*G + reward 74 | if EPS - 1e-7 > 0: 75 | EPS -= 1e-7 76 | else: 77 | EPS = 0 78 | numEpisodes = 1000 79 | rewards = np.zeros(numEpisodes) 80 | totalReward = 0 81 | wins = 0 82 | losses = 0 83 | draws = 0 84 | print('getting ready to test target policy') 85 | for i in range(numEpisodes): 86 | observation = env.reset() 87 | done = False 88 | while not done: 89 | action = targetPolicy[observation] 90 | observation_, reward, done, info = env.step(action) 91 | observation = observation_ 92 | totalReward += reward 93 | rewards[i] = totalReward 94 | 95 | if reward >= 1: 96 | wins += 1 97 | elif reward == 0: 98 | draws += 1 99 | elif reward == -1: 100 | losses += 1 101 | 102 | wins /= numEpisodes 103 | losses /= numEpisodes 104 | draws /= numEpisodes 105 | 
print('win rate', wins, 'loss rate', losses, 'draw rate', draws) 106 | plt.plot(rewards) 107 | plt.show() -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/cartpole_qlearning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('CartPole-v0') 6 | 7 | MAXSTATES = 10**4 8 | GAMMA = 0.9 9 | ALPHA = 0.01 10 | 11 | def max_dict(d): 12 | max_v = float('-inf') 13 | for key, val in d.items(): 14 | if val > max_v: 15 | max_v = val 16 | max_key = key 17 | return max_key, max_v 18 | 19 | def create_bins(): 20 | # obs[0] -> cart position --- -4.8 - 4.8 21 | # obs[1] -> cart velocity --- -inf - inf 22 | # obs[2] -> pole angle --- -41.8 - 41.8 23 | # obs[3] -> pole velocity --- -inf - inf 24 | 25 | bins = np.zeros((4,10)) 26 | bins[0] = np.linspace(-4.8, 4.8, 10) 27 | bins[1] = np.linspace(-5, 5, 10) 28 | bins[2] = np.linspace(-.418, .418, 10) 29 | bins[3] = np.linspace(-5, 5, 10) 30 | 31 | return bins 32 | 33 | def assign_bins(observation, bins): 34 | state = np.zeros(4) 35 | for i in range(4): 36 | state[i] = np.digitize(observation[i], bins[i]) 37 | return state 38 | 39 | def get_state_as_string(state): 40 | string_state = ''.join(str(int(e)) for e in state) 41 | return string_state 42 | 43 | def get_all_states_as_string(): 44 | states = [] 45 | for i in range(MAXSTATES): 46 | states.append(str(i).zfill(4)) 47 | return states 48 | 49 | def initialize_Q(): 50 | Q = {} 51 | 52 | all_states = get_all_states_as_string() 53 | for state in all_states: 54 | Q[state] = {} 55 | for action in range(env.action_space.n): 56 | Q[state][action] = 0 57 | return Q 58 | 59 | def play_one_game(bins, Q, eps=0.5): 60 | observation = env.reset() 61 | done = False 62 | cnt = 0 # number of moves in an episode 63 | state = get_state_as_string(assign_bins(observation, bins)) 64 | total_reward = 0 65 | 66 | while not done: 67 | cnt += 1 68 | # np.random.randn() seems to yield a random action 50% of the time ? 
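        # Epsilon-greedy selection: with probability eps sample a random
        # exploratory action from the action space, otherwise act greedily
        # with respect to the current Q-value estimates for this state.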
69 | if np.random.uniform() < eps: 70 | act = env.action_space.sample() # epsilon greedy 71 | else: 72 | act = max_dict(Q[state])[0] 73 | 74 | observation, reward, done, _ = env.step(act) 75 | 76 | total_reward += reward 77 | 78 | if done and cnt < 200: 79 | reward = -300 80 | 81 | state_new = get_state_as_string(assign_bins(observation, bins)) 82 | 83 | a1, max_q_s1a1 = max_dict(Q[state_new]) 84 | Q[state][act] += ALPHA*(reward + GAMMA*max_q_s1a1 - Q[state][act]) 85 | state, act = state_new, a1 86 | 87 | return total_reward, cnt 88 | 89 | def play_many_games(bins, N=10000): 90 | Q = initialize_Q() 91 | 92 | length = [] 93 | reward = [] 94 | for n in range(N): 95 | #eps=0.5/(1+n*10e-3) 96 | eps = 1.0 / np.sqrt(n+1) 97 | 98 | episode_reward, episode_length = play_one_game(bins, Q, eps) 99 | 100 | if n % 100 == 0: 101 | print(n, '%.4f' % eps, episode_reward) 102 | length.append(episode_length) 103 | reward.append(episode_reward) 104 | 105 | return length, reward 106 | 107 | def plot_running_avg(totalrewards): 108 | N = len(totalrewards) 109 | running_avg = np.empty(N) 110 | for t in range(N): 111 | running_avg[t] = np.mean(totalrewards[max(0, t-100):(t+1)]) 112 | plt.plot(running_avg) 113 | plt.title("Running Average") 114 | plt.show() 115 | 116 | if __name__ == '__main__': 117 | bins = create_bins() 118 | episode_lengths, episode_rewards = play_many_games(bins) 119 | 120 | plot_running_avg(episode_rewards) -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/doubleQLearning.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import gym 5 | 6 | def maxAction(Q1, Q2, state): 7 | values = np.array([Q1[state,a] + Q2[state,a] for a in range(2)]) 8 | action = np.argmax(values) 9 | return action 10 | 11 | #discretize the spaces 12 | poleThetaSpace = np.linspace(-0.20943951, 0.20943951, 10) 13 | poleThetaVelSpace = np.linspace(-4, 4, 10) 14 | cartPosSpace = np.linspace(-2.4, 2.4, 10) 15 | cartVelSpace = np.linspace(-4, 4, 10) 16 | 17 | def getState(observation): 18 | cartX, cartXdot, cartTheta, cartThetadot = observation 19 | cartX = int(np.digitize(cartX, cartPosSpace)) 20 | cartXdot = int(np.digitize(cartXdot, cartVelSpace)) 21 | cartTheta = int(np.digitize(cartTheta, poleThetaSpace)) 22 | cartThetadot = int(np.digitize(cartThetadot, poleThetaVelSpace)) 23 | 24 | return (cartX, cartXdot, cartTheta, cartThetadot) 25 | 26 | def plotRunningAverage(totalrewards): 27 | N = len(totalrewards) 28 | running_avg = np.empty(N) 29 | for t in range(N): 30 | running_avg[t] = np.mean(totalrewards[max(0, t-100):(t+1)]) 31 | plt.plot(running_avg) 32 | plt.title("Running Average") 33 | plt.show() 34 | 35 | if __name__ == '__main__': 36 | env = gym.make('CartPole-v0') 37 | # model hyperparameters 38 | ALPHA = 0.1 39 | GAMMA = 0.9 40 | EPS = 1.0 41 | 42 | #construct state space 43 | states = [] 44 | for i in range(len(cartPosSpace)+1): 45 | for j in range(len(cartVelSpace)+1): 46 | for k in range(len(poleThetaSpace)+1): 47 | for l in range(len(poleThetaVelSpace)+1): 48 | states.append((i,j,k,l)) 49 | 50 | Q1, Q2 = {}, {} 51 | for s in states: 52 | for a in range(2): 53 | Q1[s, a] = 0 54 | Q2[s,a] = 0 55 | 56 | numGames = 100000 57 | totalRewards = np.zeros(numGames) 58 | for i in range(numGames): 59 | if i % 5000 == 0: 60 | print('starting game ', i) 61 | done = False 62 | epRewards = 0 63 | observation = env.reset() 64 | while not done: 65 | s = 
getState(observation) 66 | rand = np.random.random() 67 | a = maxAction(Q1,Q2,s) if rand < (1-EPS) else env.action_space.sample() 68 | observation_, reward, done, info = env.step(a) 69 | epRewards += reward 70 | s_ = getState(observation_) 71 | rand = np.random.random() 72 | if rand <= 0.5: 73 | a_ = maxAction(Q1,Q1,s_) 74 | Q1[s,a] = Q1[s,a] + ALPHA*(reward + GAMMA*Q2[s_,a_] - Q1[s,a]) 75 | elif rand > 0.5: 76 | a_ = maxAction(Q2,Q2,s_) 77 | Q2[s,a] = Q2[s,a] + ALPHA*(reward + GAMMA*Q1[s_,a_] - Q2[s,a]) 78 | observation = observation_ 79 | EPS -= 2/(numGames) if EPS > 0 else 0 80 | totalRewards[i] = epRewards 81 | 82 | #plt.plot(totalRewards, 'b--') 83 | #plt.show() 84 | plotRunningAverage(totalRewards) 85 | -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/mountaincar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/Fundamentals/mountaincar.png -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from gym import wrappers 5 | import pickle 6 | 7 | pos_space = np.linspace(-1.2, 0.6, 12) 8 | vel_space = np.linspace(-0.07, 0.07, 20) 9 | 10 | def get_state(observation): 11 | pos, vel = observation 12 | pos_bin = int(np.digitize(pos, pos_space)) 13 | vel_bin = int(np.digitize(vel, vel_space)) 14 | 15 | return (pos_bin, vel_bin) 16 | 17 | def max_action(Q, state, actions=[0, 1, 2]): 18 | values = np.array([Q[state,a] for a in actions]) 19 | action = np.argmax(values) 20 | 21 | return action 22 | 23 | if __name__ == '__main__': 24 | env = gym.make('MountainCar-v0') 25 | env._max_episode_steps = 1000 26 | n_games = 50000 27 | alpha = 0.1 28 | gamma = 0.99 29 | eps = 1.0 30 | 31 | action_space = [0, 1, 2] 32 | 33 | states = [] 34 | for pos in range(21): 35 | for vel in range(21): 36 | states.append((pos, vel)) 37 | 38 | Q = {} 39 | for state in states: 40 | for action in action_space: 41 | Q[state, action] = 0 42 | 43 | #pickle_in = open('mountaincar.pkl', 'rb') 44 | #Q = pickle.load(pickle_in) 45 | #env = wrappers.Monitor(env, "tmp/mountaincar", 46 | #video_callable=lambda episode_id: True, force=True) 47 | score = 0 48 | total_rewards = np.zeros(n_games) 49 | for i in range(n_games): 50 | done = False 51 | obs = env.reset() 52 | state = get_state(obs) 53 | if i % 100 == 0 and i > 0: 54 | print('episode ', i, 'score ', score, 'epsilon %.3f' % eps) 55 | score = 0 56 | while not done: 57 | action = np.random.choice([0,1,2]) if np.random.random() < eps \ 58 | else max_action(Q, state) 59 | obs_, reward, done, info = env.step(action) 60 | state_ = get_state(obs_) 61 | score += reward 62 | action_ = max_action(Q, state_) 63 | Q[state, action] = Q[state, action] + \ 64 | alpha*(reward + gamma*Q[state_, action_] - Q[state, action]) 65 | state = state_ 66 | total_rewards[i] = score 67 | eps = eps - 2/n_games if eps > 0.01 else 0.01 68 | 69 | mean_rewards = np.zeros(n_games) 70 | for t in range(n_games): 71 | mean_rewards[t] = np.mean(total_rewards[max(0, t-50):(t+1)]) 72 | plt.plot(mean_rewards) 73 | plt.savefig('mountaincar.png') 74 | 75 | #f = open("mountaincar.pkl","wb") 76 | #pickle.dump(Q,f) 77 | #f.close() 78 | 
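# The update inside the loop above is one-step Q-learning (off-policy TD
# control):
#
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
#
# max_action(Q, state_) is used only to form the bootstrap target; the action
# actually executed on the next step is re-chosen epsilon-greedily at the top
# of the while loop. sarsa.py in this directory bootstraps instead from the
# action the behavior policy actually takes, which is what makes it on-policy.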
-------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/n_step_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | poleThetaSpace = np.linspace(-0.209, 0.209, 10) 5 | poleThetaVelSpace = np.linspace(-4, 4, 10) 6 | cartPosSpace = np.linspace(-2.4, 2.4, 10) 7 | cartVelSpace = np.linspace(-4, 4, 10) 8 | 9 | def get_state(observation): 10 | cartX, cartXdot, cartTheta, cartThetaDot = observation 11 | cartX = int(np.digitize(cartX, cartPosSpace)) 12 | cartXdot = int(np.digitize(cartXdot, cartVelSpace)) 13 | cartTheta = int(np.digitize(cartTheta, poleThetaSpace)) 14 | cartThetaDot = int(np.digitize(cartThetaDot, poleThetaVelSpace)) 15 | 16 | return (cartX, cartXdot, cartTheta, cartThetaDot) 17 | 18 | def choose_action(q, obs, eps, n_actions=2): 19 | state = get_state(obs) 20 | if np.random.random() < eps: 21 | action = np.random.choice([i for i in range(n_actions)]) 22 | else: 23 | action_values = [q[(state, a)] for a in range(n_actions)] 24 | action = np.argmax(action_values) 25 | return action 26 | 27 | if __name__ == '__main__': 28 | env = gym.make('CartPole-v0') 29 | alpha = 0.1 30 | gamma = 0.9 31 | epsilon = 1.0 32 | 33 | states = [] 34 | for i in range(len(cartPosSpace)+1): 35 | for j in range(len(cartVelSpace)+1): 36 | for k in range(len(poleThetaSpace)+1): 37 | for l in range(len(poleThetaVelSpace)+1): 38 | states.append((i,j,k,l)) 39 | 40 | Q = {} 41 | for s in states: 42 | for a in range(2): 43 | Q[(s, a)] = 0.0 44 | 45 | n = 16 46 | state_memory = np.zeros((n, 4)) 47 | action_memory = np.zeros(n) 48 | reward_memory = np.zeros(n) 49 | 50 | scores = [] 51 | n_episodes = 50000 52 | for i in range(n_episodes): 53 | done = False 54 | score = 0 55 | t = 0 56 | T = np.inf 57 | observation = env.reset() 58 | action = choose_action(Q, observation, epsilon) 59 | action_memory[t%n] = action 60 | state_memory[t%n] = observation 61 | while not done: 62 | observation, reward, done, info = env.step(action) 63 | score += reward 64 | state_memory[(t+1)%n] = observation 65 | reward_memory[(t+1)%n] = reward 66 | if done: 67 | T = t + 1 68 | #print('episode ends at step', t) 69 | action = choose_action(Q, observation, epsilon) 70 | action_memory[(t+1)%n] = action 71 | tau = t - n + 1 72 | if tau >= 0: 73 | G = [gamma**(j-tau-1)*reward_memory[j%n] \ 74 | for j in range(tau+1, min(tau+n, T)+1)] 75 | G = np.sum(G) 76 | if tau + n < T: 77 | s = get_state(state_memory[(tau+n)%n]) 78 | a = int(action_memory[(tau+n)%n]) 79 | G += gamma**n * Q[(s,a)] 80 | s = get_state(state_memory[tau%n]) 81 | a = action_memory[tau%n] 82 | Q[(s,a)] += alpha*(G-Q[(s,a)]) 83 | #print('tau ', tau, '| Q %.2f' % \ 84 | # Q[(get_state(state_memory[tau%n]), action_memory[tau%n])]) 85 | 86 | t += 1 87 | 88 | for tau in range(t-n+1, T): 89 | G = [gamma**(j-tau-1)*reward_memory[j%n] \ 90 | for j in range(tau+1, min(tau+n, T)+1)] 91 | G = np.sum(G) 92 | if tau + n < T: 93 | s = get_state(state_memory[(tau+n)%n]) 94 | a = int(action_memory[(tau+n)%n]) 95 | G += gamma**n * Q[(s,a)] 96 | s = get_state(state_memory[tau%n]) 97 | a = action_memory[tau%n] 98 | Q[(s,a)] += alpha*(G-Q[(s,a)]) 99 | #print('tau ', tau, '| Q %.2f' % \ 100 | # Q[(get_state(state_memory[tau%n]), action_memory[tau%n])]) 101 | scores.append(score) 102 | avg_score = np.mean(scores[-1000:]) 103 | epsilon = epsilon -2 / n_episodes if epsilon > 0 else 0 104 | if i % 1000 == 0: 105 | print('episode ', i, 'avg_score %.1f' % 
avg_score, 106 | 'epsilon %.2f' % epsilon) 107 | 108 | -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import gym 4 | 5 | def maxAction(Q, state): 6 | values = np.array([Q[state,a] for a in range(2)]) 7 | action = np.argmax(values) 8 | return action 9 | 10 | #discretize the spaces 11 | poleThetaSpace = np.linspace(-0.20943951, 0.20943951, 10) 12 | poleThetaVelSpace = np.linspace(-4, 4, 10) 13 | cartPosSpace = np.linspace(-2.4, 2.4, 10) 14 | cartVelSpace = np.linspace(-4, 4, 10) 15 | 16 | def getState(observation): 17 | cartX, cartXdot, cartTheta, cartThetadot = observation 18 | cartX = int(np.digitize(cartX, cartPosSpace)) 19 | cartXdot = int(np.digitize(cartXdot, cartVelSpace)) 20 | cartTheta = int(np.digitize(cartTheta, poleThetaSpace)) 21 | cartThetadot = int(np.digitize(cartThetadot, poleThetaVelSpace)) 22 | 23 | return (cartX, cartXdot, cartTheta, cartThetadot) 24 | 25 | if __name__ == '__main__': 26 | env = gym.make('CartPole-v0') 27 | # model hyperparameters 28 | ALPHA = 0.1 29 | GAMMA = 0.9 30 | EPS = 1.0 31 | 32 | #construct state space 33 | states = [] 34 | for i in range(len(cartPosSpace)+1): 35 | for j in range(len(cartVelSpace)+1): 36 | for k in range(len(poleThetaSpace)+1): 37 | for l in range(len(poleThetaVelSpace)+1): 38 | states.append((i,j,k,l)) 39 | 40 | Q = {} 41 | for s in states: 42 | for a in range(2): 43 | Q[s, a] = 0 44 | 45 | numGames = 50000 46 | totalRewards = np.zeros(numGames) 47 | for i in range(numGames): 48 | if i % 5000 == 0: 49 | print('starting game', i) 50 | # cart x position, cart velocity, pole theta, pole velocity 51 | observation = env.reset() 52 | s = getState(observation) 53 | rand = np.random.random() 54 | a = maxAction(Q, s) if rand < (1-EPS) else env.action_space.sample() 55 | done = False 56 | epRewards = 0 57 | while not done: 58 | observation_, reward, done, info = env.step(a) 59 | s_ = getState(observation_) 60 | rand = np.random.random() 61 | a_ = maxAction(Q, s_) if rand < (1-EPS) else env.action_space.sample() 62 | epRewards += reward 63 | Q[s,a] = Q[s,a] + ALPHA*(reward + GAMMA*Q[s_,a_] - Q[s,a]) 64 | s, a = s_, a_ 65 | EPS -= 2/(numGames) if EPS > 0 else 0 66 | totalRewards[i] = epRewards 67 | 68 | plt.plot(totalRewards, 'b--') 69 | plt.show() -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/A3C_CartPole_no_rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/ICM/A3C_CartPole_no_rewards.png -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/ICM_CartPole_no_rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/ICM/ICM_CartPole_no_rewards.png -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from 
torch.distributions import Categorical 6 | 7 | 8 | class ActorCritic(nn.Module): 9 | def __init__(self, input_dims, n_actions, gamma=0.99, tau=0.98): 10 | super(ActorCritic, self).__init__() 11 | self.gamma = gamma 12 | self.tau = tau 13 | 14 | self.input = nn.Linear(*input_dims, 256) 15 | self.dense = nn.Linear(256, 256) 16 | 17 | self.gru = nn.GRUCell(256, 256) 18 | self.pi = nn.Linear(256, n_actions) 19 | self.v = nn.Linear(256, 1) 20 | 21 | def forward(self, state, hx): 22 | x = F.relu(self.input(state)) 23 | x = F.relu(self.dense(x)) 24 | hx = self.gru(x, (hx)) 25 | 26 | pi = self.pi(hx) 27 | v = self.v(hx) 28 | 29 | probs = T.softmax(pi, dim=1) 30 | dist = Categorical(probs) 31 | action = dist.sample() 32 | log_prob = dist.log_prob(action) 33 | 34 | return action.numpy()[0], v, log_prob, hx 35 | 36 | def calc_R(self, done, rewards, values): 37 | values = T.cat(values).squeeze() 38 | if len(values.size()) == 1: # batch of states 39 | R = values[-1] * (1-int(done)) 40 | elif len(values.size()) == 0: # single state 41 | R = values*(1-int(done)) 42 | 43 | batch_return = [] 44 | for reward in rewards[::-1]: 45 | R = reward + self.gamma * R 46 | batch_return.append(R) 47 | batch_return.reverse() 48 | batch_return = T.tensor(batch_return, 49 | dtype=T.float).reshape(values.size()) 50 | return batch_return 51 | 52 | def calc_loss(self, new_states, hx, done, 53 | rewards, values, log_probs, r_i_t=None): 54 | if r_i_t is not None: 55 | rewards += r_i_t.detach().numpy() 56 | returns = self.calc_R(done, rewards, values) 57 | next_v = T.zeros(1, 1) if done else self.forward(T.tensor([new_states], 58 | dtype=T.float), hx)[1] 59 | 60 | values.append(next_v.detach()) 61 | values = T.cat(values).squeeze() 62 | log_probs = T.cat(log_probs) 63 | rewards = T.tensor(rewards) 64 | 65 | delta_t = rewards + self.gamma*values[1:] - values[:-1] 66 | n_steps = len(delta_t) 67 | gae = np.zeros(n_steps) 68 | for t in range(n_steps): 69 | for k in range(0, n_steps-t): 70 | temp = (self.gamma*self.tau)**k*delta_t[t+k] 71 | gae[t] += temp 72 | gae = T.tensor(gae, dtype=T.float) 73 | 74 | actor_loss = -(log_probs*gae).sum() 75 | entropy_loss = (-log_probs*T.exp(log_probs)).sum() 76 | # [T] vs () 77 | critic_loss = F.mse_loss(values[:-1].squeeze(), returns) 78 | 79 | total_loss = actor_loss + critic_loss - 0.01*entropy_loss 80 | return total_loss 81 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/icm.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ICM(nn.Module): 7 | def __init__(self, input_dims, n_actions=2, alpha=1, beta=0.2): 8 | super(ICM, self).__init__() 9 | self.alpha = alpha 10 | self.beta = beta 11 | # hard coded for cartpole environment 12 | self.inverse = nn.Linear(4*2, 256) 13 | self.pi_logits = nn.Linear(256, n_actions) 14 | 15 | self.dense1 = nn.Linear(4+1, 256) 16 | self.new_state = nn.Linear(256, 4) 17 | 18 | device = T.device('cpu') 19 | self.to(device) 20 | 21 | def forward(self, state, new_state, action): 22 | inverse = F.elu(self.inverse(T.cat([state, new_state], dim=1))) 23 | pi_logits = self.pi_logits(inverse) 24 | 25 | # from [T] to [T,1] 26 | action = action.reshape((action.size()[0], 1)) 27 | forward_input = T.cat([state, action], dim=1) 28 | dense = F.elu(self.dense1(forward_input)) 29 | state_ = self.new_state(dense) 30 | 31 | return pi_logits, state_ 32 | 33 | def calc_loss(self, state, 
new_state, action): 34 | state = T.tensor(state, dtype=T.float) 35 | action = T.tensor(action, dtype=T.float) 36 | new_state = T.tensor(new_state, dtype=T.float) 37 | 38 | pi_logits, state_ = self.forward(state, new_state, action) 39 | 40 | inverse_loss = nn.CrossEntropyLoss() 41 | L_I = (1-self.beta)*inverse_loss(pi_logits, action.to(T.long)) 42 | 43 | forward_loss = nn.MSELoss() 44 | L_F = self.beta*forward_loss(state_, new_state) 45 | 46 | intrinsic_reward = self.alpha*((state_ - new_state).pow(2)).mean(dim=1) 47 | return intrinsic_reward, L_I, L_F 48 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch.multiprocessing as mp 3 | from parallel_env import ParallelEnv 4 | 5 | os.environ['OMP_NUM_THREADS'] = '1' 6 | 7 | 8 | if __name__ == '__main__': 9 | mp.set_start_method('spawn') 10 | env_id = 'CartPole-v0' 11 | n_threads = 12 12 | n_actions = 2 13 | input_shape = [4] 14 | env = ParallelEnv(env_id=env_id, n_threads=n_threads, 15 | n_actions=n_actions, input_shape=input_shape, icm=True) 16 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/memory.py: -------------------------------------------------------------------------------- 1 | class Memory: 2 | def __init__(self): 3 | self.states = [] 4 | self.actions = [] 5 | self.rewards = [] 6 | self.new_states = [] 7 | self.values = [] 8 | self.log_probs = [] 9 | 10 | def remember(self, state, action, reward, new_state, value, log_p): 11 | self.actions.append(action) 12 | self.rewards.append(reward) 13 | self.states.append(state) 14 | self.new_states.append(new_state) 15 | self.log_probs.append(log_p) 16 | self.values.append(value) 17 | 18 | def clear_memory(self): 19 | self.states = [] 20 | self.actions = [] 21 | self.rewards = [] 22 | self.new_states = [] 23 | self.values = [] 24 | self.log_probs = [] 25 | 26 | def sample_memory(self): 27 | return self.states, self.actions, self.rewards, self.new_states,\ 28 | self.values, self.log_probs 29 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/parallel_env.py: -------------------------------------------------------------------------------- 1 | import torch.multiprocessing as mp 2 | from actor_critic import ActorCritic 3 | from icm import ICM 4 | from shared_adam import SharedAdam 5 | from worker import worker 6 | 7 | 8 | class ParallelEnv: 9 | def __init__(self, env_id, input_shape, n_actions, icm, n_threads=8): 10 | names = [str(i) for i in range(1, n_threads+1)] 11 | 12 | global_actor_critic = ActorCritic(input_shape, n_actions) 13 | global_actor_critic.share_memory() 14 | global_optim = SharedAdam(global_actor_critic.parameters()) 15 | 16 | if not icm: 17 | global_icm = None 18 | global_icm_optim = None 19 | else: 20 | global_icm = ICM(input_shape, n_actions) 21 | global_icm.share_memory() 22 | global_icm_optim = SharedAdam(global_icm.parameters()) 23 | 24 | self.ps = [mp.Process(target=worker, 25 | args=(name, input_shape, n_actions, 26 | global_actor_critic, global_icm, 27 | global_optim, global_icm_optim, env_id, 28 | n_threads, icm)) 29 | for name in names] 30 | 31 | [p.start() for p in self.ps] 32 | [p.join() for p in self.ps] 33 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/shared_adam.py: 
-------------------------------------------------------------------------------- 1 | # from Morvan Zhou's implementation: 2 | # https://github.com/MorvanZhou/pytorch-A3C 3 | 4 | import torch as T 5 | 6 | 7 | class SharedAdam(T.optim.Adam): 8 | def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), eps=1e-8, 9 | weight_decay=0): 10 | super(SharedAdam, self).__init__(params, lr=lr, betas=betas, 11 | eps=eps, weight_decay=weight_decay) 12 | 13 | for group in self.param_groups: 14 | for p in group['params']: 15 | state = self.state[p] 16 | state['step'] = 0 17 | state['exp_avg'] = T.zeros_like(p.data) 18 | state['exp_avg_sq'] = T.zeros_like(p.data) 19 | 20 | state['exp_avg'].share_memory_() 21 | state['exp_avg_sq'].share_memory_() 22 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | def plot_learning_curve(x, scores, figure_file): 6 | running_avg = np.zeros(len(scores)) 7 | for i in range(len(running_avg)): 8 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 9 | plt.plot(x, running_avg) 10 | plt.title('Running average of previous 100 episodes') 11 | plt.savefig(figure_file) 12 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import torch as T 4 | from actor_critic import ActorCritic 5 | from icm import ICM 6 | from memory import Memory 7 | from utils import plot_learning_curve 8 | 9 | 10 | def worker(name, input_shape, n_actions, global_agent, global_icm, 11 | optimizer, icm_optimizer, env_id, n_threads, icm=False): 12 | T_MAX = 20 13 | 14 | local_agent = ActorCritic(input_shape, n_actions) 15 | 16 | if icm: 17 | local_icm = ICM(input_shape, n_actions) 18 | algo = 'ICM' 19 | else: 20 | intrinsic_reward = T.zeros(1) 21 | algo = 'A3C' 22 | 23 | memory = Memory() 24 | 25 | env = gym.make(env_id) 26 | 27 | t_steps, max_eps, episode, scores, avg_score = 0, 1000, 0, [], 0 28 | 29 | while episode < max_eps: 30 | obs = env.reset() 31 | hx = T.zeros(1, 256) 32 | score, done, ep_steps = 0, False, 0 33 | while not done: 34 | state = T.tensor([obs], dtype=T.float) 35 | action, value, log_prob, hx = local_agent(state, hx) 36 | obs_, reward, done, info = env.step(action) 37 | t_steps += 1 38 | ep_steps += 1 39 | score += reward 40 | reward = 0 # turn off extrinsic rewards 41 | memory.remember(obs, action, reward, obs_, value, log_prob) 42 | obs = obs_ 43 | if ep_steps % T_MAX == 0 or done: 44 | states, actions, rewards, new_states, values, log_probs = \ 45 | memory.sample_memory() 46 | if icm: 47 | intrinsic_reward, L_I, L_F = \ 48 | local_icm.calc_loss(states, new_states, actions) 49 | 50 | loss = local_agent.calc_loss(obs, hx, done, rewards, values, 51 | log_probs, intrinsic_reward) 52 | 53 | optimizer.zero_grad() 54 | hx = hx.detach_() 55 | if icm: 56 | icm_optimizer.zero_grad() 57 | (L_I + L_F).backward() 58 | 59 | loss.backward() 60 | T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40) 61 | 62 | for local_param, global_param in zip( 63 | local_agent.parameters(), 64 | global_agent.parameters()): 65 | global_param._grad = local_param.grad 66 | optimizer.step() 67 | local_agent.load_state_dict(global_agent.state_dict()) 68 | 69 | if icm: 70 | for local_param, global_param in zip( 71 | 
local_icm.parameters(), 72 | global_icm.parameters()): 73 | global_param._grad = local_param.grad 74 | icm_optimizer.step() 75 | local_icm.load_state_dict(global_icm.state_dict()) 76 | memory.clear_memory() 77 | 78 | if name == '1': 79 | scores.append(score) 80 | avg_score = np.mean(scores[-100:]) 81 | print('{} episode {} thread {} of {} steps {:.2f}M score {:.2f} ' 82 | 'intrinsic_reward {:.2f} avg score (100) {:.1f}'.format( 83 | algo, episode, name, n_threads, 84 | t_steps/1e6, score, 85 | T.sum(intrinsic_reward), 86 | avg_score)) 87 | episode += 1 88 | if name == '1': 89 | x = [z for z in range(episode)] 90 | fname = algo + '_CartPole_no_rewards.png' 91 | plot_learning_curve(x, scores, fname) 92 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/Torch-LunarLander-alpha000025-beta00025-400-300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/Torch-LunarLander-alpha000025-beta00025-400-300.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/main_torch.py: -------------------------------------------------------------------------------- 1 | from ddpg_torch import Agent 2 | import gym 3 | import numpy as np 4 | from utils import plotLearning 5 | 6 | env = gym.make('LunarLanderContinuous-v2') 7 | agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001, env=env, 8 | batch_size=64, layer1_size=400, layer2_size=300, n_actions=2) 9 | 10 | #agent.load_models() 11 | np.random.seed(0) 12 | 13 | score_history = [] 14 | for i in range(1000): 15 | obs = env.reset() 16 | done = False 17 | score = 0 18 | while not done: 19 | act = agent.choose_action(obs) 20 | new_state, reward, done, info = env.step(act) 21 | agent.remember(obs, act, reward, new_state, int(done)) 22 | agent.learn() 23 | score += reward 24 | obs = new_state 25 | #env.render() 26 | score_history.append(score) 27 | 28 | #if i % 25 == 0: 29 | # agent.save_models() 30 | 31 | print('episode ', i, 'score %.2f' % score, 32 | 'trailing 100 games avg %.3f' % np.mean(score_history[-100:])) 33 | 34 | filename = 'LunarLander-alpha000025-beta00025-400-300.png' 35 | plotLearning(score_history, filename, window=100) 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plotLearning(scores, filename, x=None, window=5): 5 | N = len(scores) 6 | running_avg = np.empty(N) 7 | for t in range(N): 8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)]) 9 | if x is None: 10 | x = [i for i in range(N)] 11 | plt.ylabel('Score') 12 | plt.xlabel('Game') 13 | plt.plot(x, running_avg) 14 | plt.savefig(filename) -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/pendulum/main_tf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import numpy as np 4 | from ddpg_orig_tf import Agent 5 | from utils import plotLearning 6 | 7 | # Uncomment the lines below 
to specify which gpu to run on 8 | #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 9 | #os.environ["CUDA_VISIBLE_DEVICES"] = "0" 10 | 11 | if __name__ == '__main__': 12 | env = gym.make('Pendulum-v0') 13 | agent = Agent(alpha=0.00005, beta=0.0005, input_dims=[3], tau=0.001, 14 | env=env, batch_size=64, layer1_size=800, layer2_size=600, 15 | n_actions=1) 16 | np.random.seed(0) 17 | score_history = [] 18 | for i in range(1000): 19 | obs = env.reset() 20 | done = False 21 | score = 0 22 | while not done: 23 | act = agent.choose_action(obs) 24 | new_state, reward, done, info = env.step(act) 25 | agent.remember(obs, act, reward, new_state, int(done)) 26 | agent.learn() 27 | score += reward 28 | obs = new_state 29 | #env.render() 30 | score_history.append(score) 31 | print('episode ', i, 'score %.2f' % score, 32 | 'trailing 100 games avg %.3f' % np.mean(score_history[-100:])) 33 | 34 | filename = 'Pendulum-alpha00005-beta0005-800-600-optimized.png' 35 | plotLearning(score_history, filename, window=100) 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/pendulum/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plotLearning(scores, filename, x=None, window=5): 5 | N = len(scores) 6 | running_avg = np.empty(N) 7 | for t in range(N): 8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)]) 9 | if x is None: 10 | x = [i for i in range(N)] 11 | plt.ylabel('Score') 12 | plt.xlabel('Game') 13 | plt.plot(x, running_avg) 14 | plt.savefig(filename) -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/main_tf.py: -------------------------------------------------------------------------------- 1 | from ddpg_orig_tf import Agent 2 | import gym 3 | import numpy as np 4 | from utils import plotLearning 5 | from gym import wrappers 6 | import os 7 | 8 | #tf.set_random_seed(0) 9 | if __name__ == '__main__': 10 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 12 | 13 | env = gym.make('BipedalWalker-v2') 14 | agent = Agent(alpha=0.00005, beta=0.0005, input_dims=[24], tau=0.001, env=env, 15 | batch_size=64, layer1_size=400, layer2_size=300, n_actions=4, 16 | chkpt_dir='tmp/ddpg') 17 | np.random.seed(0) 18 | #agent.load_models() 19 | #env = wrappers.Monitor(env, "tmp/walker2d", 20 | # video_callable=lambda episode_id: True, force=True) 21 | score_history = [] 22 | for i in range(5000): 23 | obs = env.reset() 24 | done = False 25 | score = 0 26 | while not done: 27 | act = agent.choose_action(obs) 28 | new_state, reward, done, info = env.step(act) 29 | agent.remember(obs, act, reward, new_state, int(done)) 30 | agent.learn() 31 | score += reward 32 | obs = new_state 33 | env.render() 34 | score_history.append(score) 35 | print('episode ', i, 'score %.2f' % score, 36 | 'trailing 100 games avg %.3f' % np.mean(score_history[-100:])) 37 | if i % 25 == 0: 38 | agent.save_models() 39 | filename = 'WalkerTF-alpha00005-beta0005-400-300-original-5000games-testing.png' 40 | plotLearning(score_history, filename, window=100) 41 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.data-00000-of-00001: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.index -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.meta -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.index -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.meta -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.index: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.index -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.meta -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.index -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.meta -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer: 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.action_memory[index] = action 19 | self.reward_memory[index] = reward 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards 
= self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/main_ddpg.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from ddpg_tf2 import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('Pendulum-v0') 8 | agent = Agent(input_dims=env.observation_space.shape, env=env, 9 | n_actions=env.action_space.shape[0]) 10 | n_games = 250 11 | 12 | figure_file = 'plots/pendulum.png' 13 | 14 | best_score = env.reward_range[0] 15 | score_history = [] 16 | load_checkpoint = False 17 | 18 | if load_checkpoint: 19 | n_steps = 0 20 | while n_steps <= agent.batch_size: 21 | observation = env.reset() 22 | action = env.action_space.sample() 23 | observation_, reward, done, info = env.step(action) 24 | agent.remember(observation, action, reward, observation_, done) 25 | n_steps += 1 26 | agent.learn() 27 | agent.load_models() 28 | evaluate = True 29 | else: 30 | evaluate = False 31 | 32 | for i in range(n_games): 33 | observation = env.reset() 34 | done = False 35 | score = 0 36 | while not done: 37 | action = agent.choose_action(observation, evaluate) 38 | observation_, reward, done, info = env.step(action) 39 | score += reward 40 | agent.remember(observation, action, reward, observation_, done) 41 | if not load_checkpoint: 42 | agent.learn() 43 | observation = observation_ 44 | 45 | score_history.append(score) 46 | avg_score = np.mean(score_history[-100:]) 47 | 48 | if avg_score > best_score: 49 | best_score = avg_score 50 | if not load_checkpoint: 51 | agent.save_models() 52 | 53 | print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score) 54 | 55 | if not load_checkpoint: 56 | x = [i+1 for i in range(n_games)] 57 | plot_learning_curve(x, score_history, figure_file) 58 | 59 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | from tensorflow.keras.layers import Dense 5 | 6 | class CriticNetwork(keras.Model): 7 | def __init__(self, fc1_dims=512, fc2_dims=512, 8 | name='critic', chkpt_dir='tmp/ddpg'): 9 | super(CriticNetwork, self).__init__() 10 | self.fc1_dims = fc1_dims 11 | self.fc2_dims = fc2_dims 12 | 13 | self.model_name = name 14 | self.checkpoint_dir = chkpt_dir 15 | self.checkpoint_file = os.path.join(self.checkpoint_dir, 16 | self.model_name+'_ddpg.h5') 17 | 18 | self.fc1 = Dense(self.fc1_dims, activation='relu') 19 | self.fc2 = Dense(self.fc2_dims, activation='relu') 20 | self.q = Dense(1, activation=None) 21 | 22 | def call(self, state, action): 23 | action_value = self.fc1(tf.concat([state, action], axis=1)) 24 | action_value = self.fc2(action_value) 25 | 26 | q = self.q(action_value) 27 | 28 | return q 29 | 30 | class ActorNetwork(keras.Model): 31 | def __init__(self, fc1_dims=512, fc2_dims=512, n_actions=2, name='actor', 32 | chkpt_dir='tmp/ddpg'): 33 | super(ActorNetwork, self).__init__() 34 | self.fc1_dims = fc1_dims 35 | self.fc2_dims = fc2_dims 36 | self.n_actions = n_actions 37 | 38 | self.model_name = name 39 | self.checkpoint_dir = chkpt_dir 40 | 
self.checkpoint_file = os.path.join(self.checkpoint_dir, 41 | self.model_name+'_ddpg.h5') 42 | 43 | self.fc1 = Dense(self.fc1_dims, activation='relu') 44 | self.fc2 = Dense(self.fc2_dims, activation='relu') 45 | self.mu = Dense(self.n_actions, activation='tanh') 46 | 47 | def call(self, state): 48 | prob = self.fc1(state) 49 | prob = self.fc2(prob) 50 | 51 | mu = self.mu(prob) 52 | 53 | return mu 54 | 55 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/pendulum.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('CartPole-v0') 8 | N = 20 9 | batch_size = 5 10 | n_epochs = 4 11 | alpha = 0.0003 12 | agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 13 | alpha=alpha, n_epochs=n_epochs, 14 | input_dims=env.observation_space.shape) 15 | n_games = 300 16 | 17 | figure_file = 'plots/cartpole.png' 18 | 19 | best_score = env.reward_range[0] 20 | score_history = [] 21 | 22 | learn_iters = 0 23 | avg_score = 0 24 | n_steps = 0 25 | 26 | for i in range(n_games): 27 | observation = env.reset() 28 | done = False 29 | score = 0 30 | while not done: 31 | action, prob, val = agent.choose_action(observation) 32 | observation_, reward, done, info = env.step(action) 33 | n_steps += 1 34 | score += reward 35 | agent.store_transition(observation, action, 36 | prob, val, reward, done) 37 | if n_steps % N == 0: 38 | agent.learn() 39 | learn_iters += 1 40 | observation = observation_ 41 | score_history.append(score) 42 | avg_score = np.mean(score_history[-100:]) 43 | 44 | if avg_score > best_score: 45 | best_score = avg_score 46 | agent.save_models() 47 | 48 | print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score, 49 | 'time_steps', n_steps, 'learning_steps', learn_iters) 50 | x = [i+1 for i in range(len(score_history))] 51 | plot_learning_curve(x, score_history, figure_file) 52 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/tf2/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class PPOMemory: 5 | def __init__(self, batch_size): 6 | self.states = [] 7 | self.probs = [] 8 | self.vals = [] 9 | self.actions = [] 10 | self.rewards = [] 11 | self.dones 
= [] 12 | 13 | self.batch_size = batch_size 14 | 15 | def generate_batches(self): 16 | n_states = len(self.states) 17 | batch_start = np.arange(0, n_states, self.batch_size) 18 | indices = np.arange(n_states, dtype=np.int64) 19 | np.random.shuffle(indices) 20 | batches = [indices[i:i+self.batch_size] for i in batch_start] 21 | 22 | return np.array(self.states),\ 23 | np.array(self.actions),\ 24 | np.array(self.probs),\ 25 | np.array(self.vals),\ 26 | np.array(self.rewards),\ 27 | np.array(self.dones),\ 28 | batches 29 | 30 | def store_memory(self, state, action, probs, vals, reward, done): 31 | self.states.append(state) 32 | self.actions.append(action) 33 | self.probs.append(probs) 34 | self.vals.append(vals) 35 | self.rewards.append(reward) 36 | self.dones.append(done) 37 | 38 | def clear_memory(self): 39 | self.states = [] 40 | self.probs = [] 41 | self.actions = [] 42 | self.rewards = [] 43 | self.dones = [] 44 | self.vals = [] 45 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Dense 3 | 4 | 5 | class ActorNetwork(keras.Model): 6 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256): 7 | super(ActorNetwork, self).__init__() 8 | 9 | self.fc1 = Dense(fc1_dims, activation='relu') 10 | self.fc2 = Dense(fc2_dims, activation='relu') 11 | self.fc3 = Dense(n_actions, activation='softmax') 12 | 13 | def call(self, state): 14 | x = self.fc1(state) 15 | x = self.fc2(x) 16 | x = self.fc3(x) 17 | 18 | return x 19 | 20 | 21 | class CriticNetwork(keras.Model): 22 | def __init__(self, fc1_dims=256, fc2_dims=256): 23 | super(CriticNetwork, self).__init__() 24 | self.fc1 = Dense(fc1_dims, activation='relu') 25 | self.fc2 = Dense(fc2_dims, activation='relu') 26 | self.q = Dense(1, activation=None) 27 | 28 | def call(self, state): 29 | x = self.fc1(state) 30 | x = self.fc2(x) 31 | q = self.q(x) 32 | 33 | return q 34 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/torch/Slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/PPO/torch/Slides.pdf -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/torch/cartpole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/PPO/torch/cartpole.png -------------------------------------------------------------------------------- 
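The PPOMemory class defined in ReinforcementLearning/PolicyGradient/PPO/tf2/memory.py above collects one rollout segment at a time; generate_batches shuffles the transition indices once and slices them into minibatches of batch_size, so each stored transition is visited exactly once per pass over the memory. The following is a minimal usage sketch, not a file from the repository: it assumes memory.py is importable as memory and uses made-up CartPole-sized observations.

import numpy as np
from memory import PPOMemory  # ReinforcementLearning/PolicyGradient/PPO/tf2/memory.py

memory = PPOMemory(batch_size=5)
for step in range(20):                    # N = 20 steps per update, as in main.py
    observation = np.random.randn(4)      # stand-in for a CartPole observation
    memory.store_memory(observation, action=0, probs=-0.69, vals=0.0,
                        reward=1.0, done=False)

states, actions, probs, vals, rewards, dones, batches = memory.generate_batches()
print(states.shape)                       # (20, 4): the full segment, in storage order
print([len(batch) for batch in batches])  # four shuffled index slices of length 5
memory.clear_memory()                     # the agent clears memory after each learn()

--------------------------------------------------------------------------------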
/ReinforcementLearning/PolicyGradient/PPO/torch/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from ppo_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('CartPole-v0') 8 | N = 20 9 | batch_size = 5 10 | n_epochs = 4 11 | alpha = 0.0003 12 | agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 13 | alpha=alpha, n_epochs=n_epochs, 14 | input_dims=env.observation_space.shape) 15 | n_games = 300 16 | 17 | figure_file = 'plots/cartpole.png' 18 | 19 | best_score = env.reward_range[0] 20 | score_history = [] 21 | 22 | learn_iters = 0 23 | avg_score = 0 24 | n_steps = 0 25 | 26 | for i in range(n_games): 27 | observation = env.reset() 28 | done = False 29 | score = 0 30 | while not done: 31 | action, prob, val = agent.choose_action(observation) 32 | observation_, reward, done, info = env.step(action) 33 | n_steps += 1 34 | score += reward 35 | agent.remember(observation, action, prob, val, reward, done) 36 | if n_steps % N == 0: 37 | agent.learn() 38 | learn_iters += 1 39 | observation = observation_ 40 | score_history.append(score) 41 | avg_score = np.mean(score_history[-100:]) 42 | 43 | if avg_score > best_score: 44 | best_score = avg_score 45 | agent.save_models() 46 | 47 | print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score, 48 | 'time_steps', n_steps, 'learning_steps', learn_iters) 49 | x = [i+1 for i in range(len(score_history))] 50 | plot_learning_curve(x, score_history, figure_file) 51 | 52 | 53 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/torch/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.action_memory[index] = action 19 | self.reward_memory[index] = reward 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards = 
self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | 37 | 38 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/main_sac.py: -------------------------------------------------------------------------------- 1 | import pybullet_envs 2 | import gym 3 | import numpy as np 4 | from sac_torch import Agent 5 | from utils import plot_learning_curve 6 | from gym import wrappers 7 | 8 | if __name__ == '__main__': 9 | env = gym.make('InvertedPendulumBulletEnv-v0') 10 | agent = Agent(input_dims=env.observation_space.shape, env=env, 11 | n_actions=env.action_space.shape[0]) 12 | n_games = 250 13 | # uncomment this line and do a mkdir tmp && mkdir video if you want to 14 | # record video of the agent playing the game. 15 | #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True) 16 | filename = 'inverted_pendulum.png' 17 | 18 | figure_file = 'plots/' + filename 19 | 20 | best_score = env.reward_range[0] 21 | score_history = [] 22 | load_checkpoint = False 23 | 24 | if load_checkpoint: 25 | agent.load_models() 26 | env.render(mode='human') 27 | 28 | for i in range(n_games): 29 | observation = env.reset() 30 | done = False 31 | score = 0 32 | while not done: 33 | action = agent.choose_action(observation) 34 | observation_, reward, done, info = env.step(action) 35 | score += reward 36 | agent.remember(observation, action, reward, observation_, done) 37 | if not load_checkpoint: 38 | agent.learn() 39 | observation = observation_ 40 | score_history.append(score) 41 | avg_score = np.mean(score_history[-100:]) 42 | 43 | if avg_score > best_score: 44 | best_score = avg_score 45 | if not load_checkpoint: 46 | agent.save_models() 47 | 48 | print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score) 49 | 50 | if not load_checkpoint: 51 | x = [i+1 for i in range(n_games)] 52 | plot_learning_curve(x, score_history, figure_file) 53 | 54 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/Slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/SAC/tf2/Slides.pdf -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer: 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.action_memory[index] = action 19 | self.reward_memory[index] = reward 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, 
self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards = self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/main_sac.py: -------------------------------------------------------------------------------- 1 | import pybullet_envs 2 | import gym 3 | import numpy as np 4 | from sac_tf2 import Agent 5 | from utils import plot_learning_curve 6 | from gym import wrappers 7 | 8 | if __name__ == '__main__': 9 | env = gym.make('InvertedPendulumBulletEnv-v0') 10 | agent = Agent(input_dims=env.observation_space.shape, env=env, 11 | n_actions=env.action_space.shape[0]) 12 | n_games = 250 13 | # uncomment this line and do a mkdir tmp && mkdir tmp/video if you want to 14 | # record video of the agent playing the game. 15 | #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True) 16 | filename = 'inverted_pendulum.png' 17 | 18 | figure_file = 'plots/' + filename 19 | 20 | best_score = env.reward_range[0] 21 | score_history = [] 22 | load_checkpoint = True 23 | 24 | if load_checkpoint: 25 | agent.load_models() 26 | env.render(mode='human') 27 | 28 | for i in range(n_games): 29 | observation = env.reset() 30 | done = False 31 | score = 0 32 | while not done: 33 | action = agent.choose_action(observation) 34 | observation_, reward, done, info = env.step(action) 35 | score += reward 36 | agent.remember(observation, action, reward, observation_, done) 37 | if not load_checkpoint: 38 | agent.learn() 39 | observation = observation_ 40 | score_history.append(score) 41 | avg_score = np.mean(score_history[-100:]) 42 | 43 | if avg_score > best_score: 44 | best_score = avg_score 45 | if not load_checkpoint: 46 | agent.save_models() 47 | 48 | print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score) 49 | 50 | if not load_checkpoint: 51 | x = [i+1 for i in range(n_games)] 52 | plot_learning_curve(x, score_history, figure_file) 53 | 54 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | import tensorflow.keras as keras 5 | import tensorflow_probability as tfp 6 | from tensorflow.keras.layers import Dense 7 | 8 | class CriticNetwork(keras.Model): 9 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256, 10 | name='critic', chkpt_dir='tmp/sac'): 11 | super(CriticNetwork, self).__init__() 12 | self.fc1_dims = fc1_dims 13 | self.fc2_dims = fc2_dims 14 | self.n_actions = n_actions 15 | self.model_name = name 16 | self.checkpoint_dir = chkpt_dir 17 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 18 | 19 | self.fc1 = Dense(self.fc1_dims, activation='relu') 20 | self.fc2 = Dense(self.fc2_dims, activation='relu') 21 | self.q = Dense(1, activation=None) 22 | 23 | def call(self, state, action): 24 | action_value = self.fc1(tf.concat([state, action], axis=1)) 25 | action_value = self.fc2(action_value) 26 | 27 | q = self.q(action_value) 28 | 29 | return q 30 | 31 | class ValueNetwork(keras.Model): 32 | def __init__(self, fc1_dims=256, fc2_dims=256, 33 | 
name='value', chkpt_dir='tmp/sac'): 34 | super(ValueNetwork, self).__init__() 35 | self.fc1_dims = fc1_dims 36 | self.fc2_dims = fc2_dims 37 | self.model_name = name 38 | self.checkpoint_dir = chkpt_dir 39 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 40 | 41 | self.fc1 = Dense(self.fc1_dims, activation='relu') 42 | self.fc2 = Dense(fc2_dims, activation='relu') 43 | self.v = Dense(1, activation=None) 44 | 45 | def call(self, state): 46 | state_value = self.fc1(state) 47 | state_value = self.fc2(state_value) 48 | 49 | v = self.v(state_value) 50 | 51 | return v 52 | 53 | class ActorNetwork(keras.Model): 54 | def __init__(self, max_action, fc1_dims=256, 55 | fc2_dims=256, n_actions=2, name='actor', chkpt_dir='tmp/sac'): 56 | super(ActorNetwork, self).__init__() 57 | self.fc1_dims = fc1_dims 58 | self.fc2_dims = fc2_dims 59 | self.n_actions = n_actions 60 | self.model_name = name 61 | self.checkpoint_dir = chkpt_dir 62 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 63 | self.max_action = max_action 64 | self.noise = 1e-6 65 | 66 | self.fc1 = Dense(self.fc1_dims, activation='relu') 67 | self.fc2 = Dense(self.fc2_dims, activation='relu') 68 | self.mu = Dense(self.n_actions, activation=None) 69 | self.sigma = Dense(self.n_actions, activation=None) 70 | 71 | def call(self, state): 72 | prob = self.fc1(state) 73 | prob = self.fc2(prob) 74 | 75 | mu = self.mu(prob) 76 | sigma = self.sigma(prob) 77 | # might want to come back and change this, perhaps tf plays more nicely with 78 | # a sigma of ~0 79 | sigma = tf.clip_by_value(sigma, self.noise, 1) 80 | 81 | return mu, sigma 82 | 83 | def sample_normal(self, state, reparameterize=True): 84 | mu, sigma = self.call(state) 85 | probabilities = tfp.distributions.Normal(mu, sigma) 86 | 87 | if reparameterize: 88 | actions = probabilities.sample() # + something else if you want to implement 89 | else: 90 | actions = probabilities.sample() 91 | 92 | action = tf.math.tanh(actions)*self.max_action 93 | log_probs = probabilities.log_prob(actions) 94 | log_probs -= tf.math.log(1-tf.math.pow(action,2)+self.noise) 95 | log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True) 96 | 97 | return action, log_probs 98 | 99 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/plots/inverted_pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/SAC/tf2/plots/inverted_pendulum.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def 
plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/TD3/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from td3_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLanderContinuous-v2') 8 | agent = Agent(alpha=0.001, beta=0.001, 9 | input_dims=env.observation_space.shape, tau=0.005, 10 | env=env, batch_size=100, layer1_size=400, layer2_size=300, 11 | n_actions=env.action_space.shape[0]) 12 | n_games = 1000 13 | filename = 'plots/' + 'LunarLanderContinuous_' + str(n_games) + '_games.png' 14 | 15 | best_score = env.reward_range[0] 16 | score_history = [] 17 | 18 | agent.load_models() 19 | 20 | for i in range(n_games): 21 | observation = env.reset() 22 | done = False 23 | score = 0 24 | while not done: 25 | action = agent.choose_action(observation) 26 | observation_, reward, done, info = env.step(action) 27 | agent.remember(observation, action, reward, observation_, done) 28 | agent.learn() 29 | score += reward 30 | observation = observation_ 31 | score_history.append(score) 32 | avg_score = np.mean(score_history[-100:]) 33 | 34 | if avg_score > best_score: 35 | best_score = avg_score 36 | agent.save_models() 37 | 38 | print('episode ', i, 'score %.1f' % score, 39 | 'average score %.1f' % avg_score) 40 | 41 | x = [i+1 for i in range(n_games)] 42 | plot_learning_curve(x, score_history, filename) 43 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/TD3/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from td3_tf2 import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | #env = gym.make('LunarLanderContinuous-v2') 8 | #env = gym.make('Pendulum-v0') 9 | env = gym.make('BipedalWalker-v2') 10 | agent = Agent(alpha=0.001, beta=0.001, 11 | input_dims=env.observation_space.shape, tau=0.005, 12 | env=env, batch_size=100, layer1_size=400, layer2_size=300, 13 | n_actions=env.action_space.shape[0]) 14 | n_games = 1000 15 | filename = 'plots/' + 'walker_' + str(n_games) + '_games.png' 16 | 17 | best_score = env.reward_range[0] 18 | score_history = [] 19 | 20 | #agent.load_models() 21 | 22 | for i in range(n_games): 23 | observation = env.reset() 24 | done = False 25 | score = 0 26 | while not done: 27 | action = agent.choose_action(observation) 28 | observation_, reward, done, info = env.step(action) 29 | agent.remember(observation, action, reward, observation_, done) 30 | agent.learn() 31 | score += reward 32 | observation = observation_ 33 | score_history.append(score) 34 | avg_score = np.mean(score_history[-100:]) 35 | 36 | if avg_score > best_score: 37 | best_score = avg_score 38 | agent.save_models() 39 | 40 | print('episode ', i, 'score %.1f' % score, 41 | 'average score %.1f' % avg_score) 42 | 43 | x = [i+1 for i in range(n_games)] 44 | plot_learning_curve(x, score_history, filename) 45 | -------------------------------------------------------------------------------- 
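Both TD3 driver scripts above (ReinforcementLearning/PolicyGradient/TD3/main.py and TD3/tf2/main.py) share the same loop: act, store the transition, call agent.learn() on every step, and checkpoint whenever the trailing 100-game average improves. The same Agent interface can also be reused to watch an already-trained policy. The sketch below is a hedged example, not a file from the repository: it assumes checkpoints were previously written by agent.save_models(), and choose_action may still add exploration noise, since that detail lives in td3_torch.py, which is not shown here.

import gym
import numpy as np
from td3_torch import Agent   # the same Agent class used by TD3/main.py above

# Evaluation-only loop: load the saved checkpoints, never call remember()/learn().
env = gym.make('LunarLanderContinuous-v2')
agent = Agent(alpha=0.001, beta=0.001,
              input_dims=env.observation_space.shape, tau=0.005,
              env=env, batch_size=100, layer1_size=400, layer2_size=300,
              n_actions=env.action_space.shape[0])
agent.load_models()

scores = []
for i in range(10):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        action = agent.choose_action(observation)
        observation, reward, done, info = env.step(action)
        score += reward
    scores.append(score)
    print('evaluation episode', i, 'score %.1f' % score)
print('mean evaluation score %.1f' % np.mean(scores))

--------------------------------------------------------------------------------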
/ReinforcementLearning/PolicyGradient/TD3/tf2/plots/walker_1500_games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/TD3/tf2/plots/walker_1500_games.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/TD3/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/TD3/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/actor_critic_keras.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.layers import Dense, Activation, Input 3 | from keras.models import Model, load_model 4 | from keras.optimizers import Adam 5 | import numpy as np 6 | 7 | class Agent(object): 8 | def __init__(self, alpha, beta, gamma=0.99, n_actions=4, 9 | layer1_size=1024, layer2_size=512, input_dims=8): 10 | self.gamma = gamma 11 | self.alpha = alpha 12 | self.beta = beta 13 | self.input_dims = input_dims 14 | self.fc1_dims = layer1_size 15 | self.fc2_dims = layer2_size 16 | self.n_actions = n_actions 17 | 18 | self.actor, self.critic, self.policy = self.build_actor_critic_network() 19 | self.action_space = [i for i in range(n_actions)] 20 | 21 | def build_actor_critic_network(self): 22 | input = Input(shape=(self.input_dims,)) 23 | delta = Input(shape=[1]) 24 | dense1 = Dense(self.fc1_dims, activation='relu')(input) 25 | dense2 = Dense(self.fc2_dims, activation='relu')(dense1) 26 | probs = Dense(self.n_actions, activation='softmax')(dense2) 27 | values = Dense(1, activation='linear')(dense2) 28 | 29 | def custom_loss(y_true, y_pred): 30 | out = K.clip(y_pred, 1e-8, 1-1e-8) 31 | log_lik = y_true*K.log(out) 32 | 33 | return K.sum(-log_lik*delta) 34 | 35 | actor = Model(input=[input, delta], output=[probs]) 36 | 37 | actor.compile(optimizer=Adam(lr=self.alpha), loss=custom_loss) 38 | 39 | critic = Model(input=[input], output=[values]) 40 | 41 | critic.compile(optimizer=Adam(lr=self.beta), loss='mean_squared_error') 42 | 43 | policy = Model(input=[input], output=[probs]) 44 | 45 | return actor, critic, policy 46 | 47 | def choose_action(self, observation): 48 | state = observation[np.newaxis, :] 49 | probabilities = self.policy.predict(state)[0] 50 | action = np.random.choice(self.action_space, p=probabilities) 51 | 52 | return action 53 | 
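    # learn() below performs one actor-critic update from a single transition:
    # the critic predicts V(s) and V(s'), the TD target is
    # reward + gamma * V(s') * (1 - done), and delta = target - V(s) is fed to
    # the custom loss above so that -log pi(a|s) is weighted by the TD error.
    # The chosen action is one-hot encoded because the actor outputs a softmax
    # over the discrete action space.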
54 | def learn(self, state, action, reward, state_, done): 55 | state = state[np.newaxis,:] 56 | state_ = state_[np.newaxis,:] 57 | critic_value_ = self.critic.predict(state_) 58 | critic_value = self.critic.predict(state) 59 | 60 | target = reward + self.gamma*critic_value_*(1-int(done)) 61 | delta = target - critic_value 62 | 63 | actions = np.zeros([1, self.n_actions]) 64 | actions[np.arange(1), action] = 1 65 | 66 | self.actor.fit([state, delta], actions, verbose=0) 67 | 68 | self.critic.fit(state, target, verbose=0) 69 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/actor_critic_replay_torch.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | 7 | class ReplayBuffer(): 8 | def __init__(self, max_size, input_shape): 9 | self.mem_size = max_size 10 | self.mem_cntr = 0 11 | self.state_memory = np.zeros((self.mem_size, *input_shape), 12 | dtype=np.float32) 13 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 14 | dtype=np.float32) 15 | self.log_probs = np.zeros(self.mem_size, dtype=np.float32) 16 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 17 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.uint8) 18 | 19 | def store_transition(self, state, log_prob, reward, state_, done): 20 | index = self.mem_cntr % self.mem_size 21 | self.state_memory[index] = state 22 | self.new_state_memory[index] = state_ 23 | self.log_probs[index] = log_prob 24 | self.reward_memory[index] = reward 25 | self.terminal_memory[index] = done 26 | self.mem_cntr += 1 27 | 28 | def sample_buffer(self, batch_size): 29 | max_mem = min(self.mem_cntr, self.mem_size) 30 | batch = np.random.choice(max_mem, batch_size, replace=False) 31 | 32 | states = self.state_memory[batch] 33 | probs = self.log_probs[batch] 34 | rewards = self.reward_memory[batch] 35 | states_ = self.new_state_memory[batch] 36 | terminal = self.terminal_memory[batch] 37 | 38 | return states, probs, rewards, states_, terminal 39 | 40 | class ActorCriticNetwork(nn.Module): 41 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims, 42 | n_actions): 43 | super(ActorCriticNetwork, self).__init__() 44 | self.input_dims = input_dims 45 | self.fc1_dims = fc1_dims 46 | self.fc2_dims = fc2_dims 47 | self.n_actions = n_actions 48 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 49 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 50 | self.pi = nn.Linear(self.fc2_dims, n_actions) 51 | self.v = nn.Linear(self.fc2_dims, 1) 52 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 53 | 54 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cuda:1') 55 | self.to(self.device) 56 | 57 | def forward(self, state): 58 | x = F.relu(self.fc1(state)) 59 | x = F.relu(self.fc2(x)) 60 | pi = self.pi(x) 61 | v = self.v(x) 62 | return (pi, v) 63 | 64 | class Agent(): 65 | def __init__(self, lr, input_dims, n_actions, gamma=0.99, 66 | l1_size=256, l2_size=256, batch_size=32, 67 | mem_size=1000000): 68 | self.gamma = gamma 69 | self.batch_size = batch_size 70 | self.memory = ReplayBuffer(mem_size, input_dims) 71 | self.actor_critic = ActorCriticNetwork(lr, input_dims, l1_size, 72 | l2_size, n_actions=n_actions) 73 | self.log_probs = [] 74 | 75 | def store_transition(self, state, prob, reward, state_, done): 76 | self.memory.store_transition(state, prob, reward, 
state_, done) 77 | 78 | def choose_action(self, observation): 79 | state = T.tensor([observation]).to(self.actor_critic.device) 80 | probabilities, _ = self.actor_critic.forward(state) 81 | probabilities = F.softmax(probabilities) 82 | action_probs = T.distributions.Categorical(probabilities) 83 | action = action_probs.sample() 84 | log_probs = action_probs.log_prob(action) 85 | 86 | return action.item(), log_probs 87 | 88 | def learn(self): 89 | if self.memory.mem_cntr < self.batch_size: 90 | return 91 | self.actor_critic.optimizer.zero_grad() 92 | 93 | state, prob, reward, new_state, done = \ 94 | self.memory.sample_buffer(self.batch_size) 95 | 96 | states = T.tensor(state).to(self.actor_critic.device) 97 | probs = T.tensor(prob).to(self.actor_critic.device) 98 | rewards = T.tensor(reward).to(self.actor_critic.device) 99 | dones = T.tensor(done).to(self.actor_critic.device) 100 | states_ = T.tensor(new_state).to(self.actor_critic.device) 101 | 102 | _, critic_value_ = self.actor_critic.forward(states_) 103 | _, critic_value = self.actor_critic.forward(states) 104 | 105 | critic_value_[dones] = 0.0 106 | 107 | delta = rewards + self.gamma*critic_value_ 108 | 109 | actor_loss = -T.mean(probs*(delta-critic_value)) 110 | critic_loss = F.mse_loss(delta, critic_value) 111 | 112 | (actor_loss + critic_loss).backward() 113 | 114 | self.actor_critic.optimizer.step() 115 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/actor_critic/cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/continuous_mountain_car_actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from actor_critic_continuous import Agent 4 | import matplotlib.pyplot as plt 5 | from utils import plotLearning 6 | from gym import wrappers 7 | 8 | 9 | if __name__ == '__main__': 10 | agent = Agent(alpha=0.000005, beta=0.00001, input_dims=[2], gamma=0.99, 11 | layer1_size=256, layer2_size=256) 12 | 13 | env = gym.make('MountainCarContinuous-v0') 14 | score_history = [] 15 | num_episodes = 100 16 | for i in range(num_episodes): 17 | #env = wrappers.Monitor(env, "tmp/mountaincar-continuous-trained-1", 18 | # video_callable=lambda episode_id: True, force=True) 19 | done = False 20 | score = 0 21 | observation = env.reset() 22 | while not done: 23 | action = np.array(agent.choose_action(observation)).reshape((1,)) 24 | observation_, reward, done, info = env.step(action) 25 | agent.learn(observation, reward, observation_, done) 26 | observation = observation_ 27 | score += reward 28 | score_history.append(score) 29 | print('episode: ', i,'score: %.2f' % score) 30 | filename = 'mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png' 31 | plotLearning(score_history, filename=filename, window=20) 32 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/discrete_cartpole.py: -------------------------------------------------------------------------------- 1 | 
import numpy as np 2 | import gym 3 | from actor_critic_discrete import Agent 4 | import matplotlib.pyplot as plt 5 | from utils import plotLearning 6 | from gym import wrappers 7 | 8 | 9 | if __name__ == '__main__': 10 | agent = Agent(alpha=0.0001, beta=0.0005, input_dims=[4], gamma=0.99, 11 | n_actions=2, layer1_size=32, layer2_size=32) 12 | 13 | env = gym.make('CartPole-v1') 14 | score_history = [] 15 | score = 0 16 | num_episodes = 2500 17 | for i in range(num_episodes): 18 | print('episode: ', i,'score: %.3f' % score) 19 | 20 | 21 | #env = wrappers.Monitor(env, "tmp/cartpole-untrained", 22 | # video_callable=lambda episode_id: True, force=True) 23 | done = False 24 | score = 0 25 | observation = env.reset() 26 | while not done: 27 | action = agent.choose_action(observation) 28 | observation_, reward, done, info = env.step(action) 29 | agent.learn(observation, reward, observation_, done) 30 | observation = observation_ 31 | score += reward 32 | score_history.append(score) 33 | 34 | filename = 'cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc-1500games.png' 35 | plotLearning(score_history, filename=filename, window=10) 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/main_keras_actor_critic_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym, os 2 | from actor_critic_keras import Agent 3 | from utils import plotLearning 4 | from gym import wrappers 5 | import numpy as np 6 | 7 | if __name__ == '__main__': 8 | agent = Agent(alpha=0.00001, beta=0.00005) 9 | 10 | env = gym.make('LunarLander-v2') 11 | score_history = [] 12 | num_episodes = 2000 13 | 14 | for i in range(num_episodes): 15 | done = False 16 | score = 0 17 | observation = env.reset() 18 | while not done: 19 | action = agent.choose_action(observation) 20 | observation_, reward, done, info = env.step(action) 21 | agent.learn(observation, action, reward, observation_, done) 22 | observation = observation_ 23 | score += reward 24 | 25 | score_history.append(score) 26 | avg_score = np.mean(score_history[-100:]) 27 | print('episode: ', i,'score: %.2f' % score, 28 | 'avg score %.2f' % avg_score) 29 | 30 | filename = 'LunarLander.png' 31 | plotLearning(score_history, filename=filename, window=100) 32 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/main_torch_actor_critic_replay_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from actor_critic_replay_torch import Agent 4 | from utils import plotLearning 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | num_games = 1500 9 | agent = Agent(gamma=0.99, lr=1e-5, input_dims=[8], n_actions=4, 10 | l1_size=256, l2_size=256) 11 | 12 | filename = 'LunarLander-ActorCriticNaiveReplay-256-256-Adam-lr00001.png' 13 | scores = [] 14 | 15 | for i in range(num_games): 16 | done = False 17 | observation = env.reset() 18 | score = 0 19 | 20 | while not done: 21 | action, prob = agent.choose_action(observation) 22 | observation_, reward, done, info = env.step(action) 23 | score += reward 24 | agent.store_transition(observation, prob, 25 | reward, observation_, int(done)) 26 | agent.learn() 27 | observation = observation_ 28 | 29 | scores.append(score) 30 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 31 | print('episode: ', i,'score %.1f ' % score, 
32 | ' average score %.1f' % avg_score) 33 | 34 | x = [i+1 for i in range(num_games)] 35 | plotLearning(scores, filename, x) 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/actor_critic/mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/actor_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.optimizers import Adam 3 | import tensorflow_probability as tfp 4 | from networks import ActorCriticNetwork 5 | 6 | class Agent: 7 | def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2): 8 | self.gamma = gamma 9 | self.n_actions = n_actions 10 | self.action = None 11 | self.action_space = [i for i in range(self.n_actions)] 12 | 13 | self.actor_critic = ActorCriticNetwork(n_actions=n_actions) 14 | 15 | self.actor_critic.compile(optimizer=Adam(learning_rate=alpha)) 16 | 17 | 18 | def choose_action(self, observation): 19 | state = tf.convert_to_tensor([observation]) 20 | _, probs = self.actor_critic(state) 21 | 22 | action_probabilities = tfp.distributions.Categorical(probs=probs) 23 | action = action_probabilities.sample() 24 | log_prob = action_probabilities.log_prob(action) 25 | self.action = action 26 | 27 | return action.numpy()[0] 28 | 29 | def save_models(self): 30 | print('... saving models ...') 31 | self.actor_critic.save_weights(self.actor_critic.checkpoint_file) 32 | 33 | def load_models(self): 34 | print('... 
loading models ...') 35 | self.actor_critic.load_weights(self.actor_critic.checkpoint_file) 36 | 37 | def learn(self, state, reward, state_, done): 38 | state = tf.convert_to_tensor([state], dtype=tf.float32) 39 | state_ = tf.convert_to_tensor([state_], dtype=tf.float32) 40 | reward = tf.convert_to_tensor(reward, dtype=tf.float32) # not fed to NN 41 | with tf.GradientTape(persistent=True) as tape: 42 | state_value, probs = self.actor_critic(state) 43 | state_value_, _ = self.actor_critic(state_) 44 | state_value = tf.squeeze(state_value) 45 | state_value_ = tf.squeeze(state_value_) 46 | 47 | action_probs = tfp.distributions.Categorical(probs=probs) 48 | log_prob = action_probs.log_prob(self.action) 49 | 50 | delta = reward + self.gamma*state_value_*(1-int(done)) - state_value 51 | actor_loss = -log_prob*delta 52 | critic_loss = delta**2 53 | total_loss = actor_loss + critic_loss 54 | 55 | gradient = tape.gradient(total_loss, self.actor_critic.trainable_variables) 56 | self.actor_critic.optimizer.apply_gradients(zip( 57 | gradient, self.actor_critic.trainable_variables)) 58 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/cartpole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/cartpole.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from actor_critic import Agent 4 | from utils import plot_learning_curve 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | #env = gym.make('LunarLander-v2') 9 | env = gym.make('CartPole-v0') 10 | agent = Agent(alpha=1e-5, n_actions=env.action_space.n) 11 | n_games = 1800 12 | # uncomment this line and do a mkdir tmp && mkdir video if you want to 13 | # record video of the agent playing the game. 
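    # Note: gym.wrappers.Monitor has been removed from newer Gym releases;
    # gym.wrappers.RecordVideo is the usual replacement there. Whether that
    # applies depends on the installed Gym version, so this is only a pointer.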
14 | #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True) 15 | filename = 'cartpole_1e-5_1024x512_1800games.png' 16 | 17 | figure_file = 'plots/' + filename 18 | 19 | best_score = env.reward_range[0] 20 | score_history = [] 21 | load_checkpoint = False 22 | 23 | if load_checkpoint: 24 | agent.load_models() 25 | 26 | for i in range(n_games): 27 | observation = env.reset() 28 | done = False 29 | score = 0 30 | while not done: 31 | action = agent.choose_action(observation) 32 | observation_, reward, done, info = env.step(action) 33 | score += reward 34 | if not load_checkpoint: 35 | agent.learn(observation, reward, observation_, done) 36 | observation = observation_ 37 | score_history.append(score) 38 | avg_score = np.mean(score_history[-100:]) 39 | 40 | if avg_score > best_score: 41 | best_score = avg_score 42 | if not load_checkpoint: 43 | agent.save_models() 44 | 45 | print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score) 46 | 47 | if not load_checkpoint: 48 | x = [i+1 for i in range(n_games)] 49 | plot_learning_curve(x, score_history, figure_file) 50 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.layers import Dense 4 | 5 | class ActorCriticNetwork(keras.Model): 6 | def __init__(self, n_actions, fc1_dims=1024, fc2_dims=512, 7 | name='actor_critic', chkpt_dir='tmp/actor_critic'): 8 | super(ActorCriticNetwork, self).__init__() 9 | self.fc1_dims = fc1_dims 10 | self.fc2_dims = fc2_dims 11 | self.n_actions = n_actions 12 | self.model_name = name 13 | self.checkpoint_dir = chkpt_dir 14 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ac') 15 | 16 | self.fc1 = Dense(self.fc1_dims, activation='relu') 17 | self.fc2 = Dense(self.fc2_dims, activation='relu') 18 | self.v = Dense(1, activation=None) 19 | self.pi = Dense(n_actions, activation='softmax') 20 | 21 | def call(self, state): 22 | value = self.fc1(state) 23 | value = self.fc2(value) 24 | 25 | v = self.v(value) 26 | pi = self.pi(value) 27 | 28 | return v, pi 29 | 30 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/torch_discrete_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from torch_actor_critic_discrete import NewAgent 3 | from utils import plotLearning 4 | from gym import wrappers 5 | 6 | 7 | if __name__ == '__main__': 8 | agent = NewAgent(alpha=0.00001, input_dims=[8], gamma=0.99, 9 | n_actions=4, layer1_size=2048, layer2_size=512) 10 | 11 | env = gym.make('LunarLander-v2') 12 | score_history = [] 13 | score = 0 14 | num_episodes = 2000 15 | for i in 
range(num_episodes): 16 | 17 | #env = wrappers.Monitor(env, "tmp/lunar-lander", 18 | # video_callable=lambda episode_id: True, force=True) 19 | done = False 20 | score = 0 21 | observation = env.reset() 22 | while not done: 23 | action = agent.choose_action(observation) 24 | observation_, reward, done, info = env.step(action) 25 | agent.learn(observation, reward, observation_, done) 26 | observation = observation_ 27 | score += reward 28 | 29 | score_history.append(score) 30 | print('episode: ', i,'score: %.2f' % score) 31 | 32 | filename = 'Lunar-Lander-actor-critic-new-agent-alpha00001-beta00005-2048x512fc-2000games.png' 33 | plotLearning(score_history, filename=filename, window=50) 34 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plotLearning(scores, filename, x=None, window=5): 5 | N = len(scores) 6 | running_avg = np.empty(N) 7 | for t in range(N): 8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)]) 9 | if x is None: 10 | x = [i for i in range(N)] 11 | plt.ylabel('Score') 12 | plt.xlabel('Game') 13 | plt.plot(x, running_avg) 14 | plt.savefig(filename) -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/main_keras_reinforce_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from gym import wrappers 5 | from reinforce_keras import Agent 6 | from utils import plotLearning 7 | 8 | if __name__ == '__main__': 9 | agent = Agent(ALPHA=0.0005, input_dims=8, GAMMA=0.99, 10 | n_actions=4, layer1_size=64, layer2_size=64) 11 | 12 | env = gym.make('LunarLander-v2') 13 | score_history = [] 14 | 15 | num_episodes = 2000 16 | 17 | for i in range(num_episodes): 18 | done = False 19 | score = 0 20 | observation = env.reset() 21 | while not done: 22 | action = agent.choose_action(observation) 23 | observation_, reward, done, info = env.step(action) 24 | agent.store_transition(observation, action, reward) 25 | observation = observation_ 26 | score += reward 27 | score_history.append(score) 28 | 29 | _ = agent.learn() 30 | print('episode: ', i,'score: %.1f' % score, 31 | 'average score %.1f' % np.mean(score_history[max(0, i-100):(i+1)])) 32 | 33 | filename = 'lunar-lander-keras-64x64-alpha0005-2000games.png' 34 | plotLearning(score_history, filename=filename, window=100) 35 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/main_tf_reinforce_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from reinforce_tf import PolicyGradientAgent 4 | import matplotlib.pyplot as plt 5 | from utils import plotLearning 6 | from gym import wrappers 7 | 8 | if __name__ == '__main__': 9 | agent = PolicyGradientAgent(ALPHA=0.0005, input_dims=8, GAMMA=0.99, 10 | n_actions=4, layer1_size=64, layer2_size=64, 11 | chkpt_dir='tmp/lunar-lander-ckpt') 12 | #agent.load_checkpoint() 13 | env = gym.make('LunarLander-v2') 14 | score_history = [] 15 | score = 0 16 | num_episodes = 2500 17 | #env = wrappers.Monitor(env, "tmp/lunar-lander", 18 | # video_callable=lambda episode_id: True, force=True) 19 | for i in 
range(num_episodes): 20 | print('episode: ', i,'score: ', score) 21 | done = False 22 | score = 0 23 | observation = env.reset() 24 | while not done: 25 | action = agent.choose_action(observation) 26 | observation_, reward, done, info = env.step(action) 27 | agent.store_transition(observation, action, reward) 28 | observation = observation_ 29 | score += reward 30 | score_history.append(score) 31 | agent.learn() 32 | #agent.save_checkpoint() 33 | #filename = 'lunar-lander-alpha0005-64x64fc-newG.png' 34 | #plotLearning(score_history, filename=filename, window=25) 35 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/main_tf_reinforce_space_invaders.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from reinforce_cnn_tf import PolicyGradientAgent 4 | from utils import plotLearning 5 | from gym import wrappers 6 | 7 | 8 | def preprocess(observation): 9 | return np.mean(observation[15:200, 30:125], axis=2) 10 | 11 | 12 | def stack_frames(stacked_frames, frame, buffer_size): 13 | if stacked_frames is None: 14 | stacked_frames = np.zeros((buffer_size, *frame.shape)) 15 | for idx, _ in enumerate(stacked_frames): 16 | stacked_frames[idx,:] = frame 17 | else: 18 | stacked_frames[0:buffer_size-1,:] = stacked_frames[1:,:] 19 | stacked_frames[buffer_size-1, :] = frame 20 | 21 | return stacked_frames 22 | 23 | if __name__ == '__main__': 24 | load_checkpoint = False 25 | agent = PolicyGradientAgent(ALPHA=0.001, GAMMA=0.9, n_actions=6, fc1=256, 26 | chkpt_dir='tmp/checkpoint-newG-0p001', gpu={'GPU':1}) 27 | filename = 'space-invaders-alpha001-newGcalc.png' 28 | print('will use ', filename, ' and ', agent.gpu) 29 | if load_checkpoint: 30 | agent.load_checkpoint() 31 | env = gym.make('SpaceInvaders-v0') 32 | score_history = [] 33 | score = 0 34 | num_episodes = 1000 35 | stack_size = 4 36 | #env = wrappers.Monitor(env, "tmp/space-invaders-newG-0p003", 37 | # video_callable=lambda episode_id: True, force=True) 38 | for i in range(num_episodes): 39 | done = False 40 | 41 | avg_score = np.mean(score_history[max(0, i-20):(i+1)]) 42 | if i % 20 == 0 and i > 0: 43 | print('episode: ', i,'score: ', score, ' average score %.3f' % avg_score) 44 | plotLearning(score_history, filename=filename, window=20) 45 | else: 46 | print('episode: ', i,'score: ', score) 47 | observation = env.reset() 48 | observation = preprocess(observation) 49 | stacked_frames = None 50 | stacked_frames = stack_frames(stacked_frames, observation, stack_size) 51 | score = 0 52 | while not done: 53 | action = agent.choose_action(stacked_frames) 54 | observation, reward, done, info = env.step(action) 55 | observation = preprocess(observation) 56 | stacked_frames = stack_frames(stacked_frames, observation, stack_size) 57 | agent.store_transition(observation, action, reward) 58 | 59 | score += reward 60 | score_history.append(score) 61 | 62 | if i % 10 == 0: 63 | agent.learn() 64 | agent.save_checkpoint() 65 | plotLearning(score_history, filename=filename, window=20) 66 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/main_torch_reinforce_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from reinforce_torch import PolicyGradientAgent 4 | import matplotlib.pyplot as plt 5 | from utils import plotLearning 6 | from gym import 
wrappers 7 | 8 | if __name__ == '__main__': 9 | agent = PolicyGradientAgent(ALPHA=0.001, input_dims=[8], GAMMA=0.99, 10 | n_actions=4, layer1_size=128, layer2_size=128) 11 | #agent.load_checkpoint() 12 | env = gym.make('LunarLander-v2') 13 | score_history = [] 14 | score = 0 15 | num_episodes = 2500 16 | #env = wrappers.Monitor(env, "tmp/lunar-lander", 17 | # video_callable=lambda episode_id: True, force=True) 18 | for i in range(num_episodes): 19 | print('episode: ', i,'score: ', score) 20 | done = False 21 | score = 0 22 | observation = env.reset() 23 | while not done: 24 | action = agent.choose_action(observation) 25 | observation_, reward, done, info = env.step(action) 26 | agent.store_rewards(reward) 27 | observation = observation_ 28 | score += reward 29 | score_history.append(score) 30 | agent.learn() 31 | #agent.save_checkpoint() 32 | filename = 'lunar-lander-alpha001-128x128fc-newG.png' 33 | plotLearning(score_history, filename=filename, window=25) 34 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/reinforce_keras.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Dense, Activation, Input 2 | from keras.models import Model, load_model 3 | from keras.optimizers import Adam 4 | import keras.backend as K 5 | import numpy as np 6 | 7 | class Agent(object): 8 | def __init__(self, ALPHA, GAMMA=0.99, n_actions=4, 9 | layer1_size=16, layer2_size=16, input_dims=128, 10 | fname='reinforce.h5'): 11 | self.gamma = GAMMA 12 | self.lr = ALPHA 13 | self.G = 0 14 | self.input_dims = input_dims 15 | self.fc1_dims = layer1_size 16 | self.fc2_dims = layer2_size 17 | self.n_actions = n_actions 18 | self.state_memory = [] 19 | self.action_memory = [] 20 | self.reward_memory = [] 21 | self.policy, self.predict = self.build_policy_network() 22 | self.action_space = [i for i in range(n_actions)] 23 | 24 | self.model_file = fname 25 | 26 | def build_policy_network(self): 27 | input = Input(shape=(self.input_dims,)) 28 | advantages = Input(shape=[1]) 29 | dense1 = Dense(self.fc1_dims, activation='relu')(input) 30 | dense2 = Dense(self.fc2_dims, activation='relu')(dense1) 31 | probs = Dense(self.n_actions, activation='softmax')(dense2) 32 | 33 | def custom_loss(y_true, y_pred): 34 | out = K.clip(y_pred, 1e-8, 1-1e-8) 35 | log_lik = y_true*K.log(out) 36 | 37 | return K.sum(-log_lik*advantages) 38 | 39 | policy = Model(input=[input, advantages], output=[probs]) 40 | 41 | policy.compile(optimizer=Adam(lr=self.lr), loss=custom_loss) 42 | 43 | predict = Model(input=[input], output=[probs]) 44 | 45 | return policy, predict 46 | 47 | def choose_action(self, observation): 48 | state = observation[np.newaxis, :] 49 | probabilities = self.predict.predict(state)[0] 50 | action = np.random.choice(self.action_space, p=probabilities) 51 | 52 | return action 53 | 54 | def store_transition(self, observation, action, reward): 55 | self.state_memory.append(observation) 56 | self.action_memory.append(action) 57 | self.reward_memory.append(reward) 58 | 59 | def learn(self): 60 | state_memory = np.array(self.state_memory) 61 | action_memory = np.array(self.action_memory) 62 | reward_memory = np.array(self.reward_memory) 63 | 64 | actions = np.zeros([len(action_memory), self.n_actions]) 65 | actions[np.arange(len(action_memory)), action_memory] = 1 66 | 67 | G = np.zeros_like(reward_memory) 68 | for t in range(len(reward_memory)): 69 | G_sum = 0 70 | discount = 1 71 | for k in range(t, 
len(reward_memory)): 72 | G_sum += reward_memory[k] * discount 73 | discount *= self.gamma 74 | G[t] = G_sum 75 | mean = np.mean(G) 76 | std = np.std(G) if np.std(G) > 0 else 1 77 | self.G = (G - mean) / std 78 | 79 | cost = self.policy.train_on_batch([state_memory, self.G], actions) 80 | 81 | self.state_memory = [] 82 | self.action_memory = [] 83 | self.reward_memory = [] 84 | 85 | return cost 86 | 87 | def save_model(self): 88 | self.policy.save(self.model_file) 89 | 90 | def load_model(self): 91 | self.policy = load_model(self.model_file) 92 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/reinforce_tf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | class PolicyGradientAgent(): 6 | def __init__(self, ALPHA, GAMMA=0.95, n_actions=4, 7 | layer1_size=16, layer2_size=16, input_dims=128, 8 | chkpt_dir='tmp/checkpoints'): 9 | self.lr = ALPHA 10 | self.gamma = GAMMA 11 | self.n_actions = n_actions 12 | self.action_space = [i for i in range(n_actions)] 13 | self.layer1_size = layer1_size 14 | self.layer2_size = layer2_size 15 | self.input_dims = input_dims 16 | self.state_memory = [] 17 | self.action_memory = [] 18 | self.reward_memory = [] 19 | self.sess = tf.Session() 20 | self.build_net() 21 | self.sess.run(tf.global_variables_initializer()) 22 | self.saver = tf.train.Saver() 23 | self.checkpoint_file = os.path.join(chkpt_dir,'policy_network.ckpt') 24 | 25 | def build_net(self): 26 | with tf.variable_scope('parameters'): 27 | self.input = tf.placeholder(tf.float32, 28 | shape=[None, self.input_dims], name='input') 29 | self.label = tf.placeholder(tf.int32, 30 | shape=[None, ], name='label') 31 | self.G = tf.placeholder(tf.float32, shape=[None,], name='G') 32 | 33 | with tf.variable_scope('layer1'): 34 | l1 = tf.layers.dense(inputs=self.input, units=self.layer1_size, 35 | activation=tf.nn.relu, 36 | kernel_initializer=tf.contrib.layers.xavier_initializer()) 37 | 38 | with tf.variable_scope('layer2'): 39 | l2 = tf.layers.dense(inputs=l1, units=self.layer2_size, 40 | activation=tf.nn.relu, 41 | kernel_initializer=tf.contrib.layers.xavier_initializer()) 42 | 43 | with tf.variable_scope('layer3'): 44 | l3 = tf.layers.dense(inputs=l2, units=self.n_actions, 45 | activation=None, 46 | kernel_initializer=tf.contrib.layers.xavier_initializer()) 47 | self.actions = tf.nn.softmax(l3, name='actions') 48 | 49 | with tf.variable_scope('loss'): 50 | negative_log_probability = tf.nn.sparse_softmax_cross_entropy_with_logits( 51 | logits=l3, labels=self.label) 52 | 53 | loss = negative_log_probability * self.G 54 | 55 | with tf.variable_scope('train'): 56 | self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) 57 | 58 | def choose_action(self, observation): 59 | observation = observation[np.newaxis, :] 60 | probabilities = self.sess.run(self.actions, feed_dict={self.input: observation})[0] 61 | action = np.random.choice(self.action_space, p = probabilities ) 62 | 63 | return action 64 | 65 | def store_transition(self, observation, action, reward): 66 | self.state_memory.append(observation) 67 | self.action_memory.append(action) 68 | self.reward_memory.append(reward) 69 | 70 | def learn(self): 71 | state_memory = np.array(self.state_memory) 72 | action_memory = np.array(self.action_memory) 73 | reward_memory = np.array(self.reward_memory) 74 | 75 | G = np.zeros_like(reward_memory) 76 | for t in 
range(len(reward_memory)):
77 |             G_sum = 0
78 |             discount = 1
79 |             for k in range(t, len(reward_memory)):
80 |                 G_sum += reward_memory[k] * discount
81 |                 discount *= self.gamma
82 |             G[t] = G_sum
83 |         mean = np.mean(G)
84 |         std = np.std(G) if np.std(G) > 0 else 1
85 |         G = (G - mean) / std
86 | 
87 |         _ = self.sess.run(self.train_op,
88 |                           feed_dict={self.input: state_memory,
89 |                                      self.label: action_memory,
90 |                                      self.G: G})
91 |         self.state_memory = []
92 |         self.action_memory = []
93 |         self.reward_memory = []
94 | 
95 |     def load_checkpoint(self):
96 |         print("...Loading checkpoint...")
97 |         self.saver.restore(self.sess, self.checkpoint_file)
98 | 
99 |     def save_checkpoint(self):
100 |         #print("...Saving checkpoint...")
101 |         self.saver.save(self.sess, self.checkpoint_file)
102 | 
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/reinforce_torch.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 | 
7 | class PolicyNetwork(nn.Module):
8 |     def __init__(self, ALPHA, input_dims, fc1_dims, fc2_dims,
9 |                  n_actions):
10 |         super(PolicyNetwork, self).__init__()
11 |         self.input_dims = input_dims
12 |         self.fc1_dims = fc1_dims
13 |         self.fc2_dims = fc2_dims
14 |         self.n_actions = n_actions
15 |         self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
16 |         self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
17 |         self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
18 |         self.optimizer = optim.Adam(self.parameters(), lr=ALPHA)
19 | 
20 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
21 |         self.to(self.device)
22 | 
23 |     def forward(self, observation):
24 |         state = T.Tensor(observation).to(self.device)
25 |         x = F.relu(self.fc1(state))
26 |         x = F.relu(self.fc2(x))
27 |         x = self.fc3(x)
28 |         return x
29 | 
30 | class PolicyGradientAgent(object):
31 |     def __init__(self, ALPHA, input_dims, GAMMA=0.99, n_actions=4,
32 |                  layer1_size=256, layer2_size=256):
33 |         self.gamma = GAMMA
34 |         self.reward_memory = []
35 |         self.action_memory = []
36 |         self.policy = PolicyNetwork(ALPHA, input_dims, layer1_size, layer2_size,
37 |                                     n_actions)
38 | 
39 |     def choose_action(self, observation):
40 |         probabilities = F.softmax(self.policy.forward(observation), dim=-1)
41 |         action_probs = T.distributions.Categorical(probabilities)
42 |         action = action_probs.sample()
43 |         log_probs = action_probs.log_prob(action)
44 |         self.action_memory.append(log_probs)
45 | 
46 |         return action.item()
47 | 
48 |     def store_rewards(self, reward):
49 |         self.reward_memory.append(reward)
50 | 
51 |     def learn(self):
52 |         self.policy.optimizer.zero_grad()
53 |         # Assumes only a single episode for reward_memory
54 |         G = np.zeros_like(self.reward_memory, dtype=np.float64)
55 |         for t in range(len(self.reward_memory)):
56 |             G_sum = 0
57 |             discount = 1
58 |             for k in range(t, len(self.reward_memory)):
59 |                 G_sum += self.reward_memory[k] * discount
60 |                 discount *= self.gamma
61 |             G[t] = G_sum
62 |         mean = np.mean(G)
63 |         std = np.std(G) if np.std(G) > 0 else 1
64 |         G = (G - mean) / std
65 | 
66 |         G = T.tensor(G, dtype=T.float).to(self.policy.device)
67 | 
68 |         loss = 0
69 |         for g, logprob in zip(G, self.action_memory):
70 |             loss += -g * logprob
71 | 
72 |         loss.backward()
73 |         self.policy.optimizer.step()
74 | 
75 |         self.action_memory = []
76 |         self.reward_memory = []
77 | 
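# Note on the return calculation in learn(): the nested loops compute the
# Monte Carlo return G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
# in O(T^2) time. A minimal O(T) sketch of the same quantity (an alternative
# rewrite, assuming the same reward_memory and gamma, not what the file does)
# accumulates it backwards:
#
#     G = np.zeros(len(self.reward_memory), dtype=np.float64)
#     running = 0.0
#     for t in reversed(range(len(self.reward_memory))):
#         running = self.reward_memory[t] + self.gamma * running
#         G[t] = running
#
# The mean/std normalization and the log-prob weighted loss stay unchanged.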
-------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/space-invaders-alpha001-gamma0p9-decay0p99-newGcalc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/reinforce/space-invaders-alpha001-gamma0p9-decay0p99-newGcalc.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/lunar-lander-tf2-256x256-alpha0005-2000games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/lunar-lander-tf2-256x256-alpha0005-2000games.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/main.py: -------------------------------------------------------------------------------- 1 | # if you have more than 1 gpu, use device '0' or '1' to assign to a gpu 2 | #import os 3 | #os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 4 | #os.environ['CUDA_VISIBLE_DEVICES'] = '0' 5 | import gym 6 | import numpy as np 7 | from reinforce_tf2 import Agent 8 | from utils import plotLearning 9 | 10 | if __name__ == '__main__': 11 | agent = Agent(alpha=0.0005, gamma=0.99,n_actions=4) 12 | 13 | env = gym.make('LunarLander-v2') 14 | score_history = [] 15 | 16 | num_episodes = 2000 17 | 18 | for i in range(num_episodes): 19 | done = False 20 | score = 0 21 | observation = env.reset() 22 | while not done: 23 | action = agent.choose_action(observation) 24 | observation_, reward, done, info = env.step(action) 25 | agent.store_transition(observation, action, reward) 26 | observation = observation_ 27 | score += reward 28 | score_history.append(score) 29 | 30 | agent.learn() 31 | avg_score = np.mean(score_history[-100:]) 32 | print('episode: ', i,'score: %.1f' % score, 33 | 'average score %.1f' % avg_score) 34 | 35 | filename = 'lunar-lander.png' 36 | plotLearning(score_history, filename=filename, window=100) 37 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Dense 3 | 4 | class PolicyGradientNetwork(keras.Model): 5 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256): 6 | super(PolicyGradientNetwork, self).__init__() 7 | self.fc1_dims = fc1_dims 8 | self.fc2_dims = fc2_dims 9 | self.n_actions = n_actions 10 | 11 | self.fc1 = Dense(self.fc1_dims, activation='relu') 12 | self.fc2 = Dense(self.fc2_dims, activation='relu') 13 | self.pi = Dense(n_actions, activation='softmax') 14 | 15 | def call(self, state): 16 | value = self.fc1(state) 17 | value = self.fc2(value) 18 | 19 | pi = self.pi(value) 20 | 21 | return pi 22 | 23 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/reinforce_tf2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from networks import PolicyGradientNetwork 3 | import 
tensorflow_probability as tfp 4 | from tensorflow.keras.optimizers import Adam 5 | import numpy as np 6 | 7 | class Agent: 8 | def __init__(self, alpha=0.003, gamma=0.99, n_actions=4, 9 | layer1_size=256, layer2_size=256): 10 | 11 | self.gamma = gamma 12 | self.lr = alpha 13 | self.n_actions = n_actions 14 | self.state_memory = [] 15 | self.action_memory = [] 16 | self.reward_memory = [] 17 | self.policy = PolicyGradientNetwork(n_actions=n_actions) 18 | self.policy.compile(optimizer=Adam(learning_rate=self.lr)) 19 | 20 | def choose_action(self, observation): 21 | state = tf.convert_to_tensor([observation], dtype=tf.float32) 22 | probs = self.policy(state) 23 | action_probs = tfp.distributions.Categorical(probs=probs) 24 | action = action_probs.sample() 25 | 26 | return action.numpy()[0] 27 | 28 | def store_transition(self, observation, action, reward): 29 | self.state_memory.append(observation) 30 | self.action_memory.append(action) 31 | self.reward_memory.append(reward) 32 | 33 | def learn(self): 34 | actions = tf.convert_to_tensor(self.action_memory, dtype=tf.float32) 35 | rewards = np.array(self.reward_memory) 36 | 37 | G = np.zeros_like(rewards) 38 | for t in range(len(rewards)): 39 | G_sum = 0 40 | discount = 1 41 | for k in range(t, len(rewards)): 42 | G_sum += rewards[k] * discount 43 | discount *= self.gamma 44 | G[t] = G_sum 45 | 46 | with tf.GradientTape() as tape: 47 | loss = 0 48 | for idx, (g, state) in enumerate(zip(G, self.state_memory)): 49 | state = tf.convert_to_tensor([state], dtype=tf.float32) 50 | probs = self.policy(state) 51 | action_probs = tfp.distributions.Categorical(probs=probs) 52 | log_prob = action_probs.log_prob(actions[idx]) 53 | loss += -g * tf.squeeze(log_prob) 54 | 55 | gradient = tape.gradient(loss, self.policy.trainable_variables) 56 | self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables)) 57 | 58 | self.state_memory = [] 59 | self.action_memory = [] 60 | self.reward_memory = [] 61 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plotLearning(scores, filename, x=None, window=5): 5 | N = len(scores) 6 | running_avg = np.empty(N) 7 | for t in range(N): 8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)]) 9 | if x is None: 10 | x = [i for i in range(N)] 11 | plt.ylabel('Score') 12 | plt.xlabel('Game') 13 | plt.plot(x, running_avg) 14 | plt.savefig(filename) -------------------------------------------------------------------------------- /basic_encryption/caesar.py: -------------------------------------------------------------------------------- 1 | from common import alphabet 2 | 3 | 4 | def translate(message, shift, encrypt=True): 5 | new_message = '' 6 | n_chars = len(alphabet) 7 | 8 | for character in message: 9 | char_idx = alphabet.index(character) 10 | if encrypt: 11 | new_char_idx = (char_idx + shift) % n_chars 12 | elif not encrypt: 13 | new_char_idx = (char_idx - shift) % n_chars 14 | new_message += alphabet[new_char_idx] 15 | return new_message 16 | 17 | 18 | cipher_shift = 7 19 | 20 | print('AB->', translate('AB', cipher_shift)) 21 | print('ab->', translate('ab', cipher_shift)) 22 | print('Ab->', translate('Ab', cipher_shift)) 23 | print('aB->', translate('aB', cipher_shift)) 24 | 25 | plaintext = 'This is an encrypted message.' 
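# A quick worked example of the shift arithmetic (a sketch; 'alphabet' comes
# from common.py and holds 26 lowercase + 26 uppercase + 5 punctuation
# characters, 57 symbols in total): with cipher_shift = 7, 'z' sits at index
# 25, so it encrypts to alphabet[(25 + 7) % 57], which wraps into the
# uppercase range and yields 'G'; decryption reverses this with
# alphabet[(32 - 7) % 57].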
26 | ciphertext = translate(plaintext, cipher_shift, True) 27 | print(plaintext, '->', ciphertext) 28 | original_message = translate(ciphertext, cipher_shift, False) 29 | print(ciphertext, '->', original_message) 30 | -------------------------------------------------------------------------------- /basic_encryption/common.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | # https://stackoverflow.com/questions/7001144/range-over-character-in-python 5 | def character_generator(start_char, stop_char): 6 | for char in range(ord(start_char), ord(stop_char)+1): 7 | yield chr(char) 8 | 9 | 10 | def generate_one_time_pad(n_chars, characters): 11 | return ''.join(random.choice(characters) for _ in range(n_chars)) 12 | 13 | 14 | lower_case = list(character_generator('a', 'z')) 15 | upper_case = list(character_generator('A', 'Z')) 16 | punctuation = ['.', ',', ' ', '?', '!'] 17 | 18 | alphabet = lower_case + upper_case + punctuation 19 | -------------------------------------------------------------------------------- /basic_encryption/one_time_pad.py: -------------------------------------------------------------------------------- 1 | from common import alphabet, generate_one_time_pad 2 | 3 | 4 | def translate(message, one_time_pad, encrypt=True): 5 | new_message = '' 6 | 7 | n_chars = len(alphabet) 8 | 9 | for src, key in zip(message, one_time_pad): 10 | char_idx = alphabet.index(src) 11 | pad_idx = alphabet.index(key) 12 | if encrypt: 13 | new_char_idx = (char_idx + pad_idx) % n_chars 14 | elif not encrypt: 15 | new_char_idx = (char_idx - pad_idx) % n_chars 16 | new_message += alphabet[new_char_idx] 17 | 18 | return new_message 19 | 20 | 21 | message = 'This is an encrypted message.' 22 | secret_key = generate_one_time_pad(len(message), alphabet) 23 | encrypted_message = translate(message, secret_key, True) 24 | original_message = translate(encrypted_message, secret_key, False) 25 | 26 | print(message, '->', encrypted_message) 27 | print(encrypted_message, '->', original_message) 28 | -------------------------------------------------------------------------------- /basic_encryption/vignere.py: -------------------------------------------------------------------------------- 1 | from common import alphabet, generate_one_time_pad 2 | 3 | 4 | def make_vignere_table(): 5 | table = [['']] * len(alphabet) 6 | for idx, character in enumerate(alphabet): 7 | row = [] 8 | for char in alphabet[idx:]: 9 | row.append(char) 10 | for char in alphabet[:idx]: 11 | row.append(char) 12 | table[idx] = row 13 | return table 14 | 15 | 16 | def translate(message, vig_table, one_time_pad, encrypt=True): 17 | new_message = '' 18 | 19 | if encrypt: 20 | for src, key in zip(message, one_time_pad): 21 | row = vig_table[:][0].index(key) 22 | col = vig_table[0][:].index(src) 23 | new_message += vig_table[row][col] 24 | elif not encrypt: 25 | for src, key in zip(message, one_time_pad): 26 | row = vig_table[:][0].index(key) 27 | col = vig_table[row][:].index(src) 28 | new_message += vig_table[0][col] 29 | return new_message 30 | 31 | 32 | table = make_vignere_table() 33 | message = 'This is an encrypted message.' 
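# Why the table lookup works (a sketch, assuming the alphabet from common.py):
# row i of the table built by make_vignere_table() is the alphabet rotated
# left by i positions, so table[row][col] == alphabet[(row + col) % len(alphabet)].
# The encryption step in translate() is therefore the same modular addition
# used in one_time_pad.py, just expressed as a tabula recta.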
34 | secret_key = generate_one_time_pad(len(message), alphabet)
35 | encrypted_message = translate(message, table, secret_key, True)
36 | original_message = translate(encrypted_message, table, secret_key, False)
37 | 
38 | print(message, '->', encrypted_message)
39 | print(encrypted_message, '->', original_message)
40 | 
--------------------------------------------------------------------------------
/cmdline.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | from ReinforcementLearning.DeepQLearning.utils import plotLearning
4 | from ReinforcementLearning.DeepQLearning.simple_dqn_torch import Agent
5 | import numpy as np
6 | if __name__ == '__main__':
7 |     parser = argparse.ArgumentParser(
8 |         description='Command line Utility for training RL models')
9 |     # the hyphen makes the argument optional
10 |     parser.add_argument('-n_games', type=int, default=1,
11 |                         help='Number of games to play')
12 |     parser.add_argument('-lr', type=float, default=0.001,
13 |                         help='Learning rate for optimizer')
14 |     parser.add_argument('-eps_end', type=float, default=0.01,
15 |                         help='Final value for epsilon in epsilon-greedy action selection')
16 |     parser.add_argument('-gamma', type=float, default=0.99,
17 |                         help='Discount factor for update equation.')
18 |     parser.add_argument('-env', type=str, default='LunarLander-v2',
19 |                         help='OpenAI gym environment for agent')
20 |     parser.add_argument('-eps_dec', type=float, default=0.996,
21 |                         help='Multiplicative factor for decreasing epsilon')
22 |     parser.add_argument('-eps', type=float, default=1.0,
23 |                         help='Starting value for epsilon in epsilon-greedy action selection')
24 |     parser.add_argument('-max_mem', type=int, default=1000000,
25 |                         help='Maximum size for memory replay buffer')
26 |     parser.add_argument('-dims', type=int, default=8,
27 |                         help='Input dimensions; matches env observation, \
28 |                         must be list or tuple')
29 |     parser.add_argument('-bs', type=int, default=32,
30 |                         help='Batch size for replay memory sampling')
31 |     parser.add_argument('-n_actions', type=int, default=4,
32 |                         help='Number of actions in discrete action space')
33 |     args = parser.parse_args()
34 | 
35 |     env = gym.make(args.env)
36 | 
37 |     args.dims = [args.dims]
38 | 
39 |     agent = Agent(args.gamma, args.eps, args.lr, args.dims, args.bs,
40 |                   args.n_actions, args.max_mem, args.eps_end, args.eps_dec)
41 | 
42 |     eps_history, scores = [], []
43 |     for i in range(args.n_games):
44 |         observation = env.reset()
45 |         done = False
46 |         score = 0
47 |         while not done:
48 |             action = agent.chooseAction(observation)
49 |             observation_, reward, done, info = env.step(action)
50 |             score += reward
51 |             agent.storeTransition(observation, action,
52 |                                   reward, observation_, int(done))
53 |             observation = observation_
54 |             agent.learn()
55 | 
56 |         eps_history.append(agent.EPSILON)
57 |         scores.append(score)
58 | 
59 |         if i % 10 == 0 and i > 0:
60 |             avg_score = np.mean(scores[max(0, i-10):(i+1)])
61 |             print('episode: ', i, 'score: ', score,
62 |                   ' average score %.3f' % avg_score,
63 |                   'epsilon %.3f' % agent.EPSILON)
64 |         else:
65 |             print('episode: ', i, 'score: ', score)
66 | 
67 |     x = [i+1 for i in range(args.n_games)]
68 |     # filename should reflect whatever it is you are varying to tune your
69 |     # agent. For simplicity I'm just showing alpha and gamma, but it can be
70 |     # the epsilons as well. You can even include parameters for the fully
71 |     # connected layers and use them as part of the file name.
72 | filename = args.env + '_alpha' + str(args.lr) + '_gamma' + str(args.gamma) + \ 73 | '.png' 74 | plotLearning(x, scores, eps_history, filename) 75 | -------------------------------------------------------------------------------- /giveaway_scrubbed.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | # These people all indicated they are located in the US/CA or had a means to 5 | # ship to a US/CA address for forwarding. 6 | us_ca_entrants = ["Rintze", "Hasan", "Keith", "Joseph", "Asceptt", "Brian", 7 | "Xiaoyu", "Anik", "Devshank", "Jeremy", "Amin", "Brenton", 8 | "Remi", "Howard", "Michael", "Khizr", "Jay", "Ricardo", 9 | "Matt", "Chris", "Tanner", "Paul", "Pang", "Jose", "David", 10 | "Kurt", "Jesse"] 11 | 12 | # These people indicated they were in a foreign country and did not indicate 13 | # they had the means to ship to a foreign address. 14 | intl_entrants = ["Harsh"] 15 | 16 | # These people did not indicate where they were or their means to forward mail 17 | unknown_entrants = ["Gareth", "Dan", "Dileep", "Zeeshan", "Romin", "Dellan", 18 | "Marcin", "Wouter", "Cecil", "Jamal", "Gabriel", "ATV", 19 | "Violet", "Waqas", "Joy", "Tianqi", "Thomas"] 20 | 21 | random.seed(2022) 22 | 23 | gpu_winner = random.choice(us_ca_entrants) 24 | 25 | all_entrants = us_ca_entrants + intl_entrants + unknown_entrants 26 | 27 | nnai_winner = random.choice(all_entrants) 28 | 29 | dli_winners = [random.choice(all_entrants) for _ in range(5)] 30 | 31 | # Make sure there are no duplicate names, so there is no ambiguity in who won 32 | assert len(np.unique(us_ca_entrants)) == len(us_ca_entrants) 33 | 34 | assert len(np.unique(all_entrants)) == len(all_entrants) 35 | 36 | print('GPU Winner:', gpu_winner) 37 | 38 | print('NeuralNet.ai Subscription Winner:', nnai_winner) 39 | 40 | print('Deep Learning Institute winners:', dli_winners) 41 | -------------------------------------------------------------------------------- /giveaway_scrubbed_3-23.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | all_entrants = ['Henry Magregor', 'Charly B', 'Arjun H', 'Pete O', 'Lolis F', 5 | 'Kaan A', 'Inosiro', 'Brian B', 'Ben C', 'Jorge B', 'Jesse G', 6 | 'Hauke H', 'Pas D', 'Aditya C', 'Marc C', 'Logan G', 'Brian C', 7 | 'Antemasq', 'Alex D', 'Bibek P', 'Andrew S', 'Gonzalo B', 8 | 'Martin P', 'Bikash S', 'William P', 'Daniel A', 'Naomi G', 9 | 'Alex V', 'Chris G', 'Steve L', 'Felix G', 'Greg K', 'x g', 10 | ] 11 | 12 | random.seed(2023) 13 | 14 | gpu_winner = random.choice(all_entrants) 15 | 16 | all_entrants.remove(gpu_winner) 17 | 18 | nnai_winner = random.choice(all_entrants) 19 | 20 | all_entrants.remove(nnai_winner) 21 | 22 | dli_winners = [random.choice(all_entrants) for _ in range(5)] 23 | 24 | # Make sure there are no duplicate names, so there is no ambiguity in who won 25 | assert len(np.unique(all_entrants)) == len(all_entrants) 26 | 27 | print('GPU Winner:', gpu_winner) 28 | 29 | print('NeuralNet.ai Subscription Winner:', nnai_winner) 30 | 31 | print('Deep Learning Institute winners:', dli_winners) 32 | -------------------------------------------------------------------------------- /giveaway_scrubbed_9-22.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | # This time I didn't specify to indicate where you are located and if you had 5 | # the means to ship abroad. 
Silly oversight on my part, but that's life. 6 | # This means I'll do the drawing and email everyone first. If someone overseas 7 | # wins the GPU but can't get it shipped, then I'll subtract their name and draw 8 | # again. 9 | 10 | all_entrants = ['xiaoyu', 'sunil', 'kelvin', 'jacob', 'sean', 'dilith', 11 | 'noctildon', 'lukas_k', 'alex', 'matt_t', 'inosiro', 12 | 'f1datadiver', 'sambaran', 'dean_v_a', 'balaji', 'aditya', 13 | 'brian_cu', 'sim', 'philip', 'antonio', 'roumen', 'marc', 14 | 'william_p', 'michael_f', 'behnood', 'lucas_p', 'ahmed_k', 15 | 'jamal_c', 'luciano_d', 'amir-ul', 'kinal', 'sidhanath', 16 | 'lorenzo', 'michael_w', 'ravi_j', 'brigliano', 'hrovje', 17 | 'daniel_b', 'terry_w', 'jun', 'kurt_b', 'hauke', 'super_dave', 18 | 'george', 'lukas_d', 'waleed', 'clark', 'frak', 'ravi_c', 19 | 'sawaiz', 'ferran', 'jack-ziad', 'christian_g', 'zxavier', 20 | 'daniel_k', 'akash', 'jbene', 'hause', 'jack', 'cristiano', 21 | 'nguyen_q_d', 'tatonata', 'dennis_f', 'till_z', 'dusan', 22 | 'abdennacer', 'antonio_p', 'dilan', 'adam_b', 'brian_co', 23 | 'k_ali', 'matt_r', 'navoda', 'doyun', 'william_s', 'jed_j', 24 | 'bijay', 'bruno', 'shivam', 'arjun_h', 'emil', 'abdulla_m', 25 | 'nick', 'joyce_w', 'abhinav', 'alex_v', 'ruturaj_s'] 26 | 27 | random.seed(2022) 28 | 29 | gpu_winner = random.choice(all_entrants) 30 | 31 | all_entrants.remove(gpu_winner) 32 | 33 | nnai_winner = random.choice(all_entrants) 34 | 35 | all_entrants.remove(nnai_winner) 36 | 37 | dli_winners = [random.choice(all_entrants) for _ in range(5)] 38 | 39 | # Make sure there are no duplicate names, so there is no ambiguity in who won 40 | assert len(np.unique(all_entrants)) == len(all_entrants) 41 | 42 | print('GPU Winner:', gpu_winner) 43 | 44 | print('NeuralNet.ai Subscription Winner:', nnai_winner) 45 | 46 | print('Deep Learning Institute winners:', dli_winners) 47 | -------------------------------------------------------------------------------- /tf_embeddings.py: -------------------------------------------------------------------------------- 1 | import io 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | import tensorflow_datasets as tfds 7 | 8 | #embedding_layer = layers.Embedding(1000, 5) 9 | 10 | #result = embedding_layer(tf.constant([1,2,3])) 11 | 12 | #print(result.numpy()) 13 | #print(result.numpy().shape) 14 | def get_batch_data(): 15 | (train_data, test_data), info = tfds.load('imdb_reviews/subwords8k', 16 | split=(tfds.Split.TRAIN, tfds.Split.TEST), 17 | with_info=True, as_supervised=True) 18 | 19 | encoder = info.features['text'].encoder 20 | #print(encoder.subwords[:20]) 21 | padded_shapes = ([None], ()) 22 | train_batches = train_data.shuffle(1000).padded_batch(10, 23 | padded_shapes=padded_shapes) 24 | test_batches = test_data.shuffle(1000).padded_batch(10, 25 | padded_shapes=padded_shapes) 26 | return train_batches, test_batches, encoder 27 | 28 | def get_model(encoder, embedding_dim=16): 29 | 30 | model = keras.Sequential([ 31 | layers.Embedding(encoder.vocab_size, embedding_dim), 32 | layers.GlobalAveragePooling1D(), 33 | layers.Dense(1, activation='sigmoid')]) 34 | 35 | model.compile(optimizer='adam', loss='binary_crossentropy', 36 | metrics=['accuracy']) 37 | return model 38 | 39 | def plot_history(history): 40 | history_dict = history.history 41 | acc = history_dict['accuracy'] 42 | val_acc = history_dict['val_accuracy'] 43 | epochs = range(1, len(acc) + 1) 44 | 45 | plt.figure(figsize=(12,9)) 46 | 
plt.plot(epochs, acc, 'bo', label='Training acc') 47 | plt.plot(epochs, val_acc, 'b', label='Validation acc') 48 | plt.title('Training and validation accuracy') 49 | plt.xlabel('Epochs') 50 | plt.ylabel('Accuracy') 51 | plt.legend(loc='lower right') 52 | plt.ylim((0.5, 1)) 53 | plt.show() 54 | 55 | def retrieve_embeddings(model, encoder): 56 | out_vectors = io.open('vecs.tsv', 'w', encoding='utf-8') 57 | out_metadata = io.open('meta.tsv', 'w', encoding='utf-8') 58 | weights = model.layers[0].get_weights()[0] 59 | 60 | for num, word in enumerate(encoder.subwords): 61 | vec = weights[num+1] 62 | out_metadata.write(word + '\n') 63 | out_vectors.write('\t'.join([str(x) for x in vec]) + '\n') 64 | out_vectors.close() 65 | out_metadata.close() 66 | 67 | train_batches, test_batches, encoder = get_batch_data() 68 | model = get_model(encoder) 69 | history = model.fit(train_batches, epochs=10, validation_data=test_batches, 70 | validation_steps=20) 71 | plot_history(history) 72 | retrieve_embeddings(model, encoder) 73 | -------------------------------------------------------------------------------- /tf_sentiment.py: -------------------------------------------------------------------------------- 1 | import tensorflow_datasets as tfds 2 | import tensorflow as tf 3 | 4 | dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True, 5 | as_supervised=True) 6 | train_dataset, test_dataset = dataset['train'], dataset['test'] 7 | 8 | encoder = info.features['text'].encoder 9 | 10 | BUFFER_SIZE = 10000 11 | BATCH_SIZE = 64 12 | 13 | padded_shapes = ([None], ()) 14 | 15 | train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, 16 | padded_shapes=padded_shapes) 17 | 18 | test_dataset = test_dataset.padded_batch(BATCH_SIZE, 19 | padded_shapes=padded_shapes) 20 | 21 | model = tf.keras.Sequential([tf.keras.layers.Embedding(encoder.vocab_size, 64), 22 | tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)), 23 | tf.keras.layers.Dense(64, activation='relu'), 24 | tf.keras.layers.Dense(1, activation='sigmoid')]) 25 | model.compile(loss='binary_crossentropy', 26 | optimizer=tf.keras.optimizers.Adam(1e-4), 27 | metrics=['accuracy']) 28 | 29 | history = model.fit(train_dataset, epochs=5, validation_data=test_dataset, 30 | validation_steps=30) 31 | 32 | def pad_to_size(vec, size): 33 | zeros = [0]*(size-len(vec)) 34 | vec.extend(zeros) 35 | return vec 36 | 37 | def sample_predict(sentence, pad, model_): 38 | encoded_sample_pred_text = encoder.encode(sentence) 39 | if pad: 40 | encoded_sample_pred_text = pad_to_size(encoded_sample_pred_text, 64) 41 | encoded_sample_pred_text = tf.cast(encoded_sample_pred_text, tf.float32) 42 | predictions = model_.predict(tf.expand_dims(encoded_sample_pred_text, 0)) 43 | 44 | return predictions 45 | 46 | sample_text = ('This movie was awesome. The acting was incredible. Highly recommend') 47 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100 48 | 49 | print('probability this is a positive review %.2f' % predictions) 50 | 51 | sample_text = ('This movie was so so. The acting was medicore. 
Kind of recommend')
52 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
53 | 
54 | print('probability this is a positive review %.2f' % predictions)
55 | 
56 | model = tf.keras.Sequential([tf.keras.layers.Embedding(encoder.vocab_size, 64),
57 |                              tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
58 |                              tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
59 |                              tf.keras.layers.Dense(64, activation='relu'),
60 |                              tf.keras.layers.Dropout(0.5),
61 |                              tf.keras.layers.Dense(1, activation='sigmoid')])
62 | model.compile(loss='binary_crossentropy',
63 |               optimizer=tf.keras.optimizers.Adam(1e-4),
64 |               metrics=['accuracy'])
65 | 
66 | history = model.fit(train_dataset, epochs=5, validation_data=test_dataset,
67 |                     validation_steps=30)
68 | sample_text = ('This movie was awesome. The acting was incredible. Highly recommend')
69 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
70 | 
71 | print('probability this is a positive review %.2f' % predictions)
72 | 
73 | sample_text = ('This movie was so so. The acting was medicore. Kind of recommend')
74 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
75 | 
76 | print('probability this is a positive review %.2f' % predictions)
77 | 
78 | 
--------------------------------------------------------------------------------
/threaded.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import numpy as np
3 | import glob
4 | from keras.preprocessing.image import ImageDataGenerator
5 | from multiprocessing.dummy import Pool as ThreadPool
6 | 
7 | def augment_images(raw_images, files, mult_factor):
8 |     gen = ImageDataGenerator()
9 |     for idx, image in enumerate(raw_images):
10 |         for mult in range(mult_factor):
11 |             img_fname = files[idx].split('/')[4]
12 |             img_fname = '../../Data/AugmentedImages/' + \
13 |                         img_fname.split('.')[0] + '_' + str(mult) + '.jpg'
14 | 
15 |             theta_tfx = np.random.choice(range(270))  # random rotation angle in degrees
16 |             transformed_raw_image = gen.apply_transform(image,
17 |                                                         {'theta': theta_tfx})
18 |             new_image = Image.fromarray(transformed_raw_image, 'RGB')
19 |             new_image = new_image.resize((1024, 1024), Image.ANTIALIAS)
20 |             new_image.save(img_fname)
21 |             transformed_raw_image = None
22 |             new_image = None
23 | 
24 | if __name__ == '__main__':
25 |     raw_images_dir = '../../Data/RawImages/'
26 |     raw_image_files = sorted(glob.glob(raw_images_dir + '*.jpg',
27 |                              recursive=True))
28 | 
29 |     img_list = []
30 |     for file in raw_image_files:
31 |         img_list.append(np.array(Image.open(file)))
32 |     augment_images(img_list, raw_image_files, mult_factor=10)
33 | 
--------------------------------------------------------------------------------
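threaded.py imports ThreadPool from multiprocessing.dummy but never uses it, so the augmentation above runs in a single thread. Below is a minimal sketch of how the same work could be spread across worker threads with that import; augment_chunk, run_parallel, and the worker count are illustrative names and values rather than part of the repository, and they assume augment_images as defined above.

from multiprocessing.dummy import Pool as ThreadPool

def augment_chunk(chunk):
    # Each chunk is an (images, filenames) pair handled by one worker thread.
    images, files = chunk
    augment_images(images, files, mult_factor=10)

def run_parallel(img_list, raw_image_files, n_workers=4):
    # Round-robin split so every worker gets a similar share of the images.
    chunks = [(img_list[i::n_workers], raw_image_files[i::n_workers])
              for i in range(n_workers)]
    with ThreadPool(n_workers) as pool:
        pool.map(augment_chunk, chunks)

Because the heavy lifting is NumPy and PIL, threads can overlap some of the work despite the GIL; swapping multiprocessing.dummy for multiprocessing would give true process parallelism at the cost of pickling the image arrays.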