├── CNN.py
├── Kaggle
│   └── venus-volcanoes.ipynb
├── README.md
├── ReinforcementLearning
│   ├── CombinedExperienceReplay
│   │   ├── dqn_torch.py
│   │   ├── main.py
│   │   ├── memory.py
│   │   ├── memory_solution.py
│   │   └── plot.py
│   ├── DeepQLearning
│   │   ├── archive
│   │   │   ├── dqn_keras.py
│   │   │   ├── dqn_tf.py
│   │   │   ├── frame_stack_test.py
│   │   │   ├── main_keras_dqn_pong.py
│   │   │   ├── main_tf_dqn_lunar_lander.py
│   │   │   ├── main_torch_dqn_lunar_lander.py
│   │   │   ├── main_torch_dqn_space_invaders.py
│   │   │   ├── q_eval.h5
│   │   │   ├── q_next.h5
│   │   │   ├── simple_dqn_tf.py
│   │   │   ├── simple_dqn_torch.py
│   │   │   └── torch_deep_q_model.py
│   │   ├── ddqn_keras.py
│   │   ├── dueling_ddqn_tf2.py
│   │   ├── dueling_ddqn_torch.py
│   │   ├── dueling_dqn_keras.py
│   │   ├── dueling_dqn_torch.py
│   │   ├── main_keras_ddqn_lunar_lander.py
│   │   ├── main_keras_dqn_lunar_lander.py
│   │   ├── main_keras_dueling_dqn_lunar_lander.py
│   │   ├── main_tf2_dqn_lunar_lander.py
│   │   ├── main_tf2_dueling_ddqn_lunar_lander.py
│   │   ├── main_tf_dqn_breakout.py
│   │   ├── main_torch_dqn_lunar_lander_2020.py
│   │   ├── main_torch_dueling_ddqn_lunar_lander.py
│   │   ├── main_torch_dueling_dqn_lunar_lander.py
│   │   ├── simple_dqn_keras.py
│   │   ├── simple_dqn_tf2.py
│   │   ├── simple_dqn_torch_2020.py
│   │   └── utils.py
│   ├── Fundamentals
│   │   ├── acrobot.py
│   │   ├── blackJack-no-es.py
│   │   ├── blackJack-off-policy.py
│   │   ├── cartpole_qlearning.py
│   │   ├── doubleQLearning.py
│   │   ├── dynamic_programming.py
│   │   ├── gridworld.py
│   │   ├── mountaincar.png
│   │   ├── mountaincar.py
│   │   ├── n_step_sarsa.py
│   │   └── sarsa.py
│   ├── ICM
│   │   ├── A3C_CartPole_no_rewards.png
│   │   ├── ICM_CartPole_no_rewards.png
│   │   ├── actor_critic.py
│   │   ├── icm.py
│   │   ├── main.py
│   │   ├── memory.py
│   │   ├── parallel_env.py
│   │   ├── shared_adam.py
│   │   ├── utils.py
│   │   └── worker.py
│   └── PolicyGradient
│       ├── A3C
│       │   └── pytorch
│       │       └── a3c.py
│       ├── DDPG
│       │   ├── pytorch
│       │   │   └── lunar-lander
│       │   │       ├── Torch-LunarLander-alpha000025-beta00025-400-300.png
│       │   │       ├── ddpg_torch.py
│       │   │       ├── main_torch.py
│       │   │       └── utils.py
│       │   ├── tensorflow
│       │   │   ├── pendulum
│       │   │   │   ├── ddpg_orig_tf.py
│       │   │   │   ├── ddpg_tf.py
│       │   │   │   ├── main_tf.py
│       │   │   │   └── utils.py
│       │   │   └── walker2d
│       │   │       ├── ddpg_orig_tf.py
│       │   │       ├── main_tf.py
│       │   │       └── tmp
│       │   │           └── ddpg_best_3
│       │   │               ├── Actor_ddpg.ckpt.data-00000-of-00001
│       │   │               ├── Actor_ddpg.ckpt.index
│       │   │               ├── Actor_ddpg.ckpt.meta
│       │   │               ├── Critic_ddpg.ckpt.data-00000-of-00001
│       │   │               ├── Critic_ddpg.ckpt.index
│       │   │               ├── Critic_ddpg.ckpt.meta
│       │   │               ├── TargetActor_ddpg.ckpt.data-00000-of-00001
│       │   │               ├── TargetActor_ddpg.ckpt.index
│       │   │               ├── TargetActor_ddpg.ckpt.meta
│       │   │               ├── TargetCritic_ddpg.ckpt.data-00000-of-00001
│       │   │               ├── TargetCritic_ddpg.ckpt.index
│       │   │               └── TargetCritic_ddpg.ckpt.meta
│       │   └── tensorflow2
│       │       └── pendulum
│       │           ├── buffer.py
│       │           ├── ddpg_tf2.py
│       │           ├── main_ddpg.py
│       │           ├── networks.py
│       │           ├── pendulum.png
│       │           └── utils.py
│       ├── PPO
│       │   ├── tf2
│       │   │   ├── agent.py
│       │   │   ├── main.py
│       │   │   ├── memory.py
│       │   │   ├── networks.py
│       │   │   └── utils.py
│       │   └── torch
│       │       ├── Slides.pdf
│       │       ├── cartpole.png
│       │       ├── main.py
│       │       ├── ppo_torch.py
│       │       └── utils.py
│       ├── SAC
│       │   ├── buffer.py
│       │   ├── main_sac.py
│       │   ├── networks.py
│       │   ├── sac_torch.py
│       │   ├── tf2
│       │   │   ├── Slides.pdf
│       │   │   ├── buffer.py
│       │   │   ├── main_sac.py
│       │   │   ├── networks.py
│       │   │   ├── plots
│       │   │   │   └── inverted_pendulum.png
│       │   │   ├── sac_tf2.py
│       │   │   └── utils.py
│       │   └── utils.py
│       ├── TD3
│       │   ├── main.py
│       │   ├── td3_torch.py
│       │   ├── tf2
│       │   │   ├── main.py
│       │   │   ├── plots
│       │   │   │   └── walker_1500_games.png
│       │   │   ├── td3_tf2.py
│       │   │   └── utils.py
│       │   └── utils.py
│       ├── actor_critic
│       │   ├── actor_critic_continuous.py
│       │   ├── actor_critic_keras.py
│       │   ├── actor_critic_replay_torch.py
│       │   ├── cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc.png
│       │   ├── continuous_mountain_car_actor_critic.py
│       │   ├── discrete_cartpole.py
│       │   ├── main_keras_actor_critic_lunar_lander.py
│       │   ├── main_torch_actor_critic_replay_lunar_lander.py
│       │   ├── mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png
│       │   ├── tensorflow2
│       │   │   ├── actor_critic.py
│       │   │   ├── cartpole.png
│       │   │   ├── main.py
│       │   │   ├── networks.py
│       │   │   └── utils.py
│       │   ├── torch_actor_critic_discrete.py
│       │   ├── torch_discrete_lunar_lander.py
│       │   └── utils.py
│       └── reinforce
│           ├── main_keras_reinforce_lunar_lander.py
│           ├── main_tf_reinforce_lunar_lander.py
│           ├── main_tf_reinforce_space_invaders.py
│           ├── main_torch_reinforce_lunar_lander.py
│           ├── reinforce_cnn_tf.py
│           ├── reinforce_keras.py
│           ├── reinforce_tf.py
│           ├── reinforce_torch.py
│           ├── space-invaders-alpha001-gamma0p9-decay0p99-newGcalc.png
│           ├── tensorflow2
│           │   ├── lunar-lander-tf2-256x256-alpha0005-2000games.png
│           │   ├── main.py
│           │   ├── networks.py
│           │   └── reinforce_tf2.py
│           └── utils.py
├── basic_encryption
│   ├── caesar.py
│   ├── common.py
│   ├── one_time_pad.py
│   └── vignere.py
├── cmdline.py
├── giveaway_scrubbed.py
├── giveaway_scrubbed_3-23.py
├── giveaway_scrubbed_9-22.py
├── modular_cnn.py
├── simple_cnn_mnist.py
├── simple_nn_mnist.py
├── tf_embeddings.py
├── tf_sentiment.py
├── tf_text_gen.py
└── threaded.py
/CNN.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import struct
3 | import tensorflow as tf
4 |
5 | def load_data():
6 | with open('train-labels.idx1-ubyte', 'rb') as labels:
7 | magic, n = struct.unpack('>II', labels.read(8))
8 | train_labels = np.fromfile(labels, dtype=np.uint8)
9 | with open('train-images.idx3-ubyte', 'rb') as imgs:
10 | magic, num, nrows, ncols = struct.unpack('>IIII', imgs.read(16))
11 | train_images = np.fromfile(imgs, dtype=np.uint8).reshape(num,784)
12 | with open('t10k-labels.idx1-ubyte', 'rb') as labels:
13 | magic, n = struct.unpack('>II', labels.read(8))
14 | test_labels = np.fromfile(labels, dtype=np.uint8)
15 | with open('t10k-images.idx3-ubyte', 'rb') as imgs:
16 | magic, num, nrows, ncols = struct.unpack('>IIII', imgs.read(16))
17 | test_images = np.fromfile(imgs, np.uint8).reshape(num,784)
18 | return train_images, train_labels, test_images, test_labels
19 |
20 | def cnn_model_fn(features, labels, mode):
21 | input_layer = tf.cast(tf.reshape(features['x'], [-1, 28, 28, 1]), tf.float16)
22 |
23 | conv1 = tf.layers.conv2d(inputs=input_layer,
24 | filters=16,
25 | kernel_size=[5,5],
26 | padding='same',
27 | activation=tf.nn.relu)
28 |
29 | pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2,2], strides=2)
30 |
31 | conv2 = tf.layers.conv2d(inputs=pool1,
32 | filters=32,
33 | kernel_size=[5,5],
34 | padding='same',
35 | activation=tf.nn.relu)
36 |
37 | pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size = [2,2], strides=2)
38 |
39 | pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 32])
40 |
41 | dense = tf.layers.dense(inputs=pool2_flat, units=128, activation=tf.nn.relu)
42 | logits = tf.layers.dense(inputs=dense, units=10)
43 |
44 | predictions = {
45 | 'classes': tf.argmax(input=logits, axis=1),
46 | 'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
47 | }
48 |
49 | if mode == tf.estimator.ModeKeys.PREDICT:
50 | return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
51 |
52 | onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
53 |
54 | loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
55 |
56 | if mode == tf.estimator.ModeKeys.TRAIN:
57 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
58 | train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
59 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
60 |
61 | if mode == tf.estimator.ModeKeys.EVAL:
62 | eval_metric_ops = { 'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions['classes'])}
63 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
64 |
65 | if __name__ == '__main__':
66 | training_data, training_labels, testing_data, testing_labels = load_data()
67 | num_epochs = 10
68 |
69 | classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
70 | model_dir='tmp/')
71 |
72 | input_fn = tf.estimator.inputs.numpy_input_fn(
73 | x={"x": training_data},
74 | y=training_labels,
75 | batch_size=32,
76 | num_epochs=None,
77 | shuffle=True)
78 |
79 | for i in range(num_epochs):
80 | classifier.train(input_fn=input_fn, steps=1000)
81 |
82 | eval_input_fn = tf.estimator.inputs.numpy_input_fn(
83 | x={'x': testing_data},
84 | y=testing_labels,
85 | shuffle=False)
86 |
87 | eval_results = classifier.evaluate(input_fn=eval_input_fn)
88 | print('these are the results of my evaluations')
89 | print(eval_results)
90 |
91 | pred_input_fn = tf.estimator.inputs.numpy_input_fn(
92 | x={'x': testing_data},
93 | y=testing_labels,
94 | num_epochs=1,
95 | shuffle=False)
96 |
97 | pred_results = classifier.predict(input_fn=pred_input_fn)
98 | predicted_classes = [p['classes'] for p in pred_results]
--------------------------------------------------------------------------------
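Note: CNN.py targets the TensorFlow 1.x API (tf.layers, tf.estimator.inputs), which no longer exists in TensorFlow 2.x. For readers on a current install, a minimal Keras sketch of the same architecture (two 5x5 conv layers with 16 and 32 filters, 2x2 max pooling, a 128-unit dense layer, and 10 logits) might look like the following; this is an illustrative assumption on my part, not part of the repository.

import tensorflow as tf

def build_model():
    # mirrors the conv/pool/dense stack defined in cnn_model_fn above
    return tf.keras.Sequential([
        tf.keras.layers.Input(shape=(28, 28, 1)),
        tf.keras.layers.Conv2D(16, 5, padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D(2),
        tf.keras.layers.Conv2D(32, 5, padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D(2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10),  # logits
    ])

if __name__ == '__main__':
    model = build_model()
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    # train_images / train_labels as returned by load_data() in CNN.py:
    # model.fit(train_images.reshape(-1, 28, 28, 1) / 255.0, train_labels,
    #           batch_size=32, epochs=10)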
/README.md:
--------------------------------------------------------------------------------
1 | # Youtube-Code-Repository
2 | Repository for all the code from my YouTube channel.
3 | You can find me at https://youtube.com/MachineLearningWithPhil
4 |
5 | Kaggle/Venus-Volcanoes
6 |
7 | My crude implementation of a convolutional neural network to perform image classification on data gathered
8 | by the Magellan spacecraft. The data is horribly skewed, as most images do not contain a volcano.
9 | This means we'll have to do some creative data engineering for our model training.
10 | Please note that in the test set, 84.1% of the data is labeled "no volcano", so our model's
11 | accuracy of around 88% is a modest improvement over a model that always predicts 0 (no volcano).
12 |
13 | You can check out the video for this at https://youtu.be/Ki-xOKydQrY
14 | You can find the data for this project at https://www.kaggle.com/fmena14/volcanoesvenus/home
15 | ReinforcementLearning/DeepQLearning
16 |
17 | My implementation of the Deep Q learning algorithm in PyTorch. Here we teach the algorithm to play the game of space invaders. I haven't had enough time to train this model yet, as it takes quite some time even on my 1080Ti / i7 7820k @ 4.4 GHz. I'll train
18 | longer and provide a video on how well it does at a later time.
19 |
20 | The blog post talking about how Deep Q learning works can be found at http://www.neuralnet.ai/coding-a-deep-q-network-in-pytorch/
21 | Video for this is at https://www.youtube.com/watch?v=RfNxXlO6BiA&t=2s
22 |
23 |
24 |
25 | CNN.py
26 |
27 | Simple implementation of a convolutional neural network in TensorFlow, version 1.5.
28 | Video tutorial on this code can be found here https://youtu.be/azFyHS0odcM
29 | Achieves accuracy of 98% after 10 epochs of training
30 | Requires data from http://yann.lecun.com/exdb/mnist/
31 |
32 | ReinforcementLearning/blackJack-no-es.py
33 |
34 | Implementation of Monte Carlo control without exploring starts in the blackjack environment from the OpenAI gym.
35 | Video tutorial on this code can be found at https://youtu.be/e8ofon3sg8E
36 | Algorithm trains for 1,000,000 games and produces a win rate of around 42%, loss rate of 52% and draw rate of 6%
37 |
38 | ReinforcementLearning/blackJack-off-policy.py
39 |
40 | Implementation of off policy Monte Carlo control in the blackjack environment from the OpenAI gym.
41 | Video tutorial on this code can be found at https://youtu.be/TvO0Sa-6UVc
42 | Algorithm trains for 1,000,000 games and produces a win rate of around 29%, loss rate of 66% and draw rate of 5%
43 |
44 | ReinforcementLearning/cartpole_qlearning.py
45 |
46 | Implementation of the Q learning algorithm for the cart pole problem. Code is based on the course by lazy programmer,
47 | which you can find here
48 | Video tutorial on this code can be found at https://youtu.be/ViwBAK8Hd7Q
49 |
50 | ReinforcementLearning/doubleQLearning.py
51 |
52 | Implementation of the double Q learning algorithm in the cart pole environment. This is based on my course on
53 | reinforcement learning, which you can find at this repo
54 | Video tutorial on this code can be found at https://youtu.be/Q99bEPStnxk
55 |
56 | ReinforcementLearning/sarsa.py
57 |
58 | Implementation of the SARSA algorithm in the cart pole environment. This is based on my course on reinforcement learning,
59 | which can be found here
60 | Video tutorial on this code can be found at https://youtu.be/P9XezMuPfLE
61 |
--------------------------------------------------------------------------------
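For reference, the Fundamentals scripts listed above (cartpole_qlearning.py, sarsa.py, doubleQLearning.py) all build on the tabular temporal-difference update. A minimal generic sketch of the two update rules, assuming states have already been discretized to integer indices (illustrative only, not the repo's code):

import numpy as np

n_states, n_actions = 100, 2            # assumed sizes for illustration
Q = np.zeros((n_states, n_actions))
alpha, gamma = 0.1, 0.99

def q_learning_update(s, a, r, s_next):
    # off-policy: bootstrap from the greedy action in the next state
    Q[s, a] += alpha * (r + gamma * np.max(Q[s_next]) - Q[s, a])

def sarsa_update(s, a, r, s_next, a_next):
    # on-policy: bootstrap from the action actually selected next
    Q[s, a] += alpha * (r + gamma * Q[s_next, a_next] - Q[s, a])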
/ReinforcementLearning/CombinedExperienceReplay/dqn_torch.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 | from memory import ReplayMemory
7 |
8 |
9 | class DeepQNetwork(nn.Module):
10 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
11 | super(DeepQNetwork, self).__init__()
12 | self.input_dims = input_dims
13 | self.fc1_dims = fc1_dims
14 | self.fc2_dims = fc2_dims
15 | self.n_actions = n_actions
16 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
17 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
18 | self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
19 |
20 | self.optimizer = optim.Adam(self.parameters(), lr=lr)
21 | self.loss = nn.MSELoss()
22 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
23 | self.to(self.device)
24 |
25 | def forward(self, state):
26 | x = F.relu(self.fc1(state))
27 | x = F.relu(self.fc2(x))
28 | actions = self.fc3(x)
29 |
30 | return actions
31 |
32 |
33 | class Agent:
34 | def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
35 | combined=False, max_mem_size=100000, eps_end=0.05,
36 | eps_dec=5e-4):
37 | self.gamma = gamma
38 | self.epsilon = epsilon
39 | self.eps_min = eps_end
40 | self.eps_dec = eps_dec
41 | self.lr = lr
42 | self.action_space = [i for i in range(n_actions)]
43 | self.batch_size = batch_size
44 | self.memory = ReplayMemory(input_dims, max_mem_size,
45 | batch_size, combined)
46 | self.iter_cntr = 0
47 | self.replace_target = 100
48 |
49 | self.Q_eval = DeepQNetwork(lr, n_actions=n_actions,
50 | input_dims=input_dims,
51 | fc1_dims=256, fc2_dims=256)
52 | self.Q_next = DeepQNetwork(lr, n_actions=n_actions,
53 | input_dims=input_dims,
54 | fc1_dims=256, fc2_dims=256)
55 |
56 | def choose_action(self, observation):
57 | if np.random.random() > self.epsilon:
58 | state = T.tensor([observation]).to(self.Q_eval.device)
59 | actions = self.Q_eval.forward(state)
60 | action = T.argmax(actions).item()
61 | else:
62 | action = np.random.choice(self.action_space)
63 |
64 | return action
65 |
66 | def learn(self):
67 | if not self.memory.is_sufficient():
68 | return
69 |
70 | self.Q_eval.optimizer.zero_grad()
71 | batch_index = np.arange(self.batch_size, dtype=np.int32)
72 | states, actions, rewards, new_states, dones = \
73 | self.memory.sample_memory()
74 | states = T.tensor(states).to(self.Q_eval.device)
75 | new_states = T.tensor(new_states).to(self.Q_eval.device)
76 | rewards = T.tensor(rewards).to(self.Q_eval.device)
77 | dones = T.tensor(dones).to(self.Q_eval.device)
78 | q_eval = self.Q_eval.forward(states)[batch_index, actions]
79 | q_next = self.Q_eval.forward(new_states)
80 | q_next[dones] = 0.0
81 | q_target = rewards + self.gamma*T.max(q_next, dim=1)[0]
82 |
83 | loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
84 | loss.backward()
85 | self.Q_eval.optimizer.step()
86 |
87 | self.iter_cntr += 1
88 | self.epsilon = self.epsilon - self.eps_dec \
89 | if self.epsilon > self.eps_min else self.eps_min
90 |
91 | if self.iter_cntr % self.replace_target == 0:
92 | self.Q_next.load_state_dict(self.Q_eval.state_dict())
93 |
--------------------------------------------------------------------------------
/ReinforcementLearning/CombinedExperienceReplay/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | from dqn_torch import Agent
4 | import numpy as np
5 |
6 |
7 | if __name__ == '__main__':
8 | parser = argparse.ArgumentParser(description='')
9 | parser.add_argument('-bs', type=int, default=1000)
10 | parser.add_argument('-cer', type=bool, default=False)
11 | # bool() of any non-empty string is True, so passing any value (e.g. -cer 1) enables CER
12 | args = parser.parse_args()
13 |
14 | env = gym.make('LunarLander-v2')
15 | combined = args.cer
16 | buffer_size = args.bs
17 |
18 | agent = Agent(gamma=0.99, epsilon=0.1, batch_size=64, n_actions=4,
19 | eps_end=0.1, input_dims=[8], lr=0.001,
20 | max_mem_size=buffer_size, combined=combined)
21 |
22 | scores = []
23 | n_games = 500
24 | for i in range(n_games):
25 | score = 0
26 | done = False
27 | observation = env.reset()
28 | while not done:
29 | action = agent.choose_action(observation)
30 | observation_, reward, done, info = env.step(action)
31 | score += reward
32 | agent.memory.store_transition(observation, action, reward,
33 | observation_, done)
34 | agent.learn()
35 | observation = observation_
36 | scores.append(score)
37 |
38 | avg_score = np.mean(scores[-100:])
39 |
40 | print('combined {} episode {} score {:.0f} avg score {:.0f} eps {:.2f}'
41 | .format(combined, i, score, avg_score, agent.epsilon))
42 |
43 | if combined:
44 | fname = 'CER_const_eps_' + str(buffer_size) + '.npy'
45 | else:
46 | fname = 'VER_const_eps_' + str(buffer_size) + '.npy'
47 | np.save(fname, np.array(scores))
48 |
--------------------------------------------------------------------------------
/ReinforcementLearning/CombinedExperienceReplay/memory.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class ReplayMemory:
5 | def __init__(self, input_dims, max_mem, batch_size, combined=False):
6 | pass
7 |
8 | def store_transition(self, state, action, reward, state_, terminal):
9 | pass
10 |
11 | def sample_memory(self):
12 | pass
13 |
14 | def is_sufficient(self):
15 | pass
16 |
--------------------------------------------------------------------------------
/ReinforcementLearning/CombinedExperienceReplay/memory_solution.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class ReplayMemory:
5 | def __init__(self, input_dims, max_mem, batch_size, combined=False):
6 | self.mem_size = max_mem
7 | self.batch_size = batch_size
8 | self.mem_cntr = 0
9 | self.combined = combined
10 | self.state_memory = np.zeros((self.mem_size, *input_dims),
11 | dtype=np.float32)
12 | self.new_state_memory = np.zeros((self.mem_size, *input_dims),
13 | dtype=np.float32)
14 | self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
15 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
16 | self.terminal_memory = np.zeros(self.mem_size, dtype=bool)
17 |
18 | def store_transition(self, state, action, reward, state_, terminal):
19 | index = self.mem_cntr % self.mem_size
20 | self.state_memory[index] = state
21 | self.action_memory[index] = action
22 | self.reward_memory[index] = reward
23 | self.new_state_memory[index] = state_
24 | self.terminal_memory[index] = terminal
25 |
26 | self.mem_cntr += 1
27 |
28 | def sample_memory(self):
29 | offset = 1 if self.combined else 0
30 | max_mem = min(self.mem_cntr, self.mem_size) - offset
31 | batch = np.random.choice(max_mem, self.batch_size-offset,
32 | replace=False)
33 | states = self.state_memory[batch]
34 | new_states = self.new_state_memory[batch]
35 | actions = self.action_memory[batch]
36 | rewards = self.reward_memory[batch]
37 | terminals = self.terminal_memory[batch]
38 |
39 | if self.combined:
40 | index = self.mem_cntr % self.mem_size - 1
41 | last_action = self.action_memory[index]
42 | last_state = self.state_memory[index]
43 | last_new_state = self.new_state_memory[index]
44 | last_reward = self.reward_memory[index]
45 | last_terminal = self.terminal_memory[index]
46 |
47 | actions = np.append(self.action_memory[batch], last_action)
48 | states = np.vstack((self.state_memory[batch], last_state))
49 | new_states = np.vstack((self.new_state_memory[batch],
50 | last_new_state))
51 | rewards = np.append(self.reward_memory[batch], last_reward)
52 | terminals = np.append(self.terminal_memory[batch], last_terminal)
53 |
54 | return states, actions, rewards, new_states, terminals
55 |
56 | def is_sufficient(self):
57 | return self.mem_cntr > self.batch_size
58 |
--------------------------------------------------------------------------------
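A quick usage sketch for the solution above, assuming it is importable as memory_solution: with combined=True the most recently stored transition is always appended to the uniformly sampled batch, which is the whole point of combined experience replay.

import numpy as np
from memory_solution import ReplayMemory

mem = ReplayMemory(input_dims=[8], max_mem=1000, batch_size=32, combined=True)
for step in range(200):
    s = np.random.randn(8).astype(np.float32)
    mem.store_transition(s, action=step % 4, reward=float(step),
                         state_=s, terminal=False)

states, actions, rewards, new_states, dones = mem.sample_memory()
print(states.shape)    # (32, 8): 31 uniform samples plus the latest transition
print(rewards[-1])     # 199.0: the reward of the most recently stored transition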
/ReinforcementLearning/CombinedExperienceReplay/plot.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | cer_1k = np.load('CER_const_eps_1000.npy')
4 | cer_10k = np.load('CER_const_eps_10000.npy')
5 | cer_100k = np.load('CER_const_eps_100000.npy')
6 |
7 | ver_1k = np.load('VER_const_eps_1000.npy')
8 | ver_10k = np.load('VER_const_eps_10000.npy')
9 | ver_100k = np.load('VER_const_eps_100000.npy')
10 |
11 | running_cer1k_avg = np.zeros(len(cer_1k))
12 | running_cer10k_avg = np.zeros(len(cer_10k))
13 | running_cer100k_avg = np.zeros(len(cer_100k))
14 | running_ver1k_avg = np.zeros(len(ver_1k))
15 | running_ver10k_avg = np.zeros(len(ver_10k))
16 | running_ver100k_avg = np.zeros(len(ver_100k))
17 |
18 | for i in range(len(cer_1k)):
19 | running_cer1k_avg[i] = np.mean(cer_1k[max(0, i-100):(i+1)])
20 | running_cer10k_avg[i] = np.mean(cer_10k[max(0, i-100):(i+1)])
21 | running_cer100k_avg[i] = np.mean(cer_100k[max(0, i-100):(i+1)])
22 | running_ver1k_avg[i] = np.mean(ver_1k[max(0, i-100):(i+1)])
23 | running_ver10k_avg[i] = np.mean(ver_10k[max(0, i-100):(i+1)])
24 | running_ver100k_avg[i] = np.mean(ver_100k[max(0, i-100):(i+1)])
25 |
26 |
27 | x_axis = np.arange(len(cer_1k))
28 | plt.plot(x_axis, running_cer1k_avg, 'r--', label='CER (1,000)')
29 | plt.plot(x_axis, running_ver1k_avg, 'b--', label='VER (1,000)')
30 | plt.xlabel('Episode')
31 | plt.ylabel('Avg Score')
32 | plt.legend(loc='lower right')
33 | plt.savefig('CER_vs_VER_1000_const_eps.png')
34 | plt.close()
35 |
36 | x_axis = np.arange(len(cer_10k))
37 | plt.plot(x_axis, running_cer10k_avg, 'r--', label='CER (10,000)')
38 | plt.plot(x_axis, running_ver10k_avg, 'b--', label='VER (10,000)')
39 | plt.xlabel('Episode')
40 | plt.ylabel('Avg Score')
41 | plt.legend(loc='lower right')
42 | plt.savefig('CER_vs_VER_10000_const_eps.png')
43 | plt.close()
44 |
45 | x_axis = np.arange(len(cer_100k))
46 | plt.plot(x_axis, running_cer100k_avg, 'r--', label='CER (100,000)')
47 | plt.plot(x_axis, running_ver100k_avg, 'b--', label='VER (100,000)')
48 | plt.xlabel('Episode')
49 | plt.ylabel('Avg Score')
50 | plt.legend(loc='lower right')
51 | plt.savefig('CER_vs_VER_100000_const_eps.png')
52 | plt.close()
53 |
--------------------------------------------------------------------------------
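The six running-average loops in plot.py compute the same trailing 100-episode window; a small helper (a refactoring sketch, not the repo's code) would remove the duplication:

import numpy as np

def running_avg(scores, window=100):
    # trailing mean over at most the last `window` episodes, matching the loops above
    out = np.zeros(len(scores))
    for i in range(len(scores)):
        out[i] = np.mean(scores[max(0, i - window):(i + 1)])
    return out

# e.g. running_cer1k_avg = running_avg(cer_1k)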
/ReinforcementLearning/DeepQLearning/archive/frame_stack_test.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | ACTION_DICT = {0: 'NOOP', 1: 'FIRE', 2:'RIGHT', 3:'LEFT'}
6 |
7 | def preprocess(observation):
8 | observation = observation / 255
9 | return np.mean(observation[30:,:], axis=2).reshape(180,160)
10 |
11 | def stack_frames(stacked_frames, frame, stack_size, actions, action):
12 | if stacked_frames is None:
13 | stacked_frames = np.zeros((*frame.shape, stack_size))
14 | actions = np.zeros(stack_size)
15 | for idx in range(stack_size):
16 | stacked_frames[:,:,idx] = frame
17 | else:
18 | stacked_frames[:,:,0:stack_size-1] = stacked_frames[:,:,1:]
19 | stacked_frames[:,:,stack_size-1] = frame
20 | actions[0:stack_size-1] = actions[1:]
21 | actions[stack_size-1] = action
22 | fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4)
23 |
24 | ax1.imshow(stacked_frames[:,:,0])
25 | ax1.set_title(ACTION_DICT[actions[0]])
26 | ax2.imshow(stacked_frames[:,:,1])
27 | ax2.set_title(ACTION_DICT[actions[1]])
28 | ax3.imshow(stacked_frames[:,:,2])
29 | ax3.set_title(ACTION_DICT[actions[2]])
30 | ax4.imshow(stacked_frames[:,:,3])
31 | ax4.set_title(ACTION_DICT[actions[3]])
32 | plt.show()
33 |
34 | return actions, stacked_frames
35 |
36 | if __name__ == '__main__':
37 | env = gym.make('Breakout-v0')
38 | stack_size = 4
39 |
40 | for i in range(10):
41 | done = False
42 | observation = env.reset()
43 | observation = preprocess(observation)
44 | stacked_frames = None
45 | actions=None
46 | actions, stacked_frames = stack_frames(stacked_frames, observation,
47 | stack_size, actions, 0)
48 | while not done:
49 | action = env.action_space.sample()
50 | observation_, reward, done, info = env.step(action)
51 | actions, stacked_frames_ = stack_frames(stacked_frames,
52 | preprocess(observation_), stack_size,
53 | actions, action)
54 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/archive/main_keras_dqn_pong.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from dqn_keras import Agent
3 | from utils import plotLearning, make_env
4 |
5 | if __name__ == '__main__':
6 | env = make_env('PongNoFrameskip-v4')
7 |
8 | num_games = 500
9 | load_checkpoint = False
10 | best_score = -21
11 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0001,
12 | input_dims=(4,80,80), n_actions=6, mem_size=25000,
13 | eps_min=0.02, batch_size=32, replace=1000, eps_dec=1e-5)
14 |
15 | if load_checkpoint:
16 | agent.load_models()
17 |
18 | filename = 'PongNoFrameskip-v4.png'
19 |
20 | scores, eps_history = [], []
21 | n_steps = 0
22 |
23 | for i in range(num_games):
24 | done = False
25 | observation = env.reset()
26 | score = 0
27 | while not done:
28 | action = agent.choose_action(observation)
29 | observation_, reward, done, info = env.step(action)
30 | n_steps += 1
31 | score += reward
32 | if not load_checkpoint:
33 | agent.store_transition(observation, action,
34 | reward, observation_, int(done))
35 | agent.learn()
36 | else:
37 | env.render()
38 | observation = observation_
39 |
40 | scores.append(score)
41 |
42 | avg_score = np.mean(scores[-100:])
43 | print('episode: ', i,'score: ', score,
44 | ' average score %.3f' % avg_score,
45 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps)
46 | if avg_score > best_score:
47 | agent.save_models()
48 | print('avg score %.2f better than best score %.2f, saving model' % (
49 | avg_score, best_score))
50 | best_score = avg_score
51 |
52 | eps_history.append(agent.epsilon)
53 |
54 | x = [i+1 for i in range(num_games)]
55 | plotLearning(x, scores, eps_history, filename)
56 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/archive/main_tf_dqn_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from simple_dqn_tf import DeepQNetwork, Agent
3 | from utils import plotLearning
4 | import numpy as np
5 | from gym import wrappers
6 | import matplotlib.pyplot as plt
7 |
8 |
9 | if __name__ == '__main__':
10 | env = gym.make('LunarLander-v2')
11 | lr = 0.0005
12 | n_games = 500
13 |
14 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=lr, input_dims=[8],
15 | n_actions=4, mem_size=1000000, n_games=n_games,
16 | batch_size=64)
17 |
18 | #load_checkpoint = True
19 | #if load_checkpoint:
20 | # agent.load_models()
21 |
22 | alpha = 'alpha' + str(lr)#.split('.')[1]
23 |
24 | filename = '0-lunar-lander-256x256-' + alpha + '-bs64-adam-faster_decay.png'
25 | scores = []
26 | eps_history = []
27 |
28 | score = 0
29 | env = wrappers.Monitor(env, "tmp/lunar-lander-4",
30 | video_callable=lambda episode_id: True, force=True)
31 |
32 | for i in range(n_games):
33 | done = False
34 | if i % 10 == 0 and i > 0:
35 | avg_score = np.mean(scores[max(0, i-10):(i+1)])
36 | print('episode: ', i,'score: ', score,
37 | ' average score %.3f' % avg_score,
38 | 'epsilon %.3f' % agent.epsilon)
39 | #agent.save_models()
40 | else:
41 | print('episode: ', i,'score: ', score)
42 |
43 | observation = env.reset()
44 | score = 0
45 | while not done:
46 | action = agent.choose_action(observation)
47 | observation_, reward, done, info = env.step(action)
48 | score += reward
49 | agent.store_transition(observation, action,
50 | reward, observation_, int(done))
51 | observation = observation_
52 | agent.learn()
53 |
54 | eps_history.append(agent.epsilon)
55 | scores.append(score)
56 |
57 | x = [i+1 for i in range(n_games)]
58 | plotLearning(x, scores, eps_history, filename)
59 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/archive/main_torch_dqn_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from simple_dqn_torch import DeepQNetwork, Agent
3 | from utils import plotLearning
4 | import numpy as np
5 | from gym import wrappers
6 |
7 | if __name__ == '__main__':
8 | env = gym.make('LunarLander-v2')
9 | brain = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4,
10 | input_dims=[8], alpha=0.003)
11 |
12 | scores = []
13 | eps_history = []
14 | num_games = 500
15 | score = 0
16 | # uncomment the line below to record every episode.
17 | #env = wrappers.Monitor(env, "tmp/space-invaders-1",
18 | #video_callable=lambda episode_id: True, force=True)
19 | for i in range(num_games):
20 | if i % 10 == 0 and i > 0:
21 | avg_score = np.mean(scores[max(0, i-10):(i+1)])
22 | print('episode: ', i,'score: ', score,
23 | ' average score %.3f' % avg_score,
24 | 'epsilon %.3f' % brain.EPSILON)
25 | else:
26 | print('episode: ', i,'score: ', score)
27 | eps_history.append(brain.EPSILON)
28 | done = False
29 | observation = env.reset()
30 | score = 0
31 | while not done:
32 | action = brain.chooseAction(observation)
33 | observation_, reward, done, info = env.step(action)
34 | score += reward
35 | brain.storeTransition(observation, action, reward, observation_,
36 | done)
37 | observation = observation_
38 | brain.learn()
39 |
40 | scores.append(score)
41 |
42 | x = [i+1 for i in range(num_games)]
43 | filename = str(num_games) + 'Games' + 'Gamma' + str(brain.GAMMA) + \
44 | 'Alpha' + str(brain.ALPHA) + 'Memory' + \
45 | str(brain.Q_eval.fc1_dims) + '-' + str(brain.Q_eval.fc2_dims) +'.png'
46 | plotLearning(x, scores, eps_history, filename)
47 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/archive/main_torch_dqn_space_invaders.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from torch_deep_q_model import DeepQNetwork, Agent
3 | from utils import plotLearning
4 | import numpy as np
5 | from gym import wrappers
6 |
7 | if __name__ == '__main__':
8 | env = gym.make('SpaceInvaders-v0')
9 | brain = Agent(gamma=0.95, epsilon=1.0,
10 | alpha=0.003, maxMemorySize=5000,
11 | replace=None)
12 | while brain.memCntr < brain.memSize:
13 | observation = env.reset()
14 | done = False
15 | while not done:
16 | # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire
17 | action = env.action_space.sample()
18 | observation_, reward, done, info = env.step(action)
19 | if done and info['ale.lives'] == 0:
20 | reward = -100
21 | brain.storeTransition(np.mean(observation[15:200,30:125], axis=2), action, reward,
22 | np.mean(observation_[15:200,30:125], axis=2))
23 | observation = observation_
24 | print('done initializing memory')
25 |
26 | scores = []
27 | epsHistory = []
28 | numGames = 50
29 | batch_size=32
30 | # uncomment the line below to record every episode.
31 | env = wrappers.Monitor(env, "tmp/space-invaders-1", video_callable=lambda episode_id: True, force=True)
32 | for i in range(numGames):
33 | print('starting game ', i+1, 'epsilon: %.4f' % brain.EPSILON)
34 | epsHistory.append(brain.EPSILON)
35 | done = False
36 | observation = env.reset()
37 | frames = [np.sum(observation[15:200,30:125], axis=2)]
38 | score = 0
39 | lastAction = 0
40 | while not done:
41 | if len(frames) == 3:
42 | action = brain.chooseAction(frames)
43 | frames = []
44 | else:
45 | action = lastAction
46 | observation_, reward, done, info = env.step(action)
47 | score += reward
48 | frames.append(np.sum(observation_[15:200,30:125], axis=2))
49 | if done and info['ale.lives'] == 0:
50 | reward = -100
51 | brain.storeTransition(np.mean(observation[15:200,30:125], axis=2), action, reward,
52 | np.mean(observation_[15:200,30:125], axis=2))
53 | observation = observation_
54 | brain.learn(batch_size)
55 | lastAction = action
56 | #env.render(
57 | scores.append(score)
58 | print('score:',score)
59 | x = [i+1 for i in range(numGames)]
60 | fileName = str(numGames) + 'Games' + 'Gamma' + str(brain.GAMMA) + \
61 | 'Alpha' + str(brain.ALPHA) + 'Memory' + str(brain.memSize)+ '.png'
62 | plotLearning(x, scores, epsHistory, fileName)
63 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/archive/q_eval.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/DeepQLearning/archive/q_eval.h5
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/archive/q_next.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/DeepQLearning/archive/q_next.h5
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/archive/torch_deep_q_model.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 |
7 | class DeepQNetwork(nn.Module):
8 | def __init__(self, ALPHA):
9 | super(DeepQNetwork, self).__init__()
10 | #self.conv1 = nn.Conv2d(3, 32, 8, stride=4, padding=1)
11 | self.conv1 = nn.Conv2d(1, 32, 8, stride=4, padding=1)
12 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
13 | self.conv3 = nn.Conv2d(64, 128, 3)
14 | #self.fc1 = nn.Linear(128*23*16, 512)
15 | self.fc1 = nn.Linear(128*19*8, 512)
16 | self.fc2 = nn.Linear(512, 6)
17 | #self.optimizer = optim.SGD(self.parameters(), lr=self.ALPHA, momentum=0.9)
18 | self.optimizer = optim.RMSprop(self.parameters(), lr=ALPHA)
19 | self.loss = nn.MSELoss()
20 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
21 | self.to(self.device)
22 |
23 | def forward(self, observation):
24 | observation = T.Tensor(observation).to(self.device)
25 | #observation = observation.view(-1, 3, 210, 160).to(self.device)
26 | observation = observation.view(-1, 1, 185, 95)
27 | observation = F.relu(self.conv1(observation))
28 | observation = F.relu(self.conv2(observation))
29 | observation = F.relu(self.conv3(observation))
30 | #observation = observation.view(-1, 128*23*16).to(self.device)
31 | observation = observation.view(-1, 128*19*8)
32 | observation = F.relu(self.fc1(observation))
33 | actions = self.fc2(observation)
34 | return actions
35 |
36 | class Agent(object):
37 | def __init__(self, gamma, epsilon, alpha,
38 | maxMemorySize, epsEnd=0.05,
39 | replace=10000, actionSpace=[0,1,2,3,4,5]):
40 | self.GAMMA = gamma
41 | self.EPSILON = epsilon
42 | self.EPS_END = epsEnd
43 | self.ALPHA = alpha
44 | self.actionSpace = actionSpace
45 | self.memSize = maxMemorySize
46 | self.steps = 0
47 | self.learn_step_counter = 0
48 | self.memory = []
49 | self.memCntr = 0
50 | self.replace_target_cnt = replace
51 | self.Q_eval = DeepQNetwork(alpha)
52 | self.Q_next = DeepQNetwork(alpha)
53 |
54 | def storeTransition(self, state, action, reward, state_):
55 | if self.memCntr < self.memSize:
56 | self.memory.append([state, action, reward, state_])
57 | else:
58 | self.memory[self.memCntr%self.memSize] = [state, action, reward, state_]
59 | self.memCntr += 1
60 |
61 | def chooseAction(self, observation):
62 | rand = np.random.random()
63 | actions = self.Q_eval.forward(observation)
64 | if rand < 1 - self.EPSILON:
65 | action = T.argmax(actions[1]).item()
66 | else:
67 | action = np.random.choice(self.actionSpace)
68 | self.steps += 1
69 | return action
70 |
71 | def learn(self, batch_size):
72 | self.Q_eval.optimizer.zero_grad()
73 | if self.replace_target_cnt is not None and \
74 | self.learn_step_counter % self.replace_target_cnt == 0:
75 | self.Q_next.load_state_dict(self.Q_eval.state_dict())
76 |
77 | if self.memCntr+batch_size < self.memSize:
78 | memStart = int(np.random.choice(range(self.memCntr)))
79 | else:
80 | memStart = int(np.random.choice(range(self.memSize-batch_size-1)))
81 | miniBatch=self.memory[memStart:memStart+batch_size]
82 | memory = np.array(miniBatch)
83 |
84 | # convert to list because memory is an array of numpy objects
85 | Qpred = self.Q_eval.forward(list(memory[:,0][:])).to(self.Q_eval.device)
86 | Qnext = self.Q_next.forward(list(memory[:,3][:])).to(self.Q_eval.device)
87 |
88 | maxA = T.argmax(Qnext, dim=1).to(self.Q_eval.device)
89 | rewards = T.Tensor(list(memory[:,2])).to(self.Q_eval.device)
90 | Qtarget = Qpred.clone()
91 | indices = np.arange(batch_size)
92 | Qtarget[indices,maxA] = rewards + self.GAMMA*T.max(Qnext[1])
93 |
94 | if self.steps > 500:
95 | if self.EPSILON - 1e-4 > self.EPS_END:
96 | self.EPSILON -= 1e-4
97 | else:
98 | self.EPSILON = self.EPS_END
99 |
100 | #Qpred.requires_grad_()
101 | loss = self.Q_eval.loss(Qtarget, Qpred).to(self.Q_eval.device)
102 | loss.backward()
103 | self.Q_eval.optimizer.step()
104 | self.learn_step_counter += 1
105 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/main_keras_ddqn_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import os
2 | # for keras the CUDA commands must come before importing the keras libraries
3 | os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
4 | os.environ['CUDA_VISIBLE_DEVICES'] = '0'
5 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
6 | import gym
7 | from gym import wrappers
8 | import numpy as np
9 | from ddqn_keras import DDQNAgent
10 | from utils import plotLearning
11 |
12 | if __name__ == '__main__':
13 | env = gym.make('LunarLander-v2')
14 | ddqn_agent = DDQNAgent(alpha=0.0005, gamma=0.99, n_actions=4, epsilon=1.0,
15 | batch_size=64, input_dims=8)
16 | n_games = 500
17 | #ddqn_agent.load_model()
18 | ddqn_scores = []
19 | eps_history = []
20 | #env = wrappers.Monitor(env, "tmp/lunar-lander-ddqn-2",
21 | # video_callable=lambda episode_id: True, force=True)
22 |
23 | for i in range(n_games):
24 | done = False
25 | score = 0
26 | observation = env.reset()
27 | while not done:
28 | action = ddqn_agent.choose_action(observation)
29 | observation_, reward, done, info = env.step(action)
30 | score += reward
31 | ddqn_agent.remember(observation, action, reward, observation_, int(done))
32 | observation = observation_
33 | ddqn_agent.learn()
34 | eps_history.append(ddqn_agent.epsilon)
35 |
36 | ddqn_scores.append(score)
37 |
38 | avg_score = np.mean(ddqn_scores[max(0, i-100):(i+1)])
39 | print('episode: ', i,'score: %.2f' % score,
40 | ' average score %.2f' % avg_score)
41 |
42 | if i % 10 == 0 and i > 0:
43 | ddqn_agent.save_model()
44 |
45 | filename = 'lunarlander-ddqn.png'
46 |
47 | x = [i+1 for i in range(n_games)]
48 | plotLearning(x, ddqn_scores, eps_history, filename)
49 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/main_keras_dqn_lunar_lander.py:
--------------------------------------------------------------------------------
1 | from simple_dqn_keras import Agent
2 | import numpy as np
3 | import gym
4 | from utils import plotLearning
5 | from gym import wrappers
6 |
7 | if __name__ == '__main__':
8 | env = gym.make('LunarLander-v2')
9 | lr = 0.0005
10 | n_games = 500
11 | agent = Agent(gamma=0.99, epsilon=0.0, alpha=lr, input_dims=8,
12 | n_actions=4, mem_size=1000000, batch_size=64, epsilon_end=0.0)
13 |
14 | agent.load_model()
15 | scores = []
16 | eps_history = []
17 |
18 | #env = wrappers.Monitor(env, "tmp/lunar-lander-6",
19 | # video_callable=lambda episode_id: True, force=True)
20 |
21 | for i in range(n_games):
22 | done = False
23 | score = 0
24 | observation = env.reset()
25 | while not done:
26 | action = agent.choose_action(observation)
27 | observation_, reward, done, info = env.step(action)
28 | score += reward
29 | agent.remember(observation, action, reward, observation_, int(done))
30 | observation = observation_
31 | agent.learn()
32 |
33 | eps_history.append(agent.epsilon)
34 | scores.append(score)
35 |
36 | avg_score = np.mean(scores[max(0, i-100):(i+1)])
37 | print('episode: ', i,'score: %.2f' % score,
38 | ' average score %.2f' % avg_score)
39 |
40 | if i % 10 == 0 and i > 0:
41 | agent.save_model()
42 |
43 | filename = 'lunarlander.png'
44 |
45 | x = [i+1 for i in range(n_games)]
46 | plotLearning(x, scores, eps_history, filename)
47 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/main_keras_dueling_dqn_lunar_lander.py:
--------------------------------------------------------------------------------
1 | from dueling_dqn_keras import Agent
2 | import numpy as np
3 | import gym
4 | from utils import plotLearning
5 |
6 | if __name__ == '__main__':
7 | env = gym.make('LunarLander-v2')
8 | n_games = 400
9 | agent = Agent(gamma=0.99, epsilon=1, lr=1e-3, input_dims=[8],
10 | epsilon_dec=1e-3, mem_size=100000, batch_size=64, eps_end=0.01,
11 | fc1_dims=128, fc2_dims=128, replace=100, n_actions=4)
12 |
13 | scores, eps_history = [], []
14 |
15 | for i in range(n_games):
16 | done = False
17 | score = 0
18 | observation = env.reset()
19 | while not done:
20 | action = agent.choose_action(observation)
21 | observation_, reward, done, info = env.step(action)
22 | score += reward
23 | agent.store_transition(observation, action, reward, observation_, done)
24 | observation = observation_
25 | agent.learn()
26 | eps_history.append(agent.epsilon)
27 | scores.append(score)
28 |
29 | avg_score = np.mean(scores[-100:])
30 | print('episode ', i, 'score %.1f' % score,
31 | 'average score %.1f' % avg_score,
32 | 'epsilon %.2f' % agent.epsilon)
33 |
34 | filename='keras_lunar_lander.png'
35 | x = [i+1 for i in range(n_games)]
36 | plotLearning(x, scores, eps_history, filename)
37 |
38 |
39 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/main_tf2_dqn_lunar_lander.py:
--------------------------------------------------------------------------------
1 | from simple_dqn_tf2 import Agent
2 | import numpy as np
3 | import gym
4 | from utils import plotLearning
5 | import tensorflow as tf
6 |
7 | if __name__ == '__main__':
8 | tf.compat.v1.disable_eager_execution()
9 | env = gym.make('LunarLander-v2')
10 | lr = 0.001
11 | n_games = 500
12 | agent = Agent(gamma=0.99, epsilon=1.0, lr=lr,
13 | input_dims=env.observation_space.shape,
14 | n_actions=env.action_space.n, mem_size=1000000, batch_size=64,
15 | epsilon_end=0.01)
16 | scores = []
17 | eps_history = []
18 |
19 | for i in range(n_games):
20 | done = False
21 | score = 0
22 | observation = env.reset()
23 | while not done:
24 | action = agent.choose_action(observation)
25 | observation_, reward, done, info = env.step(action)
26 | score += reward
27 | agent.store_transition(observation, action, reward, observation_, done)
28 | observation = observation_
29 | agent.learn()
30 | eps_history.append(agent.epsilon)
31 | scores.append(score)
32 |
33 | avg_score = np.mean(scores[-100:])
34 | print('episode: ', i, 'score %.2f' % score,
35 | 'average_score %.2f' % avg_score,
36 | 'epsilon %.2f' % agent.epsilon)
37 |
38 | filename = 'lunarlander_tf2.png'
39 | x = [i+1 for i in range(n_games)]
40 | plotLearning(x, scores, eps_history, filename)
41 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/main_tf2_dueling_ddqn_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from dueling_ddqn_tf2 import Agent
4 | from utils import plotLearning
5 |
6 | if __name__ == '__main__':
7 | env = gym.make('LunarLander-v2')
8 | agent = Agent(lr=0.0005, gamma=0.99, n_actions=4, epsilon=1.0,
9 | batch_size=64, input_dims=[8])
10 | n_games = 500
11 | ddqn_scores = []
12 | eps_history = []
13 |
14 | for i in range(n_games):
15 | done = False
16 | score = 0
17 | observation = env.reset()
18 | while not done:
19 | action = agent.choose_action(observation)
20 | observation_, reward, done, info = env.step(action)
21 | score += reward
22 | agent.store_transition(observation, action, reward, observation_, done)
23 | observation = observation_
24 | agent.learn()
25 | eps_history.append(agent.epsilon)
26 |
27 | ddqn_scores.append(score)
28 |
29 | avg_score = np.mean(ddqn_scores[-100:])
30 | print('episode: ', i,'score: %.2f' % score,
31 | ' average score %.2f' % avg_score)
32 |
33 | filename = 'lunarlander-dueling_ddqn.png'
34 |
35 | x = [i+1 for i in range(n_games)]
36 | plotLearning(x, ddqn_scores, eps_history, filename)
37 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/main_tf_dqn_breakout.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | from dqn_tf import DeepQNetwork, Agent
4 | from utils import plotLearning
5 | import numpy as np
6 | from gym import wrappers
7 | import matplotlib.pyplot as plt
8 |
9 | def preprocess(observation):
10 | observation = observation / 255
11 | return np.mean(observation[30:,:], axis=2).reshape(180,160,1)
12 |
13 | def stack_frames(stacked_frames, frame, buffer_size):
14 | if stacked_frames is None:
15 | stacked_frames = np.zeros((buffer_size, *frame.shape))
16 | for idx, _ in enumerate(stacked_frames):
17 | stacked_frames[idx,:] = frame
18 | else:
19 | stacked_frames[0:buffer_size-1,:] = stacked_frames[1:,:]
20 | stacked_frames[buffer_size-1, :] = frame
21 |
22 | stacked_frames = stacked_frames.reshape(1, *frame.shape[0:2], buffer_size)
23 |
24 | return stacked_frames
25 |
26 |
27 | if __name__ == '__main__':
28 | #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
29 | #os.environ["CUDA_VISIBLE_DEVICES"] = "0"
30 | #os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
31 |
32 | env = gym.make('Breakout-v0')
33 | load_checkpoint = False
34 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.000025, input_dims=(180,160,4),
35 | n_actions=3, mem_size=25000, batch_size=64)
36 | if load_checkpoint:
37 | agent.load_models()
38 | filename = 'breakout-alpha0p000025-gamma0p9-only-one-fc-2.png'
39 | scores = []
40 | eps_history = []
41 | numGames = 50000
42 | stack_size = 4
43 | score = 0
44 | # uncomment the line below to record every episode.
45 | #env = wrappers.Monitor(env, "tmp/breakout-0",
46 | # video_callable=lambda episode_id: True, force=True)
47 | """
48 | print("Loading up the agent's memory with random gameplay")
49 |
50 | while agent.mem_cntr < 25000:
51 | done = False
52 | observation = env.reset()
53 | observation = preprocess(observation)
54 | stacked_frames = None
55 | observation = stack_frames(stacked_frames, observation, stack_size)
56 | while not done:
57 | action = np.random.choice([0, 1, 2])
58 | action += 1
59 | observation_, reward, done, info = env.step(action)
60 | observation_ = stack_frames(stacked_frames,
61 | preprocess(observation_), stack_size)
62 | action -= 1
63 | agent.store_transition(observation, action,
64 | reward, observation_, int(done))
65 | observation = observation_
66 | print("Done with random gameplay. Game on.")
67 | """
68 | n_steps = 0
69 | for i in range(numGames):
70 | done = False
71 | #if i % 100 == 0 and i > 0:
72 | # x = [j+1 for j in range(i)]
73 |
74 | # plotLearning(x, scores, eps_history, filename)
75 | observation = env.reset()
76 | observation = preprocess(observation)
77 | stacked_frames = None
78 | observation = stack_frames(stacked_frames, observation, stack_size)
79 | score = 0
80 | while not done:
81 | action = agent.choose_action(observation)
82 | action += 1
83 | observation_, reward, done, info = env.step(action)
84 | n_steps += 1
85 | observation_ = stack_frames(stacked_frames,
86 | preprocess(observation_), stack_size)
87 | score += reward
88 | action -= 1
89 | agent.store_transition(observation, action,
90 | reward, observation_, int(done))
91 | observation = observation_
92 | if n_steps % 4 == 0:
93 | agent.learn()
94 | if i % 12 == 0 and i > 0:
95 | avg_score = np.mean(scores[max(0, i-12):(i+1)])
96 | print('episode: ', i,'score: ', score,
97 | ' average score %.3f' % avg_score,
98 | 'epsilon %.3f' % agent.epsilon)
99 | agent.save_models()
100 | else:
101 | print('episode: ', i,'score: ', score)
102 | eps_history.append(agent.epsilon)
103 | scores.append(score)
104 | x = [i+1 for i in range(numGames)]
105 | plotLearning(x, scores, eps_history, filename)
106 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/main_torch_dqn_lunar_lander_2020.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from simple_dqn_torch_2020 import Agent
3 | from utils import plotLearning
4 | import numpy as np
5 |
6 | if __name__ == '__main__':
7 | env = gym.make('LunarLander-v2')
8 | agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4, eps_end=0.01,
9 | input_dims=[8], lr=0.001)
10 | scores, eps_history = [], []
11 | n_games = 500
12 |
13 | for i in range(n_games):
14 | score = 0
15 | done = False
16 | observation = env.reset()
17 | while not done:
18 | action = agent.choose_action(observation)
19 | observation_, reward, done, info = env.step(action)
20 | score += reward
21 | agent.store_transition(observation, action, reward,
22 | observation_, done)
23 | agent.learn()
24 | observation = observation_
25 | scores.append(score)
26 | eps_history.append(agent.epsilon)
27 |
28 | avg_score = np.mean(scores[-100:])
29 |
30 | print('episode ', i, 'score %.2f' % score,
31 | 'average score %.2f' % avg_score,
32 | 'epsilon %.2f' % agent.epsilon)
33 | x = [i+1 for i in range(n_games)]
34 | filename = 'lunar_lander.png'
35 | plotLearning(x, scores, eps_history, filename)
36 |
37 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/main_torch_dueling_ddqn_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from dueling_ddqn_torch import Agent
4 | from utils import plotLearning
5 |
6 | if __name__ == '__main__':
7 | env = gym.make('LunarLander-v2')
8 | num_games = 250
9 | load_checkpoint = False
10 |
11 | agent = Agent(gamma=0.99, epsilon=1.0, lr=5e-4,
12 | input_dims=[8], n_actions=4, mem_size=100000, eps_min=0.01,
13 | batch_size=64, eps_dec=1e-3, replace=100)
14 |
15 | if load_checkpoint:
16 | agent.load_models()
17 |
18 | filename = 'LunarLander-Dueling-DDQN-512-Adam-lr0005-replace100.png'
19 | scores = []
20 | eps_history = []
21 | n_steps = 0
22 |
23 | for i in range(num_games):
24 | done = False
25 | observation = env.reset()
26 | score = 0
27 |
28 | while not done:
29 | action = agent.choose_action(observation)
30 | observation_, reward, done, info = env.step(action)
31 | score += reward
32 | agent.store_transition(observation, action,
33 | reward, observation_, int(done))
34 | agent.learn()
35 |
36 | observation = observation_
37 |
38 | scores.append(score)
39 | avg_score = np.mean(scores[max(0, i-100):(i+1)])
40 | print('episode: ', i,'score %.1f ' % score,
41 | ' average score %.1f' % avg_score,
42 | 'epsilon %.2f' % agent.epsilon)
43 | if i > 0 and i % 10 == 0:
44 | agent.save_models()
45 |
46 | eps_history.append(agent.epsilon)
47 |
48 | x = [i+1 for i in range(num_games)]
49 | plotLearning(x, scores, eps_history, filename)
50 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/main_torch_dueling_dqn_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import gym, time
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from dueling_dqn_torch import Agent
5 | from utils import plotLearning
6 |
7 | if __name__ == '__main__':
8 | env = gym.make('LunarLander-v2')
9 | num_games = 1000
10 | load_checkpoint = False
11 |
12 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=5e-4,
13 | input_dims=[8], n_actions=4, mem_size=100000, eps_min=0.01,
14 | batch_size=64, eps_dec=1e-3, replace=100)
15 |
16 | if load_checkpoint:
17 | agent.load_models()
18 |
19 | filename = 'LunarLander-Dueling-128-128-Adam-lr0005-replace100.png'
20 | scores = []
21 | eps_history = []
22 | n_steps = 0
23 |
24 | for i in range(num_games):
25 | done = False
26 | observation = env.reset()
27 | score = 0
28 |
29 | while not done:
30 | action = agent.choose_action(observation)
31 | observation_, reward, done, info = env.step(action)
32 | n_steps += 1
33 | score += reward
34 | agent.store_transition(observation, action,
35 | reward, observation_, int(done))
36 | agent.learn()
37 |
38 | observation = observation_
39 |
40 |
41 | scores.append(score)
42 | avg_score = np.mean(scores[max(0, i-100):(i+1)])
43 | print('episode: ', i,'score %.1f ' % score,
44 | ' average score %.1f' % avg_score,
45 | 'epsilon %.2f' % agent.epsilon)
46 | #if i > 0 and i % 10 == 0:
47 | # agent.save_models()
48 |
49 | eps_history.append(agent.epsilon)
50 |
51 | x = [i+1 for i in range(num_games)]
52 | plotLearning(x, scores, eps_history, filename)
53 |
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/simple_dqn_keras.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Dense, Activation
2 | from keras.models import Sequential, load_model
3 | from keras.optimizers import Adam
4 | import numpy as np
5 |
6 | class ReplayBuffer(object):
7 | def __init__(self, max_size, input_shape, n_actions, discrete=False):
8 | self.mem_size = max_size
9 | self.mem_cntr = 0
10 | self.discrete = discrete
11 | self.state_memory = np.zeros((self.mem_size, input_shape))
12 | self.new_state_memory = np.zeros((self.mem_size, input_shape))
13 | dtype = np.int8 if self.discrete else np.float32
14 | self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype)
15 | self.reward_memory = np.zeros(self.mem_size)
16 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)
17 |
18 | def store_transition(self, state, action, reward, state_, done):
19 | index = self.mem_cntr % self.mem_size
20 | self.state_memory[index] = state
21 | self.new_state_memory[index] = state_
22 | # store one hot encoding of actions, if appropriate
23 | if self.discrete:
24 | actions = np.zeros(self.action_memory.shape[1])
25 | actions[action] = 1.0
26 | self.action_memory[index] = actions
27 | else:
28 | self.action_memory[index] = action
29 | self.reward_memory[index] = reward
30 | self.terminal_memory[index] = 1 - done
31 | self.mem_cntr += 1
32 |
33 | def sample_buffer(self, batch_size):
34 | max_mem = min(self.mem_cntr, self.mem_size)
35 | batch = np.random.choice(max_mem, batch_size)
36 |
37 | states = self.state_memory[batch]
38 | actions = self.action_memory[batch]
39 | rewards = self.reward_memory[batch]
40 | states_ = self.new_state_memory[batch]
41 | terminal = self.terminal_memory[batch]
42 |
43 | return states, actions, rewards, states_, terminal
44 |
45 | def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
46 | model = Sequential([
47 | Dense(fc1_dims, input_shape=(input_dims,)),
48 | Activation('relu'),
49 | Dense(fc2_dims),
50 | Activation('relu'),
51 | Dense(n_actions)])
52 |
53 | model.compile(optimizer=Adam(lr=lr), loss='mse')
54 |
55 | return model
56 |
57 | class Agent(object):
58 | def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
59 | input_dims, epsilon_dec=0.996, epsilon_end=0.01,
60 | mem_size=1000000, fname='dqn_model.h5'):
61 | self.action_space = [i for i in range(n_actions)]
62 | self.gamma = gamma
63 | self.epsilon = epsilon
64 | self.epsilon_dec = epsilon_dec
65 | self.epsilon_min = epsilon_end
66 | self.batch_size = batch_size
67 | self.model_file = fname
68 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions,
69 | discrete=True)
70 | self.q_eval = build_dqn(alpha, n_actions, input_dims, 256, 256)
71 |
72 | def remember(self, state, action, reward, new_state, done):
73 | self.memory.store_transition(state, action, reward, new_state, done)
74 |
75 | def choose_action(self, state):
76 | state = state[np.newaxis, :]
77 | rand = np.random.random()
78 | if rand < self.epsilon:
79 | action = np.random.choice(self.action_space)
80 | else:
81 | actions = self.q_eval.predict(state)
82 | action = np.argmax(actions)
83 |
84 | return action
85 |
86 | def learn(self):
87 | if self.memory.mem_cntr > self.batch_size:
88 | state, action, reward, new_state, done = \
89 | self.memory.sample_buffer(self.batch_size)
90 |
91 | action_values = np.array(self.action_space, dtype=np.int8)
92 | action_indices = np.dot(action, action_values)
93 |
94 | q_eval = self.q_eval.predict(state)
95 |
96 | q_next = self.q_eval.predict(new_state)
97 |
98 | q_target = q_eval.copy()
99 |
100 | batch_index = np.arange(self.batch_size, dtype=np.int32)
101 |
102 | q_target[batch_index, action_indices] = reward + \
103 | self.gamma*np.max(q_next, axis=1)*done
104 |
105 | _ = self.q_eval.fit(state, q_target, verbose=0)
106 |
107 | self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon > \
108 | self.epsilon_min else self.epsilon_min
109 |
110 | def save_model(self):
111 | self.q_eval.save(self.model_file)
112 |
113 | def load_model(self):
114 | self.q_eval = load_model(self.model_file)
115 |
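Note (not part of the repository file above): a minimal sketch of how this Keras Agent can be driven in a training loop, mirroring the loop pattern used by the main scripts in this folder. The environment id ('LunarLander-v2'), hyperparameters, and episode count below are illustrative assumptions only.

# Hedged usage sketch for the Keras DQN Agent defined above.
import gym
import numpy as np

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')          # assumed environment
    agent = Agent(alpha=0.0005, gamma=0.99, n_actions=4, epsilon=1.0,
                  batch_size=64, input_dims=8)
    scores = []
    for i in range(500):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, int(done))
            agent.learn()
            score += reward
            observation = observation_
        scores.append(score)
        print('episode', i, 'score %.1f' % score,
              'avg (last 100) %.1f' % np.mean(scores[-100:]))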
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/simple_dqn_tf2.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from tensorflow import keras
4 | from tensorflow.keras.optimizers import Adam
5 | from tensorflow.keras.models import load_model
6 |
7 | class ReplayBuffer():
8 | def __init__(self, max_size, input_dims):
9 | self.mem_size = max_size
10 | self.mem_cntr = 0
11 |
12 | self.state_memory = np.zeros((self.mem_size, *input_dims),
13 | dtype=np.float32)
14 | self.new_state_memory = np.zeros((self.mem_size, *input_dims),
15 | dtype=np.float32)
16 | self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
17 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
18 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)
19 |
20 | def store_transition(self, state, action, reward, state_, done):
21 | index = self.mem_cntr % self.mem_size
22 | self.state_memory[index] = state
23 | self.new_state_memory[index] = state_
24 | self.reward_memory[index] = reward
25 | self.action_memory[index] = action
26 | self.terminal_memory[index] = 1 - int(done)
27 | self.mem_cntr += 1
28 |
29 | def sample_buffer(self, batch_size):
30 | max_mem = min(self.mem_cntr, self.mem_size)
31 | batch = np.random.choice(max_mem, batch_size, replace=False)
32 |
33 | states = self.state_memory[batch]
34 | states_ = self.new_state_memory[batch]
35 | rewards = self.reward_memory[batch]
36 | actions = self.action_memory[batch]
37 | terminal = self.terminal_memory[batch]
38 |
39 | return states, actions, rewards, states_, terminal
40 |
41 | def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
42 | model = keras.Sequential([
43 | keras.layers.Dense(fc1_dims, activation='relu'),
44 | keras.layers.Dense(fc2_dims, activation='relu'),
45 | keras.layers.Dense(n_actions, activation=None)])
46 | model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
47 |
48 | return model
49 |
50 | class Agent():
51 | def __init__(self, lr, gamma, n_actions, epsilon, batch_size,
52 | input_dims, epsilon_dec=1e-3, epsilon_end=0.01,
53 | mem_size=1000000, fname='dqn_model.h5'):
54 | self.action_space = [i for i in range(n_actions)]
55 | self.gamma = gamma
56 | self.epsilon = epsilon
57 | self.eps_dec = epsilon_dec
58 | self.eps_min = epsilon_end
59 | self.batch_size = batch_size
60 | self.model_file = fname
61 | self.memory = ReplayBuffer(mem_size, input_dims)
62 | self.q_eval = build_dqn(lr, n_actions, input_dims, 256, 256)
63 |
64 | def store_transition(self, state, action, reward, new_state, done):
65 | self.memory.store_transition(state, action, reward, new_state, done)
66 |
67 | def choose_action(self, observation):
68 | if np.random.random() < self.epsilon:
69 | action = np.random.choice(self.action_space)
70 | else:
71 | state = np.array([observation])
72 | actions = self.q_eval.predict(state)
73 |
74 | action = np.argmax(actions)
75 |
76 | return action
77 |
78 | def learn(self):
79 | if self.memory.mem_cntr < self.batch_size:
80 | return
81 |
82 | states, actions, rewards, states_, dones = \
83 | self.memory.sample_buffer(self.batch_size)
84 |
85 | q_eval = self.q_eval.predict(states)
86 | q_next = self.q_eval.predict(states_)
87 |
88 |
89 | q_target = np.copy(q_eval)
90 | batch_index = np.arange(self.batch_size, dtype=np.int32)
91 |
92 | q_target[batch_index, actions] = rewards + \
93 | self.gamma * np.max(q_next, axis=1)*dones
94 |
95 |
96 | self.q_eval.train_on_batch(states, q_target)
97 |
98 | self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \
99 | self.eps_min else self.eps_min
100 |
101 | def save_model(self):
102 | self.q_eval.save(self.model_file)
103 |
104 |
105 | def load_model(self):
106 | self.q_eval = load_model(self.model_file)
107 |
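Note: the learn() method above implements the standard one-step Q-learning target. With the buffer storing $d_j = 1 - \text{done}_j$, the entry written into q_target for the taken action is

$$y_j = r_j + \gamma\, d_j \max_{a'} Q_\theta(s'_j, a'),$$

and train_on_batch minimizes the mean squared error between $Q_\theta(s_j,\cdot)$ and the otherwise unchanged target matrix, which reduces to $(y_j - Q_\theta(s_j, a_j))^2$ on the selected actions.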
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/simple_dqn_torch_2020.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 |
7 |
8 | class DeepQNetwork(nn.Module):
9 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims,
10 | n_actions):
11 | super(DeepQNetwork, self).__init__()
12 | self.input_dims = input_dims
13 | self.fc1_dims = fc1_dims
14 | self.fc2_dims = fc2_dims
15 | self.n_actions = n_actions
16 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
17 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
18 | self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
19 |
20 | self.optimizer = optim.Adam(self.parameters(), lr=lr)
21 | self.loss = nn.MSELoss()
22 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
23 | self.to(self.device)
24 |
25 | def forward(self, state):
26 | x = F.relu(self.fc1(state))
27 | x = F.relu(self.fc2(x))
28 | actions = self.fc3(x)
29 |
30 | return actions
31 |
32 |
33 | class Agent:
34 | def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
35 | max_mem_size=100000, eps_end=0.05, eps_dec=5e-4):
36 | self.gamma = gamma
37 | self.epsilon = epsilon
38 | self.eps_min = eps_end
39 | self.eps_dec = eps_dec
40 | self.lr = lr
41 | self.action_space = [i for i in range(n_actions)]
42 | self.mem_size = max_mem_size
43 | self.batch_size = batch_size
44 | self.mem_cntr = 0
45 | self.iter_cntr = 0
46 | self.replace_target = 100
47 |
48 | self.Q_eval = DeepQNetwork(lr, n_actions=n_actions,
49 | input_dims=input_dims,
50 | fc1_dims=256, fc2_dims=256)
51 | self.state_memory = np.zeros((self.mem_size, *input_dims),
52 | dtype=np.float32)
53 | self.new_state_memory = np.zeros((self.mem_size, *input_dims),
54 | dtype=np.float32)
55 | self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
56 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
 57 |         self.terminal_memory = np.zeros(self.mem_size, dtype=bool)  # np.bool is removed in newer NumPy
58 |
59 | def store_transition(self, state, action, reward, state_, terminal):
60 | index = self.mem_cntr % self.mem_size
61 | self.state_memory[index] = state
62 | self.new_state_memory[index] = state_
63 | self.reward_memory[index] = reward
64 | self.action_memory[index] = action
65 | self.terminal_memory[index] = terminal
66 |
67 | self.mem_cntr += 1
68 |
69 | def choose_action(self, observation):
70 | if np.random.random() > self.epsilon:
 71 |             state = T.tensor(np.array([observation]), dtype=T.float).to(self.Q_eval.device)
72 | actions = self.Q_eval.forward(state)
73 | action = T.argmax(actions).item()
74 | else:
75 | action = np.random.choice(self.action_space)
76 |
77 | return action
78 |
79 | def learn(self):
80 | if self.mem_cntr < self.batch_size:
81 | return
82 |
83 | self.Q_eval.optimizer.zero_grad()
84 |
85 | max_mem = min(self.mem_cntr, self.mem_size)
86 |
87 | batch = np.random.choice(max_mem, self.batch_size, replace=False)
88 | batch_index = np.arange(self.batch_size, dtype=np.int32)
89 |
90 | state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)
91 | new_state_batch = T.tensor(
92 | self.new_state_memory[batch]).to(self.Q_eval.device)
93 | action_batch = self.action_memory[batch]
94 | reward_batch = T.tensor(
95 | self.reward_memory[batch]).to(self.Q_eval.device)
96 | terminal_batch = T.tensor(
97 | self.terminal_memory[batch]).to(self.Q_eval.device)
98 |
99 | q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch]
100 | q_next = self.Q_eval.forward(new_state_batch)
101 | q_next[terminal_batch] = 0.0
102 |
103 | q_target = reward_batch + self.gamma*T.max(q_next, dim=1)[0]
104 |
105 | loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
106 | loss.backward()
107 | self.Q_eval.optimizer.step()
108 |
109 | self.iter_cntr += 1
110 | self.epsilon = self.epsilon - self.eps_dec \
111 | if self.epsilon > self.eps_min else self.eps_min
112 |
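Note (not part of the repository file above): a small smoke-test sketch for the PyTorch agent. The dimensions and random transitions are assumptions chosen only to exercise store_transition(), choose_action() and learn().

# Hedged sketch: fill the buffer with random transitions and run one update.
import numpy as np

if __name__ == '__main__':
    agent = Agent(gamma=0.99, epsilon=1.0, lr=1e-3, input_dims=[8],
                  batch_size=64, n_actions=4)
    for _ in range(64):
        s = np.random.randn(8).astype(np.float32)
        s_ = np.random.randn(8).astype(np.float32)
        a = np.random.randint(4)
        agent.store_transition(s, a, 1.0, s_, False)
    print('sampled action:', agent.choose_action(np.zeros(8, dtype=np.float32)))
    agent.learn()
    print('epsilon after one update: %.4f' % agent.epsilon)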
--------------------------------------------------------------------------------
/ReinforcementLearning/DeepQLearning/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import gym
4 |
5 | def plotLearning(x, scores, epsilons, filename, lines=None):
6 | fig=plt.figure()
7 | ax=fig.add_subplot(111, label="1")
8 | ax2=fig.add_subplot(111, label="2", frame_on=False)
9 |
10 | ax.plot(x, epsilons, color="C0")
11 | ax.set_xlabel("Game", color="C0")
12 | ax.set_ylabel("Epsilon", color="C0")
13 | ax.tick_params(axis='x', colors="C0")
14 | ax.tick_params(axis='y', colors="C0")
15 |
16 | N = len(scores)
17 | running_avg = np.empty(N)
18 | for t in range(N):
19 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)])
20 |
21 | ax2.scatter(x, running_avg, color="C1")
22 | #ax2.xaxis.tick_top()
23 | ax2.axes.get_xaxis().set_visible(False)
24 | ax2.yaxis.tick_right()
25 | #ax2.set_xlabel('x label 2', color="C1")
26 | ax2.set_ylabel('Score', color="C1")
27 | #ax2.xaxis.set_label_position('top')
28 | ax2.yaxis.set_label_position('right')
29 | #ax2.tick_params(axis='x', colors="C1")
30 | ax2.tick_params(axis='y', colors="C1")
31 |
32 | if lines is not None:
33 | for line in lines:
34 | plt.axvline(x=line)
35 |
36 | plt.savefig(filename)
37 |
38 | class SkipEnv(gym.Wrapper):
39 | def __init__(self, env=None, skip=4):
40 | super(SkipEnv, self).__init__(env)
41 | self._skip = skip
42 |
43 | def step(self, action):
44 | t_reward = 0.0
45 | done = False
46 | for _ in range(self._skip):
47 | obs, reward, done, info = self.env.step(action)
48 | t_reward += reward
49 | if done:
50 | break
51 | return obs, t_reward, done, info
52 |
53 | def reset(self):
54 | self._obs_buffer = []
55 | obs = self.env.reset()
56 | self._obs_buffer.append(obs)
57 | return obs
58 |
59 | class PreProcessFrame(gym.ObservationWrapper):
60 | def __init__(self, env=None):
61 | super(PreProcessFrame, self).__init__(env)
62 | self.observation_space = gym.spaces.Box(low=0, high=255,
63 | shape=(80,80,1), dtype=np.uint8)
64 | def observation(self, obs):
65 | return PreProcessFrame.process(obs)
66 |
67 | @staticmethod
68 | def process(frame):
69 |
70 | new_frame = np.reshape(frame, frame.shape).astype(np.float32)
71 |
72 | new_frame = 0.299*new_frame[:,:,0] + 0.587*new_frame[:,:,1] + \
73 | 0.114*new_frame[:,:,2]
74 |
75 | new_frame = new_frame[35:195:2, ::2].reshape(80,80,1)
76 |
77 | return new_frame.astype(np.uint8)
78 |
79 | class MoveImgChannel(gym.ObservationWrapper):
80 | def __init__(self, env):
81 | super(MoveImgChannel, self).__init__(env)
82 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0,
83 | shape=(self.observation_space.shape[-1],
84 | self.observation_space.shape[0],
85 | self.observation_space.shape[1]),
86 | dtype=np.float32)
87 |
88 | def observation(self, observation):
89 | return np.moveaxis(observation, 2, 0)
90 |
91 | class ScaleFrame(gym.ObservationWrapper):
92 | def observation(self, obs):
93 | return np.array(obs).astype(np.float32) / 255.0
94 |
95 | class BufferWrapper(gym.ObservationWrapper):
96 | def __init__(self, env, n_steps):
97 | super(BufferWrapper, self).__init__(env)
98 | self.observation_space = gym.spaces.Box(
99 | env.observation_space.low.repeat(n_steps, axis=0),
100 | env.observation_space.high.repeat(n_steps, axis=0),
101 | dtype=np.float32)
102 |
103 | def reset(self):
104 | self.buffer = np.zeros_like(self.observation_space.low, dtype=np.float32)
105 | return self.observation(self.env.reset())
106 |
107 | def observation(self, observation):
108 | self.buffer[:-1] = self.buffer[1:]
109 | self.buffer[-1] = observation
110 | return self.buffer
111 |
112 | def make_env(env_name):
113 | env = gym.make(env_name)
114 | env = SkipEnv(env)
115 | env = PreProcessFrame(env)
116 | env = MoveImgChannel(env)
117 | env = BufferWrapper(env, 4)
118 | return ScaleFrame(env)
119 |
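Note (not part of the repository file above): a minimal usage sketch for make_env. The Atari id below is an assumption and requires a gym installation with the Atari ROMs; the wrapper stack skips frames, converts to grayscale 80x80, moves the channel axis, stacks 4 frames, and scales to [0, 1].

# Hedged sketch: inspect the observation produced by the wrapper stack above.
if __name__ == '__main__':
    env = make_env('PongNoFrameskip-v4')   # assumed env id
    obs = env.reset()
    print(obs.shape, obs.dtype)            # expected: (4, 80, 80) float32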
--------------------------------------------------------------------------------
/ReinforcementLearning/Fundamentals/acrobot.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | from gym import wrappers
5 | import pickle
6 |
7 | theta_space = np.linspace(-1, 1, 10)
8 | theta_dot_space = np.linspace(-10, 10, 10)
9 |
10 | def get_state(observation):
11 | cos_theta1, sin_theta1, cos_theta2, sin_theta2, theta1_dot, theta2_dot = \
12 | observation
13 | c_th1 = int(np.digitize(cos_theta1, theta_space))
14 | s_th1 = int(np.digitize(sin_theta1, theta_space))
15 | c_th2 = int(np.digitize(cos_theta2, theta_space))
16 | s_th2 = int(np.digitize(sin_theta2, theta_space))
17 | th1_dot = int(np.digitize(theta1_dot, theta_dot_space))
18 | th2_dot = int(np.digitize(theta2_dot, theta_dot_space))
19 |
 20 |     return (c_th1, s_th1, c_th2, s_th2, th1_dot, th2_dot)
21 |
22 | def maxAction(Q, state, actions=[0, 1, 2]):
23 | values = np.array([Q[state,a] for a in actions])
24 | action = np.argmax(values)
25 |
26 | return action
27 |
28 | if __name__ == '__main__':
29 | env = gym.make('Acrobot-v1')
30 | n_games = 100
31 | alpha = 0.1
32 | gamma = 0.99
33 | eps = 0
34 |
35 | action_space = [0, 1, 2]
36 |
37 | states = []
38 | for c1 in range(11):
39 | for s1 in range(11):
40 | for c2 in range(11):
41 | for s2 in range(11):
42 | for dot1 in range(11):
43 | for dot2 in range(11):
44 | states.append((c1, s1, c2, s2, dot1, dot2))
45 | """
46 | Q = {}
47 | for state in states:
48 | for action in action_space:
49 | Q[state, action] = 0
50 | """
51 | pickle_in = open('acrobot.pkl', 'rb')
52 | Q = pickle.load(pickle_in)
53 | env = wrappers.Monitor(env, "tmp/acrobot", video_callable=lambda episode_id: True, force=True)
54 | eps_rewards = 0
55 | total_rewards = np.zeros(n_games)
56 | for i in range(n_games):
57 | if i % 1 == 0:
58 | print('episode ', i, 'score ', eps_rewards, 'eps', eps)
59 | observation = env.reset()
60 | state = get_state(observation)
61 | done = False
62 | action = env.action_space.sample() if np.random.random() < eps else \
63 | maxAction(Q, state)
64 | eps_rewards = 0
65 | while not done:
66 | """
67 | print(observation)
68 | action = env.action_space.sample()
69 | """
70 | observation_, reward, done, info = env.step(action)
71 | state_ = get_state(observation_)
72 | action_ = maxAction(Q, state_)
73 | eps_rewards += reward
74 | Q[state, action] = Q[state,action] + \
75 | alpha*(reward + gamma*Q[state_,action_] - Q[state,action])
76 | state = state_
77 | action = action_
78 | total_rewards[i] = eps_rewards
79 | eps = eps - 2 / n_games if eps > 0.01 else 0.01
80 |
81 | mean_rewards = np.zeros(n_games)
82 | for t in range(n_games):
83 | mean_rewards[t] = np.mean(total_rewards[max(0, t-50):(t+1)])
84 | plt.plot(mean_rewards)
85 | plt.show()
86 |
87 | f = open("acrobot.pkl","wb")
88 | pickle.dump(Q,f)
89 | f.close()
90 |
--------------------------------------------------------------------------------
/ReinforcementLearning/Fundamentals/blackJack-no-es.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | if __name__ == '__main__':
6 | env = gym.make('Blackjack-v0')
7 | EPS = 0.05
8 | GAMMA = 1.0
9 |
10 | Q = {}
11 | agentSumSpace = [i for i in range(4, 22)]
12 | dealerShowCardSpace = [i+1 for i in range(10)]
13 | agentAceSpace = [False, True]
14 | actionSpace = [0, 1] # stick or hit
15 |
16 | stateSpace = []
17 | returns = {}
18 | pairsVisited = {}
19 | for total in agentSumSpace:
20 | for card in dealerShowCardSpace:
21 | for ace in agentAceSpace:
22 | for action in actionSpace:
23 | Q[((total, card, ace), action)] = 0
24 | returns[((total, card, ace), action)] = 0
25 | pairsVisited[((total, card, ace), action)] = 0
26 | stateSpace.append((total, card, ace))
27 |
28 | policy = {}
29 | for state in stateSpace:
30 | policy[state] = np.random.choice(actionSpace)
31 |
32 | numEpisodes = 1000000
33 | for i in range(numEpisodes):
34 | statesActionsReturns = []
35 | memory = []
36 | if i % 100000 == 0:
37 | print('starting episode', i)
38 | observation = env.reset()
39 | done = False
40 | while not done:
41 | action = policy[observation]
42 | observation_, reward, done, info = env.step(action)
43 | memory.append((observation[0], observation[1], observation[2], action, reward))
44 | observation = observation_
45 | memory.append((observation[0], observation[1], observation[2], action, reward))
46 |
47 | G = 0
48 | last = True
49 | for playerSum, dealerCard, usableAce, action, reward in reversed(memory):
50 | if last:
51 | last = False
52 | else:
53 | statesActionsReturns.append((playerSum, dealerCard, usableAce, action, G))
54 | G = GAMMA*G + reward
55 |
56 | statesActionsReturns.reverse()
57 | statesActionsVisited = []
58 |
59 | for playerSum, dealerCard, usableAce, action, G in statesActionsReturns:
60 | sa = ((playerSum, dealerCard, usableAce), action)
61 | if sa not in statesActionsVisited:
62 | pairsVisited[sa] += 1
63 | # incremental implementation
64 | # new estimate = 1 / N * [sample - old estimate]
65 | returns[(sa)] += (1 / pairsVisited[(sa)])*(G-returns[(sa)])
66 | Q[sa] = returns[sa]
 67 |                 state = (playerSum, dealerCard, usableAce)
 68 |                 rand = np.random.random()
 69 |                 if rand < 1 - EPS:
 70 |                     values = np.array([Q[(state, a)] for a in actionSpace])
 71 |                     best = np.random.choice(np.where(values==values.max())[0])
 72 |                     policy[state] = actionSpace[best]
 73 |                 else:
 74 |                     policy[state] = np.random.choice(actionSpace)
75 | statesActionsVisited.append(sa)
76 | if EPS - 1e-7 > 0:
77 | EPS -= 1e-7
78 | else:
79 | EPS = 0
80 |
81 | numEpisodes = 1000
82 | rewards = np.zeros(numEpisodes)
83 | totalReward = 0
84 | wins = 0
85 | losses = 0
86 | draws = 0
87 | print('getting ready to test policy')
88 | for i in range(numEpisodes):
89 | observation = env.reset()
90 | done = False
91 | while not done:
92 | action = policy[observation]
93 | observation_, reward, done, info = env.step(action)
94 | observation = observation_
95 | totalReward += reward
96 | rewards[i] = totalReward
97 |
98 | if reward >= 1:
99 | wins += 1
100 | elif reward == 0:
101 | draws += 1
102 | elif reward == -1:
103 | losses += 1
104 |
105 | wins /= numEpisodes
106 | losses /= numEpisodes
107 | draws /= numEpisodes
108 | print('win rate', wins, 'loss rate', losses, 'draw rate', draws)
109 | plt.plot(rewards)
110 | plt.show()
--------------------------------------------------------------------------------
/ReinforcementLearning/Fundamentals/blackJack-off-policy.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | if __name__ == '__main__':
6 | env = gym.make('Blackjack-v0')
7 | EPS = 0.05
8 | GAMMA = 1.0
9 |
10 | agentSumSpace = [i for i in range(4, 22)]
11 | dealerShowCardSpace = [i+1 for i in range(10)]
12 | agentAceSpace = [False, True]
13 | actionSpace = [0, 1] # stick or hit
14 | stateSpace = []
15 |
16 | Q = {}
17 | C = {}
18 | for total in agentSumSpace:
19 | for card in dealerShowCardSpace:
20 | for ace in agentAceSpace:
21 | for action in actionSpace:
22 | Q[((total, card, ace), action)] = 0
23 | C[((total, card, ace), action)] = 0
24 | stateSpace.append((total, card, ace))
25 |
26 | targetPolicy = {}
27 | for state in stateSpace:
28 | values = np.array([Q[(state, a)] for a in actionSpace ])
29 | best = np.random.choice(np.where(values==values.max())[0])
30 | targetPolicy[state] = actionSpace[best]
31 |
32 | numEpisodes = 1000000
33 | for i in range(numEpisodes):
34 | memory = []
35 | if i % 100000 == 0:
36 | print('starting episode', i)
37 | behaviorPolicy = {}
38 | for state in stateSpace:
39 | rand = np.random.random()
40 | if rand < 1 - EPS:
41 | behaviorPolicy[state] = [targetPolicy[state]]
42 | else:
43 | behaviorPolicy[state] = actionSpace
44 | observation = env.reset()
45 | done = False
46 | while not done:
47 | action = np.random.choice(behaviorPolicy[observation])
48 | observation_, reward, done, info = env.step(action)
49 | memory.append((observation[0], observation[1], observation[2], action, reward))
50 | observation = observation_
51 | memory.append((observation[0], observation[1], observation[2], action, reward))
52 |
53 | G = 0
54 | W = 1
55 | last = True
56 | for playerSum, dealerCard, usableAce, action, reward in reversed(memory):
 57 |             state = (playerSum, dealerCard, usableAce); sa = (state, action)
58 | if last:
59 | last = False
60 | else:
61 | C[sa] += W
62 | Q[sa] += (W / C[sa])*(G-Q[sa])
63 | values = np.array([Q[(state, a)] for a in actionSpace ])
64 | best = np.random.choice(np.where(values==values.max())[0])
65 | targetPolicy[state] = actionSpace[best]
66 | if action != targetPolicy[state]:
67 | break
68 | if len(behaviorPolicy[state]) == 1:
69 | prob = 1 - EPS
70 | else:
71 | prob = EPS / len(behaviorPolicy[state])
72 | W *= 1/prob
73 | G = GAMMA*G + reward
74 | if EPS - 1e-7 > 0:
75 | EPS -= 1e-7
76 | else:
77 | EPS = 0
78 | numEpisodes = 1000
79 | rewards = np.zeros(numEpisodes)
80 | totalReward = 0
81 | wins = 0
82 | losses = 0
83 | draws = 0
84 | print('getting ready to test target policy')
85 | for i in range(numEpisodes):
86 | observation = env.reset()
87 | done = False
88 | while not done:
89 | action = targetPolicy[observation]
90 | observation_, reward, done, info = env.step(action)
91 | observation = observation_
92 | totalReward += reward
93 | rewards[i] = totalReward
94 |
95 | if reward >= 1:
96 | wins += 1
97 | elif reward == 0:
98 | draws += 1
99 | elif reward == -1:
100 | losses += 1
101 |
102 | wins /= numEpisodes
103 | losses /= numEpisodes
104 | draws /= numEpisodes
105 | print('win rate', wins, 'loss rate', losses, 'draw rate', draws)
106 | plt.plot(rewards)
107 | plt.show()
--------------------------------------------------------------------------------
/ReinforcementLearning/Fundamentals/cartpole_qlearning.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | env = gym.make('CartPole-v0')
6 |
7 | MAXSTATES = 10**4
8 | GAMMA = 0.9
9 | ALPHA = 0.01
10 |
11 | def max_dict(d):
12 | max_v = float('-inf')
13 | for key, val in d.items():
14 | if val > max_v:
15 | max_v = val
16 | max_key = key
17 | return max_key, max_v
18 |
19 | def create_bins():
20 | # obs[0] -> cart position --- -4.8 - 4.8
21 | # obs[1] -> cart velocity --- -inf - inf
22 | # obs[2] -> pole angle --- -41.8 - 41.8
23 | # obs[3] -> pole velocity --- -inf - inf
24 |
25 | bins = np.zeros((4,10))
26 | bins[0] = np.linspace(-4.8, 4.8, 10)
27 | bins[1] = np.linspace(-5, 5, 10)
28 | bins[2] = np.linspace(-.418, .418, 10)
29 | bins[3] = np.linspace(-5, 5, 10)
30 |
31 | return bins
32 |
33 | def assign_bins(observation, bins):
34 | state = np.zeros(4)
35 | for i in range(4):
36 | state[i] = np.digitize(observation[i], bins[i])
37 | return state
38 |
39 | def get_state_as_string(state):
40 | string_state = ''.join(str(int(e)) for e in state)
41 | return string_state
42 |
43 | def get_all_states_as_string():
44 | states = []
45 | for i in range(MAXSTATES):
46 | states.append(str(i).zfill(4))
47 | return states
48 |
49 | def initialize_Q():
50 | Q = {}
51 |
52 | all_states = get_all_states_as_string()
53 | for state in all_states:
54 | Q[state] = {}
55 | for action in range(env.action_space.n):
56 | Q[state][action] = 0
57 | return Q
58 |
59 | def play_one_game(bins, Q, eps=0.5):
60 | observation = env.reset()
61 | done = False
62 | cnt = 0 # number of moves in an episode
63 | state = get_state_as_string(assign_bins(observation, bins))
64 | total_reward = 0
65 |
66 | while not done:
67 | cnt += 1
 68 |         # epsilon-greedy: take a random action with probability eps
69 | if np.random.uniform() < eps:
70 | act = env.action_space.sample() # epsilon greedy
71 | else:
72 | act = max_dict(Q[state])[0]
73 |
74 | observation, reward, done, _ = env.step(act)
75 |
76 | total_reward += reward
77 |
78 | if done and cnt < 200:
79 | reward = -300
80 |
81 | state_new = get_state_as_string(assign_bins(observation, bins))
82 |
83 | a1, max_q_s1a1 = max_dict(Q[state_new])
84 | Q[state][act] += ALPHA*(reward + GAMMA*max_q_s1a1 - Q[state][act])
85 | state, act = state_new, a1
86 |
87 | return total_reward, cnt
88 |
89 | def play_many_games(bins, N=10000):
90 | Q = initialize_Q()
91 |
92 | length = []
93 | reward = []
94 | for n in range(N):
95 | #eps=0.5/(1+n*10e-3)
96 | eps = 1.0 / np.sqrt(n+1)
97 |
98 | episode_reward, episode_length = play_one_game(bins, Q, eps)
99 |
100 | if n % 100 == 0:
101 | print(n, '%.4f' % eps, episode_reward)
102 | length.append(episode_length)
103 | reward.append(episode_reward)
104 |
105 | return length, reward
106 |
107 | def plot_running_avg(totalrewards):
108 | N = len(totalrewards)
109 | running_avg = np.empty(N)
110 | for t in range(N):
111 | running_avg[t] = np.mean(totalrewards[max(0, t-100):(t+1)])
112 | plt.plot(running_avg)
113 | plt.title("Running Average")
114 | plt.show()
115 |
116 | if __name__ == '__main__':
117 | bins = create_bins()
118 | episode_lengths, episode_rewards = play_many_games(bins)
119 |
120 | plot_running_avg(episode_rewards)
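Note (not part of the repository file above): a short illustration of the discretization used by the Q-table keys. The sample observation is arbitrary and the printed values are only indicative.

# Hedged sketch: trace one observation through the binning helpers above.
bins = create_bins()
sample_obs = np.array([0.1, -0.5, 0.02, 1.3])
digitized = assign_bins(sample_obs, bins)
print(digitized)                          # four bin indices, e.g. [5. 5. 5. 6.]
print(get_state_as_string(digitized))     # Q-table key, e.g. '5556'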
--------------------------------------------------------------------------------
/ReinforcementLearning/Fundamentals/doubleQLearning.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import gym
5 |
6 | def maxAction(Q1, Q2, state):
7 | values = np.array([Q1[state,a] + Q2[state,a] for a in range(2)])
8 | action = np.argmax(values)
9 | return action
10 |
11 | #discretize the spaces
12 | poleThetaSpace = np.linspace(-0.20943951, 0.20943951, 10)
13 | poleThetaVelSpace = np.linspace(-4, 4, 10)
14 | cartPosSpace = np.linspace(-2.4, 2.4, 10)
15 | cartVelSpace = np.linspace(-4, 4, 10)
16 |
17 | def getState(observation):
18 | cartX, cartXdot, cartTheta, cartThetadot = observation
19 | cartX = int(np.digitize(cartX, cartPosSpace))
20 | cartXdot = int(np.digitize(cartXdot, cartVelSpace))
21 | cartTheta = int(np.digitize(cartTheta, poleThetaSpace))
22 | cartThetadot = int(np.digitize(cartThetadot, poleThetaVelSpace))
23 |
24 | return (cartX, cartXdot, cartTheta, cartThetadot)
25 |
26 | def plotRunningAverage(totalrewards):
27 | N = len(totalrewards)
28 | running_avg = np.empty(N)
29 | for t in range(N):
30 | running_avg[t] = np.mean(totalrewards[max(0, t-100):(t+1)])
31 | plt.plot(running_avg)
32 | plt.title("Running Average")
33 | plt.show()
34 |
35 | if __name__ == '__main__':
36 | env = gym.make('CartPole-v0')
37 | # model hyperparameters
38 | ALPHA = 0.1
39 | GAMMA = 0.9
40 | EPS = 1.0
41 |
42 | #construct state space
43 | states = []
44 | for i in range(len(cartPosSpace)+1):
45 | for j in range(len(cartVelSpace)+1):
46 | for k in range(len(poleThetaSpace)+1):
47 | for l in range(len(poleThetaVelSpace)+1):
48 | states.append((i,j,k,l))
49 |
50 | Q1, Q2 = {}, {}
51 | for s in states:
52 | for a in range(2):
53 | Q1[s, a] = 0
54 | Q2[s,a] = 0
55 |
56 | numGames = 100000
57 | totalRewards = np.zeros(numGames)
58 | for i in range(numGames):
59 | if i % 5000 == 0:
60 | print('starting game ', i)
61 | done = False
62 | epRewards = 0
63 | observation = env.reset()
64 | while not done:
65 | s = getState(observation)
66 | rand = np.random.random()
67 | a = maxAction(Q1,Q2,s) if rand < (1-EPS) else env.action_space.sample()
68 | observation_, reward, done, info = env.step(a)
69 | epRewards += reward
70 | s_ = getState(observation_)
71 | rand = np.random.random()
72 | if rand <= 0.5:
73 | a_ = maxAction(Q1,Q1,s_)
74 | Q1[s,a] = Q1[s,a] + ALPHA*(reward + GAMMA*Q2[s_,a_] - Q1[s,a])
75 | elif rand > 0.5:
76 | a_ = maxAction(Q2,Q2,s_)
77 | Q2[s,a] = Q2[s,a] + ALPHA*(reward + GAMMA*Q1[s_,a_] - Q2[s,a])
78 | observation = observation_
79 | EPS -= 2/(numGames) if EPS > 0 else 0
80 | totalRewards[i] = epRewards
81 |
82 | #plt.plot(totalRewards, 'b--')
83 | #plt.show()
84 | plotRunningAverage(totalRewards)
85 |
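Note: the update in the loop above is the standard double Q-learning rule. With probability 1/2,

$$Q_1(s,a) \leftarrow Q_1(s,a) + \alpha\Big[r + \gamma\, Q_2\big(s',\, \arg\max_{a'} Q_1(s',a')\big) - Q_1(s,a)\Big],$$

and otherwise the same update with the roles of $Q_1$ and $Q_2$ swapped. maxAction(Q1, Q1, s_) supplies the argmax while the other table supplies the evaluation, which is what reduces the maximization bias of plain Q-learning.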
--------------------------------------------------------------------------------
/ReinforcementLearning/Fundamentals/mountaincar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/Fundamentals/mountaincar.png
--------------------------------------------------------------------------------
/ReinforcementLearning/Fundamentals/mountaincar.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | from gym import wrappers
5 | import pickle
6 |
7 | pos_space = np.linspace(-1.2, 0.6, 12)
8 | vel_space = np.linspace(-0.07, 0.07, 20)
9 |
10 | def get_state(observation):
11 | pos, vel = observation
12 | pos_bin = int(np.digitize(pos, pos_space))
13 | vel_bin = int(np.digitize(vel, vel_space))
14 |
15 | return (pos_bin, vel_bin)
16 |
17 | def max_action(Q, state, actions=[0, 1, 2]):
18 | values = np.array([Q[state,a] for a in actions])
19 | action = np.argmax(values)
20 |
21 | return action
22 |
23 | if __name__ == '__main__':
24 | env = gym.make('MountainCar-v0')
25 | env._max_episode_steps = 1000
26 | n_games = 50000
27 | alpha = 0.1
28 | gamma = 0.99
29 | eps = 1.0
30 |
31 | action_space = [0, 1, 2]
32 |
33 | states = []
34 | for pos in range(21):
35 | for vel in range(21):
36 | states.append((pos, vel))
37 |
38 | Q = {}
39 | for state in states:
40 | for action in action_space:
41 | Q[state, action] = 0
42 |
43 | #pickle_in = open('mountaincar.pkl', 'rb')
44 | #Q = pickle.load(pickle_in)
45 | #env = wrappers.Monitor(env, "tmp/mountaincar",
46 | #video_callable=lambda episode_id: True, force=True)
47 | score = 0
48 | total_rewards = np.zeros(n_games)
49 | for i in range(n_games):
50 | done = False
51 | obs = env.reset()
52 | state = get_state(obs)
53 | if i % 100 == 0 and i > 0:
54 | print('episode ', i, 'score ', score, 'epsilon %.3f' % eps)
55 | score = 0
56 | while not done:
57 | action = np.random.choice([0,1,2]) if np.random.random() < eps \
58 | else max_action(Q, state)
59 | obs_, reward, done, info = env.step(action)
60 | state_ = get_state(obs_)
61 | score += reward
62 | action_ = max_action(Q, state_)
63 | Q[state, action] = Q[state, action] + \
64 | alpha*(reward + gamma*Q[state_, action_] - Q[state, action])
65 | state = state_
66 | total_rewards[i] = score
67 | eps = eps - 2/n_games if eps > 0.01 else 0.01
68 |
69 | mean_rewards = np.zeros(n_games)
70 | for t in range(n_games):
71 | mean_rewards[t] = np.mean(total_rewards[max(0, t-50):(t+1)])
72 | plt.plot(mean_rewards)
73 | plt.savefig('mountaincar.png')
74 |
75 | #f = open("mountaincar.pkl","wb")
76 | #pickle.dump(Q,f)
77 | #f.close()
78 |
--------------------------------------------------------------------------------
/ReinforcementLearning/Fundamentals/n_step_sarsa.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 |
4 | poleThetaSpace = np.linspace(-0.209, 0.209, 10)
5 | poleThetaVelSpace = np.linspace(-4, 4, 10)
6 | cartPosSpace = np.linspace(-2.4, 2.4, 10)
7 | cartVelSpace = np.linspace(-4, 4, 10)
8 |
9 | def get_state(observation):
10 | cartX, cartXdot, cartTheta, cartThetaDot = observation
11 | cartX = int(np.digitize(cartX, cartPosSpace))
12 | cartXdot = int(np.digitize(cartXdot, cartVelSpace))
13 | cartTheta = int(np.digitize(cartTheta, poleThetaSpace))
14 | cartThetaDot = int(np.digitize(cartThetaDot, poleThetaVelSpace))
15 |
16 | return (cartX, cartXdot, cartTheta, cartThetaDot)
17 |
18 | def choose_action(q, obs, eps, n_actions=2):
19 | state = get_state(obs)
20 | if np.random.random() < eps:
21 | action = np.random.choice([i for i in range(n_actions)])
22 | else:
23 | action_values = [q[(state, a)] for a in range(n_actions)]
24 | action = np.argmax(action_values)
25 | return action
26 |
27 | if __name__ == '__main__':
28 | env = gym.make('CartPole-v0')
29 | alpha = 0.1
30 | gamma = 0.9
31 | epsilon = 1.0
32 |
33 | states = []
34 | for i in range(len(cartPosSpace)+1):
35 | for j in range(len(cartVelSpace)+1):
36 | for k in range(len(poleThetaSpace)+1):
37 | for l in range(len(poleThetaVelSpace)+1):
38 | states.append((i,j,k,l))
39 |
40 | Q = {}
41 | for s in states:
42 | for a in range(2):
43 | Q[(s, a)] = 0.0
44 |
45 | n = 16
46 | state_memory = np.zeros((n, 4))
47 | action_memory = np.zeros(n)
48 | reward_memory = np.zeros(n)
49 |
50 | scores = []
51 | n_episodes = 50000
52 | for i in range(n_episodes):
53 | done = False
54 | score = 0
55 | t = 0
56 | T = np.inf
57 | observation = env.reset()
58 | action = choose_action(Q, observation, epsilon)
59 | action_memory[t%n] = action
60 | state_memory[t%n] = observation
61 | while not done:
62 | observation, reward, done, info = env.step(action)
63 | score += reward
64 | state_memory[(t+1)%n] = observation
65 | reward_memory[(t+1)%n] = reward
66 | if done:
67 | T = t + 1
68 | #print('episode ends at step', t)
69 | action = choose_action(Q, observation, epsilon)
70 | action_memory[(t+1)%n] = action
71 | tau = t - n + 1
72 | if tau >= 0:
73 | G = [gamma**(j-tau-1)*reward_memory[j%n] \
74 | for j in range(tau+1, min(tau+n, T)+1)]
75 | G = np.sum(G)
76 | if tau + n < T:
77 | s = get_state(state_memory[(tau+n)%n])
78 | a = int(action_memory[(tau+n)%n])
79 | G += gamma**n * Q[(s,a)]
80 | s = get_state(state_memory[tau%n])
81 | a = action_memory[tau%n]
82 | Q[(s,a)] += alpha*(G-Q[(s,a)])
83 | #print('tau ', tau, '| Q %.2f' % \
84 | # Q[(get_state(state_memory[tau%n]), action_memory[tau%n])])
85 |
86 | t += 1
87 |
88 | for tau in range(t-n+1, T):
89 | G = [gamma**(j-tau-1)*reward_memory[j%n] \
90 | for j in range(tau+1, min(tau+n, T)+1)]
91 | G = np.sum(G)
92 | if tau + n < T:
93 | s = get_state(state_memory[(tau+n)%n])
94 | a = int(action_memory[(tau+n)%n])
95 | G += gamma**n * Q[(s,a)]
96 | s = get_state(state_memory[tau%n])
97 | a = action_memory[tau%n]
98 | Q[(s,a)] += alpha*(G-Q[(s,a)])
99 | #print('tau ', tau, '| Q %.2f' % \
100 | # Q[(get_state(state_memory[tau%n]), action_memory[tau%n])])
101 | scores.append(score)
102 | avg_score = np.mean(scores[-1000:])
103 | epsilon = epsilon -2 / n_episodes if epsilon > 0 else 0
104 | if i % 1000 == 0:
105 | print('episode ', i, 'avg_score %.1f' % avg_score,
106 | 'epsilon %.2f' % epsilon)
107 |
108 |
--------------------------------------------------------------------------------
/ReinforcementLearning/Fundamentals/sarsa.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import gym
4 |
5 | def maxAction(Q, state):
6 | values = np.array([Q[state,a] for a in range(2)])
7 | action = np.argmax(values)
8 | return action
9 |
10 | #discretize the spaces
11 | poleThetaSpace = np.linspace(-0.20943951, 0.20943951, 10)
12 | poleThetaVelSpace = np.linspace(-4, 4, 10)
13 | cartPosSpace = np.linspace(-2.4, 2.4, 10)
14 | cartVelSpace = np.linspace(-4, 4, 10)
15 |
16 | def getState(observation):
17 | cartX, cartXdot, cartTheta, cartThetadot = observation
18 | cartX = int(np.digitize(cartX, cartPosSpace))
19 | cartXdot = int(np.digitize(cartXdot, cartVelSpace))
20 | cartTheta = int(np.digitize(cartTheta, poleThetaSpace))
21 | cartThetadot = int(np.digitize(cartThetadot, poleThetaVelSpace))
22 |
23 | return (cartX, cartXdot, cartTheta, cartThetadot)
24 |
25 | if __name__ == '__main__':
26 | env = gym.make('CartPole-v0')
27 | # model hyperparameters
28 | ALPHA = 0.1
29 | GAMMA = 0.9
30 | EPS = 1.0
31 |
32 | #construct state space
33 | states = []
34 | for i in range(len(cartPosSpace)+1):
35 | for j in range(len(cartVelSpace)+1):
36 | for k in range(len(poleThetaSpace)+1):
37 | for l in range(len(poleThetaVelSpace)+1):
38 | states.append((i,j,k,l))
39 |
40 | Q = {}
41 | for s in states:
42 | for a in range(2):
43 | Q[s, a] = 0
44 |
45 | numGames = 50000
46 | totalRewards = np.zeros(numGames)
47 | for i in range(numGames):
48 | if i % 5000 == 0:
49 | print('starting game', i)
50 | # cart x position, cart velocity, pole theta, pole velocity
51 | observation = env.reset()
52 | s = getState(observation)
53 | rand = np.random.random()
54 | a = maxAction(Q, s) if rand < (1-EPS) else env.action_space.sample()
55 | done = False
56 | epRewards = 0
57 | while not done:
58 | observation_, reward, done, info = env.step(a)
59 | s_ = getState(observation_)
60 | rand = np.random.random()
61 | a_ = maxAction(Q, s_) if rand < (1-EPS) else env.action_space.sample()
62 | epRewards += reward
63 | Q[s,a] = Q[s,a] + ALPHA*(reward + GAMMA*Q[s_,a_] - Q[s,a])
64 | s, a = s_, a_
65 | EPS -= 2/(numGames) if EPS > 0 else 0
66 | totalRewards[i] = epRewards
67 |
68 | plt.plot(totalRewards, 'b--')
69 | plt.show()
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/A3C_CartPole_no_rewards.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/ICM/A3C_CartPole_no_rewards.png
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/ICM_CartPole_no_rewards.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/ICM/ICM_CartPole_no_rewards.png
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/actor_critic.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch as T
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.distributions import Categorical
6 |
7 |
8 | class ActorCritic(nn.Module):
9 | def __init__(self, input_dims, n_actions, gamma=0.99, tau=0.98):
10 | super(ActorCritic, self).__init__()
11 | self.gamma = gamma
12 | self.tau = tau
13 |
14 | self.input = nn.Linear(*input_dims, 256)
15 | self.dense = nn.Linear(256, 256)
16 |
17 | self.gru = nn.GRUCell(256, 256)
18 | self.pi = nn.Linear(256, n_actions)
19 | self.v = nn.Linear(256, 1)
20 |
21 | def forward(self, state, hx):
22 | x = F.relu(self.input(state))
23 | x = F.relu(self.dense(x))
24 | hx = self.gru(x, (hx))
25 |
26 | pi = self.pi(hx)
27 | v = self.v(hx)
28 |
29 | probs = T.softmax(pi, dim=1)
30 | dist = Categorical(probs)
31 | action = dist.sample()
32 | log_prob = dist.log_prob(action)
33 |
34 | return action.numpy()[0], v, log_prob, hx
35 |
36 | def calc_R(self, done, rewards, values):
37 | values = T.cat(values).squeeze()
38 | if len(values.size()) == 1: # batch of states
39 | R = values[-1] * (1-int(done))
40 | elif len(values.size()) == 0: # single state
41 | R = values*(1-int(done))
42 |
43 | batch_return = []
44 | for reward in rewards[::-1]:
45 | R = reward + self.gamma * R
46 | batch_return.append(R)
47 | batch_return.reverse()
48 | batch_return = T.tensor(batch_return,
49 | dtype=T.float).reshape(values.size())
50 | return batch_return
51 |
52 | def calc_loss(self, new_states, hx, done,
53 | rewards, values, log_probs, r_i_t=None):
54 | if r_i_t is not None:
55 | rewards += r_i_t.detach().numpy()
56 | returns = self.calc_R(done, rewards, values)
57 | next_v = T.zeros(1, 1) if done else self.forward(T.tensor([new_states],
58 | dtype=T.float), hx)[1]
59 |
60 | values.append(next_v.detach())
61 | values = T.cat(values).squeeze()
62 | log_probs = T.cat(log_probs)
63 | rewards = T.tensor(rewards)
64 |
65 | delta_t = rewards + self.gamma*values[1:] - values[:-1]
66 | n_steps = len(delta_t)
67 | gae = np.zeros(n_steps)
68 | for t in range(n_steps):
69 | for k in range(0, n_steps-t):
70 | temp = (self.gamma*self.tau)**k*delta_t[t+k]
71 | gae[t] += temp
72 | gae = T.tensor(gae, dtype=T.float)
73 |
74 | actor_loss = -(log_probs*gae).sum()
75 | entropy_loss = (-log_probs*T.exp(log_probs)).sum()
76 | # [T] vs ()
77 | critic_loss = F.mse_loss(values[:-1].squeeze(), returns)
78 |
79 | total_loss = actor_loss + critic_loss - 0.01*entropy_loss
80 | return total_loss
81 |
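Note: calc_loss above computes a generalized-advantage-style estimate. With TD errors $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$, the advantage is $\hat{A}_t = \sum_{k \ge 0} (\gamma\tau)^k\,\delta_{t+k}$ (here tau plays the role of the GAE $\lambda$). The actor loss is $-\sum_t \log\pi(a_t\mid s_t)\,\hat{A}_t$, the critic loss is the MSE between the values and the discounted returns from calc_R, and an entropy term weighted by 0.01 encourages exploration.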
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/icm.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class ICM(nn.Module):
7 | def __init__(self, input_dims, n_actions=2, alpha=1, beta=0.2):
8 | super(ICM, self).__init__()
9 | self.alpha = alpha
10 | self.beta = beta
11 | # hard coded for cartpole environment
12 | self.inverse = nn.Linear(4*2, 256)
13 | self.pi_logits = nn.Linear(256, n_actions)
14 |
15 | self.dense1 = nn.Linear(4+1, 256)
16 | self.new_state = nn.Linear(256, 4)
17 |
18 | device = T.device('cpu')
19 | self.to(device)
20 |
21 | def forward(self, state, new_state, action):
22 | inverse = F.elu(self.inverse(T.cat([state, new_state], dim=1)))
23 | pi_logits = self.pi_logits(inverse)
24 |
25 | # from [T] to [T,1]
26 | action = action.reshape((action.size()[0], 1))
27 | forward_input = T.cat([state, action], dim=1)
28 | dense = F.elu(self.dense1(forward_input))
29 | state_ = self.new_state(dense)
30 |
31 | return pi_logits, state_
32 |
33 | def calc_loss(self, state, new_state, action):
34 | state = T.tensor(state, dtype=T.float)
35 | action = T.tensor(action, dtype=T.float)
36 | new_state = T.tensor(new_state, dtype=T.float)
37 |
38 | pi_logits, state_ = self.forward(state, new_state, action)
39 |
40 | inverse_loss = nn.CrossEntropyLoss()
41 | L_I = (1-self.beta)*inverse_loss(pi_logits, action.to(T.long))
42 |
43 | forward_loss = nn.MSELoss()
44 | L_F = self.beta*forward_loss(state_, new_state)
45 |
46 | intrinsic_reward = self.alpha*((state_ - new_state).pow(2)).mean(dim=1)
47 | return intrinsic_reward, L_I, L_F
48 |
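Note: calc_loss above returns the quantities of the Intrinsic Curiosity Module. The intrinsic reward is the forward model's mean squared prediction error, $r^i_t = \alpha \cdot \tfrac{1}{d}\lVert \hat{s}_{t+1} - s_{t+1} \rVert^2$, while the inverse loss $L_I = (1-\beta)\,\mathrm{CE}(\hat{a}_t, a_t)$ and forward loss $L_F = \beta\,\mathrm{MSE}(\hat{s}_{t+1}, s_{t+1})$ are summed and backpropagated through the ICM in worker.py. This simplified module predicts the raw next state rather than a learned feature embedding, as the hard-coded CartPole dimensions indicate.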
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch.multiprocessing as mp
3 | from parallel_env import ParallelEnv
4 |
5 | os.environ['OMP_NUM_THREADS'] = '1'
6 |
7 |
8 | if __name__ == '__main__':
9 | mp.set_start_method('spawn')
10 | env_id = 'CartPole-v0'
11 | n_threads = 12
12 | n_actions = 2
13 | input_shape = [4]
14 | env = ParallelEnv(env_id=env_id, n_threads=n_threads,
15 | n_actions=n_actions, input_shape=input_shape, icm=True)
16 |
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/memory.py:
--------------------------------------------------------------------------------
1 | class Memory:
2 | def __init__(self):
3 | self.states = []
4 | self.actions = []
5 | self.rewards = []
6 | self.new_states = []
7 | self.values = []
8 | self.log_probs = []
9 |
10 | def remember(self, state, action, reward, new_state, value, log_p):
11 | self.actions.append(action)
12 | self.rewards.append(reward)
13 | self.states.append(state)
14 | self.new_states.append(new_state)
15 | self.log_probs.append(log_p)
16 | self.values.append(value)
17 |
18 | def clear_memory(self):
19 | self.states = []
20 | self.actions = []
21 | self.rewards = []
22 | self.new_states = []
23 | self.values = []
24 | self.log_probs = []
25 |
26 | def sample_memory(self):
27 | return self.states, self.actions, self.rewards, self.new_states,\
28 | self.values, self.log_probs
29 |
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/parallel_env.py:
--------------------------------------------------------------------------------
1 | import torch.multiprocessing as mp
2 | from actor_critic import ActorCritic
3 | from icm import ICM
4 | from shared_adam import SharedAdam
5 | from worker import worker
6 |
7 |
8 | class ParallelEnv:
9 | def __init__(self, env_id, input_shape, n_actions, icm, n_threads=8):
10 | names = [str(i) for i in range(1, n_threads+1)]
11 |
12 | global_actor_critic = ActorCritic(input_shape, n_actions)
13 | global_actor_critic.share_memory()
14 | global_optim = SharedAdam(global_actor_critic.parameters())
15 |
16 | if not icm:
17 | global_icm = None
18 | global_icm_optim = None
19 | else:
20 | global_icm = ICM(input_shape, n_actions)
21 | global_icm.share_memory()
22 | global_icm_optim = SharedAdam(global_icm.parameters())
23 |
24 | self.ps = [mp.Process(target=worker,
25 | args=(name, input_shape, n_actions,
26 | global_actor_critic, global_icm,
27 | global_optim, global_icm_optim, env_id,
28 | n_threads, icm))
29 | for name in names]
30 |
31 | [p.start() for p in self.ps]
32 | [p.join() for p in self.ps]
33 |
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/shared_adam.py:
--------------------------------------------------------------------------------
1 | # from Morvan Zhou's implementation:
2 | # https://github.com/MorvanZhou/pytorch-A3C
3 |
4 | import torch as T
5 |
6 |
7 | class SharedAdam(T.optim.Adam):
8 | def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), eps=1e-8,
9 | weight_decay=0):
10 | super(SharedAdam, self).__init__(params, lr=lr, betas=betas,
11 | eps=eps, weight_decay=weight_decay)
12 |
13 | for group in self.param_groups:
14 | for p in group['params']:
15 | state = self.state[p]
16 | state['step'] = 0
17 | state['exp_avg'] = T.zeros_like(p.data)
18 | state['exp_avg_sq'] = T.zeros_like(p.data)
19 |
20 | state['exp_avg'].share_memory_()
21 | state['exp_avg_sq'].share_memory_()
22 |
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 |
5 | def plot_learning_curve(x, scores, figure_file):
6 | running_avg = np.zeros(len(scores))
7 | for i in range(len(running_avg)):
8 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
9 | plt.plot(x, running_avg)
10 | plt.title('Running average of previous 100 episodes')
11 | plt.savefig(figure_file)
12 |
--------------------------------------------------------------------------------
/ReinforcementLearning/ICM/worker.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import torch as T
4 | from actor_critic import ActorCritic
5 | from icm import ICM
6 | from memory import Memory
7 | from utils import plot_learning_curve
8 |
9 |
10 | def worker(name, input_shape, n_actions, global_agent, global_icm,
11 | optimizer, icm_optimizer, env_id, n_threads, icm=False):
12 | T_MAX = 20
13 |
14 | local_agent = ActorCritic(input_shape, n_actions)
15 |
16 | if icm:
17 | local_icm = ICM(input_shape, n_actions)
18 | algo = 'ICM'
19 | else:
20 | intrinsic_reward = T.zeros(1)
21 | algo = 'A3C'
22 |
23 | memory = Memory()
24 |
25 | env = gym.make(env_id)
26 |
27 | t_steps, max_eps, episode, scores, avg_score = 0, 1000, 0, [], 0
28 |
29 | while episode < max_eps:
30 | obs = env.reset()
31 | hx = T.zeros(1, 256)
32 | score, done, ep_steps = 0, False, 0
33 | while not done:
34 | state = T.tensor([obs], dtype=T.float)
35 | action, value, log_prob, hx = local_agent(state, hx)
36 | obs_, reward, done, info = env.step(action)
37 | t_steps += 1
38 | ep_steps += 1
39 | score += reward
40 | reward = 0 # turn off extrinsic rewards
41 | memory.remember(obs, action, reward, obs_, value, log_prob)
42 | obs = obs_
43 | if ep_steps % T_MAX == 0 or done:
44 | states, actions, rewards, new_states, values, log_probs = \
45 | memory.sample_memory()
46 | if icm:
47 | intrinsic_reward, L_I, L_F = \
48 | local_icm.calc_loss(states, new_states, actions)
49 |
50 | loss = local_agent.calc_loss(obs, hx, done, rewards, values,
51 | log_probs, intrinsic_reward)
52 |
53 | optimizer.zero_grad()
54 | hx = hx.detach_()
55 | if icm:
56 | icm_optimizer.zero_grad()
57 | (L_I + L_F).backward()
58 |
59 | loss.backward()
60 | T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40)
61 |
62 | for local_param, global_param in zip(
63 | local_agent.parameters(),
64 | global_agent.parameters()):
65 | global_param._grad = local_param.grad
66 | optimizer.step()
67 | local_agent.load_state_dict(global_agent.state_dict())
68 |
69 | if icm:
70 | for local_param, global_param in zip(
71 | local_icm.parameters(),
72 | global_icm.parameters()):
73 | global_param._grad = local_param.grad
74 | icm_optimizer.step()
75 | local_icm.load_state_dict(global_icm.state_dict())
76 | memory.clear_memory()
77 |
78 | if name == '1':
79 | scores.append(score)
80 | avg_score = np.mean(scores[-100:])
81 | print('{} episode {} thread {} of {} steps {:.2f}M score {:.2f} '
82 | 'intrinsic_reward {:.2f} avg score (100) {:.1f}'.format(
83 | algo, episode, name, n_threads,
84 | t_steps/1e6, score,
85 | T.sum(intrinsic_reward),
86 | avg_score))
87 | episode += 1
88 | if name == '1':
89 | x = [z for z in range(episode)]
90 | fname = algo + '_CartPole_no_rewards.png'
91 | plot_learning_curve(x, scores, fname)
92 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/Torch-LunarLander-alpha000025-beta00025-400-300.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/Torch-LunarLander-alpha000025-beta00025-400-300.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/main_torch.py:
--------------------------------------------------------------------------------
1 | from ddpg_torch import Agent
2 | import gym
3 | import numpy as np
4 | from utils import plotLearning
5 |
6 | env = gym.make('LunarLanderContinuous-v2')
7 | agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001, env=env,
8 | batch_size=64, layer1_size=400, layer2_size=300, n_actions=2)
9 |
10 | #agent.load_models()
11 | np.random.seed(0)
12 |
13 | score_history = []
14 | for i in range(1000):
15 | obs = env.reset()
16 | done = False
17 | score = 0
18 | while not done:
19 | act = agent.choose_action(obs)
20 | new_state, reward, done, info = env.step(act)
21 | agent.remember(obs, act, reward, new_state, int(done))
22 | agent.learn()
23 | score += reward
24 | obs = new_state
25 | #env.render()
26 | score_history.append(score)
27 |
28 | #if i % 25 == 0:
29 | # agent.save_models()
30 |
31 | print('episode ', i, 'score %.2f' % score,
32 | 'trailing 100 games avg %.3f' % np.mean(score_history[-100:]))
33 |
34 | filename = 'LunarLander-alpha000025-beta00025-400-300.png'
35 | plotLearning(score_history, filename, window=100)
36 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | def plotLearning(scores, filename, x=None, window=5):
5 | N = len(scores)
6 | running_avg = np.empty(N)
7 | for t in range(N):
8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)])
9 | if x is None:
10 | x = [i for i in range(N)]
11 | plt.ylabel('Score')
12 | plt.xlabel('Game')
13 | plt.plot(x, running_avg)
14 | plt.savefig(filename)
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/pendulum/main_tf.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import numpy as np
4 | from ddpg_orig_tf import Agent
5 | from utils import plotLearning
6 |
7 | # Uncomment the lines below to specify which gpu to run on
8 | #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
9 | #os.environ["CUDA_VISIBLE_DEVICES"] = "0"
10 |
11 | if __name__ == '__main__':
12 | env = gym.make('Pendulum-v0')
13 | agent = Agent(alpha=0.00005, beta=0.0005, input_dims=[3], tau=0.001,
14 | env=env, batch_size=64, layer1_size=800, layer2_size=600,
15 | n_actions=1)
16 | np.random.seed(0)
17 | score_history = []
18 | for i in range(1000):
19 | obs = env.reset()
20 | done = False
21 | score = 0
22 | while not done:
23 | act = agent.choose_action(obs)
24 | new_state, reward, done, info = env.step(act)
25 | agent.remember(obs, act, reward, new_state, int(done))
26 | agent.learn()
27 | score += reward
28 | obs = new_state
29 | #env.render()
30 | score_history.append(score)
31 | print('episode ', i, 'score %.2f' % score,
32 | 'trailing 100 games avg %.3f' % np.mean(score_history[-100:]))
33 |
34 | filename = 'Pendulum-alpha00005-beta0005-800-600-optimized.png'
35 | plotLearning(score_history, filename, window=100)
36 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/pendulum/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | def plotLearning(scores, filename, x=None, window=5):
5 | N = len(scores)
6 | running_avg = np.empty(N)
7 | for t in range(N):
8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)])
9 | if x is None:
10 | x = [i for i in range(N)]
11 | plt.ylabel('Score')
12 | plt.xlabel('Game')
13 | plt.plot(x, running_avg)
14 | plt.savefig(filename)
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/main_tf.py:
--------------------------------------------------------------------------------
1 | from ddpg_orig_tf import Agent
2 | import gym
3 | import numpy as np
4 | from utils import plotLearning
5 | from gym import wrappers
6 | import os
7 |
8 | #tf.set_random_seed(0)
9 | if __name__ == '__main__':
10 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
11 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
12 |
13 | env = gym.make('BipedalWalker-v2')
14 | agent = Agent(alpha=0.00005, beta=0.0005, input_dims=[24], tau=0.001, env=env,
15 | batch_size=64, layer1_size=400, layer2_size=300, n_actions=4,
16 | chkpt_dir='tmp/ddpg')
17 | np.random.seed(0)
18 | #agent.load_models()
19 | #env = wrappers.Monitor(env, "tmp/walker2d",
20 | # video_callable=lambda episode_id: True, force=True)
21 | score_history = []
22 | for i in range(5000):
23 | obs = env.reset()
24 | done = False
25 | score = 0
26 | while not done:
27 | act = agent.choose_action(obs)
28 | new_state, reward, done, info = env.step(act)
29 | agent.remember(obs, act, reward, new_state, int(done))
30 | agent.learn()
31 | score += reward
32 | obs = new_state
33 | env.render()
34 | score_history.append(score)
35 | print('episode ', i, 'score %.2f' % score,
36 | 'trailing 100 games avg %.3f' % np.mean(score_history[-100:]))
37 | if i % 25 == 0:
38 | agent.save_models()
39 | filename = 'WalkerTF-alpha00005-beta0005-400-300-original-5000games-testing.png'
40 | plotLearning(score_history, filename, window=100)
41 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.data-00000-of-00001
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.index
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.meta
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.data-00000-of-00001
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.index
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.meta
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.data-00000-of-00001
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.index
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.meta
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.data-00000-of-00001
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.index
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.meta
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class ReplayBuffer:
4 | def __init__(self, max_size, input_shape, n_actions):
5 | self.mem_size = max_size
6 | self.mem_cntr = 0
7 | self.state_memory = np.zeros((self.mem_size, *input_shape))
8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape))
9 | self.action_memory = np.zeros((self.mem_size, n_actions))
10 | self.reward_memory = np.zeros(self.mem_size)
11 |         self.terminal_memory = np.zeros(self.mem_size, dtype=bool)  # np.bool was removed in newer NumPy; the builtin bool is equivalent
12 |
13 | def store_transition(self, state, action, reward, state_, done):
14 | index = self.mem_cntr % self.mem_size
15 |
16 | self.state_memory[index] = state
17 | self.new_state_memory[index] = state_
18 | self.action_memory[index] = action
19 | self.reward_memory[index] = reward
20 | self.terminal_memory[index] = done
21 |
22 | self.mem_cntr += 1
23 |
24 | def sample_buffer(self, batch_size):
25 | max_mem = min(self.mem_cntr, self.mem_size)
26 |
27 | batch = np.random.choice(max_mem, batch_size, replace=False)
28 |
29 | states = self.state_memory[batch]
30 | states_ = self.new_state_memory[batch]
31 | actions = self.action_memory[batch]
32 | rewards = self.reward_memory[batch]
33 | dones = self.terminal_memory[batch]
34 |
35 | return states, actions, rewards, states_, dones
36 |
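
A minimal usage sketch for the ReplayBuffer above, assuming Pendulum-like shapes (3-dimensional observation, one action): it shows the modulo write index wrapping around and the constraint that comes from sampling with replace=False.

# Minimal sketch: store more transitions than max_size, then sample a batch.
import numpy as np
from buffer import ReplayBuffer

buffer = ReplayBuffer(max_size=5, input_shape=(3,), n_actions=1)
for step in range(7):  # 7 > max_size, so the oldest two entries are overwritten
    buffer.store_transition(state=np.random.randn(3),
                            action=np.random.uniform(-2, 2, (1,)),
                            reward=0.0,
                            state_=np.random.randn(3),
                            done=False)

# replace=False requires batch_size <= min(mem_cntr, mem_size); agents typically
# skip learning until at least batch_size transitions have been stored.
states, actions, rewards, states_, dones = buffer.sample_buffer(batch_size=4)
print(states.shape, actions.shape)  # (4, 3) (4, 1)
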
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/main_ddpg.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from ddpg_tf2 import Agent
4 | from utils import plot_learning_curve
5 |
6 | if __name__ == '__main__':
7 | env = gym.make('Pendulum-v0')
8 | agent = Agent(input_dims=env.observation_space.shape, env=env,
9 | n_actions=env.action_space.shape[0])
10 | n_games = 250
11 |
12 | figure_file = 'plots/pendulum.png'
13 |
14 | best_score = env.reward_range[0]
15 | score_history = []
16 | load_checkpoint = False
17 |
18 | if load_checkpoint:
19 | n_steps = 0
20 |         while n_steps <= agent.batch_size:  # gather enough random transitions to sample one batch
21 | observation = env.reset()
22 | action = env.action_space.sample()
23 | observation_, reward, done, info = env.step(action)
24 | agent.remember(observation, action, reward, observation_, done)
25 | n_steps += 1
26 |         agent.learn()  # a single learn() call builds the Keras models so load_models() can restore their weights
27 | agent.load_models()
28 | evaluate = True
29 | else:
30 | evaluate = False
31 |
32 | for i in range(n_games):
33 | observation = env.reset()
34 | done = False
35 | score = 0
36 | while not done:
37 | action = agent.choose_action(observation, evaluate)
38 | observation_, reward, done, info = env.step(action)
39 | score += reward
40 | agent.remember(observation, action, reward, observation_, done)
41 | if not load_checkpoint:
42 | agent.learn()
43 | observation = observation_
44 |
45 | score_history.append(score)
46 | avg_score = np.mean(score_history[-100:])
47 |
48 | if avg_score > best_score:
49 | best_score = avg_score
50 | if not load_checkpoint:
51 | agent.save_models()
52 |
53 | print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score)
54 |
55 | if not load_checkpoint:
56 | x = [i+1 for i in range(n_games)]
57 | plot_learning_curve(x, score_history, figure_file)
58 |
59 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/networks.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow as tf
3 | import tensorflow.keras as keras
4 | from tensorflow.keras.layers import Dense
5 |
6 | class CriticNetwork(keras.Model):
7 | def __init__(self, fc1_dims=512, fc2_dims=512,
8 | name='critic', chkpt_dir='tmp/ddpg'):
9 | super(CriticNetwork, self).__init__()
10 | self.fc1_dims = fc1_dims
11 | self.fc2_dims = fc2_dims
12 |
13 | self.model_name = name
14 | self.checkpoint_dir = chkpt_dir
15 | self.checkpoint_file = os.path.join(self.checkpoint_dir,
16 | self.model_name+'_ddpg.h5')
17 |
18 | self.fc1 = Dense(self.fc1_dims, activation='relu')
19 | self.fc2 = Dense(self.fc2_dims, activation='relu')
20 | self.q = Dense(1, activation=None)
21 |
22 | def call(self, state, action):
23 | action_value = self.fc1(tf.concat([state, action], axis=1))
24 | action_value = self.fc2(action_value)
25 |
26 | q = self.q(action_value)
27 |
28 | return q
29 |
30 | class ActorNetwork(keras.Model):
31 | def __init__(self, fc1_dims=512, fc2_dims=512, n_actions=2, name='actor',
32 | chkpt_dir='tmp/ddpg'):
33 | super(ActorNetwork, self).__init__()
34 | self.fc1_dims = fc1_dims
35 | self.fc2_dims = fc2_dims
36 | self.n_actions = n_actions
37 |
38 | self.model_name = name
39 | self.checkpoint_dir = chkpt_dir
40 | self.checkpoint_file = os.path.join(self.checkpoint_dir,
41 | self.model_name+'_ddpg.h5')
42 |
43 | self.fc1 = Dense(self.fc1_dims, activation='relu')
44 | self.fc2 = Dense(self.fc2_dims, activation='relu')
45 | self.mu = Dense(self.n_actions, activation='tanh')
46 |
47 | def call(self, state):
48 | prob = self.fc1(state)
49 | prob = self.fc2(prob)
50 |
51 | mu = self.mu(prob)
52 |
53 | return mu
54 |
55 |
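
The ActorNetwork above ends in a tanh layer, so mu is bounded to [-1, 1], while Pendulum-v0 expects torques in [-2, 2]. The rescaling is done in the agent (ddpg_tf2.py, which is not part of this listing), so the snippet below is only a sketch of that step with hypothetical names.

# Sketch: rescale a tanh-bounded action to the environment's action range.
import tensorflow as tf

def scale_action(mu, max_action):
    """Map an action in [-1, 1] onto [-max_action, max_action]."""
    return mu * max_action

max_action = 2.0  # env.action_space.high[0] for Pendulum-v0
mu = tf.constant([[0.5], [-1.0]], dtype=tf.float32)
print(scale_action(mu, max_action).numpy())  # [[ 1.] [-2.]]
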
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/pendulum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/pendulum.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/PPO/tf2/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from agent import Agent
4 | from utils import plot_learning_curve
5 |
6 | if __name__ == '__main__':
7 | env = gym.make('CartPole-v0')
8 | N = 20
9 | batch_size = 5
10 | n_epochs = 4
11 | alpha = 0.0003
12 | agent = Agent(n_actions=env.action_space.n, batch_size=batch_size,
13 | alpha=alpha, n_epochs=n_epochs,
14 | input_dims=env.observation_space.shape)
15 | n_games = 300
16 |
17 | figure_file = 'plots/cartpole.png'
18 |
19 | best_score = env.reward_range[0]
20 | score_history = []
21 |
22 | learn_iters = 0
23 | avg_score = 0
24 | n_steps = 0
25 |
26 | for i in range(n_games):
27 | observation = env.reset()
28 | done = False
29 | score = 0
30 | while not done:
31 | action, prob, val = agent.choose_action(observation)
32 | observation_, reward, done, info = env.step(action)
33 | n_steps += 1
34 | score += reward
35 | agent.store_transition(observation, action,
36 | prob, val, reward, done)
37 | if n_steps % N == 0:
38 | agent.learn()
39 | learn_iters += 1
40 | observation = observation_
41 | score_history.append(score)
42 | avg_score = np.mean(score_history[-100:])
43 |
44 | if avg_score > best_score:
45 | best_score = avg_score
46 | agent.save_models()
47 |
48 | print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score,
49 | 'time_steps', n_steps, 'learning_steps', learn_iters)
50 | x = [i+1 for i in range(len(score_history))]
51 | plot_learning_curve(x, score_history, figure_file)
52 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/PPO/tf2/memory.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class PPOMemory:
5 | def __init__(self, batch_size):
6 | self.states = []
7 | self.probs = []
8 | self.vals = []
9 | self.actions = []
10 | self.rewards = []
11 | self.dones = []
12 |
13 | self.batch_size = batch_size
14 |
15 | def generate_batches(self):
16 | n_states = len(self.states)
17 | batch_start = np.arange(0, n_states, self.batch_size)
18 | indices = np.arange(n_states, dtype=np.int64)
19 | np.random.shuffle(indices)
20 | batches = [indices[i:i+self.batch_size] for i in batch_start]
21 |
22 | return np.array(self.states),\
23 | np.array(self.actions),\
24 | np.array(self.probs),\
25 | np.array(self.vals),\
26 | np.array(self.rewards),\
27 | np.array(self.dones),\
28 | batches
29 |
30 | def store_memory(self, state, action, probs, vals, reward, done):
31 | self.states.append(state)
32 | self.actions.append(action)
33 | self.probs.append(probs)
34 | self.vals.append(vals)
35 | self.rewards.append(reward)
36 | self.dones.append(done)
37 |
38 | def clear_memory(self):
39 | self.states = []
40 | self.probs = []
41 | self.actions = []
42 | self.rewards = []
43 | self.dones = []
44 | self.vals = []
45 |
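
A worked example of the index bookkeeping in generate_batches (hypothetical sizes): batch_start marks the minibatch boundaries and the shuffled indices are sliced into minibatches, the last of which may be smaller than batch_size.

# Worked example of the minibatch indexing used by PPOMemory.generate_batches.
import numpy as np

n_states, batch_size = 7, 3
batch_start = np.arange(0, n_states, batch_size)  # [0 3 6]
indices = np.arange(n_states, dtype=np.int64)
np.random.shuffle(indices)                        # e.g. [4 0 6 2 5 1 3]
batches = [indices[i:i + batch_size] for i in batch_start]
print(batches)  # e.g. [array([4, 0, 6]), array([2, 5, 1]), array([3])]
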
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/PPO/tf2/networks.py:
--------------------------------------------------------------------------------
1 | import tensorflow.keras as keras
2 | from tensorflow.keras.layers import Dense
3 |
4 |
5 | class ActorNetwork(keras.Model):
6 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256):
7 | super(ActorNetwork, self).__init__()
8 |
9 | self.fc1 = Dense(fc1_dims, activation='relu')
10 | self.fc2 = Dense(fc2_dims, activation='relu')
11 | self.fc3 = Dense(n_actions, activation='softmax')
12 |
13 | def call(self, state):
14 | x = self.fc1(state)
15 | x = self.fc2(x)
16 | x = self.fc3(x)
17 |
18 | return x
19 |
20 |
21 | class CriticNetwork(keras.Model):
22 | def __init__(self, fc1_dims=256, fc2_dims=256):
23 | super(CriticNetwork, self).__init__()
24 | self.fc1 = Dense(fc1_dims, activation='relu')
25 | self.fc2 = Dense(fc2_dims, activation='relu')
26 | self.q = Dense(1, activation=None)
27 |
28 | def call(self, state):
29 | x = self.fc1(state)
30 | x = self.fc2(x)
31 | q = self.q(x)
32 |
33 | return q
34 |
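
The ActorNetwork above outputs a softmax over the discrete actions. The agent (agent.py, not included in this section) typically wraps that output in a categorical distribution to sample an action and keep its log-probability for the PPO ratio; the snippet below is only a sketch of that step.

# Sketch: turn the actor's softmax output into a sampled action and its log-prob.
import tensorflow as tf
import tensorflow_probability as tfp
from networks import ActorNetwork

actor = ActorNetwork(n_actions=2)
state = tf.random.normal((1, 4))      # CartPole observations are 4-dimensional
probs = actor(state)                  # shape (1, 2); each row sums to 1

dist = tfp.distributions.Categorical(probs=probs)
action = dist.sample()
log_prob = dist.log_prob(action)
print(int(action[0]), float(log_prob[0]))
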
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/PPO/tf2/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/PPO/torch/Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/PPO/torch/Slides.pdf
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/PPO/torch/cartpole.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/PPO/torch/cartpole.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/PPO/torch/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from ppo_torch import Agent
4 | from utils import plot_learning_curve
5 |
6 | if __name__ == '__main__':
7 | env = gym.make('CartPole-v0')
8 | N = 20
9 | batch_size = 5
10 | n_epochs = 4
11 | alpha = 0.0003
12 | agent = Agent(n_actions=env.action_space.n, batch_size=batch_size,
13 | alpha=alpha, n_epochs=n_epochs,
14 | input_dims=env.observation_space.shape)
15 | n_games = 300
16 |
17 | figure_file = 'plots/cartpole.png'
18 |
19 | best_score = env.reward_range[0]
20 | score_history = []
21 |
22 | learn_iters = 0
23 | avg_score = 0
24 | n_steps = 0
25 |
26 | for i in range(n_games):
27 | observation = env.reset()
28 | done = False
29 | score = 0
30 | while not done:
31 | action, prob, val = agent.choose_action(observation)
32 | observation_, reward, done, info = env.step(action)
33 | n_steps += 1
34 | score += reward
35 | agent.remember(observation, action, prob, val, reward, done)
36 | if n_steps % N == 0:
37 | agent.learn()
38 | learn_iters += 1
39 | observation = observation_
40 | score_history.append(score)
41 | avg_score = np.mean(score_history[-100:])
42 |
43 | if avg_score > best_score:
44 | best_score = avg_score
45 | agent.save_models()
46 |
47 | print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score,
48 | 'time_steps', n_steps, 'learning_steps', learn_iters)
49 | x = [i+1 for i in range(len(score_history))]
50 | plot_learning_curve(x, score_history, figure_file)
51 |
52 |
53 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/PPO/torch/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/SAC/buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class ReplayBuffer():
4 | def __init__(self, max_size, input_shape, n_actions):
5 | self.mem_size = max_size
6 | self.mem_cntr = 0
7 | self.state_memory = np.zeros((self.mem_size, *input_shape))
8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape))
9 | self.action_memory = np.zeros((self.mem_size, n_actions))
10 | self.reward_memory = np.zeros(self.mem_size)
11 |         self.terminal_memory = np.zeros(self.mem_size, dtype=bool)  # np.bool was removed in newer NumPy; the builtin bool is equivalent
12 |
13 | def store_transition(self, state, action, reward, state_, done):
14 | index = self.mem_cntr % self.mem_size
15 |
16 | self.state_memory[index] = state
17 | self.new_state_memory[index] = state_
18 | self.action_memory[index] = action
19 | self.reward_memory[index] = reward
20 | self.terminal_memory[index] = done
21 |
22 | self.mem_cntr += 1
23 |
24 | def sample_buffer(self, batch_size):
25 | max_mem = min(self.mem_cntr, self.mem_size)
26 |
27 |         batch = np.random.choice(max_mem, batch_size)  # samples with replacement; pass replace=False to avoid duplicate transitions in a batch
28 |
29 | states = self.state_memory[batch]
30 | states_ = self.new_state_memory[batch]
31 | actions = self.action_memory[batch]
32 | rewards = self.reward_memory[batch]
33 | dones = self.terminal_memory[batch]
34 |
35 | return states, actions, rewards, states_, dones
36 |
37 |
38 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/SAC/main_sac.py:
--------------------------------------------------------------------------------
1 | import pybullet_envs
2 | import gym
3 | import numpy as np
4 | from sac_torch import Agent
5 | from utils import plot_learning_curve
6 | from gym import wrappers
7 |
8 | if __name__ == '__main__':
9 | env = gym.make('InvertedPendulumBulletEnv-v0')
10 | agent = Agent(input_dims=env.observation_space.shape, env=env,
11 | n_actions=env.action_space.shape[0])
12 | n_games = 250
13 | # uncomment this line and do a mkdir tmp && mkdir video if you want to
14 | # record video of the agent playing the game.
15 | #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True)
16 | filename = 'inverted_pendulum.png'
17 |
18 | figure_file = 'plots/' + filename
19 |
20 | best_score = env.reward_range[0]
21 | score_history = []
22 | load_checkpoint = False
23 |
24 | if load_checkpoint:
25 | agent.load_models()
26 | env.render(mode='human')
27 |
28 | for i in range(n_games):
29 | observation = env.reset()
30 | done = False
31 | score = 0
32 | while not done:
33 | action = agent.choose_action(observation)
34 | observation_, reward, done, info = env.step(action)
35 | score += reward
36 | agent.remember(observation, action, reward, observation_, done)
37 | if not load_checkpoint:
38 | agent.learn()
39 | observation = observation_
40 | score_history.append(score)
41 | avg_score = np.mean(score_history[-100:])
42 |
43 | if avg_score > best_score:
44 | best_score = avg_score
45 | if not load_checkpoint:
46 | agent.save_models()
47 |
48 | print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score)
49 |
50 | if not load_checkpoint:
51 | x = [i+1 for i in range(n_games)]
52 | plot_learning_curve(x, score_history, figure_file)
53 |
54 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/SAC/tf2/Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/SAC/tf2/Slides.pdf
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/SAC/tf2/buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class ReplayBuffer:
4 | def __init__(self, max_size, input_shape, n_actions):
5 | self.mem_size = max_size
6 | self.mem_cntr = 0
7 | self.state_memory = np.zeros((self.mem_size, *input_shape))
8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape))
9 | self.action_memory = np.zeros((self.mem_size, n_actions))
10 | self.reward_memory = np.zeros(self.mem_size)
11 |         self.terminal_memory = np.zeros(self.mem_size, dtype=bool)  # np.bool was removed in newer NumPy; the builtin bool is equivalent
12 |
13 | def store_transition(self, state, action, reward, state_, done):
14 | index = self.mem_cntr % self.mem_size
15 |
16 | self.state_memory[index] = state
17 | self.new_state_memory[index] = state_
18 | self.action_memory[index] = action
19 | self.reward_memory[index] = reward
20 | self.terminal_memory[index] = done
21 |
22 | self.mem_cntr += 1
23 |
24 | def sample_buffer(self, batch_size):
25 | max_mem = min(self.mem_cntr, self.mem_size)
26 |
27 |         batch = np.random.choice(max_mem, batch_size)  # samples with replacement; pass replace=False to avoid duplicate transitions in a batch
28 |
29 | states = self.state_memory[batch]
30 | states_ = self.new_state_memory[batch]
31 | actions = self.action_memory[batch]
32 | rewards = self.reward_memory[batch]
33 | dones = self.terminal_memory[batch]
34 |
35 | return states, actions, rewards, states_, dones
36 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/SAC/tf2/main_sac.py:
--------------------------------------------------------------------------------
1 | import pybullet_envs
2 | import gym
3 | import numpy as np
4 | from sac_tf2 import Agent
5 | from utils import plot_learning_curve
6 | from gym import wrappers
7 |
8 | if __name__ == '__main__':
9 | env = gym.make('InvertedPendulumBulletEnv-v0')
10 | agent = Agent(input_dims=env.observation_space.shape, env=env,
11 | n_actions=env.action_space.shape[0])
12 | n_games = 250
13 | # uncomment this line and do a mkdir tmp && mkdir tmp/video if you want to
14 | # record video of the agent playing the game.
15 | #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True)
16 | filename = 'inverted_pendulum.png'
17 |
18 | figure_file = 'plots/' + filename
19 |
20 | best_score = env.reward_range[0]
21 | score_history = []
22 |     load_checkpoint = True  # True runs in evaluation mode: saved weights are loaded and learn() is skipped
23 |
24 | if load_checkpoint:
25 | agent.load_models()
26 | env.render(mode='human')
27 |
28 | for i in range(n_games):
29 | observation = env.reset()
30 | done = False
31 | score = 0
32 | while not done:
33 | action = agent.choose_action(observation)
34 | observation_, reward, done, info = env.step(action)
35 | score += reward
36 | agent.remember(observation, action, reward, observation_, done)
37 | if not load_checkpoint:
38 | agent.learn()
39 | observation = observation_
40 | score_history.append(score)
41 | avg_score = np.mean(score_history[-100:])
42 |
43 | if avg_score > best_score:
44 | best_score = avg_score
45 | if not load_checkpoint:
46 | agent.save_models()
47 |
48 | print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score)
49 |
50 | if not load_checkpoint:
51 | x = [i+1 for i in range(n_games)]
52 | plot_learning_curve(x, score_history, figure_file)
53 |
54 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/SAC/tf2/networks.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import tensorflow as tf
4 | import tensorflow.keras as keras
5 | import tensorflow_probability as tfp
6 | from tensorflow.keras.layers import Dense
7 |
8 | class CriticNetwork(keras.Model):
9 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256,
10 | name='critic', chkpt_dir='tmp/sac'):
11 | super(CriticNetwork, self).__init__()
12 | self.fc1_dims = fc1_dims
13 | self.fc2_dims = fc2_dims
14 | self.n_actions = n_actions
15 | self.model_name = name
16 | self.checkpoint_dir = chkpt_dir
17 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac')
18 |
19 | self.fc1 = Dense(self.fc1_dims, activation='relu')
20 | self.fc2 = Dense(self.fc2_dims, activation='relu')
21 | self.q = Dense(1, activation=None)
22 |
23 | def call(self, state, action):
24 | action_value = self.fc1(tf.concat([state, action], axis=1))
25 | action_value = self.fc2(action_value)
26 |
27 | q = self.q(action_value)
28 |
29 | return q
30 |
31 | class ValueNetwork(keras.Model):
32 | def __init__(self, fc1_dims=256, fc2_dims=256,
33 | name='value', chkpt_dir='tmp/sac'):
34 | super(ValueNetwork, self).__init__()
35 | self.fc1_dims = fc1_dims
36 | self.fc2_dims = fc2_dims
37 | self.model_name = name
38 | self.checkpoint_dir = chkpt_dir
39 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac')
40 |
41 | self.fc1 = Dense(self.fc1_dims, activation='relu')
42 | self.fc2 = Dense(fc2_dims, activation='relu')
43 | self.v = Dense(1, activation=None)
44 |
45 | def call(self, state):
46 | state_value = self.fc1(state)
47 | state_value = self.fc2(state_value)
48 |
49 | v = self.v(state_value)
50 |
51 | return v
52 |
53 | class ActorNetwork(keras.Model):
54 | def __init__(self, max_action, fc1_dims=256,
55 | fc2_dims=256, n_actions=2, name='actor', chkpt_dir='tmp/sac'):
56 | super(ActorNetwork, self).__init__()
57 | self.fc1_dims = fc1_dims
58 | self.fc2_dims = fc2_dims
59 | self.n_actions = n_actions
60 | self.model_name = name
61 | self.checkpoint_dir = chkpt_dir
62 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac')
63 | self.max_action = max_action
64 | self.noise = 1e-6
65 |
66 | self.fc1 = Dense(self.fc1_dims, activation='relu')
67 | self.fc2 = Dense(self.fc2_dims, activation='relu')
68 | self.mu = Dense(self.n_actions, activation=None)
69 | self.sigma = Dense(self.n_actions, activation=None)
70 |
71 | def call(self, state):
72 | prob = self.fc1(state)
73 | prob = self.fc2(prob)
74 |
75 | mu = self.mu(prob)
76 | sigma = self.sigma(prob)
77 |         # clamp sigma to (noise, 1]; might revisit this, but a standard deviation
78 |         # near zero makes the squashed log-probabilities numerically unstable
79 |         sigma = tf.clip_by_value(sigma, self.noise, 1)
80 |
81 | return mu, sigma
82 |
83 | def sample_normal(self, state, reparameterize=True):
84 | mu, sigma = self.call(state)
85 | probabilities = tfp.distributions.Normal(mu, sigma)
86 |
87 | if reparameterize:
88 |             actions = probabilities.sample()  # tfp's Normal.sample() is already reparameterized; add exploration noise here if desired
89 | else:
90 | actions = probabilities.sample()
91 |
92 | action = tf.math.tanh(actions)*self.max_action
93 | log_probs = probabilities.log_prob(actions)
94 |         log_probs -= tf.math.log(1-tf.math.pow(tf.math.tanh(actions), 2)+self.noise)  # squash correction uses the unscaled tanh, not the max_action-scaled action
95 | log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True)
96 |
97 | return action, log_probs
98 |
99 |
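
For reference, the correction in sample_normal above implements the change of variables for a tanh-squashed Gaussian: log pi(a) = log N(u; mu, sigma) - sum_i log(1 - tanh(u_i)^2 + eps). The snippet below is only a numeric sanity check with made-up mu and sigma (and it assumes max_action = 1, as for the inverted pendulum).

# Sanity check of the tanh-squashing log-prob correction (hypothetical mu/sigma).
import tensorflow as tf
import tensorflow_probability as tfp

mu = tf.constant([[0.0]])
sigma = tf.constant([[0.5]])
dist = tfp.distributions.Normal(mu, sigma)

u = dist.sample()                      # pre-squash sample
a = tf.math.tanh(u)                    # bounded action in (-1, 1)
log_prob = dist.log_prob(u) - tf.math.log(1.0 - tf.math.pow(a, 2) + 1e-6)
log_prob = tf.math.reduce_sum(log_prob, axis=1, keepdims=True)
print(a.numpy(), log_prob.numpy())
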
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/SAC/tf2/plots/inverted_pendulum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/SAC/tf2/plots/inverted_pendulum.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/SAC/tf2/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/SAC/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/TD3/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from td3_torch import Agent
4 | from utils import plot_learning_curve
5 |
6 | if __name__ == '__main__':
7 | env = gym.make('LunarLanderContinuous-v2')
8 | agent = Agent(alpha=0.001, beta=0.001,
9 | input_dims=env.observation_space.shape, tau=0.005,
10 | env=env, batch_size=100, layer1_size=400, layer2_size=300,
11 | n_actions=env.action_space.shape[0])
12 | n_games = 1000
13 | filename = 'plots/' + 'LunarLanderContinuous_' + str(n_games) + '_games.png'
14 |
15 | best_score = env.reward_range[0]
16 | score_history = []
17 |
18 |     agent.load_models()  # resumes from saved checkpoints; comment out to train from scratch
19 |
20 | for i in range(n_games):
21 | observation = env.reset()
22 | done = False
23 | score = 0
24 | while not done:
25 | action = agent.choose_action(observation)
26 | observation_, reward, done, info = env.step(action)
27 | agent.remember(observation, action, reward, observation_, done)
28 | agent.learn()
29 | score += reward
30 | observation = observation_
31 | score_history.append(score)
32 | avg_score = np.mean(score_history[-100:])
33 |
34 | if avg_score > best_score:
35 | best_score = avg_score
36 | agent.save_models()
37 |
38 | print('episode ', i, 'score %.1f' % score,
39 | 'average score %.1f' % avg_score)
40 |
41 | x = [i+1 for i in range(n_games)]
42 | plot_learning_curve(x, score_history, filename)
43 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/TD3/tf2/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from td3_tf2 import Agent
4 | from utils import plot_learning_curve
5 |
6 | if __name__ == '__main__':
7 | #env = gym.make('LunarLanderContinuous-v2')
8 | #env = gym.make('Pendulum-v0')
9 | env = gym.make('BipedalWalker-v2')
10 | agent = Agent(alpha=0.001, beta=0.001,
11 | input_dims=env.observation_space.shape, tau=0.005,
12 | env=env, batch_size=100, layer1_size=400, layer2_size=300,
13 | n_actions=env.action_space.shape[0])
14 | n_games = 1000
15 | filename = 'plots/' + 'walker_' + str(n_games) + '_games.png'
16 |
17 | best_score = env.reward_range[0]
18 | score_history = []
19 |
20 | #agent.load_models()
21 |
22 | for i in range(n_games):
23 | observation = env.reset()
24 | done = False
25 | score = 0
26 | while not done:
27 | action = agent.choose_action(observation)
28 | observation_, reward, done, info = env.step(action)
29 | agent.remember(observation, action, reward, observation_, done)
30 | agent.learn()
31 | score += reward
32 | observation = observation_
33 | score_history.append(score)
34 | avg_score = np.mean(score_history[-100:])
35 |
36 | if avg_score > best_score:
37 | best_score = avg_score
38 | agent.save_models()
39 |
40 | print('episode ', i, 'score %.1f' % score,
41 | 'average score %.1f' % avg_score)
42 |
43 | x = [i+1 for i in range(n_games)]
44 | plot_learning_curve(x, score_history, filename)
45 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/TD3/tf2/plots/walker_1500_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/TD3/tf2/plots/walker_1500_games.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/TD3/tf2/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/TD3/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/actor_critic_keras.py:
--------------------------------------------------------------------------------
1 | from keras import backend as K
2 | from keras.layers import Dense, Activation, Input
3 | from keras.models import Model, load_model
4 | from keras.optimizers import Adam
5 | import numpy as np
6 |
7 | class Agent(object):
8 | def __init__(self, alpha, beta, gamma=0.99, n_actions=4,
9 | layer1_size=1024, layer2_size=512, input_dims=8):
10 | self.gamma = gamma
11 | self.alpha = alpha
12 | self.beta = beta
13 | self.input_dims = input_dims
14 | self.fc1_dims = layer1_size
15 | self.fc2_dims = layer2_size
16 | self.n_actions = n_actions
17 |
18 | self.actor, self.critic, self.policy = self.build_actor_critic_network()
19 | self.action_space = [i for i in range(n_actions)]
20 |
21 | def build_actor_critic_network(self):
22 | input = Input(shape=(self.input_dims,))
23 | delta = Input(shape=[1])
24 | dense1 = Dense(self.fc1_dims, activation='relu')(input)
25 | dense2 = Dense(self.fc2_dims, activation='relu')(dense1)
26 | probs = Dense(self.n_actions, activation='softmax')(dense2)
27 | values = Dense(1, activation='linear')(dense2)
28 |
29 | def custom_loss(y_true, y_pred):
30 | out = K.clip(y_pred, 1e-8, 1-1e-8)
31 | log_lik = y_true*K.log(out)
32 |
33 | return K.sum(-log_lik*delta)
34 |
35 | actor = Model(input=[input, delta], output=[probs])
36 |
37 | actor.compile(optimizer=Adam(lr=self.alpha), loss=custom_loss)
38 |
39 | critic = Model(input=[input], output=[values])
40 |
41 | critic.compile(optimizer=Adam(lr=self.beta), loss='mean_squared_error')
42 |
43 | policy = Model(input=[input], output=[probs])
44 |
45 | return actor, critic, policy
46 |
47 | def choose_action(self, observation):
48 | state = observation[np.newaxis, :]
49 | probabilities = self.policy.predict(state)[0]
50 | action = np.random.choice(self.action_space, p=probabilities)
51 |
52 | return action
53 |
54 | def learn(self, state, action, reward, state_, done):
55 | state = state[np.newaxis,:]
56 | state_ = state_[np.newaxis,:]
57 | critic_value_ = self.critic.predict(state_)
58 | critic_value = self.critic.predict(state)
59 |
60 | target = reward + self.gamma*critic_value_*(1-int(done))
61 | delta = target - critic_value
62 |
63 | actions = np.zeros([1, self.n_actions])
64 | actions[np.arange(1), action] = 1
65 |
66 | self.actor.fit([state, delta], actions, verbose=0)
67 |
68 | self.critic.fit(state, target, verbose=0)
69 |
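
Because the y_true passed to custom_loss is a one-hot encoding of the action taken, the summed term reduces to the familiar policy-gradient loss -log pi(a|s) * delta. A tiny numeric check with hypothetical probabilities and TD error:

# One-hot trick in custom_loss: -sum(y_true * log(y_pred)) * delta == -log(pi(a|s)) * delta.
import numpy as np

probs = np.array([0.2, 0.5, 0.2, 0.1])  # softmax output over 4 actions
action = 1                               # index of the action actually taken
delta = 0.8                              # TD error from the critic

y_true = np.zeros_like(probs)
y_true[action] = 1.0
clipped = np.clip(probs, 1e-8, 1 - 1e-8)
loss = np.sum(-y_true * np.log(clipped) * delta)
print(loss, -np.log(probs[action]) * delta)  # both ~0.5545
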
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/actor_critic_replay_torch.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 |
7 | class ReplayBuffer():
8 | def __init__(self, max_size, input_shape):
9 | self.mem_size = max_size
10 | self.mem_cntr = 0
11 | self.state_memory = np.zeros((self.mem_size, *input_shape),
12 | dtype=np.float32)
13 | self.new_state_memory = np.zeros((self.mem_size, *input_shape),
14 | dtype=np.float32)
15 | self.log_probs = np.zeros(self.mem_size, dtype=np.float32)
16 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
17 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.uint8)
18 |
19 | def store_transition(self, state, log_prob, reward, state_, done):
20 | index = self.mem_cntr % self.mem_size
21 | self.state_memory[index] = state
22 | self.new_state_memory[index] = state_
23 | self.log_probs[index] = log_prob
24 | self.reward_memory[index] = reward
25 | self.terminal_memory[index] = done
26 | self.mem_cntr += 1
27 |
28 | def sample_buffer(self, batch_size):
29 | max_mem = min(self.mem_cntr, self.mem_size)
30 | batch = np.random.choice(max_mem, batch_size, replace=False)
31 |
32 | states = self.state_memory[batch]
33 | probs = self.log_probs[batch]
34 | rewards = self.reward_memory[batch]
35 | states_ = self.new_state_memory[batch]
36 | terminal = self.terminal_memory[batch]
37 |
38 | return states, probs, rewards, states_, terminal
39 |
40 | class ActorCriticNetwork(nn.Module):
41 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims,
42 | n_actions):
43 | super(ActorCriticNetwork, self).__init__()
44 | self.input_dims = input_dims
45 | self.fc1_dims = fc1_dims
46 | self.fc2_dims = fc2_dims
47 | self.n_actions = n_actions
48 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
49 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
50 | self.pi = nn.Linear(self.fc2_dims, n_actions)
51 | self.v = nn.Linear(self.fc2_dims, 1)
52 | self.optimizer = optim.Adam(self.parameters(), lr=lr)
53 |
54 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')  # fall back to the CPU when CUDA is unavailable
55 | self.to(self.device)
56 |
57 | def forward(self, state):
58 | x = F.relu(self.fc1(state))
59 | x = F.relu(self.fc2(x))
60 | pi = self.pi(x)
61 | v = self.v(x)
62 | return (pi, v)
63 |
64 | class Agent():
65 | def __init__(self, lr, input_dims, n_actions, gamma=0.99,
66 | l1_size=256, l2_size=256, batch_size=32,
67 | mem_size=1000000):
68 | self.gamma = gamma
69 | self.batch_size = batch_size
70 | self.memory = ReplayBuffer(mem_size, input_dims)
71 | self.actor_critic = ActorCriticNetwork(lr, input_dims, l1_size,
72 | l2_size, n_actions=n_actions)
73 | self.log_probs = []
74 |
75 | def store_transition(self, state, prob, reward, state_, done):
76 | self.memory.store_transition(state, prob, reward, state_, done)
77 |
78 | def choose_action(self, observation):
79 | state = T.tensor([observation]).to(self.actor_critic.device)
80 | probabilities, _ = self.actor_critic.forward(state)
81 |         probabilities = F.softmax(probabilities, dim=1)  # explicit dim avoids the implicit-dimension deprecation warning
82 | action_probs = T.distributions.Categorical(probabilities)
83 | action = action_probs.sample()
84 | log_probs = action_probs.log_prob(action)
85 |
86 | return action.item(), log_probs
87 |
88 | def learn(self):
89 | if self.memory.mem_cntr < self.batch_size:
90 | return
91 | self.actor_critic.optimizer.zero_grad()
92 |
93 | state, prob, reward, new_state, done = \
94 | self.memory.sample_buffer(self.batch_size)
95 |
96 | states = T.tensor(state).to(self.actor_critic.device)
97 | probs = T.tensor(prob).to(self.actor_critic.device)
98 | rewards = T.tensor(reward).to(self.actor_critic.device)
99 |         dones = T.tensor(done, dtype=T.bool).to(self.actor_critic.device)  # boolean mask for terminal states
100 | states_ = T.tensor(new_state).to(self.actor_critic.device)
101 |
102 |         _, critic_value_ = self.actor_critic.forward(states_)
103 |         _, critic_value = self.actor_critic.forward(states)
104 |         critic_value_, critic_value = critic_value_.squeeze(1), critic_value.squeeze(1)  # [batch, 1] -> [batch] so the targets broadcast correctly
105 |         critic_value_[dones] = 0.0
106 |
107 | delta = rewards + self.gamma*critic_value_
108 |
109 | actor_loss = -T.mean(probs*(delta-critic_value))
110 | critic_loss = F.mse_loss(delta, critic_value)
111 |
112 | (actor_loss + critic_loss).backward()
113 |
114 | self.actor_critic.optimizer.step()
115 |
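
To make the targets in learn() concrete, here is a worked example with made-up numbers for a batch of three transitions (gamma = 0.99): V(s') is zeroed where done is True, delta = r + gamma * V(s') is the critic target, and delta - V(s) weights the stored log-probabilities in the actor loss.

# Worked example of the one-step targets used in Agent.learn() (hypothetical values).
import torch as T

gamma = 0.99
rewards = T.tensor([1.0, -1.0, 0.5])
critic_value_ = T.tensor([0.8, 0.3, 0.6])  # V(s'), already squeezed to shape [batch]
critic_value = T.tensor([0.7, 0.2, 0.4])   # V(s)
dones = T.tensor([False, True, False])

critic_value_[dones] = 0.0                 # no bootstrapping from terminal states
delta = rewards + gamma * critic_value_
advantage = delta - critic_value
print(delta)      # tensor([ 1.7920, -1.0000,  1.0940])
print(advantage)  # tensor([ 1.0920, -1.2000,  0.6940])
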
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/actor_critic/cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/continuous_mountain_car_actor_critic.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from actor_critic_continuous import Agent
4 | import matplotlib.pyplot as plt
5 | from utils import plotLearning
6 | from gym import wrappers
7 |
8 |
9 | if __name__ == '__main__':
10 | agent = Agent(alpha=0.000005, beta=0.00001, input_dims=[2], gamma=0.99,
11 | layer1_size=256, layer2_size=256)
12 |
13 | env = gym.make('MountainCarContinuous-v0')
14 | score_history = []
15 | num_episodes = 100
16 | for i in range(num_episodes):
17 | #env = wrappers.Monitor(env, "tmp/mountaincar-continuous-trained-1",
18 | # video_callable=lambda episode_id: True, force=True)
19 | done = False
20 | score = 0
21 | observation = env.reset()
22 | while not done:
23 | action = np.array(agent.choose_action(observation)).reshape((1,))
24 | observation_, reward, done, info = env.step(action)
25 | agent.learn(observation, reward, observation_, done)
26 | observation = observation_
27 | score += reward
28 | score_history.append(score)
29 | print('episode: ', i,'score: %.2f' % score)
30 | filename = 'mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png'
31 | plotLearning(score_history, filename=filename, window=20)
32 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/discrete_cartpole.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from actor_critic_discrete import Agent
4 | import matplotlib.pyplot as plt
5 | from utils import plotLearning
6 | from gym import wrappers
7 |
8 |
9 | if __name__ == '__main__':
10 | agent = Agent(alpha=0.0001, beta=0.0005, input_dims=[4], gamma=0.99,
11 | n_actions=2, layer1_size=32, layer2_size=32)
12 |
13 | env = gym.make('CartPole-v1')
14 | score_history = []
15 | score = 0
16 | num_episodes = 2500
17 | for i in range(num_episodes):
18 | print('episode: ', i,'score: %.3f' % score)
19 |
20 |
21 | #env = wrappers.Monitor(env, "tmp/cartpole-untrained",
22 | # video_callable=lambda episode_id: True, force=True)
23 | done = False
24 | score = 0
25 | observation = env.reset()
26 | while not done:
27 | action = agent.choose_action(observation)
28 | observation_, reward, done, info = env.step(action)
29 | agent.learn(observation, reward, observation_, done)
30 | observation = observation_
31 | score += reward
32 | score_history.append(score)
33 |
34 | filename = 'cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc-1500games.png'
35 | plotLearning(score_history, filename=filename, window=10)
36 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/main_keras_actor_critic_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import gym, os
2 | from actor_critic_keras import Agent
3 | from utils import plotLearning
4 | from gym import wrappers
5 | import numpy as np
6 |
7 | if __name__ == '__main__':
8 | agent = Agent(alpha=0.00001, beta=0.00005)
9 |
10 | env = gym.make('LunarLander-v2')
11 | score_history = []
12 | num_episodes = 2000
13 |
14 | for i in range(num_episodes):
15 | done = False
16 | score = 0
17 | observation = env.reset()
18 | while not done:
19 | action = agent.choose_action(observation)
20 | observation_, reward, done, info = env.step(action)
21 | agent.learn(observation, action, reward, observation_, done)
22 | observation = observation_
23 | score += reward
24 |
25 | score_history.append(score)
26 | avg_score = np.mean(score_history[-100:])
27 | print('episode: ', i,'score: %.2f' % score,
28 | 'avg score %.2f' % avg_score)
29 |
30 | filename = 'LunarLander.png'
31 | plotLearning(score_history, filename=filename, window=100)
32 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/main_torch_actor_critic_replay_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from actor_critic_replay_torch import Agent
4 | from utils import plotLearning
5 |
6 | if __name__ == '__main__':
7 | env = gym.make('LunarLander-v2')
8 | num_games = 1500
9 | agent = Agent(gamma=0.99, lr=1e-5, input_dims=[8], n_actions=4,
10 | l1_size=256, l2_size=256)
11 |
12 | filename = 'LunarLander-ActorCriticNaiveReplay-256-256-Adam-lr00001.png'
13 | scores = []
14 |
15 | for i in range(num_games):
16 | done = False
17 | observation = env.reset()
18 | score = 0
19 |
20 | while not done:
21 | action, prob = agent.choose_action(observation)
22 | observation_, reward, done, info = env.step(action)
23 | score += reward
24 | agent.store_transition(observation, prob,
25 | reward, observation_, int(done))
26 | agent.learn()
27 | observation = observation_
28 |
29 | scores.append(score)
30 | avg_score = np.mean(scores[max(0, i-100):(i+1)])
31 | print('episode: ', i,'score %.1f ' % score,
32 | ' average score %.1f' % avg_score)
33 |
34 | x = [i+1 for i in range(num_games)]
35 | plotLearning(scores, filename, x)
36 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/actor_critic/mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/actor_critic.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.keras.optimizers import Adam
3 | import tensorflow_probability as tfp
4 | from networks import ActorCriticNetwork
5 |
6 | class Agent:
7 | def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2):
8 | self.gamma = gamma
9 | self.n_actions = n_actions
10 | self.action = None
11 | self.action_space = [i for i in range(self.n_actions)]
12 |
13 | self.actor_critic = ActorCriticNetwork(n_actions=n_actions)
14 |
15 | self.actor_critic.compile(optimizer=Adam(learning_rate=alpha))
16 |
17 |
18 | def choose_action(self, observation):
19 | state = tf.convert_to_tensor([observation])
20 | _, probs = self.actor_critic(state)
21 |
22 | action_probabilities = tfp.distributions.Categorical(probs=probs)
23 | action = action_probabilities.sample()
24 | log_prob = action_probabilities.log_prob(action)
25 | self.action = action
26 |
27 | return action.numpy()[0]
28 |
29 | def save_models(self):
30 | print('... saving models ...')
31 | self.actor_critic.save_weights(self.actor_critic.checkpoint_file)
32 |
33 | def load_models(self):
34 | print('... loading models ...')
35 | self.actor_critic.load_weights(self.actor_critic.checkpoint_file)
36 |
37 | def learn(self, state, reward, state_, done):
38 | state = tf.convert_to_tensor([state], dtype=tf.float32)
39 | state_ = tf.convert_to_tensor([state_], dtype=tf.float32)
40 | reward = tf.convert_to_tensor(reward, dtype=tf.float32) # not fed to NN
41 | with tf.GradientTape(persistent=True) as tape:
42 | state_value, probs = self.actor_critic(state)
43 | state_value_, _ = self.actor_critic(state_)
44 | state_value = tf.squeeze(state_value)
45 | state_value_ = tf.squeeze(state_value_)
46 |
47 | action_probs = tfp.distributions.Categorical(probs=probs)
48 | log_prob = action_probs.log_prob(self.action)
49 |
50 | delta = reward + self.gamma*state_value_*(1-int(done)) - state_value
51 | actor_loss = -log_prob*delta
52 | critic_loss = delta**2
53 | total_loss = actor_loss + critic_loss
54 |
55 | gradient = tape.gradient(total_loss, self.actor_critic.trainable_variables)
56 | self.actor_critic.optimizer.apply_gradients(zip(
57 | gradient, self.actor_critic.trainable_variables))
58 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/cartpole.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/cartpole.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/main.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from actor_critic import Agent
4 | from utils import plot_learning_curve
5 | from gym import wrappers
6 |
7 | if __name__ == '__main__':
8 | #env = gym.make('LunarLander-v2')
9 | env = gym.make('CartPole-v0')
10 | agent = Agent(alpha=1e-5, n_actions=env.action_space.n)
11 | n_games = 1800
12 | # uncomment this line and do a mkdir tmp && mkdir video if you want to
13 | # record video of the agent playing the game.
14 | #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True)
15 | filename = 'cartpole_1e-5_1024x512_1800games.png'
16 |
17 | figure_file = 'plots/' + filename
18 |
19 | best_score = env.reward_range[0]
20 | score_history = []
21 | load_checkpoint = False
22 |
23 | if load_checkpoint:
24 | agent.load_models()
25 |
26 | for i in range(n_games):
27 | observation = env.reset()
28 | done = False
29 | score = 0
30 | while not done:
31 | action = agent.choose_action(observation)
32 | observation_, reward, done, info = env.step(action)
33 | score += reward
34 | if not load_checkpoint:
35 | agent.learn(observation, reward, observation_, done)
36 | observation = observation_
37 | score_history.append(score)
38 | avg_score = np.mean(score_history[-100:])
39 |
40 | if avg_score > best_score:
41 | best_score = avg_score
42 | if not load_checkpoint:
43 | agent.save_models()
44 |
45 | print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score)
46 |
47 | if not load_checkpoint:
48 | x = [i+1 for i in range(n_games)]
49 | plot_learning_curve(x, score_history, figure_file)
50 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/networks.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow.keras as keras
3 | from tensorflow.keras.layers import Dense
4 |
5 | class ActorCriticNetwork(keras.Model):
6 | def __init__(self, n_actions, fc1_dims=1024, fc2_dims=512,
7 | name='actor_critic', chkpt_dir='tmp/actor_critic'):
8 | super(ActorCriticNetwork, self).__init__()
9 | self.fc1_dims = fc1_dims
10 | self.fc2_dims = fc2_dims
11 | self.n_actions = n_actions
12 | self.model_name = name
13 | self.checkpoint_dir = chkpt_dir
14 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ac')
15 |
16 | self.fc1 = Dense(self.fc1_dims, activation='relu')
17 | self.fc2 = Dense(self.fc2_dims, activation='relu')
18 | self.v = Dense(1, activation=None)
19 | self.pi = Dense(n_actions, activation='softmax')
20 |
21 | def call(self, state):
22 | value = self.fc1(state)
23 | value = self.fc2(value)
24 |
25 | v = self.v(value)
26 | pi = self.pi(value)
27 |
28 | return v, pi
29 |
30 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 | running_avg = np.zeros(len(scores))
6 | for i in range(len(running_avg)):
7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 | plt.plot(x, running_avg)
9 | plt.title('Running average of previous 100 scores')
10 | plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/torch_discrete_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from torch_actor_critic_discrete import NewAgent
3 | from utils import plotLearning
4 | from gym import wrappers
5 |
6 |
7 | if __name__ == '__main__':
8 | agent = NewAgent(alpha=0.00001, input_dims=[8], gamma=0.99,
9 | n_actions=4, layer1_size=2048, layer2_size=512)
10 |
11 | env = gym.make('LunarLander-v2')
12 | score_history = []
13 | score = 0
14 | num_episodes = 2000
15 | for i in range(num_episodes):
16 |
17 | #env = wrappers.Monitor(env, "tmp/lunar-lander",
18 | # video_callable=lambda episode_id: True, force=True)
19 | done = False
20 | score = 0
21 | observation = env.reset()
22 | while not done:
23 | action = agent.choose_action(observation)
24 | observation_, reward, done, info = env.step(action)
25 | agent.learn(observation, reward, observation_, done)
26 | observation = observation_
27 | score += reward
28 |
29 | score_history.append(score)
30 | print('episode: ', i,'score: %.2f' % score)
31 |
32 | filename = 'Lunar-Lander-actor-critic-new-agent-alpha00001-beta00005-2048x512fc-2000games.png'
33 | plotLearning(score_history, filename=filename, window=50)
34 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/actor_critic/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | def plotLearning(scores, filename, x=None, window=5):
5 | N = len(scores)
6 | running_avg = np.empty(N)
7 | for t in range(N):
8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)])
9 | if x is None:
10 | x = [i for i in range(N)]
11 | plt.ylabel('Score')
12 | plt.xlabel('Game')
13 | plt.plot(x, running_avg)
14 | plt.savefig(filename)
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/main_keras_reinforce_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | from gym import wrappers
5 | from reinforce_keras import Agent
6 | from utils import plotLearning
7 |
8 | if __name__ == '__main__':
9 | agent = Agent(ALPHA=0.0005, input_dims=8, GAMMA=0.99,
10 | n_actions=4, layer1_size=64, layer2_size=64)
11 |
12 | env = gym.make('LunarLander-v2')
13 | score_history = []
14 |
15 | num_episodes = 2000
16 |
17 | for i in range(num_episodes):
18 | done = False
19 | score = 0
20 | observation = env.reset()
21 | while not done:
22 | action = agent.choose_action(observation)
23 | observation_, reward, done, info = env.step(action)
24 | agent.store_transition(observation, action, reward)
25 | observation = observation_
26 | score += reward
27 | score_history.append(score)
28 |
29 | _ = agent.learn()
30 | print('episode: ', i,'score: %.1f' % score,
31 | 'average score %.1f' % np.mean(score_history[max(0, i-100):(i+1)]))
32 |
33 | filename = 'lunar-lander-keras-64x64-alpha0005-2000games.png'
34 | plotLearning(score_history, filename=filename, window=100)
35 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/main_tf_reinforce_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from reinforce_tf import PolicyGradientAgent
4 | import matplotlib.pyplot as plt
5 | from utils import plotLearning
6 | from gym import wrappers
7 |
8 | if __name__ == '__main__':
9 | agent = PolicyGradientAgent(ALPHA=0.0005, input_dims=8, GAMMA=0.99,
10 | n_actions=4, layer1_size=64, layer2_size=64,
11 | chkpt_dir='tmp/lunar-lander-ckpt')
12 | #agent.load_checkpoint()
13 | env = gym.make('LunarLander-v2')
14 | score_history = []
15 | score = 0
16 | num_episodes = 2500
17 | #env = wrappers.Monitor(env, "tmp/lunar-lander",
18 | # video_callable=lambda episode_id: True, force=True)
19 | for i in range(num_episodes):
20 | print('episode: ', i,'score: ', score)
21 | done = False
22 | score = 0
23 | observation = env.reset()
24 | while not done:
25 | action = agent.choose_action(observation)
26 | observation_, reward, done, info = env.step(action)
27 | agent.store_transition(observation, action, reward)
28 | observation = observation_
29 | score += reward
30 | score_history.append(score)
31 | agent.learn()
32 | #agent.save_checkpoint()
33 | #filename = 'lunar-lander-alpha0005-64x64fc-newG.png'
34 | #plotLearning(score_history, filename=filename, window=25)
35 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/main_tf_reinforce_space_invaders.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from reinforce_cnn_tf import PolicyGradientAgent
4 | from utils import plotLearning
5 | from gym import wrappers
6 |
7 |
8 | def preprocess(observation):
9 | return np.mean(observation[15:200, 30:125], axis=2)
10 |
11 |
12 | def stack_frames(stacked_frames, frame, buffer_size):
13 | if stacked_frames is None:
14 | stacked_frames = np.zeros((buffer_size, *frame.shape))
15 | for idx, _ in enumerate(stacked_frames):
16 | stacked_frames[idx,:] = frame
17 | else:
18 | stacked_frames[0:buffer_size-1,:] = stacked_frames[1:,:]
19 | stacked_frames[buffer_size-1, :] = frame
20 |
21 | return stacked_frames
22 |
23 | if __name__ == '__main__':
24 | load_checkpoint = False
25 | agent = PolicyGradientAgent(ALPHA=0.001, GAMMA=0.9, n_actions=6, fc1=256,
26 | chkpt_dir='tmp/checkpoint-newG-0p001', gpu={'GPU':1})
27 | filename = 'space-invaders-alpha001-newGcalc.png'
28 | print('will use ', filename, ' and ', agent.gpu)
29 | if load_checkpoint:
30 | agent.load_checkpoint()
31 | env = gym.make('SpaceInvaders-v0')
32 | score_history = []
33 | score = 0
34 | num_episodes = 1000
35 | stack_size = 4
36 | #env = wrappers.Monitor(env, "tmp/space-invaders-newG-0p003",
37 | # video_callable=lambda episode_id: True, force=True)
38 | for i in range(num_episodes):
39 | done = False
40 |
41 | avg_score = np.mean(score_history[max(0, i-20):(i+1)])
42 | if i % 20 == 0 and i > 0:
43 | print('episode: ', i,'score: ', score, ' average score %.3f' % avg_score)
44 | plotLearning(score_history, filename=filename, window=20)
45 | else:
46 | print('episode: ', i,'score: ', score)
47 | observation = env.reset()
48 | observation = preprocess(observation)
49 | stacked_frames = None
50 | stacked_frames = stack_frames(stacked_frames, observation, stack_size)
51 | score = 0
52 | while not done:
53 | action = agent.choose_action(stacked_frames)
54 | observation, reward, done, info = env.step(action)
55 | observation = preprocess(observation)
56 | stacked_frames = stack_frames(stacked_frames, observation, stack_size)
57 | agent.store_transition(observation, action, reward)
58 |
59 | score += reward
60 | score_history.append(score)
61 |
62 | if i % 10 == 0:
63 | agent.learn()
64 | agent.save_checkpoint()
65 | plotLearning(score_history, filename=filename, window=20)
66 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/main_torch_reinforce_lunar_lander.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from reinforce_torch import PolicyGradientAgent
4 | import matplotlib.pyplot as plt
5 | from utils import plotLearning
6 | from gym import wrappers
7 |
8 | if __name__ == '__main__':
9 | agent = PolicyGradientAgent(ALPHA=0.001, input_dims=[8], GAMMA=0.99,
10 | n_actions=4, layer1_size=128, layer2_size=128)
11 | #agent.load_checkpoint()
12 | env = gym.make('LunarLander-v2')
13 | score_history = []
14 | score = 0
15 | num_episodes = 2500
16 | #env = wrappers.Monitor(env, "tmp/lunar-lander",
17 | # video_callable=lambda episode_id: True, force=True)
18 | for i in range(num_episodes):
19 | print('episode: ', i,'score: ', score)
20 | done = False
21 | score = 0
22 | observation = env.reset()
23 | while not done:
24 | action = agent.choose_action(observation)
25 | observation_, reward, done, info = env.step(action)
26 | agent.store_rewards(reward)
27 | observation = observation_
28 | score += reward
29 | score_history.append(score)
30 | agent.learn()
31 | #agent.save_checkpoint()
32 | filename = 'lunar-lander-alpha001-128x128fc-newG.png'
33 | plotLearning(score_history, filename=filename, window=25)
34 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/reinforce_keras.py:
--------------------------------------------------------------------------------
1 | from keras.layers import Dense, Activation, Input
2 | from keras.models import Model, load_model
3 | from keras.optimizers import Adam
4 | import keras.backend as K
5 | import numpy as np
6 |
7 | class Agent(object):
8 | def __init__(self, ALPHA, GAMMA=0.99, n_actions=4,
9 | layer1_size=16, layer2_size=16, input_dims=128,
10 | fname='reinforce.h5'):
11 | self.gamma = GAMMA
12 | self.lr = ALPHA
13 | self.G = 0
14 | self.input_dims = input_dims
15 | self.fc1_dims = layer1_size
16 | self.fc2_dims = layer2_size
17 | self.n_actions = n_actions
18 | self.state_memory = []
19 | self.action_memory = []
20 | self.reward_memory = []
21 | self.policy, self.predict = self.build_policy_network()
22 | self.action_space = [i for i in range(n_actions)]
23 |
24 | self.model_file = fname
25 |
26 | def build_policy_network(self):
27 | input = Input(shape=(self.input_dims,))
28 | advantages = Input(shape=[1])
29 | dense1 = Dense(self.fc1_dims, activation='relu')(input)
30 | dense2 = Dense(self.fc2_dims, activation='relu')(dense1)
31 | probs = Dense(self.n_actions, activation='softmax')(dense2)
32 |
33 | def custom_loss(y_true, y_pred):
34 | out = K.clip(y_pred, 1e-8, 1-1e-8)
35 | log_lik = y_true*K.log(out)
36 |
37 | return K.sum(-log_lik*advantages)
38 |
39 |         policy = Model(inputs=[input, advantages], outputs=[probs])
40 |
41 | policy.compile(optimizer=Adam(lr=self.lr), loss=custom_loss)
42 |
43 |         predict = Model(inputs=[input], outputs=[probs])
44 |
45 | return policy, predict
46 |
47 | def choose_action(self, observation):
48 | state = observation[np.newaxis, :]
49 | probabilities = self.predict.predict(state)[0]
50 | action = np.random.choice(self.action_space, p=probabilities)
51 |
52 | return action
53 |
54 | def store_transition(self, observation, action, reward):
55 | self.state_memory.append(observation)
56 | self.action_memory.append(action)
57 | self.reward_memory.append(reward)
58 |
59 | def learn(self):
60 | state_memory = np.array(self.state_memory)
61 | action_memory = np.array(self.action_memory)
62 | reward_memory = np.array(self.reward_memory)
63 |
64 | actions = np.zeros([len(action_memory), self.n_actions])
65 | actions[np.arange(len(action_memory)), action_memory] = 1
66 |
67 | G = np.zeros_like(reward_memory)
68 | for t in range(len(reward_memory)):
69 | G_sum = 0
70 | discount = 1
71 | for k in range(t, len(reward_memory)):
72 | G_sum += reward_memory[k] * discount
73 | discount *= self.gamma
74 | G[t] = G_sum
75 | mean = np.mean(G)
76 | std = np.std(G) if np.std(G) > 0 else 1
77 | self.G = (G - mean) / std
78 |
79 | cost = self.policy.train_on_batch([state_memory, self.G], actions)
80 |
81 | self.state_memory = []
82 | self.action_memory = []
83 | self.reward_memory = []
84 |
85 | return cost
86 |
87 | def save_model(self):
88 | self.policy.save(self.model_file)
89 |
90 | def load_model(self):
91 | self.policy = load_model(self.model_file)
92 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/reinforce_tf.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow as tf
3 | import numpy as np
4 |
5 | class PolicyGradientAgent():
6 | def __init__(self, ALPHA, GAMMA=0.95, n_actions=4,
7 | layer1_size=16, layer2_size=16, input_dims=128,
8 | chkpt_dir='tmp/checkpoints'):
9 | self.lr = ALPHA
10 | self.gamma = GAMMA
11 | self.n_actions = n_actions
12 | self.action_space = [i for i in range(n_actions)]
13 | self.layer1_size = layer1_size
14 | self.layer2_size = layer2_size
15 | self.input_dims = input_dims
16 | self.state_memory = []
17 | self.action_memory = []
18 | self.reward_memory = []
19 | self.sess = tf.Session()
20 | self.build_net()
21 | self.sess.run(tf.global_variables_initializer())
22 | self.saver = tf.train.Saver()
23 | self.checkpoint_file = os.path.join(chkpt_dir,'policy_network.ckpt')
24 |
25 | def build_net(self):
26 | with tf.variable_scope('parameters'):
27 | self.input = tf.placeholder(tf.float32,
28 | shape=[None, self.input_dims], name='input')
29 | self.label = tf.placeholder(tf.int32,
30 | shape=[None, ], name='label')
31 | self.G = tf.placeholder(tf.float32, shape=[None,], name='G')
32 |
33 | with tf.variable_scope('layer1'):
34 | l1 = tf.layers.dense(inputs=self.input, units=self.layer1_size,
35 | activation=tf.nn.relu,
36 | kernel_initializer=tf.contrib.layers.xavier_initializer())
37 |
38 | with tf.variable_scope('layer2'):
39 | l2 = tf.layers.dense(inputs=l1, units=self.layer2_size,
40 | activation=tf.nn.relu,
41 | kernel_initializer=tf.contrib.layers.xavier_initializer())
42 |
43 | with tf.variable_scope('layer3'):
44 | l3 = tf.layers.dense(inputs=l2, units=self.n_actions,
45 | activation=None,
46 | kernel_initializer=tf.contrib.layers.xavier_initializer())
47 | self.actions = tf.nn.softmax(l3, name='actions')
48 |
49 | with tf.variable_scope('loss'):
50 | negative_log_probability = tf.nn.sparse_softmax_cross_entropy_with_logits(
51 | logits=l3, labels=self.label)
52 |
53 | loss = negative_log_probability * self.G
54 |
55 | with tf.variable_scope('train'):
56 | self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
57 |
58 | def choose_action(self, observation):
59 | observation = observation[np.newaxis, :]
60 | probabilities = self.sess.run(self.actions, feed_dict={self.input: observation})[0]
61 | action = np.random.choice(self.action_space, p = probabilities )
62 |
63 | return action
64 |
65 | def store_transition(self, observation, action, reward):
66 | self.state_memory.append(observation)
67 | self.action_memory.append(action)
68 | self.reward_memory.append(reward)
69 |
70 | def learn(self):
71 | state_memory = np.array(self.state_memory)
72 | action_memory = np.array(self.action_memory)
73 | reward_memory = np.array(self.reward_memory)
74 |
75 | G = np.zeros_like(reward_memory)
76 | for t in range(len(reward_memory)):
77 | G_sum = 0
78 | discount = 1
79 | for k in range(t, len(reward_memory)):
80 | G_sum += reward_memory[k] * discount
81 | discount *= self.gamma
82 | G[t] = G_sum
83 | mean = np.mean(G)
84 | std = np.std(G) if np.std(G) > 0 else 1
85 | G = (G - mean) / std
86 |
87 | _ = self.sess.run(self.train_op,
88 | feed_dict={self.input: state_memory,
89 | self.label: action_memory,
90 | self.G: G})
91 | self.state_memory = []
92 | self.action_memory = []
93 | self.reward_memory = []
94 |
95 | def load_checkpoint(self):
96 | print("...Loading checkpoint...")
97 | self.saver.restore(self.sess, self.checkpoint_file)
98 |
99 | def save_checkpoint(self):
100 | #print("...Saving checkpoint...")
101 | self.saver.save(self.sess, self.checkpoint_file)
102 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/reinforce_torch.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 |
7 | class PolicyNetwork(nn.Module):
8 | def __init__(self, ALPHA, input_dims, fc1_dims, fc2_dims,
9 | n_actions):
10 | super(PolicyNetwork, self).__init__()
11 | self.input_dims = input_dims
12 | self.fc1_dims = fc1_dims
13 | self.fc2_dims = fc2_dims
14 | self.n_actions = n_actions
15 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
16 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
17 | self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
18 | self.optimizer = optim.Adam(self.parameters(), lr=ALPHA)
19 |
20 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
21 | self.to(self.device)
22 |
23 | def forward(self, observation):
24 | state = T.Tensor(observation).to(self.device)
25 | x = F.relu(self.fc1(state))
26 | x = F.relu(self.fc2(x))
27 | x = self.fc3(x)
28 | return x
29 |
30 | class PolicyGradientAgent(object):
31 | def __init__(self, ALPHA, input_dims, GAMMA=0.99, n_actions=4,
32 | layer1_size=256, layer2_size=256):
33 | self.gamma = GAMMA
34 | self.reward_memory = []
35 | self.action_memory = []
36 | self.policy = PolicyNetwork(ALPHA, input_dims, layer1_size, layer2_size,
37 | n_actions)
38 |
39 | def choose_action(self, observation):
40 |         probabilities = F.softmax(self.policy.forward(observation), dim=-1)
41 | action_probs = T.distributions.Categorical(probabilities)
42 | action = action_probs.sample()
43 | log_probs = action_probs.log_prob(action)
44 | self.action_memory.append(log_probs)
45 |
46 | return action.item()
47 |
48 | def store_rewards(self,reward):
49 | self.reward_memory.append(reward)
50 |
51 | def learn(self):
52 | self.policy.optimizer.zero_grad()
53 | # Assumes only a single episode for reward_memory
54 | G = np.zeros_like(self.reward_memory, dtype=np.float64)
55 | for t in range(len(self.reward_memory)):
56 | G_sum = 0
57 | discount = 1
58 | for k in range(t, len(self.reward_memory)):
59 | G_sum += self.reward_memory[k] * discount
60 | discount *= self.gamma
61 | G[t] = G_sum
62 | mean = np.mean(G)
63 | std = np.std(G) if np.std(G) > 0 else 1
64 | G = (G - mean) / std
65 |
66 | G = T.tensor(G, dtype=T.float).to(self.policy.device)
67 |
68 | loss = 0
69 | for g, logprob in zip(G, self.action_memory):
70 | loss += -g * logprob
71 |
72 | loss.backward()
73 | self.policy.optimizer.step()
74 |
75 | self.action_memory = []
76 | self.reward_memory = []
77 |
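Note: the nested loop in learn() above recomputes the tail of the reward sequence for every t, which is O(n^2) in the episode length. A minimal equivalent sketch (my illustration, not the repository's code) computes the same discounted returns in one backward pass:

    import numpy as np

    def discounted_returns(rewards, gamma):
        # G[t] = r[t] + gamma * G[t+1], accumulated from the final step backwards
        G = np.zeros(len(rewards), dtype=np.float64)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            G[t] = running
        return G

The mean/standard-deviation normalization would then be applied to the result exactly as above.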
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/space-invaders-alpha001-gamma0p9-decay0p99-newGcalc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/reinforce/space-invaders-alpha001-gamma0p9-decay0p99-newGcalc.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/lunar-lander-tf2-256x256-alpha0005-2000games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/lunar-lander-tf2-256x256-alpha0005-2000games.png
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/main.py:
--------------------------------------------------------------------------------
1 | # if you have more than 1 gpu, use device '0' or '1' to assign to a gpu
2 | #import os
3 | #os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
4 | #os.environ['CUDA_VISIBLE_DEVICES'] = '0'
5 | import gym
6 | import numpy as np
7 | from reinforce_tf2 import Agent
8 | from utils import plotLearning
9 |
10 | if __name__ == '__main__':
11 | agent = Agent(alpha=0.0005, gamma=0.99,n_actions=4)
12 |
13 | env = gym.make('LunarLander-v2')
14 | score_history = []
15 |
16 | num_episodes = 2000
17 |
18 | for i in range(num_episodes):
19 | done = False
20 | score = 0
21 | observation = env.reset()
22 | while not done:
23 | action = agent.choose_action(observation)
24 | observation_, reward, done, info = env.step(action)
25 | agent.store_transition(observation, action, reward)
26 | observation = observation_
27 | score += reward
28 | score_history.append(score)
29 |
30 | agent.learn()
31 | avg_score = np.mean(score_history[-100:])
32 | print('episode: ', i,'score: %.1f' % score,
33 | 'average score %.1f' % avg_score)
34 |
35 | filename = 'lunar-lander.png'
36 | plotLearning(score_history, filename=filename, window=100)
37 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/networks.py:
--------------------------------------------------------------------------------
1 | import tensorflow.keras as keras
2 | from tensorflow.keras.layers import Dense
3 |
4 | class PolicyGradientNetwork(keras.Model):
5 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256):
6 | super(PolicyGradientNetwork, self).__init__()
7 | self.fc1_dims = fc1_dims
8 | self.fc2_dims = fc2_dims
9 | self.n_actions = n_actions
10 |
11 | self.fc1 = Dense(self.fc1_dims, activation='relu')
12 | self.fc2 = Dense(self.fc2_dims, activation='relu')
13 | self.pi = Dense(n_actions, activation='softmax')
14 |
15 | def call(self, state):
16 | value = self.fc1(state)
17 | value = self.fc2(value)
18 |
19 | pi = self.pi(value)
20 |
21 | return pi
22 |
23 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/reinforce_tf2.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from networks import PolicyGradientNetwork
3 | import tensorflow_probability as tfp
4 | from tensorflow.keras.optimizers import Adam
5 | import numpy as np
6 |
7 | class Agent:
8 | def __init__(self, alpha=0.003, gamma=0.99, n_actions=4,
9 | layer1_size=256, layer2_size=256):
10 |
11 | self.gamma = gamma
12 | self.lr = alpha
13 | self.n_actions = n_actions
14 | self.state_memory = []
15 | self.action_memory = []
16 | self.reward_memory = []
17 | self.policy = PolicyGradientNetwork(n_actions=n_actions)
18 | self.policy.compile(optimizer=Adam(learning_rate=self.lr))
19 |
20 | def choose_action(self, observation):
21 | state = tf.convert_to_tensor([observation], dtype=tf.float32)
22 | probs = self.policy(state)
23 | action_probs = tfp.distributions.Categorical(probs=probs)
24 | action = action_probs.sample()
25 |
26 | return action.numpy()[0]
27 |
28 | def store_transition(self, observation, action, reward):
29 | self.state_memory.append(observation)
30 | self.action_memory.append(action)
31 | self.reward_memory.append(reward)
32 |
33 | def learn(self):
34 | actions = tf.convert_to_tensor(self.action_memory, dtype=tf.float32)
35 | rewards = np.array(self.reward_memory)
36 |
37 |         G = np.zeros_like(rewards, dtype=np.float64)
38 | for t in range(len(rewards)):
39 | G_sum = 0
40 | discount = 1
41 | for k in range(t, len(rewards)):
42 | G_sum += rewards[k] * discount
43 | discount *= self.gamma
44 | G[t] = G_sum
45 |
46 | with tf.GradientTape() as tape:
47 | loss = 0
48 | for idx, (g, state) in enumerate(zip(G, self.state_memory)):
49 | state = tf.convert_to_tensor([state], dtype=tf.float32)
50 | probs = self.policy(state)
51 | action_probs = tfp.distributions.Categorical(probs=probs)
52 | log_prob = action_probs.log_prob(actions[idx])
53 | loss += -g * tf.squeeze(log_prob)
54 |
55 | gradient = tape.gradient(loss, self.policy.trainable_variables)
56 | self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables))
57 |
58 | self.state_memory = []
59 | self.action_memory = []
60 | self.reward_memory = []
61 |
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | def plotLearning(scores, filename, x=None, window=5):
5 | N = len(scores)
6 | running_avg = np.empty(N)
7 | for t in range(N):
8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)])
9 | if x is None:
10 | x = [i for i in range(N)]
11 | plt.ylabel('Score')
12 | plt.xlabel('Game')
13 | plt.plot(x, running_avg)
14 | plt.savefig(filename)
--------------------------------------------------------------------------------
/basic_encryption/caesar.py:
--------------------------------------------------------------------------------
1 | from common import alphabet
2 |
3 |
4 | def translate(message, shift, encrypt=True):
5 | new_message = ''
6 | n_chars = len(alphabet)
7 |
8 | for character in message:
9 | char_idx = alphabet.index(character)
10 | if encrypt:
11 | new_char_idx = (char_idx + shift) % n_chars
12 | elif not encrypt:
13 | new_char_idx = (char_idx - shift) % n_chars
14 | new_message += alphabet[new_char_idx]
15 | return new_message
16 |
17 |
18 | cipher_shift = 7
19 |
20 | print('AB->', translate('AB', cipher_shift))
21 | print('ab->', translate('ab', cipher_shift))
22 | print('Ab->', translate('Ab', cipher_shift))
23 | print('aB->', translate('aB', cipher_shift))
24 |
25 | plaintext = 'This is an encrypted message.'
26 | ciphertext = translate(plaintext, cipher_shift, True)
27 | print(plaintext, '->', ciphertext)
28 | original_message = translate(ciphertext, cipher_shift, False)
29 | print(ciphertext, '->', original_message)
30 |
--------------------------------------------------------------------------------
/basic_encryption/common.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 |
4 | # https://stackoverflow.com/questions/7001144/range-over-character-in-python
5 | def character_generator(start_char, stop_char):
6 | for char in range(ord(start_char), ord(stop_char)+1):
7 | yield chr(char)
8 |
9 |
10 | def generate_one_time_pad(n_chars, characters):
11 | return ''.join(random.choice(characters) for _ in range(n_chars))
12 |
13 |
14 | lower_case = list(character_generator('a', 'z'))
15 | upper_case = list(character_generator('A', 'Z'))
16 | punctuation = ['.', ',', ' ', '?', '!']
17 |
18 | alphabet = lower_case + upper_case + punctuation
19 |
--------------------------------------------------------------------------------
/basic_encryption/one_time_pad.py:
--------------------------------------------------------------------------------
1 | from common import alphabet, generate_one_time_pad
2 |
3 |
4 | def translate(message, one_time_pad, encrypt=True):
5 | new_message = ''
6 |
7 | n_chars = len(alphabet)
8 |
9 | for src, key in zip(message, one_time_pad):
10 | char_idx = alphabet.index(src)
11 | pad_idx = alphabet.index(key)
12 | if encrypt:
13 | new_char_idx = (char_idx + pad_idx) % n_chars
14 | elif not encrypt:
15 | new_char_idx = (char_idx - pad_idx) % n_chars
16 | new_message += alphabet[new_char_idx]
17 |
18 | return new_message
19 |
20 |
21 | message = 'This is an encrypted message.'
22 | secret_key = generate_one_time_pad(len(message), alphabet)
23 | encrypted_message = translate(message, secret_key, True)
24 | original_message = translate(encrypted_message, secret_key, False)
25 |
26 | print(message, '->', encrypted_message)
27 | print(encrypted_message, '->', original_message)
28 |
--------------------------------------------------------------------------------
/basic_encryption/vignere.py:
--------------------------------------------------------------------------------
1 | from common import alphabet, generate_one_time_pad
2 |
3 |
4 | def make_vignere_table():
5 | table = [['']] * len(alphabet)
6 | for idx, character in enumerate(alphabet):
7 | row = []
8 | for char in alphabet[idx:]:
9 | row.append(char)
10 | for char in alphabet[:idx]:
11 | row.append(char)
12 | table[idx] = row
13 | return table
14 |
15 |
16 | def translate(message, vig_table, one_time_pad, encrypt=True):
17 | new_message = ''
18 |
19 | if encrypt:
20 | for src, key in zip(message, one_time_pad):
21 | row = vig_table[:][0].index(key)
22 | col = vig_table[0][:].index(src)
23 | new_message += vig_table[row][col]
24 | elif not encrypt:
25 | for src, key in zip(message, one_time_pad):
26 | row = vig_table[:][0].index(key)
27 | col = vig_table[row][:].index(src)
28 | new_message += vig_table[0][col]
29 | return new_message
30 |
31 |
32 | table = make_vignere_table()
33 | message = 'This is an encrypted message.'
34 | secret_key = generate_one_time_pad(len(message), alphabet)
35 | encrypted_message = translate(message, table, secret_key, True)
36 | original_message = translate(encrypted_message, table, secret_key, False)
37 |
38 | print(message, '->', encrypted_message)
39 | print(encrypted_message, '->', original_message)
40 |
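Note: by construction, table[i][j] == alphabet[(i + j) % len(alphabet)], so the first row equals the first column (which is why vig_table[:][0], i.e. the first row, works for looking up the key) and the table lookups are equivalent to the modular addition and subtraction used in one_time_pad.py. A quick illustrative check, meant to be run alongside the file above rather than as part of the repository's code:

    from common import alphabet
    n = len(alphabet)
    table = make_vignere_table()
    assert all(table[i][j] == alphabet[(i + j) % n]
               for i in range(n) for j in range(n))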
--------------------------------------------------------------------------------
/cmdline.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | import numpy as np
4 | from ReinforcementLearning.DeepQLearning.utils import plotLearning
5 | from ReinforcementLearning.DeepQLearning.simple_dqn_torch import Agent
6 | if __name__ == '__main__':
7 | parser = argparse.ArgumentParser(
8 | description='Command line Utility for training RL models')
9 | # the hyphen makes the argument optional
10 | parser.add_argument('-n_games', type=int, default=1,
11 | help='Number of games to play')
12 | parser.add_argument('-lr', type=float, default=0.001,
13 | help='Learning rate for optimizer')
14 | parser.add_argument('-eps_end', type=float, default=0.01,
15 | help='Final value for epsilon in epsilon-greedy action selection')
16 | parser.add_argument('-gamma', type=float, default=0.99,
17 | help='Discount factor for update equation.')
18 | parser.add_argument('-env', type=str, default='LunarLander-v2',
19 | help='OpenAI gym environment for agent')
20 | parser.add_argument('-eps_dec', type=float, default=0.996,
21 | help='Multiplicative factor for decreasing epsilon')
22 | parser.add_argument('-eps', type=float, default=1.0,
23 | help='Starting value for epsilon in epsilon-greedy action selection')
24 | parser.add_argument('-max_mem', type=int, default=1000000,
25 | help='Maximum size for memory replay buffer')
26 | parser.add_argument('-dims', type=int, default=8,
27 | help='Input dimensions; matches env observation, \
28 | must be list or tuple')
29 | parser.add_argument('-bs', type=int, default=32,
30 | help='Batch size for replay memory sampling')
31 | parser.add_argument('-n_actions', type=int, default=4,
32 | help='Number of actions in discrete action space')
33 | args = parser.parse_args()
34 |
35 | env = gym.make(args.env)
36 |
37 | args.dims = [args.dims]
38 |
39 | agent = Agent(args.gamma, args.eps, args.lr, args.dims, args.bs,
40 | args.n_actions, args.max_mem, args.eps_end, args.eps_dec)
41 |
42 | eps_history, scores = [], []
43 | for i in range(args.n_games):
44 | observation = env.reset()
45 | done = False
46 | score = 0
47 | while not done:
48 | action = agent.chooseAction(observation)
49 | observation_, reward, done, info = env.step(action)
50 | score += reward
51 | agent.storeTransition(observation, action,
52 | reward, observation_, int(done))
53 | observation = observation_
54 | agent.learn()
55 |
56 | eps_history.append(agent.EPSILON)
57 | scores.append(score)
58 |
59 | if i % 10 == 0 and i > 0:
60 | avg_score = np.mean(scores[max(0, i-10):(i+1)])
61 | print('episode: ', i,'score: ', score,
62 | ' average score %.3f' % avg_score,
63 | 'epsilon %.3f' % agent.EPSILON)
64 | else:
65 | print('episode: ', i,'score: ', score)
66 |
67 | x = [i+1 for i in range(args.n_games)]
68 | # filename should reflect whatever it is you are varying to tune your
69 | # agent. For simplicity I'm just showing alpha and gamma, but it can be
70 | # the epsilons as well. You can even include parameters for the fully
71 | # connected layers and use them as part of the file name.
72 | filename = args.env + '_alpha' + str(args.lr) + '_gamma' + str(args.gamma) + \
73 | '.png'
74 | plotLearning(x, scores, eps_history, filename)
75 |
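For example, assuming the ReinforcementLearning package above is importable from the repository root, a short training run might be launched with something like:

    python cmdline.py -n_games 500 -env LunarLander-v2 -lr 0.0005 -bs 64

Every flag is optional; anything left unspecified falls back to the defaults defined in the parser above.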
--------------------------------------------------------------------------------
/giveaway_scrubbed.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 |
4 | # These people all indicated they are located in the US/CA or had a means to
5 | # ship to a US/CA address for forwarding.
6 | us_ca_entrants = ["Rintze", "Hasan", "Keith", "Joseph", "Asceptt", "Brian",
7 | "Xiaoyu", "Anik", "Devshank", "Jeremy", "Amin", "Brenton",
8 | "Remi", "Howard", "Michael", "Khizr", "Jay", "Ricardo",
9 | "Matt", "Chris", "Tanner", "Paul", "Pang", "Jose", "David",
10 | "Kurt", "Jesse"]
11 |
12 | # These people indicated they were in a foreign country and did not indicate
13 | # they had the means to ship to a foreign address.
14 | intl_entrants = ["Harsh"]
15 |
16 | # These people did not indicate where they were or their means to forward mail
17 | unknown_entrants = ["Gareth", "Dan", "Dileep", "Zeeshan", "Romin", "Dellan",
18 | "Marcin", "Wouter", "Cecil", "Jamal", "Gabriel", "ATV",
19 | "Violet", "Waqas", "Joy", "Tianqi", "Thomas"]
20 |
21 | random.seed(2022)
22 |
23 | gpu_winner = random.choice(us_ca_entrants)
24 |
25 | all_entrants = us_ca_entrants + intl_entrants + unknown_entrants
26 |
27 | nnai_winner = random.choice(all_entrants)
28 |
29 | dli_winners = [random.choice(all_entrants) for _ in range(5)]
30 |
31 | # Make sure there are no duplicate names, so there is no ambiguity in who won
32 | assert len(np.unique(us_ca_entrants)) == len(us_ca_entrants)
33 |
34 | assert len(np.unique(all_entrants)) == len(all_entrants)
35 |
36 | print('GPU Winner:', gpu_winner)
37 |
38 | print('NeuralNet.ai Subscription Winner:', nnai_winner)
39 |
40 | print('Deep Learning Institute winners:', dli_winners)
41 |
--------------------------------------------------------------------------------
/giveaway_scrubbed_3-23.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 |
4 | all_entrants = ['Henry Magregor', 'Charly B', 'Arjun H', 'Pete O', 'Lolis F',
5 | 'Kaan A', 'Inosiro', 'Brian B', 'Ben C', 'Jorge B', 'Jesse G',
6 | 'Hauke H', 'Pas D', 'Aditya C', 'Marc C', 'Logan G', 'Brian C',
7 | 'Antemasq', 'Alex D', 'Bibek P', 'Andrew S', 'Gonzalo B',
8 | 'Martin P', 'Bikash S', 'William P', 'Daniel A', 'Naomi G',
9 | 'Alex V', 'Chris G', 'Steve L', 'Felix G', 'Greg K', 'x g',
10 | ]
11 |
12 | random.seed(2023)
13 |
14 | gpu_winner = random.choice(all_entrants)
15 |
16 | all_entrants.remove(gpu_winner)
17 |
18 | nnai_winner = random.choice(all_entrants)
19 |
20 | all_entrants.remove(nnai_winner)
21 |
22 | dli_winners = [random.choice(all_entrants) for _ in range(5)]
23 |
24 | # Make sure there are no duplicate names, so there is no ambiguity in who won
25 | assert len(np.unique(all_entrants)) == len(all_entrants)
26 |
27 | print('GPU Winner:', gpu_winner)
28 |
29 | print('NeuralNet.ai Subscription Winner:', nnai_winner)
30 |
31 | print('Deep Learning Institute winners:', dli_winners)
32 |
--------------------------------------------------------------------------------
/giveaway_scrubbed_9-22.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 |
4 | # This time I didn't specify to indicate where you are located and if you had
5 | # the means to ship abroad. Silly oversight on my part, but that's life.
6 | # This means I'll do the drawing and email everyone first. If someone overseas
7 | # wins the GPU but can't get it shipped, then I'll subtract their name and draw
8 | # again.
9 |
10 | all_entrants = ['xiaoyu', 'sunil', 'kelvin', 'jacob', 'sean', 'dilith',
11 | 'noctildon', 'lukas_k', 'alex', 'matt_t', 'inosiro',
12 | 'f1datadiver', 'sambaran', 'dean_v_a', 'balaji', 'aditya',
13 | 'brian_cu', 'sim', 'philip', 'antonio', 'roumen', 'marc',
14 | 'william_p', 'michael_f', 'behnood', 'lucas_p', 'ahmed_k',
15 | 'jamal_c', 'luciano_d', 'amir-ul', 'kinal', 'sidhanath',
16 | 'lorenzo', 'michael_w', 'ravi_j', 'brigliano', 'hrovje',
17 | 'daniel_b', 'terry_w', 'jun', 'kurt_b', 'hauke', 'super_dave',
18 | 'george', 'lukas_d', 'waleed', 'clark', 'frak', 'ravi_c',
19 | 'sawaiz', 'ferran', 'jack-ziad', 'christian_g', 'zxavier',
20 | 'daniel_k', 'akash', 'jbene', 'hause', 'jack', 'cristiano',
21 | 'nguyen_q_d', 'tatonata', 'dennis_f', 'till_z', 'dusan',
22 | 'abdennacer', 'antonio_p', 'dilan', 'adam_b', 'brian_co',
23 | 'k_ali', 'matt_r', 'navoda', 'doyun', 'william_s', 'jed_j',
24 | 'bijay', 'bruno', 'shivam', 'arjun_h', 'emil', 'abdulla_m',
25 | 'nick', 'joyce_w', 'abhinav', 'alex_v', 'ruturaj_s']
26 |
27 | random.seed(2022)
28 |
29 | gpu_winner = random.choice(all_entrants)
30 |
31 | all_entrants.remove(gpu_winner)
32 |
33 | nnai_winner = random.choice(all_entrants)
34 |
35 | all_entrants.remove(nnai_winner)
36 |
37 | dli_winners = [random.choice(all_entrants) for _ in range(5)]
38 |
39 | # Make sure there are no duplicate names, so there is no ambiguity in who won
40 | assert len(np.unique(all_entrants)) == len(all_entrants)
41 |
42 | print('GPU Winner:', gpu_winner)
43 |
44 | print('NeuralNet.ai Subscription Winner:', nnai_winner)
45 |
46 | print('Deep Learning Institute winners:', dli_winners)
47 |
--------------------------------------------------------------------------------
/tf_embeddings.py:
--------------------------------------------------------------------------------
1 | import io
2 | import matplotlib.pyplot as plt
3 | import tensorflow as tf
4 | from tensorflow import keras
5 | from tensorflow.keras import layers
6 | import tensorflow_datasets as tfds
7 |
8 | #embedding_layer = layers.Embedding(1000, 5)
9 |
10 | #result = embedding_layer(tf.constant([1,2,3]))
11 |
12 | #print(result.numpy())
13 | #print(result.numpy().shape)
14 | def get_batch_data():
15 | (train_data, test_data), info = tfds.load('imdb_reviews/subwords8k',
16 | split=(tfds.Split.TRAIN, tfds.Split.TEST),
17 | with_info=True, as_supervised=True)
18 |
19 | encoder = info.features['text'].encoder
20 | #print(encoder.subwords[:20])
21 | padded_shapes = ([None], ())
22 | train_batches = train_data.shuffle(1000).padded_batch(10,
23 | padded_shapes=padded_shapes)
24 | test_batches = test_data.shuffle(1000).padded_batch(10,
25 | padded_shapes=padded_shapes)
26 | return train_batches, test_batches, encoder
27 |
28 | def get_model(encoder, embedding_dim=16):
29 |
30 | model = keras.Sequential([
31 | layers.Embedding(encoder.vocab_size, embedding_dim),
32 | layers.GlobalAveragePooling1D(),
33 | layers.Dense(1, activation='sigmoid')])
34 |
35 | model.compile(optimizer='adam', loss='binary_crossentropy',
36 | metrics=['accuracy'])
37 | return model
38 |
39 | def plot_history(history):
40 | history_dict = history.history
41 | acc = history_dict['accuracy']
42 | val_acc = history_dict['val_accuracy']
43 | epochs = range(1, len(acc) + 1)
44 |
45 | plt.figure(figsize=(12,9))
46 | plt.plot(epochs, acc, 'bo', label='Training acc')
47 | plt.plot(epochs, val_acc, 'b', label='Validation acc')
48 | plt.title('Training and validation accuracy')
49 | plt.xlabel('Epochs')
50 | plt.ylabel('Accuracy')
51 | plt.legend(loc='lower right')
52 | plt.ylim((0.5, 1))
53 | plt.show()
54 |
55 | def retrieve_embeddings(model, encoder):
56 | out_vectors = io.open('vecs.tsv', 'w', encoding='utf-8')
57 | out_metadata = io.open('meta.tsv', 'w', encoding='utf-8')
58 | weights = model.layers[0].get_weights()[0]
59 |
60 | for num, word in enumerate(encoder.subwords):
61 | vec = weights[num+1]
62 | out_metadata.write(word + '\n')
63 | out_vectors.write('\t'.join([str(x) for x in vec]) + '\n')
64 | out_vectors.close()
65 | out_metadata.close()
66 |
67 | train_batches, test_batches, encoder = get_batch_data()
68 | model = get_model(encoder)
69 | history = model.fit(train_batches, epochs=10, validation_data=test_batches,
70 | validation_steps=20)
71 | plot_history(history)
72 | retrieve_embeddings(model, encoder)
73 |
--------------------------------------------------------------------------------
/tf_sentiment.py:
--------------------------------------------------------------------------------
1 | import tensorflow_datasets as tfds
2 | import tensorflow as tf
3 |
4 | dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True,
5 | as_supervised=True)
6 | train_dataset, test_dataset = dataset['train'], dataset['test']
7 |
8 | encoder = info.features['text'].encoder
9 |
10 | BUFFER_SIZE = 10000
11 | BATCH_SIZE = 64
12 |
13 | padded_shapes = ([None], ())
14 |
15 | train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE,
16 | padded_shapes=padded_shapes)
17 |
18 | test_dataset = test_dataset.padded_batch(BATCH_SIZE,
19 | padded_shapes=padded_shapes)
20 |
21 | model = tf.keras.Sequential([tf.keras.layers.Embedding(encoder.vocab_size, 64),
22 | tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
23 | tf.keras.layers.Dense(64, activation='relu'),
24 | tf.keras.layers.Dense(1, activation='sigmoid')])
25 | model.compile(loss='binary_crossentropy',
26 | optimizer=tf.keras.optimizers.Adam(1e-4),
27 | metrics=['accuracy'])
28 |
29 | history = model.fit(train_dataset, epochs=5, validation_data=test_dataset,
30 | validation_steps=30)
31 |
32 | def pad_to_size(vec, size):
33 | zeros = [0]*(size-len(vec))
34 | vec.extend(zeros)
35 | return vec
36 |
37 | def sample_predict(sentence, pad, model_):
38 | encoded_sample_pred_text = encoder.encode(sentence)
39 | if pad:
40 | encoded_sample_pred_text = pad_to_size(encoded_sample_pred_text, 64)
41 | encoded_sample_pred_text = tf.cast(encoded_sample_pred_text, tf.float32)
42 | predictions = model_.predict(tf.expand_dims(encoded_sample_pred_text, 0))
43 |
44 | return predictions
45 |
46 | sample_text = ('This movie was awesome. The acting was incredible. Highly recommend')
47 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
48 |
49 | print('probability this is a positive review %.2f' % predictions)
50 |
51 | sample_text = ('This movie was so so. The acting was mediocre. Kind of recommend')
52 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
53 |
54 | print('probability this is a positive review %.2f' % predictions)
55 |
56 | model = tf.keras.Sequential([tf.keras.layers.Embedding(encoder.vocab_size, 64),
57 | tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
58 | tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
59 | tf.keras.layers.Dense(64, activation='relu'),
60 | tf.keras.layers.Dropout(0.5),
61 | tf.keras.layers.Dense(1, activation='sigmoid')])
62 | model.compile(loss='binary_crossentropy',
63 | optimizer=tf.keras.optimizers.Adam(1e-4),
64 | metrics=['accuracy'])
65 |
66 | history = model.fit(train_dataset, epochs=5, validation_data=test_dataset,
67 | validation_steps=30)
68 | sample_text = ('This movie was awesome. The acting was incredible. Highly recommend')
69 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
70 |
71 | print('probability this is a positive review %.2f' % predictions)
72 |
73 | sample_text = ('This movie was so so. The acting was mediocre. Kind of recommend')
74 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
75 |
76 | print('probability this is a positive review %.2f' % predictions)
77 |
78 |
--------------------------------------------------------------------------------
/threaded.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import numpy as np
3 | import glob
4 | from keras.preprocessing.image import ImageDataGenerator
5 | from multiprocessing.dummy import Pool as ThreadPool
6 |
7 | def augment_images(raw_images, files, mult_factor):
8 | gen = ImageDataGenerator()
9 | for idx, image in enumerate(raw_images):
10 | for mult in range(mult_factor):
11 | img_fname = files[idx].split('/')[4]
12 | img_fname = '../../Data/AugmentedImages/' + \
13 |                         img_fname.split('.')[0] + '_' + str(mult) + '.jpg'
14 |
15 | theta_tfx = np.random.choice(range(270))
16 | transformed_raw_image = gen.apply_transform(image,
17 |                                                         {'theta': theta_tfx})
18 | new_image = Image.fromarray(transformed_raw_image, 'RGB')
19 | new_image = new_image.resize((1024, 1024), Image.ANTIALIAS)
20 | new_image.save(img_fname)
21 | transformed_raw_image = None
22 | new_image = None
23 |
24 | if __name__ == '__main__':
25 | raw_images_dir = '../../Data/RawImages/'
26 |     raw_image_files = sorted(glob.glob(raw_images_dir + '*.jpg',
27 | recursive=True))
28 |
29 | img_list = []
30 |     for file in raw_image_files:
31 | img_list.append(np.array(Image.open(file)))
32 | augment_images(img_list, raw_image_files, mult_factor=10)
33 |
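Note: ThreadPool is imported above but never used, so the augmentation runs serially. A minimal sketch of how the same work could be dispatched across a thread pool (my assumption about the intended use, relying on the img_list and raw_image_files built in the __main__ block above; not the repository's code):

    from multiprocessing.dummy import Pool as ThreadPool

    def augment_one(args):
        # augment a single (image, filename) pair using the function defined above
        image, fname = args
        augment_images([image], [fname], mult_factor=10)

    with ThreadPool(4) as pool:
        pool.map(augment_one, list(zip(img_list, raw_image_files)))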
--------------------------------------------------------------------------------