├── CNN.py ├── Kaggle └── venus-volcanoes.ipynb ├── README.md ├── ReinforcementLearning ├── CombinedExperienceReplay │ ├── dqn_torch.py │ ├── main.py │ ├── memory.py │ ├── memory_solution.py │ └── plot.py ├── DeepQLearning │ ├── archive │ │ ├── dqn_keras.py │ │ ├── dqn_tf.py │ │ ├── frame_stack_test.py │ │ ├── main_keras_dqn_pong.py │ │ ├── main_tf_dqn_lunar_lander.py │ │ ├── main_torch_dqn_lunar_lander.py │ │ ├── main_torch_dqn_space_invaders.py │ │ ├── q_eval.h5 │ │ ├── q_next.h5 │ │ ├── simple_dqn_tf.py │ │ ├── simple_dqn_torch.py │ │ └── torch_deep_q_model.py │ ├── ddqn_keras.py │ ├── dueling_ddqn_tf2.py │ ├── dueling_ddqn_torch.py │ ├── dueling_dqn_keras.py │ ├── dueling_dqn_torch.py │ ├── main_keras_ddqn_lunar_lander.py │ ├── main_keras_dqn_lunar_lander.py │ ├── main_keras_dueling_dqn_lunar_lander.py │ ├── main_tf2_dqn_lunar_lander.py │ ├── main_tf2_dueling_ddqn_lunar_lander.py │ ├── main_tf_dqn_breakout.py │ ├── main_torch_dqn_lunar_lander_2020.py │ ├── main_torch_dueling_ddqn_lunar_lander.py │ ├── main_torch_dueling_dqn_lunar_lander.py │ ├── simple_dqn_keras.py │ ├── simple_dqn_tf2.py │ ├── simple_dqn_torch_2020.py │ └── utils.py ├── Fundamentals │ ├── acrobot.py │ ├── blackJack-no-es.py │ ├── blackJack-off-policy.py │ ├── cartpole_qlearning.py │ ├── doubleQLearning.py │ ├── dynamic_programming.py │ ├── gridworld.py │ ├── mountaincar.png │ ├── mountaincar.py │ ├── n_step_sarsa.py │ └── sarsa.py ├── ICM │ ├── A3C_CartPole_no_rewards.png │ ├── ICM_CartPole_no_rewards.png │ ├── actor_critic.py │ ├── icm.py │ ├── main.py │ ├── memory.py │ ├── parallel_env.py │ ├── shared_adam.py │ ├── utils.py │ └── worker.py └── PolicyGradient │ ├── A3C │ └── pytorch │ │ └── a3c.py │ ├── DDPG │ ├── pytorch │ │ └── lunar-lander │ │ │ ├── Torch-LunarLander-alpha000025-beta00025-400-300.png │ │ │ ├── ddpg_torch.py │ │ │ ├── main_torch.py │ │ │ └── utils.py │ ├── tensorflow │ │ ├── pendulum │ │ │ ├── ddpg_orig_tf.py │ │ │ ├── ddpg_tf.py │ │ │ ├── main_tf.py │ │ │ └── utils.py │ │ └── walker2d │ │ │ ├── ddpg_orig_tf.py │ │ │ ├── main_tf.py │ │ │ └── tmp │ │ │ └── ddpg_best_3 │ │ │ ├── Actor_ddpg.ckpt.data-00000-of-00001 │ │ │ ├── Actor_ddpg.ckpt.index │ │ │ ├── Actor_ddpg.ckpt.meta │ │ │ ├── Critic_ddpg.ckpt.data-00000-of-00001 │ │ │ ├── Critic_ddpg.ckpt.index │ │ │ ├── Critic_ddpg.ckpt.meta │ │ │ ├── TargetActor_ddpg.ckpt.data-00000-of-00001 │ │ │ ├── TargetActor_ddpg.ckpt.index │ │ │ ├── TargetActor_ddpg.ckpt.meta │ │ │ ├── TargetCritic_ddpg.ckpt.data-00000-of-00001 │ │ │ ├── TargetCritic_ddpg.ckpt.index │ │ │ └── TargetCritic_ddpg.ckpt.meta │ └── tensorflow2 │ │ └── pendulum │ │ ├── buffer.py │ │ ├── ddpg_tf2.py │ │ ├── main_ddpg.py │ │ ├── networks.py │ │ ├── pendulum.png │ │ └── utils.py │ ├── PPO │ ├── tf2 │ │ ├── agent.py │ │ ├── main.py │ │ ├── memory.py │ │ ├── networks.py │ │ └── utils.py │ └── torch │ │ ├── Slides.pdf │ │ ├── cartpole.png │ │ ├── main.py │ │ ├── ppo_torch.py │ │ └── utils.py │ ├── SAC │ ├── buffer.py │ ├── main_sac.py │ ├── networks.py │ ├── sac_torch.py │ ├── tf2 │ │ ├── Slides.pdf │ │ ├── buffer.py │ │ ├── main_sac.py │ │ ├── networks.py │ │ ├── plots │ │ │ └── inverted_pendulum.png │ │ ├── sac_tf2.py │ │ └── utils.py │ └── utils.py │ ├── TD3 │ ├── main.py │ ├── td3_torch.py │ ├── tf2 │ │ ├── main.py │ │ ├── plots │ │ │ └── walker_1500_games.png │ │ ├── td3_tf2.py │ │ └── utils.py │ └── utils.py │ ├── actor_critic │ ├── actor_critic_continuous.py │ ├── actor_critic_keras.py │ ├── actor_critic_replay_torch.py │ ├── cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc.png │ 
├── continuous_mountain_car_actor_critic.py │ ├── discrete_cartpole.py │ ├── main_keras_actor_critic_lunar_lander.py │ ├── main_torch_actor_critic_replay_lunar_lander.py │ ├── mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png │ ├── tensorflow2 │ │ ├── actor_critic.py │ │ ├── cartpole.png │ │ ├── main.py │ │ ├── networks.py │ │ └── utils.py │ ├── torch_actor_critic_discrete.py │ ├── torch_discrete_lunar_lander.py │ └── utils.py │ └── reinforce │ ├── main_keras_reinforce_lunar_lander.py │ ├── main_tf_reinforce_lunar_lander.py │ ├── main_tf_reinforce_space_invaders.py │ ├── main_torch_reinforce_lunar_lander.py │ ├── reinforce_cnn_tf.py │ ├── reinforce_keras.py │ ├── reinforce_tf.py │ ├── reinforce_torch.py │ ├── space-invaders-alpha001-gamma0p9-decay0p99-newGcalc.png │ ├── tensorflow2 │ ├── lunar-lander-tf2-256x256-alpha0005-2000games.png │ ├── main.py │ ├── networks.py │ └── reinforce_tf2.py │ └── utils.py ├── basic_encryption ├── caesar.py ├── common.py ├── one_time_pad.py └── vignere.py ├── cmdline.py ├── giveaway_scrubbed.py ├── giveaway_scrubbed_3-23.py ├── giveaway_scrubbed_9-22.py ├── modular_cnn.py ├── simple_cnn_mnist.py ├── simple_nn_mnist.py ├── tf_embeddings.py ├── tf_sentiment.py ├── tf_text_gen.py └── threaded.py /CNN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import struct 3 | import tensorflow as tf 4 | 5 | def load_data(): 6 | with open('train-labels.idx1-ubyte', 'rb') as labels: 7 | magic, n = struct.unpack('>II', labels.read(8)) 8 | train_labels = np.fromfile(labels, dtype=np.uint8) 9 | with open('train-images.idx3-ubyte', 'rb') as imgs: 10 | magic, num, nrows, ncols = struct.unpack('>IIII', imgs.read(16)) 11 | train_images = np.fromfile(imgs, dtype=np.uint8).reshape(num,784) 12 | with open('t10k-labels.idx1-ubyte', 'rb') as labels: 13 | magic, n = struct.unpack('>II', labels.read(8)) 14 | test_labels = np.fromfile(labels, dtype=np.uint8) 15 | with open('t10k-images.idx3-ubyte', 'rb') as imgs: 16 | magic, num, nrows, ncols = struct.unpack('>IIII', imgs.read(16)) 17 | test_images = np.fromfile(imgs, np.uint8).reshape(num,784) 18 | return train_images, train_labels, test_images, test_labels 19 | 20 | def cnn_model_fn(features, labels, mode): 21 | input_layer = tf.cast(tf.reshape(features['x'], [-1, 28, 28, 1]), tf.float16) 22 | 23 | conv1 = tf.layers.conv2d(inputs=input_layer, 24 | filters=16, 25 | kernel_size=[5,5], 26 | padding='same', 27 | activation=tf.nn.relu) 28 | 29 | pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2,2], strides=2) 30 | 31 | conv2 = tf.layers.conv2d(inputs=pool1, 32 | filters=32, 33 | kernel_size=[5,5], 34 | padding='same', 35 | activation=tf.nn.relu) 36 | 37 | pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size = [2,2], strides=2) 38 | 39 | pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 32]) 40 | 41 | dense = tf.layers.dense(inputs=pool2_flat, units=128, activation=tf.nn.relu) 42 | logits = tf.layers.dense(inputs=dense, units=10) 43 | 44 | predictions = { 45 | 'classes': tf.argmax(input=logits, axis=1), 46 | 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') 47 | } 48 | 49 | if mode == tf.estimator.ModeKeys.PREDICT: 50 | return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) 51 | 52 | onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10) 53 | 54 | loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits) 55 | 56 | if mode == tf.estimator.ModeKeys.TRAIN: 57 | optimizer = 
tf.train.GradientDescentOptimizer(learning_rate=0.001) 58 | train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) 59 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) 60 | 61 | if mode == tf.estimator.ModeKeys.EVAL: 62 | eval_metric_ops = { 'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions['classes'])} 63 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) 64 | 65 | if __name__ == '__main__': 66 | training_data, training_labels, testing_data, testing_labels = load_data() 67 | num_epochs = 10 68 | 69 | classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, 70 | model_dir='tmp/') 71 | 72 | input_fn = tf.estimator.inputs.numpy_input_fn( 73 | x={"x": training_data}, 74 | y=training_labels, 75 | batch_size=32, 76 | num_epochs=None, 77 | shuffle=True) 78 | 79 | for i in range(num_epochs): 80 | classifier.train(input_fn=input_fn, steps=1000) 81 | 82 | eval_input_fn = tf.estimator.inputs.numpy_input_fn( 83 | x={'x': testing_data}, 84 | y=testing_labels, 85 | shuffle=False) 86 | 87 | eval_results = classifier.evaluate(input_fn=eval_input_fn) 88 | print('these are the results of my evaluations') 89 | print(eval_results) 90 | 91 | pred_input_fn = tf.estimator.inputs.numpy_input_fn( 92 | x={'x': testing_data}, 93 | y=testing_labels, 94 | num_epochs=1, 95 | shuffle=False) 96 | 97 | pred_results = classifier.predict(input_fn=pred_input_fn) 98 | predicted_classes = [p['classes'] for p in pred_results] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Youtube-Code-Repository 2 | Repository for all the code from my youtube channel 3 | You can find me at https://youtube.com/MachineLearningWithPhil
4 | 5 |

## Kaggle/Venus-Volcanoes

6 | 7 | My crude implementation of a convolutional neural network to perform image classification on data gathered
8 | by the Magellan spacecraft. The data is horribly skewed, as most images do not contain a volcano.
9 | This means we'll have to do some creative data engineering for our model training.
10 | Please note that in the test set, 84.1% of the data is "no volcano", and our model returns
11 | an accuracy of around 88%, which beats the 84.1% a model that outputs straight 0s (always predicting "no volcano") would achieve.
12 | 13 | You can check out the video for this at https://youtu.be/Ki-xOKydQrY
14 | You can find the data for this project at https://www.kaggle.com/fmena14/volcanoesvenus/home 15 |
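For context on that comparison, here is a minimal sketch of computing the majority-class baseline on an imbalanced test set (the label arrays are made up for illustration, not the actual Kaggle split):

```python
import numpy as np

# Hypothetical test labels: 0 = no volcano, 1 = volcano (illustrative counts only).
y_true = np.array([0] * 841 + [1] * 159)
y_pred = np.zeros_like(y_true)             # a "model" that always predicts no volcano

baseline_acc = np.mean(y_pred == y_true)   # ~0.841, the score any useful model must beat
print(f'majority-class baseline accuracy: {baseline_acc:.3f}')
```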

## ReinforcementLearning/DeepQLearning

16 | 17 | My implementation of the Deep Q learning algorithm in PyTorch. Here we teach the algorithm to play the game of Space Invaders. I haven't had enough time to train this model yet, as it takes quite some time even on my 1080Ti / i7 7820k @ 4.4 GHz. I'll train 18 | it longer and provide a video on how well it does at a later time. 19 | 20 | The blog post explaining how Deep Q learning works can be found at http://www.neuralnet.ai/coding-a-deep-q-network-in-pytorch/
21 | The video for this code is at https://www.youtube.com/watch?v=RfNxXlO6BiA&t=2s 22 | 23 | 24 | 25 |
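For quick reference, a minimal sketch of the temporal-difference target that Deep Q learning bootstraps from (tensor values and names are illustrative, not taken from the scripts in this repo):

```python
import torch as T

gamma = 0.99
q_pred = T.tensor([1.2, 0.3, -0.5, 2.0])   # Q(s, a) for a batch of 4 sampled transitions
q_next = T.rand(4, 6)                      # target-network Q values for the next states
rewards = T.tensor([0.0, 1.0, -1.0, 0.0])
dones = T.tensor([False, False, True, False])

q_next_max = q_next.max(dim=1).values
q_next_max[dones] = 0.0                    # no bootstrapping from terminal states
q_target = rewards + gamma * q_next_max    # y = r + gamma * max_a' Q_target(s', a')
loss = T.nn.functional.mse_loss(q_pred, q_target)
```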

## CNN.py

26 | 27 | Simple implementation of a convolutional neural network in TensorFlow, version 1.5.
28 | A video tutorial on this code can be found at https://youtu.be/azFyHS0odcM
29 | It achieves an accuracy of 98% after 10 epochs of training.
30 | Requires data from http://yann.lecun.com/exdb/mnist/
31 | 32 |

## ReinforcementLearning/blackJack-no-es.py

33 | 34 | Implementation of Monte Carlo control without exploring starts in the blackjack environment from the OpenAI gym.
35 | Video tutorial on this code can be found at https://youtu.be/e8ofon3sg8E
36 | The algorithm trains for 1,000,000 games and produces a win rate of around 42%, a loss rate of 52%, and a draw rate of 6%.
37 | 38 |
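For reference, a minimal sketch of the idea: first-visit Monte Carlo estimation of Q from complete episodes, with an epsilon-greedy policy taking the place of exploring starts (the toy episode and all names are illustrative, not taken from the script):

```python
import numpy as np

gamma, eps = 1.0, 0.05
Q, returns = {}, {}

# One toy blackjack-style episode of (state, action, reward) tuples.
episode = [((15, 6, False), 1, 0.0), ((19, 6, False), 0, 1.0)]

# First-visit Monte Carlo update of Q(s, a) from the episode's returns.
G = 0.0
for t, (s, a, r) in reversed(list(enumerate(episode))):
    G = gamma * G + r
    if (s, a) not in [(x[0], x[1]) for x in episode[:t]]:   # first visit only
        returns.setdefault((s, a), []).append(G)
        Q[(s, a)] = np.mean(returns[(s, a)])

# Epsilon-greedy improvement replaces the exploring-starts assumption.
def choose_action(s, n_actions=2):
    greedy = max(range(n_actions), key=lambda a: Q.get((s, a), 0.0))
    return np.random.randint(n_actions) if np.random.random() < eps else greedy
```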

## ReinforcementLearning/blackJack-off-policy.py

39 | 40 | Implementation of off policy Monte Carlo control in the blackjack environment from the OpenAI gym.
41 | Video tutorial on this code can be found at https://youtu.be/TvO0Sa-6UVc
42 | The algorithm trains for 1,000,000 games and produces a win rate of around 29%, a loss rate of 66%, and a draw rate of 5%.
43 | 44 |
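For reference, a minimal sketch of the weighted importance-sampling update that off-policy Monte Carlo control is built around (the toy episode, table sizes, and names are illustrative, not taken from the script):

```python
import numpy as np

gamma = 1.0
Q = np.zeros((10, 2))      # toy state-action value table
C = np.zeros((10, 2))      # cumulative importance-sampling weights

# One toy episode of (state, action, reward) generated by the behaviour policy.
episode = [(3, 1, 0.0), (7, 0, 1.0)]
b_prob = 0.5               # behaviour policy: uniform random over 2 actions

G, W = 0.0, 1.0
for s, a, r in reversed(episode):
    G = gamma * G + r
    C[s, a] += W
    Q[s, a] += (W / C[s, a]) * (G - Q[s, a])   # weighted importance-sampling update
    if a != np.argmax(Q[s]):                   # target policy is greedy w.r.t. Q
        break                                  # the remaining weight would be zero
    W *= 1.0 / b_prob                          # pi(a|s) = 1 for the greedy action
```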

## ReinforcementLearning/cartpole_qlearning.py

45 | 46 | Implementation of the Q learning algorithm for the cart pole problem. The code is based on the course by the Lazy Programmer,
47 | which you can find here
48 | Video tutorial on this code can be found at https://youtu.be/ViwBAK8Hd7Q
49 | 50 |
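Independent of the cart pole discretization details, the core of the method is the one-step Q-learning update; a minimal sketch with illustrative names:

```python
import numpy as np

alpha, gamma = 0.1, 0.99
Q = np.zeros((10, 2))      # toy table: 10 discretized states, 2 actions

def q_learning_update(s, a, r, s_next, done):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    target = r + gamma * np.max(Q[s_next]) * (not done)
    Q[s, a] += alpha * (target - Q[s, a])

q_learning_update(s=3, a=1, r=1.0, s_next=4, done=False)
```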

## ReinforcementLearning/doubleQLearning.py

51 | 52 | Implementation of the double Q learning algorithm in the cart pole environment. This is based on my course on
53 | reinforcement learning, which you can find at this repo
54 | Video tutorial on this code can be found at https://youtu.be/Q99bEPStnxk
55 | 56 |
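For reference, a minimal sketch of the double Q-learning update rule (table sizes and names are illustrative, not taken from the script):

```python
import numpy as np

alpha, gamma = 0.1, 0.99
Q1, Q2 = np.zeros((10, 2)), np.zeros((10, 2))   # two independent toy value tables

def double_q_update(s, a, r, s_next, done):
    # Randomly pick which table to update; the other table evaluates the greedy
    # action, which removes the maximization bias of plain Q-learning.
    if np.random.random() < 0.5:
        a_star = np.argmax(Q1[s_next])
        target = r + gamma * Q2[s_next, a_star] * (not done)
        Q1[s, a] += alpha * (target - Q1[s, a])
    else:
        a_star = np.argmax(Q2[s_next])
        target = r + gamma * Q1[s_next, a_star] * (not done)
        Q2[s, a] += alpha * (target - Q2[s, a])

double_q_update(s=3, a=0, r=1.0, s_next=4, done=False)
```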

## ReinforcementLearning/sarsa.py

57 | 58 | Implementation of the SARSA algorithm in the cart pole environment. This is based on my course on reinforcement learning, 59 | which can be found here
60 | Video tutorial on this code can be found at https://youtu.be/P9XezMuPfLE
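For reference, a minimal sketch of the on-policy SARSA update, which differs from Q-learning by bootstrapping from the action actually taken in the next state (names are illustrative, not taken from the script):

```python
import numpy as np

alpha, gamma = 0.1, 0.99
Q = np.zeros((10, 2))      # toy table: 10 discretized states, 2 actions

def sarsa_update(s, a, r, s_next, a_next, done):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
    target = r + gamma * Q[s_next, a_next] * (not done)
    Q[s, a] += alpha * (target - Q[s, a])

sarsa_update(s=3, a=1, r=1.0, s_next=4, a_next=0, done=False)
```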
61 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/dqn_torch.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | from memory import ReplayMemory 7 | 8 | 9 | class DeepQNetwork(nn.Module): 10 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions): 11 | super(DeepQNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.n_actions = n_actions 16 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 17 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 18 | self.fc3 = nn.Linear(self.fc2_dims, self.n_actions) 19 | 20 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 21 | self.loss = nn.MSELoss() 22 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 23 | self.to(self.device) 24 | 25 | def forward(self, state): 26 | x = F.relu(self.fc1(state)) 27 | x = F.relu(self.fc2(x)) 28 | actions = self.fc3(x) 29 | 30 | return actions 31 | 32 | 33 | class Agent: 34 | def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, 35 | combined=False, max_mem_size=100000, eps_end=0.05, 36 | eps_dec=5e-4): 37 | self.gamma = gamma 38 | self.epsilon = epsilon 39 | self.eps_min = eps_end 40 | self.eps_dec = eps_dec 41 | self.lr = lr 42 | self.action_space = [i for i in range(n_actions)] 43 | self.batch_size = batch_size 44 | self.memory = ReplayMemory(input_dims, max_mem_size, 45 | batch_size, combined) 46 | self.iter_cntr = 0 47 | self.replace_target = 100 48 | 49 | self.Q_eval = DeepQNetwork(lr, n_actions=n_actions, 50 | input_dims=input_dims, 51 | fc1_dims=256, fc2_dims=256) 52 | self.Q_next = DeepQNetwork(lr, n_actions=n_actions, 53 | input_dims=input_dims, 54 | fc1_dims=256, fc2_dims=256) 55 | 56 | def choose_action(self, observation): 57 | if np.random.random() > self.epsilon: 58 | state = T.tensor([observation]).to(self.Q_eval.device) 59 | actions = self.Q_eval.forward(state) 60 | action = T.argmax(actions).item() 61 | else: 62 | action = np.random.choice(self.action_space) 63 | 64 | return action 65 | 66 | def learn(self): 67 | if not self.memory.is_sufficient(): 68 | return 69 | 70 | self.Q_eval.optimizer.zero_grad() 71 | batch_index = np.arange(self.batch_size, dtype=np.int32) 72 | states, actions, rewards, new_states, dones = \ 73 | self.memory.sample_memory() 74 | states = T.tensor(states).to(self.Q_eval.device) 75 | new_states = T.tensor(new_states).to(self.Q_eval.device) 76 | rewards = T.tensor(rewards).to(self.Q_eval.device) 77 | dones = T.tensor(dones).to(self.Q_eval.device) 78 | q_eval = self.Q_eval.forward(states)[batch_index, actions] 79 | q_next = self.Q_eval.forward(new_states) 80 | q_next[dones] = 0.0 81 | q_target = rewards + self.gamma*T.max(q_next, dim=1)[0] 82 | 83 | loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device) 84 | loss.backward() 85 | self.Q_eval.optimizer.step() 86 | 87 | self.iter_cntr += 1 88 | self.epsilon = self.epsilon - self.eps_dec \ 89 | if self.epsilon > self.eps_min else self.eps_min 90 | 91 | if self.iter_cntr % self.replace_target == 0: 92 | self.Q_next.load_state_dict(self.Q_eval.state_dict()) 93 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/main.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from dqn_torch import Agent 4 | import numpy as np 5 | 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='') 9 | parser.add_argument('-bs', type=int, default=1000) 10 | parser.add_argument('-cer', type=bool, default=False) 11 | # if you supply it, then true 12 | args = parser.parse_args() 13 | 14 | env = gym.make('LunarLander-v2') 15 | combined = args.cer 16 | buffer_size = args.bs 17 | 18 | agent = Agent(gamma=0.99, epsilon=0.1, batch_size=64, n_actions=4, 19 | eps_end=0.1, input_dims=[8], lr=0.001, 20 | max_mem_size=buffer_size, combined=combined) 21 | 22 | scores = [] 23 | n_games = 500 24 | for i in range(n_games): 25 | score = 0 26 | done = False 27 | observation = env.reset() 28 | while not done: 29 | action = agent.choose_action(observation) 30 | observation_, reward, done, info = env.step(action) 31 | score += reward 32 | agent.memory.store_transition(observation, action, reward, 33 | observation_, done) 34 | agent.learn() 35 | observation = observation_ 36 | scores.append(score) 37 | 38 | avg_score = np.mean(scores[-100:]) 39 | 40 | print('combined {} episode {} score {:.0f} avg score {:.0f} eps {:.2f}' 41 | .format(combined, i, score, avg_score, agent.epsilon)) 42 | 43 | if combined: 44 | fname = 'CER_const_eps_' + str(buffer_size) + '.npy' 45 | else: 46 | fname = 'VER_const_eps_' + str(buffer_size) + '.npy' 47 | np.save(fname, np.array(scores)) 48 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ReplayMemory: 5 | def __init__(self, input_dims, max_mem, batch_size, combined=False): 6 | pass 7 | 8 | def store_transition(self, state, action, reward, state_, terminal): 9 | pass 10 | 11 | def sample_memory(self): 12 | pass 13 | 14 | def is_sufficient(self): 15 | pass 16 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/memory_solution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ReplayMemory: 5 | def __init__(self, input_dims, max_mem, batch_size, combined=False): 6 | self.mem_size = max_mem 7 | self.batch_size = batch_size 8 | self.mem_cntr = 0 9 | self.combined = combined 10 | self.state_memory = np.zeros((self.mem_size, *input_dims), 11 | dtype=np.float32) 12 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 13 | dtype=np.float32) 14 | self.action_memory = np.zeros(self.mem_size, dtype=np.int32) 15 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 16 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 17 | 18 | def store_transition(self, state, action, reward, state_, terminal): 19 | index = self.mem_cntr % self.mem_size 20 | self.state_memory[index] = state 21 | self.action_memory[index] = action 22 | self.reward_memory[index] = reward 23 | self.new_state_memory[index] = state_ 24 | self.terminal_memory[index] = terminal 25 | 26 | self.mem_cntr += 1 27 | 28 | def sample_memory(self): 29 | offset = 1 if self.combined else 0 30 | max_mem = min(self.mem_cntr, self.mem_size) - offset 31 | batch = np.random.choice(max_mem, self.batch_size-offset, 32 | replace=False) 33 | states = self.state_memory[batch] 34 | new_states = 
self.new_state_memory[batch] 35 | actions = self.action_memory[batch] 36 | rewards = self.reward_memory[batch] 37 | terminals = self.terminal_memory[batch] 38 | 39 | if self.combined: 40 | index = self.mem_cntr % self.mem_size - 1 41 | last_action = self.action_memory[index] 42 | last_state = self.state_memory[index] 43 | last_new_state = self.new_state_memory[index] 44 | last_reward = self.reward_memory[index] 45 | last_terminal = self.terminal_memory[index] 46 | 47 | actions = np.append(self.action_memory[batch], last_action) 48 | states = np.vstack((self.state_memory[batch], last_state)) 49 | new_states = np.vstack((self.new_state_memory[batch], 50 | last_new_state)) 51 | rewards = np.append(self.reward_memory[batch], last_reward) 52 | terminals = np.append(self.terminal_memory[batch], last_terminal) 53 | 54 | return states, actions, rewards, new_states, terminals 55 | 56 | def is_sufficient(self): 57 | return self.mem_cntr > self.batch_size 58 | -------------------------------------------------------------------------------- /ReinforcementLearning/CombinedExperienceReplay/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | cer_1k = np.load('CER_const_eps_1000.npy') 4 | cer_10k = np.load('CER_const_eps_10000.npy') 5 | cer_100k = np.load('CER_const_eps_100000.npy') 6 | 7 | ver_1k = np.load('VER_const_eps_1000.npy') 8 | ver_10k = np.load('VER_const_eps_10000.npy') 9 | ver_100k = np.load('VER_const_eps_100000.npy') 10 | 11 | running_cer1k_avg = np.zeros(len(cer_1k)) 12 | running_cer10k_avg = np.zeros(len(cer_10k)) 13 | running_cer100k_avg = np.zeros(len(cer_100k)) 14 | running_ver1k_avg = np.zeros(len(ver_1k)) 15 | running_ver10k_avg = np.zeros(len(ver_10k)) 16 | running_ver100k_avg = np.zeros(len(ver_100k)) 17 | 18 | for i in range(len(cer_1k)): 19 | running_cer1k_avg[i] = np.mean(cer_1k[max(0, i-100):(i+1)]) 20 | running_cer10k_avg[i] = np.mean(cer_10k[max(0, i-100):(i+1)]) 21 | running_cer100k_avg[i] = np.mean(cer_100k[max(0, i-100):(i+1)]) 22 | running_ver1k_avg[i] = np.mean(ver_1k[max(0, i-100):(i+1)]) 23 | running_ver10k_avg[i] = np.mean(ver_10k[max(0, i-100):(i+1)]) 24 | running_ver100k_avg[i] = np.mean(ver_100k[max(0, i-100):(i+1)]) 25 | 26 | 27 | x_axis = np.arange(len(cer_1k)) 28 | plt.plot(x_axis, running_cer1k_avg, 'r--', label='CER (1,000)') 29 | plt.plot(x_axis, running_ver1k_avg, 'b--', label='VER (1,000)') 30 | plt.xlabel('Episode') 31 | plt.ylabel('Avg Score') 32 | plt.legend(loc='lower right') 33 | plt.savefig('CER_vs_VER_1000_const_eps.png') 34 | plt.close() 35 | 36 | x_axis = np.arange(len(cer_10k)) 37 | plt.plot(x_axis, running_cer10k_avg, 'r--', label='CER (10,000)') 38 | plt.plot(x_axis, running_ver10k_avg, 'b--', label='VER (10,000)') 39 | plt.xlabel('Episode') 40 | plt.ylabel('Avg Score') 41 | plt.legend(loc='lower right') 42 | plt.savefig('CER_vs_VER_10000_const_eps.png') 43 | plt.close() 44 | 45 | x_axis = np.arange(len(cer_100k)) 46 | plt.plot(x_axis, running_cer100k_avg, 'r--', label='CER (100,000)') 47 | plt.plot(x_axis, running_ver100k_avg, 'b--', label='VER (100,000)') 48 | plt.xlabel('Episode') 49 | plt.ylabel('Avg Score') 50 | plt.legend(loc='lower right') 51 | plt.savefig('CER_vs_VER_100000_const_eps.png') 52 | plt.close() 53 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/frame_stack_test.py: -------------------------------------------------------------------------------- 1 | 
import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | ACTION_DICT = {0: 'NOOP', 1: 'FIRE', 2:'RIGHT', 3:'LEFT'} 6 | 7 | def preprocess(observation): 8 | observation = observation / 255 9 | return np.mean(observation[30:,:], axis=2).reshape(180,160) 10 | 11 | def stack_frames(stacked_frames, frame, stack_size, actions, action): 12 | if stacked_frames is None: 13 | stacked_frames = np.zeros((*frame.shape, stack_size)) 14 | actions = np.zeros(stack_size) 15 | for idx in range(stack_size): 16 | stacked_frames[:,:,idx] = frame 17 | else: 18 | stacked_frames[:,:,0:stack_size-1] = stacked_frames[:,:,1:] 19 | stacked_frames[:,:,stack_size-1] = frame 20 | actions[0:stack_size-1] = actions[1:] 21 | actions[stack_size-1] = action 22 | fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4) 23 | 24 | ax1.imshow(stacked_frames[:,:,0]) 25 | ax1.set_title(ACTION_DICT[actions[0]]) 26 | ax2.imshow(stacked_frames[:,:,1]) 27 | ax2.set_title(ACTION_DICT[actions[1]]) 28 | ax3.imshow(stacked_frames[:,:,2]) 29 | ax3.set_title(ACTION_DICT[actions[2]]) 30 | ax4.imshow(stacked_frames[:,:,3]) 31 | ax4.set_title(ACTION_DICT[actions[3]]) 32 | plt.show() 33 | 34 | return actions, stacked_frames 35 | 36 | if __name__ == '__main__': 37 | env = gym.make('Breakout-v0') 38 | stack_size = 4 39 | 40 | for i in range(10): 41 | done = False 42 | observation = env.reset() 43 | observation = preprocess(observation) 44 | stacked_frames = None 45 | actions=None 46 | actions, stacked_frames = stack_frames(stacked_frames, observation, 47 | stack_size, actions, 0) 48 | while not done: 49 | action = env.action_space.sample() 50 | observation_, reward, done, info = env.step(action) 51 | actions, stacked_frames_ = stack_frames(stacked_frames, 52 | preprocess(observation_), stack_size, 53 | actions, action) 54 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/main_keras_dqn_pong.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from dqn_keras import Agent 3 | from utils import plotLearning, make_env 4 | 5 | if __name__ == '__main__': 6 | env = make_env('PongNoFrameskip-v4') 7 | 8 | num_games = 500 9 | load_checkpoint = False 10 | best_score = -21 11 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0001, 12 | input_dims=(4,80,80), n_actions=6, mem_size=25000, 13 | eps_min=0.02, batch_size=32, replace=1000, eps_dec=1e-5) 14 | 15 | if load_checkpoint: 16 | agent.load_models() 17 | 18 | filename = 'PongNoFrameskip-v4.png' 19 | 20 | scores, eps_history = [], [] 21 | n_steps = 0 22 | 23 | for i in range(num_games): 24 | done = False 25 | observation = env.reset() 26 | score = 0 27 | while not done: 28 | action = agent.choose_action(observation) 29 | observation_, reward, done, info = env.step(action) 30 | n_steps += 1 31 | score += reward 32 | if not load_checkpoint: 33 | agent.store_transition(observation, action, 34 | reward, observation_, int(done)) 35 | agent.learn() 36 | else: 37 | env.render() 38 | observation = observation_ 39 | 40 | scores.append(score) 41 | 42 | avg_score = np.mean(scores[-100:]) 43 | print('episode: ', i,'score: ', score, 44 | ' average score %.3f' % avg_score, 45 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 46 | if avg_score > best_score: 47 | agent.save_models() 48 | print('avg score %.2f better than best score %.2f, saving model' % ( 49 | avg_score, best_score)) 50 | best_score = avg_score 51 | 52 | eps_history.append(agent.epsilon) 53 | 54 | x = [i+1 for 
i in range(num_games)] 55 | plot_learning_curve(x, scores, eps_history, filename) 56 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/main_tf_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from simple_dqn_tf import DeepQNetwork, Agent 3 | from utils import plotLearning 4 | import numpy as np 5 | from gym import wrappers 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | if __name__ == '__main__': 10 | env = gym.make('LunarLander-v2') 11 | lr = 0.0005 12 | n_games = 500 13 | 14 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=lr, input_dims=[8], 15 | n_actions=4, mem_size=1000000, n_games=n_games, 16 | batch_size=64) 17 | 18 | #load_checkpoint = True 19 | #if load_checkpoint: 20 | # agent.load_models() 21 | 22 | alpha = 'alpha' + str(lr)#.split('.')[1] 23 | 24 | filename = '0-lunar-lander-256x256-' + alpha + '-bs64-adam-faster_decay.png' 25 | scores = [] 26 | eps_history = [] 27 | 28 | score = 0 29 | env = wrappers.Monitor(env, "tmp/lunar-lander-4", 30 | video_callable=lambda episode_id: True, force=True) 31 | 32 | for i in range(n_games): 33 | done = False 34 | if i % 10 == 0 and i > 0: 35 | avg_score = np.mean(scores[max(0, i-10):(i+1)]) 36 | print('episode: ', i,'score: ', score, 37 | ' average score %.3f' % avg_score, 38 | 'epsilon %.3f' % agent.epsilon) 39 | #agent.save_models() 40 | else: 41 | print('episode: ', i,'score: ', score) 42 | 43 | observation = env.reset() 44 | score = 0 45 | while not done: 46 | action = agent.choose_action(observation) 47 | observation_, reward, done, info = env.step(action) 48 | score += reward 49 | agent.store_transition(observation, action, 50 | reward, observation_, int(done)) 51 | observation = observation_ 52 | agent.learn() 53 | 54 | eps_history.append(agent.epsilon) 55 | scores.append(score) 56 | 57 | x = [i+1 for i in range(n_games)] 58 | plotLearning(x, scores, eps_history, filename) 59 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/main_torch_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from simple_dqn_torch import DeepQNetwork, Agent 3 | from utils import plotLearning 4 | import numpy as np 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | env = gym.make('LunarLander-v2') 9 | brain = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4, 10 | input_dims=[8], alpha=0.003) 11 | 12 | scores = [] 13 | eps_history = [] 14 | num_games = 500 15 | score = 0 16 | # uncomment the line below to record every episode. 
17 | #env = wrappers.Monitor(env, "tmp/space-invaders-1", 18 | #video_callable=lambda episode_id: True, force=True) 19 | for i in range(num_games): 20 | if i % 10 == 0 and i > 0: 21 | avg_score = np.mean(scores[max(0, i-10):(i+1)]) 22 | print('episode: ', i,'score: ', score, 23 | ' average score %.3f' % avg_score, 24 | 'epsilon %.3f' % brain.EPSILON) 25 | else: 26 | print('episode: ', i,'score: ', score) 27 | eps_history.append(brain.EPSILON) 28 | done = False 29 | observation = env.reset() 30 | score = 0 31 | while not done: 32 | action = brain.chooseAction(observation) 33 | observation_, reward, done, info = env.step(action) 34 | score += reward 35 | brain.storeTransition(observation, action, reward, observation_, 36 | done) 37 | observation = observation_ 38 | brain.learn() 39 | 40 | scores.append(score) 41 | 42 | x = [i+1 for i in range(num_games)] 43 | filename = str(num_games) + 'Games' + 'Gamma' + str(brain.GAMMA) + \ 44 | 'Alpha' + str(brain.ALPHA) + 'Memory' + \ 45 | str(brain.Q_eval.fc1_dims) + '-' + str(brain.Q_eval.fc2_dims) +'.png' 46 | plotLearning(x, scores, eps_history, filename) 47 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/main_torch_dqn_space_invaders.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from torch_deep_q_model import DeepQNetwork, Agent 3 | from utils import plotLearning 4 | import numpy as np 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | env = gym.make('SpaceInvaders-v0') 9 | brain = Agent(gamma=0.95, epsilon=1.0, 10 | alpha=0.003, maxMemorySize=5000, 11 | replace=None) 12 | while brain.memCntr < brain.memSize: 13 | observation = env.reset() 14 | done = False 15 | while not done: 16 | # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire 17 | action = env.action_space.sample() 18 | observation_, reward, done, info = env.step(action) 19 | if done and info['ale.lives'] == 0: 20 | reward = -100 21 | brain.storeTransition(np.mean(observation[15:200,30:125], axis=2), action, reward, 22 | np.mean(observation_[15:200,30:125], axis=2)) 23 | observation = observation_ 24 | print('done initializing memory') 25 | 26 | scores = [] 27 | epsHistory = [] 28 | numGames = 50 29 | batch_size=32 30 | # uncomment the line below to record every episode. 
31 | env = wrappers.Monitor(env, "tmp/space-invaders-1", video_callable=lambda episode_id: True, force=True) 32 | for i in range(numGames): 33 | print('starting game ', i+1, 'epsilon: %.4f' % brain.EPSILON) 34 | epsHistory.append(brain.EPSILON) 35 | done = False 36 | observation = env.reset() 37 | frames = [np.sum(observation[15:200,30:125], axis=2)] 38 | score = 0 39 | lastAction = 0 40 | while not done: 41 | if len(frames) == 3: 42 | action = brain.chooseAction(frames) 43 | frames = [] 44 | else: 45 | action = lastAction 46 | observation_, reward, done, info = env.step(action) 47 | score += reward 48 | frames.append(np.sum(observation_[15:200,30:125], axis=2)) 49 | if done and info['ale.lives'] == 0: 50 | reward = -100 51 | brain.storeTransition(np.mean(observation[15:200,30:125], axis=2), action, reward, 52 | np.mean(observation_[15:200,30:125], axis=2)) 53 | observation = observation_ 54 | brain.learn(batch_size) 55 | lastAction = action 56 | #env.render( 57 | scores.append(score) 58 | print('score:',score) 59 | x = [i+1 for i in range(numGames)] 60 | fileName = str(numGames) + 'Games' + 'Gamma' + str(brain.GAMMA) + \ 61 | 'Alpha' + str(brain.ALPHA) + 'Memory' + str(brain.memSize)+ '.png' 62 | plotLearning(x, scores, epsHistory, fileName) 63 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/q_eval.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/DeepQLearning/archive/q_eval.h5 -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/q_next.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/DeepQLearning/archive/q_next.h5 -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/archive/torch_deep_q_model.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | 7 | class DeepQNetwork(nn.Module): 8 | def __init__(self, ALPHA): 9 | super(DeepQNetwork, self).__init__() 10 | #self.conv1 = nn.Conv2d(3, 32, 8, stride=4, padding=1) 11 | self.conv1 = nn.Conv2d(1, 32, 8, stride=4, padding=1) 12 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 13 | self.conv3 = nn.Conv2d(64, 128, 3) 14 | #self.fc1 = nn.Linear(128*23*16, 512) 15 | self.fc1 = nn.Linear(128*19*8, 512) 16 | self.fc2 = nn.Linear(512, 6) 17 | #self.optimizer = optim.SGD(self.parameters(), lr=self.ALPHA, momentum=0.9) 18 | self.optimizer = optim.RMSprop(self.parameters(), lr=ALPHA) 19 | self.loss = nn.MSELoss() 20 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 21 | self.to(self.device) 22 | 23 | def forward(self, observation): 24 | observation = T.Tensor(observation).to(self.device) 25 | #observation = observation.view(-1, 3, 210, 160).to(self.device) 26 | observation = observation.view(-1, 1, 185, 95) 27 | observation = F.relu(self.conv1(observation)) 28 | observation = F.relu(self.conv2(observation)) 29 | observation = F.relu(self.conv3(observation)) 30 | #observation = observation.view(-1, 128*23*16).to(self.device) 
31 | observation = observation.view(-1, 128*19*8) 32 | observation = F.relu(self.fc1(observation)) 33 | actions = self.fc2(observation) 34 | return actions 35 | 36 | class Agent(object): 37 | def __init__(self, gamma, epsilon, alpha, 38 | maxMemorySize, epsEnd=0.05, 39 | replace=10000, actionSpace=[0,1,2,3,4,5]): 40 | self.GAMMA = gamma 41 | self.EPSILON = epsilon 42 | self.EPS_END = epsEnd 43 | self.ALPHA = alpha 44 | self.actionSpace = actionSpace 45 | self.memSize = maxMemorySize 46 | self.steps = 0 47 | self.learn_step_counter = 0 48 | self.memory = [] 49 | self.memCntr = 0 50 | self.replace_target_cnt = replace 51 | self.Q_eval = DeepQNetwork(alpha) 52 | self.Q_next = DeepQNetwork(alpha) 53 | 54 | def storeTransition(self, state, action, reward, state_): 55 | if self.memCntr < self.memSize: 56 | self.memory.append([state, action, reward, state_]) 57 | else: 58 | self.memory[self.memCntr%self.memSize] = [state, action, reward, state_] 59 | self.memCntr += 1 60 | 61 | def chooseAction(self, observation): 62 | rand = np.random.random() 63 | actions = self.Q_eval.forward(observation) 64 | if rand < 1 - self.EPSILON: 65 | action = T.argmax(actions[1]).item() 66 | else: 67 | action = np.random.choice(self.actionSpace) 68 | self.steps += 1 69 | return action 70 | 71 | def learn(self, batch_size): 72 | self.Q_eval.optimizer.zero_grad() 73 | if self.replace_target_cnt is not None and \ 74 | self.learn_step_counter % self.replace_target_cnt == 0: 75 | self.Q_next.load_state_dict(self.Q_eval.state_dict()) 76 | 77 | if self.memCntr+batch_size < self.memSize: 78 | memStart = int(np.random.choice(range(self.memCntr))) 79 | else: 80 | memStart = int(np.random.choice(range(self.memSize-batch_size-1))) 81 | miniBatch=self.memory[memStart:memStart+batch_size] 82 | memory = np.array(miniBatch) 83 | 84 | # convert to list because memory is an array of numpy objects 85 | Qpred = self.Q_eval.forward(list(memory[:,0][:])).to(self.Q_eval.device) 86 | Qnext = self.Q_next.forward(list(memory[:,3][:])).to(self.Q_eval.device) 87 | 88 | maxA = T.argmax(Qnext, dim=1).to(self.Q_eval.device) 89 | rewards = T.Tensor(list(memory[:,2])).to(self.Q_eval.device) 90 | Qtarget = Qpred.clone() 91 | indices = np.arange(batch_size) 92 | Qtarget[indices,maxA] = rewards + self.GAMMA*T.max(Qnext[1]) 93 | 94 | if self.steps > 500: 95 | if self.EPSILON - 1e-4 > self.EPS_END: 96 | self.EPSILON -= 1e-4 97 | else: 98 | self.EPSILON = self.EPS_END 99 | 100 | #Qpred.requires_grad_() 101 | loss = self.Q_eval.loss(Qtarget, Qpred).to(self.Q_eval.device) 102 | loss.backward() 103 | self.Q_eval.optimizer.step() 104 | self.learn_step_counter += 1 105 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_keras_ddqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import os 2 | # for keras the CUDA commands must come before importing the keras libraries 3 | os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 4 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 5 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 6 | import gym 7 | from gym import wrappers 8 | import numpy as np 9 | from ddqn_keras import DDQNAgent 10 | from utils import plotLearning 11 | 12 | if __name__ == '__main__': 13 | env = gym.make('LunarLander-v2') 14 | ddqn_agent = DDQNAgent(alpha=0.0005, gamma=0.99, n_actions=4, epsilon=1.0, 15 | batch_size=64, input_dims=8) 16 | n_games = 500 17 | #ddqn_agent.load_model() 18 | ddqn_scores = [] 19 | eps_history = [] 20 | #env 
= wrappers.Monitor(env, "tmp/lunar-lander-ddqn-2", 21 | # video_callable=lambda episode_id: True, force=True) 22 | 23 | for i in range(n_games): 24 | done = False 25 | score = 0 26 | observation = env.reset() 27 | while not done: 28 | action = ddqn_agent.choose_action(observation) 29 | observation_, reward, done, info = env.step(action) 30 | score += reward 31 | ddqn_agent.remember(observation, action, reward, observation_, int(done)) 32 | observation = observation_ 33 | ddqn_agent.learn() 34 | eps_history.append(ddqn_agent.epsilon) 35 | 36 | ddqn_scores.append(score) 37 | 38 | avg_score = np.mean(ddqn_scores[max(0, i-100):(i+1)]) 39 | print('episode: ', i,'score: %.2f' % score, 40 | ' average score %.2f' % avg_score) 41 | 42 | if i % 10 == 0 and i > 0: 43 | ddqn_agent.save_model() 44 | 45 | filename = 'lunarlander-ddqn.png' 46 | 47 | x = [i+1 for i in range(n_games)] 48 | plotLearning(x, ddqn_scores, eps_history, filename) 49 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_keras_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | from simple_dqn_keras import Agent 2 | import numpy as np 3 | import gym 4 | from utils import plotLearning 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | env = gym.make('LunarLander-v2') 9 | lr = 0.0005 10 | n_games = 500 11 | agent = Agent(gamma=0.99, epsilon=0.0, alpha=lr, input_dims=8, 12 | n_actions=4, mem_size=1000000, batch_size=64, epsilon_end=0.0) 13 | 14 | agent.load_model() 15 | scores = [] 16 | eps_history = [] 17 | 18 | #env = wrappers.Monitor(env, "tmp/lunar-lander-6", 19 | # video_callable=lambda episode_id: True, force=True) 20 | 21 | for i in range(n_games): 22 | done = False 23 | score = 0 24 | observation = env.reset() 25 | while not done: 26 | action = agent.choose_action(observation) 27 | observation_, reward, done, info = env.step(action) 28 | score += reward 29 | agent.remember(observation, action, reward, observation_, int(done)) 30 | observation = observation_ 31 | agent.learn() 32 | 33 | eps_history.append(agent.epsilon) 34 | scores.append(score) 35 | 36 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 37 | print('episode: ', i,'score: %.2f' % score, 38 | ' average score %.2f' % avg_score) 39 | 40 | if i % 10 == 0 and i > 0: 41 | agent.save_model() 42 | 43 | filename = 'lunarlander.png' 44 | 45 | x = [i+1 for i in range(n_games)] 46 | plotLearning(x, scores, eps_history, filename) 47 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_keras_dueling_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | from dueling_dqn_keras import Agent 2 | import numpy as np 3 | import gym 4 | from utils import plotLearning 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | n_games = 400 9 | agent = Agent(gamma=0.99, epsilon=1, lr=1e-3, input_dims=[8], 10 | epsilon_dec=1e-3, mem_size=100000, batch_size=64, eps_end=0.01, 11 | fc1_dims=128, fc2_dims=128, replace=100, n_actions=4) 12 | 13 | scores, eps_history = [], [] 14 | 15 | for i in range(n_games): 16 | done = False 17 | score = 0 18 | observation = env.reset() 19 | while not done: 20 | action = agent.choose_action(observation) 21 | observation_, reward, done, info = env.step(action) 22 | score += reward 23 | agent.store_transition(observation, action, reward, observation_, done) 24 | observation = 
observation_ 25 | agent.learn() 26 | eps_history.append(agent.epsilon) 27 | scores.append(score) 28 | 29 | avg_score = np.mean(scores[-100:]) 30 | print('episode ', i, 'score %.1f' % score, 31 | 'average score %.1f' % avg_score, 32 | 'epsilon %.2f' % agent.epsilon) 33 | 34 | filename='keras_lunar_lander.png' 35 | x = [i+1 for i in range(n_games)] 36 | plotLearning(x, scores, eps_history, filename) 37 | 38 | 39 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_tf2_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | from simple_dqn_tf2 import Agent 2 | import numpy as np 3 | import gym 4 | from utils import plotLearning 5 | import tensorflow as tf 6 | 7 | if __name__ == '__main__': 8 | tf.compat.v1.disable_eager_execution() 9 | env = gym.make('LunarLander-v2') 10 | lr = 0.001 11 | n_games = 500 12 | agent = Agent(gamma=0.99, epsilon=1.0, lr=lr, 13 | input_dims=env.observation_space.shape, 14 | n_actions=env.action_space.n, mem_size=1000000, batch_size=64, 15 | epsilon_end=0.01) 16 | scores = [] 17 | eps_history = [] 18 | 19 | for i in range(n_games): 20 | done = False 21 | score = 0 22 | observation = env.reset() 23 | while not done: 24 | action = agent.choose_action(observation) 25 | observation_, reward, done, info = env.step(action) 26 | score += reward 27 | agent.store_transition(observation, action, reward, observation_, done) 28 | observation = observation_ 29 | agent.learn() 30 | eps_history.append(agent.epsilon) 31 | scores.append(score) 32 | 33 | avg_score = np.mean(scores[-100:]) 34 | print('episode: ', i, 'score %.2f' % score, 35 | 'average_score %.2f' % avg_score, 36 | 'epsilon %.2f' % agent.epsilon) 37 | 38 | filename = 'lunarlander_tf2.png' 39 | x = [i+1 for i in range(n_games)] 40 | plotLearning(x, scores, eps_history, filename) 41 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_tf2_dueling_ddqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from dueling_ddqn_tf2 import Agent 4 | from utils import plotLearning 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | agent = Agent(lr=0.0005, gamma=0.99, n_actions=4, epsilon=1.0, 9 | batch_size=64, input_dims=[8]) 10 | n_games = 500 11 | ddqn_scores = [] 12 | eps_history = [] 13 | 14 | for i in range(n_games): 15 | done = False 16 | score = 0 17 | observation = env.reset() 18 | while not done: 19 | action = agent.choose_action(observation) 20 | observation_, reward, done, info = env.step(action) 21 | score += reward 22 | agent.store_transition(observation, action, reward, observation_, done) 23 | observation = observation_ 24 | agent.learn() 25 | eps_history.append(ddqn_agent.epsilon) 26 | 27 | ddqn_scores.append(score) 28 | 29 | avg_score = np.mean(scores[-100:]) 30 | print('episode: ', i,'score: %.2f' % score, 31 | ' average score %.2f' % avg_score) 32 | 33 | filename = 'lunarlander-dueling_ddqn.png' 34 | 35 | x = [i+1 for i in range(n_games)] 36 | plotLearning(x, ddqn_scores, eps_history, filename) 37 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_tf_dqn_breakout.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | from dqn_tf import DeepQNetwork, Agent 4 | from utils 
import plotLearning 5 | import numpy as np 6 | from gym import wrappers 7 | import matplotlib.pyplot as plt 8 | 9 | def preprocess(observation): 10 | observation = observation / 255 11 | return np.mean(observation[30:,:], axis=2).reshape(180,160,1) 12 | 13 | def stack_frames(stacked_frames, frame, buffer_size): 14 | if stacked_frames is None: 15 | stacked_frames = np.zeros((buffer_size, *frame.shape)) 16 | for idx, _ in enumerate(stacked_frames): 17 | stacked_frames[idx,:] = frame 18 | else: 19 | stacked_frames[0:buffer_size-1,:] = stacked_frames[1:,:] 20 | stacked_frames[buffer_size-1, :] = frame 21 | 22 | stacked_frames = stacked_frames.reshape(1, *frame.shape[0:2], buffer_size) 23 | 24 | return stacked_frames 25 | 26 | 27 | if __name__ == '__main__': 28 | #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 29 | #os.environ["CUDA_VISIBLE_DEVICES"] = "0" 30 | #os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 31 | 32 | env = gym.make('Breakout-v0') 33 | load_checkpoint = False 34 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.000025, input_dims=(180,160,4), 35 | n_actions=3, mem_size=25000, batch_size=64) 36 | if load_checkpoint: 37 | agent.load_models() 38 | filename = 'breakout-alpha0p000025-gamma0p9-only-one-fc-2.png' 39 | scores = [] 40 | eps_history = [] 41 | numGames = 50000 42 | stack_size = 4 43 | score = 0 44 | # uncomment the line below to record every episode. 45 | #env = wrappers.Monitor(env, "tmp/breakout-0", 46 | # video_callable=lambda episode_id: True, force=True) 47 | """ 48 | print("Loading up the agent's memory with random gameplay") 49 | 50 | while agent.mem_cntr < 25000: 51 | done = False 52 | observation = env.reset() 53 | observation = preprocess(observation) 54 | stacked_frames = None 55 | observation = stack_frames(stacked_frames, observation, stack_size) 56 | while not done: 57 | action = np.random.choice([0, 1, 2]) 58 | action += 1 59 | observation_, reward, done, info = env.step(action) 60 | observation_ = stack_frames(stacked_frames, 61 | preprocess(observation_), stack_size) 62 | action -= 1 63 | agent.store_transition(observation, action, 64 | reward, observation_, int(done)) 65 | observation = observation_ 66 | print("Done with random gameplay. 
Game on.") 67 | """ 68 | n_steps = 0 69 | for i in range(numGames): 70 | done = False 71 | #if i % 100 == 0 and i > 0: 72 | # x = [j+1 for j in range(i)] 73 | 74 | # plotLearning(x, scores, eps_history, filename) 75 | observation = env.reset() 76 | observation = preprocess(observation) 77 | stacked_frames = None 78 | observation = stack_frames(stacked_frames, observation, stack_size) 79 | score = 0 80 | while not done: 81 | action = agent.choose_action(observation) 82 | action += 1 83 | observation_, reward, done, info = env.step(action) 84 | n_steps += 1 85 | observation_ = stack_frames(stacked_frames, 86 | preprocess(observation_), stack_size) 87 | score += reward 88 | action -= 1 89 | agent.store_transition(observation, action, 90 | reward, observation_, int(done)) 91 | observation = observation_ 92 | if n_steps % 4 == 0: 93 | agent.learn() 94 | if i % 12 == 0 and i > 0: 95 | avg_score = np.mean(scores[max(0, i-12):(i+1)]) 96 | print('episode: ', i,'score: ', score, 97 | ' average score %.3f' % avg_score, 98 | 'epsilon %.3f' % agent.epsilon) 99 | agent.save_models() 100 | else: 101 | print('episode: ', i,'score: ', score) 102 | eps_history.append(agent.epsilon) 103 | scores.append(score) 104 | x = [i+1 for i in range(numGames)] 105 | plotLearning(x, scores, eps_history, filename) 106 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_torch_dqn_lunar_lander_2020.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from simple_dqn_torch_2020 import Agent 3 | from utils import plotLearning 4 | import numpy as np 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4, eps_end=0.01, 9 | input_dims=[8], lr=0.001) 10 | scores, eps_history = [], [] 11 | n_games = 500 12 | 13 | for i in range(n_games): 14 | score = 0 15 | done = False 16 | observation = env.reset() 17 | while not done: 18 | action = agent.choose_action(observation) 19 | observation_, reward, done, info = env.step(action) 20 | score += reward 21 | agent.store_transition(observation, action, reward, 22 | observation_, done) 23 | agent.learn() 24 | observation = observation_ 25 | scores.append(score) 26 | eps_history.append(agent.epsilon) 27 | 28 | avg_score = np.mean(scores[-100:]) 29 | 30 | print('episode ', i, 'score %.2f' % score, 31 | 'average score %.2f' % avg_score, 32 | 'epsilon %.2f' % agent.epsilon) 33 | x = [i+1 for i in range(n_games)] 34 | filename = 'lunar_lander.png' 35 | plotLearning(x, scores, eps_history, filename) 36 | 37 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_torch_dueling_ddqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from dueling_ddqn_torch import Agent 4 | from utils import plotLearning 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | num_games = 250 9 | load_checkpoint = False 10 | 11 | agent = Agent(gamma=0.99, epsilon=1.0, lr=5e-4, 12 | input_dims=[8], n_actions=4, mem_size=100000, eps_min=0.01, 13 | batch_size=64, eps_dec=1e-3, replace=100) 14 | 15 | if load_checkpoint: 16 | agent.load_models() 17 | 18 | filename = 'LunarLander-Dueling-DDQN-512-Adam-lr0005-replace100.png' 19 | scores = [] 20 | eps_history = [] 21 | n_steps = 0 22 | 23 | for i in range(num_games): 24 | done = False 25 | 
observation = env.reset() 26 | score = 0 27 | 28 | while not done: 29 | action = agent.choose_action(observation) 30 | observation_, reward, done, info = env.step(action) 31 | score += reward 32 | agent.store_transition(observation, action, 33 | reward, observation_, int(done)) 34 | agent.learn() 35 | 36 | observation = observation_ 37 | 38 | scores.append(score) 39 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 40 | print('episode: ', i,'score %.1f ' % score, 41 | ' average score %.1f' % avg_score, 42 | 'epsilon %.2f' % agent.epsilon) 43 | if i > 0 and i % 10 == 0: 44 | agent.save_models() 45 | 46 | eps_history.append(agent.epsilon) 47 | 48 | x = [i+1 for i in range(num_games)] 49 | plotLearning(x, scores, eps_history, filename) 50 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/main_torch_dueling_dqn_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym, time 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from dueling_dqn_torch import Agent 5 | from utils import plotLearning 6 | 7 | if __name__ == '__main__': 8 | env = gym.make('LunarLander-v2') 9 | num_games = 1000 10 | load_checkpoint = False 11 | 12 | agent = Agent(gamma=0.99, epsilon=1.0, alpha=5e-4, 13 | input_dims=[8], n_actions=4, mem_size=100000, eps_min=0.01, 14 | batch_size=64, eps_dec=1e-3, replace=100) 15 | 16 | if load_checkpoint: 17 | agent.load_models() 18 | 19 | filename = 'LunarLander-Dueling-128-128-Adam-lr0005-replace100.png' 20 | scores = [] 21 | eps_history = [] 22 | n_steps = 0 23 | 24 | for i in range(num_games): 25 | done = False 26 | observation = env.reset() 27 | score = 0 28 | 29 | while not done: 30 | action = agent.choose_action(observation) 31 | observation_, reward, done, info = env.step(action) 32 | n_steps += 1 33 | score += reward 34 | agent.store_transition(observation, action, 35 | reward, observation_, int(done)) 36 | agent.learn() 37 | 38 | observation = observation_ 39 | 40 | 41 | scores.append(score) 42 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 43 | print('episode: ', i,'score %.1f ' % score, 44 | ' average score %.1f' % avg_score, 45 | 'epsilon %.2f' % agent.epsilon) 46 | #if i > 0 and i % 10 == 0: 47 | # agent.save_models() 48 | 49 | eps_history.append(agent.epsilon) 50 | 51 | x = [i+1 for i in range(num_games)] 52 | plotLearning(x, scores, eps_history, filename) 53 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/simple_dqn_keras.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Dense, Activation 2 | from keras.models import Sequential, load_model 3 | from keras.optimizers import Adam 4 | import numpy as np 5 | 6 | class ReplayBuffer(object): 7 | def __init__(self, max_size, input_shape, n_actions, discrete=False): 8 | self.mem_size = max_size 9 | self.mem_cntr = 0 10 | self.discrete = discrete 11 | self.state_memory = np.zeros((self.mem_size, input_shape)) 12 | self.new_state_memory = np.zeros((self.mem_size, input_shape)) 13 | dtype = np.int8 if self.discrete else np.float32 14 | self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype) 15 | self.reward_memory = np.zeros(self.mem_size) 16 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32) 17 | 18 | def store_transition(self, state, action, reward, state_, done): 19 | index = self.mem_cntr % self.mem_size 20 | 
self.state_memory[index] = state 21 | self.new_state_memory[index] = state_ 22 | # store one hot encoding of actions, if appropriate 23 | if self.discrete: 24 | actions = np.zeros(self.action_memory.shape[1]) 25 | actions[action] = 1.0 26 | self.action_memory[index] = actions 27 | else: 28 | self.action_memory[index] = action 29 | self.reward_memory[index] = reward 30 | self.terminal_memory[index] = 1 - done 31 | self.mem_cntr += 1 32 | 33 | def sample_buffer(self, batch_size): 34 | max_mem = min(self.mem_cntr, self.mem_size) 35 | batch = np.random.choice(max_mem, batch_size) 36 | 37 | states = self.state_memory[batch] 38 | actions = self.action_memory[batch] 39 | rewards = self.reward_memory[batch] 40 | states_ = self.new_state_memory[batch] 41 | terminal = self.terminal_memory[batch] 42 | 43 | return states, actions, rewards, states_, terminal 44 | 45 | def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims): 46 | model = Sequential([ 47 | Dense(fc1_dims, input_shape=(input_dims,)), 48 | Activation('relu'), 49 | Dense(fc2_dims), 50 | Activation('relu'), 51 | Dense(n_actions)]) 52 | 53 | model.compile(optimizer=Adam(lr=lr), loss='mse') 54 | 55 | return model 56 | 57 | class Agent(object): 58 | def __init__(self, alpha, gamma, n_actions, epsilon, batch_size, 59 | input_dims, epsilon_dec=0.996, epsilon_end=0.01, 60 | mem_size=1000000, fname='dqn_model.h5'): 61 | self.action_space = [i for i in range(n_actions)] 62 | self.gamma = gamma 63 | self.epsilon = epsilon 64 | self.epsilon_dec = epsilon_dec 65 | self.epsilon_min = epsilon_end 66 | self.batch_size = batch_size 67 | self.model_file = fname 68 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions, 69 | discrete=True) 70 | self.q_eval = build_dqn(alpha, n_actions, input_dims, 256, 256) 71 | 72 | def remember(self, state, action, reward, new_state, done): 73 | self.memory.store_transition(state, action, reward, new_state, done) 74 | 75 | def choose_action(self, state): 76 | state = state[np.newaxis, :] 77 | rand = np.random.random() 78 | if rand < self.epsilon: 79 | action = np.random.choice(self.action_space) 80 | else: 81 | actions = self.q_eval.predict(state) 82 | action = np.argmax(actions) 83 | 84 | return action 85 | 86 | def learn(self): 87 | if self.memory.mem_cntr > self.batch_size: 88 | state, action, reward, new_state, done = \ 89 | self.memory.sample_buffer(self.batch_size) 90 | 91 | action_values = np.array(self.action_space, dtype=np.int8) 92 | action_indices = np.dot(action, action_values) 93 | 94 | q_eval = self.q_eval.predict(state) 95 | 96 | q_next = self.q_eval.predict(new_state) 97 | 98 | q_target = q_eval.copy() 99 | 100 | batch_index = np.arange(self.batch_size, dtype=np.int32) 101 | 102 | q_target[batch_index, action_indices] = reward + \ 103 | self.gamma*np.max(q_next, axis=1)*done 104 | 105 | _ = self.q_eval.fit(state, q_target, verbose=0) 106 | 107 | self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon > \ 108 | self.epsilon_min else self.epsilon_min 109 | 110 | def save_model(self): 111 | self.q_eval.save(self.model_file) 112 | 113 | def load_model(self): 114 | self.q_eval = load_model(self.model_file) 115 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/simple_dqn_tf2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | from tensorflow.keras.optimizers import Adam 5 | from tensorflow.keras.models import 
load_model 6 | 7 | class ReplayBuffer(): 8 | def __init__(self, max_size, input_dims): 9 | self.mem_size = max_size 10 | self.mem_cntr = 0 11 | 12 | self.state_memory = np.zeros((self.mem_size, *input_dims), 13 | dtype=np.float32) 14 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 15 | dtype=np.float32) 16 | self.action_memory = np.zeros(self.mem_size, dtype=np.int32) 17 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 18 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32) 19 | 20 | def store_transition(self, state, action, reward, state_, done): 21 | index = self.mem_cntr % self.mem_size 22 | self.state_memory[index] = state 23 | self.new_state_memory[index] = state_ 24 | self.reward_memory[index] = reward 25 | self.action_memory[index] = action 26 | self.terminal_memory[index] = 1 - int(done) 27 | self.mem_cntr += 1 28 | 29 | def sample_buffer(self, batch_size): 30 | max_mem = min(self.mem_cntr, self.mem_size) 31 | batch = np.random.choice(max_mem, batch_size, replace=False) 32 | 33 | states = self.state_memory[batch] 34 | states_ = self.new_state_memory[batch] 35 | rewards = self.reward_memory[batch] 36 | actions = self.action_memory[batch] 37 | terminal = self.terminal_memory[batch] 38 | 39 | return states, actions, rewards, states_, terminal 40 | 41 | def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims): 42 | model = keras.Sequential([ 43 | keras.layers.Dense(fc1_dims, activation='relu'), 44 | keras.layers.Dense(fc2_dims, activation='relu'), 45 | keras.layers.Dense(n_actions, activation=None)]) 46 | model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error') 47 | 48 | return model 49 | 50 | class Agent(): 51 | def __init__(self, lr, gamma, n_actions, epsilon, batch_size, 52 | input_dims, epsilon_dec=1e-3, epsilon_end=0.01, 53 | mem_size=1000000, fname='dqn_model.h5'): 54 | self.action_space = [i for i in range(n_actions)] 55 | self.gamma = gamma 56 | self.epsilon = epsilon 57 | self.eps_dec = epsilon_dec 58 | self.eps_min = epsilon_end 59 | self.batch_size = batch_size 60 | self.model_file = fname 61 | self.memory = ReplayBuffer(mem_size, input_dims) 62 | self.q_eval = build_dqn(lr, n_actions, input_dims, 256, 256) 63 | 64 | def store_transition(self, state, action, reward, new_state, done): 65 | self.memory.store_transition(state, action, reward, new_state, done) 66 | 67 | def choose_action(self, observation): 68 | if np.random.random() < self.epsilon: 69 | action = np.random.choice(self.action_space) 70 | else: 71 | state = np.array([observation]) 72 | actions = self.q_eval.predict(state) 73 | 74 | action = np.argmax(actions) 75 | 76 | return action 77 | 78 | def learn(self): 79 | if self.memory.mem_cntr < self.batch_size: 80 | return 81 | 82 | states, actions, rewards, states_, dones = \ 83 | self.memory.sample_buffer(self.batch_size) 84 | 85 | q_eval = self.q_eval.predict(states) 86 | q_next = self.q_eval.predict(states_) 87 | 88 | 89 | q_target = np.copy(q_eval) 90 | batch_index = np.arange(self.batch_size, dtype=np.int32) 91 | 92 | q_target[batch_index, actions] = rewards + \ 93 | self.gamma * np.max(q_next, axis=1)*dones 94 | 95 | 96 | self.q_eval.train_on_batch(states, q_target) 97 | 98 | self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \ 99 | self.eps_min else self.eps_min 100 | 101 | def save_model(self): 102 | self.q_eval.save(self.model_file) 103 | 104 | 105 | def load_model(self): 106 | self.q_eval = load_model(self.model_file) 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 
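# A minimal driver sketch for the Agent defined above. Illustrative only: the
# repository's actual entry point for this agent is main_tf2_dqn_lunar_lander.py,
# and the environment name, hyperparameters, and old gym 4-tuple step API used
# here are assumptions, not a definitive training script.
if __name__ == '__main__':
    import gym

    env = gym.make('LunarLander-v2')
    agent = Agent(lr=1e-3, gamma=0.99, n_actions=4, epsilon=1.0,
                  batch_size=64, input_dims=[8])
    scores = []
    for i in range(500):
        observation = env.reset()
        done, score = False, 0
        while not done:
            # choose an action, step the env, store the transition, then learn
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            observation = observation_
            agent.learn()
        scores.append(score)
        print('episode', i, 'score %.1f' % score,
              'avg(100) %.1f' % np.mean(scores[-100:]),
              'epsilon %.3f' % agent.epsilon)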
-------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/simple_dqn_torch_2020.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims, 10 | n_actions): 11 | super(DeepQNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.n_actions = n_actions 16 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 17 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 18 | self.fc3 = nn.Linear(self.fc2_dims, self.n_actions) 19 | 20 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 21 | self.loss = nn.MSELoss() 22 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 23 | self.to(self.device) 24 | 25 | def forward(self, state): 26 | x = F.relu(self.fc1(state)) 27 | x = F.relu(self.fc2(x)) 28 | actions = self.fc3(x) 29 | 30 | return actions 31 | 32 | 33 | class Agent: 34 | def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, 35 | max_mem_size=100000, eps_end=0.05, eps_dec=5e-4): 36 | self.gamma = gamma 37 | self.epsilon = epsilon 38 | self.eps_min = eps_end 39 | self.eps_dec = eps_dec 40 | self.lr = lr 41 | self.action_space = [i for i in range(n_actions)] 42 | self.mem_size = max_mem_size 43 | self.batch_size = batch_size 44 | self.mem_cntr = 0 45 | self.iter_cntr = 0 46 | self.replace_target = 100 47 | 48 | self.Q_eval = DeepQNetwork(lr, n_actions=n_actions, 49 | input_dims=input_dims, 50 | fc1_dims=256, fc2_dims=256) 51 | self.state_memory = np.zeros((self.mem_size, *input_dims), 52 | dtype=np.float32) 53 | self.new_state_memory = np.zeros((self.mem_size, *input_dims), 54 | dtype=np.float32) 55 | self.action_memory = np.zeros(self.mem_size, dtype=np.int32) 56 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 57 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 58 | 59 | def store_transition(self, state, action, reward, state_, terminal): 60 | index = self.mem_cntr % self.mem_size 61 | self.state_memory[index] = state 62 | self.new_state_memory[index] = state_ 63 | self.reward_memory[index] = reward 64 | self.action_memory[index] = action 65 | self.terminal_memory[index] = terminal 66 | 67 | self.mem_cntr += 1 68 | 69 | def choose_action(self, observation): 70 | if np.random.random() > self.epsilon: 71 | state = T.tensor([observation]).to(self.Q_eval.device) 72 | actions = self.Q_eval.forward(state) 73 | action = T.argmax(actions).item() 74 | else: 75 | action = np.random.choice(self.action_space) 76 | 77 | return action 78 | 79 | def learn(self): 80 | if self.mem_cntr < self.batch_size: 81 | return 82 | 83 | self.Q_eval.optimizer.zero_grad() 84 | 85 | max_mem = min(self.mem_cntr, self.mem_size) 86 | 87 | batch = np.random.choice(max_mem, self.batch_size, replace=False) 88 | batch_index = np.arange(self.batch_size, dtype=np.int32) 89 | 90 | state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device) 91 | new_state_batch = T.tensor( 92 | self.new_state_memory[batch]).to(self.Q_eval.device) 93 | action_batch = self.action_memory[batch] 94 | reward_batch = T.tensor( 95 | self.reward_memory[batch]).to(self.Q_eval.device) 96 | terminal_batch = T.tensor( 97 | self.terminal_memory[batch]).to(self.Q_eval.device) 98 | 99 | 
q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch] 100 | q_next = self.Q_eval.forward(new_state_batch) 101 | q_next[terminal_batch] = 0.0 102 | 103 | q_target = reward_batch + self.gamma*T.max(q_next, dim=1)[0] 104 | 105 | loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device) 106 | loss.backward() 107 | self.Q_eval.optimizer.step() 108 | 109 | self.iter_cntr += 1 110 | self.epsilon = self.epsilon - self.eps_dec \ 111 | if self.epsilon > self.eps_min else self.eps_min 112 | -------------------------------------------------------------------------------- /ReinforcementLearning/DeepQLearning/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import gym 4 | 5 | def plotLearning(x, scores, epsilons, filename, lines=None): 6 | fig=plt.figure() 7 | ax=fig.add_subplot(111, label="1") 8 | ax2=fig.add_subplot(111, label="2", frame_on=False) 9 | 10 | ax.plot(x, epsilons, color="C0") 11 | ax.set_xlabel("Game", color="C0") 12 | ax.set_ylabel("Epsilon", color="C0") 13 | ax.tick_params(axis='x', colors="C0") 14 | ax.tick_params(axis='y', colors="C0") 15 | 16 | N = len(scores) 17 | running_avg = np.empty(N) 18 | for t in range(N): 19 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 20 | 21 | ax2.scatter(x, running_avg, color="C1") 22 | #ax2.xaxis.tick_top() 23 | ax2.axes.get_xaxis().set_visible(False) 24 | ax2.yaxis.tick_right() 25 | #ax2.set_xlabel('x label 2', color="C1") 26 | ax2.set_ylabel('Score', color="C1") 27 | #ax2.xaxis.set_label_position('top') 28 | ax2.yaxis.set_label_position('right') 29 | #ax2.tick_params(axis='x', colors="C1") 30 | ax2.tick_params(axis='y', colors="C1") 31 | 32 | if lines is not None: 33 | for line in lines: 34 | plt.axvline(x=line) 35 | 36 | plt.savefig(filename) 37 | 38 | class SkipEnv(gym.Wrapper): 39 | def __init__(self, env=None, skip=4): 40 | super(SkipEnv, self).__init__(env) 41 | self._skip = skip 42 | 43 | def step(self, action): 44 | t_reward = 0.0 45 | done = False 46 | for _ in range(self._skip): 47 | obs, reward, done, info = self.env.step(action) 48 | t_reward += reward 49 | if done: 50 | break 51 | return obs, t_reward, done, info 52 | 53 | def reset(self): 54 | self._obs_buffer = [] 55 | obs = self.env.reset() 56 | self._obs_buffer.append(obs) 57 | return obs 58 | 59 | class PreProcessFrame(gym.ObservationWrapper): 60 | def __init__(self, env=None): 61 | super(PreProcessFrame, self).__init__(env) 62 | self.observation_space = gym.spaces.Box(low=0, high=255, 63 | shape=(80,80,1), dtype=np.uint8) 64 | def observation(self, obs): 65 | return PreProcessFrame.process(obs) 66 | 67 | @staticmethod 68 | def process(frame): 69 | 70 | new_frame = np.reshape(frame, frame.shape).astype(np.float32) 71 | 72 | new_frame = 0.299*new_frame[:,:,0] + 0.587*new_frame[:,:,1] + \ 73 | 0.114*new_frame[:,:,2] 74 | 75 | new_frame = new_frame[35:195:2, ::2].reshape(80,80,1) 76 | 77 | return new_frame.astype(np.uint8) 78 | 79 | class MoveImgChannel(gym.ObservationWrapper): 80 | def __init__(self, env): 81 | super(MoveImgChannel, self).__init__(env) 82 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 83 | shape=(self.observation_space.shape[-1], 84 | self.observation_space.shape[0], 85 | self.observation_space.shape[1]), 86 | dtype=np.float32) 87 | 88 | def observation(self, observation): 89 | return np.moveaxis(observation, 2, 0) 90 | 91 | class ScaleFrame(gym.ObservationWrapper): 92 | def observation(self, obs): 93 | return 
np.array(obs).astype(np.float32) / 255.0 94 | 95 | class BufferWrapper(gym.ObservationWrapper): 96 | def __init__(self, env, n_steps): 97 | super(BufferWrapper, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(n_steps, axis=0), 100 | env.observation_space.high.repeat(n_steps, axis=0), 101 | dtype=np.float32) 102 | 103 | def reset(self): 104 | self.buffer = np.zeros_like(self.observation_space.low, dtype=np.float32) 105 | return self.observation(self.env.reset()) 106 | 107 | def observation(self, observation): 108 | self.buffer[:-1] = self.buffer[1:] 109 | self.buffer[-1] = observation 110 | return self.buffer 111 | 112 | def make_env(env_name): 113 | env = gym.make(env_name) 114 | env = SkipEnv(env) 115 | env = PreProcessFrame(env) 116 | env = MoveImgChannel(env) 117 | env = BufferWrapper(env, 4) 118 | return ScaleFrame(env) 119 | -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/acrobot.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from gym import wrappers 5 | import pickle 6 | 7 | theta_space = np.linspace(-1, 1, 10) 8 | theta_dot_space = np.linspace(-10, 10, 10) 9 | 10 | def get_state(observation): 11 | cos_theta1, sin_theta1, cos_theta2, sin_theta2, theta1_dot, theta2_dot = \ 12 | observation 13 | c_th1 = int(np.digitize(cos_theta1, theta_space)) 14 | s_th1 = int(np.digitize(sin_theta1, theta_space)) 15 | c_th2 = int(np.digitize(cos_theta2, theta_space)) 16 | s_th2 = int(np.digitize(sin_theta2, theta_space)) 17 | th1_dot = int(np.digitize(theta1_dot, theta_dot_space)) 18 | th2_dot = int(np.digitize(theta2_dot, theta_dot_space)) 19 | 20 | return (c_th1, s_th2, c_th2, s_th2, th1_dot, th2_dot) 21 | 22 | def maxAction(Q, state, actions=[0, 1, 2]): 23 | values = np.array([Q[state,a] for a in actions]) 24 | action = np.argmax(values) 25 | 26 | return action 27 | 28 | if __name__ == '__main__': 29 | env = gym.make('Acrobot-v1') 30 | n_games = 100 31 | alpha = 0.1 32 | gamma = 0.99 33 | eps = 0 34 | 35 | action_space = [0, 1, 2] 36 | 37 | states = [] 38 | for c1 in range(11): 39 | for s1 in range(11): 40 | for c2 in range(11): 41 | for s2 in range(11): 42 | for dot1 in range(11): 43 | for dot2 in range(11): 44 | states.append((c1, s1, c2, s2, dot1, dot2)) 45 | """ 46 | Q = {} 47 | for state in states: 48 | for action in action_space: 49 | Q[state, action] = 0 50 | """ 51 | pickle_in = open('acrobot.pkl', 'rb') 52 | Q = pickle.load(pickle_in) 53 | env = wrappers.Monitor(env, "tmp/acrobot", video_callable=lambda episode_id: True, force=True) 54 | eps_rewards = 0 55 | total_rewards = np.zeros(n_games) 56 | for i in range(n_games): 57 | if i % 1 == 0: 58 | print('episode ', i, 'score ', eps_rewards, 'eps', eps) 59 | observation = env.reset() 60 | state = get_state(observation) 61 | done = False 62 | action = env.action_space.sample() if np.random.random() < eps else \ 63 | maxAction(Q, state) 64 | eps_rewards = 0 65 | while not done: 66 | """ 67 | print(observation) 68 | action = env.action_space.sample() 69 | """ 70 | observation_, reward, done, info = env.step(action) 71 | state_ = get_state(observation_) 72 | action_ = maxAction(Q, state_) 73 | eps_rewards += reward 74 | Q[state, action] = Q[state,action] + \ 75 | alpha*(reward + gamma*Q[state_,action_] - Q[state,action]) 76 | state = state_ 77 | action = action_ 78 | total_rewards[i] = eps_rewards 79 | eps = 
eps - 2 / n_games if eps > 0.01 else 0.01 80 | 81 | mean_rewards = np.zeros(n_games) 82 | for t in range(n_games): 83 | mean_rewards[t] = np.mean(total_rewards[max(0, t-50):(t+1)]) 84 | plt.plot(mean_rewards) 85 | plt.show() 86 | 87 | f = open("acrobot.pkl","wb") 88 | pickle.dump(Q,f) 89 | f.close() 90 | -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/blackJack-no-es.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | if __name__ == '__main__': 6 | env = gym.make('Blackjack-v0') 7 | EPS = 0.05 8 | GAMMA = 1.0 9 | 10 | Q = {} 11 | agentSumSpace = [i for i in range(4, 22)] 12 | dealerShowCardSpace = [i+1 for i in range(10)] 13 | agentAceSpace = [False, True] 14 | actionSpace = [0, 1] # stick or hit 15 | 16 | stateSpace = [] 17 | returns = {} 18 | pairsVisited = {} 19 | for total in agentSumSpace: 20 | for card in dealerShowCardSpace: 21 | for ace in agentAceSpace: 22 | for action in actionSpace: 23 | Q[((total, card, ace), action)] = 0 24 | returns[((total, card, ace), action)] = 0 25 | pairsVisited[((total, card, ace), action)] = 0 26 | stateSpace.append((total, card, ace)) 27 | 28 | policy = {} 29 | for state in stateSpace: 30 | policy[state] = np.random.choice(actionSpace) 31 | 32 | numEpisodes = 1000000 33 | for i in range(numEpisodes): 34 | statesActionsReturns = [] 35 | memory = [] 36 | if i % 100000 == 0: 37 | print('starting episode', i) 38 | observation = env.reset() 39 | done = False 40 | while not done: 41 | action = policy[observation] 42 | observation_, reward, done, info = env.step(action) 43 | memory.append((observation[0], observation[1], observation[2], action, reward)) 44 | observation = observation_ 45 | memory.append((observation[0], observation[1], observation[2], action, reward)) 46 | 47 | G = 0 48 | last = True 49 | for playerSum, dealerCard, usableAce, action, reward in reversed(memory): 50 | if last: 51 | last = False 52 | else: 53 | statesActionsReturns.append((playerSum, dealerCard, usableAce, action, G)) 54 | G = GAMMA*G + reward 55 | 56 | statesActionsReturns.reverse() 57 | statesActionsVisited = [] 58 | 59 | for playerSum, dealerCard, usableAce, action, G in statesActionsReturns: 60 | sa = ((playerSum, dealerCard, usableAce), action) 61 | if sa not in statesActionsVisited: 62 | pairsVisited[sa] += 1 63 | # incremental implementation 64 | # new estimate = 1 / N * [sample - old estimate] 65 | returns[(sa)] += (1 / pairsVisited[(sa)])*(G-returns[(sa)]) 66 | Q[sa] = returns[sa] 67 | rand = np.random.random() 68 | if rand < 1 - EPS: 69 | state = (playerSum, dealerCard, usableAce) 70 | values = np.array([Q[(state, a)] for a in actionSpace ]) 71 | best = np.random.choice(np.where(values==values.max())[0]) 72 | policy[state] = actionSpace[best] 73 | else: 74 | policy[state] = np.random.choice(actionSpace) 75 | statesActionsVisited.append(sa) 76 | if EPS - 1e-7 > 0: 77 | EPS -= 1e-7 78 | else: 79 | EPS = 0 80 | 81 | numEpisodes = 1000 82 | rewards = np.zeros(numEpisodes) 83 | totalReward = 0 84 | wins = 0 85 | losses = 0 86 | draws = 0 87 | print('getting ready to test policy') 88 | for i in range(numEpisodes): 89 | observation = env.reset() 90 | done = False 91 | while not done: 92 | action = policy[observation] 93 | observation_, reward, done, info = env.step(action) 94 | observation = observation_ 95 | totalReward += reward 96 | rewards[i] = totalReward 97 | 98 | if reward >= 1: 99 | 
wins += 1 100 | elif reward == 0: 101 | draws += 1 102 | elif reward == -1: 103 | losses += 1 104 | 105 | wins /= numEpisodes 106 | losses /= numEpisodes 107 | draws /= numEpisodes 108 | print('win rate', wins, 'loss rate', losses, 'draw rate', draws) 109 | plt.plot(rewards) 110 | plt.show() -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/blackJack-off-policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | if __name__ == '__main__': 6 | env = gym.make('Blackjack-v0') 7 | EPS = 0.05 8 | GAMMA = 1.0 9 | 10 | agentSumSpace = [i for i in range(4, 22)] 11 | dealerShowCardSpace = [i+1 for i in range(10)] 12 | agentAceSpace = [False, True] 13 | actionSpace = [0, 1] # stick or hit 14 | stateSpace = [] 15 | 16 | Q = {} 17 | C = {} 18 | for total in agentSumSpace: 19 | for card in dealerShowCardSpace: 20 | for ace in agentAceSpace: 21 | for action in actionSpace: 22 | Q[((total, card, ace), action)] = 0 23 | C[((total, card, ace), action)] = 0 24 | stateSpace.append((total, card, ace)) 25 | 26 | targetPolicy = {} 27 | for state in stateSpace: 28 | values = np.array([Q[(state, a)] for a in actionSpace ]) 29 | best = np.random.choice(np.where(values==values.max())[0]) 30 | targetPolicy[state] = actionSpace[best] 31 | 32 | numEpisodes = 1000000 33 | for i in range(numEpisodes): 34 | memory = [] 35 | if i % 100000 == 0: 36 | print('starting episode', i) 37 | behaviorPolicy = {} 38 | for state in stateSpace: 39 | rand = np.random.random() 40 | if rand < 1 - EPS: 41 | behaviorPolicy[state] = [targetPolicy[state]] 42 | else: 43 | behaviorPolicy[state] = actionSpace 44 | observation = env.reset() 45 | done = False 46 | while not done: 47 | action = np.random.choice(behaviorPolicy[observation]) 48 | observation_, reward, done, info = env.step(action) 49 | memory.append((observation[0], observation[1], observation[2], action, reward)) 50 | observation = observation_ 51 | memory.append((observation[0], observation[1], observation[2], action, reward)) 52 | 53 | G = 0 54 | W = 1 55 | last = True 56 | for playerSum, dealerCard, usableAce, action, reward in reversed(memory): 57 | sa = ((playerSum, dealerCard, usableAce), action) 58 | if last: 59 | last = False 60 | else: 61 | C[sa] += W 62 | Q[sa] += (W / C[sa])*(G-Q[sa]) 63 | values = np.array([Q[(state, a)] for a in actionSpace ]) 64 | best = np.random.choice(np.where(values==values.max())[0]) 65 | targetPolicy[state] = actionSpace[best] 66 | if action != targetPolicy[state]: 67 | break 68 | if len(behaviorPolicy[state]) == 1: 69 | prob = 1 - EPS 70 | else: 71 | prob = EPS / len(behaviorPolicy[state]) 72 | W *= 1/prob 73 | G = GAMMA*G + reward 74 | if EPS - 1e-7 > 0: 75 | EPS -= 1e-7 76 | else: 77 | EPS = 0 78 | numEpisodes = 1000 79 | rewards = np.zeros(numEpisodes) 80 | totalReward = 0 81 | wins = 0 82 | losses = 0 83 | draws = 0 84 | print('getting ready to test target policy') 85 | for i in range(numEpisodes): 86 | observation = env.reset() 87 | done = False 88 | while not done: 89 | action = targetPolicy[observation] 90 | observation_, reward, done, info = env.step(action) 91 | observation = observation_ 92 | totalReward += reward 93 | rewards[i] = totalReward 94 | 95 | if reward >= 1: 96 | wins += 1 97 | elif reward == 0: 98 | draws += 1 99 | elif reward == -1: 100 | losses += 1 101 | 102 | wins /= numEpisodes 103 | losses /= numEpisodes 104 | draws /= numEpisodes 105 | 
print('win rate', wins, 'loss rate', losses, 'draw rate', draws) 106 | plt.plot(rewards) 107 | plt.show() -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/cartpole_qlearning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('CartPole-v0') 6 | 7 | MAXSTATES = 10**4 8 | GAMMA = 0.9 9 | ALPHA = 0.01 10 | 11 | def max_dict(d): 12 | max_v = float('-inf') 13 | for key, val in d.items(): 14 | if val > max_v: 15 | max_v = val 16 | max_key = key 17 | return max_key, max_v 18 | 19 | def create_bins(): 20 | # obs[0] -> cart position --- -4.8 - 4.8 21 | # obs[1] -> cart velocity --- -inf - inf 22 | # obs[2] -> pole angle --- -41.8 - 41.8 23 | # obs[3] -> pole velocity --- -inf - inf 24 | 25 | bins = np.zeros((4,10)) 26 | bins[0] = np.linspace(-4.8, 4.8, 10) 27 | bins[1] = np.linspace(-5, 5, 10) 28 | bins[2] = np.linspace(-.418, .418, 10) 29 | bins[3] = np.linspace(-5, 5, 10) 30 | 31 | return bins 32 | 33 | def assign_bins(observation, bins): 34 | state = np.zeros(4) 35 | for i in range(4): 36 | state[i] = np.digitize(observation[i], bins[i]) 37 | return state 38 | 39 | def get_state_as_string(state): 40 | string_state = ''.join(str(int(e)) for e in state) 41 | return string_state 42 | 43 | def get_all_states_as_string(): 44 | states = [] 45 | for i in range(MAXSTATES): 46 | states.append(str(i).zfill(4)) 47 | return states 48 | 49 | def initialize_Q(): 50 | Q = {} 51 | 52 | all_states = get_all_states_as_string() 53 | for state in all_states: 54 | Q[state] = {} 55 | for action in range(env.action_space.n): 56 | Q[state][action] = 0 57 | return Q 58 | 59 | def play_one_game(bins, Q, eps=0.5): 60 | observation = env.reset() 61 | done = False 62 | cnt = 0 # number of moves in an episode 63 | state = get_state_as_string(assign_bins(observation, bins)) 64 | total_reward = 0 65 | 66 | while not done: 67 | cnt += 1 68 | # np.random.randn() seems to yield a random action 50% of the time ? 
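        # Epsilon-greedy selection: with probability eps sample a random
        # exploratory action from the action space, otherwise act greedily
        # with respect to the current Q-value estimates for this state.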
69 | if np.random.uniform() < eps: 70 | act = env.action_space.sample() # epsilon greedy 71 | else: 72 | act = max_dict(Q[state])[0] 73 | 74 | observation, reward, done, _ = env.step(act) 75 | 76 | total_reward += reward 77 | 78 | if done and cnt < 200: 79 | reward = -300 80 | 81 | state_new = get_state_as_string(assign_bins(observation, bins)) 82 | 83 | a1, max_q_s1a1 = max_dict(Q[state_new]) 84 | Q[state][act] += ALPHA*(reward + GAMMA*max_q_s1a1 - Q[state][act]) 85 | state, act = state_new, a1 86 | 87 | return total_reward, cnt 88 | 89 | def play_many_games(bins, N=10000): 90 | Q = initialize_Q() 91 | 92 | length = [] 93 | reward = [] 94 | for n in range(N): 95 | #eps=0.5/(1+n*10e-3) 96 | eps = 1.0 / np.sqrt(n+1) 97 | 98 | episode_reward, episode_length = play_one_game(bins, Q, eps) 99 | 100 | if n % 100 == 0: 101 | print(n, '%.4f' % eps, episode_reward) 102 | length.append(episode_length) 103 | reward.append(episode_reward) 104 | 105 | return length, reward 106 | 107 | def plot_running_avg(totalrewards): 108 | N = len(totalrewards) 109 | running_avg = np.empty(N) 110 | for t in range(N): 111 | running_avg[t] = np.mean(totalrewards[max(0, t-100):(t+1)]) 112 | plt.plot(running_avg) 113 | plt.title("Running Average") 114 | plt.show() 115 | 116 | if __name__ == '__main__': 117 | bins = create_bins() 118 | episode_lengths, episode_rewards = play_many_games(bins) 119 | 120 | plot_running_avg(episode_rewards) -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/doubleQLearning.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import gym 5 | 6 | def maxAction(Q1, Q2, state): 7 | values = np.array([Q1[state,a] + Q2[state,a] for a in range(2)]) 8 | action = np.argmax(values) 9 | return action 10 | 11 | #discretize the spaces 12 | poleThetaSpace = np.linspace(-0.20943951, 0.20943951, 10) 13 | poleThetaVelSpace = np.linspace(-4, 4, 10) 14 | cartPosSpace = np.linspace(-2.4, 2.4, 10) 15 | cartVelSpace = np.linspace(-4, 4, 10) 16 | 17 | def getState(observation): 18 | cartX, cartXdot, cartTheta, cartThetadot = observation 19 | cartX = int(np.digitize(cartX, cartPosSpace)) 20 | cartXdot = int(np.digitize(cartXdot, cartVelSpace)) 21 | cartTheta = int(np.digitize(cartTheta, poleThetaSpace)) 22 | cartThetadot = int(np.digitize(cartThetadot, poleThetaVelSpace)) 23 | 24 | return (cartX, cartXdot, cartTheta, cartThetadot) 25 | 26 | def plotRunningAverage(totalrewards): 27 | N = len(totalrewards) 28 | running_avg = np.empty(N) 29 | for t in range(N): 30 | running_avg[t] = np.mean(totalrewards[max(0, t-100):(t+1)]) 31 | plt.plot(running_avg) 32 | plt.title("Running Average") 33 | plt.show() 34 | 35 | if __name__ == '__main__': 36 | env = gym.make('CartPole-v0') 37 | # model hyperparameters 38 | ALPHA = 0.1 39 | GAMMA = 0.9 40 | EPS = 1.0 41 | 42 | #construct state space 43 | states = [] 44 | for i in range(len(cartPosSpace)+1): 45 | for j in range(len(cartVelSpace)+1): 46 | for k in range(len(poleThetaSpace)+1): 47 | for l in range(len(poleThetaVelSpace)+1): 48 | states.append((i,j,k,l)) 49 | 50 | Q1, Q2 = {}, {} 51 | for s in states: 52 | for a in range(2): 53 | Q1[s, a] = 0 54 | Q2[s,a] = 0 55 | 56 | numGames = 100000 57 | totalRewards = np.zeros(numGames) 58 | for i in range(numGames): 59 | if i % 5000 == 0: 60 | print('starting game ', i) 61 | done = False 62 | epRewards = 0 63 | observation = env.reset() 64 | while not done: 65 | s = 
getState(observation) 66 | rand = np.random.random() 67 | a = maxAction(Q1,Q2,s) if rand < (1-EPS) else env.action_space.sample() 68 | observation_, reward, done, info = env.step(a) 69 | epRewards += reward 70 | s_ = getState(observation_) 71 | rand = np.random.random() 72 | if rand <= 0.5: 73 | a_ = maxAction(Q1,Q1,s_) 74 | Q1[s,a] = Q1[s,a] + ALPHA*(reward + GAMMA*Q2[s_,a_] - Q1[s,a]) 75 | elif rand > 0.5: 76 | a_ = maxAction(Q2,Q2,s_) 77 | Q2[s,a] = Q2[s,a] + ALPHA*(reward + GAMMA*Q1[s_,a_] - Q2[s,a]) 78 | observation = observation_ 79 | EPS -= 2/(numGames) if EPS > 0 else 0 80 | totalRewards[i] = epRewards 81 | 82 | #plt.plot(totalRewards, 'b--') 83 | #plt.show() 84 | plotRunningAverage(totalRewards) 85 | -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/mountaincar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/Fundamentals/mountaincar.png -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from gym import wrappers 5 | import pickle 6 | 7 | pos_space = np.linspace(-1.2, 0.6, 12) 8 | vel_space = np.linspace(-0.07, 0.07, 20) 9 | 10 | def get_state(observation): 11 | pos, vel = observation 12 | pos_bin = int(np.digitize(pos, pos_space)) 13 | vel_bin = int(np.digitize(vel, vel_space)) 14 | 15 | return (pos_bin, vel_bin) 16 | 17 | def max_action(Q, state, actions=[0, 1, 2]): 18 | values = np.array([Q[state,a] for a in actions]) 19 | action = np.argmax(values) 20 | 21 | return action 22 | 23 | if __name__ == '__main__': 24 | env = gym.make('MountainCar-v0') 25 | env._max_episode_steps = 1000 26 | n_games = 50000 27 | alpha = 0.1 28 | gamma = 0.99 29 | eps = 1.0 30 | 31 | action_space = [0, 1, 2] 32 | 33 | states = [] 34 | for pos in range(21): 35 | for vel in range(21): 36 | states.append((pos, vel)) 37 | 38 | Q = {} 39 | for state in states: 40 | for action in action_space: 41 | Q[state, action] = 0 42 | 43 | #pickle_in = open('mountaincar.pkl', 'rb') 44 | #Q = pickle.load(pickle_in) 45 | #env = wrappers.Monitor(env, "tmp/mountaincar", 46 | #video_callable=lambda episode_id: True, force=True) 47 | score = 0 48 | total_rewards = np.zeros(n_games) 49 | for i in range(n_games): 50 | done = False 51 | obs = env.reset() 52 | state = get_state(obs) 53 | if i % 100 == 0 and i > 0: 54 | print('episode ', i, 'score ', score, 'epsilon %.3f' % eps) 55 | score = 0 56 | while not done: 57 | action = np.random.choice([0,1,2]) if np.random.random() < eps \ 58 | else max_action(Q, state) 59 | obs_, reward, done, info = env.step(action) 60 | state_ = get_state(obs_) 61 | score += reward 62 | action_ = max_action(Q, state_) 63 | Q[state, action] = Q[state, action] + \ 64 | alpha*(reward + gamma*Q[state_, action_] - Q[state, action]) 65 | state = state_ 66 | total_rewards[i] = score 67 | eps = eps - 2/n_games if eps > 0.01 else 0.01 68 | 69 | mean_rewards = np.zeros(n_games) 70 | for t in range(n_games): 71 | mean_rewards[t] = np.mean(total_rewards[max(0, t-50):(t+1)]) 72 | plt.plot(mean_rewards) 73 | plt.savefig('mountaincar.png') 74 | 75 | #f = open("mountaincar.pkl","wb") 76 | #pickle.dump(Q,f) 77 | #f.close() 78 | 
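# The update inside the loop above is one-step Q-learning (off-policy TD
# control):
#
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
#
# max_action(Q, state_) is used only to form the bootstrap target; the action
# actually executed on the next step is re-chosen epsilon-greedily at the top
# of the while loop. sarsa.py in this directory bootstraps instead from the
# action the behavior policy actually takes, which is what makes it on-policy.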
-------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/n_step_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | poleThetaSpace = np.linspace(-0.209, 0.209, 10) 5 | poleThetaVelSpace = np.linspace(-4, 4, 10) 6 | cartPosSpace = np.linspace(-2.4, 2.4, 10) 7 | cartVelSpace = np.linspace(-4, 4, 10) 8 | 9 | def get_state(observation): 10 | cartX, cartXdot, cartTheta, cartThetaDot = observation 11 | cartX = int(np.digitize(cartX, cartPosSpace)) 12 | cartXdot = int(np.digitize(cartXdot, cartVelSpace)) 13 | cartTheta = int(np.digitize(cartTheta, poleThetaSpace)) 14 | cartThetaDot = int(np.digitize(cartThetaDot, poleThetaVelSpace)) 15 | 16 | return (cartX, cartXdot, cartTheta, cartThetaDot) 17 | 18 | def choose_action(q, obs, eps, n_actions=2): 19 | state = get_state(obs) 20 | if np.random.random() < eps: 21 | action = np.random.choice([i for i in range(n_actions)]) 22 | else: 23 | action_values = [q[(state, a)] for a in range(n_actions)] 24 | action = np.argmax(action_values) 25 | return action 26 | 27 | if __name__ == '__main__': 28 | env = gym.make('CartPole-v0') 29 | alpha = 0.1 30 | gamma = 0.9 31 | epsilon = 1.0 32 | 33 | states = [] 34 | for i in range(len(cartPosSpace)+1): 35 | for j in range(len(cartVelSpace)+1): 36 | for k in range(len(poleThetaSpace)+1): 37 | for l in range(len(poleThetaVelSpace)+1): 38 | states.append((i,j,k,l)) 39 | 40 | Q = {} 41 | for s in states: 42 | for a in range(2): 43 | Q[(s, a)] = 0.0 44 | 45 | n = 16 46 | state_memory = np.zeros((n, 4)) 47 | action_memory = np.zeros(n) 48 | reward_memory = np.zeros(n) 49 | 50 | scores = [] 51 | n_episodes = 50000 52 | for i in range(n_episodes): 53 | done = False 54 | score = 0 55 | t = 0 56 | T = np.inf 57 | observation = env.reset() 58 | action = choose_action(Q, observation, epsilon) 59 | action_memory[t%n] = action 60 | state_memory[t%n] = observation 61 | while not done: 62 | observation, reward, done, info = env.step(action) 63 | score += reward 64 | state_memory[(t+1)%n] = observation 65 | reward_memory[(t+1)%n] = reward 66 | if done: 67 | T = t + 1 68 | #print('episode ends at step', t) 69 | action = choose_action(Q, observation, epsilon) 70 | action_memory[(t+1)%n] = action 71 | tau = t - n + 1 72 | if tau >= 0: 73 | G = [gamma**(j-tau-1)*reward_memory[j%n] \ 74 | for j in range(tau+1, min(tau+n, T)+1)] 75 | G = np.sum(G) 76 | if tau + n < T: 77 | s = get_state(state_memory[(tau+n)%n]) 78 | a = int(action_memory[(tau+n)%n]) 79 | G += gamma**n * Q[(s,a)] 80 | s = get_state(state_memory[tau%n]) 81 | a = action_memory[tau%n] 82 | Q[(s,a)] += alpha*(G-Q[(s,a)]) 83 | #print('tau ', tau, '| Q %.2f' % \ 84 | # Q[(get_state(state_memory[tau%n]), action_memory[tau%n])]) 85 | 86 | t += 1 87 | 88 | for tau in range(t-n+1, T): 89 | G = [gamma**(j-tau-1)*reward_memory[j%n] \ 90 | for j in range(tau+1, min(tau+n, T)+1)] 91 | G = np.sum(G) 92 | if tau + n < T: 93 | s = get_state(state_memory[(tau+n)%n]) 94 | a = int(action_memory[(tau+n)%n]) 95 | G += gamma**n * Q[(s,a)] 96 | s = get_state(state_memory[tau%n]) 97 | a = action_memory[tau%n] 98 | Q[(s,a)] += alpha*(G-Q[(s,a)]) 99 | #print('tau ', tau, '| Q %.2f' % \ 100 | # Q[(get_state(state_memory[tau%n]), action_memory[tau%n])]) 101 | scores.append(score) 102 | avg_score = np.mean(scores[-1000:]) 103 | epsilon = epsilon -2 / n_episodes if epsilon > 0 else 0 104 | if i % 1000 == 0: 105 | print('episode ', i, 'avg_score %.1f' % 
avg_score, 106 | 'epsilon %.2f' % epsilon) 107 | 108 | -------------------------------------------------------------------------------- /ReinforcementLearning/Fundamentals/sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import gym 4 | 5 | def maxAction(Q, state): 6 | values = np.array([Q[state,a] for a in range(2)]) 7 | action = np.argmax(values) 8 | return action 9 | 10 | #discretize the spaces 11 | poleThetaSpace = np.linspace(-0.20943951, 0.20943951, 10) 12 | poleThetaVelSpace = np.linspace(-4, 4, 10) 13 | cartPosSpace = np.linspace(-2.4, 2.4, 10) 14 | cartVelSpace = np.linspace(-4, 4, 10) 15 | 16 | def getState(observation): 17 | cartX, cartXdot, cartTheta, cartThetadot = observation 18 | cartX = int(np.digitize(cartX, cartPosSpace)) 19 | cartXdot = int(np.digitize(cartXdot, cartVelSpace)) 20 | cartTheta = int(np.digitize(cartTheta, poleThetaSpace)) 21 | cartThetadot = int(np.digitize(cartThetadot, poleThetaVelSpace)) 22 | 23 | return (cartX, cartXdot, cartTheta, cartThetadot) 24 | 25 | if __name__ == '__main__': 26 | env = gym.make('CartPole-v0') 27 | # model hyperparameters 28 | ALPHA = 0.1 29 | GAMMA = 0.9 30 | EPS = 1.0 31 | 32 | #construct state space 33 | states = [] 34 | for i in range(len(cartPosSpace)+1): 35 | for j in range(len(cartVelSpace)+1): 36 | for k in range(len(poleThetaSpace)+1): 37 | for l in range(len(poleThetaVelSpace)+1): 38 | states.append((i,j,k,l)) 39 | 40 | Q = {} 41 | for s in states: 42 | for a in range(2): 43 | Q[s, a] = 0 44 | 45 | numGames = 50000 46 | totalRewards = np.zeros(numGames) 47 | for i in range(numGames): 48 | if i % 5000 == 0: 49 | print('starting game', i) 50 | # cart x position, cart velocity, pole theta, pole velocity 51 | observation = env.reset() 52 | s = getState(observation) 53 | rand = np.random.random() 54 | a = maxAction(Q, s) if rand < (1-EPS) else env.action_space.sample() 55 | done = False 56 | epRewards = 0 57 | while not done: 58 | observation_, reward, done, info = env.step(a) 59 | s_ = getState(observation_) 60 | rand = np.random.random() 61 | a_ = maxAction(Q, s_) if rand < (1-EPS) else env.action_space.sample() 62 | epRewards += reward 63 | Q[s,a] = Q[s,a] + ALPHA*(reward + GAMMA*Q[s_,a_] - Q[s,a]) 64 | s, a = s_, a_ 65 | EPS -= 2/(numGames) if EPS > 0 else 0 66 | totalRewards[i] = epRewards 67 | 68 | plt.plot(totalRewards, 'b--') 69 | plt.show() -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/A3C_CartPole_no_rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/ICM/A3C_CartPole_no_rewards.png -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/ICM_CartPole_no_rewards.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/ICM/ICM_CartPole_no_rewards.png -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from 
torch.distributions import Categorical 6 | 7 | 8 | class ActorCritic(nn.Module): 9 | def __init__(self, input_dims, n_actions, gamma=0.99, tau=0.98): 10 | super(ActorCritic, self).__init__() 11 | self.gamma = gamma 12 | self.tau = tau 13 | 14 | self.input = nn.Linear(*input_dims, 256) 15 | self.dense = nn.Linear(256, 256) 16 | 17 | self.gru = nn.GRUCell(256, 256) 18 | self.pi = nn.Linear(256, n_actions) 19 | self.v = nn.Linear(256, 1) 20 | 21 | def forward(self, state, hx): 22 | x = F.relu(self.input(state)) 23 | x = F.relu(self.dense(x)) 24 | hx = self.gru(x, (hx)) 25 | 26 | pi = self.pi(hx) 27 | v = self.v(hx) 28 | 29 | probs = T.softmax(pi, dim=1) 30 | dist = Categorical(probs) 31 | action = dist.sample() 32 | log_prob = dist.log_prob(action) 33 | 34 | return action.numpy()[0], v, log_prob, hx 35 | 36 | def calc_R(self, done, rewards, values): 37 | values = T.cat(values).squeeze() 38 | if len(values.size()) == 1: # batch of states 39 | R = values[-1] * (1-int(done)) 40 | elif len(values.size()) == 0: # single state 41 | R = values*(1-int(done)) 42 | 43 | batch_return = [] 44 | for reward in rewards[::-1]: 45 | R = reward + self.gamma * R 46 | batch_return.append(R) 47 | batch_return.reverse() 48 | batch_return = T.tensor(batch_return, 49 | dtype=T.float).reshape(values.size()) 50 | return batch_return 51 | 52 | def calc_loss(self, new_states, hx, done, 53 | rewards, values, log_probs, r_i_t=None): 54 | if r_i_t is not None: 55 | rewards += r_i_t.detach().numpy() 56 | returns = self.calc_R(done, rewards, values) 57 | next_v = T.zeros(1, 1) if done else self.forward(T.tensor([new_states], 58 | dtype=T.float), hx)[1] 59 | 60 | values.append(next_v.detach()) 61 | values = T.cat(values).squeeze() 62 | log_probs = T.cat(log_probs) 63 | rewards = T.tensor(rewards) 64 | 65 | delta_t = rewards + self.gamma*values[1:] - values[:-1] 66 | n_steps = len(delta_t) 67 | gae = np.zeros(n_steps) 68 | for t in range(n_steps): 69 | for k in range(0, n_steps-t): 70 | temp = (self.gamma*self.tau)**k*delta_t[t+k] 71 | gae[t] += temp 72 | gae = T.tensor(gae, dtype=T.float) 73 | 74 | actor_loss = -(log_probs*gae).sum() 75 | entropy_loss = (-log_probs*T.exp(log_probs)).sum() 76 | # [T] vs () 77 | critic_loss = F.mse_loss(values[:-1].squeeze(), returns) 78 | 79 | total_loss = actor_loss + critic_loss - 0.01*entropy_loss 80 | return total_loss 81 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/icm.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ICM(nn.Module): 7 | def __init__(self, input_dims, n_actions=2, alpha=1, beta=0.2): 8 | super(ICM, self).__init__() 9 | self.alpha = alpha 10 | self.beta = beta 11 | # hard coded for cartpole environment 12 | self.inverse = nn.Linear(4*2, 256) 13 | self.pi_logits = nn.Linear(256, n_actions) 14 | 15 | self.dense1 = nn.Linear(4+1, 256) 16 | self.new_state = nn.Linear(256, 4) 17 | 18 | device = T.device('cpu') 19 | self.to(device) 20 | 21 | def forward(self, state, new_state, action): 22 | inverse = F.elu(self.inverse(T.cat([state, new_state], dim=1))) 23 | pi_logits = self.pi_logits(inverse) 24 | 25 | # from [T] to [T,1] 26 | action = action.reshape((action.size()[0], 1)) 27 | forward_input = T.cat([state, action], dim=1) 28 | dense = F.elu(self.dense1(forward_input)) 29 | state_ = self.new_state(dense) 30 | 31 | return pi_logits, state_ 32 | 33 | def calc_loss(self, state, 
new_state, action): 34 | state = T.tensor(state, dtype=T.float) 35 | action = T.tensor(action, dtype=T.float) 36 | new_state = T.tensor(new_state, dtype=T.float) 37 | 38 | pi_logits, state_ = self.forward(state, new_state, action) 39 | 40 | inverse_loss = nn.CrossEntropyLoss() 41 | L_I = (1-self.beta)*inverse_loss(pi_logits, action.to(T.long)) 42 | 43 | forward_loss = nn.MSELoss() 44 | L_F = self.beta*forward_loss(state_, new_state) 45 | 46 | intrinsic_reward = self.alpha*((state_ - new_state).pow(2)).mean(dim=1) 47 | return intrinsic_reward, L_I, L_F 48 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch.multiprocessing as mp 3 | from parallel_env import ParallelEnv 4 | 5 | os.environ['OMP_NUM_THREADS'] = '1' 6 | 7 | 8 | if __name__ == '__main__': 9 | mp.set_start_method('spawn') 10 | env_id = 'CartPole-v0' 11 | n_threads = 12 12 | n_actions = 2 13 | input_shape = [4] 14 | env = ParallelEnv(env_id=env_id, n_threads=n_threads, 15 | n_actions=n_actions, input_shape=input_shape, icm=True) 16 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/memory.py: -------------------------------------------------------------------------------- 1 | class Memory: 2 | def __init__(self): 3 | self.states = [] 4 | self.actions = [] 5 | self.rewards = [] 6 | self.new_states = [] 7 | self.values = [] 8 | self.log_probs = [] 9 | 10 | def remember(self, state, action, reward, new_state, value, log_p): 11 | self.actions.append(action) 12 | self.rewards.append(reward) 13 | self.states.append(state) 14 | self.new_states.append(new_state) 15 | self.log_probs.append(log_p) 16 | self.values.append(value) 17 | 18 | def clear_memory(self): 19 | self.states = [] 20 | self.actions = [] 21 | self.rewards = [] 22 | self.new_states = [] 23 | self.values = [] 24 | self.log_probs = [] 25 | 26 | def sample_memory(self): 27 | return self.states, self.actions, self.rewards, self.new_states,\ 28 | self.values, self.log_probs 29 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/parallel_env.py: -------------------------------------------------------------------------------- 1 | import torch.multiprocessing as mp 2 | from actor_critic import ActorCritic 3 | from icm import ICM 4 | from shared_adam import SharedAdam 5 | from worker import worker 6 | 7 | 8 | class ParallelEnv: 9 | def __init__(self, env_id, input_shape, n_actions, icm, n_threads=8): 10 | names = [str(i) for i in range(1, n_threads+1)] 11 | 12 | global_actor_critic = ActorCritic(input_shape, n_actions) 13 | global_actor_critic.share_memory() 14 | global_optim = SharedAdam(global_actor_critic.parameters()) 15 | 16 | if not icm: 17 | global_icm = None 18 | global_icm_optim = None 19 | else: 20 | global_icm = ICM(input_shape, n_actions) 21 | global_icm.share_memory() 22 | global_icm_optim = SharedAdam(global_icm.parameters()) 23 | 24 | self.ps = [mp.Process(target=worker, 25 | args=(name, input_shape, n_actions, 26 | global_actor_critic, global_icm, 27 | global_optim, global_icm_optim, env_id, 28 | n_threads, icm)) 29 | for name in names] 30 | 31 | [p.start() for p in self.ps] 32 | [p.join() for p in self.ps] 33 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/shared_adam.py: 
-------------------------------------------------------------------------------- 1 | # from Morvan Zhou's implementation: 2 | # https://github.com/MorvanZhou/pytorch-A3C 3 | 4 | import torch as T 5 | 6 | 7 | class SharedAdam(T.optim.Adam): 8 | def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), eps=1e-8, 9 | weight_decay=0): 10 | super(SharedAdam, self).__init__(params, lr=lr, betas=betas, 11 | eps=eps, weight_decay=weight_decay) 12 | 13 | for group in self.param_groups: 14 | for p in group['params']: 15 | state = self.state[p] 16 | state['step'] = 0 17 | state['exp_avg'] = T.zeros_like(p.data) 18 | state['exp_avg_sq'] = T.zeros_like(p.data) 19 | 20 | state['exp_avg'].share_memory_() 21 | state['exp_avg_sq'].share_memory_() 22 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | def plot_learning_curve(x, scores, figure_file): 6 | running_avg = np.zeros(len(scores)) 7 | for i in range(len(running_avg)): 8 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 9 | plt.plot(x, running_avg) 10 | plt.title('Running average of previous 100 episodes') 11 | plt.savefig(figure_file) 12 | -------------------------------------------------------------------------------- /ReinforcementLearning/ICM/worker.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import torch as T 4 | from actor_critic import ActorCritic 5 | from icm import ICM 6 | from memory import Memory 7 | from utils import plot_learning_curve 8 | 9 | 10 | def worker(name, input_shape, n_actions, global_agent, global_icm, 11 | optimizer, icm_optimizer, env_id, n_threads, icm=False): 12 | T_MAX = 20 13 | 14 | local_agent = ActorCritic(input_shape, n_actions) 15 | 16 | if icm: 17 | local_icm = ICM(input_shape, n_actions) 18 | algo = 'ICM' 19 | else: 20 | intrinsic_reward = T.zeros(1) 21 | algo = 'A3C' 22 | 23 | memory = Memory() 24 | 25 | env = gym.make(env_id) 26 | 27 | t_steps, max_eps, episode, scores, avg_score = 0, 1000, 0, [], 0 28 | 29 | while episode < max_eps: 30 | obs = env.reset() 31 | hx = T.zeros(1, 256) 32 | score, done, ep_steps = 0, False, 0 33 | while not done: 34 | state = T.tensor([obs], dtype=T.float) 35 | action, value, log_prob, hx = local_agent(state, hx) 36 | obs_, reward, done, info = env.step(action) 37 | t_steps += 1 38 | ep_steps += 1 39 | score += reward 40 | reward = 0 # turn off extrinsic rewards 41 | memory.remember(obs, action, reward, obs_, value, log_prob) 42 | obs = obs_ 43 | if ep_steps % T_MAX == 0 or done: 44 | states, actions, rewards, new_states, values, log_probs = \ 45 | memory.sample_memory() 46 | if icm: 47 | intrinsic_reward, L_I, L_F = \ 48 | local_icm.calc_loss(states, new_states, actions) 49 | 50 | loss = local_agent.calc_loss(obs, hx, done, rewards, values, 51 | log_probs, intrinsic_reward) 52 | 53 | optimizer.zero_grad() 54 | hx = hx.detach_() 55 | if icm: 56 | icm_optimizer.zero_grad() 57 | (L_I + L_F).backward() 58 | 59 | loss.backward() 60 | T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40) 61 | 62 | for local_param, global_param in zip( 63 | local_agent.parameters(), 64 | global_agent.parameters()): 65 | global_param._grad = local_param.grad 66 | optimizer.step() 67 | local_agent.load_state_dict(global_agent.state_dict()) 68 | 69 | if icm: 70 | for local_param, global_param in zip( 71 | 
local_icm.parameters(), 72 | global_icm.parameters()): 73 | global_param._grad = local_param.grad 74 | icm_optimizer.step() 75 | local_icm.load_state_dict(global_icm.state_dict()) 76 | memory.clear_memory() 77 | 78 | if name == '1': 79 | scores.append(score) 80 | avg_score = np.mean(scores[-100:]) 81 | print('{} episode {} thread {} of {} steps {:.2f}M score {:.2f} ' 82 | 'intrinsic_reward {:.2f} avg score (100) {:.1f}'.format( 83 | algo, episode, name, n_threads, 84 | t_steps/1e6, score, 85 | T.sum(intrinsic_reward), 86 | avg_score)) 87 | episode += 1 88 | if name == '1': 89 | x = [z for z in range(episode)] 90 | fname = algo + '_CartPole_no_rewards.png' 91 | plot_learning_curve(x, scores, fname) 92 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/Torch-LunarLander-alpha000025-beta00025-400-300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/Torch-LunarLander-alpha000025-beta00025-400-300.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/main_torch.py: -------------------------------------------------------------------------------- 1 | from ddpg_torch import Agent 2 | import gym 3 | import numpy as np 4 | from utils import plotLearning 5 | 6 | env = gym.make('LunarLanderContinuous-v2') 7 | agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[8], tau=0.001, env=env, 8 | batch_size=64, layer1_size=400, layer2_size=300, n_actions=2) 9 | 10 | #agent.load_models() 11 | np.random.seed(0) 12 | 13 | score_history = [] 14 | for i in range(1000): 15 | obs = env.reset() 16 | done = False 17 | score = 0 18 | while not done: 19 | act = agent.choose_action(obs) 20 | new_state, reward, done, info = env.step(act) 21 | agent.remember(obs, act, reward, new_state, int(done)) 22 | agent.learn() 23 | score += reward 24 | obs = new_state 25 | #env.render() 26 | score_history.append(score) 27 | 28 | #if i % 25 == 0: 29 | # agent.save_models() 30 | 31 | print('episode ', i, 'score %.2f' % score, 32 | 'trailing 100 games avg %.3f' % np.mean(score_history[-100:])) 33 | 34 | filename = 'LunarLander-alpha000025-beta00025-400-300.png' 35 | plotLearning(score_history, filename, window=100) 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/pytorch/lunar-lander/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plotLearning(scores, filename, x=None, window=5): 5 | N = len(scores) 6 | running_avg = np.empty(N) 7 | for t in range(N): 8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)]) 9 | if x is None: 10 | x = [i for i in range(N)] 11 | plt.ylabel('Score') 12 | plt.xlabel('Game') 13 | plt.plot(x, running_avg) 14 | plt.savefig(filename) -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/pendulum/main_tf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import numpy as np 4 | from ddpg_orig_tf import Agent 5 | from utils import plotLearning 6 | 7 | # Uncomment the lines below 
to specify which gpu to run on 8 | #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 9 | #os.environ["CUDA_VISIBLE_DEVICES"] = "0" 10 | 11 | if __name__ == '__main__': 12 | env = gym.make('Pendulum-v0') 13 | agent = Agent(alpha=0.00005, beta=0.0005, input_dims=[3], tau=0.001, 14 | env=env, batch_size=64, layer1_size=800, layer2_size=600, 15 | n_actions=1) 16 | np.random.seed(0) 17 | score_history = [] 18 | for i in range(1000): 19 | obs = env.reset() 20 | done = False 21 | score = 0 22 | while not done: 23 | act = agent.choose_action(obs) 24 | new_state, reward, done, info = env.step(act) 25 | agent.remember(obs, act, reward, new_state, int(done)) 26 | agent.learn() 27 | score += reward 28 | obs = new_state 29 | #env.render() 30 | score_history.append(score) 31 | print('episode ', i, 'score %.2f' % score, 32 | 'trailing 100 games avg %.3f' % np.mean(score_history[-100:])) 33 | 34 | filename = 'Pendulum-alpha00005-beta0005-800-600-optimized.png' 35 | plotLearning(score_history, filename, window=100) 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/pendulum/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plotLearning(scores, filename, x=None, window=5): 5 | N = len(scores) 6 | running_avg = np.empty(N) 7 | for t in range(N): 8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)]) 9 | if x is None: 10 | x = [i for i in range(N)] 11 | plt.ylabel('Score') 12 | plt.xlabel('Game') 13 | plt.plot(x, running_avg) 14 | plt.savefig(filename) -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/main_tf.py: -------------------------------------------------------------------------------- 1 | from ddpg_orig_tf import Agent 2 | import gym 3 | import numpy as np 4 | from utils import plotLearning 5 | from gym import wrappers 6 | import os 7 | 8 | #tf.set_random_seed(0) 9 | if __name__ == '__main__': 10 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 12 | 13 | env = gym.make('BipedalWalker-v2') 14 | agent = Agent(alpha=0.00005, beta=0.0005, input_dims=[24], tau=0.001, env=env, 15 | batch_size=64, layer1_size=400, layer2_size=300, n_actions=4, 16 | chkpt_dir='tmp/ddpg') 17 | np.random.seed(0) 18 | #agent.load_models() 19 | #env = wrappers.Monitor(env, "tmp/walker2d", 20 | # video_callable=lambda episode_id: True, force=True) 21 | score_history = [] 22 | for i in range(5000): 23 | obs = env.reset() 24 | done = False 25 | score = 0 26 | while not done: 27 | act = agent.choose_action(obs) 28 | new_state, reward, done, info = env.step(act) 29 | agent.remember(obs, act, reward, new_state, int(done)) 30 | agent.learn() 31 | score += reward 32 | obs = new_state 33 | env.render() 34 | score_history.append(score) 35 | print('episode ', i, 'score %.2f' % score, 36 | 'trailing 100 games avg %.3f' % np.mean(score_history[-100:])) 37 | if i % 25 == 0: 38 | agent.save_models() 39 | filename = 'WalkerTF-alpha00005-beta0005-400-300-original-5000games-testing.png' 40 | plotLearning(score_history, filename, window=100) 41 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.data-00000-of-00001: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.index -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Actor_ddpg.ckpt.meta -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.index -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/Critic_ddpg.ckpt.meta -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.index: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.index -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetActor_ddpg.ckpt.meta -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.index -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow/walker2d/tmp/ddpg_best_3/TargetCritic_ddpg.ckpt.meta -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer: 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.action_memory[index] = action 19 | self.reward_memory[index] = reward 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards 
= self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/main_ddpg.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from ddpg_tf2 import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('Pendulum-v0') 8 | agent = Agent(input_dims=env.observation_space.shape, env=env, 9 | n_actions=env.action_space.shape[0]) 10 | n_games = 250 11 | 12 | figure_file = 'plots/pendulum.png' 13 | 14 | best_score = env.reward_range[0] 15 | score_history = [] 16 | load_checkpoint = False 17 | 18 | if load_checkpoint: 19 | n_steps = 0 20 | while n_steps <= agent.batch_size: 21 | observation = env.reset() 22 | action = env.action_space.sample() 23 | observation_, reward, done, info = env.step(action) 24 | agent.remember(observation, action, reward, observation_, done) 25 | n_steps += 1 26 | agent.learn() 27 | agent.load_models() 28 | evaluate = True 29 | else: 30 | evaluate = False 31 | 32 | for i in range(n_games): 33 | observation = env.reset() 34 | done = False 35 | score = 0 36 | while not done: 37 | action = agent.choose_action(observation, evaluate) 38 | observation_, reward, done, info = env.step(action) 39 | score += reward 40 | agent.remember(observation, action, reward, observation_, done) 41 | if not load_checkpoint: 42 | agent.learn() 43 | observation = observation_ 44 | 45 | score_history.append(score) 46 | avg_score = np.mean(score_history[-100:]) 47 | 48 | if avg_score > best_score: 49 | best_score = avg_score 50 | if not load_checkpoint: 51 | agent.save_models() 52 | 53 | print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score) 54 | 55 | if not load_checkpoint: 56 | x = [i+1 for i in range(n_games)] 57 | plot_learning_curve(x, score_history, figure_file) 58 | 59 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | from tensorflow.keras.layers import Dense 5 | 6 | class CriticNetwork(keras.Model): 7 | def __init__(self, fc1_dims=512, fc2_dims=512, 8 | name='critic', chkpt_dir='tmp/ddpg'): 9 | super(CriticNetwork, self).__init__() 10 | self.fc1_dims = fc1_dims 11 | self.fc2_dims = fc2_dims 12 | 13 | self.model_name = name 14 | self.checkpoint_dir = chkpt_dir 15 | self.checkpoint_file = os.path.join(self.checkpoint_dir, 16 | self.model_name+'_ddpg.h5') 17 | 18 | self.fc1 = Dense(self.fc1_dims, activation='relu') 19 | self.fc2 = Dense(self.fc2_dims, activation='relu') 20 | self.q = Dense(1, activation=None) 21 | 22 | def call(self, state, action): 23 | action_value = self.fc1(tf.concat([state, action], axis=1)) 24 | action_value = self.fc2(action_value) 25 | 26 | q = self.q(action_value) 27 | 28 | return q 29 | 30 | class ActorNetwork(keras.Model): 31 | def __init__(self, fc1_dims=512, fc2_dims=512, n_actions=2, name='actor', 32 | chkpt_dir='tmp/ddpg'): 33 | super(ActorNetwork, self).__init__() 34 | self.fc1_dims = fc1_dims 35 | self.fc2_dims = fc2_dims 36 | self.n_actions = n_actions 37 | 38 | self.model_name = name 39 | self.checkpoint_dir = chkpt_dir 40 | 
self.checkpoint_file = os.path.join(self.checkpoint_dir, 41 | self.model_name+'_ddpg.h5') 42 | 43 | self.fc1 = Dense(self.fc1_dims, activation='relu') 44 | self.fc2 = Dense(self.fc2_dims, activation='relu') 45 | self.mu = Dense(self.n_actions, activation='tanh') 46 | 47 | def call(self, state): 48 | prob = self.fc1(state) 49 | prob = self.fc2(prob) 50 | 51 | mu = self.mu(prob) 52 | 53 | return mu 54 | 55 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/pendulum.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/DDPG/tensorflow2/pendulum/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('CartPole-v0') 8 | N = 20 9 | batch_size = 5 10 | n_epochs = 4 11 | alpha = 0.0003 12 | agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 13 | alpha=alpha, n_epochs=n_epochs, 14 | input_dims=env.observation_space.shape) 15 | n_games = 300 16 | 17 | figure_file = 'plots/cartpole.png' 18 | 19 | best_score = env.reward_range[0] 20 | score_history = [] 21 | 22 | learn_iters = 0 23 | avg_score = 0 24 | n_steps = 0 25 | 26 | for i in range(n_games): 27 | observation = env.reset() 28 | done = False 29 | score = 0 30 | while not done: 31 | action, prob, val = agent.choose_action(observation) 32 | observation_, reward, done, info = env.step(action) 33 | n_steps += 1 34 | score += reward 35 | agent.store_transition(observation, action, 36 | prob, val, reward, done) 37 | if n_steps % N == 0: 38 | agent.learn() 39 | learn_iters += 1 40 | observation = observation_ 41 | score_history.append(score) 42 | avg_score = np.mean(score_history[-100:]) 43 | 44 | if avg_score > best_score: 45 | best_score = avg_score 46 | agent.save_models() 47 | 48 | print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score, 49 | 'time_steps', n_steps, 'learning_steps', learn_iters) 50 | x = [i+1 for i in range(len(score_history))] 51 | plot_learning_curve(x, score_history, figure_file) 52 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/tf2/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class PPOMemory: 5 | def __init__(self, batch_size): 6 | self.states = [] 7 | self.probs = [] 8 | self.vals = [] 9 | self.actions = [] 10 | self.rewards = [] 11 | self.dones 
= [] 12 | 13 | self.batch_size = batch_size 14 | 15 | def generate_batches(self): 16 | n_states = len(self.states) 17 | batch_start = np.arange(0, n_states, self.batch_size) 18 | indices = np.arange(n_states, dtype=np.int64) 19 | np.random.shuffle(indices) 20 | batches = [indices[i:i+self.batch_size] for i in batch_start] 21 | 22 | return np.array(self.states),\ 23 | np.array(self.actions),\ 24 | np.array(self.probs),\ 25 | np.array(self.vals),\ 26 | np.array(self.rewards),\ 27 | np.array(self.dones),\ 28 | batches 29 | 30 | def store_memory(self, state, action, probs, vals, reward, done): 31 | self.states.append(state) 32 | self.actions.append(action) 33 | self.probs.append(probs) 34 | self.vals.append(vals) 35 | self.rewards.append(reward) 36 | self.dones.append(done) 37 | 38 | def clear_memory(self): 39 | self.states = [] 40 | self.probs = [] 41 | self.actions = [] 42 | self.rewards = [] 43 | self.dones = [] 44 | self.vals = [] 45 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Dense 3 | 4 | 5 | class ActorNetwork(keras.Model): 6 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256): 7 | super(ActorNetwork, self).__init__() 8 | 9 | self.fc1 = Dense(fc1_dims, activation='relu') 10 | self.fc2 = Dense(fc2_dims, activation='relu') 11 | self.fc3 = Dense(n_actions, activation='softmax') 12 | 13 | def call(self, state): 14 | x = self.fc1(state) 15 | x = self.fc2(x) 16 | x = self.fc3(x) 17 | 18 | return x 19 | 20 | 21 | class CriticNetwork(keras.Model): 22 | def __init__(self, fc1_dims=256, fc2_dims=256): 23 | super(CriticNetwork, self).__init__() 24 | self.fc1 = Dense(fc1_dims, activation='relu') 25 | self.fc2 = Dense(fc2_dims, activation='relu') 26 | self.q = Dense(1, activation=None) 27 | 28 | def call(self, state): 29 | x = self.fc1(state) 30 | x = self.fc2(x) 31 | q = self.q(x) 32 | 33 | return q 34 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/torch/Slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/PPO/torch/Slides.pdf -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/torch/cartpole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/PPO/torch/cartpole.png -------------------------------------------------------------------------------- 
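The PPOMemory class defined in ReinforcementLearning/PolicyGradient/PPO/tf2/memory.py above collects one rollout segment at a time; generate_batches shuffles the transition indices once and slices them into minibatches of batch_size, so each stored transition is visited exactly once per pass over the memory. The following is a minimal usage sketch, not a file from the repository: it assumes memory.py is importable as memory and uses made-up CartPole-sized observations.

import numpy as np
from memory import PPOMemory  # ReinforcementLearning/PolicyGradient/PPO/tf2/memory.py

memory = PPOMemory(batch_size=5)
for step in range(20):                    # N = 20 steps per update, as in main.py
    observation = np.random.randn(4)      # stand-in for a CartPole observation
    memory.store_memory(observation, action=0, probs=-0.69, vals=0.0,
                        reward=1.0, done=False)

states, actions, probs, vals, rewards, dones, batches = memory.generate_batches()
print(states.shape)                       # (20, 4): the full segment, in storage order
print([len(batch) for batch in batches])  # four shuffled index slices of length 5
memory.clear_memory()                     # the agent clears memory after each learn()

--------------------------------------------------------------------------------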
/ReinforcementLearning/PolicyGradient/PPO/torch/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from ppo_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('CartPole-v0') 8 | N = 20 9 | batch_size = 5 10 | n_epochs = 4 11 | alpha = 0.0003 12 | agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 13 | alpha=alpha, n_epochs=n_epochs, 14 | input_dims=env.observation_space.shape) 15 | n_games = 300 16 | 17 | figure_file = 'plots/cartpole.png' 18 | 19 | best_score = env.reward_range[0] 20 | score_history = [] 21 | 22 | learn_iters = 0 23 | avg_score = 0 24 | n_steps = 0 25 | 26 | for i in range(n_games): 27 | observation = env.reset() 28 | done = False 29 | score = 0 30 | while not done: 31 | action, prob, val = agent.choose_action(observation) 32 | observation_, reward, done, info = env.step(action) 33 | n_steps += 1 34 | score += reward 35 | agent.remember(observation, action, prob, val, reward, done) 36 | if n_steps % N == 0: 37 | agent.learn() 38 | learn_iters += 1 39 | observation = observation_ 40 | score_history.append(score) 41 | avg_score = np.mean(score_history[-100:]) 42 | 43 | if avg_score > best_score: 44 | best_score = avg_score 45 | agent.save_models() 46 | 47 | print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score, 48 | 'time_steps', n_steps, 'learning_steps', learn_iters) 49 | x = [i+1 for i in range(len(score_history))] 50 | plot_learning_curve(x, score_history, figure_file) 51 | 52 | 53 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/PPO/torch/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.action_memory[index] = action 19 | self.reward_memory[index] = reward 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards = 
self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | 37 | 38 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/main_sac.py: -------------------------------------------------------------------------------- 1 | import pybullet_envs 2 | import gym 3 | import numpy as np 4 | from sac_torch import Agent 5 | from utils import plot_learning_curve 6 | from gym import wrappers 7 | 8 | if __name__ == '__main__': 9 | env = gym.make('InvertedPendulumBulletEnv-v0') 10 | agent = Agent(input_dims=env.observation_space.shape, env=env, 11 | n_actions=env.action_space.shape[0]) 12 | n_games = 250 13 | # uncomment this line and do a mkdir tmp && mkdir video if you want to 14 | # record video of the agent playing the game. 15 | #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True) 16 | filename = 'inverted_pendulum.png' 17 | 18 | figure_file = 'plots/' + filename 19 | 20 | best_score = env.reward_range[0] 21 | score_history = [] 22 | load_checkpoint = False 23 | 24 | if load_checkpoint: 25 | agent.load_models() 26 | env.render(mode='human') 27 | 28 | for i in range(n_games): 29 | observation = env.reset() 30 | done = False 31 | score = 0 32 | while not done: 33 | action = agent.choose_action(observation) 34 | observation_, reward, done, info = env.step(action) 35 | score += reward 36 | agent.remember(observation, action, reward, observation_, done) 37 | if not load_checkpoint: 38 | agent.learn() 39 | observation = observation_ 40 | score_history.append(score) 41 | avg_score = np.mean(score_history[-100:]) 42 | 43 | if avg_score > best_score: 44 | best_score = avg_score 45 | if not load_checkpoint: 46 | agent.save_models() 47 | 48 | print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score) 49 | 50 | if not load_checkpoint: 51 | x = [i+1 for i in range(n_games)] 52 | plot_learning_curve(x, score_history, figure_file) 53 | 54 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/Slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/SAC/tf2/Slides.pdf -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer: 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.action_memory[index] = action 19 | self.reward_memory[index] = reward 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, 
self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards = self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/main_sac.py: -------------------------------------------------------------------------------- 1 | import pybullet_envs 2 | import gym 3 | import numpy as np 4 | from sac_tf2 import Agent 5 | from utils import plot_learning_curve 6 | from gym import wrappers 7 | 8 | if __name__ == '__main__': 9 | env = gym.make('InvertedPendulumBulletEnv-v0') 10 | agent = Agent(input_dims=env.observation_space.shape, env=env, 11 | n_actions=env.action_space.shape[0]) 12 | n_games = 250 13 | # uncomment this line and do a mkdir tmp && mkdir tmp/video if you want to 14 | # record video of the agent playing the game. 15 | #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True) 16 | filename = 'inverted_pendulum.png' 17 | 18 | figure_file = 'plots/' + filename 19 | 20 | best_score = env.reward_range[0] 21 | score_history = [] 22 | load_checkpoint = True 23 | 24 | if load_checkpoint: 25 | agent.load_models() 26 | env.render(mode='human') 27 | 28 | for i in range(n_games): 29 | observation = env.reset() 30 | done = False 31 | score = 0 32 | while not done: 33 | action = agent.choose_action(observation) 34 | observation_, reward, done, info = env.step(action) 35 | score += reward 36 | agent.remember(observation, action, reward, observation_, done) 37 | if not load_checkpoint: 38 | agent.learn() 39 | observation = observation_ 40 | score_history.append(score) 41 | avg_score = np.mean(score_history[-100:]) 42 | 43 | if avg_score > best_score: 44 | best_score = avg_score 45 | if not load_checkpoint: 46 | agent.save_models() 47 | 48 | print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score) 49 | 50 | if not load_checkpoint: 51 | x = [i+1 for i in range(n_games)] 52 | plot_learning_curve(x, score_history, figure_file) 53 | 54 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | import tensorflow.keras as keras 5 | import tensorflow_probability as tfp 6 | from tensorflow.keras.layers import Dense 7 | 8 | class CriticNetwork(keras.Model): 9 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256, 10 | name='critic', chkpt_dir='tmp/sac'): 11 | super(CriticNetwork, self).__init__() 12 | self.fc1_dims = fc1_dims 13 | self.fc2_dims = fc2_dims 14 | self.n_actions = n_actions 15 | self.model_name = name 16 | self.checkpoint_dir = chkpt_dir 17 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 18 | 19 | self.fc1 = Dense(self.fc1_dims, activation='relu') 20 | self.fc2 = Dense(self.fc2_dims, activation='relu') 21 | self.q = Dense(1, activation=None) 22 | 23 | def call(self, state, action): 24 | action_value = self.fc1(tf.concat([state, action], axis=1)) 25 | action_value = self.fc2(action_value) 26 | 27 | q = self.q(action_value) 28 | 29 | return q 30 | 31 | class ValueNetwork(keras.Model): 32 | def __init__(self, fc1_dims=256, fc2_dims=256, 33 | 
name='value', chkpt_dir='tmp/sac'): 34 | super(ValueNetwork, self).__init__() 35 | self.fc1_dims = fc1_dims 36 | self.fc2_dims = fc2_dims 37 | self.model_name = name 38 | self.checkpoint_dir = chkpt_dir 39 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 40 | 41 | self.fc1 = Dense(self.fc1_dims, activation='relu') 42 | self.fc2 = Dense(fc2_dims, activation='relu') 43 | self.v = Dense(1, activation=None) 44 | 45 | def call(self, state): 46 | state_value = self.fc1(state) 47 | state_value = self.fc2(state_value) 48 | 49 | v = self.v(state_value) 50 | 51 | return v 52 | 53 | class ActorNetwork(keras.Model): 54 | def __init__(self, max_action, fc1_dims=256, 55 | fc2_dims=256, n_actions=2, name='actor', chkpt_dir='tmp/sac'): 56 | super(ActorNetwork, self).__init__() 57 | self.fc1_dims = fc1_dims 58 | self.fc2_dims = fc2_dims 59 | self.n_actions = n_actions 60 | self.model_name = name 61 | self.checkpoint_dir = chkpt_dir 62 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 63 | self.max_action = max_action 64 | self.noise = 1e-6 65 | 66 | self.fc1 = Dense(self.fc1_dims, activation='relu') 67 | self.fc2 = Dense(self.fc2_dims, activation='relu') 68 | self.mu = Dense(self.n_actions, activation=None) 69 | self.sigma = Dense(self.n_actions, activation=None) 70 | 71 | def call(self, state): 72 | prob = self.fc1(state) 73 | prob = self.fc2(prob) 74 | 75 | mu = self.mu(prob) 76 | sigma = self.sigma(prob) 77 | # might want to come back and change this, perhaps tf plays more nicely with 78 | # a sigma of ~0 79 | sigma = tf.clip_by_value(sigma, self.noise, 1) 80 | 81 | return mu, sigma 82 | 83 | def sample_normal(self, state, reparameterize=True): 84 | mu, sigma = self.call(state) 85 | probabilities = tfp.distributions.Normal(mu, sigma) 86 | 87 | if reparameterize: 88 | actions = probabilities.sample() # + something else if you want to implement 89 | else: 90 | actions = probabilities.sample() 91 | 92 | action = tf.math.tanh(actions)*self.max_action 93 | log_probs = probabilities.log_prob(actions) 94 | log_probs -= tf.math.log(1-tf.math.pow(action,2)+self.noise) 95 | log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True) 96 | 97 | return action, log_probs 98 | 99 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/plots/inverted_pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/SAC/tf2/plots/inverted_pendulum.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/SAC/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def 
plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/TD3/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from td3_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLanderContinuous-v2') 8 | agent = Agent(alpha=0.001, beta=0.001, 9 | input_dims=env.observation_space.shape, tau=0.005, 10 | env=env, batch_size=100, layer1_size=400, layer2_size=300, 11 | n_actions=env.action_space.shape[0]) 12 | n_games = 1000 13 | filename = 'plots/' + 'LunarLanderContinuous_' + str(n_games) + '_games.png' 14 | 15 | best_score = env.reward_range[0] 16 | score_history = [] 17 | 18 | agent.load_models() 19 | 20 | for i in range(n_games): 21 | observation = env.reset() 22 | done = False 23 | score = 0 24 | while not done: 25 | action = agent.choose_action(observation) 26 | observation_, reward, done, info = env.step(action) 27 | agent.remember(observation, action, reward, observation_, done) 28 | agent.learn() 29 | score += reward 30 | observation = observation_ 31 | score_history.append(score) 32 | avg_score = np.mean(score_history[-100:]) 33 | 34 | if avg_score > best_score: 35 | best_score = avg_score 36 | agent.save_models() 37 | 38 | print('episode ', i, 'score %.1f' % score, 39 | 'average score %.1f' % avg_score) 40 | 41 | x = [i+1 for i in range(n_games)] 42 | plot_learning_curve(x, score_history, filename) 43 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/TD3/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from td3_tf2 import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | #env = gym.make('LunarLanderContinuous-v2') 8 | #env = gym.make('Pendulum-v0') 9 | env = gym.make('BipedalWalker-v2') 10 | agent = Agent(alpha=0.001, beta=0.001, 11 | input_dims=env.observation_space.shape, tau=0.005, 12 | env=env, batch_size=100, layer1_size=400, layer2_size=300, 13 | n_actions=env.action_space.shape[0]) 14 | n_games = 1000 15 | filename = 'plots/' + 'walker_' + str(n_games) + '_games.png' 16 | 17 | best_score = env.reward_range[0] 18 | score_history = [] 19 | 20 | #agent.load_models() 21 | 22 | for i in range(n_games): 23 | observation = env.reset() 24 | done = False 25 | score = 0 26 | while not done: 27 | action = agent.choose_action(observation) 28 | observation_, reward, done, info = env.step(action) 29 | agent.remember(observation, action, reward, observation_, done) 30 | agent.learn() 31 | score += reward 32 | observation = observation_ 33 | score_history.append(score) 34 | avg_score = np.mean(score_history[-100:]) 35 | 36 | if avg_score > best_score: 37 | best_score = avg_score 38 | agent.save_models() 39 | 40 | print('episode ', i, 'score %.1f' % score, 41 | 'average score %.1f' % avg_score) 42 | 43 | x = [i+1 for i in range(n_games)] 44 | plot_learning_curve(x, score_history, filename) 45 | -------------------------------------------------------------------------------- 
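Both TD3 driver scripts above (ReinforcementLearning/PolicyGradient/TD3/main.py and TD3/tf2/main.py) share the same loop: act, store the transition, call agent.learn() on every step, and checkpoint whenever the trailing 100-game average improves. The same Agent interface can also be reused to watch an already-trained policy. The sketch below is a hedged example, not a file from the repository: it assumes checkpoints were previously written by agent.save_models(), and choose_action may still add exploration noise, since that detail lives in td3_torch.py, which is not shown here.

import gym
import numpy as np
from td3_torch import Agent   # the same Agent class used by TD3/main.py above

# Evaluation-only loop: load the saved checkpoints, never call remember()/learn().
env = gym.make('LunarLanderContinuous-v2')
agent = Agent(alpha=0.001, beta=0.001,
              input_dims=env.observation_space.shape, tau=0.005,
              env=env, batch_size=100, layer1_size=400, layer2_size=300,
              n_actions=env.action_space.shape[0])
agent.load_models()

scores = []
for i in range(10):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        action = agent.choose_action(observation)
        observation, reward, done, info = env.step(action)
        score += reward
    scores.append(score)
    print('evaluation episode', i, 'score %.1f' % score)
print('mean evaluation score %.1f' % np.mean(scores))

--------------------------------------------------------------------------------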
/ReinforcementLearning/PolicyGradient/TD3/tf2/plots/walker_1500_games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/TD3/tf2/plots/walker_1500_games.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/TD3/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/TD3/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/actor_critic_keras.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.layers import Dense, Activation, Input 3 | from keras.models import Model, load_model 4 | from keras.optimizers import Adam 5 | import numpy as np 6 | 7 | class Agent(object): 8 | def __init__(self, alpha, beta, gamma=0.99, n_actions=4, 9 | layer1_size=1024, layer2_size=512, input_dims=8): 10 | self.gamma = gamma 11 | self.alpha = alpha 12 | self.beta = beta 13 | self.input_dims = input_dims 14 | self.fc1_dims = layer1_size 15 | self.fc2_dims = layer2_size 16 | self.n_actions = n_actions 17 | 18 | self.actor, self.critic, self.policy = self.build_actor_critic_network() 19 | self.action_space = [i for i in range(n_actions)] 20 | 21 | def build_actor_critic_network(self): 22 | input = Input(shape=(self.input_dims,)) 23 | delta = Input(shape=[1]) 24 | dense1 = Dense(self.fc1_dims, activation='relu')(input) 25 | dense2 = Dense(self.fc2_dims, activation='relu')(dense1) 26 | probs = Dense(self.n_actions, activation='softmax')(dense2) 27 | values = Dense(1, activation='linear')(dense2) 28 | 29 | def custom_loss(y_true, y_pred): 30 | out = K.clip(y_pred, 1e-8, 1-1e-8) 31 | log_lik = y_true*K.log(out) 32 | 33 | return K.sum(-log_lik*delta) 34 | 35 | actor = Model(input=[input, delta], output=[probs]) 36 | 37 | actor.compile(optimizer=Adam(lr=self.alpha), loss=custom_loss) 38 | 39 | critic = Model(input=[input], output=[values]) 40 | 41 | critic.compile(optimizer=Adam(lr=self.beta), loss='mean_squared_error') 42 | 43 | policy = Model(input=[input], output=[probs]) 44 | 45 | return actor, critic, policy 46 | 47 | def choose_action(self, observation): 48 | state = observation[np.newaxis, :] 49 | probabilities = self.policy.predict(state)[0] 50 | action = np.random.choice(self.action_space, p=probabilities) 51 | 52 | return action 53 | 
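    # learn() below performs one actor-critic update from a single transition:
    # the critic predicts V(s) and V(s'), the TD target is
    # reward + gamma * V(s') * (1 - done), and delta = target - V(s) is fed to
    # the custom loss above so that -log pi(a|s) is weighted by the TD error.
    # The chosen action is one-hot encoded because the actor outputs a softmax
    # over the discrete action space.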
54 | def learn(self, state, action, reward, state_, done): 55 | state = state[np.newaxis,:] 56 | state_ = state_[np.newaxis,:] 57 | critic_value_ = self.critic.predict(state_) 58 | critic_value = self.critic.predict(state) 59 | 60 | target = reward + self.gamma*critic_value_*(1-int(done)) 61 | delta = target - critic_value 62 | 63 | actions = np.zeros([1, self.n_actions]) 64 | actions[np.arange(1), action] = 1 65 | 66 | self.actor.fit([state, delta], actions, verbose=0) 67 | 68 | self.critic.fit(state, target, verbose=0) 69 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/actor_critic_replay_torch.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | 7 | class ReplayBuffer(): 8 | def __init__(self, max_size, input_shape): 9 | self.mem_size = max_size 10 | self.mem_cntr = 0 11 | self.state_memory = np.zeros((self.mem_size, *input_shape), 12 | dtype=np.float32) 13 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 14 | dtype=np.float32) 15 | self.log_probs = np.zeros(self.mem_size, dtype=np.float32) 16 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 17 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.uint8) 18 | 19 | def store_transition(self, state, log_prob, reward, state_, done): 20 | index = self.mem_cntr % self.mem_size 21 | self.state_memory[index] = state 22 | self.new_state_memory[index] = state_ 23 | self.log_probs[index] = log_prob 24 | self.reward_memory[index] = reward 25 | self.terminal_memory[index] = done 26 | self.mem_cntr += 1 27 | 28 | def sample_buffer(self, batch_size): 29 | max_mem = min(self.mem_cntr, self.mem_size) 30 | batch = np.random.choice(max_mem, batch_size, replace=False) 31 | 32 | states = self.state_memory[batch] 33 | probs = self.log_probs[batch] 34 | rewards = self.reward_memory[batch] 35 | states_ = self.new_state_memory[batch] 36 | terminal = self.terminal_memory[batch] 37 | 38 | return states, probs, rewards, states_, terminal 39 | 40 | class ActorCriticNetwork(nn.Module): 41 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims, 42 | n_actions): 43 | super(ActorCriticNetwork, self).__init__() 44 | self.input_dims = input_dims 45 | self.fc1_dims = fc1_dims 46 | self.fc2_dims = fc2_dims 47 | self.n_actions = n_actions 48 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 49 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 50 | self.pi = nn.Linear(self.fc2_dims, n_actions) 51 | self.v = nn.Linear(self.fc2_dims, 1) 52 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 53 | 54 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cuda:1') 55 | self.to(self.device) 56 | 57 | def forward(self, state): 58 | x = F.relu(self.fc1(state)) 59 | x = F.relu(self.fc2(x)) 60 | pi = self.pi(x) 61 | v = self.v(x) 62 | return (pi, v) 63 | 64 | class Agent(): 65 | def __init__(self, lr, input_dims, n_actions, gamma=0.99, 66 | l1_size=256, l2_size=256, batch_size=32, 67 | mem_size=1000000): 68 | self.gamma = gamma 69 | self.batch_size = batch_size 70 | self.memory = ReplayBuffer(mem_size, input_dims) 71 | self.actor_critic = ActorCriticNetwork(lr, input_dims, l1_size, 72 | l2_size, n_actions=n_actions) 73 | self.log_probs = [] 74 | 75 | def store_transition(self, state, prob, reward, state_, done): 76 | self.memory.store_transition(state, prob, reward, 
state_, done) 77 | 78 | def choose_action(self, observation): 79 | state = T.tensor([observation]).to(self.actor_critic.device) 80 | probabilities, _ = self.actor_critic.forward(state) 81 | probabilities = F.softmax(probabilities) 82 | action_probs = T.distributions.Categorical(probabilities) 83 | action = action_probs.sample() 84 | log_probs = action_probs.log_prob(action) 85 | 86 | return action.item(), log_probs 87 | 88 | def learn(self): 89 | if self.memory.mem_cntr < self.batch_size: 90 | return 91 | self.actor_critic.optimizer.zero_grad() 92 | 93 | state, prob, reward, new_state, done = \ 94 | self.memory.sample_buffer(self.batch_size) 95 | 96 | states = T.tensor(state).to(self.actor_critic.device) 97 | probs = T.tensor(prob).to(self.actor_critic.device) 98 | rewards = T.tensor(reward).to(self.actor_critic.device) 99 | dones = T.tensor(done).to(self.actor_critic.device) 100 | states_ = T.tensor(new_state).to(self.actor_critic.device) 101 | 102 | _, critic_value_ = self.actor_critic.forward(states_) 103 | _, critic_value = self.actor_critic.forward(states) 104 | 105 | critic_value_[dones] = 0.0 106 | 107 | delta = rewards + self.gamma*critic_value_ 108 | 109 | actor_loss = -T.mean(probs*(delta-critic_value)) 110 | critic_loss = F.mse_loss(delta, critic_value) 111 | 112 | (actor_loss + critic_loss).backward() 113 | 114 | self.actor_critic.optimizer.step() 115 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/actor_critic/cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/continuous_mountain_car_actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from actor_critic_continuous import Agent 4 | import matplotlib.pyplot as plt 5 | from utils import plotLearning 6 | from gym import wrappers 7 | 8 | 9 | if __name__ == '__main__': 10 | agent = Agent(alpha=0.000005, beta=0.00001, input_dims=[2], gamma=0.99, 11 | layer1_size=256, layer2_size=256) 12 | 13 | env = gym.make('MountainCarContinuous-v0') 14 | score_history = [] 15 | num_episodes = 100 16 | for i in range(num_episodes): 17 | #env = wrappers.Monitor(env, "tmp/mountaincar-continuous-trained-1", 18 | # video_callable=lambda episode_id: True, force=True) 19 | done = False 20 | score = 0 21 | observation = env.reset() 22 | while not done: 23 | action = np.array(agent.choose_action(observation)).reshape((1,)) 24 | observation_, reward, done, info = env.step(action) 25 | agent.learn(observation, reward, observation_, done) 26 | observation = observation_ 27 | score += reward 28 | score_history.append(score) 29 | print('episode: ', i,'score: %.2f' % score) 30 | filename = 'mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png' 31 | plotLearning(score_history, filename=filename, window=20) 32 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/discrete_cartpole.py: -------------------------------------------------------------------------------- 1 | 
import numpy as np 2 | import gym 3 | from actor_critic_discrete import Agent 4 | import matplotlib.pyplot as plt 5 | from utils import plotLearning 6 | from gym import wrappers 7 | 8 | 9 | if __name__ == '__main__': 10 | agent = Agent(alpha=0.0001, beta=0.0005, input_dims=[4], gamma=0.99, 11 | n_actions=2, layer1_size=32, layer2_size=32) 12 | 13 | env = gym.make('CartPole-v1') 14 | score_history = [] 15 | score = 0 16 | num_episodes = 2500 17 | for i in range(num_episodes): 18 | print('episode: ', i,'score: %.3f' % score) 19 | 20 | 21 | #env = wrappers.Monitor(env, "tmp/cartpole-untrained", 22 | # video_callable=lambda episode_id: True, force=True) 23 | done = False 24 | score = 0 25 | observation = env.reset() 26 | while not done: 27 | action = agent.choose_action(observation) 28 | observation_, reward, done, info = env.step(action) 29 | agent.learn(observation, reward, observation_, done) 30 | observation = observation_ 31 | score += reward 32 | score_history.append(score) 33 | 34 | filename = 'cartpole-discrete-actor-critic-alpha0001-beta0005-32x32fc-1500games.png' 35 | plotLearning(score_history, filename=filename, window=10) 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/main_keras_actor_critic_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym, os 2 | from actor_critic_keras import Agent 3 | from utils import plotLearning 4 | from gym import wrappers 5 | import numpy as np 6 | 7 | if __name__ == '__main__': 8 | agent = Agent(alpha=0.00001, beta=0.00005) 9 | 10 | env = gym.make('LunarLander-v2') 11 | score_history = [] 12 | num_episodes = 2000 13 | 14 | for i in range(num_episodes): 15 | done = False 16 | score = 0 17 | observation = env.reset() 18 | while not done: 19 | action = agent.choose_action(observation) 20 | observation_, reward, done, info = env.step(action) 21 | agent.learn(observation, action, reward, observation_, done) 22 | observation = observation_ 23 | score += reward 24 | 25 | score_history.append(score) 26 | avg_score = np.mean(score_history[-100:]) 27 | print('episode: ', i,'score: %.2f' % score, 28 | 'avg score %.2f' % avg_score) 29 | 30 | filename = 'LunarLander.png' 31 | plotLearning(score_history, filename=filename, window=100) 32 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/main_torch_actor_critic_replay_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from actor_critic_replay_torch import Agent 4 | from utils import plotLearning 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | num_games = 1500 9 | agent = Agent(gamma=0.99, lr=1e-5, input_dims=[8], n_actions=4, 10 | l1_size=256, l2_size=256) 11 | 12 | filename = 'LunarLander-ActorCriticNaiveReplay-256-256-Adam-lr00001.png' 13 | scores = [] 14 | 15 | for i in range(num_games): 16 | done = False 17 | observation = env.reset() 18 | score = 0 19 | 20 | while not done: 21 | action, prob = agent.choose_action(observation) 22 | observation_, reward, done, info = env.step(action) 23 | score += reward 24 | agent.store_transition(observation, prob, 25 | reward, observation_, int(done)) 26 | agent.learn() 27 | observation = observation_ 28 | 29 | scores.append(score) 30 | avg_score = np.mean(scores[max(0, i-100):(i+1)]) 31 | print('episode: ', i,'score %.1f ' % score, 
32 | ' average score %.1f' % avg_score) 33 | 34 | x = [i+1 for i in range(num_games)] 35 | plotLearning(scores, filename, x) 36 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/actor_critic/mountaincar-continuous-old-actor-critic-alpha000005-256x256fc-100games.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/actor_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.optimizers import Adam 3 | import tensorflow_probability as tfp 4 | from networks import ActorCriticNetwork 5 | 6 | class Agent: 7 | def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2): 8 | self.gamma = gamma 9 | self.n_actions = n_actions 10 | self.action = None 11 | self.action_space = [i for i in range(self.n_actions)] 12 | 13 | self.actor_critic = ActorCriticNetwork(n_actions=n_actions) 14 | 15 | self.actor_critic.compile(optimizer=Adam(learning_rate=alpha)) 16 | 17 | 18 | def choose_action(self, observation): 19 | state = tf.convert_to_tensor([observation]) 20 | _, probs = self.actor_critic(state) 21 | 22 | action_probabilities = tfp.distributions.Categorical(probs=probs) 23 | action = action_probabilities.sample() 24 | log_prob = action_probabilities.log_prob(action) 25 | self.action = action 26 | 27 | return action.numpy()[0] 28 | 29 | def save_models(self): 30 | print('... saving models ...') 31 | self.actor_critic.save_weights(self.actor_critic.checkpoint_file) 32 | 33 | def load_models(self): 34 | print('... 
loading models ...') 35 | self.actor_critic.load_weights(self.actor_critic.checkpoint_file) 36 | 37 | def learn(self, state, reward, state_, done): 38 | state = tf.convert_to_tensor([state], dtype=tf.float32) 39 | state_ = tf.convert_to_tensor([state_], dtype=tf.float32) 40 | reward = tf.convert_to_tensor(reward, dtype=tf.float32) # not fed to NN 41 | with tf.GradientTape(persistent=True) as tape: 42 | state_value, probs = self.actor_critic(state) 43 | state_value_, _ = self.actor_critic(state_) 44 | state_value = tf.squeeze(state_value) 45 | state_value_ = tf.squeeze(state_value_) 46 | 47 | action_probs = tfp.distributions.Categorical(probs=probs) 48 | log_prob = action_probs.log_prob(self.action) 49 | 50 | delta = reward + self.gamma*state_value_*(1-int(done)) - state_value 51 | actor_loss = -log_prob*delta 52 | critic_loss = delta**2 53 | total_loss = actor_loss + critic_loss 54 | 55 | gradient = tape.gradient(total_loss, self.actor_critic.trainable_variables) 56 | self.actor_critic.optimizer.apply_gradients(zip( 57 | gradient, self.actor_critic.trainable_variables)) 58 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/cartpole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/cartpole.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from actor_critic import Agent 4 | from utils import plot_learning_curve 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | #env = gym.make('LunarLander-v2') 9 | env = gym.make('CartPole-v0') 10 | agent = Agent(alpha=1e-5, n_actions=env.action_space.n) 11 | n_games = 1800 12 | # uncomment this line and do a mkdir tmp && mkdir video if you want to 13 | # record video of the agent playing the game. 
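    # Note: gym.wrappers.Monitor has been removed from newer Gym releases;
    # gym.wrappers.RecordVideo is the usual replacement there. Whether that
    # applies depends on the installed Gym version, so this is only a pointer.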
14 | #env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True) 15 | filename = 'cartpole_1e-5_1024x512_1800games.png' 16 | 17 | figure_file = 'plots/' + filename 18 | 19 | best_score = env.reward_range[0] 20 | score_history = [] 21 | load_checkpoint = False 22 | 23 | if load_checkpoint: 24 | agent.load_models() 25 | 26 | for i in range(n_games): 27 | observation = env.reset() 28 | done = False 29 | score = 0 30 | while not done: 31 | action = agent.choose_action(observation) 32 | observation_, reward, done, info = env.step(action) 33 | score += reward 34 | if not load_checkpoint: 35 | agent.learn(observation, reward, observation_, done) 36 | observation = observation_ 37 | score_history.append(score) 38 | avg_score = np.mean(score_history[-100:]) 39 | 40 | if avg_score > best_score: 41 | best_score = avg_score 42 | if not load_checkpoint: 43 | agent.save_models() 44 | 45 | print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score) 46 | 47 | if not load_checkpoint: 48 | x = [i+1 for i in range(n_games)] 49 | plot_learning_curve(x, score_history, figure_file) 50 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.layers import Dense 4 | 5 | class ActorCriticNetwork(keras.Model): 6 | def __init__(self, n_actions, fc1_dims=1024, fc2_dims=512, 7 | name='actor_critic', chkpt_dir='tmp/actor_critic'): 8 | super(ActorCriticNetwork, self).__init__() 9 | self.fc1_dims = fc1_dims 10 | self.fc2_dims = fc2_dims 11 | self.n_actions = n_actions 12 | self.model_name = name 13 | self.checkpoint_dir = chkpt_dir 14 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ac') 15 | 16 | self.fc1 = Dense(self.fc1_dims, activation='relu') 17 | self.fc2 = Dense(self.fc2_dims, activation='relu') 18 | self.v = Dense(1, activation=None) 19 | self.pi = Dense(n_actions, activation='softmax') 20 | 21 | def call(self, state): 22 | value = self.fc1(state) 23 | value = self.fc2(value) 24 | 25 | v = self.v(value) 26 | pi = self.pi(value) 27 | 28 | return v, pi 29 | 30 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/tensorflow2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/torch_discrete_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from torch_actor_critic_discrete import NewAgent 3 | from utils import plotLearning 4 | from gym import wrappers 5 | 6 | 7 | if __name__ == '__main__': 8 | agent = NewAgent(alpha=0.00001, input_dims=[8], gamma=0.99, 9 | n_actions=4, layer1_size=2048, layer2_size=512) 10 | 11 | env = gym.make('LunarLander-v2') 12 | score_history = [] 13 | score = 0 14 | num_episodes = 2000 15 | for i in 
range(num_episodes): 16 | 17 | #env = wrappers.Monitor(env, "tmp/lunar-lander", 18 | # video_callable=lambda episode_id: True, force=True) 19 | done = False 20 | score = 0 21 | observation = env.reset() 22 | while not done: 23 | action = agent.choose_action(observation) 24 | observation_, reward, done, info = env.step(action) 25 | agent.learn(observation, reward, observation_, done) 26 | observation = observation_ 27 | score += reward 28 | 29 | score_history.append(score) 30 | print('episode: ', i,'score: %.2f' % score) 31 | 32 | filename = 'Lunar-Lander-actor-critic-new-agent-alpha00001-beta00005-2048x512fc-2000games.png' 33 | plotLearning(score_history, filename=filename, window=50) 34 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/actor_critic/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plotLearning(scores, filename, x=None, window=5): 5 | N = len(scores) 6 | running_avg = np.empty(N) 7 | for t in range(N): 8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)]) 9 | if x is None: 10 | x = [i for i in range(N)] 11 | plt.ylabel('Score') 12 | plt.xlabel('Game') 13 | plt.plot(x, running_avg) 14 | plt.savefig(filename) -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/main_keras_reinforce_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from gym import wrappers 5 | from reinforce_keras import Agent 6 | from utils import plotLearning 7 | 8 | if __name__ == '__main__': 9 | agent = Agent(ALPHA=0.0005, input_dims=8, GAMMA=0.99, 10 | n_actions=4, layer1_size=64, layer2_size=64) 11 | 12 | env = gym.make('LunarLander-v2') 13 | score_history = [] 14 | 15 | num_episodes = 2000 16 | 17 | for i in range(num_episodes): 18 | done = False 19 | score = 0 20 | observation = env.reset() 21 | while not done: 22 | action = agent.choose_action(observation) 23 | observation_, reward, done, info = env.step(action) 24 | agent.store_transition(observation, action, reward) 25 | observation = observation_ 26 | score += reward 27 | score_history.append(score) 28 | 29 | _ = agent.learn() 30 | print('episode: ', i,'score: %.1f' % score, 31 | 'average score %.1f' % np.mean(score_history[max(0, i-100):(i+1)])) 32 | 33 | filename = 'lunar-lander-keras-64x64-alpha0005-2000games.png' 34 | plotLearning(score_history, filename=filename, window=100) 35 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/main_tf_reinforce_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from reinforce_tf import PolicyGradientAgent 4 | import matplotlib.pyplot as plt 5 | from utils import plotLearning 6 | from gym import wrappers 7 | 8 | if __name__ == '__main__': 9 | agent = PolicyGradientAgent(ALPHA=0.0005, input_dims=8, GAMMA=0.99, 10 | n_actions=4, layer1_size=64, layer2_size=64, 11 | chkpt_dir='tmp/lunar-lander-ckpt') 12 | #agent.load_checkpoint() 13 | env = gym.make('LunarLander-v2') 14 | score_history = [] 15 | score = 0 16 | num_episodes = 2500 17 | #env = wrappers.Monitor(env, "tmp/lunar-lander", 18 | # video_callable=lambda episode_id: True, force=True) 19 | for i in 
range(num_episodes): 20 | print('episode: ', i,'score: ', score) 21 | done = False 22 | score = 0 23 | observation = env.reset() 24 | while not done: 25 | action = agent.choose_action(observation) 26 | observation_, reward, done, info = env.step(action) 27 | agent.store_transition(observation, action, reward) 28 | observation = observation_ 29 | score += reward 30 | score_history.append(score) 31 | agent.learn() 32 | #agent.save_checkpoint() 33 | #filename = 'lunar-lander-alpha0005-64x64fc-newG.png' 34 | #plotLearning(score_history, filename=filename, window=25) 35 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/main_tf_reinforce_space_invaders.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from reinforce_cnn_tf import PolicyGradientAgent 4 | from utils import plotLearning 5 | from gym import wrappers 6 | 7 | 8 | def preprocess(observation): 9 | return np.mean(observation[15:200, 30:125], axis=2) 10 | 11 | 12 | def stack_frames(stacked_frames, frame, buffer_size): 13 | if stacked_frames is None: 14 | stacked_frames = np.zeros((buffer_size, *frame.shape)) 15 | for idx, _ in enumerate(stacked_frames): 16 | stacked_frames[idx,:] = frame 17 | else: 18 | stacked_frames[0:buffer_size-1,:] = stacked_frames[1:,:] 19 | stacked_frames[buffer_size-1, :] = frame 20 | 21 | return stacked_frames 22 | 23 | if __name__ == '__main__': 24 | load_checkpoint = False 25 | agent = PolicyGradientAgent(ALPHA=0.001, GAMMA=0.9, n_actions=6, fc1=256, 26 | chkpt_dir='tmp/checkpoint-newG-0p001', gpu={'GPU':1}) 27 | filename = 'space-invaders-alpha001-newGcalc.png' 28 | print('will use ', filename, ' and ', agent.gpu) 29 | if load_checkpoint: 30 | agent.load_checkpoint() 31 | env = gym.make('SpaceInvaders-v0') 32 | score_history = [] 33 | score = 0 34 | num_episodes = 1000 35 | stack_size = 4 36 | #env = wrappers.Monitor(env, "tmp/space-invaders-newG-0p003", 37 | # video_callable=lambda episode_id: True, force=True) 38 | for i in range(num_episodes): 39 | done = False 40 | 41 | avg_score = np.mean(score_history[max(0, i-20):(i+1)]) 42 | if i % 20 == 0 and i > 0: 43 | print('episode: ', i,'score: ', score, ' average score %.3f' % avg_score) 44 | plotLearning(score_history, filename=filename, window=20) 45 | else: 46 | print('episode: ', i,'score: ', score) 47 | observation = env.reset() 48 | observation = preprocess(observation) 49 | stacked_frames = None 50 | stacked_frames = stack_frames(stacked_frames, observation, stack_size) 51 | score = 0 52 | while not done: 53 | action = agent.choose_action(stacked_frames) 54 | observation, reward, done, info = env.step(action) 55 | observation = preprocess(observation) 56 | stacked_frames = stack_frames(stacked_frames, observation, stack_size) 57 | agent.store_transition(observation, action, reward) 58 | 59 | score += reward 60 | score_history.append(score) 61 | 62 | if i % 10 == 0: 63 | agent.learn() 64 | agent.save_checkpoint() 65 | plotLearning(score_history, filename=filename, window=20) 66 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/main_torch_reinforce_lunar_lander.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from reinforce_torch import PolicyGradientAgent 4 | import matplotlib.pyplot as plt 5 | from utils import plotLearning 6 | from gym import 
wrappers 7 | 8 | if __name__ == '__main__': 9 | agent = PolicyGradientAgent(ALPHA=0.001, input_dims=[8], GAMMA=0.99, 10 | n_actions=4, layer1_size=128, layer2_size=128) 11 | #agent.load_checkpoint() 12 | env = gym.make('LunarLander-v2') 13 | score_history = [] 14 | score = 0 15 | num_episodes = 2500 16 | #env = wrappers.Monitor(env, "tmp/lunar-lander", 17 | # video_callable=lambda episode_id: True, force=True) 18 | for i in range(num_episodes): 19 | print('episode: ', i,'score: ', score) 20 | done = False 21 | score = 0 22 | observation = env.reset() 23 | while not done: 24 | action = agent.choose_action(observation) 25 | observation_, reward, done, info = env.step(action) 26 | agent.store_rewards(reward) 27 | observation = observation_ 28 | score += reward 29 | score_history.append(score) 30 | agent.learn() 31 | #agent.save_checkpoint() 32 | filename = 'lunar-lander-alpha001-128x128fc-newG.png' 33 | plotLearning(score_history, filename=filename, window=25) 34 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/reinforce_keras.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Dense, Activation, Input 2 | from keras.models import Model, load_model 3 | from keras.optimizers import Adam 4 | import keras.backend as K 5 | import numpy as np 6 | 7 | class Agent(object): 8 | def __init__(self, ALPHA, GAMMA=0.99, n_actions=4, 9 | layer1_size=16, layer2_size=16, input_dims=128, 10 | fname='reinforce.h5'): 11 | self.gamma = GAMMA 12 | self.lr = ALPHA 13 | self.G = 0 14 | self.input_dims = input_dims 15 | self.fc1_dims = layer1_size 16 | self.fc2_dims = layer2_size 17 | self.n_actions = n_actions 18 | self.state_memory = [] 19 | self.action_memory = [] 20 | self.reward_memory = [] 21 | self.policy, self.predict = self.build_policy_network() 22 | self.action_space = [i for i in range(n_actions)] 23 | 24 | self.model_file = fname 25 | 26 | def build_policy_network(self): 27 | input = Input(shape=(self.input_dims,)) 28 | advantages = Input(shape=[1]) 29 | dense1 = Dense(self.fc1_dims, activation='relu')(input) 30 | dense2 = Dense(self.fc2_dims, activation='relu')(dense1) 31 | probs = Dense(self.n_actions, activation='softmax')(dense2) 32 | 33 | def custom_loss(y_true, y_pred): 34 | out = K.clip(y_pred, 1e-8, 1-1e-8) 35 | log_lik = y_true*K.log(out) 36 | 37 | return K.sum(-log_lik*advantages) 38 | 39 | policy = Model(input=[input, advantages], output=[probs]) 40 | 41 | policy.compile(optimizer=Adam(lr=self.lr), loss=custom_loss) 42 | 43 | predict = Model(input=[input], output=[probs]) 44 | 45 | return policy, predict 46 | 47 | def choose_action(self, observation): 48 | state = observation[np.newaxis, :] 49 | probabilities = self.predict.predict(state)[0] 50 | action = np.random.choice(self.action_space, p=probabilities) 51 | 52 | return action 53 | 54 | def store_transition(self, observation, action, reward): 55 | self.state_memory.append(observation) 56 | self.action_memory.append(action) 57 | self.reward_memory.append(reward) 58 | 59 | def learn(self): 60 | state_memory = np.array(self.state_memory) 61 | action_memory = np.array(self.action_memory) 62 | reward_memory = np.array(self.reward_memory) 63 | 64 | actions = np.zeros([len(action_memory), self.n_actions]) 65 | actions[np.arange(len(action_memory)), action_memory] = 1 66 | 67 | G = np.zeros_like(reward_memory) 68 | for t in range(len(reward_memory)): 69 | G_sum = 0 70 | discount = 1 71 | for k in range(t, 
len(reward_memory)): 72 | G_sum += reward_memory[k] * discount 73 | discount *= self.gamma 74 | G[t] = G_sum 75 | mean = np.mean(G) 76 | std = np.std(G) if np.std(G) > 0 else 1 77 | self.G = (G - mean) / std 78 | 79 | cost = self.policy.train_on_batch([state_memory, self.G], actions) 80 | 81 | self.state_memory = [] 82 | self.action_memory = [] 83 | self.reward_memory = [] 84 | 85 | return cost 86 | 87 | def save_model(self): 88 | self.policy.save(self.model_file) 89 | 90 | def load_model(self): 91 | self.policy = load_model(self.model_file) 92 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/reinforce_tf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | class PolicyGradientAgent(): 6 | def __init__(self, ALPHA, GAMMA=0.95, n_actions=4, 7 | layer1_size=16, layer2_size=16, input_dims=128, 8 | chkpt_dir='tmp/checkpoints'): 9 | self.lr = ALPHA 10 | self.gamma = GAMMA 11 | self.n_actions = n_actions 12 | self.action_space = [i for i in range(n_actions)] 13 | self.layer1_size = layer1_size 14 | self.layer2_size = layer2_size 15 | self.input_dims = input_dims 16 | self.state_memory = [] 17 | self.action_memory = [] 18 | self.reward_memory = [] 19 | self.sess = tf.Session() 20 | self.build_net() 21 | self.sess.run(tf.global_variables_initializer()) 22 | self.saver = tf.train.Saver() 23 | self.checkpoint_file = os.path.join(chkpt_dir,'policy_network.ckpt') 24 | 25 | def build_net(self): 26 | with tf.variable_scope('parameters'): 27 | self.input = tf.placeholder(tf.float32, 28 | shape=[None, self.input_dims], name='input') 29 | self.label = tf.placeholder(tf.int32, 30 | shape=[None, ], name='label') 31 | self.G = tf.placeholder(tf.float32, shape=[None,], name='G') 32 | 33 | with tf.variable_scope('layer1'): 34 | l1 = tf.layers.dense(inputs=self.input, units=self.layer1_size, 35 | activation=tf.nn.relu, 36 | kernel_initializer=tf.contrib.layers.xavier_initializer()) 37 | 38 | with tf.variable_scope('layer2'): 39 | l2 = tf.layers.dense(inputs=l1, units=self.layer2_size, 40 | activation=tf.nn.relu, 41 | kernel_initializer=tf.contrib.layers.xavier_initializer()) 42 | 43 | with tf.variable_scope('layer3'): 44 | l3 = tf.layers.dense(inputs=l2, units=self.n_actions, 45 | activation=None, 46 | kernel_initializer=tf.contrib.layers.xavier_initializer()) 47 | self.actions = tf.nn.softmax(l3, name='actions') 48 | 49 | with tf.variable_scope('loss'): 50 | negative_log_probability = tf.nn.sparse_softmax_cross_entropy_with_logits( 51 | logits=l3, labels=self.label) 52 | 53 | loss = negative_log_probability * self.G 54 | 55 | with tf.variable_scope('train'): 56 | self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) 57 | 58 | def choose_action(self, observation): 59 | observation = observation[np.newaxis, :] 60 | probabilities = self.sess.run(self.actions, feed_dict={self.input: observation})[0] 61 | action = np.random.choice(self.action_space, p = probabilities ) 62 | 63 | return action 64 | 65 | def store_transition(self, observation, action, reward): 66 | self.state_memory.append(observation) 67 | self.action_memory.append(action) 68 | self.reward_memory.append(reward) 69 | 70 | def learn(self): 71 | state_memory = np.array(self.state_memory) 72 | action_memory = np.array(self.action_memory) 73 | reward_memory = np.array(self.reward_memory) 74 | 75 | G = np.zeros_like(reward_memory) 76 | for t in 
range(len(reward_memory)):
77 |             G_sum = 0
78 |             discount = 1
79 |             for k in range(t, len(reward_memory)):
80 |                 G_sum += reward_memory[k] * discount
81 |                 discount *= self.gamma
82 |             G[t] = G_sum
83 |         mean = np.mean(G)
84 |         std = np.std(G) if np.std(G) > 0 else 1
85 |         G = (G - mean) / std
86 | 
87 |         _ = self.sess.run(self.train_op,
88 |                           feed_dict={self.input: state_memory,
89 |                                      self.label: action_memory,
90 |                                      self.G: G})
91 |         self.state_memory = []
92 |         self.action_memory = []
93 |         self.reward_memory = []
94 | 
95 |     def load_checkpoint(self):
96 |         print("...Loading checkpoint...")
97 |         self.saver.restore(self.sess, self.checkpoint_file)
98 | 
99 |     def save_checkpoint(self):
100 |         #print("...Saving checkpoint...")
101 |         self.saver.save(self.sess, self.checkpoint_file)
102 | 
--------------------------------------------------------------------------------
/ReinforcementLearning/PolicyGradient/reinforce/reinforce_torch.py:
--------------------------------------------------------------------------------
1 | import torch as T
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 | 
7 | class PolicyNetwork(nn.Module):
8 |     def __init__(self, ALPHA, input_dims, fc1_dims, fc2_dims,
9 |                  n_actions):
10 |         super(PolicyNetwork, self).__init__()
11 |         self.input_dims = input_dims
12 |         self.fc1_dims = fc1_dims
13 |         self.fc2_dims = fc2_dims
14 |         self.n_actions = n_actions
15 |         self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
16 |         self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
17 |         self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
18 |         self.optimizer = optim.Adam(self.parameters(), lr=ALPHA)
19 | 
20 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
21 |         self.to(self.device)
22 | 
23 |     def forward(self, observation):
24 |         state = T.Tensor(observation).to(self.device)
25 |         x = F.relu(self.fc1(state))
26 |         x = F.relu(self.fc2(x))
27 |         x = self.fc3(x)
28 |         return x
29 | 
30 | class PolicyGradientAgent(object):
31 |     def __init__(self, ALPHA, input_dims, GAMMA=0.99, n_actions=4,
32 |                  layer1_size=256, layer2_size=256):
33 |         self.gamma = GAMMA
34 |         self.reward_memory = []
35 |         self.action_memory = []
36 |         self.policy = PolicyNetwork(ALPHA, input_dims, layer1_size, layer2_size,
37 |                                     n_actions)
38 | 
39 |     def choose_action(self, observation):
40 |         probabilities = F.softmax(self.policy.forward(observation), dim=-1)
41 |         action_probs = T.distributions.Categorical(probabilities)
42 |         action = action_probs.sample()
43 |         log_probs = action_probs.log_prob(action)
44 |         self.action_memory.append(log_probs)
45 | 
46 |         return action.item()
47 | 
48 |     def store_rewards(self, reward):
49 |         self.reward_memory.append(reward)
50 | 
51 |     def learn(self):
52 |         self.policy.optimizer.zero_grad()
53 |         # Assumes only a single episode for reward_memory
54 |         G = np.zeros_like(self.reward_memory, dtype=np.float64)
55 |         for t in range(len(self.reward_memory)):
56 |             G_sum = 0
57 |             discount = 1
58 |             for k in range(t, len(self.reward_memory)):
59 |                 G_sum += self.reward_memory[k] * discount
60 |                 discount *= self.gamma
61 |             G[t] = G_sum
62 |         mean = np.mean(G)
63 |         std = np.std(G) if np.std(G) > 0 else 1
64 |         G = (G - mean) / std
65 | 
66 |         G = T.tensor(G, dtype=T.float).to(self.policy.device)
67 | 
68 |         loss = 0
69 |         for g, logprob in zip(G, self.action_memory):
70 |             loss += -g * logprob
71 | 
72 |         loss.backward()
73 |         self.policy.optimizer.step()
74 | 
75 |         self.action_memory = []
76 |         self.reward_memory = []
77 | 
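# Note on the return calculation in learn(): the nested loops compute the
# Monte Carlo return G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
# in O(T^2) time. A minimal O(T) sketch of the same quantity (an alternative
# rewrite, assuming the same reward_memory and gamma, not what the file does)
# accumulates it backwards:
#
#     G = np.zeros(len(self.reward_memory), dtype=np.float64)
#     running = 0.0
#     for t in reversed(range(len(self.reward_memory))):
#         running = self.reward_memory[t] + self.gamma * running
#         G[t] = running
#
# The mean/std normalization and the log-prob weighted loss stay unchanged.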
-------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/space-invaders-alpha001-gamma0p9-decay0p99-newGcalc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/reinforce/space-invaders-alpha001-gamma0p9-decay0p99-newGcalc.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/lunar-lander-tf2-256x256-alpha0005-2000games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Youtube-Code-Repository/eb3aa9733158a4f7c4ba1fefaa812b27ffd889b6/ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/lunar-lander-tf2-256x256-alpha0005-2000games.png -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/main.py: -------------------------------------------------------------------------------- 1 | # if you have more than 1 gpu, use device '0' or '1' to assign to a gpu 2 | #import os 3 | #os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 4 | #os.environ['CUDA_VISIBLE_DEVICES'] = '0' 5 | import gym 6 | import numpy as np 7 | from reinforce_tf2 import Agent 8 | from utils import plotLearning 9 | 10 | if __name__ == '__main__': 11 | agent = Agent(alpha=0.0005, gamma=0.99,n_actions=4) 12 | 13 | env = gym.make('LunarLander-v2') 14 | score_history = [] 15 | 16 | num_episodes = 2000 17 | 18 | for i in range(num_episodes): 19 | done = False 20 | score = 0 21 | observation = env.reset() 22 | while not done: 23 | action = agent.choose_action(observation) 24 | observation_, reward, done, info = env.step(action) 25 | agent.store_transition(observation, action, reward) 26 | observation = observation_ 27 | score += reward 28 | score_history.append(score) 29 | 30 | agent.learn() 31 | avg_score = np.mean(score_history[-100:]) 32 | print('episode: ', i,'score: %.1f' % score, 33 | 'average score %.1f' % avg_score) 34 | 35 | filename = 'lunar-lander.png' 36 | plotLearning(score_history, filename=filename, window=100) 37 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Dense 3 | 4 | class PolicyGradientNetwork(keras.Model): 5 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256): 6 | super(PolicyGradientNetwork, self).__init__() 7 | self.fc1_dims = fc1_dims 8 | self.fc2_dims = fc2_dims 9 | self.n_actions = n_actions 10 | 11 | self.fc1 = Dense(self.fc1_dims, activation='relu') 12 | self.fc2 = Dense(self.fc2_dims, activation='relu') 13 | self.pi = Dense(n_actions, activation='softmax') 14 | 15 | def call(self, state): 16 | value = self.fc1(state) 17 | value = self.fc2(value) 18 | 19 | pi = self.pi(value) 20 | 21 | return pi 22 | 23 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/tensorflow2/reinforce_tf2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from networks import PolicyGradientNetwork 3 | import 
tensorflow_probability as tfp 4 | from tensorflow.keras.optimizers import Adam 5 | import numpy as np 6 | 7 | class Agent: 8 | def __init__(self, alpha=0.003, gamma=0.99, n_actions=4, 9 | layer1_size=256, layer2_size=256): 10 | 11 | self.gamma = gamma 12 | self.lr = alpha 13 | self.n_actions = n_actions 14 | self.state_memory = [] 15 | self.action_memory = [] 16 | self.reward_memory = [] 17 | self.policy = PolicyGradientNetwork(n_actions=n_actions) 18 | self.policy.compile(optimizer=Adam(learning_rate=self.lr)) 19 | 20 | def choose_action(self, observation): 21 | state = tf.convert_to_tensor([observation], dtype=tf.float32) 22 | probs = self.policy(state) 23 | action_probs = tfp.distributions.Categorical(probs=probs) 24 | action = action_probs.sample() 25 | 26 | return action.numpy()[0] 27 | 28 | def store_transition(self, observation, action, reward): 29 | self.state_memory.append(observation) 30 | self.action_memory.append(action) 31 | self.reward_memory.append(reward) 32 | 33 | def learn(self): 34 | actions = tf.convert_to_tensor(self.action_memory, dtype=tf.float32) 35 | rewards = np.array(self.reward_memory) 36 | 37 | G = np.zeros_like(rewards) 38 | for t in range(len(rewards)): 39 | G_sum = 0 40 | discount = 1 41 | for k in range(t, len(rewards)): 42 | G_sum += rewards[k] * discount 43 | discount *= self.gamma 44 | G[t] = G_sum 45 | 46 | with tf.GradientTape() as tape: 47 | loss = 0 48 | for idx, (g, state) in enumerate(zip(G, self.state_memory)): 49 | state = tf.convert_to_tensor([state], dtype=tf.float32) 50 | probs = self.policy(state) 51 | action_probs = tfp.distributions.Categorical(probs=probs) 52 | log_prob = action_probs.log_prob(actions[idx]) 53 | loss += -g * tf.squeeze(log_prob) 54 | 55 | gradient = tape.gradient(loss, self.policy.trainable_variables) 56 | self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables)) 57 | 58 | self.state_memory = [] 59 | self.action_memory = [] 60 | self.reward_memory = [] 61 | -------------------------------------------------------------------------------- /ReinforcementLearning/PolicyGradient/reinforce/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plotLearning(scores, filename, x=None, window=5): 5 | N = len(scores) 6 | running_avg = np.empty(N) 7 | for t in range(N): 8 | running_avg[t] = np.mean(scores[max(0, t-window):(t+1)]) 9 | if x is None: 10 | x = [i for i in range(N)] 11 | plt.ylabel('Score') 12 | plt.xlabel('Game') 13 | plt.plot(x, running_avg) 14 | plt.savefig(filename) -------------------------------------------------------------------------------- /basic_encryption/caesar.py: -------------------------------------------------------------------------------- 1 | from common import alphabet 2 | 3 | 4 | def translate(message, shift, encrypt=True): 5 | new_message = '' 6 | n_chars = len(alphabet) 7 | 8 | for character in message: 9 | char_idx = alphabet.index(character) 10 | if encrypt: 11 | new_char_idx = (char_idx + shift) % n_chars 12 | elif not encrypt: 13 | new_char_idx = (char_idx - shift) % n_chars 14 | new_message += alphabet[new_char_idx] 15 | return new_message 16 | 17 | 18 | cipher_shift = 7 19 | 20 | print('AB->', translate('AB', cipher_shift)) 21 | print('ab->', translate('ab', cipher_shift)) 22 | print('Ab->', translate('Ab', cipher_shift)) 23 | print('aB->', translate('aB', cipher_shift)) 24 | 25 | plaintext = 'This is an encrypted message.' 
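# A quick worked example of the shift arithmetic (a sketch; 'alphabet' comes
# from common.py and holds 26 lowercase + 26 uppercase + 5 punctuation
# characters, 57 symbols in total): with cipher_shift = 7, 'z' sits at index
# 25, so it encrypts to alphabet[(25 + 7) % 57], which wraps into the
# uppercase range and yields 'G'; decryption reverses this with
# alphabet[(32 - 7) % 57].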
26 | ciphertext = translate(plaintext, cipher_shift, True) 27 | print(plaintext, '->', ciphertext) 28 | original_message = translate(ciphertext, cipher_shift, False) 29 | print(ciphertext, '->', original_message) 30 | -------------------------------------------------------------------------------- /basic_encryption/common.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | # https://stackoverflow.com/questions/7001144/range-over-character-in-python 5 | def character_generator(start_char, stop_char): 6 | for char in range(ord(start_char), ord(stop_char)+1): 7 | yield chr(char) 8 | 9 | 10 | def generate_one_time_pad(n_chars, characters): 11 | return ''.join(random.choice(characters) for _ in range(n_chars)) 12 | 13 | 14 | lower_case = list(character_generator('a', 'z')) 15 | upper_case = list(character_generator('A', 'Z')) 16 | punctuation = ['.', ',', ' ', '?', '!'] 17 | 18 | alphabet = lower_case + upper_case + punctuation 19 | -------------------------------------------------------------------------------- /basic_encryption/one_time_pad.py: -------------------------------------------------------------------------------- 1 | from common import alphabet, generate_one_time_pad 2 | 3 | 4 | def translate(message, one_time_pad, encrypt=True): 5 | new_message = '' 6 | 7 | n_chars = len(alphabet) 8 | 9 | for src, key in zip(message, one_time_pad): 10 | char_idx = alphabet.index(src) 11 | pad_idx = alphabet.index(key) 12 | if encrypt: 13 | new_char_idx = (char_idx + pad_idx) % n_chars 14 | elif not encrypt: 15 | new_char_idx = (char_idx - pad_idx) % n_chars 16 | new_message += alphabet[new_char_idx] 17 | 18 | return new_message 19 | 20 | 21 | message = 'This is an encrypted message.' 22 | secret_key = generate_one_time_pad(len(message), alphabet) 23 | encrypted_message = translate(message, secret_key, True) 24 | original_message = translate(encrypted_message, secret_key, False) 25 | 26 | print(message, '->', encrypted_message) 27 | print(encrypted_message, '->', original_message) 28 | -------------------------------------------------------------------------------- /basic_encryption/vignere.py: -------------------------------------------------------------------------------- 1 | from common import alphabet, generate_one_time_pad 2 | 3 | 4 | def make_vignere_table(): 5 | table = [['']] * len(alphabet) 6 | for idx, character in enumerate(alphabet): 7 | row = [] 8 | for char in alphabet[idx:]: 9 | row.append(char) 10 | for char in alphabet[:idx]: 11 | row.append(char) 12 | table[idx] = row 13 | return table 14 | 15 | 16 | def translate(message, vig_table, one_time_pad, encrypt=True): 17 | new_message = '' 18 | 19 | if encrypt: 20 | for src, key in zip(message, one_time_pad): 21 | row = vig_table[:][0].index(key) 22 | col = vig_table[0][:].index(src) 23 | new_message += vig_table[row][col] 24 | elif not encrypt: 25 | for src, key in zip(message, one_time_pad): 26 | row = vig_table[:][0].index(key) 27 | col = vig_table[row][:].index(src) 28 | new_message += vig_table[0][col] 29 | return new_message 30 | 31 | 32 | table = make_vignere_table() 33 | message = 'This is an encrypted message.' 
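# Why the table lookup works (a sketch, assuming the alphabet from common.py):
# row i of the table built by make_vignere_table() is the alphabet rotated
# left by i positions, so table[row][col] == alphabet[(row + col) % len(alphabet)].
# The encryption step in translate() is therefore the same modular addition
# used in one_time_pad.py, just expressed as a tabula recta.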
34 | secret_key = generate_one_time_pad(len(message), alphabet)
35 | encrypted_message = translate(message, table, secret_key, True)
36 | original_message = translate(encrypted_message, table, secret_key, False)
37 | 
38 | print(message, '->', encrypted_message)
39 | print(encrypted_message, '->', original_message)
40 | 
--------------------------------------------------------------------------------
/cmdline.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | from ReinforcementLearning.DeepQLearning.utils import plotLearning
4 | from ReinforcementLearning.DeepQLearning.simple_dqn_torch import Agent
5 | import numpy as np
6 | if __name__ == '__main__':
7 |     parser = argparse.ArgumentParser(
8 |         description='Command line Utility for training RL models')
9 |     # the hyphen makes the argument optional
10 |     parser.add_argument('-n_games', type=int, default=1,
11 |                         help='Number of games to play')
12 |     parser.add_argument('-lr', type=float, default=0.001,
13 |                         help='Learning rate for optimizer')
14 |     parser.add_argument('-eps_end', type=float, default=0.01,
15 |                         help='Final value for epsilon in epsilon-greedy action selection')
16 |     parser.add_argument('-gamma', type=float, default=0.99,
17 |                         help='Discount factor for update equation.')
18 |     parser.add_argument('-env', type=str, default='LunarLander-v2',
19 |                         help='OpenAI gym environment for agent')
20 |     parser.add_argument('-eps_dec', type=float, default=0.996,
21 |                         help='Multiplicative factor for decreasing epsilon')
22 |     parser.add_argument('-eps', type=float, default=1.0,
23 |                         help='Starting value for epsilon in epsilon-greedy action selection')
24 |     parser.add_argument('-max_mem', type=int, default=1000000,
25 |                         help='Maximum size for memory replay buffer')
26 |     parser.add_argument('-dims', type=int, default=8,
27 |                         help='Input dimensions; matches env observation, \
28 |                         must be list or tuple')
29 |     parser.add_argument('-bs', type=int, default=32,
30 |                         help='Batch size for replay memory sampling')
31 |     parser.add_argument('-n_actions', type=int, default=4,
32 |                         help='Number of actions in discrete action space')
33 |     args = parser.parse_args()
34 | 
35 |     env = gym.make(args.env)
36 | 
37 |     args.dims = [args.dims]
38 | 
39 |     agent = Agent(args.gamma, args.eps, args.lr, args.dims, args.bs,
40 |                   args.n_actions, args.max_mem, args.eps_end, args.eps_dec)
41 | 
42 |     eps_history, scores = [], []
43 |     for i in range(args.n_games):
44 |         observation = env.reset()
45 |         done = False
46 |         score = 0
47 |         while not done:
48 |             action = agent.chooseAction(observation)
49 |             observation_, reward, done, info = env.step(action)
50 |             score += reward
51 |             agent.storeTransition(observation, action,
52 |                                   reward, observation_, int(done))
53 |             observation = observation_
54 |             agent.learn()
55 | 
56 |         eps_history.append(agent.EPSILON)
57 |         scores.append(score)
58 | 
59 |         if i % 10 == 0 and i > 0:
60 |             avg_score = np.mean(scores[max(0, i-10):(i+1)])
61 |             print('episode: ', i, 'score: ', score,
62 |                   ' average score %.3f' % avg_score,
63 |                   'epsilon %.3f' % agent.EPSILON)
64 |         else:
65 |             print('episode: ', i, 'score: ', score)
66 | 
67 |     x = [i+1 for i in range(args.n_games)]
68 |     # filename should reflect whatever it is you are varying to tune your
69 |     # agent. For simplicity I'm just showing alpha and gamma, but it can be
70 |     # the epsilons as well. You can even include parameters for the fully
71 |     # connected layers and use them as part of the file name.
72 | filename = args.env + '_alpha' + str(args.lr) + '_gamma' + str(args.gamma) + \ 73 | '.png' 74 | plotLearning(x, scores, eps_history, filename) 75 | -------------------------------------------------------------------------------- /giveaway_scrubbed.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | # These people all indicated they are located in the US/CA or had a means to 5 | # ship to a US/CA address for forwarding. 6 | us_ca_entrants = ["Rintze", "Hasan", "Keith", "Joseph", "Asceptt", "Brian", 7 | "Xiaoyu", "Anik", "Devshank", "Jeremy", "Amin", "Brenton", 8 | "Remi", "Howard", "Michael", "Khizr", "Jay", "Ricardo", 9 | "Matt", "Chris", "Tanner", "Paul", "Pang", "Jose", "David", 10 | "Kurt", "Jesse"] 11 | 12 | # These people indicated they were in a foreign country and did not indicate 13 | # they had the means to ship to a foreign address. 14 | intl_entrants = ["Harsh"] 15 | 16 | # These people did not indicate where they were or their means to forward mail 17 | unknown_entrants = ["Gareth", "Dan", "Dileep", "Zeeshan", "Romin", "Dellan", 18 | "Marcin", "Wouter", "Cecil", "Jamal", "Gabriel", "ATV", 19 | "Violet", "Waqas", "Joy", "Tianqi", "Thomas"] 20 | 21 | random.seed(2022) 22 | 23 | gpu_winner = random.choice(us_ca_entrants) 24 | 25 | all_entrants = us_ca_entrants + intl_entrants + unknown_entrants 26 | 27 | nnai_winner = random.choice(all_entrants) 28 | 29 | dli_winners = [random.choice(all_entrants) for _ in range(5)] 30 | 31 | # Make sure there are no duplicate names, so there is no ambiguity in who won 32 | assert len(np.unique(us_ca_entrants)) == len(us_ca_entrants) 33 | 34 | assert len(np.unique(all_entrants)) == len(all_entrants) 35 | 36 | print('GPU Winner:', gpu_winner) 37 | 38 | print('NeuralNet.ai Subscription Winner:', nnai_winner) 39 | 40 | print('Deep Learning Institute winners:', dli_winners) 41 | -------------------------------------------------------------------------------- /giveaway_scrubbed_3-23.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | all_entrants = ['Henry Magregor', 'Charly B', 'Arjun H', 'Pete O', 'Lolis F', 5 | 'Kaan A', 'Inosiro', 'Brian B', 'Ben C', 'Jorge B', 'Jesse G', 6 | 'Hauke H', 'Pas D', 'Aditya C', 'Marc C', 'Logan G', 'Brian C', 7 | 'Antemasq', 'Alex D', 'Bibek P', 'Andrew S', 'Gonzalo B', 8 | 'Martin P', 'Bikash S', 'William P', 'Daniel A', 'Naomi G', 9 | 'Alex V', 'Chris G', 'Steve L', 'Felix G', 'Greg K', 'x g', 10 | ] 11 | 12 | random.seed(2023) 13 | 14 | gpu_winner = random.choice(all_entrants) 15 | 16 | all_entrants.remove(gpu_winner) 17 | 18 | nnai_winner = random.choice(all_entrants) 19 | 20 | all_entrants.remove(nnai_winner) 21 | 22 | dli_winners = [random.choice(all_entrants) for _ in range(5)] 23 | 24 | # Make sure there are no duplicate names, so there is no ambiguity in who won 25 | assert len(np.unique(all_entrants)) == len(all_entrants) 26 | 27 | print('GPU Winner:', gpu_winner) 28 | 29 | print('NeuralNet.ai Subscription Winner:', nnai_winner) 30 | 31 | print('Deep Learning Institute winners:', dli_winners) 32 | -------------------------------------------------------------------------------- /giveaway_scrubbed_9-22.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | # This time I didn't specify to indicate where you are located and if you had 5 | # the means to ship abroad. 
Silly oversight on my part, but that's life. 6 | # This means I'll do the drawing and email everyone first. If someone overseas 7 | # wins the GPU but can't get it shipped, then I'll subtract their name and draw 8 | # again. 9 | 10 | all_entrants = ['xiaoyu', 'sunil', 'kelvin', 'jacob', 'sean', 'dilith', 11 | 'noctildon', 'lukas_k', 'alex', 'matt_t', 'inosiro', 12 | 'f1datadiver', 'sambaran', 'dean_v_a', 'balaji', 'aditya', 13 | 'brian_cu', 'sim', 'philip', 'antonio', 'roumen', 'marc', 14 | 'william_p', 'michael_f', 'behnood', 'lucas_p', 'ahmed_k', 15 | 'jamal_c', 'luciano_d', 'amir-ul', 'kinal', 'sidhanath', 16 | 'lorenzo', 'michael_w', 'ravi_j', 'brigliano', 'hrovje', 17 | 'daniel_b', 'terry_w', 'jun', 'kurt_b', 'hauke', 'super_dave', 18 | 'george', 'lukas_d', 'waleed', 'clark', 'frak', 'ravi_c', 19 | 'sawaiz', 'ferran', 'jack-ziad', 'christian_g', 'zxavier', 20 | 'daniel_k', 'akash', 'jbene', 'hause', 'jack', 'cristiano', 21 | 'nguyen_q_d', 'tatonata', 'dennis_f', 'till_z', 'dusan', 22 | 'abdennacer', 'antonio_p', 'dilan', 'adam_b', 'brian_co', 23 | 'k_ali', 'matt_r', 'navoda', 'doyun', 'william_s', 'jed_j', 24 | 'bijay', 'bruno', 'shivam', 'arjun_h', 'emil', 'abdulla_m', 25 | 'nick', 'joyce_w', 'abhinav', 'alex_v', 'ruturaj_s'] 26 | 27 | random.seed(2022) 28 | 29 | gpu_winner = random.choice(all_entrants) 30 | 31 | all_entrants.remove(gpu_winner) 32 | 33 | nnai_winner = random.choice(all_entrants) 34 | 35 | all_entrants.remove(nnai_winner) 36 | 37 | dli_winners = [random.choice(all_entrants) for _ in range(5)] 38 | 39 | # Make sure there are no duplicate names, so there is no ambiguity in who won 40 | assert len(np.unique(all_entrants)) == len(all_entrants) 41 | 42 | print('GPU Winner:', gpu_winner) 43 | 44 | print('NeuralNet.ai Subscription Winner:', nnai_winner) 45 | 46 | print('Deep Learning Institute winners:', dli_winners) 47 | -------------------------------------------------------------------------------- /tf_embeddings.py: -------------------------------------------------------------------------------- 1 | import io 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | import tensorflow_datasets as tfds 7 | 8 | #embedding_layer = layers.Embedding(1000, 5) 9 | 10 | #result = embedding_layer(tf.constant([1,2,3])) 11 | 12 | #print(result.numpy()) 13 | #print(result.numpy().shape) 14 | def get_batch_data(): 15 | (train_data, test_data), info = tfds.load('imdb_reviews/subwords8k', 16 | split=(tfds.Split.TRAIN, tfds.Split.TEST), 17 | with_info=True, as_supervised=True) 18 | 19 | encoder = info.features['text'].encoder 20 | #print(encoder.subwords[:20]) 21 | padded_shapes = ([None], ()) 22 | train_batches = train_data.shuffle(1000).padded_batch(10, 23 | padded_shapes=padded_shapes) 24 | test_batches = test_data.shuffle(1000).padded_batch(10, 25 | padded_shapes=padded_shapes) 26 | return train_batches, test_batches, encoder 27 | 28 | def get_model(encoder, embedding_dim=16): 29 | 30 | model = keras.Sequential([ 31 | layers.Embedding(encoder.vocab_size, embedding_dim), 32 | layers.GlobalAveragePooling1D(), 33 | layers.Dense(1, activation='sigmoid')]) 34 | 35 | model.compile(optimizer='adam', loss='binary_crossentropy', 36 | metrics=['accuracy']) 37 | return model 38 | 39 | def plot_history(history): 40 | history_dict = history.history 41 | acc = history_dict['accuracy'] 42 | val_acc = history_dict['val_accuracy'] 43 | epochs = range(1, len(acc) + 1) 44 | 45 | plt.figure(figsize=(12,9)) 46 | 
plt.plot(epochs, acc, 'bo', label='Training acc') 47 | plt.plot(epochs, val_acc, 'b', label='Validation acc') 48 | plt.title('Training and validation accuracy') 49 | plt.xlabel('Epochs') 50 | plt.ylabel('Accuracy') 51 | plt.legend(loc='lower right') 52 | plt.ylim((0.5, 1)) 53 | plt.show() 54 | 55 | def retrieve_embeddings(model, encoder): 56 | out_vectors = io.open('vecs.tsv', 'w', encoding='utf-8') 57 | out_metadata = io.open('meta.tsv', 'w', encoding='utf-8') 58 | weights = model.layers[0].get_weights()[0] 59 | 60 | for num, word in enumerate(encoder.subwords): 61 | vec = weights[num+1] 62 | out_metadata.write(word + '\n') 63 | out_vectors.write('\t'.join([str(x) for x in vec]) + '\n') 64 | out_vectors.close() 65 | out_metadata.close() 66 | 67 | train_batches, test_batches, encoder = get_batch_data() 68 | model = get_model(encoder) 69 | history = model.fit(train_batches, epochs=10, validation_data=test_batches, 70 | validation_steps=20) 71 | plot_history(history) 72 | retrieve_embeddings(model, encoder) 73 | -------------------------------------------------------------------------------- /tf_sentiment.py: -------------------------------------------------------------------------------- 1 | import tensorflow_datasets as tfds 2 | import tensorflow as tf 3 | 4 | dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True, 5 | as_supervised=True) 6 | train_dataset, test_dataset = dataset['train'], dataset['test'] 7 | 8 | encoder = info.features['text'].encoder 9 | 10 | BUFFER_SIZE = 10000 11 | BATCH_SIZE = 64 12 | 13 | padded_shapes = ([None], ()) 14 | 15 | train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, 16 | padded_shapes=padded_shapes) 17 | 18 | test_dataset = test_dataset.padded_batch(BATCH_SIZE, 19 | padded_shapes=padded_shapes) 20 | 21 | model = tf.keras.Sequential([tf.keras.layers.Embedding(encoder.vocab_size, 64), 22 | tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)), 23 | tf.keras.layers.Dense(64, activation='relu'), 24 | tf.keras.layers.Dense(1, activation='sigmoid')]) 25 | model.compile(loss='binary_crossentropy', 26 | optimizer=tf.keras.optimizers.Adam(1e-4), 27 | metrics=['accuracy']) 28 | 29 | history = model.fit(train_dataset, epochs=5, validation_data=test_dataset, 30 | validation_steps=30) 31 | 32 | def pad_to_size(vec, size): 33 | zeros = [0]*(size-len(vec)) 34 | vec.extend(zeros) 35 | return vec 36 | 37 | def sample_predict(sentence, pad, model_): 38 | encoded_sample_pred_text = encoder.encode(sentence) 39 | if pad: 40 | encoded_sample_pred_text = pad_to_size(encoded_sample_pred_text, 64) 41 | encoded_sample_pred_text = tf.cast(encoded_sample_pred_text, tf.float32) 42 | predictions = model_.predict(tf.expand_dims(encoded_sample_pred_text, 0)) 43 | 44 | return predictions 45 | 46 | sample_text = ('This movie was awesome. The acting was incredible. Highly recommend') 47 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100 48 | 49 | print('probability this is a positive review %.2f' % predictions) 50 | 51 | sample_text = ('This movie was so so. The acting was medicore. 
Kind of recommend')
52 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
53 | 
54 | print('probability this is a positive review %.2f' % predictions)
55 | 
56 | model = tf.keras.Sequential([tf.keras.layers.Embedding(encoder.vocab_size, 64),
57 |                              tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
58 |                              tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
59 |                              tf.keras.layers.Dense(64, activation='relu'),
60 |                              tf.keras.layers.Dropout(0.5),
61 |                              tf.keras.layers.Dense(1, activation='sigmoid')])
62 | model.compile(loss='binary_crossentropy',
63 |               optimizer=tf.keras.optimizers.Adam(1e-4),
64 |               metrics=['accuracy'])
65 | 
66 | history = model.fit(train_dataset, epochs=5, validation_data=test_dataset,
67 |                     validation_steps=30)
68 | sample_text = ('This movie was awesome. The acting was incredible. Highly recommend')
69 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
70 | 
71 | print('probability this is a positive review %.2f' % predictions)
72 | 
73 | sample_text = ('This movie was so so. The acting was medicore. Kind of recommend')
74 | predictions = sample_predict(sample_text, pad=True, model_=model) * 100
75 | 
76 | print('probability this is a positive review %.2f' % predictions)
77 | 
78 | 
--------------------------------------------------------------------------------
/threaded.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import numpy as np
3 | import glob
4 | from keras.preprocessing.image import ImageDataGenerator
5 | from multiprocessing.dummy import Pool as ThreadPool
6 | 
7 | def augment_images(raw_images, files, mult_factor):
8 |     gen = ImageDataGenerator()
9 |     for idx, image in enumerate(raw_images):
10 |         for mult in range(mult_factor):
11 |             img_fname = files[idx].split('/')[4]
12 |             img_fname = '../../Data/AugmentedImages/' + \
13 |                         img_fname.split('.')[0] + '_' + str(mult) + '.jpg'
14 | 
15 |             theta_tfx = np.random.choice(range(270))  # random rotation angle in degrees
16 |             transformed_raw_image = gen.apply_transform(image,
17 |                                                         {'theta': theta_tfx})
18 |             new_image = Image.fromarray(transformed_raw_image, 'RGB')
19 |             new_image = new_image.resize((1024, 1024), Image.ANTIALIAS)
20 |             new_image.save(img_fname)
21 |             transformed_raw_image = None
22 |             new_image = None
23 | 
24 | if __name__ == '__main__':
25 |     raw_images_dir = '../../Data/RawImages/'
26 |     raw_image_files = sorted(glob.glob(raw_images_dir + '*.jpg',
27 |                              recursive=True))
28 | 
29 |     img_list = []
30 |     for file in raw_image_files:
31 |         img_list.append(np.array(Image.open(file)))
32 |     augment_images(img_list, raw_image_files, mult_factor=10)
33 | 
--------------------------------------------------------------------------------
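threaded.py imports ThreadPool from multiprocessing.dummy but never uses it, so the augmentation above runs in a single thread. Below is a minimal sketch of how the same work could be spread across worker threads with that import; augment_chunk, run_parallel, and the worker count are illustrative names and values rather than part of the repository, and they assume augment_images as defined above.

from multiprocessing.dummy import Pool as ThreadPool

def augment_chunk(chunk):
    # Each chunk is an (images, filenames) pair handled by one worker thread.
    images, files = chunk
    augment_images(images, files, mult_factor=10)

def run_parallel(img_list, raw_image_files, n_workers=4):
    # Round-robin split so every worker gets a similar share of the images.
    chunks = [(img_list[i::n_workers], raw_image_files[i::n_workers])
              for i in range(n_workers)]
    with ThreadPool(n_workers) as pool:
        pool.map(augment_chunk, chunks)

Because the heavy lifting is NumPy and PIL, threads can overlap some of the work despite the GIL; swapping multiprocessing.dummy for multiprocessing would give true process parallelism at the cost of pickling the image arrays.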