├── Chapter01
    ├── __init__.py
    ├── cnn.py
    ├── logs
    │   └── simple_cnn
    │   │   └── events.out.tfevents.1532869533.Seans-MBP-7.lan
    └── saves
    │   └── checkpoint
├── Chapter02
    ├── __init__.py
    ├── algorithmic_03.py
    ├── atari_02.py
    ├── box2d_04.py
    ├── cartpole.py
    ├── classic_control_05.py
    ├── mujoco_06.py
    ├── robotics_07.py
    ├── start1.py
    └── toy_text_08.py
├── Chapter03
    ├── MUJOCO_LOG.TXT
    ├── config.py
    ├── demo
    │   ├── __init__.py
    │   ├── game.py
    │   ├── object.py
    │   ├── robot.py
    │   └── utils.py
    ├── distribution
    │   ├── __init__.py
    │   ├── categorical.py
    │   └── diagonal_gaussian.py
    ├── environment.py
    ├── eval.py
    ├── game.py
    ├── krylov.py
    ├── layer.py
    ├── layers.py
    ├── log
    │   ├── Acrobot
    │   │   ├── checkpoint
    │   │   └── events.out.tfevents.1506500394.ywz-WorkStation-T7400
    │   ├── CartPole
    │   │   ├── checkpoint
    │   │   └── events.out.tfevents.1506667268.ywz-WorkStation-T7400
    │   ├── HalfCheetah
    │   │   ├── checkpoint
    │   │   └── events.out.tfevents.1506338471.ywz-WorkStation-T7400
    │   ├── Hopper
    │   │   ├── checkpoint
    │   │   └── events.out.tfevents.1506658875.ywz-WorkStation-T7400
    │   ├── Pendulum
    │   │   ├── checkpoint
    │   │   └── events.out.tfevents.1506666537.ywz-WorkStation-T7400
    │   ├── Reacher
    │   │   ├── checkpoint
    │   │   └── events.out.tfevents.1506398906.ywz-WorkStation-T7400
    │   ├── Swimmer
    │   │   ├── checkpoint
    │   │   └── events.out.tfevents.1526197305.ywz-PC
    │   └── Walker2d
    │   │   ├── checkpoint
    │   │   └── events.out.tfevents.1506671852.ywz-WorkStation-T7400
    ├── logger.py
    ├── main.py
    ├── mlp.py
    ├── optimizer.py
    ├── parallel.py
    ├── policy
    │   ├── __init__.py
    │   ├── categorical_mlp.py
    │   ├── deterministic_mlp.py
    │   └── gaussian_mlp.py
    ├── ppo.py
    ├── q_learning.py
    ├── q_network.py
    ├── replay_memory.py
    ├── sampler.py
    ├── simulator.py
    ├── test.py
    ├── train.py
    ├── trpo.py
    ├── utils.py
    └── value
    │   ├── __init__.py
    │   ├── linear_fitting.py
    │   └── mlp_fitting.py
├── Chapter04
    ├── actor_critic_net.py
    ├── actor_network.py
    ├── config.py
    ├── critic_network.py
    ├── dpg.py
    ├── eval.py
    ├── layers.py
    ├── log
    │   ├── Acrobot-v1
    │   │   ├── checkpoint
    │   │   └── train
    │   │   │   └── events.out.tfevents.1523886598.ywz-PC
    │   ├── CartPole-v0
    │   │   ├── checkpoint
    │   │   └── train
    │   │   │   └── events.out.tfevents.1525870448.ywz-PC
    │   ├── MountainCar-v0
    │   │   ├── checkpoint
    │   │   └── train
    │   │   │   └── events.out.tfevents.1526196635.ywz-PC
    │   └── Pendulum-v0
    │   │   ├── checkpoint
    │   │   └── train
    │   │       └── events.out.tfevents.1525871560.ywz-PC
    ├── main.py
    ├── optimizer.py
    ├── replay_memory.py
    ├── task.py
    └── train.py
├── Chapter05
    ├── a3c.py
    ├── cluster.py
    ├── demo
    │   ├── __init__.py
    │   ├── game.py
    │   ├── object.py
    │   ├── robot.py
    │   └── utils.py
    ├── doom
    │   ├── _vizdoom.ini
    │   ├── doom.py
    │   ├── game.py
    │   └── scenarios
    │   │   ├── basic.cfg
    │   │   ├── basic.wad
    │   │   ├── cig.cfg
    │   │   ├── cig.wad
    │   │   ├── cig_with_unknown.wad
    │   │   ├── deadly_corridor.cfg
    │   │   ├── deadly_corridor.wad
    │   │   ├── deathmatch.cfg
    │   │   ├── deathmatch.wad
    │   │   ├── defend_the_center.cfg
    │   │   ├── defend_the_center.wad
    │   │   ├── defend_the_line.cfg
    │   │   ├── defend_the_line.wad
    │   │   ├── health_gathering.cfg
    │   │   ├── health_gathering.wad
    │   │   ├── health_gathering_supreme.wad
    │   │   ├── learning.cfg
    │   │   ├── multi.cfg
    │   │   ├── multi_deathmatch.wad
    │   │   ├── multi_duel.cfg
    │   │   ├── multi_duel.wad
    │   │   ├── my_way_home.cfg
    │   │   ├── my_way_home.wad
    │   │   ├── predict_position.cfg
    │   │   ├── predict_position.wad
    │   │   ├── rocket_basic.cfg
    │   │   ├── rocket_basic.wad
    │   │   ├── simpler_basic.cfg
    │   │   ├── simpler_basic.wad
    │   │   ├── take_cover.cfg
    │   │   └── take_cover.wad
    ├── environment.py
    ├── ff_policy.py
    ├── game.py
    ├── helper
    │   └── tmux
    ├── layer.py
    ├── lstm_policy.py
    ├── minecraft
    │   ├── __init__.py
    │   └── game.py
    ├── parameter.py
    ├── save
    │   ├── breakout
    │   │   └── train
    │   │   │   ├── log_0
    │   │   │       └── events.out.tfevents.1532007719.ywz-PC
    │   │   │   └── log_1
    │   │   │       └── events.out.tfevents.1532007719.ywz-PC
    │   ├── demo
    │   │   └── train
    │   │   │   ├── checkpoint
    │   │   │   ├── log_0
    │   │   │       └── events.out.tfevents.1532007504.ywz-PC
    │   │   │   └── log_1
    │   │   │       └── events.out.tfevents.1532007504.ywz-PC
    │   └── minecraftbasic-v0
    │   │   └── train
    │   │       └── log_0
    │   │           └── events.out.tfevents.1532007895.ywz-PC
    ├── test.py
    ├── timer.py
    ├── train.py
    ├── utils.py
    └── worker.py
├── Chapter06
    ├── __init__.py
    ├── commands.txt
    └── src
    │   ├── __init__.py
    │   ├── alphagozero_agent.py
    │   ├── config.py
    │   ├── constants.py
    │   ├── controller.py
    │   ├── features.py
    │   ├── go.py
    │   ├── mcts.py
    │   ├── network.py
    │   ├── preprocessing.py
    │   ├── train.py
    │   └── utils.py
├── Chapter07
    ├── RL chatbot.ipynb
    ├── convert_checkpoint.py
    ├── data_parser.py
    ├── data_reader.py
    ├── feature_extracter.py
    ├── model
    │   ├── Reversed
    │   │   └── checkpoint
    │   └── model-56-3000
    │   │   └── checkpoint
    ├── pg_model.py
    ├── results
    │   ├── sample_input.txt
    │   └── sample_output_RL.txt
    ├── seq_model.py
    ├── test.py
    └── train.py
├── Chapter08
    ├── README.md
    ├── __init__.py
    └── src
    │   ├── __init__.py
    │   ├── child_network.py
    │   ├── cifar10_processor.py
    │   ├── config.py
    │   ├── constants.py
    │   ├── controller.py
    │   └── train.py
├── Chapter09
    ├── actor.py
    ├── agent.py
    ├── critic.py
    ├── helper.py
    └── train.py
├── Dockerfile
├── LICENSE
├── README.md
├── artifacts.pptx
└── requirements.txt


/Chapter01/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter01/__init__.py


--------------------------------------------------------------------------------
/Chapter01/logs/simple_cnn/events.out.tfevents.1532869533.Seans-MBP-7.lan:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter01/logs/simple_cnn/events.out.tfevents.1532869533.Seans-MBP-7.lan


--------------------------------------------------------------------------------
/Chapter01/saves/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "simple_cnn"
2 | all_model_checkpoint_paths: "simple_cnn"
3 | 


--------------------------------------------------------------------------------
/Chapter02/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter02/__init__.py


--------------------------------------------------------------------------------
/Chapter02/algorithmic_03.py:
--------------------------------------------------------------------------------
1 | import gym
2 | environment = gym.make('Copy-v0')
3 | environment.reset()
4 | environment.render()
5 | import time
6 | time.sleep(10)


--------------------------------------------------------------------------------
/Chapter02/atari_02.py:
--------------------------------------------------------------------------------
1 | import gym
2 | environment = gym.make('SpaceInvaders-v0')
3 | environment.reset()
4 | environment.render()
5 | import time
6 | time.sleep(10)


--------------------------------------------------------------------------------
/Chapter02/box2d_04.py:
--------------------------------------------------------------------------------
1 | import gym
2 | environment = gym.make('LunarLander-v2')
3 | environment.reset()
4 | environment.render()
5 | import time
6 | time.sleep(10)


--------------------------------------------------------------------------------
/Chapter02/cartpole.py:
--------------------------------------------------------------------------------
  1 | import gym
  2 | import numpy as np
  3 | import random
  4 | import math
  5 | 
  6 | environment = gym.make('CartPole-v0')
  7 | 
  8 | 
  9 | no_buckets = (1, 1, 6, 3)
 10 | no_actions = environment.action_space.n
 11 | state_value_bounds = list(zip(environment.observation_space.low, environment.observation_space.high))
 12 | state_value_bounds[1] = [-0.5, 0.5]
 13 | state_value_bounds[3] = [-math.radians(50), math.radians(50)]
 14 | action_index = len(no_buckets)
 15 | 
 16 | q_value_table = np.zeros(no_buckets + (no_actions,))
 17 | 
 18 | min_explore_rate = 0.01
 19 | min_learning_rate = 0.1
 20 | 
 21 | max_episodes = 1000
 22 | max_time_steps = 250
 23 | streak_to_end = 120
 24 | solved_time = 199
 25 | discount = 0.99
 26 | no_streaks = 0
 27 | 
 28 | 
 29 | def select_action(state_value, explore_rate):
 30 |     if random.random() < explore_rate:
 31 |         action = environment.action_space.sample()
 32 |     else:
 33 |         action = np.argmax(q_value_table[state_value])
 34 |     return action
 35 | 
 36 | 
 37 | def select_explore_rate(x):
 38 |     return max(min_explore_rate, min(1, 1.0 - math.log10((x+1)/25)))
 39 | 
 40 | 
 41 | def select_learning_rate(x):
 42 |     return max(min_learning_rate, min(0.5, 1.0 - math.log10((x+1)/25)))
 43 | 
 44 | 
 45 | def bucketize_state_value(state_value):
 46 |     bucket_indexes = []
 47 |     for i in range(len(state_value)):
 48 |         if state_value[i] <= state_value_bounds[i][0]:
 49 |             bucket_index = 0
 50 |         elif state_value[i] >= state_value_bounds[i][1]:
 51 |             bucket_index = no_buckets[i] - 1
 52 |         else:
 53 |             bound_width = state_value_bounds[i][1] - state_value_bounds[i][0]
 54 |             offset = (no_buckets[i]-1)*state_value_bounds[i][0]/bound_width
 55 |             scaling = (no_buckets[i]-1)/bound_width
 56 |             bucket_index = int(round(scaling*state_value[i] - offset))
 57 |         bucket_indexes.append(bucket_index)
 58 |     return tuple(bucket_indexes)
 59 | 
 60 | 
 61 | for episode_no in range(max_episodes):
 62 |     explore_rate = select_explore_rate(episode_no)
 63 |     learning_rate = select_learning_rate(episode_no)
 64 | 
 65 |     observation = environment.reset()
 66 | 
 67 |     start_state_value = bucketize_state_value(observation)
 68 |     previous_state_value = start_state_value
 69 | 
 70 |     for time_step in range(max_time_steps):
 71 |         environment.render()
 72 |         selected_action = select_action(previous_state_value, explore_rate)
 73 |         observation, reward_gain, completed, _ = environment.step(selected_action)
 74 |         state_value = bucketize_state_value(observation)
 75 |         best_q_value = np.amax(q_value_table[state_value])
 76 |         q_value_table[previous_state_value + (selected_action,)] += learning_rate * (
 77 |                 reward_gain + discount * (best_q_value) - q_value_table[previous_state_value + (selected_action,)])
 78 | 
 79 |         print('Episode number : %d' % episode_no)
 80 |         print('Time step : %d' % time_step)
 81 |         print('Selection action : %d' % selected_action)
 82 |         print('Current state : %s' % str(state_value))
 83 |         print('Reward obtained : %f' % reward_gain)
 84 |         print('Best Q value : %f' % best_q_value)
 85 |         print('Learning rate : %f' % learning_rate)
 86 |         print('Explore rate : %f' % explore_rate)
 87 |         print('Streak number : %d' % no_streaks)
 88 | 
 89 |         if completed:
 90 |             print('Episode %d finished after %f time steps' % (episode_no, time_step))
 91 |             if time_step >= solved_time:
 92 |                 no_streaks += 1
 93 |             else:
 94 |                 no_streaks = 0
 95 |             break
 96 | 
 97 |         previous_state_value = state_value
 98 | 
 99 |     if no_streaks > streak_to_end:
100 |         break
101 | 


--------------------------------------------------------------------------------
/Chapter02/classic_control_05.py:
--------------------------------------------------------------------------------
1 | import gym
2 | environment = gym.make('CartPole-v0')
3 | environment.reset()
4 | environment.render()
5 | import time
6 | time.sleep(10)


--------------------------------------------------------------------------------
/Chapter02/mujoco_06.py:
--------------------------------------------------------------------------------
1 | import gym
2 | environment = gym.make('Humanoid-v2')
3 | environment.reset()
4 | environment.render()
5 | import time
6 | time.sleep(10)
7 | 


--------------------------------------------------------------------------------
/Chapter02/robotics_07.py:
--------------------------------------------------------------------------------
1 | import gym
2 | environment = gym.make('HandManipulateBlock-v0')
3 | environment.reset()
4 | environment.render()
5 | import time
6 | time.sleep(10)


--------------------------------------------------------------------------------
/Chapter02/start1.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import time
3 | environment = gym.make('CartPole-v0')
4 | environment.reset()
5 | for dummy in range(100):
6 |     time.sleep(1)
7 |     environment.render()
8 |     environment.step(environment.action_space.sample())


--------------------------------------------------------------------------------
/Chapter02/toy_text_08.py:
--------------------------------------------------------------------------------
1 | import gym
2 | environment = gym.make('FrozenLake-v0')
3 | environment.reset()
4 | environment.render()
5 | import time
6 | time.sleep(10)


--------------------------------------------------------------------------------
/Chapter03/MUJOCO_LOG.TXT:
--------------------------------------------------------------------------------
1 | Sun May 13 16:29:23 2018
2 | ERROR: GLEW initalization error: Missing GL version
3 | 
4 | 


--------------------------------------------------------------------------------
/Chapter03/config.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Mar 25, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | 
 7 | ATARI = {
 8 |     'network_type': 'cnn',
 9 |     'gamma': 0.99,
10 |     'batch_size': 32,
11 |     'num_episode': 500000,
12 |     'capacity': 1000000,
13 |     'epsilon_decay': 1000000,
14 |     'epsilon_min': 0.1,
15 |     'num_frames': 4,
16 |     'num_nullops': 5,
17 |     'time_between_two_copies': 10000,
18 |     'input_scale': 255.0,
19 |     'update_interval': 1,
20 |     'T': 100000,
21 |     
22 |     'learning_rate': 2e-4,
23 |     'optimizer': 'rmsprop',
24 |     'rho': 0.99,
25 |     'rmsprop_epsilon': 1e-6,
26 |     
27 |     'log_dir': 'log/'
28 | }
29 | 
30 | 
31 | DEMO = {
32 |     'network_type': 'mlp',
33 |     'gamma': 0.7,
34 |     'batch_size': 32,
35 |     'num_episode': 40,
36 |     'capacity': 20000,
37 |     'epsilon_decay': 100000,
38 |     'epsilon_min': 0.1,
39 |     'num_frames': 1,
40 |     'num_nullops': 2,
41 |     'time_between_two_copies': 1000,
42 |     'input_scale': 1.0,
43 |     'update_interval': 1,
44 |     'T': 1000000,
45 |     
46 |     'learning_rate': 0.5e-2,
47 |     'optimizer': 'momentum',
48 |     'rho': 0.9,
49 |     'rmsprop_epsilon': 1e-6,
50 |     
51 |     'log_dir': 'log/'
52 | }
53 | 


--------------------------------------------------------------------------------
/Chapter03/demo/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Nov 10, 2016
3 | 
4 | @author: a0096049
5 | '''
6 | 


--------------------------------------------------------------------------------
/Chapter03/demo/object.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on May 16, 2016
 3 | 
 4 | @author: a0096049
 5 | '''
 6 | 
 7 | import numpy, pygame
 8 | from demo.utils import Color, calculateIntersectPoint
 9 | 
10 | 
11 | class Object:
12 |     
13 |     def __init__(self, x, y, r, game):
14 |         
15 |         self.x = x
16 |         self.y = y
17 |         self.r = r
18 |         self.game = game
19 |         
20 |     def get_position(self):
21 |         return self.x, self.y
22 |     
23 |     def get_radius(self):
24 |         return self.r
25 |     
26 |     def set_position(self, x, y):
27 |         self.x = x
28 |         self.y = y
29 |         
30 |     def draw(self):
31 |         pass
32 |     
33 | class Food(Object):
34 |     
35 |     def __init__(self, x, y, radius, t, game):
36 |         
37 |         super().__init__(x, y, radius, game)
38 |         self.type = t
39 |         self.life = numpy.random.randint(1000, 5000)
40 |         
41 |     def decrease_life(self):
42 |         self.life -= 1
43 |         return self.life == 0
44 | 
45 |     def draw(self, found=False):
46 |         
47 |         if found == False:
48 |             if self.type == "bad":
49 |                 pygame.draw.circle(self.game.DISPLAYSURF, Color.RED, (self.x, self.y), self.r)
50 |             else:
51 |                 pygame.draw.circle(self.game.DISPLAYSURF, Color.GREEN, (self.x, self.y), self.r)
52 |         else:
53 |             pygame.draw.circle(self.game.DISPLAYSURF, Color.BLUE, (self.x, self.y), self.r)
54 |     
55 | class Wall:
56 |     
57 |     def __init__(self, start, end, game, width=2):
58 |         
59 |         self.start = start
60 |         self.end = end
61 |         self.game = game
62 |         self.width = width
63 |         
64 |     def draw(self):
65 |         pygame.draw.line(self.game.DISPLAYSURF, Color.WHITE, self.start, self.end, self.width)
66 |     
67 |     def collide(self, p1, p2):
68 |         
69 |         point = calculateIntersectPoint(p1, p2, self.start, self.end)
70 |         if point is None:
71 |             return None
72 |         else:
73 |             return (int(point[0]), int(point[1]))
74 |         
75 |     
76 |         
77 |     
78 | 


--------------------------------------------------------------------------------
/Chapter03/distribution/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on 18 Sep 2017
3 | 
4 | @author: ywz
5 | '''
6 | 


--------------------------------------------------------------------------------
/Chapter03/distribution/categorical.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 27 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import tensorflow as tf
 8 | 
 9 | 
10 | class Categorical:
11 |     
12 |     def __init__(self, dim):
13 |         self.dim = dim
14 |         
15 |     def specs(self):
16 |         return [("prob", (self.dim,))]
17 |     
18 |     def keys(self):
19 |         return ["prob"]
20 |     
21 |     def kl_numpy(self, old_dist, new_dist):
22 |         
23 |         old_prob = old_dist["prob"]
24 |         new_prob = new_dist["prob"]
25 |         
26 |         return numpy.sum(old_prob * (numpy.log(old_prob + 1e-8) - numpy.log(new_prob + 1e-8)), axis=-1)
27 |     
28 |     def kl_tf(self, old_dist, new_dist):
29 | 
30 |         old_prob = old_dist["prob"]
31 |         new_prob = new_dist["prob"]
32 | 
33 |         return tf.reduce_sum(old_prob * (tf.log(old_prob + 1e-8) - tf.log(new_prob + 1e-8)), axis=-1)
34 |     
35 |     def likelihood_ratio_tf(self, x, old_dist, new_dist):
36 |         
37 |         old_prob = old_dist["prob"]
38 |         new_prob = new_dist["prob"]
39 | 
40 |         return (tf.reduce_sum(new_prob * x, axis=-1) + 1e-8) / \
41 |                (tf.reduce_sum(old_prob * x, axis=-1) + 1e-8)
42 |     
43 |         


--------------------------------------------------------------------------------
/Chapter03/distribution/diagonal_gaussian.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 18 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import tensorflow as tf
 8 | 
 9 | 
10 | class DiagonalGaussian:
11 |     
12 |     def __init__(self, dim):
13 |         self.dim = dim
14 |         
15 |     def specs(self):
16 |         return [("mean", (self.dim,)), ("log_var", (self.dim,))]
17 |     
18 |     def keys(self):
19 |         return ["mean", "log_var"]
20 |     
21 |     def kl_numpy(self, old_dist, new_dist):
22 |         
23 |         old_means = old_dist["mean"]
24 |         old_log_stds = old_dist["log_var"]
25 |         new_means = new_dist["mean"]
26 |         new_log_stds = new_dist["log_var"]
27 | 
28 |         old_std = numpy.exp(old_log_stds)
29 |         new_std = numpy.exp(new_log_stds)
30 |         # means: (N*A)
31 |         # std: (N*A)
32 |         # formula:
33 |         # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
34 |         # ln(\sigma_2/\sigma_1)
35 |         numerator = numpy.square(old_means - new_means) + numpy.square(old_std) - numpy.square(new_std)
36 |         denominator = 2 * numpy.square(new_std) + 1e-8
37 |         
38 |         return numpy.sum(numerator / denominator + new_log_stds - old_log_stds, axis=-1)
39 |     
40 |     def kl_tf(self, old_dist, new_dist):
41 |         
42 |         old_means = old_dist["mean"]
43 |         old_log_stds = old_dist["log_var"]
44 |         new_means = new_dist["mean"]
45 |         new_log_stds = new_dist["log_var"]
46 | 
47 |         old_std = tf.exp(old_log_stds)
48 |         new_std = tf.exp(new_log_stds)
49 |         # means: (N*A)
50 |         # std: (N*A)
51 |         # formula:
52 |         # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
53 |         # ln(\sigma_2/\sigma_1)
54 |         numerator = tf.square(old_means - new_means) + tf.square(old_std) - tf.square(new_std)
55 |         denominator = 2 * tf.square(new_std) + 1e-8
56 |         
57 |         return tf.reduce_sum(numerator / denominator + new_log_stds - old_log_stds, axis=-1)
58 |     
59 |     def likelihood_ratio_tf(self, x, old_dist, new_dist):
60 |         
61 |         new = self.log_likelihood_tf(x, new_dist)
62 |         old = self.log_likelihood_tf(x, old_dist)
63 |         
64 |         return tf.exp(new - old)
65 | 
66 |     def log_likelihood_tf(self, x, dist):
67 |         
68 |         means = dist["mean"]
69 |         log_stds = dist["log_var"]
70 |         zs = (x - means) / tf.exp(log_stds)
71 |         
72 |         return - tf.reduce_sum(log_stds, axis=-1) - \
73 |                0.5 * tf.reduce_sum(tf.square(zs), axis=-1) - \
74 |                0.5 * self.dim * numpy.log(2 * numpy.pi)
75 |     
76 |         
77 |     


--------------------------------------------------------------------------------
/Chapter03/environment.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Mar 25, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | from threading import Thread
 7 | 
 8 | 
 9 | def new_demo(test=True):
10 |     import pygame
11 |     from demo.game import Game
12 |     
13 |     if test is False:
14 |         game = Game(640, 480, None)
15 |     else:
16 |         def _render(game):
17 |             while True:
18 |                 game.draw()
19 |                 for event in pygame.event.get():
20 |                     if event.type == pygame.KEYDOWN:
21 |                         if event.key == pygame.K_9:
22 |                             game.increase_fps()
23 |                         elif event.key == pygame.K_0:
24 |                             game.decrease_fps()    
25 |         pygame.init()
26 |         DISPLAYSURF = pygame.display.set_mode((640, 480), 0, 32)
27 |         pygame.display.set_caption('Demo')
28 |         game = Game(640, 480, DISPLAYSURF)
29 |         t = Thread(target=lambda: _render(game))
30 |         t.start()
31 |     
32 |     return game
33 | 
34 | 
35 | def new_atari_game(rom='breakout'):
36 |     from game import Game
37 |     
38 |     game = Game(rom)
39 |     
40 |     if rom == 'space_invaders':
41 |         game.set_params(frame_skip=3, lost_life_as_terminal=False, take_maximum_of_two_frames=True)
42 |     elif game == 'alien':
43 |         game.set_params(frame_skip=4, crop_offset=20, lost_life_as_terminal=False)
44 |     else:
45 |         game.set_params(frame_skip=4, lost_life_as_terminal=False)
46 |         
47 |     return game
48 | 
49 | 


--------------------------------------------------------------------------------
/Chapter03/eval.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Mar 28, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import os
 7 | import argparse
 8 | import tensorflow as tf
 9 | from q_learning import DQN
10 | from config import ATARI, DEMO
11 | from environment import new_atari_game, new_demo
12 | 
13 | 
14 | def main():
15 |     
16 |     parser = argparse.ArgumentParser(description=None)
17 |     parser.add_argument('-g', '--game', default='demo', type=str, help='Game')
18 |     parser.add_argument('-d', '--device', default='cpu', type=str, help='Device')
19 |     args = parser.parse_args()
20 |     
21 |     rom = args.game
22 |     if rom == 'demo':
23 |         game = new_demo()
24 |         conf = DEMO
25 |     else:
26 |         game = new_atari_game(rom)
27 |         conf = ATARI
28 | 
29 |     model_dir = os.path.join(conf['log_dir'], rom)
30 |     device = '/{}:0'.format(args.device)
31 |     with tf.device(device):
32 |         dqn = DQN(conf, game, model_dir, callback=game.draw)
33 |     
34 |     with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
35 |         saver = tf.train.Saver()
36 |         dqn.load(sess, saver)
37 |         dqn.evaluate(sess)
38 |         
39 | 
40 | if __name__ == "__main__":
41 |     main()
42 | 


--------------------------------------------------------------------------------
/Chapter03/game.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on Mar 25, 2018
  3 | 
  4 | @author: ywz
  5 | '''
  6 | import gym, numpy, time
  7 | from utils import cv2_resize_image
  8 | 
  9 | class Game:
 10 | 
 11 |     def __init__(self, name, lost_life_as_terminal=False, take_maximum_of_two_frames=False):
 12 |         
 13 |         if take_maximum_of_two_frames is False:
 14 |             self.mode = 'Deterministic'
 15 |         else:
 16 |             self.mode = 'NoFrameskip'
 17 |             
 18 |         name = ''.join([s.capitalize() for s in name.split('_')])
 19 |         self.ale = gym.make('{}{}-v4'.format(name, self.mode))
 20 |         frame = self.ale.reset()
 21 |         self.lost_life_as_terminal = lost_life_as_terminal
 22 |         self.lives = 0
 23 |         self.actions = list(range(self.ale.action_space.n))
 24 |         
 25 |         self.frame_skip = 4
 26 |         self.total_reward = 0
 27 |         self.crop_size = 84
 28 |         self.crop_offset = 8
 29 |         
 30 |         # Frame buffer
 31 |         self.buffer_size = 8
 32 |         self.buffer_index = 0
 33 |         self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)]
 34 |         # Overlapping frames, maximum of two frames
 35 |         self.last_frame = frame
 36 |     
 37 |     def rgb_to_gray(self, im):
 38 |         return numpy.dot(im, [0.2126, 0.7152, 0.0722])
 39 |     
 40 |     def set_params(self, crop_size=84, crop_offset=8, frame_skip=4, 
 41 |                    lost_life_as_terminal=False, take_maximum_of_two_frames=False):
 42 |         
 43 |         self.crop_size = crop_size
 44 |         self.crop_offset = crop_offset
 45 |         self.frame_skip = frame_skip
 46 |         self.lost_life_as_terminal = lost_life_as_terminal
 47 |         self.mode = 'NoFrameskip' if take_maximum_of_two_frames else 'Deterministic'
 48 |         
 49 |         frame = self.ale.reset()
 50 |         self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)]
 51 |         self.last_frame = frame
 52 |     
 53 |     def reset(self):
 54 |         frame = self.ale.reset()
 55 |         self.total_reward = 0
 56 |         self.buffer_index = 0
 57 |         self.lives = 0
 58 |         self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)]
 59 |         self.last_frame = frame
 60 |     
 61 |     def add_frame_to_buffer(self, frame):
 62 |         self.buffer_index = self.buffer_index % self.buffer_size
 63 |         self.buffer[self.buffer_index] = frame
 64 |         self.buffer_index += 1
 65 |     
 66 |     def get_available_actions(self):
 67 |         return list(range(len(self.actions)))
 68 |     
 69 |     def get_feedback_size(self):
 70 |         return (self.crop_size, self.crop_size)
 71 |     
 72 |     def crop(self, frame):
 73 |         feedback = cv2_resize_image(frame, 
 74 |                                     resized_shape=(self.crop_size, self.crop_size), 
 75 |                                     method='crop', crop_offset=self.crop_offset)
 76 |         return feedback
 77 |     
 78 |     def get_current_feedback(self, num_frames=1):
 79 |         assert num_frames < self.buffer_size, "Frame buffer is not large enough."
 80 |         index = self.buffer_index - 1
 81 |         frames = [numpy.expand_dims(self.buffer[index - k], axis=0) for k in range(num_frames)]
 82 |         if num_frames > 1:
 83 |             return numpy.concatenate(frames, axis=0)
 84 |         else:
 85 |             return frames[0]
 86 |     
 87 |     def get_total_reward(self):
 88 |         return self.total_reward
 89 |     
 90 |     def _lost_life(self, info):
 91 |         if self.lost_life_as_terminal:
 92 |             lives = info['ale.lives']
 93 |             if lives >= self.lives:
 94 |                 self.lives = lives
 95 |                 return False
 96 |             else:
 97 |                 return True
 98 |         else:
 99 |             return False
100 |     
101 |     def play_action(self, action, num_frames=1):
102 |         
103 |         if self.mode == 'Deterministic':
104 |             termination = 0
105 |             a = self.actions[action]
106 |             frame, reward, done, info = self.ale.step(a)
107 |             if done or self._lost_life(info): termination = 1
108 |             self.add_frame_to_buffer(self.crop(self.rgb_to_gray(frame)))
109 |         elif self.mode == 'NoFrameskip':
110 |             reward = 0
111 |             termination = 0
112 |             for i in range(self.frame_skip):
113 |                 a = self.actions[action]
114 |                 frame, r, done, info = self.ale.step(a)
115 |                 reward += r
116 |                 if i == self.frame_skip - 2: self.last_frame = frame
117 |                 if done or self._lost_life(info): termination = 1
118 |             self.add_frame_to_buffer(self.crop(numpy.maximum(self.rgb_to_gray(frame), self.rgb_to_gray(self.last_frame))))
119 |         else:
120 |             raise
121 |         
122 |         r = numpy.clip(reward, -1, 1)
123 |         self.total_reward += reward
124 |         
125 |         return r, self.get_current_feedback(num_frames), termination
126 |     
127 |     def draw(self):
128 |         self.ale.render()
129 |         
130 |     def get_action_meanings(self):
131 |         return self.ale.get_action_meanings()
132 |     


--------------------------------------------------------------------------------
/Chapter03/krylov.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 4 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | 
 8 | 
 9 | class Krylov:
10 |     
11 |     def __init__(self):
12 |         pass
13 |     
14 |     def cg(self, Ax, b, cg_iters=10, verbose=False, eps=1e-10):
15 |         
16 |         x = numpy.zeros_like(b)
17 |         r = b.copy()
18 |         p = b.copy()
19 |         r_dot_r = r.dot(r)
20 |         
21 |         for _ in range(cg_iters):
22 |             z = Ax(p)
23 |             v = r_dot_r / p.dot(z)
24 |             x += v * p
25 |             r -= v * z
26 |             
27 |             new_r_dot_r = r.dot(r)
28 |             beta = new_r_dot_r / r_dot_r
29 |             p = r + beta * p
30 |             
31 |             r_dot_r = new_r_dot_r
32 |             if r_dot_r < eps:
33 |                 break
34 |         
35 |         if verbose: 
36 |             print("residual norm: {:5f}, solution norm: {:5f}".format(r_dot_r, numpy.linalg.norm(x)))
37 |         return x
38 |     
39 | if __name__ == "__main__":
40 |     
41 |     from numpy.linalg import inv
42 |     
43 |     n = 5
44 |     A = numpy.random.rand(n, n)
45 |     A = A.T.dot(A) + 0.01 * numpy.eye(n)
46 |     b = numpy.random.rand(n)
47 |     x = inv(A).dot(b)
48 | 
49 |     krylov = Krylov()
50 |     y = krylov.cg(lambda x: A.dot(x), b, verbose=True)
51 |     
52 |     print(x)
53 |     print(y)
54 |     
55 |     
56 | 
57 |             


--------------------------------------------------------------------------------
/Chapter03/layer.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 29 May 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import tensorflow as tf
 8 | 
 9 | 
10 | def leaky_relu(x, leak=0.0, name="lrelu"):
11 |     return tf.maximum(leak * x, x, name=name)
12 | 
13 | def add_regularization(var, weight):
14 |     weight_decay = tf.multiply(tf.nn.l2_loss(var), weight, name='weight_loss')
15 |     tf.add_to_collection('losses', weight_decay)
16 | 
17 | def get_variable_on_cpu(shape, initializer, name, dtype=tf.float32, trainable=True):
18 |     with tf.device('/cpu:0'):
19 |         var = tf.get_variable(shape=shape, initializer=initializer, 
20 |                               dtype=dtype, name=name, trainable=trainable)
21 |     return var
22 | 
23 | def HeUniform(shape):
24 |     
25 |     if len(shape) > 2:
26 |         w = shape[0]
27 |         h = shape[1]
28 |         input_channels  = shape[2]
29 |         d = 1.0 / numpy.sqrt(input_channels * w * h)
30 |     else:
31 |         d = 1.0 / numpy.sqrt(shape[0])
32 |     
33 |     init_W = tf.random_uniform_initializer(-d, d)
34 |     init_b = tf.random_uniform_initializer(-d, d)
35 |     return init_W, init_b
36 | 
37 | def conv2d(x, output_dim, kernel=(5, 5), stride=(2, 2), 
38 |            activation=tf.nn.relu, init_W=None, init_b=None, name='conv', padding='VALID'):
39 |     
40 |     assert len(x.get_shape().as_list()) == 4
41 |     shape = (kernel[0], kernel[1], x.get_shape().as_list()[-1], output_dim)
42 |     _W, _b = HeUniform(shape)
43 |     if init_W is None: init_W = _W
44 |     if init_b is None: init_b = _b
45 | 
46 |     with tf.variable_scope(name):
47 |         W = get_variable_on_cpu(shape=shape, initializer=init_W, dtype=tf.float32, name='weight')
48 |         b = get_variable_on_cpu(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias')
49 |         
50 |         conv = tf.nn.conv2d(input=x, filter=W, strides=(1, stride[0], stride[1], 1), padding=padding)
51 |         if activation:
52 |             conv = activation(tf.nn.bias_add(conv, b))
53 |         else:
54 |             conv = tf.nn.bias_add(conv, b)
55 |     
56 |     return conv
57 | 
58 | def linear(x, output_dim, activation=tf.nn.relu, init_W=None, init_b=None, name='linear'):
59 |     
60 |     if len(x.get_shape().as_list()) > 2:
61 |         shape = x.get_shape().as_list()
62 |         x = tf.reshape(x, shape=(-1, numpy.prod(shape[1:])))
63 | 
64 |     shape = (x.get_shape().as_list()[-1], output_dim)
65 |     _W, _b = HeUniform(shape)
66 |     if init_W is None: init_W = _W
67 |     if init_b is None: init_b = _b
68 | 
69 |     with tf.variable_scope(name):
70 |         W = get_variable_on_cpu(shape=shape, initializer=init_W, dtype=tf.float32, name='weight')
71 |         b = get_variable_on_cpu(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias')
72 |         
73 |         linear = tf.matmul(x, W) + b
74 |         if activation:
75 |             linear = activation(linear)
76 |     
77 |     return linear
78 | 


--------------------------------------------------------------------------------
/Chapter03/layers.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Mar 25, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import tensorflow as tf
 8 | 
 9 | 
10 | def get_variable(shape, initializer, name, dtype=tf.float32, trainable=True):
11 |     var = tf.get_variable(shape=shape, initializer=initializer, 
12 |                           dtype=dtype, name=name, trainable=trainable)
13 |     return var
14 | 
15 | 
16 | def HeUniform(shape):
17 |     
18 |     if len(shape) > 2:
19 |         w = shape[0]
20 |         h = shape[1]
21 |         input_channels  = shape[2]
22 |         d = 1.0 / numpy.sqrt(input_channels * w * h)
23 |     else:
24 |         d = 1.0 / numpy.sqrt(shape[0])
25 |     
26 |     init_W = tf.random_uniform_initializer(-d, d)
27 |     init_b = tf.random_uniform_initializer(-d, d)
28 |     return init_W, init_b
29 | 
30 | 
31 | def conv2d(x, output_dim, kernel=(5, 5), stride=(2, 2), 
32 |            activation=tf.nn.relu, init_W=None, init_b=None, name='conv', padding='VALID'):
33 |     
34 |     assert len(x.get_shape().as_list()) == 4
35 |     shape = (kernel[0], kernel[1], x.get_shape().as_list()[-1], output_dim)
36 |     _W, _b = HeUniform(shape)
37 |     if init_W is None: init_W = _W
38 |     if init_b is None: init_b = _b
39 | 
40 |     with tf.variable_scope(name):
41 |         W = get_variable(shape=shape, initializer=init_W, dtype=tf.float32, name='weight')
42 |         b = get_variable(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias')
43 |         
44 |         conv = tf.nn.conv2d(input=x, filter=W, strides=(1, stride[0], stride[1], 1), padding=padding)
45 |         if activation:
46 |             conv = activation(tf.nn.bias_add(conv, b))
47 |         else:
48 |             conv = tf.nn.bias_add(conv, b)
49 |     
50 |     return conv
51 | 
52 | 
53 | def dense(x, output_dim, activation=tf.nn.relu, init_W=None, init_b=None, name='dense'):
54 |     
55 |     if len(x.get_shape().as_list()) > 2:
56 |         shape = x.get_shape().as_list()
57 |         x = tf.reshape(x, shape=(-1, numpy.prod(shape[1:])))
58 | 
59 |     shape = (x.get_shape().as_list()[-1], output_dim)
60 |     _W, _b = HeUniform(shape)
61 |     if init_W is None: init_W = _W
62 |     if init_b is None: init_b = _b
63 | 
64 |     with tf.variable_scope(name):
65 |         W = get_variable(shape=shape, initializer=init_W, dtype=tf.float32, name='weight')
66 |         b = get_variable(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias')
67 |         
68 |         output = tf.matmul(x, W) + b
69 |         if activation:
70 |             output = activation(output)
71 |     
72 |     return output
73 | 


--------------------------------------------------------------------------------
/Chapter03/log/Acrobot/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "Acrobot.ckpt"
2 | all_model_checkpoint_paths: "Acrobot.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter03/log/Acrobot/events.out.tfevents.1506500394.ywz-WorkStation-T7400:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Acrobot/events.out.tfevents.1506500394.ywz-WorkStation-T7400


--------------------------------------------------------------------------------
/Chapter03/log/CartPole/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "CartPole.ckpt"
2 | all_model_checkpoint_paths: "CartPole.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter03/log/CartPole/events.out.tfevents.1506667268.ywz-WorkStation-T7400:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/CartPole/events.out.tfevents.1506667268.ywz-WorkStation-T7400


--------------------------------------------------------------------------------
/Chapter03/log/HalfCheetah/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "HalfCheetah.ckpt"
2 | all_model_checkpoint_paths: "HalfCheetah.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter03/log/HalfCheetah/events.out.tfevents.1506338471.ywz-WorkStation-T7400:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/HalfCheetah/events.out.tfevents.1506338471.ywz-WorkStation-T7400


--------------------------------------------------------------------------------
/Chapter03/log/Hopper/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "Hopper.ckpt"
2 | all_model_checkpoint_paths: "Hopper.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter03/log/Hopper/events.out.tfevents.1506658875.ywz-WorkStation-T7400:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Hopper/events.out.tfevents.1506658875.ywz-WorkStation-T7400


--------------------------------------------------------------------------------
/Chapter03/log/Pendulum/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "Pendulum.ckpt"
2 | all_model_checkpoint_paths: "Pendulum.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter03/log/Pendulum/events.out.tfevents.1506666537.ywz-WorkStation-T7400:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Pendulum/events.out.tfevents.1506666537.ywz-WorkStation-T7400


--------------------------------------------------------------------------------
/Chapter03/log/Reacher/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "Reacher.ckpt"
2 | all_model_checkpoint_paths: "Reacher.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter03/log/Reacher/events.out.tfevents.1506398906.ywz-WorkStation-T7400:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Reacher/events.out.tfevents.1506398906.ywz-WorkStation-T7400


--------------------------------------------------------------------------------
/Chapter03/log/Swimmer/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "Swimmer.ckpt"
2 | all_model_checkpoint_paths: "Swimmer.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter03/log/Swimmer/events.out.tfevents.1526197305.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Swimmer/events.out.tfevents.1526197305.ywz-PC


--------------------------------------------------------------------------------
/Chapter03/log/Walker2d/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "Walker2d.ckpt"
2 | all_model_checkpoint_paths: "Walker2d.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter03/log/Walker2d/events.out.tfevents.1506671852.ywz-WorkStation-T7400:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Walker2d/events.out.tfevents.1506671852.ywz-WorkStation-T7400


--------------------------------------------------------------------------------
/Chapter03/logger.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 22 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import sys
 7 | import tensorflow as tf
 8 | 
 9 | def delete_dir(path):
10 |     if tf.gfile.Exists(path):
11 |         tf.gfile.DeleteRecursively(path)
12 |     tf.gfile.MakeDirs(path)
13 |     return path
14 | 
15 | class Logger:
16 |     
17 |     def __init__(self, sess, directory):
18 |         
19 |         self.directory = directory
20 |         self.output_file = sys.stdout
21 |         
22 |         self.step = 0
23 |         self.summary_writer = tf.summary.FileWriter(delete_dir(directory), sess.graph)
24 |         self.print_buffer = []
25 |     
26 |     def clear(self):
27 |         self.step = 0
28 |     
29 |     def set_step(self, step):
30 |         self.step = step
31 |     
32 |     def add_summary(self, summary):
33 |         self.summary_writer.add_summary(summary, self.step)
34 |         summary_text = tf.Summary()
35 |         summary_text.ParseFromString(summary)
36 |         self.print_buffer += ["{}: {:5f}".format(v.tag, v.simple_value) for v in summary_text.value]
37 |     
38 |     def flush(self):
39 |         self.summary_writer.flush()
40 |         s = ["episode: {}".format(self.step)] + self.print_buffer
41 |         print(', '.join(s), file=self.output_file)
42 |         self.print_buffer = []
43 |         
44 |     


--------------------------------------------------------------------------------
/Chapter03/main.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 4 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | 
 7 | def main():
 8 |     pass
 9 | 
10 | if __name__ == "__main__":
11 |     main()
12 |     


--------------------------------------------------------------------------------
/Chapter03/mlp.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 5 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import tensorflow as tf
 7 | from layer import linear
 8 | 
 9 | 
10 | class MLP:
11 |     
12 |     def __init__(self, input_shape, output_size, hidden_sizes=(32, 32), 
13 |                  hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh,
14 |                  input_layer=None, name='mlp'):
15 |         
16 |         self.input_shape = input_shape
17 |         self.output_size = output_size
18 |         self.hidden_sizes = hidden_sizes
19 |         self.hidden_nonlinearity = hidden_nonlinearity
20 |         self.output_nonlinearity = output_nonlinearity
21 |         self.name = name
22 |         
23 |         if input_layer is None:
24 |             self.x = tf.placeholder(dtype=tf.float32, shape=input_shape, name='mlp_input')
25 |         else:
26 |             self.x = input_layer
27 |             
28 |         self.build()
29 |     
30 |     def build(self):
31 |         
32 |         with tf.variable_scope(self.name):
33 |             layer = self.x
34 |             for i, hidden_size in enumerate(self.hidden_sizes):
35 |                 layer = linear(layer, hidden_size, activation=self.hidden_nonlinearity, 
36 |                                init_b=tf.constant_initializer(0.0), name='hidden_layer_{}'.format(i))
37 |                 
38 |             self.y = linear(layer, self.output_size, activation=self.output_nonlinearity, 
39 |                             init_b=tf.constant_initializer(0.0), name='output_layer')
40 |             
41 |             self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
42 |         
43 |     def get_params(self):
44 |         return self.params
45 |     
46 |     def get_input_layer(self):
47 |         return self.x
48 |     
49 |     def get_output_layer(self):
50 |         return self.y
51 |         
52 | 
53 | if __name__ == "__main__":
54 |     
55 |     import numpy
56 |     
57 |     input_shape = (None, 10)
58 |     output_size = 5
59 |     mlp = MLP(input_shape=input_shape, output_size=output_size)
60 |     print(mlp.get_params())
61 |     
62 |     with tf.Session() as sess:
63 |         sess.run(tf.global_variables_initializer())
64 |         writer = tf.summary.FileWriter("log/", sess.graph_def)
65 |         
66 |         x = numpy.random.rand(1, input_shape[1])
67 |         y = sess.run(mlp.get_output_layer(), feed_dict={mlp.get_input_layer(): x})
68 |         print(y)
69 |     
70 |     
71 |     
72 |     
73 |         
74 |         


--------------------------------------------------------------------------------
/Chapter03/parallel.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 21 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import multiprocessing as mp
 7 | import traceback, random
 8 | import sys, numpy
 9 | import tensorflow as tf
10 | from joblib.pool import MemmapingPool
11 | 
12 | 
13 | class SharedGlobal(object):
14 |     pass
15 | 
16 | class StatefulPool(object):
17 |     
18 |     def __init__(self):
19 |         
20 |         self.n_parallel = 1
21 |         self.pool = None
22 |         self.queue = None
23 |         self.worker_queue = None
24 |         self.G = SharedGlobal()
25 | 
26 |     def initialize(self, n_parallel):
27 |         
28 |         self.n_parallel = n_parallel
29 |         
30 |         if self.pool is not None:
31 |             print("Warning: terminating existing pool")
32 |             self.pool.terminate()
33 |             self.queue.close()
34 |             self.worker_queue.close()
35 |             self.G = SharedGlobal()
36 |             
37 |         if n_parallel > 1:
38 |             self.queue = mp.Queue()
39 |             self.worker_queue = mp.Queue()
40 |             self.pool = MemmapingPool(self.n_parallel, temp_folder="/tmp")
41 | 
42 |     def run_each(self, runner, args_list=None):
43 | 
44 |         if args_list is None:
45 |             args_list = [tuple()] * self.n_parallel
46 |         assert len(args_list) == self.n_parallel
47 |         
48 |         if self.n_parallel > 1:
49 |             results = self.pool.map_async(worker_run_each, [(runner, args) for args in args_list])
50 |             for _ in range(self.n_parallel):
51 |                 self.worker_queue.get()
52 |             for _ in range(self.n_parallel):
53 |                 self.queue.put(None)
54 |             return results.get()
55 |         else:
56 |             return [runner(self.G, *args_list[0])]
57 | 
58 | singleton_pool = StatefulPool()
59 | 
60 | def worker_run_each(all_args):
61 |     try:
62 |         runner, args = all_args
63 |         # signals to the master that this task is up and running
64 |         singleton_pool.worker_queue.put(None)
65 |         # wait for the master to signal continuation
66 |         singleton_pool.queue.get()
67 |         return runner(singleton_pool.G, *args)
68 |     except Exception:
69 |         raise Exception("".join(traceback.format_exception(*sys.exc_info())))
70 | 
71 | def worker_init(G, i):
72 |     G.worker_id = i
73 | 
74 | def set_seed(G, seed):
75 |     seed %= 4294967294
76 |     random.seed(seed)
77 |     numpy.random.seed(seed)
78 |     tf.set_random_seed(seed)
79 |     
80 | def initialize(n_parallel):
81 |     singleton_pool.initialize(n_parallel)
82 |     singleton_pool.run_each(worker_init, [(i,) for i in range(singleton_pool.n_parallel)])
83 |     singleton_pool.run_each(set_seed, [(123456789 + i,) for i in range(singleton_pool.n_parallel)])
84 |     
85 | if __name__ == "__main__":
86 |     
87 |     thread_num = 4
88 |     initialize(thread_num)
89 |     
90 |     
91 |     
92 | 


--------------------------------------------------------------------------------
/Chapter03/policy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/policy/__init__.py


--------------------------------------------------------------------------------
/Chapter03/policy/categorical_mlp.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 27 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import tensorflow as tf
 8 | from mlp import MLP
 9 | from distribution.categorical import Categorical
10 | 
11 | 
12 | class CategoricalMLPPolicy:
13 |     
14 |     def __init__(self, 
15 |                  input_shape, 
16 |                  output_size, 
17 |                  hidden_sizes=(32, 32),
18 |                  hidden_nonlinearity=tf.nn.tanh):
19 |         
20 |         self.input_shape = input_shape
21 |         self.output_size = output_size
22 |         self.hidden_sizes = hidden_sizes
23 |         self.locals = locals()
24 |         
25 |         self.distribution = Categorical(output_size)
26 |         self.params = []
27 |         
28 |         with tf.variable_scope("policy"):
29 |             # Mean network
30 |             self.prob_mlp = MLP(input_shape=input_shape, 
31 |                                 output_size=output_size, 
32 |                                 hidden_sizes=hidden_sizes, 
33 |                                 hidden_nonlinearity=hidden_nonlinearity, 
34 |                                 output_nonlinearity=tf.nn.softmax,
35 |                                 name='prob')
36 |             
37 |             self.x = self.prob_mlp.get_input_layer()
38 |             self.prob = self.prob_mlp.get_output_layer()
39 |             self.params += self.prob_mlp.get_params()
40 |     
41 |     def get_locals(self):
42 |         arguments = {argc: argv for argc, argv in self.locals.items() if argc != 'self'}
43 |         return arguments
44 |     
45 |     def get_action(self, sess, observation):
46 |         
47 |         if observation.ndim == 1:
48 |             observation = observation.reshape((1, observation.size))
49 |             
50 |         prob = sess.run(self.prob, feed_dict={self.x: observation})[0]
51 |         idx = numpy.random.choice(range(self.output_size), p=prob)
52 |         action = numpy.zeros((self.output_size,))
53 |         action[idx] = 1
54 |         
55 |         return action, {'prob': prob}
56 |     
57 |     def get_actions(self, sess, observation):
58 |         
59 |         probs = sess.run(self.prob, feed_dict={self.x: observation})
60 |         actions = numpy.zeros((probs.shape[0], self.output_size))
61 |         for i, prob in enumerate(probs):
62 |             idx = numpy.random.choice(range(self.output_size), p=prob)
63 |             actions[i][idx] = 1
64 |             
65 |         return actions, {'prob': probs}
66 |     
67 |     def get_input(self):
68 |         return self.x
69 |     
70 |     def get_dist_info(self):
71 |         return {'prob': self.prob}
72 |     
73 |     def get_params(self):
74 |         return self.params
75 |     
76 |     @staticmethod
77 |     def copy(args):
78 |         return CategoricalMLPPolicy(**args)
79 |     
80 | if __name__ == "__main__":
81 |     
82 |     input_shape = (None, 10)
83 |     output_size = 5
84 |     
85 |     policy = CategoricalMLPPolicy(input_shape=input_shape,
86 |                                   output_size=output_size)
87 |     
88 |     for param in policy.get_params():
89 |         print(param)
90 |     
91 |     with tf.Session() as sess:
92 |         sess.run(tf.global_variables_initializer())
93 |         
94 |         observation = numpy.random.rand(2, input_shape[1])
95 |         action = policy.get_actions(sess, observation)
96 |         print(action)
97 |         
98 |             
99 |             


--------------------------------------------------------------------------------
/Chapter03/policy/deterministic_mlp.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 5 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import tensorflow as tf
 7 | from mlp import MLP
 8 | 
 9 | 
10 | class DeterministicMLPPolicy:
11 |     
12 |     def __init__(self, 
13 |                  input_shape, 
14 |                  output_size, 
15 |                  hidden_sizes=(32, 32),
16 |                  hidden_nonlinearity=tf.nn.relu, 
17 |                  output_nonlinearity=tf.nn.tanh):
18 |         
19 |         self.input_shape = input_shape
20 |         self.output_size = output_size
21 |         self.locals = locals()
22 |         
23 |         with tf.variable_scope("policy"):
24 |             self.mlp = MLP(input_shape=input_shape, 
25 |                            output_size=output_size, 
26 |                            hidden_sizes=hidden_sizes, 
27 |                            hidden_nonlinearity=hidden_nonlinearity, 
28 |                            output_nonlinearity=output_nonlinearity)
29 |         
30 |         self.x = self.mlp.get_input_layer()
31 |         self.y = self.mlp.get_output_layer()
32 |     
33 |     def get_locals(self):
34 |         arguments = {argc: argv for argc, argv in self.locals.items() if argc != 'self'}
35 |         return arguments
36 |     
37 |     def get_action(self, sess, observation):
38 |         if observation.ndim == 1:
39 |             observation = observation.reshape((1, observation.size))
40 |         output = sess.run(self.y, feed_dict={self.x: observation})
41 |         return output[0]
42 |     
43 |     def get_actions(self, sess, observation):
44 |         return sess.run(self.y, feed_dict={self.x: observation})
45 |     
46 |     def get_params(self):
47 |         return self.mlp.get_params()
48 |     
49 |     @staticmethod
50 |     def copy(args):
51 |         return DeterministicMLPPolicy(**args)
52 |     
53 |     
54 |     


--------------------------------------------------------------------------------
/Chapter03/ppo.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 26 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import tensorflow as tf
 7 | from utils import iterate_minibatches
 8 | 
 9 | # Proximal Policy Optimization Algorithms
10 | class PPO:
11 |     
12 |     def __init__(self, 
13 |                  policy, 
14 |                  batch_size=1000, 
15 |                  learning_rate=1e-3, 
16 |                  epsilon=0.2):
17 |         
18 |         self.policy = policy
19 |         self.learning_rate = learning_rate
20 |         self.epsilon = epsilon
21 |         self.batch_size = batch_size
22 |         
23 |         self.x = self.policy.get_input()
24 |         self.action_dim = self.policy.output_size
25 |         self.dist = self.policy.distribution
26 |         
27 |         self.build_formula()
28 |         
29 |     def build_formula(self):
30 |         
31 |         self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.action_dim), name='action')
32 |         self.advantage = tf.placeholder(dtype=tf.float32, shape=(None,), name='action')
33 |         
34 |         dist_vars = self.policy.get_dist_info()
35 |         old_dist_vars = {k: tf.placeholder(tf.float32, shape=[None]+list(shape), name='old_dist_{}'.format(k))
36 |                          for k, shape in self.dist.specs()}
37 |         old_dist_vars_list = [old_dist_vars[k] for k in self.dist.keys()]
38 |         
39 |         lr = self.dist.likelihood_ratio_tf(self.action, old_dist_vars, dist_vars)
40 |         first_term = lr * self.advantage
41 |         second_term = tf.clip_by_value(lr, 1 - self.epsilon, 1 + self.epsilon) * self.advantage
42 |         loss = -tf.reduce_mean(tf.minimum(first_term, second_term))
43 |         
44 |         self.inputs_tensors = [self.x, self.action, self.advantage] + old_dist_vars_list
45 |         self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss, 
46 |                                                                             var_list=self.policy.get_params())
47 |         # Add summaries
48 |         tf.summary.scalar("loss", loss, collections=['ppo'])
49 |         self.summary_op = tf.summary.merge_all('ppo')
50 |         
51 |     def optimize_policy(self, sess, samples, logger=None, **args):
52 |         
53 |         obs = samples['observations']
54 |         actions = samples['actions']
55 |         advantages = samples['advantages']
56 |         dist_vars = [samples['infos'][k] for k in self.dist.keys()]
57 |         
58 |         inputs = [obs, actions, advantages] + dist_vars
59 |         feed_dict = dict(list(zip(self.inputs_tensors, inputs)))
60 |         if self.batch_size is not None and obs.shape[0] >= self.batch_size:
61 |             for vs in iterate_minibatches(inputs, self.batch_size, shuffle=True):
62 |                 sess.run(self.train_op, feed_dict=dict(list(zip(self.inputs_tensors, vs))))
63 |         else:
64 |             sess.run(self.train_op, feed_dict=feed_dict)
65 | 
66 |         if logger:
67 |             summary_str = sess.run(self.summary_op, feed_dict=feed_dict)
68 |             logger.add_summary(summary_str)
69 |             
70 |             


--------------------------------------------------------------------------------
/Chapter03/q_network.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on Mar 25, 2018
  3 | 
  4 | @author: ywz
  5 | '''
  6 | import tensorflow as tf
  7 | from layers import conv2d, dense
  8 | 
  9 | 
 10 | class QNetwork:
 11 |     
 12 |     def __init__(self, input_shape=(84, 84, 4), n_outputs=4, 
 13 |                  network_type='cnn', scope='q_network'):
 14 |         
 15 |         self.width = input_shape[0]
 16 |         self.height = input_shape[1]
 17 |         self.channel = input_shape[2]
 18 |         self.n_outputs = n_outputs
 19 |         self.network_type = network_type
 20 |         self.scope = scope
 21 |         
 22 |         # Frame images
 23 |         self.x = tf.placeholder(dtype=tf.float32, 
 24 |                                 shape=(None, self.channel, self.width, self.height))
 25 |         # Estimates of Q-value
 26 |         self.y = tf.placeholder(dtype=tf.float32, shape=(None,))
 27 |         # Selected actions
 28 |         self.a = tf.placeholder(dtype=tf.int32, shape=(None,))
 29 |         
 30 |         with tf.variable_scope(scope):
 31 |             self.build()
 32 |             self.build_loss()
 33 |         
 34 |     def build(self):
 35 |         
 36 |         self.net = {}
 37 |         self.net['input'] = tf.transpose(self.x, perm=(0, 2, 3, 1))
 38 |             
 39 |         if self.network_type == 'cnn':
 40 |             self.net['conv1'] = conv2d(self.net['input'], 32, kernel=(8, 8), stride=(4, 4), 
 41 |                                        init_b=tf.constant_initializer(0.01), name='conv1')
 42 |             self.net['conv2'] = conv2d(self.net['input'], 64, kernel=(4, 4), stride=(2, 2), 
 43 |                                        init_b=tf.constant_initializer(0.01), name='conv2')
 44 |             self.net['conv3'] = conv2d(self.net['input'], 64, kernel=(3, 3), stride=(1, 1), 
 45 |                                        init_b=tf.constant_initializer(0.01), name='conv3')
 46 |             self.net['feature'] = dense(self.net['conv2'], 512, 
 47 |                                         init_b=tf.constant_initializer(0.01), name='fc1')
 48 |         elif self.network_type == 'cnn_nips':
 49 |             self.net['conv1'] = conv2d(self.net['input'], 16, kernel=(8, 8), stride=(4, 4), 
 50 |                                        init_b=tf.constant_initializer(0.01), name='conv1')
 51 |             self.net['conv2'] = conv2d(self.net['conv1'], 32, kernel=(4, 4), stride=(2, 2), 
 52 |                                        init_b=tf.constant_initializer(0.01), name='conv2')
 53 |             self.net['feature'] = dense(self.net['conv2'], 256, 
 54 |                                         init_b=tf.constant_initializer(0.01), name='fc1')
 55 |         elif self.network_type == 'mlp':
 56 |             self.net['fc1'] = dense(self.net['input'], 50, 
 57 |                                     init_b=tf.constant_initializer(0.0), name='fc1')
 58 |             self.net['feature'] = dense(self.net['fc1'], 50, 
 59 |                                         init_b=tf.constant_initializer(0.0), name='fc2')
 60 |         else:
 61 |             raise NotImplementedError('Unknown network type: {}'.format(self.network_type))
 62 |             
 63 |         self.net['values'] = dense(self.net['feature'], self.n_outputs, activation=None,
 64 |                                    init_b=tf.constant_initializer(0.0), name='values')
 65 |         
 66 |         self.net['q_value'] = tf.reduce_max(self.net['values'], axis=1, name='q_value')
 67 |         self.net['q_action'] = tf.argmax(self.net['values'], axis=1, 
 68 |                                          name='q_action', output_type=tf.int32)
 69 |         
 70 |         self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
 71 |                                       tf.get_variable_scope().name)
 72 |     
 73 |     def build_loss(self):
 74 |         
 75 |         indices = tf.transpose(tf.stack([tf.range(tf.shape(self.a)[0]), self.a], axis=0))
 76 |         value = tf.gather_nd(self.net['values'], indices, name='action_value')
 77 |         
 78 |         self.loss = 0.5 * tf.reduce_mean(tf.square((value - self.y)))
 79 |         self.gradient = tf.gradients(self.loss, self.vars)
 80 |         
 81 |         tf.summary.scalar("loss", self.loss, collections=['q_network'])
 82 |         self.summary_op = tf.summary.merge_all('q_network')
 83 |         
 84 |     def get_q_value(self, sess, state):
 85 |         return sess.run(self.net['q_value'], feed_dict={self.x: state})
 86 |     
 87 |     def get_q_action(self, sess, state):
 88 |         return sess.run(self.net['q_action'], feed_dict={self.x: state})
 89 |     
 90 |     def get_feed_dict(self, states, actions, values):
 91 |         return {self.x: states, self.a: actions, self.y: values}
 92 |                 
 93 |     def get_clone_op(self, network):
 94 |         new_vars = {v.name.replace(network.scope, ''): v for v in network.vars}
 95 |         return [tf.assign(v, new_vars[v.name.replace(self.scope, '')]) for v in self.vars]
 96 |     
 97 | 
 98 | if __name__ == "__main__":
 99 |     import numpy
100 |     
101 |     num_actions = 4
102 |     batch_size = 5
103 |     network = QNetwork(n_outputs=num_actions)
104 |     
105 |     state = numpy.random.rand(batch_size, 4, 84, 84)
106 |     values = numpy.random.rand(batch_size)
107 |     actions = numpy.random.randint(num_actions, size=batch_size)
108 |     
109 |     with tf.Session() as sess:
110 |         summary_writer = tf.summary.FileWriter('log/', sess.graph)
111 |         sess.run(tf.global_variables_initializer())
112 |         
113 |         q_values = sess.run(network.net['values'], feed_dict={network.x: state})
114 |         q_value = network.get_q_value(sess, state)
115 |         q_action = network.get_q_action(sess, state)
116 |         
117 |         print(q_values)
118 |         print(q_value)
119 |         print(q_action)
120 |     
121 | 


--------------------------------------------------------------------------------
/Chapter03/replay_memory.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Mar 25, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy, random
 7 | from collections import deque
 8 | 
 9 | 
10 | class ReplayMemory:
11 |     
12 |     def __init__(self, history_len=4, capacity=1000000, batch_size=32, input_scale=255.0):
13 |         
14 |         self.capacity = capacity
15 |         self.history_length = history_len
16 |         self.batch_size = batch_size
17 |         self.input_scale = input_scale
18 |         
19 |         self.frames = deque([])
20 |         self.others = deque([])
21 |     
22 |     def add(self, frame, action, r, termination):
23 |         
24 |         if len(self.frames) == self.capacity:
25 |             self.frames.popleft()
26 |             self.others.popleft()
27 |         self.frames.append(frame)
28 |         self.others.append((action, r, termination))
29 |         
30 |     def add_nullops(self, init_frame):
31 |         for _ in range(self.history_length):
32 |             self.add(init_frame, 0, 0, 0)
33 |     
34 |     def phi(self, new_frame):
35 |         assert len(self.frames) > self.history_length
36 |         images = [new_frame] + [self.frames[-1-i] for i in range(self.history_length-1)]
37 |         return numpy.concatenate(images, axis=0)
38 |     
39 |     def _phi(self, index):
40 |         images = [self.frames[index-i] for i in range(self.history_length)]
41 |         return numpy.concatenate(images, axis=0)
42 |     
43 |     def sample(self):
44 |         
45 |         while True:
46 |             
47 |             index = random.randint(a=self.history_length-1, b=len(self.frames)-2)
48 |             infos = [self.others[index-i] for i in range(self.history_length)]
49 |             # Check if termination=1 before "index"
50 |             flag = False
51 |             for i in range(1, self.history_length):
52 |                 if infos[i][2] == 1:
53 |                     flag = True
54 |                     break
55 |             if flag:
56 |                 continue
57 |             
58 |             state = self._phi(index)
59 |             new_state = self._phi(index+1)
60 |             action, r, termination = self.others[index]
61 |             state = numpy.asarray(state / self.input_scale, dtype=numpy.float32)
62 |             new_state = numpy.asarray(new_state / self.input_scale, dtype=numpy.float32)
63 |                 
64 |             return (state, action, r, new_state, termination)
65 | 
66 | 


--------------------------------------------------------------------------------
/Chapter03/simulator.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 18 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import gym, numpy
 7 | from gym import spaces
 8 | 
 9 | 
10 | class Simulator:
11 |     
12 |     # Supported tasks: 
13 |     # v1: Reacher, HalfCheetah, Hopper, Swimmer, Walker2d, Ant, Humanoid
14 |     # v0: CartPole, Acrobot, Pendulum
15 |     def __init__(self, task='Swimmer'):
16 |         
17 |         self.task = task
18 |         try:
19 |             self.env = gym.make('{}-v1'.format(task))
20 |         except:
21 |             self.env = gym.make('{}-v2'.format(task))
22 |         self.env.reset()
23 |         
24 |         if type(self.env.action_space) == spaces.Box:        
25 |             assert len(self.env.action_space.shape) == 1
26 |             self.action_dim = self.env.action_space.shape[0]
27 |             self.action_type = 'continuous'
28 |         elif type(self.env.action_space) == spaces.Discrete:
29 |             self.action_dim = self.env.action_space.n
30 |             self.action_type = 'discrete'
31 |         else:
32 |             raise NotImplementedError
33 |         
34 |         assert len(self.env.observation_space.shape) == 1
35 |         self.obsevation_dim = self.env.observation_space.shape[0]
36 |         self.total_reward = 0
37 |     
38 |     def reset(self):
39 |         self.total_reward = 0
40 |         return self.env.reset()
41 |     
42 |     def play(self, action):
43 |         
44 |         termination = 0
45 |         if self.action_type == 'continuous':
46 |             observation, reward, done, _ = self.env.step(action)
47 |         elif self.action_type == 'discrete':
48 |             observation, reward, done, _ = self.env.step(numpy.argmax(action))
49 |         
50 |         if done: termination = 1
51 |         self.total_reward += reward
52 |         
53 |         return observation, reward, termination
54 |     
55 |     def render(self):
56 |         self.env.render()
57 |     
58 |     def get_total_reward(self):
59 |         return self.total_reward
60 |     
61 |     
62 | if __name__ == "__main__":
63 | 
64 |     agent = Simulator(task='Swimmer')
65 |     
66 |     for _ in range(10):
67 |         observation = agent.reset()
68 |         while True:
69 |             action = numpy.random.uniform(low=-1.0, high=1.0, size=(agent.action_dim,))
70 |             observation, reward, termination = agent.play(action)
71 |             
72 |             print("Observation: {}".format(observation))
73 |             print("Action: {}".format(action))
74 |             print("Reward: {}".format(reward))
75 |             print("Termination: {}".format(termination))
76 |             
77 |             if termination:
78 |                 break
79 |             agent.render()
80 |             
81 |     
82 |     
83 |     


--------------------------------------------------------------------------------
/Chapter03/test.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 22 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import os
 7 | import argparse
 8 | import tensorflow as tf
 9 | from simulator import Simulator
10 | from sampler import Sampler
11 | from policy.gaussian_mlp import GaussianMLPPolicy
12 | from policy.categorical_mlp import CategoricalMLPPolicy
13 | 
14 | 
15 | def test(task, 
16 |          num_episodes=10,
17 |          policy_network_hidden_sizes=(32, 32),
18 |          policy_adaptive_std=False):
19 |     
20 |     directory = 'log/{}/'.format(task)
21 |     simulator = Simulator(task=task)
22 |     
23 |     input_shape = (None, simulator.obsevation_dim)
24 |     output_size = simulator.action_dim
25 |     
26 |     if simulator.action_type == 'continuous':
27 |         policy_network = GaussianMLPPolicy(input_shape=input_shape,
28 |                                            output_size=output_size,
29 |                                            hidden_sizes=policy_network_hidden_sizes,
30 |                                            adaptive_std=policy_adaptive_std,
31 |                                            std_hidden_sizes=policy_network_hidden_sizes)
32 |     elif simulator.action_type == 'discrete':
33 |         policy_network = CategoricalMLPPolicy(input_shape=input_shape,
34 |                                               output_size=output_size,
35 |                                               hidden_sizes=policy_network_hidden_sizes)
36 |     
37 |     sampler = Sampler(simulator, policy_network)
38 |     
39 |     with tf.Session() as sess:
40 |         saver = tf.train.Saver()
41 |         checkpoint_path = os.path.join(directory, '{}.ckpt'.format(task))
42 |         saver.restore(sess, checkpoint_path)
43 |         
44 |         for i in range(num_episodes):
45 |             path = sampler.rollout(sess, max_path_length=1000, render=True)
46 |             print("epsiode {}, reward {}".format(i, path['total_reward']))
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     
51 |     parser = argparse.ArgumentParser(description=None)
52 |     parser.add_argument('-t', '--task', default='Swimmer', 
53 |                         type=str, help='Tasks: Swimmer, Walker2d, Reacher, HalfCheetah, Hopper, Ant, Humanoid')
54 |     args = parser.parse_args()
55 |     
56 |     test(task=args.task, policy_network_hidden_sizes=(32, 32))
57 |     
58 |     
59 |             
60 |     


--------------------------------------------------------------------------------
/Chapter03/trpo.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on 18 Sep 2017
  3 | 
  4 | @author: ywz
  5 | '''
  6 | import numpy, math
  7 | import tensorflow as tf
  8 | 
  9 | 
 10 | class TRPO:
 11 |     
 12 |     def __init__(self, policy, optimizer, step_size):
 13 |         
 14 |         self.policy = policy
 15 |         self.optimizer = optimizer
 16 |         self.step_size = step_size
 17 |         
 18 |         self.x = self.policy.get_input()
 19 |         self.action_dim = self.policy.output_size
 20 |         self.dist = self.policy.distribution
 21 |         
 22 |         self.build_formula()
 23 |         
 24 |     def build_formula(self):
 25 |         
 26 |         self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.action_dim), name='action')
 27 |         self.advantage = tf.placeholder(dtype=tf.float32, shape=(None,), name='action')
 28 |         
 29 |         dist_vars = self.policy.get_dist_info()
 30 |         old_dist_vars = {k: tf.placeholder(tf.float32, shape=[None]+list(shape), name='old_dist_{}'.format(k))
 31 |                          for k, shape in self.dist.specs()}
 32 |         old_dist_vars_list = [old_dist_vars[k] for k in self.dist.keys()]
 33 |         
 34 |         kl = self.dist.kl_tf(old_dist_vars, dist_vars)
 35 |         lr = self.dist.likelihood_ratio_tf(self.action, old_dist_vars, dist_vars)
 36 |         mean_kl = tf.reduce_mean(kl)
 37 |         loss = -tf.reduce_mean(lr * self.advantage)
 38 |         
 39 |         self.inputs_tensors = [self.x, self.action, self.advantage] + old_dist_vars_list
 40 |         self.optimizer.build(loss=loss, 
 41 |                              leq_constraint=(mean_kl, self.step_size), 
 42 |                              params=self.policy.get_params(), 
 43 |                              inputs=self.inputs_tensors)
 44 |         # Add summaries
 45 |         tf.summary.scalar("loss", loss, collections=['trpo'])
 46 |         tf.summary.scalar("mean_kl", mean_kl, collections=['trpo'])
 47 |         self.summary_op = tf.summary.merge_all('trpo')
 48 |         
 49 |     def optimize_policy(self, sess, samples, logger=None, subsample_rate=0.5):
 50 |         
 51 |         if subsample_rate < 1.0:
 52 |             n = len(samples['rewards'])
 53 |             idx = numpy.random.choice(n, int(math.floor(n * subsample_rate)), replace=False)
 54 |             obs = samples['observations'][idx]
 55 |             actions = samples['actions'][idx]
 56 |             advantages = samples['advantages'][idx]
 57 |             dist_vars = [samples['infos'][k][idx] for k in self.dist.keys()]
 58 |         else:
 59 |             obs = samples['observations']
 60 |             actions = samples['actions']
 61 |             advantages = samples['advantages']
 62 |             dist_vars = [samples['infos'][k] for k in self.dist.keys()]
 63 |         
 64 |         inputs = [obs, actions, advantages] + dist_vars
 65 |         self.optimizer.optimize(sess, input_vals=inputs)
 66 |         
 67 |         if logger:
 68 |             feed_dict = dict(list(zip(self.inputs_tensors, inputs)))
 69 |             summary_str = sess.run(self.summary_op, feed_dict=feed_dict)
 70 |             logger.add_summary(summary_str)
 71 |         
 72 | 
 73 | if __name__ == "__main__":
 74 |     from policy.gaussian_mlp import GaussianMLPPolicy
 75 |     from optimizer import ConjugateOptimizer
 76 |     
 77 |     input_shape = (None, 10)
 78 |     output_size = 5
 79 |     
 80 |     policy = GaussianMLPPolicy(input_shape=input_shape,
 81 |                                output_size=output_size,
 82 |                                learn_std=True,
 83 |                                adaptive_std=False)
 84 |     optimizer = ConjugateOptimizer()
 85 |     
 86 |     trpo = TRPO(policy, optimizer, step_size=0.01)
 87 |     with tf.Session() as sess:
 88 |         sess.run(tf.global_variables_initializer())
 89 |         
 90 |         samples = {}
 91 |         samples['observations'] = numpy.random.rand(10, input_shape[1])
 92 |         samples['actions'] = numpy.random.rand(10, output_size)
 93 |         samples['advantages'] = numpy.random.rand(10)
 94 |         samples['infos'] = {'mean': numpy.random.rand(10, output_size),
 95 |                             'log_var': numpy.random.rand(10, output_size)}
 96 |         
 97 |         trpo.optimize_policy(sess, samples, subsample_rate=1.0)
 98 |         print("Finished.")
 99 |             
100 |             


--------------------------------------------------------------------------------
/Chapter03/utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 4 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import scipy.signal
 8 | import tensorflow as tf
 9 | 
10 | def flatten_tensor_variables(ts):
11 |     return tf.concat(axis=0, values=[tf.reshape(x, [-1]) for x in ts])
12 | 
13 | def flatten_tensors(tensors):
14 |     if len(tensors) > 0:
15 |         return numpy.concatenate([numpy.reshape(x, [-1]) for x in tensors])
16 |     else:
17 |         return numpy.asarray([])
18 |     
19 | def unflatten_tensors(flattened, tensor_shapes):
20 |     tensor_sizes = list(map(numpy.prod, tensor_shapes))
21 |     indices = numpy.cumsum(tensor_sizes)[:-1]
22 |     return [numpy.reshape(pair[0], pair[1]) for pair in zip(numpy.split(flattened, indices), tensor_shapes)]
23 | 
24 | def get_param_values(sess, params, flatten=True):
25 |     values = sess.run(params)
26 |     if flatten:
27 |         values = flatten_tensors(values)
28 |     return values
29 | 
30 | def get_param_assign_ops(params):
31 |     
32 |     assign_ops = []
33 |     input_tensors = []
34 |     
35 |     for param in params:
36 |         v = tf.placeholder(dtype=param.dtype, shape=param.get_shape())
37 |         assign_ops.append(tf.assign(param, v))
38 |         input_tensors.append(v)
39 |         
40 |     return assign_ops, input_tensors
41 | 
42 | def set_param_values(sess, assign_ops, input_tensors, values, flatten=True):
43 |     
44 |     if flatten:
45 |         shapes = [p.get_shape().as_list() for p in input_tensors]
46 |         values = unflatten_tensors(values, shapes)
47 |     
48 |     feed_dict = dict(list(zip(input_tensors, values)))
49 |     sess.run(assign_ops, feed_dict=feed_dict)
50 | 
51 | def discount_cumsum(x, discount):
52 |     # See https://docs.scipy.org/doc/scipy/reference/tutorial/signal.html#difference-equation-filtering
53 |     # Here, we have y[t] - discount*y[t+1] = x[t]
54 |     # or rev(y)[t] - discount*rev(y)[t-1] = rev(x)[t]
55 |     return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
56 | 
57 | def iterate_minibatches(input_list=None, batch_size=None, shuffle=False):
58 |     
59 |     if batch_size is None:
60 |         batch_size = len(input_list[0])
61 |     assert all(len(x) == len(input_list[0]) for x in input_list)
62 | 
63 |     if shuffle:
64 |         indices = numpy.arange(len(input_list[0]))
65 |         numpy.random.shuffle(indices)
66 |         
67 |     for start_idx in range(0, len(input_list[0]), batch_size):
68 |         idx = indices[start_idx:start_idx + batch_size] if shuffle else slice(start_idx, start_idx + batch_size)
69 |         yield [r[idx] for r in input_list]
70 |         
71 |     


--------------------------------------------------------------------------------
/Chapter03/value/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on 20 Sep 2017
3 | 
4 | @author: ywz
5 | '''
6 | 


--------------------------------------------------------------------------------
/Chapter03/value/linear_fitting.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 20 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | 
 8 | 
 9 | class LinearFitting:
10 |     
11 |     def __init__(self):
12 |         self.beta = None
13 |         self.sess = None
14 |     
15 |     def set_session(self, sess):
16 |         self.sess = sess
17 |     
18 |     def feature(self, path):
19 |         o = numpy.clip(path['observations'], -10, 10)
20 |         l = len(path["rewards"])
21 |         al = numpy.arange(l).reshape(-1, 1) / 100.0
22 |         return numpy.concatenate([o, o ** 2, al, al ** 2, al ** 3, numpy.ones((l, 1))], axis=1)
23 |     
24 |     def train(self, paths):
25 |         
26 |         features = numpy.concatenate([self.feature(path) for path in paths])
27 |         returns = numpy.concatenate([path['returns'] for path in paths])
28 | 
29 |         reg_coeff = 1e-5
30 |         for _ in range(5):
31 |             self.beta = numpy.linalg.lstsq(features.T.dot(features) + 
32 |                                            reg_coeff * numpy.identity(features.shape[1]), 
33 |                                            features.T.dot(returns))[0]
34 |             if not numpy.any(numpy.isnan(self.beta)):
35 |                 break
36 |             reg_coeff *= 10
37 |     
38 |     def predict(self, path):
39 |         if self.beta is None:
40 |             return numpy.zeros((len(path['rewards'],)))
41 |         else:
42 |             return self.feature(path).dot(self.beta)
43 |         
44 |             


--------------------------------------------------------------------------------
/Chapter03/value/mlp_fitting.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 26 Sep 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import tensorflow as tf
 8 | from mlp import MLP
 9 | from utils import iterate_minibatches
10 | 
11 | 
12 | class MLPFitting:
13 |     
14 |     def __init__(self,
15 |                  input_shape, 
16 |                  hidden_sizes=(32, 32),
17 |                  hidden_nonlinearity=tf.nn.tanh,
18 |                  learning_rate=3e-4,
19 |                  batch_size=1000):
20 |         
21 |         self.input_shape = input_shape
22 |         self.hidden_sizes = hidden_sizes
23 |         self.learning_rate = learning_rate
24 |         self.batch_size = batch_size
25 |         self.sess = None
26 |         
27 |         with tf.variable_scope("mlp_fitting"):
28 |             self.mlp = MLP(input_shape=input_shape, 
29 |                            output_size=1, 
30 |                            hidden_sizes=hidden_sizes, 
31 |                            hidden_nonlinearity=hidden_nonlinearity, 
32 |                            output_nonlinearity=None,
33 |                            name='value')
34 |             
35 |             self.x = self.mlp.get_input_layer()
36 |             self.y = tf.reshape(self.mlp.get_output_layer(), shape=(-1,))
37 |             self.params = self.mlp.get_params()
38 |             
39 |             self.z = tf.placeholder(dtype=tf.float32, shape=(None,), name='z')
40 |             loss = tf.reduce_mean(tf.square(self.z - self.y))
41 |             self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss, var_list=self.params)
42 |         
43 |     def set_session(self, sess):
44 |         self.sess = sess
45 |         
46 |     def train(self, paths):
47 |         assert self.sess is not None
48 |         obs = numpy.concatenate([path['observations'] for path in paths])
49 |         returns = numpy.concatenate([path['returns'] for path in paths])
50 |         if self.batch_size is not None and obs.shape[0] >= self.batch_size:
51 |             for x, z in iterate_minibatches([obs, returns], self.batch_size, shuffle=True):
52 |                 self.sess.run(self.train_op, feed_dict={self.x: x, self.z: z})
53 |         else:
54 |             self.sess.run(self.train_op, feed_dict={self.x: obs, self.z: returns})
55 |     
56 |     def predict(self, path):
57 |         assert self.sess is not None
58 |         return self.sess.run(self.y, feed_dict={self.x: path['observations']})
59 | 
60 | 
61 | if __name__ == "__main__":
62 |     
63 |     input_shape = (None, 5)
64 |     mlp = MLPFitting(input_shape)
65 |     
66 |     path = {'observations': numpy.random.rand(1000, 5),
67 |             'returns': numpy.random.rand(1000)}
68 |     
69 |     with tf.Session() as sess:
70 |         sess.run(tf.global_variables_initializer())
71 |         mlp.set_session(sess)
72 |         mlp.train(paths=[path])
73 |         print(mlp.predict(path))
74 |     
75 | 
76 |     


--------------------------------------------------------------------------------
/Chapter04/actor_critic_net.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on Apr 10, 2018
  3 | 
  4 | @author: ywz
  5 | '''
  6 | import tensorflow as tf
  7 | from actor_network import ActorNetwork
  8 | from critic_network import CriticNetwork
  9 | 
 10 | 
 11 | class ActorCriticNet:
 12 |     
 13 |     def __init__(self, input_dim, action_dim, 
 14 |                  critic_layers, actor_layers, actor_activation, 
 15 |                  scope='ac_network'):
 16 |         
 17 |         self.input_dim = input_dim
 18 |         self.action_dim = action_dim
 19 |         self.scope = scope
 20 |         
 21 |         self.x = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='x')
 22 |         self.y = tf.placeholder(shape=(None,), dtype=tf.float32, name='y')
 23 |         
 24 |         with tf.variable_scope(scope):
 25 |             self.actor_network = ActorNetwork(self.x, action_dim, 
 26 |                                               hidden_layers=actor_layers, 
 27 |                                               activation=actor_activation)
 28 |             
 29 |             self.critic_network = CriticNetwork(self.x, 
 30 |                                                 self.actor_network.get_output_layer(),
 31 |                                                 hidden_layers=critic_layers)
 32 |             
 33 |             self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
 34 |                                           tf.get_variable_scope().name)
 35 |             self._build()
 36 |     
 37 |     def _build(self):
 38 |         
 39 |         value = self.critic_network.get_output_layer()
 40 |         
 41 |         actor_loss = -tf.reduce_mean(value)
 42 |         self.actor_vars = self.actor_network.get_params()
 43 |         self.actor_grad = tf.gradients(actor_loss, self.actor_vars)
 44 |         tf.summary.scalar("actor_loss", actor_loss, collections=['actor'])
 45 |         self.actor_summary = tf.summary.merge_all('actor')
 46 |         
 47 |         critic_loss = 0.5 * tf.reduce_mean(tf.square((value - self.y)))
 48 |         self.critic_vars = self.critic_network.get_params()
 49 |         self.critic_grad = tf.gradients(critic_loss, self.critic_vars)
 50 |         tf.summary.scalar("critic_loss", critic_loss, collections=['critic'])
 51 |         self.critic_summary = tf.summary.merge_all('critic')
 52 |     
 53 |     def get_action(self, sess, state):
 54 |         return self.actor_network.get_action(sess, state)
 55 |     
 56 |     def get_value(self, sess, state):
 57 |         return self.critic_network.get_value(sess, state)
 58 |     
 59 |     def get_action_value(self, sess, state, action):
 60 |         return self.critic_network.get_action_value(sess, state, action)
 61 |     
 62 |     def get_actor_feed_dict(self, state):
 63 |         return {self.x: state}
 64 |     
 65 |     def get_critic_feed_dict(self, state, action, target):
 66 |         return {self.x: state, self.y: target, 
 67 |                 self.critic_network.input_action: action}
 68 |     
 69 |     def get_clone_op(self, network, tau=0.9):
 70 |         update_ops = []
 71 |         new_vars = {v.name.replace(network.scope, ''): v for v in network.vars}
 72 |         for v in self.vars:
 73 |             u = (1 - tau) * v + tau * new_vars[v.name.replace(self.scope, '')]
 74 |             update_ops.append(tf.assign(v, u))
 75 |         return update_ops
 76 |     
 77 | 
 78 | if __name__ == "__main__":
 79 |     import numpy
 80 |     
 81 |     batch_size = 5
 82 |     input_dim = 10
 83 |     action_dim = 3
 84 |     hidden_layers = [20, 20]
 85 |     network = ActorCriticNet(input_dim, action_dim, 
 86 |                              hidden_layers, hidden_layers, 
 87 |                              actor_activation=tf.nn.relu)
 88 |     
 89 |     state = numpy.random.rand(batch_size, input_dim)
 90 |     action = numpy.random.rand(batch_size, action_dim)
 91 |     with tf.Session() as sess:
 92 |         summary_writer = tf.summary.FileWriter('log/', sess.graph)
 93 |         sess.run(tf.global_variables_initializer())
 94 |         
 95 |         a = network.get_action(sess, state)
 96 |         v = network.get_value(sess, state)
 97 |         assert numpy.sum(numpy.fabs(v - network.get_action_value(sess, state, action))) > 1e-3
 98 |         assert numpy.sum(numpy.fabs(v - network.get_action_value(sess, state, a))) < 1e-8
 99 |         print("Pass")
100 | 


--------------------------------------------------------------------------------
/Chapter04/actor_network.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Apr 10, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import tensorflow as tf
 7 | from layers import dense
 8 | 
 9 | 
10 | class ActorNetwork:
11 |     
12 |     def __init__(self, input_state, output_dim, hidden_layers, activation=tf.nn.relu):
13 |         
14 |         self.x = input_state
15 |         self.output_dim = output_dim
16 |         self.hidden_layers = hidden_layers
17 |         self.activation = activation
18 |         
19 |         with tf.variable_scope('actor_network'):
20 |             self.output = self._build()
21 |             self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
22 |                                           tf.get_variable_scope().name)
23 |         
24 |     def _build(self):
25 |         
26 |         layer = self.x
27 |         init_b = tf.constant_initializer(0.01)
28 |         
29 |         for i, num_unit in enumerate(self.hidden_layers):
30 |             layer = dense(layer, num_unit, init_b=init_b, name='hidden_layer_{}'.format(i))
31 |             
32 |         output = dense(layer, self.output_dim, activation=self.activation, init_b=init_b, name='output')
33 |         return output
34 |     
35 |     def get_output_layer(self):
36 |         return self.output
37 |     
38 |     def get_params(self):
39 |         return self.vars
40 |     
41 |     def get_action(self, sess, state):
42 |         return sess.run(self.output, feed_dict={self.x: state})
43 |     
44 | 
45 | if __name__ == "__main__":
46 |     import numpy
47 |     
48 |     batch_size = 5
49 |     input_dim = 10
50 |     output_dim = 3
51 |     hidden_layers = [20, 20]
52 |     x = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='input')
53 |     network = ActorNetwork(x, output_dim, hidden_layers)
54 |     
55 |     state = numpy.random.rand(batch_size, input_dim)
56 |     with tf.Session() as sess:
57 |         summary_writer = tf.summary.FileWriter('log/', sess.graph)
58 |         sess.run(tf.global_variables_initializer())
59 |         action = network.get_action(sess, state)
60 |         print(action)
61 |     
62 |     


--------------------------------------------------------------------------------
/Chapter04/config.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Apr 11, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | DEMO = {
 7 |     'gamma': 0.99,
 8 |     'history_len': 2,
 9 |     'num_episode': 3000,
10 |     'capacity': 100000,
11 |     'epsilon_decay': 100000,
12 |     'epsilon_min': 0.0,
13 |     'time_between_two_copies': 2000,
14 |     'update_interval': 1,
15 |     'T': 1000000,
16 |     
17 |     'batch_size': 64,
18 |     'learning_rate': 1e-4,
19 |     'tau': 0.9,
20 |     'optimizer': 'adam',
21 |     'rho': 0.99,
22 |     'log_dir': 'log/'
23 | }
24 | 


--------------------------------------------------------------------------------
/Chapter04/critic_network.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Apr 10, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import tensorflow as tf
 7 | from layers import dense
 8 | 
 9 | 
10 | class CriticNetwork:
11 |     
12 |     def __init__(self, input_state, input_action, hidden_layers):
13 |         
14 |         assert len(hidden_layers) >= 2
15 |         self.input_state = input_state
16 |         self.input_action = input_action
17 |         self.hidden_layers = hidden_layers
18 |         
19 |         with tf.variable_scope('critic_network'):
20 |             self.output = self._build()
21 |             self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
22 |                                           tf.get_variable_scope().name)
23 |     
24 |     def _build(self):
25 |         
26 |         layer = self.input_state
27 |         init_b = tf.constant_initializer(0.01)
28 |         
29 |         for i, num_unit in enumerate(self.hidden_layers):
30 |             if i != 1:
31 |                 layer = dense(layer, num_unit, init_b=init_b, name='hidden_layer_{}'.format(i))
32 |             else:
33 |                 layer = tf.concat([layer, self.input_action], axis=1, name='concat_action')
34 |                 layer = dense(layer, num_unit, init_b=init_b, name='hidden_layer_{}'.format(i))
35 |         
36 |         output = dense(layer, 1, activation=None, init_b=init_b, name='output')
37 |         return tf.reshape(output, shape=(-1,))
38 |     
39 |     def get_output_layer(self):
40 |         return self.output
41 |     
42 |     def get_params(self):
43 |         return self.vars
44 |     
45 |     def get_value(self, sess, state):
46 |         return sess.run(self.output, feed_dict={self.input_state: state})
47 |     
48 |     def get_action_value(self, sess, state, action):
49 |         return sess.run(self.output, feed_dict={self.input_state: state,
50 |                                                 self.input_action: action})
51 |         
52 | 
53 | if __name__ == "__main__":
54 |     import numpy
55 |     
56 |     batch_size = 5
57 |     input_dim = 10
58 |     output_dim = 3
59 |     hidden_layers = [20, 20]
60 |     x = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='input')
61 |     a = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='action')
62 |     network = CriticNetwork(x, a, hidden_layers)
63 |     
64 |     state = numpy.random.rand(batch_size, input_dim)
65 |     action = numpy.random.rand(batch_size, input_dim)
66 |     with tf.Session() as sess:
67 |         summary_writer = tf.summary.FileWriter('log/', sess.graph)
68 |         sess.run(tf.global_variables_initializer())
69 |         value = network.get_action_value(sess, state, action)
70 |         print(value)
71 |                 
72 |     


--------------------------------------------------------------------------------
/Chapter04/eval.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Apr 15, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import os
 7 | import argparse
 8 | import tensorflow as tf
 9 | from config import DEMO
10 | from task import Task
11 | from dpg import DPG
12 | 
13 | 
14 | def main():
15 |     
16 |     parser = argparse.ArgumentParser(description=None)
17 |     parser.add_argument('-t', '--task', default='CartPole-v0', 
18 |                         type=str, help='Tasks: CartPole-v0, Pendulum-v0, Acrobot-v1')
19 |     parser.add_argument('-d', '--device', default='cpu', type=str, help='Device: cpu, gpu')
20 |     args = parser.parse_args()
21 |     
22 |     task = Task(args.task)
23 |     log_dir = os.path.join(DEMO['log_dir'], '{}/train'.format(args.task))
24 |     if not tf.gfile.Exists(log_dir):
25 |         tf.gfile.MakeDirs(log_dir)
26 |     model_dir = os.path.join(DEMO['log_dir'], args.task)
27 |     
28 |     device = '/{}:0'.format('cpu')
29 |     with tf.device(device):
30 |         model = DPG(DEMO, task, model_dir, callback=task.render)
31 |     
32 |     with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
33 |         saver = tf.train.Saver()
34 |         model.load(sess, saver)
35 |         model.evaluate(sess)
36 |         
37 | 
38 | if __name__ == "__main__":
39 |     main()
40 | 


--------------------------------------------------------------------------------
/Chapter04/layers.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Mar 25, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import tensorflow as tf
 8 | 
 9 | 
10 | def get_variable(shape, initializer, name, dtype=tf.float32, trainable=True):
11 |     var = tf.get_variable(shape=shape, initializer=initializer, 
12 |                           dtype=dtype, name=name, trainable=trainable)
13 |     return var
14 | 
15 | 
16 | def HeUniform(shape):
17 |     
18 |     if len(shape) > 2:
19 |         w = shape[0]
20 |         h = shape[1]
21 |         input_channels  = shape[2]
22 |         d = 1.0 / numpy.sqrt(input_channels * w * h)
23 |     else:
24 |         d = 1.0 / numpy.sqrt(shape[0])
25 |     
26 |     init_W = tf.random_uniform_initializer(-d, d)
27 |     init_b = tf.random_uniform_initializer(-d, d)
28 |     return init_W, init_b
29 | 
30 | 
31 | def conv2d(x, output_dim, kernel=(5, 5), stride=(2, 2), 
32 |            activation=tf.nn.relu, init_W=None, init_b=None, name='conv', padding='VALID'):
33 |     
34 |     assert len(x.get_shape().as_list()) == 4
35 |     shape = (kernel[0], kernel[1], x.get_shape().as_list()[-1], output_dim)
36 |     _W, _b = HeUniform(shape)
37 |     if init_W is None: init_W = _W
38 |     if init_b is None: init_b = _b
39 | 
40 |     with tf.variable_scope(name):
41 |         W = get_variable(shape=shape, initializer=init_W, dtype=tf.float32, name='weight')
42 |         b = get_variable(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias')
43 |         
44 |         conv = tf.nn.conv2d(input=x, filter=W, strides=(1, stride[0], stride[1], 1), padding=padding)
45 |         if activation:
46 |             conv = activation(tf.nn.bias_add(conv, b))
47 |         else:
48 |             conv = tf.nn.bias_add(conv, b)
49 |     
50 |     return conv
51 | 
52 | 
53 | def dense(x, output_dim, activation=tf.nn.relu, init_W=None, init_b=None, name='dense'):
54 |     
55 |     if len(x.get_shape().as_list()) > 2:
56 |         shape = x.get_shape().as_list()
57 |         x = tf.reshape(x, shape=(-1, numpy.prod(shape[1:])))
58 | 
59 |     shape = (x.get_shape().as_list()[-1], output_dim)
60 |     _W, _b = HeUniform(shape)
61 |     if init_W is None: init_W = _W
62 |     if init_b is None: init_b = _b
63 | 
64 |     with tf.variable_scope(name):
65 |         W = get_variable(shape=shape, initializer=init_W, dtype=tf.float32, name='weight')
66 |         b = get_variable(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias')
67 |         
68 |         output = tf.matmul(x, W) + b
69 |         if activation:
70 |             output = activation(output)
71 |     
72 |     return output
73 | 


--------------------------------------------------------------------------------
/Chapter04/log/Acrobot-v1/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "model.ckpt"
2 | all_model_checkpoint_paths: "model.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter04/log/Acrobot-v1/train/events.out.tfevents.1523886598.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter04/log/Acrobot-v1/train/events.out.tfevents.1523886598.ywz-PC


--------------------------------------------------------------------------------
/Chapter04/log/CartPole-v0/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "model.ckpt"
2 | all_model_checkpoint_paths: "model.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter04/log/CartPole-v0/train/events.out.tfevents.1525870448.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter04/log/CartPole-v0/train/events.out.tfevents.1525870448.ywz-PC


--------------------------------------------------------------------------------
/Chapter04/log/MountainCar-v0/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "model.ckpt"
2 | all_model_checkpoint_paths: "model.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter04/log/MountainCar-v0/train/events.out.tfevents.1526196635.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter04/log/MountainCar-v0/train/events.out.tfevents.1526196635.ywz-PC


--------------------------------------------------------------------------------
/Chapter04/log/Pendulum-v0/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "model.ckpt"
2 | all_model_checkpoint_paths: "model.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter04/log/Pendulum-v0/train/events.out.tfevents.1525871560.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter04/log/Pendulum-v0/train/events.out.tfevents.1525871560.ywz-PC


--------------------------------------------------------------------------------
/Chapter04/main.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Apr 10, 2018
3 | 
4 | @author: ywz
5 | '''
6 | 


--------------------------------------------------------------------------------
/Chapter04/optimizer.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Apr 11, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import tensorflow as tf
 8 | 
 9 | 
10 | class Optimizer:
11 |     
12 |     def __init__(self, config, ac_network, target_network, replay_memory):
13 |         
14 |         self.ac_network = ac_network
15 |         self.target_network = target_network
16 |         self.replay_memory = replay_memory
17 |         self.summary_writer = None
18 |         self.gamma = config['gamma']
19 |         
20 |         if config['optimizer'] == 'adam':
21 |             opt = tf.train.AdamOptimizer(learning_rate=config['learning_rate'], 
22 |                                          beta1=config['rho'])
23 |         elif config['optimizer'] == 'momentum':
24 |             opt = tf.train.MomentumOptimizer(learning_rate=config['learning_rate'], 
25 |                                              momentum=config['rho'])
26 |         else:
27 |             raise ValueError("Unknown optimizer")
28 |         
29 |         self.actor_train_op = opt.apply_gradients(zip(ac_network.actor_grad, 
30 |                                                       ac_network.actor_vars))
31 |         
32 |         self.critic_train_op = opt.apply_gradients(zip(ac_network.critic_grad, 
33 |                                                        ac_network.critic_vars))
34 |     
35 |     def set_summary_writer(self, summary_writer=None):
36 |         self.summary_writer = summary_writer
37 |         
38 |     def sample_transitions(self, sess, batch_size):
39 |         
40 |         input_dim = self.ac_network.input_dim
41 |         action_dim = self.ac_network.action_dim
42 |         
43 |         states = numpy.zeros((batch_size, input_dim), dtype=numpy.float32)
44 |         new_states = numpy.zeros((batch_size, input_dim), dtype=numpy.float32)
45 |         targets = numpy.zeros(batch_size, dtype=numpy.float32)
46 |         actions = numpy.zeros((batch_size, action_dim), dtype=numpy.float32)
47 |         terms = numpy.zeros(batch_size, dtype=numpy.int32)
48 |         
49 |         for i in range(batch_size):
50 |             state, action, r, new_state, term = self.replay_memory.sample()
51 |             states[i] = state
52 |             new_states[i] = new_state
53 |             actions[i] = action
54 |             targets[i] = r
55 |             terms[i] = term
56 | 
57 |         targets += self.gamma * (1 - terms) * self.target_network.get_value(sess, new_states)
58 |         return states, actions, targets
59 |     
60 |     def train_one_step(self, sess, step, batch_size):
61 |         
62 |         states, actions, targets = self.sample_transitions(sess, batch_size)
63 |         
64 |         # Critic update
65 |         feed_dict = self.ac_network.get_critic_feed_dict(states, actions, targets)
66 |         if self.summary_writer and step % 2000 == 0:
67 |             s, _, = sess.run([self.ac_network.critic_summary, self.critic_train_op], 
68 |                              feed_dict=feed_dict)
69 |             self.summary_writer.add_summary(s, step)
70 |             self.summary_writer.flush()
71 |         else:
72 |             sess.run(self.critic_train_op, feed_dict=feed_dict)
73 |         
74 |         # Actor update 
75 |         feed_dict = self.ac_network.get_actor_feed_dict(states)
76 |         if self.summary_writer and step % 2000 == 0:
77 |             s, _, = sess.run([self.ac_network.actor_summary, self.actor_train_op], 
78 |                              feed_dict=feed_dict)
79 |             self.summary_writer.add_summary(s, step)
80 |             self.summary_writer.flush()
81 |         else:
82 |             sess.run(self.actor_train_op, feed_dict=feed_dict)
83 |         
84 |         
85 |         


--------------------------------------------------------------------------------
/Chapter04/replay_memory.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Apr 11, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy, random
 7 | from collections import deque
 8 | 
 9 | 
10 | class ReplayMemory:
11 |     
12 |     def __init__(self, history_len=4, capacity=1000000):
13 |         
14 |         self.capacity = capacity
15 |         self.history_length = history_len
16 |         
17 |         self.states = deque([])
18 |         self.others = deque([])
19 |     
20 |     def add(self, state, action, r, termination):
21 |         
22 |         if len(self.states) == self.capacity:
23 |             self.states.popleft()
24 |             self.others.popleft()
25 |         self.states.append(state)
26 |         self.others.append((action, r, termination))
27 |         
28 |     def add_nullops(self, init_state):
29 |         for _ in range(self.history_length):
30 |             self.add(init_state, 0, 0, 0)
31 |     
32 |     def phi(self, new_state):
33 |         assert len(self.states) > self.history_length
34 |         states = [new_state] + [self.states[-1-i] for i in range(self.history_length-1)]
35 |         return numpy.concatenate(states, axis=0)
36 |     
37 |     def _phi(self, index):
38 |         states = [self.states[index-i] for i in range(self.history_length)]
39 |         return numpy.concatenate(states, axis=0)
40 |     
41 |     def sample(self):
42 |         
43 |         while True:
44 |             
45 |             index = random.randint(a=self.history_length-1, b=len(self.states)-2)
46 |             infos = [self.others[index-i] for i in range(self.history_length)]
47 |             # Check if termination=1 before "index"
48 |             flag = False
49 |             for i in range(1, self.history_length):
50 |                 if infos[i][2] == 1:
51 |                     flag = True
52 |                     break
53 |             if flag:
54 |                 continue
55 |             
56 |             state = self._phi(index)
57 |             new_state = self._phi(index+1)
58 |             action, r, termination = self.others[index]
59 |             state = numpy.asarray(state, dtype=numpy.float32)
60 |             new_state = numpy.asarray(new_state, dtype=numpy.float32)
61 |                 
62 |             return (state, action, r, new_state, termination)
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     
67 |     history_len = 2
68 |     capacity = 20
69 |     
70 |     replay = ReplayMemory(history_len, capacity)
71 |     
72 |     for i in range(20):
73 |         state = numpy.zeros((2,)) + i
74 |         action = numpy.ones((2,)) * i
75 |         reward = i ** 2
76 |         termination = 1 if i % 10 == 0 else 0
77 |         replay.add(state, action, reward, termination)
78 |         
79 |     print(replay.states)
80 |     print(replay.others)
81 |     state, action, r, new_state, termination = replay.sample()
82 |     print(state)
83 |     print(new_state)
84 |     print(action)
85 |     print(r)
86 |     print(termination)
87 |     print('------------------------------')
88 |     
89 |     for _ in range(50):
90 |         replay.sample()
91 |         
92 | 


--------------------------------------------------------------------------------
/Chapter04/task.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Apr 11, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import gym
 7 | import numpy
 8 | import tensorflow as tf
 9 | 
10 | 
11 | class Task:
12 |     
13 |     def __init__(self, name):
14 |         
15 |         assert name in ['CartPole-v0', 'MountainCar-v0', 
16 |                         'Pendulum-v0', 'Acrobot-v1']
17 |         self.name = name
18 |         self.task = gym.make(name)
19 |         self.last_state = self.reset()
20 |     
21 |     def reset(self):
22 |         state = self.task.reset()
23 |         self.total_reward = 0
24 |         return state
25 |         
26 |     def play_action(self, action):
27 |         
28 |         if self.name not in ['Pendulum-v0', 'MountainCarContinuous-v0']:
29 |             action = numpy.fmax(action, 0)
30 |             action = action / numpy.sum(action)
31 |             action = numpy.random.choice(range(len(action)), p=action)
32 |         else:
33 |             low = self.task.env.action_space.low
34 |             high = self.task.env.action_space.high
35 |             action = numpy.fmin(numpy.fmax(action, low), high)
36 |             
37 |         state, reward, done, _ = self.task.step(action)
38 |         self.total_reward += reward
39 |         termination = 1 if done else 0
40 |         
41 |         return reward, state, termination
42 |     
43 |     def get_total_reward(self):
44 |         return self.total_reward
45 |     
46 |     def get_action_dim(self):
47 |         if self.name not in ['Pendulum-v0', 'MountainCarContinuous-v0']:
48 |             return self.task.env.action_space.n
49 |         else:
50 |             return self.task.env.action_space.shape[0]
51 |     
52 |     def get_state_dim(self):
53 |         return self.last_state.shape[0]
54 |     
55 |     def get_activation_fn(self):
56 |         if self.name not in ['Pendulum-v0', 'MountainCarContinuous-v0']:
57 |             return tf.nn.softmax
58 |         else:
59 |             return None
60 |     
61 |     def render(self):
62 |         self.task.render()
63 |     


--------------------------------------------------------------------------------
/Chapter04/train.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Apr 12, 2018
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import os
 7 | import argparse
 8 | import tensorflow as tf
 9 | from config import DEMO
10 | from task import Task
11 | from dpg import DPG
12 | 
13 | 
14 | def delete_dir(path):
15 |     if tf.gfile.Exists(path):
16 |         tf.gfile.DeleteRecursively(path)
17 |     tf.gfile.MakeDirs(path)
18 |     return path
19 | 
20 | 
21 | def main():
22 |     
23 |     parser = argparse.ArgumentParser(description=None)
24 |     parser.add_argument('-t', '--task', default='CartPole-v0', 
25 |                         type=str, help='Tasks: CartPole-v0, Pendulum-v0, Acrobot-v1')
26 |     parser.add_argument('-d', '--device', default='cpu', type=str, help='Device: cpu, gpu')
27 |     args = parser.parse_args()
28 |     
29 |     task = Task(args.task)
30 |     log_dir = os.path.join(DEMO['log_dir'], '{}/train'.format(args.task))
31 |     if not tf.gfile.Exists(log_dir):
32 |         tf.gfile.MakeDirs(log_dir)
33 |     model_dir = os.path.join(DEMO['log_dir'], args.task)
34 |     
35 |     device = '/{}:0'.format(args.device)
36 |     with tf.device(device):
37 |         model = DPG(DEMO, task, model_dir, callback=None)
38 |     
39 |     with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
40 |         saver = tf.train.Saver()
41 |         writer = tf.summary.FileWriter(delete_dir(log_dir), sess.graph_def)
42 |         model.set_summary_writer(summary_writer=writer)
43 |         
44 |         sess.run(tf.global_variables_initializer())
45 |         model.train(sess, saver)
46 |         
47 | 
48 | if __name__ == "__main__":
49 |     main()
50 | 
51 | 


--------------------------------------------------------------------------------
/Chapter05/cluster.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 1 Jun 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | PORT = 12222
 7 | 
 8 | def cluster_spec(num_workers, num_ps):
 9 | 
10 |     cluster = {}
11 |     port = PORT
12 | 
13 |     host = '127.0.0.1'
14 |     cluster['ps'] = ['{}:{}'.format(host, port+i) for i in range(num_ps)]
15 |     cluster['worker'] = ['{}:{}'.format(host, port+i+num_ps) for i in range(num_workers)]
16 |     
17 |     return cluster
18 | 


--------------------------------------------------------------------------------
/Chapter05/demo/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Nov 10, 2016
3 | 
4 | @author: a0096049
5 | '''
6 | 


--------------------------------------------------------------------------------
/Chapter05/demo/object.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on May 16, 2016
 3 | 
 4 | @author: a0096049
 5 | '''
 6 | 
 7 | import numpy, pygame
 8 | from demo.utils import Color, calculateIntersectPoint
 9 | 
10 | 
11 | class Object:
12 |     
13 |     def __init__(self, x, y, r, game):
14 |         
15 |         self.x = x
16 |         self.y = y
17 |         self.r = r
18 |         self.game = game
19 |         
20 |     def get_position(self):
21 |         return self.x, self.y
22 |     
23 |     def get_radius(self):
24 |         return self.r
25 |     
26 |     def set_position(self, x, y):
27 |         self.x = x
28 |         self.y = y
29 |         
30 |     def draw(self):
31 |         pass
32 |     
33 | class Food(Object):
34 |     
35 |     def __init__(self, x, y, radius, t, game):
36 |         
37 |         super().__init__(x, y, radius, game)
38 |         self.type = t
39 |         self.life = numpy.random.randint(1000, 5000)
40 |         
41 |     def decrease_life(self):
42 |         self.life -= 1
43 |         return self.life == 0
44 | 
45 |     def draw(self, found=False):
46 |         
47 |         if found == False:
48 |             if self.type == "bad":
49 |                 pygame.draw.circle(self.game.DISPLAYSURF, Color.RED, (self.x, self.y), self.r)
50 |             else:
51 |                 pygame.draw.circle(self.game.DISPLAYSURF, Color.GREEN, (self.x, self.y), self.r)
52 |         else:
53 |             pygame.draw.circle(self.game.DISPLAYSURF, Color.BLUE, (self.x, self.y), self.r)
54 |     
55 | class Wall:
56 |     
57 |     def __init__(self, start, end, game, width=2):
58 |         
59 |         self.start = start
60 |         self.end = end
61 |         self.game = game
62 |         self.width = width
63 |         
64 |     def draw(self):
65 |         pygame.draw.line(self.game.DISPLAYSURF, Color.WHITE, self.start, self.end, self.width)
66 |     
67 |     def collide(self, p1, p2):
68 |         
69 |         point = calculateIntersectPoint(p1, p2, self.start, self.end)
70 |         if point is None:
71 |             return None
72 |         else:
73 |             return (int(point[0]), int(point[1]))
74 |         
75 |     
76 |         
77 |     
78 | 


--------------------------------------------------------------------------------
/Chapter05/doom/doom.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 7 Jun 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | from vizdoom import *
 7 | import random
 8 | import time
 9 | 
10 | def main():
11 | 
12 |     game = DoomGame()
13 |     game.load_config("./scenarios/basic.cfg")
14 |     game.init()
15 |     
16 |     shoot = [0, 0, 1]
17 |     left = [1, 0, 0]
18 |     right = [0, 1, 0]
19 |     actions = [shoot, left, right]
20 |     
21 |     episodes = 10
22 |     for _ in range(episodes):
23 |         game.new_episode()
24 |         while not game.is_episode_finished():
25 |             state = game.get_state()
26 |             img = state.screen_buffer
27 |             misc = state.game_variables
28 |             
29 |             print(img.shape)
30 |             print(misc)
31 |             
32 |             reward = game.make_action(random.choice(actions))
33 |             print("\treward: {}".format(reward))
34 |             time.sleep(0.05)
35 |         print("Result: {}".format(game.get_total_reward()))
36 |         time.sleep(2)
37 |         
38 | if __name__ == "__main__":
39 |     main()
40 |     
41 |     


--------------------------------------------------------------------------------
/Chapter05/doom/game.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on 7 Jun 2017
  3 | 
  4 | @author: ywz
  5 | '''
  6 | import numpy
  7 | from vizdoom import *
  8 | from utils import cv2_resize_image
  9 | 
 10 | class Game:
 11 | 
 12 |     def __init__(self, config='basic', window_visible=True):
 13 | 
 14 |         self.env = DoomGame()
 15 |         self.env.load_config("./scenarios/{}.cfg".format(config))
 16 |         self.env.set_window_visible(window_visible)
 17 |         self.env.set_screen_format(ScreenFormat.GRAY8)
 18 |         self.env.init()
 19 |         
 20 |         self.env.new_episode()
 21 |         frame = self.get_current_frame()
 22 |         
 23 |         shoot = [0, 0, 1]
 24 |         left  = [1, 0, 0]
 25 |         right = [0, 1, 0]
 26 |         self.raw_actions = [shoot, left, right]
 27 |         self.actions = list(range(len(self.raw_actions)))
 28 | 
 29 |         self.frame_skip = 4
 30 |         self.total_reward = 0
 31 |         self.reshape_size = 120
 32 |         
 33 |         # Frame buffer
 34 |         self.buffer_size = 8
 35 |         self.buffer_index = 0
 36 |         self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)]
 37 |     
 38 |     def get_current_frame(self):
 39 |         frame = self.env.get_state().screen_buffer
 40 |         return frame
 41 |     
 42 |     def rgb_to_gray(self, im):
 43 |         if len(im) == 3:
 44 |             return numpy.dot(im, [0.299, 0.587, 0.114])
 45 |         else:
 46 |             return im
 47 |     
 48 |     def set_params(self, frame_skip=4):
 49 |         self.frame_skip = frame_skip
 50 |         self.env.new_episode()
 51 |         frame = self.get_current_frame()
 52 |         self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)]
 53 |     
 54 |     def reset(self):
 55 |         self.env.new_episode()
 56 |         frame = self.get_current_frame()
 57 |         self.total_reward = 0
 58 |         self.buffer_index = 0
 59 |         self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)]
 60 |     
 61 |     def add_frame_to_buffer(self, frame):
 62 |         self.buffer_index = self.buffer_index % self.buffer_size
 63 |         self.buffer[self.buffer_index] = frame
 64 |         self.buffer_index += 1
 65 |     
 66 |     def get_available_actions(self):
 67 |         return list(range(len(self.actions)))
 68 |     
 69 |     def get_feedback_size(self):
 70 |         return (self.reshape_size, self.reshape_size)
 71 |     
 72 |     def crop(self, frame):
 73 |         frame = cv2_resize_image(frame, 
 74 |                                  resized_shape=(self.reshape_size, self.reshape_size), 
 75 |                                  method='scale', crop_offset=0)
 76 |         return frame
 77 |     
 78 |     def get_current_feedback(self, num_frames=4):
 79 |         assert num_frames < self.buffer_size, "Frame buffer is not large enough."
 80 |         index = self.buffer_index - 1
 81 |         frames = [numpy.expand_dims(self.buffer[index - k], axis=0) for k in range(num_frames)]
 82 |         if num_frames > 1:
 83 |             return numpy.concatenate(frames, axis=0)
 84 |         else:
 85 |             return frames[0]
 86 |     
 87 |     def get_total_reward(self):
 88 |         return self.total_reward
 89 |     
 90 |     def play_action(self, action, num_frames=4):
 91 |         
 92 |         termination = 0
 93 |         a = self.raw_actions[action]
 94 |         reward = self.env.make_action(a)
 95 |         done = self.env.is_episode_finished()
 96 | 
 97 |         if done: 
 98 |             termination = 1
 99 |         else:
100 |             frame = self.get_current_frame()
101 |             self.add_frame_to_buffer(self.crop(self.rgb_to_gray(frame)))
102 |         
103 |         r = numpy.clip(reward, -1, 1)
104 |         self.total_reward += reward
105 |         
106 |         return r, self.get_current_feedback(num_frames), termination
107 |     
108 | if __name__ == "__main__":
109 |     
110 |     import random
111 |     from PIL import Image
112 |     
113 |     game = Game()
114 |     game.set_params(frame_skip=4)
115 |     actions = game.get_available_actions()
116 |     print(actions)
117 |     
118 |     for t in range(500):
119 |         
120 |         action = random.choice(actions)
121 |         reward, feedback, termination = game.play_action(action, num_frames=4)
122 |         if termination:
123 |             break
124 |         
125 |         for i in range(feedback.shape[0]):
126 |             img = Image.fromarray(feedback[feedback.shape[0]-i-1])
127 |             img.save('save/{}_{}.bmp'.format(t, i))
128 |             
129 |     


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/basic.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = basic.wad
 6 | doom_map = map01
 7 | 
 8 | # Rewards
 9 | living_reward = -1
10 | 
11 | # Rendering options
12 | screen_resolution = RES_320X240
13 | screen_format = CRCGCB
14 | render_hud = True
15 | render_crosshair = false
16 | render_weapon = true
17 | render_decals = false
18 | render_particles = false
19 | window_visible = true
20 | 
21 | # make episodes start after 20 tics (after unholstering the gun)
22 | episode_start_time = 14
23 | 
24 | # make episodes finish after 300 actions (tics)
25 | episode_timeout = 300
26 | 
27 | # Available buttons
28 | available_buttons = 
29 | 	{ 
30 | 		MOVE_LEFT 
31 | 		MOVE_RIGHT 
32 | 		ATTACK 
33 | 	}
34 | 
35 | # Game variables that will be in the state
36 | available_game_variables = { AMMO2}
37 | 
38 | mode = PLAYER
39 | doom_skill = 5
40 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/basic.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/basic.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/cig.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = cig.wad
 6 | 
 7 | #12 minutes
 8 | episode_timeout = 25200
 9 | 
10 | # Rendering options
11 | screen_resolution = RES_640X480
12 | screen_format = CRCGCB
13 | render_hud = true
14 | render_crosshair = true
15 | render_weapon = true
16 | render_decals = false
17 | render_particles = false
18 | 
19 | window_visible = true
20 | 
21 | # Available buttons
22 | available_buttons = 
23 | 	{ 
24 | 		TURN_LEFT
25 | 		TURN_RIGHT
26 | 		ATTACK
27 | 
28 | 		MOVE_RIGHT
29 | 		MOVE_LEFT
30 | 
31 | 		MOVE_FORWARD
32 | 		MOVE_BACKWARD
33 | 		TURN_LEFT_RIGHT_DELTA
34 | 		LOOK_UP_DOWN_DELTA	
35 | 		
36 | 	}
37 | 
38 | mode = ASYNC_PLAYER
39 | 
40 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/cig.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/cig.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/cig_with_unknown.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/cig_with_unknown.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/deadly_corridor.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = deadly_corridor.wad
 6 | 
 7 | # Skill 5 is reccomanded for the scenario to be a challenge.
 8 | doom_skill = 5
 9 | 
10 | # Rewards
11 | death_penalty = 100
12 | #living_reward = 0
13 | 
14 | # Rendering options
15 | screen_resolution = RES_320X240
16 | screen_format = CRCGCB
17 | render_hud = true
18 | render_crosshair = false
19 | render_weapon = true
20 | render_decals = false
21 | render_particles = false
22 | window_visible = true
23 | 
24 | episode_timeout = 2100
25 | 
26 | # Available buttons
27 | available_buttons = 
28 | 	{ 
29 | 		MOVE_LEFT 
30 | 		MOVE_RIGHT 
31 | 		ATTACK 
32 | 		MOVE_FORWARD
33 | 		MOVE_BACKWARD
34 | 		TURN_LEFT
35 | 		TURN_RIGHT
36 | 	}
37 | 
38 | # Game variables that will be in the state
39 | available_game_variables = { HEALTH }
40 | 
41 | mode = PLAYER
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/deadly_corridor.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/deadly_corridor.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/deathmatch.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = deathmatch.wad
 6 | 
 7 | # Rendering options
 8 | screen_resolution = RES_320X240
 9 | screen_format = CRCGCB
10 | render_hud = true
11 | render_crosshair = false
12 | render_weapon = true
13 | render_decals = false
14 | render_particles = false
15 | window_visible = true
16 | 
17 | # make episodes finish after 4200 actions (tics)
18 | episode_timeout = 4200
19 | 
20 | # Available buttons
21 | available_buttons = 
22 | 	{ 
23 | 		ATTACK 
24 |         SPEED 
25 |         STRAFE 
26 | 
27 |         MOVE_RIGHT 
28 |         MOVE_LEFT 
29 |         MOVE_BACKWARD
30 |         MOVE_FORWARD
31 |         TURN_RIGHT 
32 |         TURN_LEFT 
33 | 
34 |         SELECT_WEAPON1 
35 |         SELECT_WEAPON2 
36 |         SELECT_WEAPON3 
37 |         SELECT_WEAPON4 
38 |         SELECT_WEAPON5
39 |         SELECT_WEAPON6 
40 | 
41 |         SELECT_NEXT_WEAPON 
42 |         SELECT_PREV_WEAPON 
43 | 
44 |         LOOK_UP_DOWN_DELTA
45 |         TURN_LEFT_RIGHT_DELTA 
46 |         MOVE_LEFT_RIGHT_DELTA 
47 | 		
48 | 	}
49 | 
50 | # Game variables that will be in the state
51 | available_game_variables = 
52 | 	{ 		
53 |         KILLCOUNT
54 |         HEALTH
55 |         ARMOR
56 |         SELECTED_WEAPON
57 |         SELECTED_WEAPON_AMMO
58 |     }
59 | mode = PLAYER
60 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/deathmatch.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/deathmatch.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/defend_the_center.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = defend_the_center.wad
 6 | 
 7 | # Rewards
 8 | death_penalty = 1
 9 | 
10 | # Rendering options
11 | screen_resolution = RES_640X480
12 | screen_format = CRCGCB
13 | render_hud = True
14 | render_crosshair = false
15 | render_weapon = true
16 | render_decals = false
17 | render_particles = false
18 | window_visible = true
19 | 
20 | # make episodes start after 10 tics (after unholstering the gun)
21 | episode_start_time = 10
22 | 
23 | # make episodes finish after 2100 actions (tics)
24 | episode_timeout = 2100
25 | 
26 | # Available buttons
27 | available_buttons = 
28 | 	{ 
29 | 		TURN_LEFT 
30 | 		TURN_RIGHT 
31 | 		ATTACK 
32 | 	}
33 | 
34 | # Game variables that will be in the state
35 | available_game_variables = { AMMO2 HEALTH  }
36 | 
37 | mode = PLAYER
38 | doom_skill = 3
39 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/defend_the_center.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/defend_the_center.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/defend_the_line.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = defend_the_line.wad
 6 | 
 7 | # Rewards
 8 | death_penalty = 1
 9 | 
10 | # Rendering options
11 | screen_resolution = RES_320X240
12 | screen_format = CRCGCB
13 | render_hud = True
14 | render_crosshair = false
15 | render_weapon = true
16 | render_decals = false
17 | render_particles = false
18 | window_visible = true
19 | 
20 | # make episodes start after 10 tics (after unholstering the gun)
21 | episode_start_time = 10
22 | 
23 | 
24 | # Available buttons
25 | available_buttons = 
26 | 	{ 
27 | 		TURN_lEFT 
28 | 		TURN_RIGHT 
29 | 		ATTACK 
30 | 	}
31 | 
32 | # Game variables that will be in the state
33 | available_game_variables = {  AMMO2 HEALTH}
34 | 
35 | mode = PLAYER
36 | doom_skill = 3
37 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/defend_the_line.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/defend_the_line.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/health_gathering.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = health_gathering.wad
 6 | 
 7 | # Each step is good for you!
 8 | living_reward = 1
 9 | # And death is not!
10 | death_penalty = 100
11 | 
12 | # Rendering options
13 | screen_resolution = RES_320X240
14 | screen_format = CRCGCB
15 | render_hud = false
16 | render_crosshair = false
17 | render_weapon = false
18 | render_decals = false
19 | render_particles = false
20 | window_visible = true
21 | 
22 | # make episodes finish after 2100 actions (tics)
23 | episode_timeout = 2100
24 | 
25 | # Available buttons
26 | available_buttons = 
27 | 	{ 
28 | 		TURN_LEFT 
29 | 		TURN_RIGHT 
30 | 		MOVE_FORWARD 
31 | 	}
32 | 
33 | # Game variables that will be in the state
34 | available_game_variables = { HEALTH }
35 | 
36 | mode = PLAYER


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/health_gathering.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/health_gathering.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/health_gathering_supreme.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/health_gathering_supreme.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/learning.cfg:
--------------------------------------------------------------------------------
 1 | doom_scenario_path = basic.wad
 2 | 
 3 | # Rewards
 4 | living_reward = -1
 5 | 
 6 | # Rendering options
 7 | screen_resolution = RES_640X480
 8 | screen_format = GRAY8
 9 | render_hud = false
10 | render_crosshair = false
11 | render_weapon = true
12 | render_decals = false
13 | render_particles = false
14 | window_visible = false
15 | 
16 | # make episodes start after 20 tics (after unholstering the gun)
17 | episode_start_time = 14
18 | 
19 | # make episodes finish after 300 actions (tics)
20 | episode_timeout = 300
21 | 
22 | # Available buttons
23 | available_buttons = 
24 | 	{ 
25 | 		MOVE_LEFT 
26 | 		MOVE_RIGHT 
27 | 		ATTACK 
28 | 	}
29 | 
30 | mode = PLAYER
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/multi.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = multi_deathmatch.wad
 6 | 
 7 | # Rewards
 8 | death_penalty = 1
 9 | 
10 | # Rendering options
11 | screen_resolution = RES_640X480
12 | screen_format = CRCGCB
13 | render_hud = true
14 | render_crosshair = true
15 | render_weapon = true
16 | render_decals = false
17 | render_particles = false
18 | 
19 | window_visible = true
20 | 
21 | 
22 | # Available buttons
23 | available_buttons = 
24 | 	{ 
25 | 		TURN_LEFT
26 | 		TURN_RIGHT
27 | 		ATTACK
28 | 
29 | 		MOVE_RIGHT
30 | 		MOVE_LEFT
31 | 
32 | 		MOVE_FORWARD
33 | 		MOVE_BACKWARD
34 | 		TURN_LEFT_RIGHT_DELTA
35 | 		LOOK_UP_DOWN_DELTA	
36 | 		
37 | 	}
38 | 
39 | available_game_variables = 
40 | {
41 | 	HEALTH
42 | 	AMMO3
43 | }
44 | mode = ASYNC_PLAYER
45 | 
46 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/multi_deathmatch.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/multi_deathmatch.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/multi_duel.cfg:
--------------------------------------------------------------------------------
 1 | doom_scenario_path = multi_duel.wad
 2 | 
 3 | screen_resolution = RES_640X480
 4 | screen_format = CRCGCB
 5 | render_hud = true
 6 | render_crosshair = false
 7 | render_weapon = true
 8 | render_decals = true
 9 | render_particles = true
10 | window_visible = true
11 | 
12 | available_buttons = 
13 | 	{ 
14 | 		MOVE_LEFT 
15 | 		MOVE_RIGHT 
16 | 		ATTACK 
17 | 	}
18 | 
19 | mode = PLAYER
20 | doom_skill = 5
21 | 
22 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/multi_duel.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/multi_duel.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/my_way_home.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = my_way_home.wad
 6 | 
 7 | # Rewards
 8 | living_reward = -0.0001
 9 | 
10 | # Rendering options
11 | screen_resolution = RES_640X480
12 | screen_format = CRCGCB
13 | render_hud = false
14 | render_crosshair = false
15 | render_weapon = true
16 | render_decals = false
17 | render_particles = false
18 | window_visible = true
19 | 
20 | # make episodes start after 10 tics (after unholstering the gun)
21 | episode_start_time = 10
22 | 
23 | # make episodes finish after 2100 actions (tics)
24 | episode_timeout = 2100
25 | 
26 | # Available buttons
27 | available_buttons = 
28 | 	{ 
29 | 		TURN_LEFT
30 | 		TURN_RIGHT
31 | 		MOVE_FORWARD 
32 | 		MOVE_LEFT
33 | 		MOVE_RIGHT
34 | 	}
35 | 
36 | # Game variables that will be in the state
37 | available_game_variables = { AMMO0 }
38 | 
39 | mode = PLAYER
40 | doom_skill = 5
41 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/my_way_home.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/my_way_home.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/predict_position.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = predict_position.wad
 6 | 
 7 | # Rewards
 8 | living_reward = -0.001
 9 | 
10 | # Rendering options
11 | screen_resolution = RES_800X450
12 | screen_format = CRCGCB
13 | render_hud = false
14 | render_crosshair = false
15 | render_weapon = true
16 | render_decals = false
17 | render_particles = false
18 | window_visible = true
19 | 
20 | # make episodes start after 16 tics (after producing the rocket launcher)
21 | episode_start_time = 16
22 | 
23 | # make episodes finish after 300 actions (tics)
24 | episode_timeout = 300
25 | 
26 | # Available buttons
27 | available_buttons = 
28 | 	{ 
29 | 		TURN_LEFT 
30 | 		TURN_RIGHT 
31 | 		ATTACK 
32 | 	}
33 | 
34 | # Empty list is allowed, in case you are lazy.
35 | available_game_variables = {  }
36 | 
37 | game_args += +sv_noautoaim 1
38 | 
39 | mode = PLAYER
40 | doom_skill = 1
41 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/predict_position.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/predict_position.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/rocket_basic.cfg:
--------------------------------------------------------------------------------
 1 | doom_scenario_path = rocket_basic.wad
 2 | 
 3 | # Rewards
 4 | living_reward = -1
 5 | 
 6 | # Rendering options
 7 | screen_resolution = RES_640X480
 8 | screen_format = GRAY8
 9 | render_hud = true
10 | render_crosshair = false
11 | render_weapon = true
12 | render_decals = false
13 | render_particles = false
14 | 
15 | # make episodes start after 14 tics (after unholstering the gun)
16 | episode_start_time = 14
17 | 
18 | # make episodes finish after 300 actions (tics)
19 | episode_timeout = 300
20 | 
21 | # Available buttons
22 | available_buttons =
23 | 	{
24 | 		MOVE_LEFT
25 | 		MOVE_RIGHT
26 | 		ATTACK
27 | 	}
28 | 
29 | game_args += +sv_noautoaim 1
30 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/rocket_basic.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/rocket_basic.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/simpler_basic.cfg:
--------------------------------------------------------------------------------
 1 | doom_scenario_path = simpler_basic.wad
 2 | 
 3 | # Rewards
 4 | living_reward = -1
 5 | 
 6 | # Rendering options
 7 | screen_resolution = RES_640X480
 8 | screen_format = GRAY8
 9 | 
10 | render_hud = true
11 | render_crosshair = false
12 | render_weapon = true
13 | render_decals = false
14 | render_particles = false
15 | 
16 | # make episodes start after 20 tics (after unholstering the gun)
17 | episode_start_time = 14
18 | 
19 | # make episodes finish after 300 actions (tics)
20 | episode_timeout = 300
21 | 
22 | # Available buttons
23 | available_buttons = 
24 | 	{ 
25 | 		MOVE_LEFT 
26 | 		MOVE_RIGHT 
27 | 		ATTACK 
28 | 	}
29 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/simpler_basic.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/simpler_basic.wad


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/take_cover.cfg:
--------------------------------------------------------------------------------
 1 | # Lines starting with # are treated as comments (or with whitespaces+#).
 2 | # It doesn't matter if you use capital letters or not.
 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
 4 | 
 5 | doom_scenario_path = take_cover.wad
 6 | doom_map = map01
 7 | 
 8 | # Rewards
 9 | living_reward = 1
10 | 
11 | # Rendering options
12 | screen_resolution = RES_320X240
13 | screen_format = CRCGCB
14 | render_hud = false
15 | render_crosshair = false
16 | render_weapon = false
17 | render_decals = false
18 | render_particles = false
19 | window_visible = true
20 | 
21 | # Available buttons
22 | available_buttons = 
23 | 	{ 
24 | 		MOVE_LEFT 
25 | 		MOVE_RIGHT
26 | 	}
27 | 
28 | # Game variables that will be in the state
29 | available_game_variables = { HEALTH }
30 | 
31 | # Change it if you wish.
32 | doom_skill = 4
33 | 
34 | 


--------------------------------------------------------------------------------
/Chapter05/doom/scenarios/take_cover.wad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/take_cover.wad


--------------------------------------------------------------------------------
/Chapter05/environment.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on 2 Jun 2017
  3 | 
  4 | @author: ywz
  5 | '''
  6 | import time
  7 | from threading import Thread
  8 | from parameter import Parameter
  9 | 
 10 | 
 11 | def new_demo(test=False):
 12 |     
 13 |     import pygame
 14 |     from demo.game import Game
 15 |     if test is False:
 16 |         game = Game(640, 480, None)
 17 |     else:
 18 |         def _render(game):
 19 |             while True:
 20 |                 game.draw()
 21 |                 for event in pygame.event.get():
 22 |                     if event.type == pygame.KEYDOWN:
 23 |                         if event.key == pygame.K_9:
 24 |                             game.increase_fps()
 25 |                         elif event.key == pygame.K_0:
 26 |                             game.decrease_fps()    
 27 |         pygame.init()
 28 |         DISPLAYSURF = pygame.display.set_mode((640, 480), 0, 32)
 29 |         pygame.display.set_caption('Demo')
 30 |         game = Game(640, 480, DISPLAYSURF)
 31 |         t = Thread(target=lambda: _render(game))
 32 |         t.start()
 33 |     
 34 |     parameter = Parameter(lr=1e-3)
 35 |     parameter.gamma = 0.9
 36 |     parameter.iteration_num = 300000
 37 |     parameter.num_history_frames = 1
 38 |     parameter.network_type = 'mlp'
 39 |     
 40 |     parameter.update_method = 'rmsprop'
 41 |     parameter.rho = 0.95
 42 |     parameter.async_update_interval = 5
 43 |     parameter.input_scale = 1.0
 44 |     
 45 |     return game, parameter
 46 | 
 47 | 
 48 | def new_atari_game(rom='breakout'):
 49 |     
 50 |     from game import Game
 51 |     game = Game(rom)
 52 |     
 53 |     if rom == 'space_invaders':
 54 |         game.set_params(frame_skip=3, lost_life_as_terminal=False, take_maximum_of_two_frames=True)
 55 |     elif game == 'alien':
 56 |         game.set_params(frame_skip=4, crop_offset=20, lost_life_as_terminal=False)
 57 |     else:
 58 |         game.set_params(frame_skip=4, lost_life_as_terminal=False)
 59 |     
 60 |     parameter = Parameter(lr=7e-4)
 61 |     parameter.gamma = 0.99
 62 |     parameter.num_history_frames = 4
 63 |     
 64 |     parameter.async_update_interval = 20
 65 |     parameter.max_iter_num = 16 * 10 ** 7
 66 |     parameter.update_method = 'rmsprop'
 67 |     parameter.rho = 0.99
 68 |     parameter.rmsprop_epsilon = 1e-1 # 1e-3 if rom == 'breakout' else 1e-1
 69 |     
 70 |     time.sleep(1)
 71 |     return game, parameter
 72 | 
 73 | 
 74 | def new_minecraft(rom='MinecraftBasic-v0'):
 75 |     
 76 |     from minecraft.game import Game
 77 |     game = Game(rom)
 78 |     
 79 |     parameter = Parameter(lr=7e-4)
 80 |     parameter.gamma = 0.99
 81 |     parameter.num_history_frames = 4
 82 |     
 83 |     parameter.async_update_interval = 20
 84 |     parameter.max_iter_num = 16 * 10 ** 7
 85 |     parameter.update_method = 'rmsprop'
 86 |     parameter.rho = 0.99
 87 |     parameter.rmsprop_epsilon = 1e-3
 88 |     
 89 |     time.sleep(1)
 90 |     return game, parameter
 91 | 
 92 | 
 93 | def new_environment(name='demo', test=False):
 94 |     
 95 |     if name == 'demo':
 96 |         return new_demo(test=test)
 97 |     elif name.find('Minecraft') != -1:
 98 |         return new_minecraft(rom=name)
 99 |     else:
100 |         return new_atari_game(rom=name)
101 | 
102 | 
103 |         


--------------------------------------------------------------------------------
/Chapter05/ff_policy.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 29 May 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import tensorflow as tf
 7 | from layer import conv2d, linear
 8 | 
 9 | 
10 | class FFPolicy:
11 |     
12 |     def __init__(self, input_shape=(84, 84, 4), n_outputs=4, network_type='cnn'):
13 |         
14 |         self.width = input_shape[0]
15 |         self.height = input_shape[1]
16 |         self.channel = input_shape[2]
17 |         self.n_outputs = n_outputs
18 |         self.network_type = network_type
19 |         self.entropy_beta = 0.01
20 |         
21 |         self.x = tf.placeholder(dtype=tf.float32, 
22 |                                 shape=(None, self.channel, self.width, self.height))
23 |         self.build_model()
24 |         
25 |     def build_model(self):
26 |         
27 |         self.net = {}
28 |         self.net['input'] = tf.transpose(self.x, perm=(0, 2, 3, 1))
29 |             
30 |         if self.network_type == 'cnn':
31 |             self.net['conv1'] = conv2d(self.net['input'], 16, kernel=(8, 8), stride=(4, 4), name='conv1')
32 |             self.net['conv2'] = conv2d(self.net['conv1'], 32, kernel=(4, 4), stride=(2, 2), name='conv2')
33 |             self.net['feature'] = linear(self.net['conv2'], 256, name='fc1')
34 |         else:
35 |             # MLP for testing
36 |             self.net['fc1'] = linear(self.net['input'], 50, init_b = tf.constant_initializer(0.0), name='fc1')
37 |             self.net['feature'] = linear(self.net['fc1'], 50, init_b = tf.constant_initializer(0.0), name='fc2')
38 |             
39 |         self.net['value'] = tf.reshape(linear(self.net['feature'], 1, activation=None, name='value',
40 |                                               init_b = tf.constant_initializer(0.0)), 
41 |                                        shape=(-1,))
42 |         
43 |         self.net['logits'] = linear(self.net['feature'], self.n_outputs, activation=None, name='logits',
44 |                                     init_b = tf.constant_initializer(0.0))
45 |         
46 |         self.net['policy'] = tf.nn.softmax(self.net['logits'], name='policy')
47 |         self.net['log_policy'] = tf.nn.log_softmax(self.net['logits'], name='log_policy')
48 |         
49 |         self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
50 |     
51 |     def build_gradient_op(self, clip_grad=None):
52 | 
53 |         self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.n_outputs), name='action')
54 |         self.reward = tf.placeholder(dtype=tf.float32, shape=(None,), name='reward')
55 |         self.advantage = tf.placeholder(dtype=tf.float32, shape=(None,), name='advantage')
56 | 
57 |         value = self.net['value']
58 |         policy = self.net['policy']
59 |         log_policy = self.net['log_policy']
60 |         
61 |         entropy = -tf.reduce_sum(policy * log_policy, axis=1)
62 |         p_loss = -tf.reduce_sum(tf.reduce_sum(log_policy * self.action, axis=1) * self.advantage + self.entropy_beta * entropy)
63 |         v_loss = 0.5 * tf.reduce_sum((value - self.reward) ** 2)
64 |         total_loss = p_loss + v_loss
65 |         
66 |         self.gradients = tf.gradients(total_loss, self.vars)
67 |         if clip_grad is not None:
68 |             self.gradients, _ = tf.clip_by_global_norm(self.gradients, clip_grad)
69 |         
70 |         # Add summaries
71 |         tf.summary.scalar("policy_loss", p_loss, collections=['policy_network'])
72 |         tf.summary.scalar("value_loss", v_loss, collections=['policy_network'])
73 |         tf.summary.scalar("entropy", tf.reduce_mean(entropy), collections=['policy_network'])
74 |         # tf.summary.scalar("grad_global_norm", tf.global_norm(self.gradients), collections=['policy_network'])
75 |         self.summary_op = tf.summary.merge_all('policy_network')
76 |         
77 |         return self.gradients
78 |     
79 |     def run_initial_state(self, sess):
80 |         return None
81 |     
82 |     def run_value(self, sess, state, *args):
83 |         value = sess.run(self.net['value'], 
84 |                          feed_dict={self.x: state})
85 |         return value
86 |     
87 |     def run_policy_and_value(self, sess, state, *args):
88 |         policy, value = sess.run([self.net['policy'], self.net['value']], 
89 |                                  feed_dict={self.x: state})
90 |         return policy, value
91 |     
92 |     def get_feed_dict(self, states, actions, rewards, advantages, *args):
93 |         feed_dict={self.x: states, self.action: actions, 
94 |                    self.reward: rewards, self.advantage: advantages}
95 |         return feed_dict
96 |         
97 |     
98 |         


--------------------------------------------------------------------------------
/Chapter05/helper/tmux:
--------------------------------------------------------------------------------
 1 | tmux can be controlled using a prefix key (by default, Ctrl-b) and a command key. The command key to split into two panes is %. From within tmux:
 2 | Ctrl-b %
 3 | 
 4 | We can split our second pane horizontally:
 5 | Ctrl-b "
 6 | 
 7 | To switch to the next pane (panes are numbered left-to-right, top-down):
 8 | Ctrl-b o
 9 | 
10 | A step in context above panes are windows. Windows behave similarly to tabs in a browser.
11 | When tmux starts up, it gives you a window and a single pane inside the window.
12 | To create a new window:
13 | Ctrl-b c
14 | 
15 | tmux will switch to the new window automatically. You can see the new window indicated in the status-line. Windows are numbered from 0, so our new window is number 1.
16 | Now you can create panes and treat this window like we did before. We can even create another window. Our three windows are numbered 0, 1, and 2.
17 | To move to the next window in the index:
18 | Ctrl-b n
19 | 
20 | To move backwards in the index:
21 | Ctrl-b p
22 | 
23 | 


--------------------------------------------------------------------------------
/Chapter05/layer.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 29 May 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy
 7 | import tensorflow as tf
 8 | 
 9 | 
10 | def leaky_relu(x, leak=0.0, name="lrelu"):
11 |     return tf.maximum(leak * x, x, name=name)
12 | 
13 | def add_regularization(var, weight):
14 |     weight_decay = tf.multiply(tf.nn.l2_loss(var), weight, name='weight_loss')
15 |     tf.add_to_collection('losses', weight_decay)
16 | 
17 | def get_variable_on_cpu(shape, initializer, name, dtype=tf.float32, trainable=True):
18 |     with tf.device('/cpu:0'):
19 |         var = tf.get_variable(shape=shape, initializer=initializer, 
20 |                               dtype=dtype, name=name, trainable=trainable)
21 |     return var
22 | 
23 | def HeUniform(shape):
24 |     
25 |     if len(shape) > 2:
26 |         w = shape[0]
27 |         h = shape[1]
28 |         input_channels  = shape[2]
29 |         d = 1.0 / numpy.sqrt(input_channels * w * h)
30 |     else:
31 |         d = 1.0 / numpy.sqrt(shape[0])
32 |     
33 |     init_W = tf.random_uniform_initializer(-d, d)
34 |     init_b = tf.random_uniform_initializer(-d, d)
35 |     return init_W, init_b
36 | 
37 | def conv2d(x, output_dim, kernel=(5, 5), stride=(2, 2), 
38 |            activation=tf.nn.relu, init_W=None, init_b=None, name='conv', padding='VALID'):
39 |     
40 |     assert len(x.get_shape().as_list()) == 4
41 |     shape = (kernel[0], kernel[1], x.get_shape().as_list()[-1], output_dim)
42 |     _W, _b = HeUniform(shape)
43 |     if init_W is None: init_W = _W
44 |     if init_b is None: init_b = _b
45 | 
46 |     with tf.variable_scope(name):
47 |         W = get_variable_on_cpu(shape=shape, initializer=init_W, dtype=tf.float32, name='weight')
48 |         b = get_variable_on_cpu(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias')
49 |         
50 |         conv = tf.nn.conv2d(input=x, filter=W, strides=(1, stride[0], stride[1], 1), padding=padding)
51 |         if activation:
52 |             conv = activation(tf.nn.bias_add(conv, b))
53 |         else:
54 |             conv = tf.nn.bias_add(conv, b)
55 |     
56 |     return conv
57 | 
58 | def linear(x, output_dim, activation=tf.nn.relu, init_W=None, init_b=None, name='linear'):
59 |     
60 |     if len(x.get_shape().as_list()) > 2:
61 |         shape = x.get_shape().as_list()
62 |         x = tf.reshape(x, shape=(-1, numpy.prod(shape[1:])))
63 | 
64 |     shape = (x.get_shape().as_list()[-1], output_dim)
65 |     _W, _b = HeUniform(shape)
66 |     if init_W is None: init_W = _W
67 |     if init_b is None: init_b = _b
68 | 
69 |     with tf.variable_scope(name):
70 |         W = get_variable_on_cpu(shape=shape, initializer=init_W, dtype=tf.float32, name='weight')
71 |         b = get_variable_on_cpu(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias')
72 |         
73 |         linear = tf.matmul(x, W) + b
74 |         if activation:
75 |             linear = activation(linear)
76 |     
77 |     return linear
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/Chapter05/lstm_policy.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on 31 May 2017
  3 | 
  4 | @author: ywz
  5 | '''
  6 | import tensorflow as tf
  7 | from layer import conv2d, linear
  8 | 
  9 | 
 10 | class LSTMPolicy:
 11 |     
 12 |     def __init__(self, input_shape=(84, 84, 4), n_outputs=4, network_type='cnn'):
 13 |         
 14 |         self.width = input_shape[0]
 15 |         self.height = input_shape[1]
 16 |         self.channel = input_shape[2]
 17 |         self.n_outputs = n_outputs
 18 |         self.network_type = network_type
 19 |         self.entropy_beta = 0.01
 20 |         
 21 |         self.x = tf.placeholder(dtype=tf.float32, 
 22 |                                 shape=(None, self.channel, self.width, self.height))
 23 |         self.build_model()
 24 |         
 25 |     def build_model(self):
 26 |         
 27 |         self.net = {}
 28 |         self.net['input'] = tf.transpose(self.x, perm=(0, 2, 3, 1))
 29 |             
 30 |         if self.network_type == 'cnn':
 31 |             self.net['conv1'] = conv2d(self.net['input'], 16, kernel=(8, 8), stride=(4, 4), name='conv1')
 32 |             self.net['conv2'] = conv2d(self.net['conv1'], 32, kernel=(4, 4), stride=(2, 2), name='conv2')
 33 |             self.net['feature'] = linear(self.net['conv2'], 256, name='fc1')
 34 |         else:
 35 |             # MLP for testing
 36 |             self.net['fc1'] = linear(self.net['input'], 50, init_b = tf.constant_initializer(0.0), name='fc1')
 37 |             self.net['feature'] = linear(self.net['fc1'], 50, init_b = tf.constant_initializer(0.0), name='fc2')
 38 |         
 39 |         num_units = self.net['feature'].get_shape().as_list()[-1]
 40 |         self.lstm = tf.contrib.rnn.BasicLSTMCell(num_units=num_units, forget_bias=0.0, state_is_tuple=True)
 41 |         self.init_state = self.lstm.zero_state(batch_size=1, dtype=tf.float32)
 42 |         
 43 |         step_size = tf.shape(self.x)[:1]
 44 |         feature = tf.expand_dims(self.net['feature'], axis=0)
 45 |         lstm_outputs, lstm_state = tf.nn.dynamic_rnn(self.lstm, feature, 
 46 |                                                      initial_state=self.init_state, 
 47 |                                                      sequence_length=step_size,
 48 |                                                      time_major=False)
 49 |         outputs = tf.reshape(lstm_outputs, shape=(-1, num_units))
 50 |         self.final_state = lstm_state
 51 |         
 52 |         self.net['value'] = tf.reshape(linear(outputs, 1, activation=None, name='value',
 53 |                                               init_b = tf.constant_initializer(0.0)), 
 54 |                                        shape=(-1,))
 55 |         
 56 |         self.net['logits'] = linear(outputs, self.n_outputs, activation=None, name='logits',
 57 |                                     init_b = tf.constant_initializer(0.0))
 58 |         
 59 |         self.net['policy'] = tf.nn.softmax(self.net['logits'], name='policy')
 60 |         self.net['log_policy'] = tf.nn.log_softmax(self.net['logits'], name='log_policy')
 61 |         
 62 |         self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
 63 |     
 64 |     def build_gradient_op(self, clip_grad=None):
 65 | 
 66 |         self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.n_outputs), name='action')
 67 |         self.reward = tf.placeholder(dtype=tf.float32, shape=(None,), name='reward')
 68 |         self.advantage = tf.placeholder(dtype=tf.float32, shape=(None,), name='advantage')
 69 | 
 70 |         value = self.net['value']
 71 |         policy = self.net['policy']
 72 |         log_policy = self.net['log_policy']
 73 |         
 74 |         entropy = -tf.reduce_sum(policy * log_policy, axis=1)
 75 |         p_loss = -tf.reduce_sum(tf.reduce_sum(log_policy * self.action, axis=1) * self.advantage + 
 76 |                                 self.entropy_beta * entropy)
 77 |         v_loss = 0.5 * tf.reduce_sum((value - self.reward) ** 2)
 78 |         total_loss = p_loss + v_loss
 79 |         
 80 |         self.gradients = tf.gradients(total_loss, self.vars)
 81 |         if clip_grad is not None:
 82 |             self.gradients, _ = tf.clip_by_global_norm(self.gradients, clip_grad)
 83 |         
 84 |         # Add summaries
 85 |         tf.summary.scalar("policy_loss", p_loss, collections=['policy_network'])
 86 |         tf.summary.scalar("value_loss", v_loss, collections=['policy_network'])
 87 |         tf.summary.scalar("entropy", tf.reduce_mean(entropy), collections=['policy_network'])
 88 |         # tf.summary.scalar("grad_global_norm", tf.global_norm(self.gradients), collections=['policy_network'])
 89 |         self.summary_op = tf.summary.merge_all('policy_network')
 90 |         
 91 |         return self.gradients
 92 |     
 93 |     def run_initial_state(self, sess):
 94 |         return sess.run(self.init_state)
 95 |     
 96 |     def run_value(self, sess, state, cell, *args):
 97 |         feed_dict={self.x: state, self.init_state[0]: cell[0], self.init_state[1]: cell[1]}
 98 |         value = sess.run(self.net['value'], feed_dict=feed_dict)
 99 |         return value
100 |     
101 |     def run_policy_and_value(self, sess, state, cell, *args):
102 |         feed_dict={self.x: state, self.init_state[0]: cell[0], self.init_state[1]: cell[1]}
103 |         policy, value, final_state = sess.run([self.net['policy'], self.net['value'], self.final_state],
104 |                                               feed_dict=feed_dict)
105 |         return policy, value, final_state
106 |     
107 |     def get_feed_dict(self, states, actions, rewards, advantages, cell, *args):
108 |         feed_dict={self.x: states, self.action: actions, 
109 |                    self.reward: rewards, self.advantage: advantages,
110 |                    self.init_state[0]: cell[0], self.init_state[1]: cell[1]}
111 |         return feed_dict
112 |     
113 |     


--------------------------------------------------------------------------------
/Chapter05/minecraft/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 19, 2018
3 | 
4 | @author: ywz
5 | '''
6 | 


--------------------------------------------------------------------------------
/Chapter05/minecraft/game.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on Jul 10, 2018
  3 | 
  4 | @author: ywz
  5 | '''
  6 | import gym
  7 | import gym_minecraft
  8 | import minecraft_py
  9 | import numpy, time
 10 | from utils import cv2_resize_image
 11 | 
 12 | 
 13 | class Game:
 14 | 
 15 |     def __init__(self, name='MinecraftBasic-v0', discrete_movement=False):
 16 |         
 17 |         self.env = gym.make(name)
 18 |         if discrete_movement:
 19 |             self.env.init(start_minecraft=True, allowDiscreteMovement=["move", "turn"])
 20 |         else:
 21 |             self.env.init(start_minecraft=True, allowContinuousMovement=["move", "turn"])
 22 |         self.actions = list(range(self.env.action_space.n))
 23 |         frame = self.env.reset()
 24 |         
 25 |         self.frame_skip = 1
 26 |         self.total_reward = 0
 27 |         self.crop_size = 84
 28 |         
 29 |         # Frame buffer
 30 |         self.buffer_size = 8
 31 |         self.buffer_index = 0
 32 |         self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)]
 33 |         self.last_frame = frame
 34 |     
 35 |     def rgb_to_gray(self, im):
 36 |         return numpy.dot(im, [0.2126, 0.7152, 0.0722])
 37 |     
 38 |     def set_params(self, crop_size=84, frame_skip=4):
 39 |         
 40 |         self.crop_size = crop_size
 41 |         self.frame_skip = frame_skip
 42 |         
 43 |         frame = self.env.reset()
 44 |         self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)]
 45 |         self.last_frame = frame
 46 |     
 47 |     def reset(self):
 48 |         frame = self.env.reset()
 49 |         self.total_reward = 0
 50 |         self.buffer_index = 0
 51 |         self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)]
 52 |         self.last_frame = frame
 53 |     
 54 |     def add_frame_to_buffer(self, frame):
 55 |         self.buffer_index = self.buffer_index % self.buffer_size
 56 |         self.buffer[self.buffer_index] = frame
 57 |         self.buffer_index += 1
 58 |     
 59 |     def get_available_actions(self):
 60 |         return list(range(len(self.actions)))
 61 |     
 62 |     def get_feedback_size(self):
 63 |         return (self.crop_size, self.crop_size)
 64 |     
 65 |     def crop(self, frame):
 66 |         feedback = cv2_resize_image(frame, 
 67 |                                     resized_shape=(self.crop_size, self.crop_size), 
 68 |                                     method='scale', crop_offset=0)
 69 |         return feedback
 70 |     
 71 |     def get_current_feedback(self, num_frames=4):
 72 |         assert num_frames < self.buffer_size, "Frame buffer is not large enough."
 73 |         index = self.buffer_index - 1
 74 |         frames = [numpy.expand_dims(self.buffer[index - k], axis=0) for k in range(num_frames)]
 75 |         if num_frames > 1:
 76 |             return numpy.concatenate(frames, axis=0)
 77 |         else:
 78 |             return frames[0]
 79 |     
 80 |     def get_total_reward(self):
 81 |         return self.total_reward
 82 |     
 83 |     def play_action(self, action, num_frames=4):
 84 |         
 85 |         reward = 0
 86 |         termination = 0
 87 |         for i in range(self.frame_skip):
 88 |             a = self.actions[action]
 89 |             frame, r, done, _ = self.env.step(a)
 90 |             reward += r
 91 |             if i == self.frame_skip - 2: 
 92 |                 self.last_frame = frame
 93 |             if done: 
 94 |                 termination = 1
 95 |         self.add_frame_to_buffer(self.crop(numpy.maximum(self.rgb_to_gray(frame), self.rgb_to_gray(self.last_frame))))
 96 |         
 97 |         r = numpy.clip(reward, -1, 1)
 98 |         self.total_reward += reward
 99 |         
100 |         return r, self.get_current_feedback(num_frames), termination
101 |     
102 |     def draw(self):
103 |         time.sleep(1 / 120.0)
104 |         self.env.render(mode='human')
105 | 


--------------------------------------------------------------------------------
/Chapter05/parameter.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Jan 24, 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import pickle
 7 | from utils import log_uniform
 8 | 
 9 | class Parameter:
10 |     
11 |     def __init__(self, lr, directory=None):
12 |         
13 |         self.directory = directory
14 |             
15 |         if isinstance(lr, tuple):
16 |             assert len(lr) == 2
17 |             assert lr[0] < lr[1]
18 |             self.learning_rate = log_uniform(lr[0], lr[1])
19 |         else:
20 |             self.learning_rate = lr
21 |         
22 |         self.gamma = 0.99
23 |         self.num_history_frames = 4
24 |         self.iteration_num = 100000
25 |         self.async_update_interval = 5
26 |             
27 |         self.rho = 0.99
28 |         self.rmsprop_epsilon = 1e-6
29 |         self.update_method = 'rmsprop'
30 |         self.clip_delta = 0
31 |         self.max_iter_num = 10 ** 8
32 |         self.network_type = 'cnn'
33 |         self.input_scale = 255.0
34 |         
35 |     def get(self):
36 |         
37 |         param = {}
38 |         param['directory'] = self.directory
39 |         param['learning_rate'] = self.learning_rate
40 |         
41 |         param['gamma'] = self.gamma
42 |         param['num_frames'] = self.num_history_frames
43 |         param['iteration_num'] = self.iteration_num
44 |         param['async_update_interval'] = self.async_update_interval
45 |         
46 |         param['rho'] = self.rho
47 |         param['rmsprop_epsilon'] = self.rmsprop_epsilon
48 |         param['update_method'] = self.update_method
49 |         param['clip_delta'] = self.clip_delta
50 |         param['max_iter_num'] = self.max_iter_num
51 |         param['network_type'] = self.network_type
52 |         param['input_scale'] = self.input_scale
53 |         
54 |         return param
55 |     
56 |     def __str__(self):
57 |         param = self.get()
58 |         strs = ["{}: {}".format(key, value) for key, value in param.items()]
59 |         return "\n".join(strs)
60 |     
61 |     def save(self, filename):
62 |         assert self.directory is not None
63 |         filename = '{}/{}'.format(self.directory, filename)
64 |         with open(filename, 'wb') as f:
65 |             pickle.dump(self.get(), f)
66 |             


--------------------------------------------------------------------------------
/Chapter05/save/breakout/train/log_0/events.out.tfevents.1532007719.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/breakout/train/log_0/events.out.tfevents.1532007719.ywz-PC


--------------------------------------------------------------------------------
/Chapter05/save/breakout/train/log_1/events.out.tfevents.1532007719.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/breakout/train/log_1/events.out.tfevents.1532007719.ywz-PC


--------------------------------------------------------------------------------
/Chapter05/save/demo/train/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "a3c_model.ckpt"
2 | all_model_checkpoint_paths: "a3c_model.ckpt"
3 | 


--------------------------------------------------------------------------------
/Chapter05/save/demo/train/log_0/events.out.tfevents.1532007504.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/demo/train/log_0/events.out.tfevents.1532007504.ywz-PC


--------------------------------------------------------------------------------
/Chapter05/save/demo/train/log_1/events.out.tfevents.1532007504.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/demo/train/log_1/events.out.tfevents.1532007504.ywz-PC


--------------------------------------------------------------------------------
/Chapter05/save/minecraftbasic-v0/train/log_0/events.out.tfevents.1532007895.ywz-PC:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/minecraftbasic-v0/train/log_0/events.out.tfevents.1532007895.ywz-PC


--------------------------------------------------------------------------------
/Chapter05/test.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 31 May 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import time
 7 | import argparse, os, sys, signal
 8 | import tensorflow as tf
 9 | from a3c import A3C
10 | from cluster import cluster_spec
11 | from environment import new_environment
12 | 
13 | def shutdown(signal, frame):
14 |     print('Received signal {}: exiting'.format(signal))
15 |     sys.exit(128 + signal)
16 | 
17 | def test(args, server):
18 |     
19 |     log_dir = os.path.join(args.log_dir, '{}/train'.format(args.env))
20 |     game, parameter = new_environment(name=args.env, test=True)
21 |     a3c = A3C(game, log_dir, parameter.get(), agent_index=args.task, callback=game.draw)
22 |     
23 |     config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
24 |     with tf.Session(target=server.target, config=config) as sess:
25 |         saver = tf.train.Saver()
26 |         a3c.load(sess, saver, model_name='best_a3c_model.ckpt')
27 |         a3c.evaluate(sess, n_episode=10, saver=None, verbose=True)
28 | 
29 | def main():
30 |     
31 |     parser = argparse.ArgumentParser(description=None)
32 |     parser.add_argument('-t', '--task', default=0, type=int, help='Task index')
33 |     parser.add_argument('-j', '--job_name', default="worker", type=str, help='worker or ps')
34 |     parser.add_argument('-w', '--num_workers', default=1, type=int, help='Number of workers')
35 |     parser.add_argument('-l', '--log_dir', default="save", type=str, help='Log directory path')
36 |     parser.add_argument('-e', '--env', default="demo", type=str, help='Environment')
37 |     
38 |     args = parser.parse_args()
39 |     spec = cluster_spec(args.num_workers, 1)
40 |     cluster = tf.train.ClusterSpec(spec)
41 |     
42 |     signal.signal(signal.SIGHUP, shutdown)
43 |     signal.signal(signal.SIGINT, shutdown)
44 |     signal.signal(signal.SIGTERM, shutdown)
45 |     
46 |     if args.job_name == "worker":
47 |         server = tf.train.Server(cluster, 
48 |                                  job_name="worker", 
49 |                                  task_index=args.task,
50 |                                  config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2))
51 |         test(args, server)
52 |     else:
53 |         server = tf.train.Server(cluster, 
54 |                                  job_name="ps", 
55 |                                  task_index=args.task,
56 |                                  config=tf.ConfigProto(device_filters=["/job:ps"]))
57 |         # server.join()
58 |         while True:
59 |             time.sleep(1000)
60 | 
61 | if __name__ == "__main__":
62 |     main()
63 |     


--------------------------------------------------------------------------------
/Chapter05/timer.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Jan 20, 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import time
 7 | 
 8 | class Timer:
 9 |     
10 |     def __init__(self):
11 |         self.total_time = 0
12 |         self.current_time = 0
13 |         self.name = ''
14 |         
15 |     def reset(self):
16 |         self.total_time = 0
17 |         self.current_time = 0
18 |     
19 |     def set_name(self, name):
20 |         self.name = name
21 |         
22 |     def begin(self):
23 |         self.current_time = time.time()
24 |         
25 |     def end(self):
26 |         self.total_time += time.time() - self.current_time
27 |         
28 |     def total_time(self):
29 |         return self.total_time
30 |     
31 |     def print(self):
32 |         print('{} took {}s'.format(self.name, self.total_time))
33 |         


--------------------------------------------------------------------------------
/Chapter05/train.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 31 May 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import argparse, os, sys, cluster
 7 | from six.moves import shlex_quote  #@UnresolvedImport
 8 | 
 9 | parser = argparse.ArgumentParser(description="Run commands")
10 | parser.add_argument('-w', '--num_workers', default=1, type=int,
11 |                     help="Number of workers")
12 | parser.add_argument('-e', '--env', type=str, default="demo",
13 |                     help="Environment")
14 | parser.add_argument('-l', '--log_dir', type=str, default="save",
15 |                     help="Log directory path")
16 | 
17 | def new_cmd(session, name, cmd, logdir, shell):
18 |     if isinstance(cmd, (list, tuple)):
19 |         cmd = " ".join(shlex_quote(str(v)) for v in cmd)
20 |     return name, "tmux send-keys -t {}:{} {} Enter".format(session, name, shlex_quote(cmd))
21 | 
22 | def create_commands(session, num_workers, logdir, env, shell='bash'):
23 | 
24 |     base_cmd = ['CUDA_VISIBLE_DEVICES=',
25 |                 sys.executable, 
26 |                 'worker.py', 
27 |                 '--log_dir', logdir,
28 |                 '--num_workers', str(num_workers),
29 |                 '--env', env]
30 | 
31 |     cmds_map = [new_cmd(session, "ps", base_cmd + ["--job_name", "ps"], logdir, shell)]
32 |     for i in range(num_workers):
33 |         cmd = base_cmd + ["--job_name", "worker", "--task", str(i)]
34 |         cmds_map.append(new_cmd(session, "w-%d" % i, cmd, logdir, shell))
35 |     cmds_map.append(new_cmd(session, "htop", ["htop"], logdir, shell))
36 |     
37 |     windows = [v[0] for v in cmds_map]
38 |     notes = ["Use `tmux attach -t {}` to watch process output".format(session),
39 |              "Use `tmux kill-session -t {}` to kill the job".format(session),
40 |              "Use `ssh -L PORT:SERVER_IP:SERVER_PORT username@server_ip` to remote Tensorboard"]
41 | 
42 |     cmds = ["kill $(lsof -i:{}-{} -t) > /dev/null 2>&1".format(cluster.PORT, num_workers+cluster.PORT),
43 |             "tmux kill-session -t {}".format(session),
44 |             "tmux new-session -s {} -n {} -d {}".format(session, windows[0], shell)]
45 |     
46 |     for w in windows[1:]:
47 |         cmds.append("tmux new-window -t {} -n {} {}".format(session, w, shell))
48 |     cmds.append("sleep 1")
49 | 
50 |     for _, cmd in cmds_map:
51 |         cmds.append(cmd)
52 |     return cmds, notes
53 | 
54 | def main():
55 |     
56 |     args = parser.parse_args()
57 |     cmds, notes = create_commands("a3c", args.num_workers, args.log_dir, args.env)
58 | 
59 |     print("Executing the following commands:")
60 |     print("\n".join(cmds))
61 |     
62 |     os.environ["TMUX"] = ""
63 |     os.system("\n".join(cmds))
64 |     
65 |     print("Notes:")
66 |     print('\n'.join(notes))
67 |     
68 | if __name__ == "__main__":
69 |     main()
70 |     
71 |     


--------------------------------------------------------------------------------
/Chapter05/utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Nov 8, 2016
 3 | 
 4 | @author: a0096049
 5 | '''
 6 | import math, random
 7 | import numpy, cv2
 8 | import skimage.transform
 9 | import tensorflow as tf
10 | 
11 | 
12 | def preprocess_image(im, image_shape=(110, 84), crop_shape=84, crop_part='down'):
13 |     
14 |     im = skimage.transform.resize(im, image_shape, preserve_range=True)
15 |     
16 |     half = int(crop_shape / 2)
17 |     h, w = im.shape
18 |     if crop_part == 'center':
19 |         im = im[h//2-half:h//2+half, w//2-half:w//2+half]
20 |     if crop_part == 'down':
21 |         im = im[h-crop_shape:h, w//2-half:w//2+half]
22 | 
23 |     return numpy.asarray(im, dtype=numpy.uint8)
24 | 
25 | def cv2_resize_image(image, resized_shape=(84, 84), method='crop', crop_offset=8):
26 |         
27 |         height, width = image.shape
28 |         resized_height, resized_width = resized_shape
29 |         
30 |         if method == 'crop':
31 |             h = int(round(float(height) * resized_width / width))
32 |             resized = cv2.resize(image, (resized_width, h), interpolation=cv2.INTER_LINEAR)
33 |             crop_y_cutoff = h - crop_offset - resized_height
34 |             cropped = resized[crop_y_cutoff : crop_y_cutoff + resized_height, :]
35 |             return numpy.asarray(cropped, dtype=numpy.uint8)
36 |         elif method == 'scale':
37 |             return numpy.asarray(cv2.resize(image, (resized_width, resized_height), 
38 |                                             interpolation=cv2.INTER_LINEAR), dtype=numpy.uint8)
39 |         else:
40 |             raise ValueError('Unrecognized image resize method.')
41 | 
42 | def log_uniform(low, high):
43 |     return math.exp(random.uniform(math.log(low), math.log(high)))
44 | 
45 | def update_target_graph(from_vars, to_vars):
46 |     
47 |     op_holder = []
48 |     for from_var, to_var in zip(from_vars, to_vars):
49 |         op_holder.append(to_var.assign(from_var))
50 |         
51 |     return op_holder
52 | 
53 | def create_optimizer(method, learning_rate, rho, epsilon):
54 |     
55 |     if method == 'rmsprop':
56 |         opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate, 
57 |                                         decay=rho,
58 |                                         epsilon=epsilon)
59 |     elif method == 'adam':
60 |         opt = tf.train.AdamOptimizer(learning_rate=learning_rate,
61 |                                      beta1=rho)
62 |     else:
63 |         raise
64 |     
65 |     return opt
66 | 
67 |     


--------------------------------------------------------------------------------
/Chapter05/worker.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on 29 May 2017
 3 | 
 4 | @author: ywz
 5 | '''
 6 | import numpy, time, random
 7 | import argparse, os, sys, signal
 8 | import tensorflow as tf
 9 | from a3c import A3C
10 | from cluster import cluster_spec
11 | from environment import new_environment
12 | 
13 | def set_random_seed(seed):
14 |     random.seed(seed)
15 |     numpy.random.seed(seed)
16 | 
17 | def delete_dir(path):
18 |     if tf.gfile.Exists(path):
19 |         tf.gfile.DeleteRecursively(path)
20 |     tf.gfile.MakeDirs(path)
21 |     return path
22 | 
23 | def shutdown(signal, frame):
24 |     print('Received signal {}: exiting'.format(signal))
25 |     sys.exit(128 + signal)
26 | 
27 | def train(args, server):
28 |     
29 |     os.environ['OMP_NUM_THREADS'] = '1'
30 |     set_random_seed(args.task * 17)
31 |     log_dir = os.path.join(args.log_dir, '{}/train'.format(args.env))
32 |     if not tf.gfile.Exists(log_dir):
33 |         tf.gfile.MakeDirs(log_dir)
34 | 
35 |     game, parameter = new_environment(args.env)
36 |     a3c = A3C(game, log_dir, parameter.get(), agent_index=args.task, callback=None)
37 | 
38 |     global_vars = [v for v in tf.global_variables() if not v.name.startswith("local")]    
39 |     ready_op = tf.report_uninitialized_variables(global_vars)
40 |     config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
41 | 
42 |     with tf.Session(target=server.target, config=config) as sess:
43 |         saver = tf.train.Saver()
44 |         path = os.path.join(log_dir, 'log_%d' % args.task)
45 |         writer = tf.summary.FileWriter(delete_dir(path), sess.graph_def)
46 |         a3c.set_summary_writer(writer)
47 |         
48 |         if args.task == 0:
49 |             sess.run(tf.global_variables_initializer())
50 |         else:
51 |             while len(sess.run(ready_op)) > 0:
52 |                 print("Waiting for task 0 initializing the global variables.")
53 |                 time.sleep(1)
54 |         a3c.run(sess, saver)
55 | 
56 | def main():
57 |     
58 |     parser = argparse.ArgumentParser(description=None)
59 |     parser.add_argument('-t', '--task', default=0, type=int, help='Task index')
60 |     parser.add_argument('-j', '--job_name', default="worker", type=str, help='worker or ps')
61 |     parser.add_argument('-w', '--num_workers', default=1, type=int, help='Number of workers')
62 |     parser.add_argument('-l', '--log_dir', default="save", type=str, help='Log directory path')
63 |     parser.add_argument('-e', '--env', default="demo", type=str, help='Environment')
64 |     
65 |     args = parser.parse_args()
66 |     spec = cluster_spec(args.num_workers, 1)
67 |     cluster = tf.train.ClusterSpec(spec)
68 | 
69 |     signal.signal(signal.SIGHUP, shutdown)
70 |     signal.signal(signal.SIGINT, shutdown)
71 |     signal.signal(signal.SIGTERM, shutdown)
72 |     
73 |     if args.job_name == "worker":
74 |         server = tf.train.Server(cluster, 
75 |                                  job_name="worker", 
76 |                                  task_index=args.task,
77 |                                  config=tf.ConfigProto(intra_op_parallelism_threads=0, 
78 |                                                        inter_op_parallelism_threads=0)) # Use default op_parallelism_threads
79 |         train(args, server)
80 |     else:
81 |         server = tf.train.Server(cluster, 
82 |                                  job_name="ps", 
83 |                                  task_index=args.task,
84 |                                  config=tf.ConfigProto(device_filters=["/job:ps"]))
85 |         # server.join()
86 |         while True:
87 |             time.sleep(1000)
88 | 
89 | if __name__ == "__main__":
90 |     main()
91 |     
92 |     


--------------------------------------------------------------------------------
/Chapter06/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter06/__init__.py


--------------------------------------------------------------------------------
/Chapter06/commands.txt:
--------------------------------------------------------------------------------
 1 | ### TRAINING
 2 | 
 3 | # First epoch
 4 | 
 5 | # Bootstrapping
 6 | 
 7 | export MODEL_NAME=000000-bootstrap
 8 | python main.py bootstrap /tmp/minigo_working_dir models/$MODEL_NAME
 9 | 
10 | # Selfplay
11 | python main.py selfplay models/$MODEL_NAME
12 | 
13 | # Gather training data and shuffle
14 | python main.py gather
15 | 
16 | # Train
17 | python main.py train /tmp/minigo_working_dir \
18 |     data/training_chunks models/000001-bootstrap -g 1
19 | 
20 | # Second epoch onwards
21 | 
22 | # Increment params
23 | export MODEL_NAME=000001-bootstrap
24 | python main.py selfplay models/$MODEL_NAME
25 | python main.py gather
26 | python main.py train /tmp/minigo_working_dir \
27 |     data/training_chunks models/000002-bootstrap -g 2
28 | 
29 | ### TESTING
30 | 
31 | # Export models
32 | export MINIGO_MODELS=/tmp/minigo-models
33 | cp models/000001* $MINIGO_MODELS
34 | 
35 | # Execute selfplay
36 | python rl_loop.py selfplay --readouts=5 -v 3
37 | 


--------------------------------------------------------------------------------
/Chapter06/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter06/src/__init__.py


--------------------------------------------------------------------------------
/Chapter06/src/config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Configuration store
 3 | """
 4 | from constants import HYPERPARAMS
 5 | 
 6 | class GOPARAMETERS:
 7 |     N = 9
 8 |     WHITE = -1
 9 |     EMPTY = 0
10 |     BLACK = 1
11 |     FILL = 2
12 |     KO = 3
13 |     UNKNOWN = 4
14 |     MISSING_GROUP_ID = -1
15 |     COL_NAMES = 'ABCDEFGHJKLMNOPQRST'
16 |     SGF_COLUMNS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
17 |     KGS_COLUMNS = 'ABCDEFGHJKLMNOPQRSTUVWXYZ'
18 | 
19 | class GLOBAL_PARAMETER_STORE:
20 |     # How many positions we should aggregate per 'chunk'.
21 |     EXAMPLES_PER_RECORD = 10000
22 |     # How many positions to draw from for our training window.
23 |     # AGZ used the most recent 500k games, which, assuming 250 moves/game = 125M
24 |     WINDOW_SIZE = 125000000
25 |     # Number of positions to look at per generation
26 |     EXAMPLES_PER_GENERATION = 2000000
27 |     # Number of selfplay games
28 |     NUM_SELFPLAY_GAMES = 100
29 |     # Positions per batch
30 |     TRAIN_BATCH_SIZE = 16
31 |     # Number of games before the selfplay workers will stop
32 |     MAX_GAMES_PER_GENERATION = 10000
33 |     # Proportion of games to holdout from training per generation
34 |     HOLDOUT = 0.05
35 |     # Number of leaves to consider simultaneously
36 |     SIMULTANEOUS_LEAVES = 8
37 |     # Step boundaries for changing the learning rate
38 |     BOUNDARIES = [int(1e6), int(2e6)]
39 |     # Learning rates corresponding to boundaries
40 |     LEARNING_RATE = [1e-2, 1e-3, 1e-4]
41 |     SGF_TEMPLATE = '''(;GM[1]FF[4]CA[UTF-8]AP[Minigo_sgfgenerator]RU[{ruleset}]
42 |     SZ[{boardsize}]KM[{komi}]PW[{white_name}]PB[{black_name}]RE[{result}]
43 |     {game_moves})'''
44 |     PROGRAM_IDENTIFIER = "AlphaGoZero"
45 |     TEMPERATURE_CUTOFF = int((GOPARAMETERS.N * GOPARAMETERS.N) / 12)
46 |     # TFRecords related parameters
47 |     SHUFFLE_BUFFER_SIZE = int(2*1e4)
48 |     CYCLE_LENGTH = 16
49 |     BLOCK_LENGTH = 64
50 |     # Number of MCTS readouts we do during selfplay
51 |     SELFPLAY_READOUTS = 1600
52 |     # Default resign threshold
53 |     RESIGN_THRESHOLD = -0.90
54 |     # Number of MCTS readouts we do during evaluation
55 |     EVALUATION_READOUTS = 400
56 |     # Number of games to play during evaluation
57 |     EVALUATION_GAMES = 16
58 |     # Buffer size for when validating model
59 |     VALIDATION_BUFFER_SIZE = 1000
60 |     # Number of global steps when validating model
61 |     VALIDATION_NUMBER_OF_STEPS = 1000
62 | 
63 | class MCTSPARAMETERS:
64 |     # 505 moves for 19x19, 113 for 9x9
65 |     MAX_DEPTH = (GOPARAMETERS.N ** 2) * 1.4
66 |     # Exploration constant
67 |     c_PUCT = 1.38
68 |     # Dirichlet noise, as a function of GOPARAMETERS.N
69 |     DIRICHLET_NOISE = 0.03 * 361 / (GOPARAMETERS.N ** 2)
70 | 
71 | class AGENTPARAMETERS:
72 |     SECONDS_PER_MOVE = 5
73 | 
74 | ALL_POSITIONS = [(i, j) for i in range(GOPARAMETERS.N) for j in range(GOPARAMETERS.N)]
75 | NEIGHBORS = {(x, y): list(filter(lambda c: c[0] % GOPARAMETERS.N == c[0] and c[1] % GOPARAMETERS.N == c[1], [
76 |     (x+1, y), (x-1, y), (x, y+1), (x, y-1)])) for x, y in ALL_POSITIONS}
77 | DIAGONALS = {(x, y): list(filter(lambda c: c[0] % GOPARAMETERS.N == c[0] and c[1] % GOPARAMETERS.N == c[1], [
78 |     (x+1, y+1), (x+1, y-1), (x-1, y+1), (x-1, y-1)])) for x, y in ALL_POSITIONS}
79 | 
80 | """
81 |     k: number of filters (AlphaGoZero used 256). We use 128 by
82 |     default for a 19x19 go board.
83 |     fc_width: Dimensionality of the fully connected linear layer
84 |     num_shared_layers: number of shared residual blocks.  AGZ used both 19
85 |     and 39. Here we use 19 because it's faster to train.
86 |     l2_strength: The L2 regularization parameter.
87 |     momentum: The momentum parameter for training
88 | """
89 | NETWORK_HYPERPARAMETERS = {
90 |     HYPERPARAMS.NUM_FILTERS: 128,  # Width of each conv layer
91 |     HYPERPARAMS.FC_WIDTH: 2 * 128,  # Width of each fully connected layer
92 |     HYPERPARAMS.NUMSHAREDLAYERS: 19,  # Number of shared trunk layers
93 |     HYPERPARAMS.BETA: 1e-4,  # Regularization strength
94 |     HYPERPARAMS.MOMENTUM: 0.9,  # Momentum used in SGD
95 | }
96 | 


--------------------------------------------------------------------------------
/Chapter06/src/constants.py:
--------------------------------------------------------------------------------
 1 | MODEL_NUM_REGEX = "^\d{6}"
 2 | MODEL_NAME_REGEX = "^\d{6}(-\w+)+"
 3 | 
 4 | class HYPERPARAMS:
 5 |     BETA = 'beta'
 6 |     MOMENTUM = 'momentum'
 7 |     NUMSHAREDLAYERS = 'num_shared_layers'
 8 |     FC_WIDTH = 'fc_width'
 9 |     NUM_FILTERS = 'k'
10 |     EPSILON = "epsilon"
11 | 
12 | class PATHS:
13 |     MODELS_DIR = "models/"
14 |     SELFPLAY_DIR = 'data/selfplay/'
15 |     HOLDOUT_DIR = "data/holdout/"
16 |     SGF_DIR = "data/sgf/"
17 |     TRAINING_CHUNK_DIR = "data/training_chunks/"
18 |     ESTIMATOR_WORKING_DIR = 'estimator_working_dir/'
19 |     INITIAL_CHECKPOINT_NAME = "model.ckpt-1"
20 | 
21 | class FEATUREPARAMETERS:
22 |     NUM_CHANNELS = 17
23 | 


--------------------------------------------------------------------------------
/Chapter06/src/features.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | from config import GOPARAMETERS
 4 | 
 5 | def stone_features(board_state):
 6 |     # 16 planes, where every other plane represents the stones of a particular color
 7 |     # which means we track the stones of the last 8 moves.
 8 |     features = np.zeros([16, GOPARAMETERS.N, GOPARAMETERS.N], dtype=np.uint8)
 9 | 
10 |     num_deltas_avail = board_state.board_deltas.shape[0]
11 |     cumulative_deltas = np.cumsum(board_state.board_deltas, axis=0)
12 |     last_eight = np.tile(board_state.board, [8, 1, 1])
13 |     last_eight[1:num_deltas_avail + 1] -= cumulative_deltas
14 |     last_eight[num_deltas_avail +1:] = last_eight[num_deltas_avail].reshape(1, GOPARAMETERS.N, GOPARAMETERS.N)
15 | 
16 |     features[::2] = last_eight == board_state.to_play
17 |     features[1::2] = last_eight == -board_state.to_play
18 |     return np.rollaxis(features, 0, 3)
19 | 
20 | def color_to_play_feature(board_state):
21 |     # 1 plane representing which color is to play
22 |     # The plane is filled with 1's if the color to play is black; 0's otherwise
23 |     if board_state.to_play == GOPARAMETERS.BLACK:
24 |         return np.ones([GOPARAMETERS.N, GOPARAMETERS.N, 1], dtype=np.uint8)
25 |     else:
26 |         return np.zeros([GOPARAMETERS.N, GOPARAMETERS.N, 1], dtype=np.uint8)
27 | 
28 | def extract_features(board_state):
29 |     stone_feat = stone_features(board_state=board_state)
30 |     turn_feat = color_to_play_feature(board_state=board_state)
31 |     all_features = np.concatenate([stone_feat, turn_feat], axis=2)
32 |     return all_features
33 | 


--------------------------------------------------------------------------------
/Chapter06/src/preprocessing.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import random
  3 | 
  4 | import numpy as np
  5 | import tensorflow as tf
  6 | 
  7 | from config import GOPARAMETERS, GLOBAL_PARAMETER_STORE
  8 | from constants import FEATUREPARAMETERS
  9 | from features import extract_features
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | logger.setLevel(logging.INFO)
 13 | 
 14 | TF_RECORD_CONFIG = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)
 15 | 
 16 | def _one_hot(index):
 17 |     onehot = np.zeros([GOPARAMETERS.N * GOPARAMETERS.N + 1], dtype=np.float32)
 18 |     onehot[index] = 1
 19 |     return onehot
 20 | 
 21 | 
 22 | def get_input_tensors(list_tf_records, buffer_size=GLOBAL_PARAMETER_STORE.SHUFFLE_BUFFER_SIZE):
 23 |     logger.info("Getting input data and tensors")
 24 |     dataset = process_tf_records(list_tf_records=list_tf_records,
 25 |                                  buffer_size=buffer_size)
 26 |     dataset = dataset.filter(lambda input_tensor: tf.equal(tf.shape(input_tensor)[0],
 27 |                                                            GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE))
 28 |     dataset = dataset.map(parse_batch_tf_example)
 29 |     logger.info("Finished parsing")
 30 |     return dataset.make_one_shot_iterator().get_next()
 31 | 
 32 | 
 33 | def create_dataset_from_selfplay(data_extracts):
 34 |     return (create_tf_train_example(extract_features(board_state), pi, result)
 35 |             for board_state, pi, result in data_extracts)
 36 | 
 37 | 
 38 | def shuffle_tf_examples(batch_size, records_to_shuffle):
 39 |     tf_dataset = process_tf_records(records_to_shuffle, batch_size=batch_size)
 40 |     iterator = tf_dataset.make_one_shot_iterator()
 41 |     next_dataset_batch = iterator.get_next()
 42 |     sess = tf.Session()
 43 |     while True:
 44 |         try:
 45 |             result = sess.run(next_dataset_batch)
 46 |             yield list(result)
 47 |         except tf.errors.OutOfRangeError:
 48 |             break
 49 | 
 50 | 
 51 | def create_tf_train_example(board_state, pi, result):
 52 |     board_state_as_tf_feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[board_state.tostring()]))
 53 |     pi_as_tf_feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[pi.tostring()]))
 54 |     value_as_tf_feature = tf.train.Feature(float_list=tf.train.FloatList(value=[result]))
 55 | 
 56 |     tf_example = tf.train.Example(features=tf.train.Features(feature={
 57 |         'x': board_state_as_tf_feature,
 58 |         'pi': pi_as_tf_feature,
 59 |         'z': value_as_tf_feature
 60 |     }))
 61 | 
 62 |     return tf_example
 63 | 
 64 | def write_tf_examples(record_path, tf_examples, serialize=True):
 65 |     with tf.python_io.TFRecordWriter(record_path, options=TF_RECORD_CONFIG) as tf_record_writer:
 66 |         for tf_example in tf_examples:
 67 |             if serialize:
 68 |                 tf_record_writer.write(tf_example.SerializeToString())
 69 |             else:
 70 |                 tf_record_writer.write(tf_example)
 71 | 
 72 | def parse_batch_tf_example(example_batch):
 73 |     features = {
 74 |         'x': tf.FixedLenFeature([], tf.string),
 75 |         'pi': tf.FixedLenFeature([], tf.string),
 76 |         'z': tf.FixedLenFeature([], tf.float32),
 77 |     }
 78 |     parsed_tensors = tf.parse_example(example_batch, features)
 79 | 
 80 |     # Get the board state
 81 |     x = tf.cast(tf.decode_raw(parsed_tensors['x'], tf.uint8), tf.float32)
 82 |     x = tf.reshape(x, [GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE, GOPARAMETERS.N,
 83 |                        GOPARAMETERS.N, FEATUREPARAMETERS.NUM_CHANNELS])
 84 | 
 85 |     # Get the policy target, which is the distribution of possible moves
 86 |     # Each target is a vector of length of board * length of board + 1
 87 |     distribution_of_moves = tf.decode_raw(parsed_tensors['pi'], tf.float32)
 88 |     distribution_of_moves = tf.reshape(distribution_of_moves,
 89 |                                        [GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE, GOPARAMETERS.N * GOPARAMETERS.N + 1])
 90 | 
 91 |     # Get the result of the game
 92 |     # The result is simply a scalar
 93 |     result_of_game = parsed_tensors['z']
 94 |     result_of_game.set_shape([GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE])
 95 | 
 96 |     return (x, {'pi_label': distribution_of_moves, 'z_label': result_of_game})
 97 | 
 98 | 
 99 | def process_tf_records(list_tf_records, shuffle_records=True,
100 |                        buffer_size=GLOBAL_PARAMETER_STORE.SHUFFLE_BUFFER_SIZE,
101 |                        batch_size=GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE):
102 | 
103 |     if shuffle_records:
104 |         random.shuffle(list_tf_records)
105 | 
106 |     list_dataset = tf.data.Dataset.from_tensor_slices(list_tf_records)
107 | 
108 |     tensors_dataset = list_dataset.interleave(map_func=lambda x: tf.data.TFRecordDataset(x, compression_type='ZLIB'),
109 |                                              cycle_length=GLOBAL_PARAMETER_STORE.CYCLE_LENGTH,
110 |                                              block_length=GLOBAL_PARAMETER_STORE.BLOCK_LENGTH)
111 |     tensors_dataset = tensors_dataset.repeat(1).shuffle(buffer_siz=buffer_size).batch(batch_size)
112 | 
113 |     return tensors_dataset
114 | 
115 | 


--------------------------------------------------------------------------------
/Chapter06/src/train.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import sys
 3 | from utils import timer
 4 | 
 5 | import os
 6 | 
 7 | from constants import PATHS
 8 | 
 9 | import logging
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | def main():
14 | 
15 |     if not os.path.exists(PATHS.SELFPLAY_DIR):
16 |         with timer("Initialize"):
17 |             logger.info('==========================================')
18 |             logger.info("============ Initializing...==============")
19 |             logger.info('==========================================')
20 |             res = subprocess.call("python controller.py initialize-random-model", shell=True)
21 | 
22 |         with timer('Initial Selfplay'):
23 |             logger.info('=======================================')
24 |             logger.info('============ Selplaying...=============')
25 |             logger.info('=======================================')
26 |             subprocess.call('python controller.py selfplay', shell=True)
27 | 
28 |     while True:
29 |         with timer("Aggregate"):
30 |             logger.info('=========================================')
31 |             logger.info("============ Aggregating...==============")
32 |             logger.info('=========================================')
33 |             res = subprocess.call("python controller.py aggregate", shell=True)
34 |             if res != 0:
35 |                 logger.info("Failed to gather")
36 |                 sys.exit(1)
37 | 
38 |         with timer("Train"):
39 |             logger.info('=======================================')
40 |             logger.info("============ Training...===============")
41 |             logger.info('=======================================')
42 |             subprocess.call("python controller.py train", shell=True)
43 | 
44 |         with timer('Selfplay'):
45 |             logger.info('=======================================')
46 |             logger.info('============ Selplaying...=============')
47 |             logger.info('=======================================')
48 |             subprocess.call('python controller.py selfplay', shell=True)
49 | 
50 |         with timer("Validate"):
51 |             logger.info('=======================================')
52 |             logger.info("============ Validating...=============")
53 |             logger.info('=======================================')
54 |             subprocess.call("python controller.py validate", shell=True)
55 | 
56 | 
57 | if __name__ == '__main__':
58 |     main()
59 | 


--------------------------------------------------------------------------------
/Chapter07/RL chatbot.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "%run data_parser.py\n",
10 |     "%run feature_extracter.py \n",
11 |     "from train import train\n",
12 |     "from test import test\n",
13 |     "train(False)\n",
14 |     "test()    #Argument: model path to be used for testing, if None, the default model path is used"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": null,
20 |    "metadata": {},
21 |    "outputs": [],
22 |    "source": []
23 |   }
24 |  ],
25 |  "metadata": {
26 |   "kernelspec": {
27 |    "display_name": "Python 3",
28 |    "language": "python",
29 |    "name": "python3"
30 |   },
31 |   "language_info": {
32 |    "codemirror_mode": {
33 |     "name": "ipython",
34 |     "version": 3
35 |    },
36 |    "file_extension": ".py",
37 |    "mimetype": "text/x-python",
38 |    "name": "python",
39 |    "nbconvert_exporter": "python",
40 |    "pygments_lexer": "ipython3",
41 |    "version": "3.6.6"
42 |   }
43 |  },
44 |  "nbformat": 4,
45 |  "nbformat_minor": 2
46 | }
47 | 


--------------------------------------------------------------------------------
/Chapter07/convert_checkpoint.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter07/convert_checkpoint.py


--------------------------------------------------------------------------------
/Chapter07/data_parser.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import pickle
  3 | import codecs
  4 | import re
  5 | import os
  6 | import time
  7 | import numpy as np
  8 | 
  9 | """ This module cleans and preprocesses the text in the training dataset
 10 | """
 11 | 
 12 | def preProBuildWordVocab(word_count_threshold=5, all_words_path='data/all_words.txt'):
 13 |     # borrowed this function from NeuralTalk
 14 | 
 15 |     if not os.path.exists(all_words_path):
 16 |         parse_all_words(all_words_path)
 17 | 
 18 |     corpus = open(all_words_path, 'r').read().split('\n')[:-1]
 19 |     captions = np.asarray(corpus, dtype=np.object)
 20 | 
 21 |     captions = map(lambda x: x.replace('.', ''), captions)
 22 |     captions = map(lambda x: x.replace(',', ''), captions)
 23 |     captions = map(lambda x: x.replace('"', ''), captions)
 24 |     captions = map(lambda x: x.replace('\n', ''), captions)
 25 |     captions = map(lambda x: x.replace('?', ''), captions)
 26 |     captions = map(lambda x: x.replace('!', ''), captions)
 27 |     captions = map(lambda x: x.replace('\\', ''), captions)
 28 |     captions = map(lambda x: x.replace('/', ''), captions)
 29 | 
 30 |     print('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold))
 31 |     word_counts = {}
 32 |     nsents = 0
 33 |     for sent in captions:
 34 |         nsents += 1
 35 |         for w in sent.lower().split(' '):
 36 |             
 37 |             word_counts[w] = word_counts.get(w, 0) + 1
 38 |     vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
 39 |     print('filtered words from %d to %d' % (len(word_counts), len(vocab)))
 40 | 
 41 |     ixtoword = {}
 42 |     ixtoword[0] = '<pad>'
 43 |     ixtoword[1] = '<bos>'
 44 |     ixtoword[2] = '<eos>'
 45 |     ixtoword[3] = '<unk>'
 46 | 
 47 |     wordtoix = {}
 48 |     wordtoix['<pad>'] = 0
 49 |     wordtoix['<bos>'] = 1
 50 |     wordtoix['<eos>'] = 2
 51 |     wordtoix['<unk>'] = 3
 52 | 
 53 |     for idx, w in enumerate(vocab):
 54 |         wordtoix[w] = idx+4
 55 |         ixtoword[idx+4] = w
 56 | 
 57 |     word_counts['<pad>'] = nsents
 58 |     word_counts['<bos>'] = nsents
 59 |     word_counts['<eos>'] = nsents
 60 |     word_counts['<unk>'] = nsents
 61 | 
 62 |     bias_init_vector = np.array([1.0 * word_counts[ixtoword[i]] for i in ixtoword])
 63 |     bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
 64 |     bias_init_vector = np.log(bias_init_vector)
 65 |     bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range
 66 | 
 67 |     return wordtoix, ixtoword, bias_init_vector
 68 | 
 69 | def parse_all_words(all_words_path):
 70 |     raw_movie_lines = open('data/movie_lines.txt', 'r', encoding='utf-8', errors='ignore').read().split('\n')[:-1]
 71 | 
 72 |     with codecs.open(all_words_path, "w", encoding='utf-8', errors='ignore') as f:
 73 |         for line in raw_movie_lines:
 74 |             line = line.split(' +++$+++ ')
 75 |             utterance = line[-1]
 76 |             f.write(utterance + '\n')
 77 | 
 78 | """ Extract only the vocabulary part of the data """
 79 | def refine(data):
 80 |     words = re.findall("[a-zA-Z'-]+", data)
 81 |     words = ["".join(word.split("'")) for word in words]
 82 |     # words = ["".join(word.split("-")) for word in words]
 83 |     data = ' '.join(words)
 84 |     return data
 85 | 
 86 | if __name__ == '__main__':
 87 |     parse_all_words('data/all_words.txt')
 88 | 
 89 |     raw_movie_lines = open('data/movie_lines.txt', 'r', encoding='utf-8', errors='ignore').read().split('\n')[:-1]
 90 |     
 91 |     utterance_dict = {}
 92 |     with codecs.open('data/tokenized_all_words.txt', "w", encoding='utf-8', errors='ignore') as f:
 93 |         for line in raw_movie_lines:
 94 |             line = line.split(' +++$+++ ')
 95 |             line_ID = line[0]
 96 |             utterance = line[-1]
 97 |             utterance_dict[line_ID] = utterance
 98 |             utterance = " ".join([refine(w) for w in utterance.lower().split()])
 99 |             f.write(utterance + '\n')
100 |     pickle.dump(utterance_dict, open('data/utterance_dict', 'wb'), True)


--------------------------------------------------------------------------------
/Chapter07/data_reader.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import pickle
 3 | import random
 4 | 
 5 | """ This helper module helps generate trainable batches from the preprocessed training text
 6 | """
 7 | 
 8 | 
 9 | class Data_Reader:
10 |     def __init__(self, cur_train_index=0, load_list=False):
11 |         self.training_data = pickle.load(open('data/conversations_lenmax22_formersents2_with_former', 'rb'))
12 |         self.data_size = len(self.training_data)
13 |         if load_list:
14 |             self.shuffle_list = pickle.load(open('data/shuffle_index_list', 'rb'))
15 |         else:    
16 |             self.shuffle_list = self.shuffle_index()
17 |         self.train_index = cur_train_index
18 | 
19 |     def get_batch_num(self, batch_size):
20 |         return self.data_size // batch_size
21 | 
22 |     def shuffle_index(self):
23 |         shuffle_index_list = random.sample(range(self.data_size), self.data_size)
24 |         pickle.dump(shuffle_index_list, open('data/shuffle_index_list', 'wb'), True)
25 |         return shuffle_index_list
26 | 
27 |     def generate_batch_index(self, batch_size):
28 |         if self.train_index + batch_size > self.data_size:
29 |             batch_index = self.shuffle_list[self.train_index:self.data_size]
30 |             self.shuffle_list = self.shuffle_index()
31 |             remain_size = batch_size - (self.data_size - self.train_index)
32 |             batch_index += self.shuffle_list[:remain_size]
33 |             self.train_index = remain_size
34 |         else:
35 |             batch_index = self.shuffle_list[self.train_index:self.train_index+batch_size]
36 |             self.train_index += batch_size
37 | 
38 |         return batch_index
39 | 
40 |     def generate_training_batch(self, batch_size):
41 |         batch_index = self.generate_batch_index(batch_size)
42 |         batch_X = [self.training_data[i][0] for i in batch_index]   # batch_size of conv_a
43 |         batch_Y = [self.training_data[i][1] for i in batch_index]   # batch_size of conv_b
44 | 
45 |         return batch_X, batch_Y
46 | 
47 |     def generate_training_batch_with_former(self, batch_size):
48 |         batch_index = self.generate_batch_index(batch_size)
49 |         batch_X = [self.training_data[i][0] for i in batch_index]   # batch_size of conv_a
50 |         batch_Y = [self.training_data[i][1] for i in batch_index]   # batch_size of conv_b
51 |         former = [self.training_data[i][2] for i in batch_index]    # batch_size of former utterance
52 | 
53 |         return batch_X, batch_Y, former
54 | 
55 |     def generate_testing_batch(self, batch_size):
56 |         batch_index = self.generate_batch_index(batch_size)
57 |         batch_X = [self.training_data[i][0] for i in batch_index]   # batch_size of conv_a
58 | 
59 |         return batch_X


--------------------------------------------------------------------------------
/Chapter07/model/Reversed/checkpoint:
--------------------------------------------------------------------------------
 1 | model_checkpoint_path: "model-63"
 2 | all_model_checkpoint_paths: "model-45"
 3 | all_model_checkpoint_paths: "model-46"
 4 | all_model_checkpoint_paths: "model-47"
 5 | all_model_checkpoint_paths: "model-48"
 6 | all_model_checkpoint_paths: "model-49"
 7 | all_model_checkpoint_paths: "model-50"
 8 | all_model_checkpoint_paths: "model-51"
 9 | all_model_checkpoint_paths: "model-52"
10 | all_model_checkpoint_paths: "model-53"
11 | all_model_checkpoint_paths: "model-54"
12 | all_model_checkpoint_paths: "model-55"
13 | all_model_checkpoint_paths: "model-56"
14 | all_model_checkpoint_paths: "model-57"
15 | all_model_checkpoint_paths: "model-58"
16 | all_model_checkpoint_paths: "model-59"
17 | all_model_checkpoint_paths: "model-60"
18 | all_model_checkpoint_paths: "model-61"
19 | all_model_checkpoint_paths: "model-62"
20 | all_model_checkpoint_paths: "model-63"


--------------------------------------------------------------------------------
/Chapter07/model/model-56-3000/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "new_model-56-3000"
2 | all_model_checkpoint_paths: "new_model-56-3000"
3 | 


--------------------------------------------------------------------------------
/Chapter07/results/sample_input.txt:
--------------------------------------------------------------------------------
 1 | Have you heard about 'machine learning and having it deep and structured'?
 2 | How are you?
 3 | What's your name?
 4 | Hello
 5 | Thank God!  If I had to hear one more story about your coiffure
 6 | You never wanted to go out with me, did you?
 7 | I guess I thought I was protecting you.
 8 | Forget his reputation.  Do you think we've got a plan or not?
 9 | You didn't have a choice?
10 | Can you do me a favor?
11 | So I have to have a motive to be with you?
12 | What's next?
13 | You played for the Red Sox?
14 | Are you saying that someone is paying you to be our maid and doesn't want us to know who he is?
15 | What did he say?
16 | How do you do?
17 | Where do you live?
18 | If she can't pay, I'll have to foreclose, won't I?
19 | I want to see a movie.
20 | I didn't kill him! I had nothing to do with that, I tell you!
21 | What do you mean?
22 | For a celebrated bounder, that is an awful admission.
23 | Besides, I never knew that any female could do this to you
24 | Maybe. But I'm taking no chances.  Why, this kid's got a record.
25 | Did you get the case for the securities?
26 | I'd like to take a crack at that guy.
27 | I didn't do it! I haven't got a gun!
28 | BASTARDS! Come back here and face me!
29 | I told you you'd get your money back.
30 | Let's get back on the road. You gotta be at that convention in the morning.
31 | You ought to take up crap shooting. Talk about luck!
32 | Don't you think you should call a backup?
33 | Motherfucker you.
34 | Machine learning.
35 | Why you gotta talk about my moms?
36 | You've got to be kidding me ! His lazy ass couldn't win the special Olympics.
37 | Sir, this is not like firing any employee. We can't predict what will happen.
38 | I don't know. Maybe we should watch the tape to be sure.
39 | Listen man, I don't need this shit.
40 | Will you stand up for me?
41 | I had a feeling you would say something like that. So I brought us dinner.
42 | They can't be serious. The ship's in pieces and we've less than a skeleton aboard.
43 | How do you trun this on?
44 | Thank God it's Friday!
45 | I don't give a shit!
46 | WHAT KIND OF PLAN IS THAT!??
47 | No weirder than a sharp, young, good-looking woman working in a lumberyard.
48 | The witness need not be hesitant to say anything before this committee, as long as it's the truth.
49 | I'm sure a lot of people down in L.A. are worried sick about you.
50 | Find the rockets.  If they're guarded, kill the men guarding them.
51 | what single thing would you want the next President of this country to do most?
52 | I forgot to get the Coca-Cola.
53 | Whoa!... The government. They control everybody's mind. You're too fucking stupid to know that?
54 | How about you graduation thesis?


--------------------------------------------------------------------------------
/Chapter07/results/sample_output_RL.txt:
--------------------------------------------------------------------------------
 1 | Danger misunderstood say: connection danger.
 2 | Victor.
 3 | Medal question medal medal buddy they.
 4 | Jeremy.
 5 | During.
 6 | Assuming.
 7 | Assuming infection say: neo say: wishes or say: say: benny chief curiosity or joined victor.
 8 | Invited.
 9 | Sneak error they sneak misunderstood swana question toast connection studying treasure curiosity witches.
10 | Buddy mystery effort plenty carryin' misery miami marvelous.
11 | Assuming.
12 | Assuming infection cia treasure curiosity curiosity invited.
13 | Say: say: connection breakdown say: connection.
14 | Invited.
15 | Wise covers curiosity plenty funeral mac kent rope.
16 | Wishes say: base.
17 | Assuming.
18 | Medal order feature tag lit.
19 | Assuming jack's misunderstood curiosity invited pills toast his neo toast.
20 | Sneak.
21 | Children treasure.
22 | Times.
23 | Funeral mac illness victor max wishes 22 faggot or buddy.
24 | Dearest.
25 | Johnnie.
26 | Assuming.
27 | Illness neo treasure buddy effort plenty mac buddy mystery neo neo they neo buddy mystery neo plot neo plot treasure pills.
28 | During.
29 | <u>are<u>.
30 | Nail treasure they they neo neo neo neo.
31 | Medal.
32 | Swana funeral or buddy vietnam they jury.
33 | Assuming say: error voices infection say: did jack's extra mac burned or witches.
34 | Infection.
35 | Invited card mac santa say: pulse.
36 | Times invited they treasure funeral they problem invited.
37 | Agreement extra.
38 | They.
39 | Medal criminal buttons.
40 | During 22 jack's snow zoo infection neo sean.
41 | Invited.
42 | During.
43 | Assuming.
44 | Assuming.
45 | Toast ending lingerie walks buddy mystery stupidity benny drove swana heels swana heels heels.
46 | Invited.
47 | Cell buddy snow they they they.
48 | Johnnie infection invited during somethin'.
49 | Johnnie.
50 | They they buddy mary they loans.
51 | They sneak kent destiny poisoned they.
52 | Toast.
53 | During criminal burn neo miami rooms rooms parking seriously rooms recommend curiosity treasure gathering they.
54 | Medal criminal burn plenty victor neighbor.
55 | 


--------------------------------------------------------------------------------
/Chapter08/README.md:
--------------------------------------------------------------------------------
 1 | # Auto Generating a Deep Neural Network
 2 | 
 3 | # TODO
 4 | * Specifications for neural network DNA
 5 | * Generate TF estimator based on DNA
 6 | * Train TF estimator on CIFAR-10 and return validation accuracy
 7 | * Use tf.contrib.rnn.NASCell
 8 | 
 9 | # Basic algorithm
10 | * Initialize controller
11 | * Generate m child networks
12 | * Write checkpoint
13 | * Train m child networks and get m validation accuracies
14 | * Calculate gradient from mean loss across child networks according to REINFORCE
15 |     * This requires the EMA of previous architecture validation accuracies as a baseline function
16 | * Update controller
17 | * Repeat
18 | 
19 | # Parameters
20 | * m - number of child networks to generate in one episode
21 | * l - number of layers the child NN will have
22 | * controller_lr - learning rate of controller
23 | * child_lr - learning rate of child NN
24 | * beta - weight decay parameter of child_NN for L2
25 | * momentum - for Nesterov momentum of SGD
26 | 
27 | # Tokens generated by controller
28 | * Filter size
29 | * Stride size
30 | * Nb. filters
31 | * Max-pooling size
32 | * anchor point


--------------------------------------------------------------------------------
/Chapter08/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter08/__init__.py


--------------------------------------------------------------------------------
/Chapter08/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter08/src/__init__.py


--------------------------------------------------------------------------------
/Chapter08/src/child_network.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | import tensorflow as tf
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | class ChildCNN(object):
 8 | 
 9 |     def __init__(self, cnn_dna, child_id, beta=1e-4, drop_rate=0.2, **kwargs):
10 |         self.cnn_dna = self.process_raw_controller_output(cnn_dna)
11 |         self.child_id = child_id
12 |         self.beta = beta
13 |         self.drop_rate = drop_rate
14 |         self.is_training = tf.placeholder_with_default(True, shape=None, name="is_training_{}".format(self.child_id))
15 |         self.num_classes = 10
16 | 
17 |     def process_raw_controller_output(self, output):
18 |         """
19 |         A helper function for preprocessing the output of the NASCell
20 |         Args:
21 |             output (numpy.ndarray) The output of the NASCell
22 | 
23 |         Returns:
24 |             (list) The child network's architecture
25 |         """
26 |         output = output.ravel()
27 |         cnn_dna = [list(output[x:x+4]) for x in range(0, len(output), 4)]
28 |         return cnn_dna
29 | 
30 |     def build(self, input_tensor):
31 |         """
32 |         Method for creating the child neural network
33 |         Args:
34 |             input_tensor: The tensor which represents the input
35 | 
36 |         Returns:
37 |             The tensor which represents the output logit (pre-softmax activation)
38 | 
39 |         """
40 |         logger.info("DNA is: {}".format(self.cnn_dna))
41 |         output = input_tensor
42 |         for idx in range(len(self.cnn_dna)):
43 |             # Get the configuration for the layer
44 |             kernel_size, stride, num_filters, max_pool_size = self.cnn_dna[idx]
45 |             with tf.name_scope("child_{}_conv_layer_{}".format(self.child_id, idx)):
46 |                 output = tf.layers.conv2d(output,
47 |                         # Specify the number of filters the convolutional layer will output
48 |                         filters=num_filters,
49 |                         # This specifies the size (height, width) of the convolutional kernel
50 |                         kernel_size=(kernel_size, kernel_size),
51 |                         # The size of the stride of the kernel
52 |                         strides=(stride, stride),
53 |                         # We add padding to the image
54 |                         padding="SAME",
55 |                         # It is good practice to name your layers
56 |                         name="conv_layer_{}".format(idx),
57 |                         activation=tf.nn.relu,
58 |                         kernel_initializer=tf.contrib.layers.xavier_initializer(),
59 |                         bias_initializer=tf.zeros_initializer(),
60 |                         kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.beta))
61 |                 # We apply 2D max pooling on the output of the conv layer
62 |                 output = tf.layers.max_pooling2d(
63 |                     output, pool_size=(max_pool_size, max_pool_size), strides=1,
64 |                     padding="SAME", name="pool_out_{}".format(idx)
65 |                 )
66 |                 # Dropout to regularize the network further
67 |                 output = tf.layers.dropout(output, rate=self.drop_rate, training=self.is_training)
68 | 
69 |         # Lastly, we flatten the outputs and add a fully-connected layer
70 |         with tf.name_scope("child_{}_fully_connected".format(self.child_id)):
71 |             output = tf.layers.flatten(output, name="flatten")
72 |             logits = tf.layers.dense(output, self.num_classes)
73 | 
74 |         return logits
75 | 


--------------------------------------------------------------------------------
/Chapter08/src/cifar10_processor.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | import numpy as np
 4 | import tensorflow as tf
 5 | from keras.datasets import cifar10
 6 | from keras.utils import np_utils
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | def _create_tf_dataset(x, y, batch_size):
11 |     return tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(x),
12 |                                 tf.data.Dataset.from_tensor_slices(y))).shuffle(500).repeat().batch(batch_size)
13 | 
14 | def get_tf_datasets_from_numpy(batch_size, validation_split=0.1):
15 |     """
16 |     Main function getting tf.Data.datasets for training, validation, and testing
17 | 
18 |     Args:
19 |         batch_size (int): Batch size
20 |         validation_split (float): Split for partitioning training and validation sets. Between 0.0 and 1.0.
21 |     """
22 |     # Load data from keras datasets api
23 |     (X, y), (X_test, y_test) = cifar10.load_data()
24 | 
25 |     logger.info("Dividing pixels by 255")
26 |     X = X / 255.
27 |     X_test = X_test / 255.
28 | 
29 |     X = X.astype(np.float32)
30 |     X_test = X_test.astype(np.float32)
31 |     y = y.astype(np.float32)
32 |     y_test = y_test.astype(np.float32)
33 | 
34 |     # Turn labels into onehot encodings
35 |     if y.shape[1] != 10:
36 |         y = np_utils.to_categorical(y, num_classes=10)
37 |         y_test = np_utils.to_categorical(y_test, num_classes=10)
38 | 
39 |     logger.info("Loaded data from keras")
40 | 
41 |     split_idx = int((1.0 - validation_split) * len(X))
42 |     X_train, y_train = X[:split_idx], y[:split_idx]
43 |     X_valid, y_valid = X[split_idx:], y[split_idx:]
44 | 
45 |     train_dataset = _create_tf_dataset(X_train, y_train, batch_size)
46 |     valid_dataset = _create_tf_dataset(X_valid, y_valid, batch_size)
47 |     test_dataset = _create_tf_dataset(X_test, y_test, batch_size)
48 | 
49 |     # Get the batch sizes for the train, valid, and test datasets
50 |     num_train_batches = int(X_train.shape[0] // batch_size)
51 |     num_valid_batches = int(X_valid.shape[0] // batch_size)
52 |     num_test_batches = int(X_test.shape[0] // batch_size)
53 | 
54 |     return train_dataset, valid_dataset, test_dataset, num_train_batches, num_valid_batches, num_test_batches
55 | 


--------------------------------------------------------------------------------
/Chapter08/src/config.py:
--------------------------------------------------------------------------------
 1 | child_network_params = {
 2 |     "learning_rate": 3e-5,
 3 |     "max_epochs": 100,
 4 |     "beta": 1e-3,
 5 |     "batch_size": 20
 6 | }
 7 | 
 8 | controller_params = {
 9 |     "max_layers": 3,
10 |     "components_per_layer": 4,
11 |     'beta': 1e-4,
12 |     'max_episodes': 2000,
13 |     "num_children_per_episode": 10
14 | }
15 | 


--------------------------------------------------------------------------------
/Chapter08/src/constants.py:
--------------------------------------------------------------------------------
1 | class PATHS:
2 |     DATA_DIR = "data"
3 |     SAVE_DIR = "saves"
4 | 


--------------------------------------------------------------------------------
/Chapter08/src/train.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import sys
 3 | 
 4 | from .controller import Controller
 5 | 
 6 | if __name__ == '__main__':
 7 |     # Configure the logger
 8 |     logging.basicConfig(stream=sys.stdout,
 9 |                         level=logging.DEBUG,
10 |                         format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
11 |     controller = Controller()
12 |     controller.train_controller()
13 | 


--------------------------------------------------------------------------------
/Chapter09/actor.py:
--------------------------------------------------------------------------------
 1 | from keras import layers, models, optimizers
 2 | from keras import backend as K
 3 | 
 4 | 
 5 | class Actor:
 6 |     
 7 |     
 8 |   # """Actor (policy) Model. """
 9 | 
10 |     def __init__(self, state_size, action_size):
11 | 
12 |         self.state_size = state_size
13 |         self.action_size = action_size
14 | 
15 |         self.build_model()
16 | 
17 |     def build_model(self):
18 |         states = layers.Input(shape=(self.state_size,), name='states')
19 |         
20 |         net = layers.Dense(units=16,kernel_regularizer=layers.regularizers.l2(1e-6))(states)
21 |         net = layers.BatchNormalization()(net)
22 |         net = layers.Activation("relu")(net)
23 |         net = layers.Dense(units=32,kernel_regularizer=layers.regularizers.l2(1e-6))(net)
24 |         net = layers.BatchNormalization()(net)
25 |         net = layers.Activation("relu")(net)
26 | 
27 |         actions = layers.Dense(units=self.action_size, activation='softmax', name = 'actions')(net)
28 |         
29 |         self.model = models.Model(inputs=states, outputs=actions)
30 | 
31 |         action_gradients = layers.Input(shape=(self.action_size,))
32 |         loss = K.mean(-action_gradients * actions)
33 | 
34 |         optimizer = optimizers.Adam(lr=.00001)
35 |         updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
36 |         self.train_fn = K.function(
37 |             inputs=[self.model.input, action_gradients, K.learning_phase()],
38 |             outputs=[],
39 |             updates=updates_op)


--------------------------------------------------------------------------------
/Chapter09/agent.py:
--------------------------------------------------------------------------------
 1 | from actor import Actor
 2 | from critic import Critic
 3 | 
 4 | import numpy as np
 5 | from numpy.random import choice
 6 | import random
 7 | from collections import namedtuple, deque
 8 | 
 9 | 
10 | class ReplayBuffer:
11 |     def __init__(self, buffer_size, batch_size):
12 |     
13 |         self.memory = deque(maxlen=buffer_size)
14 |         self.batch_size = batch_size
15 |         self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
16 |     
17 |     def add(self, state, action, reward, next_state, done):
18 |         e = self.experience(state, action, reward, next_state, done)
19 |         self.memory.append(e)
20 |     
21 |     def sample(self, batch_size=32):
22 |         return random.sample(self.memory, k=self.batch_size)
23 |     
24 |     def __len__(self):
25 |         return len(self.memory)
26 |     
27 |     
28 | class Agent:
29 |     def __init__(self, state_size, batch_size, is_eval = False):
30 |         self.state_size = state_size
31 |         self.action_size = 3
32 |         self.buffer_size = 1000000
33 |         self.batch_size = batch_size
34 |         self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
35 |         self.inventory = []
36 |         self.is_eval = is_eval
37 |         
38 |         self.gamma = 0.99
39 |         self.tau = 0.001
40 |         
41 |         self.actor_local = Actor(self.state_size, self.action_size)
42 |         self.actor_target = Actor(self.state_size, self.action_size)    
43 | 
44 |         self.critic_local = Critic(self.state_size, self.action_size)
45 |         self.critic_target = Critic(self.state_size, self.action_size)
46 |         
47 |         self.critic_target.model.set_weights(self.critic_local.model.get_weights()) 
48 |         self.actor_target.model.set_weights(self.actor_local.model.get_weights())
49 |         
50 |     def act(self, state):
51 |         options = self.actor_local.model.predict(state)
52 |         self.last_state = state
53 |         if not self.is_eval:
54 |             return choice(range(3), p = options[0])
55 |         return np.argmax(options[0])
56 |     
57 |     def step(self, action, reward, next_state, done):
58 |         self.memory.add(self.last_state, action, reward, next_state, done)
59 |         if len(self.memory) > self.batch_size:
60 |             experiences = self.memory.sample(self.batch_size)
61 |             self.learn(experiences)
62 |             self.last_state = next_state
63 | 
64 |     def learn(self, experiences):               
65 |         states = np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size)    
66 |         actions = np.vstack([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.action_size)
67 |         rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1,1)
68 |         dones = np.array([e.done for e in experiences if e is not None]).astype(np.float32).reshape(-1,1)
69 |         next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size)
70 | 
71 |         actions_next = self.actor_target.model.predict_on_batch(next_states)
72 |         Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
73 |         
74 |         Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
75 |         self.critic_local.model.train_on_batch(x = [states, actions], y=Q_targets)
76 |         
77 |         action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]),(-1, self.action_size))
78 |         self.actor_local.train_fn([states, action_gradients, 1])
79 |         self.soft_update(self.critic_local.model, self.critic_target.model)  
80 |         self.soft_update(self.actor_local.model, self.actor_target.model)
81 | 
82 |     def soft_update(self, local_model, target_model):
83 |         local_weights = np.array(local_model.get_weights())
84 |         target_weights = np.array(target_model.get_weights())
85 | 
86 |         assert len(local_weights) == len(target_weights)
87 | 
88 |         new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
89 |         target_model.set_weights(new_weights)
90 | 


--------------------------------------------------------------------------------
/Chapter09/critic.py:
--------------------------------------------------------------------------------
 1 | from keras import layers, models, optimizers
 2 | from keras import backend as K
 3 | 
 4 | 
 5 | class Critic:
 6 |     """Critic (Value) Model."""
 7 | 
 8 |     def __init__(self, state_size, action_size):
 9 |         """Initialize parameters and build model.
10 |         Params
11 |         ======
12 |             state_size (int): Dimension of each state
13 |             action_size (int): Dimension of each action
14 |         """
15 |         self.state_size = state_size
16 |         self.action_size = action_size
17 | 
18 |         self.build_model()
19 | 
20 |     def build_model(self):
21 |         """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
22 |         # Define input layers
23 |         states = layers.Input(shape=(self.state_size,), name='states')
24 |         actions = layers.Input(shape=(self.action_size,), name='actions')
25 | 
26 |         net_states = layers.Dense(units=16,kernel_regularizer=layers.regularizers.l2(1e-6))(states)
27 |         net_states = layers.BatchNormalization()(net_states)
28 |         net_states = layers.Activation("relu")(net_states)
29 | 
30 |         net_states = layers.Dense(units=32, kernel_regularizer=layers.regularizers.l2(1e-6))(net_states)
31 | 
32 |         net_actions = layers.Dense(units=32,kernel_regularizer=layers.regularizers.l2(1e-6))(actions)
33 | 
34 |         net = layers.Add()([net_states, net_actions])
35 |         net = layers.Activation('relu')(net)
36 | 
37 |         Q_values = layers.Dense(units=1, name='q_values',kernel_initializer=layers.initializers.RandomUniform(minval=-0.003, maxval=0.003))(net)
38 | 
39 |         self.model = models.Model(inputs=[states, actions], outputs=Q_values)
40 | 
41 |         optimizer = optimizers.Adam(lr=0.001)
42 |         self.model.compile(optimizer=optimizer, loss='mse')
43 | 
44 |         action_gradients = K.gradients(Q_values, actions)
45 | 
46 |         self.get_action_gradients = K.function(
47 |             inputs=[*self.model.input, K.learning_phase()],
48 |             outputs=action_gradients)
49 | 


--------------------------------------------------------------------------------
/Chapter09/helper.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import math
 3 | 
 4 | 
 5 | def formatPrice(n):
 6 |     if n >= 0:
 7 |         curr = "$"
 8 |     else:
 9 |         curr = "-$"
10 |     return curr + "{0:.2f}".format(abs(n))
11 | 
12 | 
13 | def getStockData(key):
14 |     datavec = []
15 |     lines = open("data/" + key + ".csv", "r").read().splitlines()
16 |     
17 |     for line in lines[1:]:
18 |         datavec.append(float(line.split(",")[4]))
19 |     
20 |     return datavec
21 |   
22 | 
23 | def getState(data, t, window):
24 |     if t - window >= -1:
25 |         vec = data[t - window + 1:t + 1]
26 |     else: 
27 |         vec = -(t-window+1)*[data[0]]+data[0: t + 1]
28 |     scaled_state = []
29 |     for i in range(window - 1):
30 |         scaled_state.append(1/(1 + math.exp(vec[i] - vec[i+1])))
31 |     
32 |     return np.array([scaled_state])
33 | 


--------------------------------------------------------------------------------
/Chapter09/train.py:
--------------------------------------------------------------------------------
 1 | from agent import Agent
 2 | from helper import getStockData, getState, formatPrice
 3 | 
 4 | window_size = 50
 5 | batch_size = 32
 6 | agent = Agent(window_size, batch_size)
 7 | data = getStockData("^GSPC")
 8 | l = len(data) - 1
 9 | episode_count = 300
10 | 
11 | for e in range(episode_count):
12 |     print("Episode " + str(e) + "/" + str(episode_count))
13 |     state = getState(data, 0, window_size + 1)
14 | 
15 |     agent.inventory = []
16 |     total_profit = 0
17 |     done = False
18 |     for t in range(l):
19 |         action = agent.act(state)
20 |         action_prob = agent.actor_local.model.predict(state)
21 | 
22 |         next_state = getState(data, t + 1, window_size + 1)
23 |         reward = 0
24 | 
25 |         if action == 1:
26 |             agent.inventory.append(data[t])
27 |             print("Buy:" + formatPrice(data[t]))
28 | 
29 |         elif action == 2 and len(agent.inventory) > 0:
30 |             bought_price = agent.inventory.pop(0)
31 |             reward = max(data[t] - bought_price, 0)
32 |             total_profit += data[t] - bought_price
33 |             print("sell: " + formatPrice(data[t]) + "| profit: " + formatPrice(data[t] - bought_price))
34 | 
35 |         if t == l - 1:
36 |             done = True
37 |         agent.step(action_prob, reward, next_state, done)
38 |         state = next_state
39 | 
40 |         if done:
41 |             print("------------------------------------------")
42 |             print("Total Profit: " + formatPrice(total_profit))
43 |             print("------------------------------------------")
44 | 
45 | test_data = getStockData("^GSPC Test")
46 | l_test = len(test_data) - 1
47 | state = getState(test_data, 0, window_size + 1)
48 | total_profit = 0
49 | agent.inventory = []
50 | agent.is_eval = False
51 | done = False
52 | for t in range(l_test):
53 |     action = agent.act(state)
54 | 
55 |     next_state = getState(test_data, t + 1, window_size + 1)
56 |     reward = 0
57 | 
58 |     if action == 1:
59 | 
60 |         agent.inventory.append(test_data[t])
61 |         print("Buy: " + formatPrice(test_data[t]))
62 | 
63 |     elif action == 2 and len(agent.inventory) > 0:
64 |         bought_price = agent.inventory.pop(0)
65 |         reward = max(test_data[t] - bought_price, 0)
66 |         total_profit += test_data[t] - bought_price
67 |         print("Sell: " + formatPrice(test_data[t]) + " | profit: " + formatPrice(test_data[t] - bought_price))
68 | 
69 |     if t == l_test - 1:
70 |         done = True
71 |     agent.step(action_prob, reward, next_state, done)
72 |     state = next_state
73 | 
74 |     if done:
75 |         print("------------------------------------------")
76 |         print("Total Profit: " + formatPrice(total_profit))
77 |         print("------------------------------------------")
78 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM tensorflow/tensorflow:1.9.0-py3
 2 | 
 3 | 
 4 | RUN apt-get update -yqq \
 5 |     && apt-get install -y locales\
 6 |     && apt-get install -yqq \
 7 |     && pip3 install --upgrade pip \
 8 |     && locale-gen en_US.UTF-8
 9 | 
10 | RUN pip3 install keras
11 | 
12 | COPY Chapter09 Chapter09


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Packt
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/artifacts.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/artifacts.pptx


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | absl-py==0.1.10
 2 | argh==0.26.2
 3 | astroid==1.6.1
 4 | autopep8==1.3.4
 5 | bleach==1.5.0
 6 | cachetools==2.0.1
 7 | certifi==2018.1.18
 8 | chardet==3.0.4
 9 | google-api-core==0.1.4
10 | google-auth==1.4.1
11 | google-cloud-core==0.28.0
12 | google-cloud-logging==1.5.0
13 | googleapis-common-protos==1.5.3
14 | grpcio==1.9.1
15 | h5py==2.7.1
16 | html5lib==0.9999999
17 | idna==2.6
18 | isort==4.3.4
19 | Keras==2.1.6
20 | lazy-object-proxy==1.3.1
21 | Markdown==2.6.11
22 | mccabe==0.6.1
23 | numpy==1.14.1
24 | petname==2.2
25 | protobuf==3.5.1
26 | pyasn1==0.4.2
27 | pyasn1-modules==0.2.1
28 | pycodestyle==2.3.1
29 | pygtp==0.4
30 | pylint==1.8.2
31 | pytz==2018.3
32 | PyYAML==3.12
33 | requests==2.18.4
34 | rsa==3.4.2
35 | scipy==1.1.0
36 | sgf==0.5
37 | six==1.11.0
38 | tensorflow==1.5.0
39 | tensorflow-tensorboard==1.5.1
40 | tqdm==4.19.6
41 | urllib3==1.22
42 | Werkzeug==0.14.1
43 | wrapt==1.10.11
44 | 


--------------------------------------------------------------------------------