├── Chapter01 ├── __init__.py ├── cnn.py ├── logs │ └── simple_cnn │ │ └── events.out.tfevents.1532869533.Seans-MBP-7.lan └── saves │ └── checkpoint ├── Chapter02 ├── __init__.py ├── algorithmic_03.py ├── atari_02.py ├── box2d_04.py ├── cartpole.py ├── classic_control_05.py ├── mujoco_06.py ├── robotics_07.py ├── start1.py └── toy_text_08.py ├── Chapter03 ├── MUJOCO_LOG.TXT ├── config.py ├── demo │ ├── __init__.py │ ├── game.py │ ├── object.py │ ├── robot.py │ └── utils.py ├── distribution │ ├── __init__.py │ ├── categorical.py │ └── diagonal_gaussian.py ├── environment.py ├── eval.py ├── game.py ├── krylov.py ├── layer.py ├── layers.py ├── log │ ├── Acrobot │ │ ├── checkpoint │ │ └── events.out.tfevents.1506500394.ywz-WorkStation-T7400 │ ├── CartPole │ │ ├── checkpoint │ │ └── events.out.tfevents.1506667268.ywz-WorkStation-T7400 │ ├── HalfCheetah │ │ ├── checkpoint │ │ └── events.out.tfevents.1506338471.ywz-WorkStation-T7400 │ ├── Hopper │ │ ├── checkpoint │ │ └── events.out.tfevents.1506658875.ywz-WorkStation-T7400 │ ├── Pendulum │ │ ├── checkpoint │ │ └── events.out.tfevents.1506666537.ywz-WorkStation-T7400 │ ├── Reacher │ │ ├── checkpoint │ │ └── events.out.tfevents.1506398906.ywz-WorkStation-T7400 │ ├── Swimmer │ │ ├── checkpoint │ │ └── events.out.tfevents.1526197305.ywz-PC │ └── Walker2d │ │ ├── checkpoint │ │ └── events.out.tfevents.1506671852.ywz-WorkStation-T7400 ├── logger.py ├── main.py ├── mlp.py ├── optimizer.py ├── parallel.py ├── policy │ ├── __init__.py │ ├── categorical_mlp.py │ ├── deterministic_mlp.py │ └── gaussian_mlp.py ├── ppo.py ├── q_learning.py ├── q_network.py ├── replay_memory.py ├── sampler.py ├── simulator.py ├── test.py ├── train.py ├── trpo.py ├── utils.py └── value │ ├── __init__.py │ ├── linear_fitting.py │ └── mlp_fitting.py ├── Chapter04 ├── actor_critic_net.py ├── actor_network.py ├── config.py ├── critic_network.py ├── dpg.py ├── eval.py ├── layers.py ├── log │ ├── Acrobot-v1 │ │ ├── checkpoint │ │ └── train │ │ │ └── events.out.tfevents.1523886598.ywz-PC │ ├── CartPole-v0 │ │ ├── checkpoint │ │ └── train │ │ │ └── events.out.tfevents.1525870448.ywz-PC │ ├── MountainCar-v0 │ │ ├── checkpoint │ │ └── train │ │ │ └── events.out.tfevents.1526196635.ywz-PC │ └── Pendulum-v0 │ │ ├── checkpoint │ │ └── train │ │ └── events.out.tfevents.1525871560.ywz-PC ├── main.py ├── optimizer.py ├── replay_memory.py ├── task.py └── train.py ├── Chapter05 ├── a3c.py ├── cluster.py ├── demo │ ├── __init__.py │ ├── game.py │ ├── object.py │ ├── robot.py │ └── utils.py ├── doom │ ├── _vizdoom.ini │ ├── doom.py │ ├── game.py │ └── scenarios │ │ ├── basic.cfg │ │ ├── basic.wad │ │ ├── cig.cfg │ │ ├── cig.wad │ │ ├── cig_with_unknown.wad │ │ ├── deadly_corridor.cfg │ │ ├── deadly_corridor.wad │ │ ├── deathmatch.cfg │ │ ├── deathmatch.wad │ │ ├── defend_the_center.cfg │ │ ├── defend_the_center.wad │ │ ├── defend_the_line.cfg │ │ ├── defend_the_line.wad │ │ ├── health_gathering.cfg │ │ ├── health_gathering.wad │ │ ├── health_gathering_supreme.wad │ │ ├── learning.cfg │ │ ├── multi.cfg │ │ ├── multi_deathmatch.wad │ │ ├── multi_duel.cfg │ │ ├── multi_duel.wad │ │ ├── my_way_home.cfg │ │ ├── my_way_home.wad │ │ ├── predict_position.cfg │ │ ├── predict_position.wad │ │ ├── rocket_basic.cfg │ │ ├── rocket_basic.wad │ │ ├── simpler_basic.cfg │ │ ├── simpler_basic.wad │ │ ├── take_cover.cfg │ │ └── take_cover.wad ├── environment.py ├── ff_policy.py ├── game.py ├── helper │ └── tmux ├── layer.py ├── lstm_policy.py ├── minecraft │ ├── __init__.py │ └── game.py ├── parameter.py ├── save │ 
├── breakout │ │ └── train │ │ │ ├── log_0 │ │ │ └── events.out.tfevents.1532007719.ywz-PC │ │ │ └── log_1 │ │ │ └── events.out.tfevents.1532007719.ywz-PC │ ├── demo │ │ └── train │ │ │ ├── checkpoint │ │ │ ├── log_0 │ │ │ └── events.out.tfevents.1532007504.ywz-PC │ │ │ └── log_1 │ │ │ └── events.out.tfevents.1532007504.ywz-PC │ └── minecraftbasic-v0 │ │ └── train │ │ └── log_0 │ │ └── events.out.tfevents.1532007895.ywz-PC ├── test.py ├── timer.py ├── train.py ├── utils.py └── worker.py ├── Chapter06 ├── __init__.py ├── commands.txt └── src │ ├── __init__.py │ ├── alphagozero_agent.py │ ├── config.py │ ├── constants.py │ ├── controller.py │ ├── features.py │ ├── go.py │ ├── mcts.py │ ├── network.py │ ├── preprocessing.py │ ├── train.py │ └── utils.py ├── Chapter07 ├── RL chatbot.ipynb ├── convert_checkpoint.py ├── data_parser.py ├── data_reader.py ├── feature_extracter.py ├── model │ ├── Reversed │ │ └── checkpoint │ └── model-56-3000 │ │ └── checkpoint ├── pg_model.py ├── results │ ├── sample_input.txt │ └── sample_output_RL.txt ├── seq_model.py ├── test.py └── train.py ├── Chapter08 ├── README.md ├── __init__.py └── src │ ├── __init__.py │ ├── child_network.py │ ├── cifar10_processor.py │ ├── config.py │ ├── constants.py │ ├── controller.py │ └── train.py ├── Chapter09 ├── actor.py ├── agent.py ├── critic.py ├── helper.py └── train.py ├── Dockerfile ├── LICENSE ├── README.md ├── artifacts.pptx └── requirements.txt /Chapter01/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter01/__init__.py -------------------------------------------------------------------------------- /Chapter01/logs/simple_cnn/events.out.tfevents.1532869533.Seans-MBP-7.lan: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter01/logs/simple_cnn/events.out.tfevents.1532869533.Seans-MBP-7.lan -------------------------------------------------------------------------------- /Chapter01/saves/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "simple_cnn" 2 | all_model_checkpoint_paths: "simple_cnn" 3 | -------------------------------------------------------------------------------- /Chapter02/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter02/__init__.py -------------------------------------------------------------------------------- /Chapter02/algorithmic_03.py: -------------------------------------------------------------------------------- 1 | import gym 2 | environment = gym.make('Copy-v0') 3 | environment.reset() 4 | environment.render() 5 | import time 6 | time.sleep(10) -------------------------------------------------------------------------------- /Chapter02/atari_02.py: -------------------------------------------------------------------------------- 1 | import gym 2 | environment = gym.make('SpaceInvaders-v0') 3 | environment.reset() 4 | environment.render() 5 | import time 6 | time.sleep(10) -------------------------------------------------------------------------------- /Chapter02/box2d_04.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | environment = gym.make('LunarLander-v2') 3 | environment.reset() 4 | environment.render() 5 | import time 6 | time.sleep(10) -------------------------------------------------------------------------------- /Chapter02/cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import random 4 | import math 5 | 6 | environment = gym.make('CartPole-v0') 7 | 8 | 9 | no_buckets = (1, 1, 6, 3) 10 | no_actions = environment.action_space.n 11 | state_value_bounds = list(zip(environment.observation_space.low, environment.observation_space.high)) 12 | state_value_bounds[1] = [-0.5, 0.5] 13 | state_value_bounds[3] = [-math.radians(50), math.radians(50)] 14 | action_index = len(no_buckets) 15 | 16 | q_value_table = np.zeros(no_buckets + (no_actions,)) 17 | 18 | min_explore_rate = 0.01 19 | min_learning_rate = 0.1 20 | 21 | max_episodes = 1000 22 | max_time_steps = 250 23 | streak_to_end = 120 24 | solved_time = 199 25 | discount = 0.99 26 | no_streaks = 0 27 | 28 | 29 | def select_action(state_value, explore_rate): 30 | if random.random() < explore_rate: 31 | action = environment.action_space.sample() 32 | else: 33 | action = np.argmax(q_value_table[state_value]) 34 | return action 35 | 36 | 37 | def select_explore_rate(x): 38 | return max(min_explore_rate, min(1, 1.0 - math.log10((x+1)/25))) 39 | 40 | 41 | def select_learning_rate(x): 42 | return max(min_learning_rate, min(0.5, 1.0 - math.log10((x+1)/25))) 43 | 44 | 45 | def bucketize_state_value(state_value): 46 | bucket_indexes = [] 47 | for i in range(len(state_value)): 48 | if state_value[i] <= state_value_bounds[i][0]: 49 | bucket_index = 0 50 | elif state_value[i] >= state_value_bounds[i][1]: 51 | bucket_index = no_buckets[i] - 1 52 | else: 53 | bound_width = state_value_bounds[i][1] - state_value_bounds[i][0] 54 | offset = (no_buckets[i]-1)*state_value_bounds[i][0]/bound_width 55 | scaling = (no_buckets[i]-1)/bound_width 56 | bucket_index = int(round(scaling*state_value[i] - offset)) 57 | bucket_indexes.append(bucket_index) 58 | return tuple(bucket_indexes) 59 | 60 | 61 | for episode_no in range(max_episodes): 62 | explore_rate = select_explore_rate(episode_no) 63 | learning_rate = select_learning_rate(episode_no) 64 | 65 | observation = environment.reset() 66 | 67 | start_state_value = bucketize_state_value(observation) 68 | previous_state_value = start_state_value 69 | 70 | for time_step in range(max_time_steps): 71 | environment.render() 72 | selected_action = select_action(previous_state_value, explore_rate) 73 | observation, reward_gain, completed, _ = environment.step(selected_action) 74 | state_value = bucketize_state_value(observation) 75 | best_q_value = np.amax(q_value_table[state_value]) 76 | q_value_table[previous_state_value + (selected_action,)] += learning_rate * ( 77 | reward_gain + discount * (best_q_value) - q_value_table[previous_state_value + (selected_action,)]) 78 | 79 | print('Episode number : %d' % episode_no) 80 | print('Time step : %d' % time_step) 81 | print('Selection action : %d' % selected_action) 82 | print('Current state : %s' % str(state_value)) 83 | print('Reward obtained : %f' % reward_gain) 84 | print('Best Q value : %f' % best_q_value) 85 | print('Learning rate : %f' % learning_rate) 86 | print('Explore rate : %f' % explore_rate) 87 | print('Streak number : %d' % no_streaks) 88 | 89 | if completed: 90 | print('Episode %d finished 
after %f time steps' % (episode_no, time_step)) 91 | if time_step >= solved_time: 92 | no_streaks += 1 93 | else: 94 | no_streaks = 0 95 | break 96 | 97 | previous_state_value = state_value 98 | 99 | if no_streaks > streak_to_end: 100 | break 101 | -------------------------------------------------------------------------------- /Chapter02/classic_control_05.py: -------------------------------------------------------------------------------- 1 | import gym 2 | environment = gym.make('CartPole-v0') 3 | environment.reset() 4 | environment.render() 5 | import time 6 | time.sleep(10) -------------------------------------------------------------------------------- /Chapter02/mujoco_06.py: -------------------------------------------------------------------------------- 1 | import gym 2 | environment = gym.make('Humanoid-v2') 3 | environment.reset() 4 | environment.render() 5 | import time 6 | time.sleep(10) 7 | -------------------------------------------------------------------------------- /Chapter02/robotics_07.py: -------------------------------------------------------------------------------- 1 | import gym 2 | environment = gym.make('HandManipulateBlock-v0') 3 | environment.reset() 4 | environment.render() 5 | import time 6 | time.sleep(10) -------------------------------------------------------------------------------- /Chapter02/start1.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import time 3 | environment = gym.make('CartPole-v0') 4 | environment.reset() 5 | for dummy in range(100): 6 | time.sleep(1) 7 | environment.render() 8 | environment.step(environment.action_space.sample()) -------------------------------------------------------------------------------- /Chapter02/toy_text_08.py: -------------------------------------------------------------------------------- 1 | import gym 2 | environment = gym.make('FrozenLake-v0') 3 | environment.reset() 4 | environment.render() 5 | import time 6 | time.sleep(10) -------------------------------------------------------------------------------- /Chapter03/MUJOCO_LOG.TXT: -------------------------------------------------------------------------------- 1 | Sun May 13 16:29:23 2018 2 | ERROR: GLEW initalization error: Missing GL version 3 | 4 | -------------------------------------------------------------------------------- /Chapter03/config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 25, 2018 3 | 4 | @author: ywz 5 | ''' 6 | 7 | ATARI = { 8 | 'network_type': 'cnn', 9 | 'gamma': 0.99, 10 | 'batch_size': 32, 11 | 'num_episode': 500000, 12 | 'capacity': 1000000, 13 | 'epsilon_decay': 1000000, 14 | 'epsilon_min': 0.1, 15 | 'num_frames': 4, 16 | 'num_nullops': 5, 17 | 'time_between_two_copies': 10000, 18 | 'input_scale': 255.0, 19 | 'update_interval': 1, 20 | 'T': 100000, 21 | 22 | 'learning_rate': 2e-4, 23 | 'optimizer': 'rmsprop', 24 | 'rho': 0.99, 25 | 'rmsprop_epsilon': 1e-6, 26 | 27 | 'log_dir': 'log/' 28 | } 29 | 30 | 31 | DEMO = { 32 | 'network_type': 'mlp', 33 | 'gamma': 0.7, 34 | 'batch_size': 32, 35 | 'num_episode': 40, 36 | 'capacity': 20000, 37 | 'epsilon_decay': 100000, 38 | 'epsilon_min': 0.1, 39 | 'num_frames': 1, 40 | 'num_nullops': 2, 41 | 'time_between_two_copies': 1000, 42 | 'input_scale': 1.0, 43 | 'update_interval': 1, 44 | 'T': 1000000, 45 | 46 | 'learning_rate': 0.5e-2, 47 | 'optimizer': 'momentum', 48 | 'rho': 0.9, 49 | 'rmsprop_epsilon': 1e-6, 50 | 51 | 'log_dir': 'log/' 52 | } 53 | 
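These ATARI and DEMO dictionaries drive the Chapter03 DQN training code (optimizer.py and q_learning.py, which are not reproduced in this excerpt). As a minimal sketch of how the optimizer-related keys might be consumed, assuming the TensorFlow 1.x tf.train API used throughout this chapter — the helper name make_optimizer is illustrative and not part of the repository:

import tensorflow as tf

def make_optimizer(conf):
    # conf is one of the dictionaries above, e.g. ATARI or DEMO
    lr = conf['learning_rate']
    if conf['optimizer'] == 'rmsprop':
        return tf.train.RMSPropOptimizer(learning_rate=lr,
                                         decay=conf['rho'],
                                         epsilon=conf['rmsprop_epsilon'])
    elif conf['optimizer'] == 'momentum':
        return tf.train.MomentumOptimizer(learning_rate=lr, momentum=conf['rho'])
    raise ValueError('Unknown optimizer: {}'.format(conf['optimizer']))

For example, make_optimizer(DEMO) would yield a MomentumOptimizer with learning rate 0.5e-2 and momentum 0.9.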
-------------------------------------------------------------------------------- /Chapter03/demo/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 10, 2016 3 | 4 | @author: a0096049 5 | ''' 6 | -------------------------------------------------------------------------------- /Chapter03/demo/object.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 16, 2016 3 | 4 | @author: a0096049 5 | ''' 6 | 7 | import numpy, pygame 8 | from demo.utils import Color, calculateIntersectPoint 9 | 10 | 11 | class Object: 12 | 13 | def __init__(self, x, y, r, game): 14 | 15 | self.x = x 16 | self.y = y 17 | self.r = r 18 | self.game = game 19 | 20 | def get_position(self): 21 | return self.x, self.y 22 | 23 | def get_radius(self): 24 | return self.r 25 | 26 | def set_position(self, x, y): 27 | self.x = x 28 | self.y = y 29 | 30 | def draw(self): 31 | pass 32 | 33 | class Food(Object): 34 | 35 | def __init__(self, x, y, radius, t, game): 36 | 37 | super().__init__(x, y, radius, game) 38 | self.type = t 39 | self.life = numpy.random.randint(1000, 5000) 40 | 41 | def decrease_life(self): 42 | self.life -= 1 43 | return self.life == 0 44 | 45 | def draw(self, found=False): 46 | 47 | if found == False: 48 | if self.type == "bad": 49 | pygame.draw.circle(self.game.DISPLAYSURF, Color.RED, (self.x, self.y), self.r) 50 | else: 51 | pygame.draw.circle(self.game.DISPLAYSURF, Color.GREEN, (self.x, self.y), self.r) 52 | else: 53 | pygame.draw.circle(self.game.DISPLAYSURF, Color.BLUE, (self.x, self.y), self.r) 54 | 55 | class Wall: 56 | 57 | def __init__(self, start, end, game, width=2): 58 | 59 | self.start = start 60 | self.end = end 61 | self.game = game 62 | self.width = width 63 | 64 | def draw(self): 65 | pygame.draw.line(self.game.DISPLAYSURF, Color.WHITE, self.start, self.end, self.width) 66 | 67 | def collide(self, p1, p2): 68 | 69 | point = calculateIntersectPoint(p1, p2, self.start, self.end) 70 | if point is None: 71 | return None 72 | else: 73 | return (int(point[0]), int(point[1])) 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /Chapter03/distribution/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 18 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | -------------------------------------------------------------------------------- /Chapter03/distribution/categorical.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 27 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import tensorflow as tf 8 | 9 | 10 | class Categorical: 11 | 12 | def __init__(self, dim): 13 | self.dim = dim 14 | 15 | def specs(self): 16 | return [("prob", (self.dim,))] 17 | 18 | def keys(self): 19 | return ["prob"] 20 | 21 | def kl_numpy(self, old_dist, new_dist): 22 | 23 | old_prob = old_dist["prob"] 24 | new_prob = new_dist["prob"] 25 | 26 | return numpy.sum(old_prob * (numpy.log(old_prob + 1e-8) - numpy.log(new_prob + 1e-8)), axis=-1) 27 | 28 | def kl_tf(self, old_dist, new_dist): 29 | 30 | old_prob = old_dist["prob"] 31 | new_prob = new_dist["prob"] 32 | 33 | return tf.reduce_sum(old_prob * (tf.log(old_prob + 1e-8) - tf.log(new_prob + 1e-8)), axis=-1) 34 | 35 | def likelihood_ratio_tf(self, x, old_dist, new_dist): 36 | 37 | old_prob = old_dist["prob"] 38 | new_prob = new_dist["prob"] 39 | 40 | return (tf.reduce_sum(new_prob * 
x, axis=-1) + 1e-8) / \ 41 | (tf.reduce_sum(old_prob * x, axis=-1) + 1e-8) 42 | 43 | -------------------------------------------------------------------------------- /Chapter03/distribution/diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 18 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import tensorflow as tf 8 | 9 | 10 | class DiagonalGaussian: 11 | 12 | def __init__(self, dim): 13 | self.dim = dim 14 | 15 | def specs(self): 16 | return [("mean", (self.dim,)), ("log_var", (self.dim,))] 17 | 18 | def keys(self): 19 | return ["mean", "log_var"] 20 | 21 | def kl_numpy(self, old_dist, new_dist): 22 | 23 | old_means = old_dist["mean"] 24 | old_log_stds = old_dist["log_var"] 25 | new_means = new_dist["mean"] 26 | new_log_stds = new_dist["log_var"] 27 | 28 | old_std = numpy.exp(old_log_stds) 29 | new_std = numpy.exp(new_log_stds) 30 | # means: (N*A) 31 | # std: (N*A) 32 | # formula: 33 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) + 34 | # ln(\sigma_2/\sigma_1) 35 | numerator = numpy.square(old_means - new_means) + numpy.square(old_std) - numpy.square(new_std) 36 | denominator = 2 * numpy.square(new_std) + 1e-8 37 | 38 | return numpy.sum(numerator / denominator + new_log_stds - old_log_stds, axis=-1) 39 | 40 | def kl_tf(self, old_dist, new_dist): 41 | 42 | old_means = old_dist["mean"] 43 | old_log_stds = old_dist["log_var"] 44 | new_means = new_dist["mean"] 45 | new_log_stds = new_dist["log_var"] 46 | 47 | old_std = tf.exp(old_log_stds) 48 | new_std = tf.exp(new_log_stds) 49 | # means: (N*A) 50 | # std: (N*A) 51 | # formula: 52 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) + 53 | # ln(\sigma_2/\sigma_1) 54 | numerator = tf.square(old_means - new_means) + tf.square(old_std) - tf.square(new_std) 55 | denominator = 2 * tf.square(new_std) + 1e-8 56 | 57 | return tf.reduce_sum(numerator / denominator + new_log_stds - old_log_stds, axis=-1) 58 | 59 | def likelihood_ratio_tf(self, x, old_dist, new_dist): 60 | 61 | new = self.log_likelihood_tf(x, new_dist) 62 | old = self.log_likelihood_tf(x, old_dist) 63 | 64 | return tf.exp(new - old) 65 | 66 | def log_likelihood_tf(self, x, dist): 67 | 68 | means = dist["mean"] 69 | log_stds = dist["log_var"] 70 | zs = (x - means) / tf.exp(log_stds) 71 | 72 | return - tf.reduce_sum(log_stds, axis=-1) - \ 73 | 0.5 * tf.reduce_sum(tf.square(zs), axis=-1) - \ 74 | 0.5 * self.dim * numpy.log(2 * numpy.pi) 75 | 76 | 77 | -------------------------------------------------------------------------------- /Chapter03/environment.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 25, 2018 3 | 4 | @author: ywz 5 | ''' 6 | from threading import Thread 7 | 8 | 9 | def new_demo(test=True): 10 | import pygame 11 | from demo.game import Game 12 | 13 | if test is False: 14 | game = Game(640, 480, None) 15 | else: 16 | def _render(game): 17 | while True: 18 | game.draw() 19 | for event in pygame.event.get(): 20 | if event.type == pygame.KEYDOWN: 21 | if event.key == pygame.K_9: 22 | game.increase_fps() 23 | elif event.key == pygame.K_0: 24 | game.decrease_fps() 25 | pygame.init() 26 | DISPLAYSURF = pygame.display.set_mode((640, 480), 0, 32) 27 | pygame.display.set_caption('Demo') 28 | game = Game(640, 480, DISPLAYSURF) 29 | t = Thread(target=lambda: _render(game)) 30 | t.start() 31 | 32 | return game 33 | 34 | 35 | def new_atari_game(rom='breakout'): 36 | from game import Game 37 | 38 | game = 
Game(rom) 39 | 40 | if rom == 'space_invaders': 41 | game.set_params(frame_skip=3, lost_life_as_terminal=False, take_maximum_of_two_frames=True) 42 | elif game == 'alien': 43 | game.set_params(frame_skip=4, crop_offset=20, lost_life_as_terminal=False) 44 | else: 45 | game.set_params(frame_skip=4, lost_life_as_terminal=False) 46 | 47 | return game 48 | 49 | -------------------------------------------------------------------------------- /Chapter03/eval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 28, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import os 7 | import argparse 8 | import tensorflow as tf 9 | from q_learning import DQN 10 | from config import ATARI, DEMO 11 | from environment import new_atari_game, new_demo 12 | 13 | 14 | def main(): 15 | 16 | parser = argparse.ArgumentParser(description=None) 17 | parser.add_argument('-g', '--game', default='demo', type=str, help='Game') 18 | parser.add_argument('-d', '--device', default='cpu', type=str, help='Device') 19 | args = parser.parse_args() 20 | 21 | rom = args.game 22 | if rom == 'demo': 23 | game = new_demo() 24 | conf = DEMO 25 | else: 26 | game = new_atari_game(rom) 27 | conf = ATARI 28 | 29 | model_dir = os.path.join(conf['log_dir'], rom) 30 | device = '/{}:0'.format(args.device) 31 | with tf.device(device): 32 | dqn = DQN(conf, game, model_dir, callback=game.draw) 33 | 34 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 35 | saver = tf.train.Saver() 36 | dqn.load(sess, saver) 37 | dqn.evaluate(sess) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() 42 | -------------------------------------------------------------------------------- /Chapter03/game.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 25, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import gym, numpy, time 7 | from utils import cv2_resize_image 8 | 9 | class Game: 10 | 11 | def __init__(self, name, lost_life_as_terminal=False, take_maximum_of_two_frames=False): 12 | 13 | if take_maximum_of_two_frames is False: 14 | self.mode = 'Deterministic' 15 | else: 16 | self.mode = 'NoFrameskip' 17 | 18 | name = ''.join([s.capitalize() for s in name.split('_')]) 19 | self.ale = gym.make('{}{}-v4'.format(name, self.mode)) 20 | frame = self.ale.reset() 21 | self.lost_life_as_terminal = lost_life_as_terminal 22 | self.lives = 0 23 | self.actions = list(range(self.ale.action_space.n)) 24 | 25 | self.frame_skip = 4 26 | self.total_reward = 0 27 | self.crop_size = 84 28 | self.crop_offset = 8 29 | 30 | # Frame buffer 31 | self.buffer_size = 8 32 | self.buffer_index = 0 33 | self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)] 34 | # Overlapping frames, maximum of two frames 35 | self.last_frame = frame 36 | 37 | def rgb_to_gray(self, im): 38 | return numpy.dot(im, [0.2126, 0.7152, 0.0722]) 39 | 40 | def set_params(self, crop_size=84, crop_offset=8, frame_skip=4, 41 | lost_life_as_terminal=False, take_maximum_of_two_frames=False): 42 | 43 | self.crop_size = crop_size 44 | self.crop_offset = crop_offset 45 | self.frame_skip = frame_skip 46 | self.lost_life_as_terminal = lost_life_as_terminal 47 | self.mode = 'NoFrameskip' if take_maximum_of_two_frames else 'Deterministic' 48 | 49 | frame = self.ale.reset() 50 | self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)] 51 | self.last_frame = frame 52 | 53 | def reset(self): 54 | frame = self.ale.reset() 55 | self.total_reward = 0 56 | 
self.buffer_index = 0 57 | self.lives = 0 58 | self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)] 59 | self.last_frame = frame 60 | 61 | def add_frame_to_buffer(self, frame): 62 | self.buffer_index = self.buffer_index % self.buffer_size 63 | self.buffer[self.buffer_index] = frame 64 | self.buffer_index += 1 65 | 66 | def get_available_actions(self): 67 | return list(range(len(self.actions))) 68 | 69 | def get_feedback_size(self): 70 | return (self.crop_size, self.crop_size) 71 | 72 | def crop(self, frame): 73 | feedback = cv2_resize_image(frame, 74 | resized_shape=(self.crop_size, self.crop_size), 75 | method='crop', crop_offset=self.crop_offset) 76 | return feedback 77 | 78 | def get_current_feedback(self, num_frames=1): 79 | assert num_frames < self.buffer_size, "Frame buffer is not large enough." 80 | index = self.buffer_index - 1 81 | frames = [numpy.expand_dims(self.buffer[index - k], axis=0) for k in range(num_frames)] 82 | if num_frames > 1: 83 | return numpy.concatenate(frames, axis=0) 84 | else: 85 | return frames[0] 86 | 87 | def get_total_reward(self): 88 | return self.total_reward 89 | 90 | def _lost_life(self, info): 91 | if self.lost_life_as_terminal: 92 | lives = info['ale.lives'] 93 | if lives >= self.lives: 94 | self.lives = lives 95 | return False 96 | else: 97 | return True 98 | else: 99 | return False 100 | 101 | def play_action(self, action, num_frames=1): 102 | 103 | if self.mode == 'Deterministic': 104 | termination = 0 105 | a = self.actions[action] 106 | frame, reward, done, info = self.ale.step(a) 107 | if done or self._lost_life(info): termination = 1 108 | self.add_frame_to_buffer(self.crop(self.rgb_to_gray(frame))) 109 | elif self.mode == 'NoFrameskip': 110 | reward = 0 111 | termination = 0 112 | for i in range(self.frame_skip): 113 | a = self.actions[action] 114 | frame, r, done, info = self.ale.step(a) 115 | reward += r 116 | if i == self.frame_skip - 2: self.last_frame = frame 117 | if done or self._lost_life(info): termination = 1 118 | self.add_frame_to_buffer(self.crop(numpy.maximum(self.rgb_to_gray(frame), self.rgb_to_gray(self.last_frame)))) 119 | else: 120 | raise 121 | 122 | r = numpy.clip(reward, -1, 1) 123 | self.total_reward += reward 124 | 125 | return r, self.get_current_feedback(num_frames), termination 126 | 127 | def draw(self): 128 | self.ale.render() 129 | 130 | def get_action_meanings(self): 131 | return self.ale.get_action_meanings() 132 | -------------------------------------------------------------------------------- /Chapter03/krylov.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 4 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | 8 | 9 | class Krylov: 10 | 11 | def __init__(self): 12 | pass 13 | 14 | def cg(self, Ax, b, cg_iters=10, verbose=False, eps=1e-10): 15 | 16 | x = numpy.zeros_like(b) 17 | r = b.copy() 18 | p = b.copy() 19 | r_dot_r = r.dot(r) 20 | 21 | for _ in range(cg_iters): 22 | z = Ax(p) 23 | v = r_dot_r / p.dot(z) 24 | x += v * p 25 | r -= v * z 26 | 27 | new_r_dot_r = r.dot(r) 28 | beta = new_r_dot_r / r_dot_r 29 | p = r + beta * p 30 | 31 | r_dot_r = new_r_dot_r 32 | if r_dot_r < eps: 33 | break 34 | 35 | if verbose: 36 | print("residual norm: {:5f}, solution norm: {:5f}".format(r_dot_r, numpy.linalg.norm(x))) 37 | return x 38 | 39 | if __name__ == "__main__": 40 | 41 | from numpy.linalg import inv 42 | 43 | n = 5 44 | A = numpy.random.rand(n, n) 45 | A = A.T.dot(A) + 0.01 * numpy.eye(n) 46 | b = numpy.random.rand(n) 
47 | x = inv(A).dot(b) 48 | 49 | krylov = Krylov() 50 | y = krylov.cg(lambda x: A.dot(x), b, verbose=True) 51 | 52 | print(x) 53 | print(y) 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /Chapter03/layer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 29 May 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import tensorflow as tf 8 | 9 | 10 | def leaky_relu(x, leak=0.0, name="lrelu"): 11 | return tf.maximum(leak * x, x, name=name) 12 | 13 | def add_regularization(var, weight): 14 | weight_decay = tf.multiply(tf.nn.l2_loss(var), weight, name='weight_loss') 15 | tf.add_to_collection('losses', weight_decay) 16 | 17 | def get_variable_on_cpu(shape, initializer, name, dtype=tf.float32, trainable=True): 18 | with tf.device('/cpu:0'): 19 | var = tf.get_variable(shape=shape, initializer=initializer, 20 | dtype=dtype, name=name, trainable=trainable) 21 | return var 22 | 23 | def HeUniform(shape): 24 | 25 | if len(shape) > 2: 26 | w = shape[0] 27 | h = shape[1] 28 | input_channels = shape[2] 29 | d = 1.0 / numpy.sqrt(input_channels * w * h) 30 | else: 31 | d = 1.0 / numpy.sqrt(shape[0]) 32 | 33 | init_W = tf.random_uniform_initializer(-d, d) 34 | init_b = tf.random_uniform_initializer(-d, d) 35 | return init_W, init_b 36 | 37 | def conv2d(x, output_dim, kernel=(5, 5), stride=(2, 2), 38 | activation=tf.nn.relu, init_W=None, init_b=None, name='conv', padding='VALID'): 39 | 40 | assert len(x.get_shape().as_list()) == 4 41 | shape = (kernel[0], kernel[1], x.get_shape().as_list()[-1], output_dim) 42 | _W, _b = HeUniform(shape) 43 | if init_W is None: init_W = _W 44 | if init_b is None: init_b = _b 45 | 46 | with tf.variable_scope(name): 47 | W = get_variable_on_cpu(shape=shape, initializer=init_W, dtype=tf.float32, name='weight') 48 | b = get_variable_on_cpu(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias') 49 | 50 | conv = tf.nn.conv2d(input=x, filter=W, strides=(1, stride[0], stride[1], 1), padding=padding) 51 | if activation: 52 | conv = activation(tf.nn.bias_add(conv, b)) 53 | else: 54 | conv = tf.nn.bias_add(conv, b) 55 | 56 | return conv 57 | 58 | def linear(x, output_dim, activation=tf.nn.relu, init_W=None, init_b=None, name='linear'): 59 | 60 | if len(x.get_shape().as_list()) > 2: 61 | shape = x.get_shape().as_list() 62 | x = tf.reshape(x, shape=(-1, numpy.prod(shape[1:]))) 63 | 64 | shape = (x.get_shape().as_list()[-1], output_dim) 65 | _W, _b = HeUniform(shape) 66 | if init_W is None: init_W = _W 67 | if init_b is None: init_b = _b 68 | 69 | with tf.variable_scope(name): 70 | W = get_variable_on_cpu(shape=shape, initializer=init_W, dtype=tf.float32, name='weight') 71 | b = get_variable_on_cpu(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias') 72 | 73 | linear = tf.matmul(x, W) + b 74 | if activation: 75 | linear = activation(linear) 76 | 77 | return linear 78 | -------------------------------------------------------------------------------- /Chapter03/layers.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 25, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import tensorflow as tf 8 | 9 | 10 | def get_variable(shape, initializer, name, dtype=tf.float32, trainable=True): 11 | var = tf.get_variable(shape=shape, initializer=initializer, 12 | dtype=dtype, name=name, trainable=trainable) 13 | return var 14 | 15 | 16 | def HeUniform(shape): 17 | 18 | if len(shape) > 2: 19 
| w = shape[0] 20 | h = shape[1] 21 | input_channels = shape[2] 22 | d = 1.0 / numpy.sqrt(input_channels * w * h) 23 | else: 24 | d = 1.0 / numpy.sqrt(shape[0]) 25 | 26 | init_W = tf.random_uniform_initializer(-d, d) 27 | init_b = tf.random_uniform_initializer(-d, d) 28 | return init_W, init_b 29 | 30 | 31 | def conv2d(x, output_dim, kernel=(5, 5), stride=(2, 2), 32 | activation=tf.nn.relu, init_W=None, init_b=None, name='conv', padding='VALID'): 33 | 34 | assert len(x.get_shape().as_list()) == 4 35 | shape = (kernel[0], kernel[1], x.get_shape().as_list()[-1], output_dim) 36 | _W, _b = HeUniform(shape) 37 | if init_W is None: init_W = _W 38 | if init_b is None: init_b = _b 39 | 40 | with tf.variable_scope(name): 41 | W = get_variable(shape=shape, initializer=init_W, dtype=tf.float32, name='weight') 42 | b = get_variable(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias') 43 | 44 | conv = tf.nn.conv2d(input=x, filter=W, strides=(1, stride[0], stride[1], 1), padding=padding) 45 | if activation: 46 | conv = activation(tf.nn.bias_add(conv, b)) 47 | else: 48 | conv = tf.nn.bias_add(conv, b) 49 | 50 | return conv 51 | 52 | 53 | def dense(x, output_dim, activation=tf.nn.relu, init_W=None, init_b=None, name='dense'): 54 | 55 | if len(x.get_shape().as_list()) > 2: 56 | shape = x.get_shape().as_list() 57 | x = tf.reshape(x, shape=(-1, numpy.prod(shape[1:]))) 58 | 59 | shape = (x.get_shape().as_list()[-1], output_dim) 60 | _W, _b = HeUniform(shape) 61 | if init_W is None: init_W = _W 62 | if init_b is None: init_b = _b 63 | 64 | with tf.variable_scope(name): 65 | W = get_variable(shape=shape, initializer=init_W, dtype=tf.float32, name='weight') 66 | b = get_variable(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias') 67 | 68 | output = tf.matmul(x, W) + b 69 | if activation: 70 | output = activation(output) 71 | 72 | return output 73 | -------------------------------------------------------------------------------- /Chapter03/log/Acrobot/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "Acrobot.ckpt" 2 | all_model_checkpoint_paths: "Acrobot.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter03/log/Acrobot/events.out.tfevents.1506500394.ywz-WorkStation-T7400: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Acrobot/events.out.tfevents.1506500394.ywz-WorkStation-T7400 -------------------------------------------------------------------------------- /Chapter03/log/CartPole/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "CartPole.ckpt" 2 | all_model_checkpoint_paths: "CartPole.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter03/log/CartPole/events.out.tfevents.1506667268.ywz-WorkStation-T7400: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/CartPole/events.out.tfevents.1506667268.ywz-WorkStation-T7400 -------------------------------------------------------------------------------- /Chapter03/log/HalfCheetah/checkpoint: 
-------------------------------------------------------------------------------- 1 | model_checkpoint_path: "HalfCheetah.ckpt" 2 | all_model_checkpoint_paths: "HalfCheetah.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter03/log/HalfCheetah/events.out.tfevents.1506338471.ywz-WorkStation-T7400: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/HalfCheetah/events.out.tfevents.1506338471.ywz-WorkStation-T7400 -------------------------------------------------------------------------------- /Chapter03/log/Hopper/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "Hopper.ckpt" 2 | all_model_checkpoint_paths: "Hopper.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter03/log/Hopper/events.out.tfevents.1506658875.ywz-WorkStation-T7400: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Hopper/events.out.tfevents.1506658875.ywz-WorkStation-T7400 -------------------------------------------------------------------------------- /Chapter03/log/Pendulum/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "Pendulum.ckpt" 2 | all_model_checkpoint_paths: "Pendulum.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter03/log/Pendulum/events.out.tfevents.1506666537.ywz-WorkStation-T7400: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Pendulum/events.out.tfevents.1506666537.ywz-WorkStation-T7400 -------------------------------------------------------------------------------- /Chapter03/log/Reacher/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "Reacher.ckpt" 2 | all_model_checkpoint_paths: "Reacher.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter03/log/Reacher/events.out.tfevents.1506398906.ywz-WorkStation-T7400: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Reacher/events.out.tfevents.1506398906.ywz-WorkStation-T7400 -------------------------------------------------------------------------------- /Chapter03/log/Swimmer/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "Swimmer.ckpt" 2 | all_model_checkpoint_paths: "Swimmer.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter03/log/Swimmer/events.out.tfevents.1526197305.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Swimmer/events.out.tfevents.1526197305.ywz-PC 
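Each checkpoint file under Chapter03/log/<Task>/ is a plain TensorFlow 1.x checkpoint index (the model_checkpoint_path entries shown above), while the events.out.tfevents files are binary TensorBoard logs. A minimal sketch of restoring one of these saved models, assuming the graph has already been rebuilt (as the chapter's eval.py and test.py scripts do before loading) and that the corresponding .ckpt weight files exist alongside the index; the directory path is illustrative:

import tensorflow as tf

# Build the model graph first, then restore the trained weights.
log_dir = 'Chapter03/log/Swimmer'  # illustrative path
saver = tf.train.Saver()
with tf.Session() as sess:
    ckpt = tf.train.latest_checkpoint(log_dir)  # resolves the path named in the 'checkpoint' index
    saver.restore(sess, ckpt)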
-------------------------------------------------------------------------------- /Chapter03/log/Walker2d/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "Walker2d.ckpt" 2 | all_model_checkpoint_paths: "Walker2d.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter03/log/Walker2d/events.out.tfevents.1506671852.ywz-WorkStation-T7400: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/log/Walker2d/events.out.tfevents.1506671852.ywz-WorkStation-T7400 -------------------------------------------------------------------------------- /Chapter03/logger.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 22 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import sys 7 | import tensorflow as tf 8 | 9 | def delete_dir(path): 10 | if tf.gfile.Exists(path): 11 | tf.gfile.DeleteRecursively(path) 12 | tf.gfile.MakeDirs(path) 13 | return path 14 | 15 | class Logger: 16 | 17 | def __init__(self, sess, directory): 18 | 19 | self.directory = directory 20 | self.output_file = sys.stdout 21 | 22 | self.step = 0 23 | self.summary_writer = tf.summary.FileWriter(delete_dir(directory), sess.graph) 24 | self.print_buffer = [] 25 | 26 | def clear(self): 27 | self.step = 0 28 | 29 | def set_step(self, step): 30 | self.step = step 31 | 32 | def add_summary(self, summary): 33 | self.summary_writer.add_summary(summary, self.step) 34 | summary_text = tf.Summary() 35 | summary_text.ParseFromString(summary) 36 | self.print_buffer += ["{}: {:5f}".format(v.tag, v.simple_value) for v in summary_text.value] 37 | 38 | def flush(self): 39 | self.summary_writer.flush() 40 | s = ["episode: {}".format(self.step)] + self.print_buffer 41 | print(', '.join(s), file=self.output_file) 42 | self.print_buffer = [] 43 | 44 | -------------------------------------------------------------------------------- /Chapter03/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 4 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | 7 | def main(): 8 | pass 9 | 10 | if __name__ == "__main__": 11 | main() 12 | -------------------------------------------------------------------------------- /Chapter03/mlp.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 5 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import tensorflow as tf 7 | from layer import linear 8 | 9 | 10 | class MLP: 11 | 12 | def __init__(self, input_shape, output_size, hidden_sizes=(32, 32), 13 | hidden_nonlinearity=tf.nn.relu, output_nonlinearity=tf.nn.tanh, 14 | input_layer=None, name='mlp'): 15 | 16 | self.input_shape = input_shape 17 | self.output_size = output_size 18 | self.hidden_sizes = hidden_sizes 19 | self.hidden_nonlinearity = hidden_nonlinearity 20 | self.output_nonlinearity = output_nonlinearity 21 | self.name = name 22 | 23 | if input_layer is None: 24 | self.x = tf.placeholder(dtype=tf.float32, shape=input_shape, name='mlp_input') 25 | else: 26 | self.x = input_layer 27 | 28 | self.build() 29 | 30 | def build(self): 31 | 32 | with tf.variable_scope(self.name): 33 | layer = self.x 34 | for i, hidden_size in enumerate(self.hidden_sizes): 35 | layer = linear(layer, hidden_size, activation=self.hidden_nonlinearity, 36 | 
init_b=tf.constant_initializer(0.0), name='hidden_layer_{}'.format(i)) 37 | 38 | self.y = linear(layer, self.output_size, activation=self.output_nonlinearity, 39 | init_b=tf.constant_initializer(0.0), name='output_layer') 40 | 41 | self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 42 | 43 | def get_params(self): 44 | return self.params 45 | 46 | def get_input_layer(self): 47 | return self.x 48 | 49 | def get_output_layer(self): 50 | return self.y 51 | 52 | 53 | if __name__ == "__main__": 54 | 55 | import numpy 56 | 57 | input_shape = (None, 10) 58 | output_size = 5 59 | mlp = MLP(input_shape=input_shape, output_size=output_size) 60 | print(mlp.get_params()) 61 | 62 | with tf.Session() as sess: 63 | sess.run(tf.global_variables_initializer()) 64 | writer = tf.summary.FileWriter("log/", sess.graph_def) 65 | 66 | x = numpy.random.rand(1, input_shape[1]) 67 | y = sess.run(mlp.get_output_layer(), feed_dict={mlp.get_input_layer(): x}) 68 | print(y) 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /Chapter03/parallel.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 21 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import multiprocessing as mp 7 | import traceback, random 8 | import sys, numpy 9 | import tensorflow as tf 10 | from joblib.pool import MemmapingPool 11 | 12 | 13 | class SharedGlobal(object): 14 | pass 15 | 16 | class StatefulPool(object): 17 | 18 | def __init__(self): 19 | 20 | self.n_parallel = 1 21 | self.pool = None 22 | self.queue = None 23 | self.worker_queue = None 24 | self.G = SharedGlobal() 25 | 26 | def initialize(self, n_parallel): 27 | 28 | self.n_parallel = n_parallel 29 | 30 | if self.pool is not None: 31 | print("Warning: terminating existing pool") 32 | self.pool.terminate() 33 | self.queue.close() 34 | self.worker_queue.close() 35 | self.G = SharedGlobal() 36 | 37 | if n_parallel > 1: 38 | self.queue = mp.Queue() 39 | self.worker_queue = mp.Queue() 40 | self.pool = MemmapingPool(self.n_parallel, temp_folder="/tmp") 41 | 42 | def run_each(self, runner, args_list=None): 43 | 44 | if args_list is None: 45 | args_list = [tuple()] * self.n_parallel 46 | assert len(args_list) == self.n_parallel 47 | 48 | if self.n_parallel > 1: 49 | results = self.pool.map_async(worker_run_each, [(runner, args) for args in args_list]) 50 | for _ in range(self.n_parallel): 51 | self.worker_queue.get() 52 | for _ in range(self.n_parallel): 53 | self.queue.put(None) 54 | return results.get() 55 | else: 56 | return [runner(self.G, *args_list[0])] 57 | 58 | singleton_pool = StatefulPool() 59 | 60 | def worker_run_each(all_args): 61 | try: 62 | runner, args = all_args 63 | # signals to the master that this task is up and running 64 | singleton_pool.worker_queue.put(None) 65 | # wait for the master to signal continuation 66 | singleton_pool.queue.get() 67 | return runner(singleton_pool.G, *args) 68 | except Exception: 69 | raise Exception("".join(traceback.format_exception(*sys.exc_info()))) 70 | 71 | def worker_init(G, i): 72 | G.worker_id = i 73 | 74 | def set_seed(G, seed): 75 | seed %= 4294967294 76 | random.seed(seed) 77 | numpy.random.seed(seed) 78 | tf.set_random_seed(seed) 79 | 80 | def initialize(n_parallel): 81 | singleton_pool.initialize(n_parallel) 82 | singleton_pool.run_each(worker_init, [(i,) for i in range(singleton_pool.n_parallel)]) 83 | singleton_pool.run_each(set_seed, [(123456789 + i,) for i in 
range(singleton_pool.n_parallel)]) 84 | 85 | if __name__ == "__main__": 86 | 87 | thread_num = 4 88 | initialize(thread_num) 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /Chapter03/policy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter03/policy/__init__.py -------------------------------------------------------------------------------- /Chapter03/policy/categorical_mlp.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 27 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import tensorflow as tf 8 | from mlp import MLP 9 | from distribution.categorical import Categorical 10 | 11 | 12 | class CategoricalMLPPolicy: 13 | 14 | def __init__(self, 15 | input_shape, 16 | output_size, 17 | hidden_sizes=(32, 32), 18 | hidden_nonlinearity=tf.nn.tanh): 19 | 20 | self.input_shape = input_shape 21 | self.output_size = output_size 22 | self.hidden_sizes = hidden_sizes 23 | self.locals = locals() 24 | 25 | self.distribution = Categorical(output_size) 26 | self.params = [] 27 | 28 | with tf.variable_scope("policy"): 29 | # Mean network 30 | self.prob_mlp = MLP(input_shape=input_shape, 31 | output_size=output_size, 32 | hidden_sizes=hidden_sizes, 33 | hidden_nonlinearity=hidden_nonlinearity, 34 | output_nonlinearity=tf.nn.softmax, 35 | name='prob') 36 | 37 | self.x = self.prob_mlp.get_input_layer() 38 | self.prob = self.prob_mlp.get_output_layer() 39 | self.params += self.prob_mlp.get_params() 40 | 41 | def get_locals(self): 42 | arguments = {argc: argv for argc, argv in self.locals.items() if argc != 'self'} 43 | return arguments 44 | 45 | def get_action(self, sess, observation): 46 | 47 | if observation.ndim == 1: 48 | observation = observation.reshape((1, observation.size)) 49 | 50 | prob = sess.run(self.prob, feed_dict={self.x: observation})[0] 51 | idx = numpy.random.choice(range(self.output_size), p=prob) 52 | action = numpy.zeros((self.output_size,)) 53 | action[idx] = 1 54 | 55 | return action, {'prob': prob} 56 | 57 | def get_actions(self, sess, observation): 58 | 59 | probs = sess.run(self.prob, feed_dict={self.x: observation}) 60 | actions = numpy.zeros((probs.shape[0], self.output_size)) 61 | for i, prob in enumerate(probs): 62 | idx = numpy.random.choice(range(self.output_size), p=prob) 63 | actions[i][idx] = 1 64 | 65 | return actions, {'prob': probs} 66 | 67 | def get_input(self): 68 | return self.x 69 | 70 | def get_dist_info(self): 71 | return {'prob': self.prob} 72 | 73 | def get_params(self): 74 | return self.params 75 | 76 | @staticmethod 77 | def copy(args): 78 | return CategoricalMLPPolicy(**args) 79 | 80 | if __name__ == "__main__": 81 | 82 | input_shape = (None, 10) 83 | output_size = 5 84 | 85 | policy = CategoricalMLPPolicy(input_shape=input_shape, 86 | output_size=output_size) 87 | 88 | for param in policy.get_params(): 89 | print(param) 90 | 91 | with tf.Session() as sess: 92 | sess.run(tf.global_variables_initializer()) 93 | 94 | observation = numpy.random.rand(2, input_shape[1]) 95 | action = policy.get_actions(sess, observation) 96 | print(action) 97 | 98 | 99 | -------------------------------------------------------------------------------- /Chapter03/policy/deterministic_mlp.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 
Created on 5 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import tensorflow as tf 7 | from mlp import MLP 8 | 9 | 10 | class DeterministicMLPPolicy: 11 | 12 | def __init__(self, 13 | input_shape, 14 | output_size, 15 | hidden_sizes=(32, 32), 16 | hidden_nonlinearity=tf.nn.relu, 17 | output_nonlinearity=tf.nn.tanh): 18 | 19 | self.input_shape = input_shape 20 | self.output_size = output_size 21 | self.locals = locals() 22 | 23 | with tf.variable_scope("policy"): 24 | self.mlp = MLP(input_shape=input_shape, 25 | output_size=output_size, 26 | hidden_sizes=hidden_sizes, 27 | hidden_nonlinearity=hidden_nonlinearity, 28 | output_nonlinearity=output_nonlinearity) 29 | 30 | self.x = self.mlp.get_input_layer() 31 | self.y = self.mlp.get_output_layer() 32 | 33 | def get_locals(self): 34 | arguments = {argc: argv for argc, argv in self.locals.items() if argc != 'self'} 35 | return arguments 36 | 37 | def get_action(self, sess, observation): 38 | if observation.ndim == 1: 39 | observation = observation.reshape((1, observation.size)) 40 | output = sess.run(self.y, feed_dict={self.x: observation}) 41 | return output[0] 42 | 43 | def get_actions(self, sess, observation): 44 | return sess.run(self.y, feed_dict={self.x: observation}) 45 | 46 | def get_params(self): 47 | return self.mlp.get_params() 48 | 49 | @staticmethod 50 | def copy(args): 51 | return DeterministicMLPPolicy(**args) 52 | 53 | 54 | -------------------------------------------------------------------------------- /Chapter03/ppo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 26 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import tensorflow as tf 7 | from utils import iterate_minibatches 8 | 9 | # Proximal Policy Optimization Algorithms 10 | class PPO: 11 | 12 | def __init__(self, 13 | policy, 14 | batch_size=1000, 15 | learning_rate=1e-3, 16 | epsilon=0.2): 17 | 18 | self.policy = policy 19 | self.learning_rate = learning_rate 20 | self.epsilon = epsilon 21 | self.batch_size = batch_size 22 | 23 | self.x = self.policy.get_input() 24 | self.action_dim = self.policy.output_size 25 | self.dist = self.policy.distribution 26 | 27 | self.build_formula() 28 | 29 | def build_formula(self): 30 | 31 | self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.action_dim), name='action') 32 | self.advantage = tf.placeholder(dtype=tf.float32, shape=(None,), name='action') 33 | 34 | dist_vars = self.policy.get_dist_info() 35 | old_dist_vars = {k: tf.placeholder(tf.float32, shape=[None]+list(shape), name='old_dist_{}'.format(k)) 36 | for k, shape in self.dist.specs()} 37 | old_dist_vars_list = [old_dist_vars[k] for k in self.dist.keys()] 38 | 39 | lr = self.dist.likelihood_ratio_tf(self.action, old_dist_vars, dist_vars) 40 | first_term = lr * self.advantage 41 | second_term = tf.clip_by_value(lr, 1 - self.epsilon, 1 + self.epsilon) * self.advantage 42 | loss = -tf.reduce_mean(tf.minimum(first_term, second_term)) 43 | 44 | self.inputs_tensors = [self.x, self.action, self.advantage] + old_dist_vars_list 45 | self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss, 46 | var_list=self.policy.get_params()) 47 | # Add summaries 48 | tf.summary.scalar("loss", loss, collections=['ppo']) 49 | self.summary_op = tf.summary.merge_all('ppo') 50 | 51 | def optimize_policy(self, sess, samples, logger=None, **args): 52 | 53 | obs = samples['observations'] 54 | actions = samples['actions'] 55 | advantages = samples['advantages'] 56 | dist_vars = [samples['infos'][k] for k in self.dist.keys()] 57 
| 58 | inputs = [obs, actions, advantages] + dist_vars 59 | feed_dict = dict(list(zip(self.inputs_tensors, inputs))) 60 | if self.batch_size is not None and obs.shape[0] >= self.batch_size: 61 | for vs in iterate_minibatches(inputs, self.batch_size, shuffle=True): 62 | sess.run(self.train_op, feed_dict=dict(list(zip(self.inputs_tensors, vs)))) 63 | else: 64 | sess.run(self.train_op, feed_dict=feed_dict) 65 | 66 | if logger: 67 | summary_str = sess.run(self.summary_op, feed_dict=feed_dict) 68 | logger.add_summary(summary_str) 69 | 70 | -------------------------------------------------------------------------------- /Chapter03/q_network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 25, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import tensorflow as tf 7 | from layers import conv2d, dense 8 | 9 | 10 | class QNetwork: 11 | 12 | def __init__(self, input_shape=(84, 84, 4), n_outputs=4, 13 | network_type='cnn', scope='q_network'): 14 | 15 | self.width = input_shape[0] 16 | self.height = input_shape[1] 17 | self.channel = input_shape[2] 18 | self.n_outputs = n_outputs 19 | self.network_type = network_type 20 | self.scope = scope 21 | 22 | # Frame images 23 | self.x = tf.placeholder(dtype=tf.float32, 24 | shape=(None, self.channel, self.width, self.height)) 25 | # Estimates of Q-value 26 | self.y = tf.placeholder(dtype=tf.float32, shape=(None,)) 27 | # Selected actions 28 | self.a = tf.placeholder(dtype=tf.int32, shape=(None,)) 29 | 30 | with tf.variable_scope(scope): 31 | self.build() 32 | self.build_loss() 33 | 34 | def build(self): 35 | 36 | self.net = {} 37 | self.net['input'] = tf.transpose(self.x, perm=(0, 2, 3, 1)) 38 | 39 | if self.network_type == 'cnn': 40 | self.net['conv1'] = conv2d(self.net['input'], 32, kernel=(8, 8), stride=(4, 4), 41 | init_b=tf.constant_initializer(0.01), name='conv1') 42 | self.net['conv2'] = conv2d(self.net['input'], 64, kernel=(4, 4), stride=(2, 2), 43 | init_b=tf.constant_initializer(0.01), name='conv2') 44 | self.net['conv3'] = conv2d(self.net['input'], 64, kernel=(3, 3), stride=(1, 1), 45 | init_b=tf.constant_initializer(0.01), name='conv3') 46 | self.net['feature'] = dense(self.net['conv2'], 512, 47 | init_b=tf.constant_initializer(0.01), name='fc1') 48 | elif self.network_type == 'cnn_nips': 49 | self.net['conv1'] = conv2d(self.net['input'], 16, kernel=(8, 8), stride=(4, 4), 50 | init_b=tf.constant_initializer(0.01), name='conv1') 51 | self.net['conv2'] = conv2d(self.net['conv1'], 32, kernel=(4, 4), stride=(2, 2), 52 | init_b=tf.constant_initializer(0.01), name='conv2') 53 | self.net['feature'] = dense(self.net['conv2'], 256, 54 | init_b=tf.constant_initializer(0.01), name='fc1') 55 | elif self.network_type == 'mlp': 56 | self.net['fc1'] = dense(self.net['input'], 50, 57 | init_b=tf.constant_initializer(0.0), name='fc1') 58 | self.net['feature'] = dense(self.net['fc1'], 50, 59 | init_b=tf.constant_initializer(0.0), name='fc2') 60 | else: 61 | raise NotImplementedError('Unknown network type: {}'.format(self.network_type)) 62 | 63 | self.net['values'] = dense(self.net['feature'], self.n_outputs, activation=None, 64 | init_b=tf.constant_initializer(0.0), name='values') 65 | 66 | self.net['q_value'] = tf.reduce_max(self.net['values'], axis=1, name='q_value') 67 | self.net['q_action'] = tf.argmax(self.net['values'], axis=1, 68 | name='q_action', output_type=tf.int32) 69 | 70 | self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 71 | tf.get_variable_scope().name) 72 | 73 | def 
build_loss(self): 74 | 75 | indices = tf.transpose(tf.stack([tf.range(tf.shape(self.a)[0]), self.a], axis=0)) 76 | value = tf.gather_nd(self.net['values'], indices, name='action_value') 77 | 78 | self.loss = 0.5 * tf.reduce_mean(tf.square((value - self.y))) 79 | self.gradient = tf.gradients(self.loss, self.vars) 80 | 81 | tf.summary.scalar("loss", self.loss, collections=['q_network']) 82 | self.summary_op = tf.summary.merge_all('q_network') 83 | 84 | def get_q_value(self, sess, state): 85 | return sess.run(self.net['q_value'], feed_dict={self.x: state}) 86 | 87 | def get_q_action(self, sess, state): 88 | return sess.run(self.net['q_action'], feed_dict={self.x: state}) 89 | 90 | def get_feed_dict(self, states, actions, values): 91 | return {self.x: states, self.a: actions, self.y: values} 92 | 93 | def get_clone_op(self, network): 94 | new_vars = {v.name.replace(network.scope, ''): v for v in network.vars} 95 | return [tf.assign(v, new_vars[v.name.replace(self.scope, '')]) for v in self.vars] 96 | 97 | 98 | if __name__ == "__main__": 99 | import numpy 100 | 101 | num_actions = 4 102 | batch_size = 5 103 | network = QNetwork(n_outputs=num_actions) 104 | 105 | state = numpy.random.rand(batch_size, 4, 84, 84) 106 | values = numpy.random.rand(batch_size) 107 | actions = numpy.random.randint(num_actions, size=batch_size) 108 | 109 | with tf.Session() as sess: 110 | summary_writer = tf.summary.FileWriter('log/', sess.graph) 111 | sess.run(tf.global_variables_initializer()) 112 | 113 | q_values = sess.run(network.net['values'], feed_dict={network.x: state}) 114 | q_value = network.get_q_value(sess, state) 115 | q_action = network.get_q_action(sess, state) 116 | 117 | print(q_values) 118 | print(q_value) 119 | print(q_action) 120 | 121 | -------------------------------------------------------------------------------- /Chapter03/replay_memory.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 25, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import numpy, random 7 | from collections import deque 8 | 9 | 10 | class ReplayMemory: 11 | 12 | def __init__(self, history_len=4, capacity=1000000, batch_size=32, input_scale=255.0): 13 | 14 | self.capacity = capacity 15 | self.history_length = history_len 16 | self.batch_size = batch_size 17 | self.input_scale = input_scale 18 | 19 | self.frames = deque([]) 20 | self.others = deque([]) 21 | 22 | def add(self, frame, action, r, termination): 23 | 24 | if len(self.frames) == self.capacity: 25 | self.frames.popleft() 26 | self.others.popleft() 27 | self.frames.append(frame) 28 | self.others.append((action, r, termination)) 29 | 30 | def add_nullops(self, init_frame): 31 | for _ in range(self.history_length): 32 | self.add(init_frame, 0, 0, 0) 33 | 34 | def phi(self, new_frame): 35 | assert len(self.frames) > self.history_length 36 | images = [new_frame] + [self.frames[-1-i] for i in range(self.history_length-1)] 37 | return numpy.concatenate(images, axis=0) 38 | 39 | def _phi(self, index): 40 | images = [self.frames[index-i] for i in range(self.history_length)] 41 | return numpy.concatenate(images, axis=0) 42 | 43 | def sample(self): 44 | 45 | while True: 46 | 47 | index = random.randint(a=self.history_length-1, b=len(self.frames)-2) 48 | infos = [self.others[index-i] for i in range(self.history_length)] 49 | # Check if termination=1 before "index" 50 | flag = False 51 | for i in range(1, self.history_length): 52 | if infos[i][2] == 1: 53 | flag = True 54 | break 55 | if flag: 56 | continue 57 | 58 | state = 
self._phi(index) 59 | new_state = self._phi(index+1) 60 | action, r, termination = self.others[index] 61 | state = numpy.asarray(state / self.input_scale, dtype=numpy.float32) 62 | new_state = numpy.asarray(new_state / self.input_scale, dtype=numpy.float32) 63 | 64 | return (state, action, r, new_state, termination) 65 | 66 | -------------------------------------------------------------------------------- /Chapter03/simulator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 18 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import gym, numpy 7 | from gym import spaces 8 | 9 | 10 | class Simulator: 11 | 12 | # Supported tasks: 13 | # v1: Reacher, HalfCheetah, Hopper, Swimmer, Walker2d, Ant, Humanoid 14 | # v0: CartPole, Acrobot, Pendulum 15 | def __init__(self, task='Swimmer'): 16 | 17 | self.task = task 18 | try: 19 | self.env = gym.make('{}-v1'.format(task)) 20 | except: 21 | self.env = gym.make('{}-v2'.format(task)) 22 | self.env.reset() 23 | 24 | if type(self.env.action_space) == spaces.Box: 25 | assert len(self.env.action_space.shape) == 1 26 | self.action_dim = self.env.action_space.shape[0] 27 | self.action_type = 'continuous' 28 | elif type(self.env.action_space) == spaces.Discrete: 29 | self.action_dim = self.env.action_space.n 30 | self.action_type = 'discrete' 31 | else: 32 | raise NotImplementedError 33 | 34 | assert len(self.env.observation_space.shape) == 1 35 | self.obsevation_dim = self.env.observation_space.shape[0] 36 | self.total_reward = 0 37 | 38 | def reset(self): 39 | self.total_reward = 0 40 | return self.env.reset() 41 | 42 | def play(self, action): 43 | 44 | termination = 0 45 | if self.action_type == 'continuous': 46 | observation, reward, done, _ = self.env.step(action) 47 | elif self.action_type == 'discrete': 48 | observation, reward, done, _ = self.env.step(numpy.argmax(action)) 49 | 50 | if done: termination = 1 51 | self.total_reward += reward 52 | 53 | return observation, reward, termination 54 | 55 | def render(self): 56 | self.env.render() 57 | 58 | def get_total_reward(self): 59 | return self.total_reward 60 | 61 | 62 | if __name__ == "__main__": 63 | 64 | agent = Simulator(task='Swimmer') 65 | 66 | for _ in range(10): 67 | observation = agent.reset() 68 | while True: 69 | action = numpy.random.uniform(low=-1.0, high=1.0, size=(agent.action_dim,)) 70 | observation, reward, termination = agent.play(action) 71 | 72 | print("Observation: {}".format(observation)) 73 | print("Action: {}".format(action)) 74 | print("Reward: {}".format(reward)) 75 | print("Termination: {}".format(termination)) 76 | 77 | if termination: 78 | break 79 | agent.render() 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Chapter03/test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 22 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import os 7 | import argparse 8 | import tensorflow as tf 9 | from simulator import Simulator 10 | from sampler import Sampler 11 | from policy.gaussian_mlp import GaussianMLPPolicy 12 | from policy.categorical_mlp import CategoricalMLPPolicy 13 | 14 | 15 | def test(task, 16 | num_episodes=10, 17 | policy_network_hidden_sizes=(32, 32), 18 | policy_adaptive_std=False): 19 | 20 | directory = 'log/{}/'.format(task) 21 | simulator = Simulator(task=task) 22 | 23 | input_shape = (None, simulator.obsevation_dim) 24 | output_size = simulator.action_dim 25 | 26 | if simulator.action_type == 'continuous': 27 
| policy_network = GaussianMLPPolicy(input_shape=input_shape, 28 | output_size=output_size, 29 | hidden_sizes=policy_network_hidden_sizes, 30 | adaptive_std=policy_adaptive_std, 31 | std_hidden_sizes=policy_network_hidden_sizes) 32 | elif simulator.action_type == 'discrete': 33 | policy_network = CategoricalMLPPolicy(input_shape=input_shape, 34 | output_size=output_size, 35 | hidden_sizes=policy_network_hidden_sizes) 36 | 37 | sampler = Sampler(simulator, policy_network) 38 | 39 | with tf.Session() as sess: 40 | saver = tf.train.Saver() 41 | checkpoint_path = os.path.join(directory, '{}.ckpt'.format(task)) 42 | saver.restore(sess, checkpoint_path) 43 | 44 | for i in range(num_episodes): 45 | path = sampler.rollout(sess, max_path_length=1000, render=True) 46 | print("episode {}, reward {}".format(i, path['total_reward'])) 47 | 48 | 49 | if __name__ == "__main__": 50 | 51 | parser = argparse.ArgumentParser(description=None) 52 | parser.add_argument('-t', '--task', default='Swimmer', 53 | type=str, help='Tasks: Swimmer, Walker2d, Reacher, HalfCheetah, Hopper, Ant, Humanoid') 54 | args = parser.parse_args() 55 | 56 | test(task=args.task, policy_network_hidden_sizes=(32, 32)) 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /Chapter03/trpo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 18 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy, math 7 | import tensorflow as tf 8 | 9 | 10 | class TRPO: 11 | 12 | def __init__(self, policy, optimizer, step_size): 13 | 14 | self.policy = policy 15 | self.optimizer = optimizer 16 | self.step_size = step_size 17 | 18 | self.x = self.policy.get_input() 19 | self.action_dim = self.policy.output_size 20 | self.dist = self.policy.distribution 21 | 22 | self.build_formula() 23 | 24 | def build_formula(self): 25 | 26 | self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.action_dim), name='action') 27 | self.advantage = tf.placeholder(dtype=tf.float32, shape=(None,), name='advantage') 28 | 29 | dist_vars = self.policy.get_dist_info() 30 | old_dist_vars = {k: tf.placeholder(tf.float32, shape=[None]+list(shape), name='old_dist_{}'.format(k)) 31 | for k, shape in self.dist.specs()} 32 | old_dist_vars_list = [old_dist_vars[k] for k in self.dist.keys()] 33 | 34 | kl = self.dist.kl_tf(old_dist_vars, dist_vars) 35 | lr = self.dist.likelihood_ratio_tf(self.action, old_dist_vars, dist_vars) 36 | mean_kl = tf.reduce_mean(kl) 37 | loss = -tf.reduce_mean(lr * self.advantage) 38 | 39 | self.inputs_tensors = [self.x, self.action, self.advantage] + old_dist_vars_list 40 | self.optimizer.build(loss=loss, 41 | leq_constraint=(mean_kl, self.step_size), 42 | params=self.policy.get_params(), 43 | inputs=self.inputs_tensors) 44 | # Add summaries 45 | tf.summary.scalar("loss", loss, collections=['trpo']) 46 | tf.summary.scalar("mean_kl", mean_kl, collections=['trpo']) 47 | self.summary_op = tf.summary.merge_all('trpo') 48 | 49 | def optimize_policy(self, sess, samples, logger=None, subsample_rate=0.5): 50 | 51 | if subsample_rate < 1.0: 52 | n = len(samples['rewards']) 53 | idx = numpy.random.choice(n, int(math.floor(n * subsample_rate)), replace=False) 54 | obs = samples['observations'][idx] 55 | actions = samples['actions'][idx] 56 | advantages = samples['advantages'][idx] 57 | dist_vars = [samples['infos'][k][idx] for k in self.dist.keys()] 58 | else: 59 | obs = samples['observations'] 60 | actions = samples['actions'] 61 | advantages = 
samples['advantages'] 62 | dist_vars = [samples['infos'][k] for k in self.dist.keys()] 63 | 64 | inputs = [obs, actions, advantages] + dist_vars 65 | self.optimizer.optimize(sess, input_vals=inputs) 66 | 67 | if logger: 68 | feed_dict = dict(list(zip(self.inputs_tensors, inputs))) 69 | summary_str = sess.run(self.summary_op, feed_dict=feed_dict) 70 | logger.add_summary(summary_str) 71 | 72 | 73 | if __name__ == "__main__": 74 | from policy.gaussian_mlp import GaussianMLPPolicy 75 | from optimizer import ConjugateOptimizer 76 | 77 | input_shape = (None, 10) 78 | output_size = 5 79 | 80 | policy = GaussianMLPPolicy(input_shape=input_shape, 81 | output_size=output_size, 82 | learn_std=True, 83 | adaptive_std=False) 84 | optimizer = ConjugateOptimizer() 85 | 86 | trpo = TRPO(policy, optimizer, step_size=0.01) 87 | with tf.Session() as sess: 88 | sess.run(tf.global_variables_initializer()) 89 | 90 | samples = {} 91 | samples['observations'] = numpy.random.rand(10, input_shape[1]) 92 | samples['actions'] = numpy.random.rand(10, output_size) 93 | samples['advantages'] = numpy.random.rand(10) 94 | samples['infos'] = {'mean': numpy.random.rand(10, output_size), 95 | 'log_var': numpy.random.rand(10, output_size)} 96 | 97 | trpo.optimize_policy(sess, samples, subsample_rate=1.0) 98 | print("Finished.") 99 | 100 | -------------------------------------------------------------------------------- /Chapter03/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 4 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import scipy.signal 8 | import tensorflow as tf 9 | 10 | def flatten_tensor_variables(ts): 11 | return tf.concat(axis=0, values=[tf.reshape(x, [-1]) for x in ts]) 12 | 13 | def flatten_tensors(tensors): 14 | if len(tensors) > 0: 15 | return numpy.concatenate([numpy.reshape(x, [-1]) for x in tensors]) 16 | else: 17 | return numpy.asarray([]) 18 | 19 | def unflatten_tensors(flattened, tensor_shapes): 20 | tensor_sizes = list(map(numpy.prod, tensor_shapes)) 21 | indices = numpy.cumsum(tensor_sizes)[:-1] 22 | return [numpy.reshape(pair[0], pair[1]) for pair in zip(numpy.split(flattened, indices), tensor_shapes)] 23 | 24 | def get_param_values(sess, params, flatten=True): 25 | values = sess.run(params) 26 | if flatten: 27 | values = flatten_tensors(values) 28 | return values 29 | 30 | def get_param_assign_ops(params): 31 | 32 | assign_ops = [] 33 | input_tensors = [] 34 | 35 | for param in params: 36 | v = tf.placeholder(dtype=param.dtype, shape=param.get_shape()) 37 | assign_ops.append(tf.assign(param, v)) 38 | input_tensors.append(v) 39 | 40 | return assign_ops, input_tensors 41 | 42 | def set_param_values(sess, assign_ops, input_tensors, values, flatten=True): 43 | 44 | if flatten: 45 | shapes = [p.get_shape().as_list() for p in input_tensors] 46 | values = unflatten_tensors(values, shapes) 47 | 48 | feed_dict = dict(list(zip(input_tensors, values))) 49 | sess.run(assign_ops, feed_dict=feed_dict) 50 | 51 | def discount_cumsum(x, discount): 52 | # See https://docs.scipy.org/doc/scipy/reference/tutorial/signal.html#difference-equation-filtering 53 | # Here, we have y[t] - discount*y[t+1] = x[t] 54 | # or rev(y)[t] - discount*rev(y)[t-1] = rev(x)[t] 55 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 56 | 57 | def iterate_minibatches(input_list=None, batch_size=None, shuffle=False): 58 | 59 | if batch_size is None: 60 | batch_size = len(input_list[0]) 61 | assert all(len(x) == len(input_list[0]) for x 
in input_list) 62 | 63 | if shuffle: 64 | indices = numpy.arange(len(input_list[0])) 65 | numpy.random.shuffle(indices) 66 | 67 | for start_idx in range(0, len(input_list[0]), batch_size): 68 | idx = indices[start_idx:start_idx + batch_size] if shuffle else slice(start_idx, start_idx + batch_size) 69 | yield [r[idx] for r in input_list] 70 | 71 | -------------------------------------------------------------------------------- /Chapter03/value/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 20 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | -------------------------------------------------------------------------------- /Chapter03/value/linear_fitting.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 20 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | 8 | 9 | class LinearFitting: 10 | 11 | def __init__(self): 12 | self.beta = None 13 | self.sess = None 14 | 15 | def set_session(self, sess): 16 | self.sess = sess 17 | 18 | def feature(self, path): 19 | o = numpy.clip(path['observations'], -10, 10) 20 | l = len(path["rewards"]) 21 | al = numpy.arange(l).reshape(-1, 1) / 100.0 22 | return numpy.concatenate([o, o ** 2, al, al ** 2, al ** 3, numpy.ones((l, 1))], axis=1) 23 | 24 | def train(self, paths): 25 | 26 | features = numpy.concatenate([self.feature(path) for path in paths]) 27 | returns = numpy.concatenate([path['returns'] for path in paths]) 28 | 29 | reg_coeff = 1e-5 30 | for _ in range(5): 31 | self.beta = numpy.linalg.lstsq(features.T.dot(features) + 32 | reg_coeff * numpy.identity(features.shape[1]), 33 | features.T.dot(returns))[0] 34 | if not numpy.any(numpy.isnan(self.beta)): 35 | break 36 | reg_coeff *= 10 37 | 38 | def predict(self, path): 39 | if self.beta is None: 40 | return numpy.zeros((len(path['rewards'],))) 41 | else: 42 | return self.feature(path).dot(self.beta) 43 | 44 | -------------------------------------------------------------------------------- /Chapter03/value/mlp_fitting.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 26 Sep 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import tensorflow as tf 8 | from mlp import MLP 9 | from utils import iterate_minibatches 10 | 11 | 12 | class MLPFitting: 13 | 14 | def __init__(self, 15 | input_shape, 16 | hidden_sizes=(32, 32), 17 | hidden_nonlinearity=tf.nn.tanh, 18 | learning_rate=3e-4, 19 | batch_size=1000): 20 | 21 | self.input_shape = input_shape 22 | self.hidden_sizes = hidden_sizes 23 | self.learning_rate = learning_rate 24 | self.batch_size = batch_size 25 | self.sess = None 26 | 27 | with tf.variable_scope("mlp_fitting"): 28 | self.mlp = MLP(input_shape=input_shape, 29 | output_size=1, 30 | hidden_sizes=hidden_sizes, 31 | hidden_nonlinearity=hidden_nonlinearity, 32 | output_nonlinearity=None, 33 | name='value') 34 | 35 | self.x = self.mlp.get_input_layer() 36 | self.y = tf.reshape(self.mlp.get_output_layer(), shape=(-1,)) 37 | self.params = self.mlp.get_params() 38 | 39 | self.z = tf.placeholder(dtype=tf.float32, shape=(None,), name='z') 40 | loss = tf.reduce_mean(tf.square(self.z - self.y)) 41 | self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss, var_list=self.params) 42 | 43 | def set_session(self, sess): 44 | self.sess = sess 45 | 46 | def train(self, paths): 47 | assert self.sess is not None 48 | obs = numpy.concatenate([path['observations'] for path in paths]) 49 | returns = 
numpy.concatenate([path['returns'] for path in paths]) 50 | if self.batch_size is not None and obs.shape[0] >= self.batch_size: 51 | for x, z in iterate_minibatches([obs, returns], self.batch_size, shuffle=True): 52 | self.sess.run(self.train_op, feed_dict={self.x: x, self.z: z}) 53 | else: 54 | self.sess.run(self.train_op, feed_dict={self.x: obs, self.z: returns}) 55 | 56 | def predict(self, path): 57 | assert self.sess is not None 58 | return self.sess.run(self.y, feed_dict={self.x: path['observations']}) 59 | 60 | 61 | if __name__ == "__main__": 62 | 63 | input_shape = (None, 5) 64 | mlp = MLPFitting(input_shape) 65 | 66 | path = {'observations': numpy.random.rand(1000, 5), 67 | 'returns': numpy.random.rand(1000)} 68 | 69 | with tf.Session() as sess: 70 | sess.run(tf.global_variables_initializer()) 71 | mlp.set_session(sess) 72 | mlp.train(paths=[path]) 73 | print(mlp.predict(path)) 74 | 75 | 76 | -------------------------------------------------------------------------------- /Chapter04/actor_critic_net.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 10, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import tensorflow as tf 7 | from actor_network import ActorNetwork 8 | from critic_network import CriticNetwork 9 | 10 | 11 | class ActorCriticNet: 12 | 13 | def __init__(self, input_dim, action_dim, 14 | critic_layers, actor_layers, actor_activation, 15 | scope='ac_network'): 16 | 17 | self.input_dim = input_dim 18 | self.action_dim = action_dim 19 | self.scope = scope 20 | 21 | self.x = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='x') 22 | self.y = tf.placeholder(shape=(None,), dtype=tf.float32, name='y') 23 | 24 | with tf.variable_scope(scope): 25 | self.actor_network = ActorNetwork(self.x, action_dim, 26 | hidden_layers=actor_layers, 27 | activation=actor_activation) 28 | 29 | self.critic_network = CriticNetwork(self.x, 30 | self.actor_network.get_output_layer(), 31 | hidden_layers=critic_layers) 32 | 33 | self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 34 | tf.get_variable_scope().name) 35 | self._build() 36 | 37 | def _build(self): 38 | 39 | value = self.critic_network.get_output_layer() 40 | 41 | actor_loss = -tf.reduce_mean(value) 42 | self.actor_vars = self.actor_network.get_params() 43 | self.actor_grad = tf.gradients(actor_loss, self.actor_vars) 44 | tf.summary.scalar("actor_loss", actor_loss, collections=['actor']) 45 | self.actor_summary = tf.summary.merge_all('actor') 46 | 47 | critic_loss = 0.5 * tf.reduce_mean(tf.square((value - self.y))) 48 | self.critic_vars = self.critic_network.get_params() 49 | self.critic_grad = tf.gradients(critic_loss, self.critic_vars) 50 | tf.summary.scalar("critic_loss", critic_loss, collections=['critic']) 51 | self.critic_summary = tf.summary.merge_all('critic') 52 | 53 | def get_action(self, sess, state): 54 | return self.actor_network.get_action(sess, state) 55 | 56 | def get_value(self, sess, state): 57 | return self.critic_network.get_value(sess, state) 58 | 59 | def get_action_value(self, sess, state, action): 60 | return self.critic_network.get_action_value(sess, state, action) 61 | 62 | def get_actor_feed_dict(self, state): 63 | return {self.x: state} 64 | 65 | def get_critic_feed_dict(self, state, action, target): 66 | return {self.x: state, self.y: target, 67 | self.critic_network.input_action: action} 68 | 69 | def get_clone_op(self, network, tau=0.9): 70 | update_ops = [] 71 | new_vars = {v.name.replace(network.scope, ''): v for v in 
network.vars} 72 | for v in self.vars: 73 | u = (1 - tau) * v + tau * new_vars[v.name.replace(self.scope, '')] 74 | update_ops.append(tf.assign(v, u)) 75 | return update_ops 76 | 77 | 78 | if __name__ == "__main__": 79 | import numpy 80 | 81 | batch_size = 5 82 | input_dim = 10 83 | action_dim = 3 84 | hidden_layers = [20, 20] 85 | network = ActorCriticNet(input_dim, action_dim, 86 | hidden_layers, hidden_layers, 87 | actor_activation=tf.nn.relu) 88 | 89 | state = numpy.random.rand(batch_size, input_dim) 90 | action = numpy.random.rand(batch_size, action_dim) 91 | with tf.Session() as sess: 92 | summary_writer = tf.summary.FileWriter('log/', sess.graph) 93 | sess.run(tf.global_variables_initializer()) 94 | 95 | a = network.get_action(sess, state) 96 | v = network.get_value(sess, state) 97 | assert numpy.sum(numpy.fabs(v - network.get_action_value(sess, state, action))) > 1e-3 98 | assert numpy.sum(numpy.fabs(v - network.get_action_value(sess, state, a))) < 1e-8 99 | print("Pass") 100 | -------------------------------------------------------------------------------- /Chapter04/actor_network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 10, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import tensorflow as tf 7 | from layers import dense 8 | 9 | 10 | class ActorNetwork: 11 | 12 | def __init__(self, input_state, output_dim, hidden_layers, activation=tf.nn.relu): 13 | 14 | self.x = input_state 15 | self.output_dim = output_dim 16 | self.hidden_layers = hidden_layers 17 | self.activation = activation 18 | 19 | with tf.variable_scope('actor_network'): 20 | self.output = self._build() 21 | self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 22 | tf.get_variable_scope().name) 23 | 24 | def _build(self): 25 | 26 | layer = self.x 27 | init_b = tf.constant_initializer(0.01) 28 | 29 | for i, num_unit in enumerate(self.hidden_layers): 30 | layer = dense(layer, num_unit, init_b=init_b, name='hidden_layer_{}'.format(i)) 31 | 32 | output = dense(layer, self.output_dim, activation=self.activation, init_b=init_b, name='output') 33 | return output 34 | 35 | def get_output_layer(self): 36 | return self.output 37 | 38 | def get_params(self): 39 | return self.vars 40 | 41 | def get_action(self, sess, state): 42 | return sess.run(self.output, feed_dict={self.x: state}) 43 | 44 | 45 | if __name__ == "__main__": 46 | import numpy 47 | 48 | batch_size = 5 49 | input_dim = 10 50 | output_dim = 3 51 | hidden_layers = [20, 20] 52 | x = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='input') 53 | network = ActorNetwork(x, output_dim, hidden_layers) 54 | 55 | state = numpy.random.rand(batch_size, input_dim) 56 | with tf.Session() as sess: 57 | summary_writer = tf.summary.FileWriter('log/', sess.graph) 58 | sess.run(tf.global_variables_initializer()) 59 | action = network.get_action(sess, state) 60 | print(action) 61 | 62 | -------------------------------------------------------------------------------- /Chapter04/config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 11, 2018 3 | 4 | @author: ywz 5 | ''' 6 | DEMO = { 7 | 'gamma': 0.99, 8 | 'history_len': 2, 9 | 'num_episode': 3000, 10 | 'capacity': 100000, 11 | 'epsilon_decay': 100000, 12 | 'epsilon_min': 0.0, 13 | 'time_between_two_copies': 2000, 14 | 'update_interval': 1, 15 | 'T': 1000000, 16 | 17 | 'batch_size': 64, 18 | 'learning_rate': 1e-4, 19 | 'tau': 0.9, 20 | 'optimizer': 'adam', 21 | 'rho': 0.99, 22 | 'log_dir': 
'log/' 23 | } 24 | -------------------------------------------------------------------------------- /Chapter04/critic_network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 10, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import tensorflow as tf 7 | from layers import dense 8 | 9 | 10 | class CriticNetwork: 11 | 12 | def __init__(self, input_state, input_action, hidden_layers): 13 | 14 | assert len(hidden_layers) >= 2 15 | self.input_state = input_state 16 | self.input_action = input_action 17 | self.hidden_layers = hidden_layers 18 | 19 | with tf.variable_scope('critic_network'): 20 | self.output = self._build() 21 | self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 22 | tf.get_variable_scope().name) 23 | 24 | def _build(self): 25 | 26 | layer = self.input_state 27 | init_b = tf.constant_initializer(0.01) 28 | 29 | for i, num_unit in enumerate(self.hidden_layers): 30 | if i != 1: 31 | layer = dense(layer, num_unit, init_b=init_b, name='hidden_layer_{}'.format(i)) 32 | else: 33 | layer = tf.concat([layer, self.input_action], axis=1, name='concat_action') 34 | layer = dense(layer, num_unit, init_b=init_b, name='hidden_layer_{}'.format(i)) 35 | 36 | output = dense(layer, 1, activation=None, init_b=init_b, name='output') 37 | return tf.reshape(output, shape=(-1,)) 38 | 39 | def get_output_layer(self): 40 | return self.output 41 | 42 | def get_params(self): 43 | return self.vars 44 | 45 | def get_value(self, sess, state): 46 | return sess.run(self.output, feed_dict={self.input_state: state}) 47 | 48 | def get_action_value(self, sess, state, action): 49 | return sess.run(self.output, feed_dict={self.input_state: state, 50 | self.input_action: action}) 51 | 52 | 53 | if __name__ == "__main__": 54 | import numpy 55 | 56 | batch_size = 5 57 | input_dim = 10 58 | output_dim = 3 59 | hidden_layers = [20, 20] 60 | x = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='input') 61 | a = tf.placeholder(shape=(None, input_dim), dtype=tf.float32, name='action') 62 | network = CriticNetwork(x, a, hidden_layers) 63 | 64 | state = numpy.random.rand(batch_size, input_dim) 65 | action = numpy.random.rand(batch_size, input_dim) 66 | with tf.Session() as sess: 67 | summary_writer = tf.summary.FileWriter('log/', sess.graph) 68 | sess.run(tf.global_variables_initializer()) 69 | value = network.get_action_value(sess, state, action) 70 | print(value) 71 | 72 | -------------------------------------------------------------------------------- /Chapter04/eval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 15, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import os 7 | import argparse 8 | import tensorflow as tf 9 | from config import DEMO 10 | from task import Task 11 | from dpg import DPG 12 | 13 | 14 | def main(): 15 | 16 | parser = argparse.ArgumentParser(description=None) 17 | parser.add_argument('-t', '--task', default='CartPole-v0', 18 | type=str, help='Tasks: CartPole-v0, Pendulum-v0, Acrobot-v1') 19 | parser.add_argument('-d', '--device', default='cpu', type=str, help='Device: cpu, gpu') 20 | args = parser.parse_args() 21 | 22 | task = Task(args.task) 23 | log_dir = os.path.join(DEMO['log_dir'], '{}/train'.format(args.task)) 24 | if not tf.gfile.Exists(log_dir): 25 | tf.gfile.MakeDirs(log_dir) 26 | model_dir = os.path.join(DEMO['log_dir'], args.task) 27 | 28 | device = '/{}:0'.format('cpu') 29 | with tf.device(device): 30 | model = DPG(DEMO, task, model_dir, 
callback=task.render) 31 | 32 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 33 | saver = tf.train.Saver() 34 | model.load(sess, saver) 35 | model.evaluate(sess) 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /Chapter04/layers.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 25, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import tensorflow as tf 8 | 9 | 10 | def get_variable(shape, initializer, name, dtype=tf.float32, trainable=True): 11 | var = tf.get_variable(shape=shape, initializer=initializer, 12 | dtype=dtype, name=name, trainable=trainable) 13 | return var 14 | 15 | 16 | def HeUniform(shape): 17 | 18 | if len(shape) > 2: 19 | w = shape[0] 20 | h = shape[1] 21 | input_channels = shape[2] 22 | d = 1.0 / numpy.sqrt(input_channels * w * h) 23 | else: 24 | d = 1.0 / numpy.sqrt(shape[0]) 25 | 26 | init_W = tf.random_uniform_initializer(-d, d) 27 | init_b = tf.random_uniform_initializer(-d, d) 28 | return init_W, init_b 29 | 30 | 31 | def conv2d(x, output_dim, kernel=(5, 5), stride=(2, 2), 32 | activation=tf.nn.relu, init_W=None, init_b=None, name='conv', padding='VALID'): 33 | 34 | assert len(x.get_shape().as_list()) == 4 35 | shape = (kernel[0], kernel[1], x.get_shape().as_list()[-1], output_dim) 36 | _W, _b = HeUniform(shape) 37 | if init_W is None: init_W = _W 38 | if init_b is None: init_b = _b 39 | 40 | with tf.variable_scope(name): 41 | W = get_variable(shape=shape, initializer=init_W, dtype=tf.float32, name='weight') 42 | b = get_variable(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias') 43 | 44 | conv = tf.nn.conv2d(input=x, filter=W, strides=(1, stride[0], stride[1], 1), padding=padding) 45 | if activation: 46 | conv = activation(tf.nn.bias_add(conv, b)) 47 | else: 48 | conv = tf.nn.bias_add(conv, b) 49 | 50 | return conv 51 | 52 | 53 | def dense(x, output_dim, activation=tf.nn.relu, init_W=None, init_b=None, name='dense'): 54 | 55 | if len(x.get_shape().as_list()) > 2: 56 | shape = x.get_shape().as_list() 57 | x = tf.reshape(x, shape=(-1, numpy.prod(shape[1:]))) 58 | 59 | shape = (x.get_shape().as_list()[-1], output_dim) 60 | _W, _b = HeUniform(shape) 61 | if init_W is None: init_W = _W 62 | if init_b is None: init_b = _b 63 | 64 | with tf.variable_scope(name): 65 | W = get_variable(shape=shape, initializer=init_W, dtype=tf.float32, name='weight') 66 | b = get_variable(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias') 67 | 68 | output = tf.matmul(x, W) + b 69 | if activation: 70 | output = activation(output) 71 | 72 | return output 73 | -------------------------------------------------------------------------------- /Chapter04/log/Acrobot-v1/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter04/log/Acrobot-v1/train/events.out.tfevents.1523886598.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter04/log/Acrobot-v1/train/events.out.tfevents.1523886598.ywz-PC -------------------------------------------------------------------------------- 
/Chapter04/log/CartPole-v0/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter04/log/CartPole-v0/train/events.out.tfevents.1525870448.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter04/log/CartPole-v0/train/events.out.tfevents.1525870448.ywz-PC -------------------------------------------------------------------------------- /Chapter04/log/MountainCar-v0/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter04/log/MountainCar-v0/train/events.out.tfevents.1526196635.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter04/log/MountainCar-v0/train/events.out.tfevents.1526196635.ywz-PC -------------------------------------------------------------------------------- /Chapter04/log/Pendulum-v0/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter04/log/Pendulum-v0/train/events.out.tfevents.1525871560.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter04/log/Pendulum-v0/train/events.out.tfevents.1525871560.ywz-PC -------------------------------------------------------------------------------- /Chapter04/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 10, 2018 3 | 4 | @author: ywz 5 | ''' 6 | -------------------------------------------------------------------------------- /Chapter04/optimizer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 11, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import tensorflow as tf 8 | 9 | 10 | class Optimizer: 11 | 12 | def __init__(self, config, ac_network, target_network, replay_memory): 13 | 14 | self.ac_network = ac_network 15 | self.target_network = target_network 16 | self.replay_memory = replay_memory 17 | self.summary_writer = None 18 | self.gamma = config['gamma'] 19 | 20 | if config['optimizer'] == 'adam': 21 | opt = tf.train.AdamOptimizer(learning_rate=config['learning_rate'], 22 | beta1=config['rho']) 23 | elif config['optimizer'] == 'momentum': 24 | opt = tf.train.MomentumOptimizer(learning_rate=config['learning_rate'], 25 | momentum=config['rho']) 26 | else: 27 | raise ValueError("Unknown optimizer") 28 | 29 | self.actor_train_op = opt.apply_gradients(zip(ac_network.actor_grad, 30 | ac_network.actor_vars)) 31 | 32 | self.critic_train_op = opt.apply_gradients(zip(ac_network.critic_grad, 33 | ac_network.critic_vars)) 34 | 35 | def set_summary_writer(self, 
summary_writer=None): 36 | self.summary_writer = summary_writer 37 | 38 | def sample_transitions(self, sess, batch_size): 39 | 40 | input_dim = self.ac_network.input_dim 41 | action_dim = self.ac_network.action_dim 42 | 43 | states = numpy.zeros((batch_size, input_dim), dtype=numpy.float32) 44 | new_states = numpy.zeros((batch_size, input_dim), dtype=numpy.float32) 45 | targets = numpy.zeros(batch_size, dtype=numpy.float32) 46 | actions = numpy.zeros((batch_size, action_dim), dtype=numpy.float32) 47 | terms = numpy.zeros(batch_size, dtype=numpy.int32) 48 | 49 | for i in range(batch_size): 50 | state, action, r, new_state, term = self.replay_memory.sample() 51 | states[i] = state 52 | new_states[i] = new_state 53 | actions[i] = action 54 | targets[i] = r 55 | terms[i] = term 56 | 57 | targets += self.gamma * (1 - terms) * self.target_network.get_value(sess, new_states) 58 | return states, actions, targets 59 | 60 | def train_one_step(self, sess, step, batch_size): 61 | 62 | states, actions, targets = self.sample_transitions(sess, batch_size) 63 | 64 | # Critic update 65 | feed_dict = self.ac_network.get_critic_feed_dict(states, actions, targets) 66 | if self.summary_writer and step % 2000 == 0: 67 | s, _, = sess.run([self.ac_network.critic_summary, self.critic_train_op], 68 | feed_dict=feed_dict) 69 | self.summary_writer.add_summary(s, step) 70 | self.summary_writer.flush() 71 | else: 72 | sess.run(self.critic_train_op, feed_dict=feed_dict) 73 | 74 | # Actor update 75 | feed_dict = self.ac_network.get_actor_feed_dict(states) 76 | if self.summary_writer and step % 2000 == 0: 77 | s, _, = sess.run([self.ac_network.actor_summary, self.actor_train_op], 78 | feed_dict=feed_dict) 79 | self.summary_writer.add_summary(s, step) 80 | self.summary_writer.flush() 81 | else: 82 | sess.run(self.actor_train_op, feed_dict=feed_dict) 83 | 84 | 85 | -------------------------------------------------------------------------------- /Chapter04/replay_memory.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 11, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import numpy, random 7 | from collections import deque 8 | 9 | 10 | class ReplayMemory: 11 | 12 | def __init__(self, history_len=4, capacity=1000000): 13 | 14 | self.capacity = capacity 15 | self.history_length = history_len 16 | 17 | self.states = deque([]) 18 | self.others = deque([]) 19 | 20 | def add(self, state, action, r, termination): 21 | 22 | if len(self.states) == self.capacity: 23 | self.states.popleft() 24 | self.others.popleft() 25 | self.states.append(state) 26 | self.others.append((action, r, termination)) 27 | 28 | def add_nullops(self, init_state): 29 | for _ in range(self.history_length): 30 | self.add(init_state, 0, 0, 0) 31 | 32 | def phi(self, new_state): 33 | assert len(self.states) > self.history_length 34 | states = [new_state] + [self.states[-1-i] for i in range(self.history_length-1)] 35 | return numpy.concatenate(states, axis=0) 36 | 37 | def _phi(self, index): 38 | states = [self.states[index-i] for i in range(self.history_length)] 39 | return numpy.concatenate(states, axis=0) 40 | 41 | def sample(self): 42 | 43 | while True: 44 | 45 | index = random.randint(a=self.history_length-1, b=len(self.states)-2) 46 | infos = [self.others[index-i] for i in range(self.history_length)] 47 | # Check if termination=1 before "index" 48 | flag = False 49 | for i in range(1, self.history_length): 50 | if infos[i][2] == 1: 51 | flag = True 52 | break 53 | if flag: 54 | continue 55 | 56 | state = 
self._phi(index) 57 | new_state = self._phi(index+1) 58 | action, r, termination = self.others[index] 59 | state = numpy.asarray(state, dtype=numpy.float32) 60 | new_state = numpy.asarray(new_state, dtype=numpy.float32) 61 | 62 | return (state, action, r, new_state, termination) 63 | 64 | 65 | if __name__ == "__main__": 66 | 67 | history_len = 2 68 | capacity = 20 69 | 70 | replay = ReplayMemory(history_len, capacity) 71 | 72 | for i in range(20): 73 | state = numpy.zeros((2,)) + i 74 | action = numpy.ones((2,)) * i 75 | reward = i ** 2 76 | termination = 1 if i % 10 == 0 else 0 77 | replay.add(state, action, reward, termination) 78 | 79 | print(replay.states) 80 | print(replay.others) 81 | state, action, r, new_state, termination = replay.sample() 82 | print(state) 83 | print(new_state) 84 | print(action) 85 | print(r) 86 | print(termination) 87 | print('------------------------------') 88 | 89 | for _ in range(50): 90 | replay.sample() 91 | 92 | -------------------------------------------------------------------------------- /Chapter04/task.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 11, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import gym 7 | import numpy 8 | import tensorflow as tf 9 | 10 | 11 | class Task: 12 | 13 | def __init__(self, name): 14 | 15 | assert name in ['CartPole-v0', 'MountainCar-v0', 16 | 'Pendulum-v0', 'Acrobot-v1'] 17 | self.name = name 18 | self.task = gym.make(name) 19 | self.last_state = self.reset() 20 | 21 | def reset(self): 22 | state = self.task.reset() 23 | self.total_reward = 0 24 | return state 25 | 26 | def play_action(self, action): 27 | 28 | if self.name not in ['Pendulum-v0', 'MountainCarContinuous-v0']: 29 | action = numpy.fmax(action, 0) 30 | action = action / numpy.sum(action) 31 | action = numpy.random.choice(range(len(action)), p=action) 32 | else: 33 | low = self.task.env.action_space.low 34 | high = self.task.env.action_space.high 35 | action = numpy.fmin(numpy.fmax(action, low), high) 36 | 37 | state, reward, done, _ = self.task.step(action) 38 | self.total_reward += reward 39 | termination = 1 if done else 0 40 | 41 | return reward, state, termination 42 | 43 | def get_total_reward(self): 44 | return self.total_reward 45 | 46 | def get_action_dim(self): 47 | if self.name not in ['Pendulum-v0', 'MountainCarContinuous-v0']: 48 | return self.task.env.action_space.n 49 | else: 50 | return self.task.env.action_space.shape[0] 51 | 52 | def get_state_dim(self): 53 | return self.last_state.shape[0] 54 | 55 | def get_activation_fn(self): 56 | if self.name not in ['Pendulum-v0', 'MountainCarContinuous-v0']: 57 | return tf.nn.softmax 58 | else: 59 | return None 60 | 61 | def render(self): 62 | self.task.render() 63 | -------------------------------------------------------------------------------- /Chapter04/train.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 12, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import os 7 | import argparse 8 | import tensorflow as tf 9 | from config import DEMO 10 | from task import Task 11 | from dpg import DPG 12 | 13 | 14 | def delete_dir(path): 15 | if tf.gfile.Exists(path): 16 | tf.gfile.DeleteRecursively(path) 17 | tf.gfile.MakeDirs(path) 18 | return path 19 | 20 | 21 | def main(): 22 | 23 | parser = argparse.ArgumentParser(description=None) 24 | parser.add_argument('-t', '--task', default='CartPole-v0', 25 | type=str, help='Tasks: CartPole-v0, Pendulum-v0, Acrobot-v1') 26 | 
parser.add_argument('-d', '--device', default='cpu', type=str, help='Device: cpu, gpu') 27 | args = parser.parse_args() 28 | 29 | task = Task(args.task) 30 | log_dir = os.path.join(DEMO['log_dir'], '{}/train'.format(args.task)) 31 | if not tf.gfile.Exists(log_dir): 32 | tf.gfile.MakeDirs(log_dir) 33 | model_dir = os.path.join(DEMO['log_dir'], args.task) 34 | 35 | device = '/{}:0'.format(args.device) 36 | with tf.device(device): 37 | model = DPG(DEMO, task, model_dir, callback=None) 38 | 39 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 40 | saver = tf.train.Saver() 41 | writer = tf.summary.FileWriter(delete_dir(log_dir), sess.graph_def) 42 | model.set_summary_writer(summary_writer=writer) 43 | 44 | sess.run(tf.global_variables_initializer()) 45 | model.train(sess, saver) 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | 51 | -------------------------------------------------------------------------------- /Chapter05/cluster.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 1 Jun 2017 3 | 4 | @author: ywz 5 | ''' 6 | PORT = 12222 7 | 8 | def cluster_spec(num_workers, num_ps): 9 | 10 | cluster = {} 11 | port = PORT 12 | 13 | host = '127.0.0.1' 14 | cluster['ps'] = ['{}:{}'.format(host, port+i) for i in range(num_ps)] 15 | cluster['worker'] = ['{}:{}'.format(host, port+i+num_ps) for i in range(num_workers)] 16 | 17 | return cluster 18 | -------------------------------------------------------------------------------- /Chapter05/demo/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 10, 2016 3 | 4 | @author: a0096049 5 | ''' 6 | -------------------------------------------------------------------------------- /Chapter05/demo/object.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on May 16, 2016 3 | 4 | @author: a0096049 5 | ''' 6 | 7 | import numpy, pygame 8 | from demo.utils import Color, calculateIntersectPoint 9 | 10 | 11 | class Object: 12 | 13 | def __init__(self, x, y, r, game): 14 | 15 | self.x = x 16 | self.y = y 17 | self.r = r 18 | self.game = game 19 | 20 | def get_position(self): 21 | return self.x, self.y 22 | 23 | def get_radius(self): 24 | return self.r 25 | 26 | def set_position(self, x, y): 27 | self.x = x 28 | self.y = y 29 | 30 | def draw(self): 31 | pass 32 | 33 | class Food(Object): 34 | 35 | def __init__(self, x, y, radius, t, game): 36 | 37 | super().__init__(x, y, radius, game) 38 | self.type = t 39 | self.life = numpy.random.randint(1000, 5000) 40 | 41 | def decrease_life(self): 42 | self.life -= 1 43 | return self.life == 0 44 | 45 | def draw(self, found=False): 46 | 47 | if found == False: 48 | if self.type == "bad": 49 | pygame.draw.circle(self.game.DISPLAYSURF, Color.RED, (self.x, self.y), self.r) 50 | else: 51 | pygame.draw.circle(self.game.DISPLAYSURF, Color.GREEN, (self.x, self.y), self.r) 52 | else: 53 | pygame.draw.circle(self.game.DISPLAYSURF, Color.BLUE, (self.x, self.y), self.r) 54 | 55 | class Wall: 56 | 57 | def __init__(self, start, end, game, width=2): 58 | 59 | self.start = start 60 | self.end = end 61 | self.game = game 62 | self.width = width 63 | 64 | def draw(self): 65 | pygame.draw.line(self.game.DISPLAYSURF, Color.WHITE, self.start, self.end, self.width) 66 | 67 | def collide(self, p1, p2): 68 | 69 | point = calculateIntersectPoint(p1, p2, self.start, self.end) 70 | if point is None: 71 | return None 72 | else: 73 | return 
(int(point[0]), int(point[1])) 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /Chapter05/doom/doom.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 7 Jun 2017 3 | 4 | @author: ywz 5 | ''' 6 | from vizdoom import * 7 | import random 8 | import time 9 | 10 | def main(): 11 | 12 | game = DoomGame() 13 | game.load_config("./scenarios/basic.cfg") 14 | game.init() 15 | 16 | shoot = [0, 0, 1] 17 | left = [1, 0, 0] 18 | right = [0, 1, 0] 19 | actions = [shoot, left, right] 20 | 21 | episodes = 10 22 | for _ in range(episodes): 23 | game.new_episode() 24 | while not game.is_episode_finished(): 25 | state = game.get_state() 26 | img = state.screen_buffer 27 | misc = state.game_variables 28 | 29 | print(img.shape) 30 | print(misc) 31 | 32 | reward = game.make_action(random.choice(actions)) 33 | print("\treward: {}".format(reward)) 34 | time.sleep(0.05) 35 | print("Result: {}".format(game.get_total_reward())) 36 | time.sleep(2) 37 | 38 | if __name__ == "__main__": 39 | main() 40 | 41 | -------------------------------------------------------------------------------- /Chapter05/doom/game.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 7 Jun 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | from vizdoom import * 8 | from utils import cv2_resize_image 9 | 10 | class Game: 11 | 12 | def __init__(self, config='basic', window_visible=True): 13 | 14 | self.env = DoomGame() 15 | self.env.load_config("./scenarios/{}.cfg".format(config)) 16 | self.env.set_window_visible(window_visible) 17 | self.env.set_screen_format(ScreenFormat.GRAY8) 18 | self.env.init() 19 | 20 | self.env.new_episode() 21 | frame = self.get_current_frame() 22 | 23 | shoot = [0, 0, 1] 24 | left = [1, 0, 0] 25 | right = [0, 1, 0] 26 | self.raw_actions = [shoot, left, right] 27 | self.actions = list(range(len(self.raw_actions))) 28 | 29 | self.frame_skip = 4 30 | self.total_reward = 0 31 | self.reshape_size = 120 32 | 33 | # Frame buffer 34 | self.buffer_size = 8 35 | self.buffer_index = 0 36 | self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)] 37 | 38 | def get_current_frame(self): 39 | frame = self.env.get_state().screen_buffer 40 | return frame 41 | 42 | def rgb_to_gray(self, im): 43 | if len(im) == 3: 44 | return numpy.dot(im, [0.299, 0.587, 0.114]) 45 | else: 46 | return im 47 | 48 | def set_params(self, frame_skip=4): 49 | self.frame_skip = frame_skip 50 | self.env.new_episode() 51 | frame = self.get_current_frame() 52 | self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)] 53 | 54 | def reset(self): 55 | self.env.new_episode() 56 | frame = self.get_current_frame() 57 | self.total_reward = 0 58 | self.buffer_index = 0 59 | self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)] 60 | 61 | def add_frame_to_buffer(self, frame): 62 | self.buffer_index = self.buffer_index % self.buffer_size 63 | self.buffer[self.buffer_index] = frame 64 | self.buffer_index += 1 65 | 66 | def get_available_actions(self): 67 | return list(range(len(self.actions))) 68 | 69 | def get_feedback_size(self): 70 | return (self.reshape_size, self.reshape_size) 71 | 72 | def crop(self, frame): 73 | frame = cv2_resize_image(frame, 74 | resized_shape=(self.reshape_size, self.reshape_size), 75 | method='scale', crop_offset=0) 76 | return frame 77 | 78 | def get_current_feedback(self, num_frames=4): 79 | 
assert num_frames < self.buffer_size, "Frame buffer is not large enough." 80 | index = self.buffer_index - 1 81 | frames = [numpy.expand_dims(self.buffer[index - k], axis=0) for k in range(num_frames)] 82 | if num_frames > 1: 83 | return numpy.concatenate(frames, axis=0) 84 | else: 85 | return frames[0] 86 | 87 | def get_total_reward(self): 88 | return self.total_reward 89 | 90 | def play_action(self, action, num_frames=4): 91 | 92 | termination = 0 93 | a = self.raw_actions[action] 94 | reward = self.env.make_action(a) 95 | done = self.env.is_episode_finished() 96 | 97 | if done: 98 | termination = 1 99 | else: 100 | frame = self.get_current_frame() 101 | self.add_frame_to_buffer(self.crop(self.rgb_to_gray(frame))) 102 | 103 | r = numpy.clip(reward, -1, 1) 104 | self.total_reward += reward 105 | 106 | return r, self.get_current_feedback(num_frames), termination 107 | 108 | if __name__ == "__main__": 109 | 110 | import random 111 | from PIL import Image 112 | 113 | game = Game() 114 | game.set_params(frame_skip=4) 115 | actions = game.get_available_actions() 116 | print(actions) 117 | 118 | for t in range(500): 119 | 120 | action = random.choice(actions) 121 | reward, feedback, termination = game.play_action(action, num_frames=4) 122 | if termination: 123 | break 124 | 125 | for i in range(feedback.shape[0]): 126 | img = Image.fromarray(feedback[feedback.shape[0]-i-1]) 127 | img.save('save/{}_{}.bmp'.format(t, i)) 128 | 129 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/basic.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = basic.wad 6 | doom_map = map01 7 | 8 | # Rewards 9 | living_reward = -1 10 | 11 | # Rendering options 12 | screen_resolution = RES_320X240 13 | screen_format = CRCGCB 14 | render_hud = True 15 | render_crosshair = false 16 | render_weapon = true 17 | render_decals = false 18 | render_particles = false 19 | window_visible = true 20 | 21 | # make episodes start after 20 tics (after unholstering the gun) 22 | episode_start_time = 14 23 | 24 | # make episodes finish after 300 actions (tics) 25 | episode_timeout = 300 26 | 27 | # Available buttons 28 | available_buttons = 29 | { 30 | MOVE_LEFT 31 | MOVE_RIGHT 32 | ATTACK 33 | } 34 | 35 | # Game variables that will be in the state 36 | available_game_variables = { AMMO2} 37 | 38 | mode = PLAYER 39 | doom_skill = 5 40 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/basic.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/basic.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/cig.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 
4 | 5 | doom_scenario_path = cig.wad 6 | 7 | #12 minutes 8 | episode_timeout = 25200 9 | 10 | # Rendering options 11 | screen_resolution = RES_640X480 12 | screen_format = CRCGCB 13 | render_hud = true 14 | render_crosshair = true 15 | render_weapon = true 16 | render_decals = false 17 | render_particles = false 18 | 19 | window_visible = true 20 | 21 | # Available buttons 22 | available_buttons = 23 | { 24 | TURN_LEFT 25 | TURN_RIGHT 26 | ATTACK 27 | 28 | MOVE_RIGHT 29 | MOVE_LEFT 30 | 31 | MOVE_FORWARD 32 | MOVE_BACKWARD 33 | TURN_LEFT_RIGHT_DELTA 34 | LOOK_UP_DOWN_DELTA 35 | 36 | } 37 | 38 | mode = ASYNC_PLAYER 39 | 40 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/cig.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/cig.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/cig_with_unknown.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/cig_with_unknown.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/deadly_corridor.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = deadly_corridor.wad 6 | 7 | # Skill 5 is reccomanded for the scenario to be a challenge. 8 | doom_skill = 5 9 | 10 | # Rewards 11 | death_penalty = 100 12 | #living_reward = 0 13 | 14 | # Rendering options 15 | screen_resolution = RES_320X240 16 | screen_format = CRCGCB 17 | render_hud = true 18 | render_crosshair = false 19 | render_weapon = true 20 | render_decals = false 21 | render_particles = false 22 | window_visible = true 23 | 24 | episode_timeout = 2100 25 | 26 | # Available buttons 27 | available_buttons = 28 | { 29 | MOVE_LEFT 30 | MOVE_RIGHT 31 | ATTACK 32 | MOVE_FORWARD 33 | MOVE_BACKWARD 34 | TURN_LEFT 35 | TURN_RIGHT 36 | } 37 | 38 | # Game variables that will be in the state 39 | available_game_variables = { HEALTH } 40 | 41 | mode = PLAYER 42 | 43 | 44 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/deadly_corridor.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/deadly_corridor.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/deathmatch.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 
4 | 5 | doom_scenario_path = deathmatch.wad 6 | 7 | # Rendering options 8 | screen_resolution = RES_320X240 9 | screen_format = CRCGCB 10 | render_hud = true 11 | render_crosshair = false 12 | render_weapon = true 13 | render_decals = false 14 | render_particles = false 15 | window_visible = true 16 | 17 | # make episodes finish after 4200 actions (tics) 18 | episode_timeout = 4200 19 | 20 | # Available buttons 21 | available_buttons = 22 | { 23 | ATTACK 24 | SPEED 25 | STRAFE 26 | 27 | MOVE_RIGHT 28 | MOVE_LEFT 29 | MOVE_BACKWARD 30 | MOVE_FORWARD 31 | TURN_RIGHT 32 | TURN_LEFT 33 | 34 | SELECT_WEAPON1 35 | SELECT_WEAPON2 36 | SELECT_WEAPON3 37 | SELECT_WEAPON4 38 | SELECT_WEAPON5 39 | SELECT_WEAPON6 40 | 41 | SELECT_NEXT_WEAPON 42 | SELECT_PREV_WEAPON 43 | 44 | LOOK_UP_DOWN_DELTA 45 | TURN_LEFT_RIGHT_DELTA 46 | MOVE_LEFT_RIGHT_DELTA 47 | 48 | } 49 | 50 | # Game variables that will be in the state 51 | available_game_variables = 52 | { 53 | KILLCOUNT 54 | HEALTH 55 | ARMOR 56 | SELECTED_WEAPON 57 | SELECTED_WEAPON_AMMO 58 | } 59 | mode = PLAYER 60 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/deathmatch.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/deathmatch.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/defend_the_center.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = defend_the_center.wad 6 | 7 | # Rewards 8 | death_penalty = 1 9 | 10 | # Rendering options 11 | screen_resolution = RES_640X480 12 | screen_format = CRCGCB 13 | render_hud = True 14 | render_crosshair = false 15 | render_weapon = true 16 | render_decals = false 17 | render_particles = false 18 | window_visible = true 19 | 20 | # make episodes start after 10 tics (after unholstering the gun) 21 | episode_start_time = 10 22 | 23 | # make episodes finish after 2100 actions (tics) 24 | episode_timeout = 2100 25 | 26 | # Available buttons 27 | available_buttons = 28 | { 29 | TURN_LEFT 30 | TURN_RIGHT 31 | ATTACK 32 | } 33 | 34 | # Game variables that will be in the state 35 | available_game_variables = { AMMO2 HEALTH } 36 | 37 | mode = PLAYER 38 | doom_skill = 3 39 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/defend_the_center.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/defend_the_center.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/defend_the_line.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. 
episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = defend_the_line.wad 6 | 7 | # Rewards 8 | death_penalty = 1 9 | 10 | # Rendering options 11 | screen_resolution = RES_320X240 12 | screen_format = CRCGCB 13 | render_hud = True 14 | render_crosshair = false 15 | render_weapon = true 16 | render_decals = false 17 | render_particles = false 18 | window_visible = true 19 | 20 | # make episodes start after 10 tics (after unholstering the gun) 21 | episode_start_time = 10 22 | 23 | 24 | # Available buttons 25 | available_buttons = 26 | { 27 | TURN_lEFT 28 | TURN_RIGHT 29 | ATTACK 30 | } 31 | 32 | # Game variables that will be in the state 33 | available_game_variables = { AMMO2 HEALTH} 34 | 35 | mode = PLAYER 36 | doom_skill = 3 37 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/defend_the_line.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/defend_the_line.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/health_gathering.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = health_gathering.wad 6 | 7 | # Each step is good for you! 8 | living_reward = 1 9 | # And death is not! 10 | death_penalty = 100 11 | 12 | # Rendering options 13 | screen_resolution = RES_320X240 14 | screen_format = CRCGCB 15 | render_hud = false 16 | render_crosshair = false 17 | render_weapon = false 18 | render_decals = false 19 | render_particles = false 20 | window_visible = true 21 | 22 | # make episodes finish after 2100 actions (tics) 23 | episode_timeout = 2100 24 | 25 | # Available buttons 26 | available_buttons = 27 | { 28 | TURN_LEFT 29 | TURN_RIGHT 30 | MOVE_FORWARD 31 | } 32 | 33 | # Game variables that will be in the state 34 | available_game_variables = { HEALTH } 35 | 36 | mode = PLAYER -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/health_gathering.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/health_gathering.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/health_gathering_supreme.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/health_gathering_supreme.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/learning.cfg: -------------------------------------------------------------------------------- 1 | doom_scenario_path = basic.wad 2 | 3 | # Rewards 4 | living_reward = -1 5 | 6 | # Rendering options 7 | screen_resolution = RES_640X480 8 | screen_format = GRAY8 9 | render_hud = false 10 | 
render_crosshair = false 11 | render_weapon = true 12 | render_decals = false 13 | render_particles = false 14 | window_visible = false 15 | 16 | # make episodes start after 20 tics (after unholstering the gun) 17 | episode_start_time = 14 18 | 19 | # make episodes finish after 300 actions (tics) 20 | episode_timeout = 300 21 | 22 | # Available buttons 23 | available_buttons = 24 | { 25 | MOVE_LEFT 26 | MOVE_RIGHT 27 | ATTACK 28 | } 29 | 30 | mode = PLAYER 31 | 32 | 33 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/multi.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = multi_deathmatch.wad 6 | 7 | # Rewards 8 | death_penalty = 1 9 | 10 | # Rendering options 11 | screen_resolution = RES_640X480 12 | screen_format = CRCGCB 13 | render_hud = true 14 | render_crosshair = true 15 | render_weapon = true 16 | render_decals = false 17 | render_particles = false 18 | 19 | window_visible = true 20 | 21 | 22 | # Available buttons 23 | available_buttons = 24 | { 25 | TURN_LEFT 26 | TURN_RIGHT 27 | ATTACK 28 | 29 | MOVE_RIGHT 30 | MOVE_LEFT 31 | 32 | MOVE_FORWARD 33 | MOVE_BACKWARD 34 | TURN_LEFT_RIGHT_DELTA 35 | LOOK_UP_DOWN_DELTA 36 | 37 | } 38 | 39 | available_game_variables = 40 | { 41 | HEALTH 42 | AMMO3 43 | } 44 | mode = ASYNC_PLAYER 45 | 46 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/multi_deathmatch.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/multi_deathmatch.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/multi_duel.cfg: -------------------------------------------------------------------------------- 1 | doom_scenario_path = multi_duel.wad 2 | 3 | screen_resolution = RES_640X480 4 | screen_format = CRCGCB 5 | render_hud = true 6 | render_crosshair = false 7 | render_weapon = true 8 | render_decals = true 9 | render_particles = true 10 | window_visible = true 11 | 12 | available_buttons = 13 | { 14 | MOVE_LEFT 15 | MOVE_RIGHT 16 | ATTACK 17 | } 18 | 19 | mode = PLAYER 20 | doom_skill = 5 21 | 22 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/multi_duel.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/multi_duel.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/my_way_home.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 
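# my_way_home: the agent spawns in a random room and must navigate the maze to the goal item; the small negative living_reward below pushes it to get there quickly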
4 | 5 | doom_scenario_path = my_way_home.wad 6 | 7 | # Rewards 8 | living_reward = -0.0001 9 | 10 | # Rendering options 11 | screen_resolution = RES_640X480 12 | screen_format = CRCGCB 13 | render_hud = false 14 | render_crosshair = false 15 | render_weapon = true 16 | render_decals = false 17 | render_particles = false 18 | window_visible = true 19 | 20 | # make episodes start after 10 tics (after unholstering the gun) 21 | episode_start_time = 10 22 | 23 | # make episodes finish after 2100 actions (tics) 24 | episode_timeout = 2100 25 | 26 | # Available buttons 27 | available_buttons = 28 | { 29 | TURN_LEFT 30 | TURN_RIGHT 31 | MOVE_FORWARD 32 | MOVE_LEFT 33 | MOVE_RIGHT 34 | } 35 | 36 | # Game variables that will be in the state 37 | available_game_variables = { AMMO0 } 38 | 39 | mode = PLAYER 40 | doom_skill = 5 41 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/my_way_home.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/my_way_home.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/predict_position.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = predict_position.wad 6 | 7 | # Rewards 8 | living_reward = -0.001 9 | 10 | # Rendering options 11 | screen_resolution = RES_800X450 12 | screen_format = CRCGCB 13 | render_hud = false 14 | render_crosshair = false 15 | render_weapon = true 16 | render_decals = false 17 | render_particles = false 18 | window_visible = true 19 | 20 | # make episodes start after 16 tics (after producing the rocket launcher) 21 | episode_start_time = 16 22 | 23 | # make episodes finish after 300 actions (tics) 24 | episode_timeout = 300 25 | 26 | # Available buttons 27 | available_buttons = 28 | { 29 | TURN_LEFT 30 | TURN_RIGHT 31 | ATTACK 32 | } 33 | 34 | # Empty list is allowed, in case you are lazy. 
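# (with no game variables in the state, the agent has to learn from the screen buffer alone)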
35 | available_game_variables = { } 36 | 37 | game_args += +sv_noautoaim 1 38 | 39 | mode = PLAYER 40 | doom_skill = 1 41 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/predict_position.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/predict_position.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/rocket_basic.cfg: -------------------------------------------------------------------------------- 1 | doom_scenario_path = rocket_basic.wad 2 | 3 | # Rewards 4 | living_reward = -1 5 | 6 | # Rendering options 7 | screen_resolution = RES_640X480 8 | screen_format = GRAY8 9 | render_hud = true 10 | render_crosshair = false 11 | render_weapon = true 12 | render_decals = false 13 | render_particles = false 14 | 15 | # make episodes start after 14 tics (after unholstering the gun) 16 | episode_start_time = 14 17 | 18 | # make episodes finish after 300 actions (tics) 19 | episode_timeout = 300 20 | 21 | # Available buttons 22 | available_buttons = 23 | { 24 | MOVE_LEFT 25 | MOVE_RIGHT 26 | ATTACK 27 | } 28 | 29 | game_args += +sv_noautoaim 1 30 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/rocket_basic.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/rocket_basic.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/simpler_basic.cfg: -------------------------------------------------------------------------------- 1 | doom_scenario_path = simpler_basic.wad 2 | 3 | # Rewards 4 | living_reward = -1 5 | 6 | # Rendering options 7 | screen_resolution = RES_640X480 8 | screen_format = GRAY8 9 | 10 | render_hud = true 11 | render_crosshair = false 12 | render_weapon = true 13 | render_decals = false 14 | render_particles = false 15 | 16 | # make episodes start after 20 tics (after unholstering the gun) 17 | episode_start_time = 14 18 | 19 | # make episodes finish after 300 actions (tics) 20 | episode_timeout = 300 21 | 22 | # Available buttons 23 | available_buttons = 24 | { 25 | MOVE_LEFT 26 | MOVE_RIGHT 27 | ATTACK 28 | } 29 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/simpler_basic.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/simpler_basic.wad -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/take_cover.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 
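# take_cover: dodge incoming projectiles using only left/right movement; with living_reward = 1 below, survival time is the score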
4 | 5 | doom_scenario_path = take_cover.wad 6 | doom_map = map01 7 | 8 | # Rewards 9 | living_reward = 1 10 | 11 | # Rendering options 12 | screen_resolution = RES_320X240 13 | screen_format = CRCGCB 14 | render_hud = false 15 | render_crosshair = false 16 | render_weapon = false 17 | render_decals = false 18 | render_particles = false 19 | window_visible = true 20 | 21 | # Available buttons 22 | available_buttons = 23 | { 24 | MOVE_LEFT 25 | MOVE_RIGHT 26 | } 27 | 28 | # Game variables that will be in the state 29 | available_game_variables = { HEALTH } 30 | 31 | # Change it if you wish. 32 | doom_skill = 4 33 | 34 | -------------------------------------------------------------------------------- /Chapter05/doom/scenarios/take_cover.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/doom/scenarios/take_cover.wad -------------------------------------------------------------------------------- /Chapter05/environment.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2 Jun 2017 3 | 4 | @author: ywz 5 | ''' 6 | import time 7 | from threading import Thread 8 | from parameter import Parameter 9 | 10 | 11 | def new_demo(test=False): 12 | 13 | import pygame 14 | from demo.game import Game 15 | if test is False: 16 | game = Game(640, 480, None) 17 | else: 18 | def _render(game): 19 | while True: 20 | game.draw() 21 | for event in pygame.event.get(): 22 | if event.type == pygame.KEYDOWN: 23 | if event.key == pygame.K_9: 24 | game.increase_fps() 25 | elif event.key == pygame.K_0: 26 | game.decrease_fps() 27 | pygame.init() 28 | DISPLAYSURF = pygame.display.set_mode((640, 480), 0, 32) 29 | pygame.display.set_caption('Demo') 30 | game = Game(640, 480, DISPLAYSURF) 31 | t = Thread(target=lambda: _render(game)) 32 | t.start() 33 | 34 | parameter = Parameter(lr=1e-3) 35 | parameter.gamma = 0.9 36 | parameter.iteration_num = 300000 37 | parameter.num_history_frames = 1 38 | parameter.network_type = 'mlp' 39 | 40 | parameter.update_method = 'rmsprop' 41 | parameter.rho = 0.95 42 | parameter.async_update_interval = 5 43 | parameter.input_scale = 1.0 44 | 45 | return game, parameter 46 | 47 | 48 | def new_atari_game(rom='breakout'): 49 | 50 | from game import Game 51 | game = Game(rom) 52 | 53 | if rom == 'space_invaders': 54 | game.set_params(frame_skip=3, lost_life_as_terminal=False, take_maximum_of_two_frames=True) 55 | elif game == 'alien': 56 | game.set_params(frame_skip=4, crop_offset=20, lost_life_as_terminal=False) 57 | else: 58 | game.set_params(frame_skip=4, lost_life_as_terminal=False) 59 | 60 | parameter = Parameter(lr=7e-4) 61 | parameter.gamma = 0.99 62 | parameter.num_history_frames = 4 63 | 64 | parameter.async_update_interval = 20 65 | parameter.max_iter_num = 16 * 10 ** 7 66 | parameter.update_method = 'rmsprop' 67 | parameter.rho = 0.99 68 | parameter.rmsprop_epsilon = 1e-1 # 1e-3 if rom == 'breakout' else 1e-1 69 | 70 | time.sleep(1) 71 | return game, parameter 72 | 73 | 74 | def new_minecraft(rom='MinecraftBasic-v0'): 75 | 76 | from minecraft.game import Game 77 | game = Game(rom) 78 | 79 | parameter = Parameter(lr=7e-4) 80 | parameter.gamma = 0.99 81 | parameter.num_history_frames = 4 82 | 83 | parameter.async_update_interval = 20 84 | parameter.max_iter_num = 16 * 10 ** 7 85 | parameter.update_method = 'rmsprop' 86 | parameter.rho = 0.99 87 | 
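# Same optimizer settings as the Atari configuration above; only the RMSProp epsilon below differs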
parameter.rmsprop_epsilon = 1e-3 88 | 89 | time.sleep(1) 90 | return game, parameter 91 | 92 | 93 | def new_environment(name='demo', test=False): 94 | 95 | if name == 'demo': 96 | return new_demo(test=test) 97 | elif name.find('Minecraft') != -1: 98 | return new_minecraft(rom=name) 99 | else: 100 | return new_atari_game(rom=name) 101 | 102 | 103 | -------------------------------------------------------------------------------- /Chapter05/ff_policy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 29 May 2017 3 | 4 | @author: ywz 5 | ''' 6 | import tensorflow as tf 7 | from layer import conv2d, linear 8 | 9 | 10 | class FFPolicy: 11 | 12 | def __init__(self, input_shape=(84, 84, 4), n_outputs=4, network_type='cnn'): 13 | 14 | self.width = input_shape[0] 15 | self.height = input_shape[1] 16 | self.channel = input_shape[2] 17 | self.n_outputs = n_outputs 18 | self.network_type = network_type 19 | self.entropy_beta = 0.01 20 | 21 | self.x = tf.placeholder(dtype=tf.float32, 22 | shape=(None, self.channel, self.width, self.height)) 23 | self.build_model() 24 | 25 | def build_model(self): 26 | 27 | self.net = {} 28 | self.net['input'] = tf.transpose(self.x, perm=(0, 2, 3, 1)) 29 | 30 | if self.network_type == 'cnn': 31 | self.net['conv1'] = conv2d(self.net['input'], 16, kernel=(8, 8), stride=(4, 4), name='conv1') 32 | self.net['conv2'] = conv2d(self.net['conv1'], 32, kernel=(4, 4), stride=(2, 2), name='conv2') 33 | self.net['feature'] = linear(self.net['conv2'], 256, name='fc1') 34 | else: 35 | # MLP for testing 36 | self.net['fc1'] = linear(self.net['input'], 50, init_b = tf.constant_initializer(0.0), name='fc1') 37 | self.net['feature'] = linear(self.net['fc1'], 50, init_b = tf.constant_initializer(0.0), name='fc2') 38 | 39 | self.net['value'] = tf.reshape(linear(self.net['feature'], 1, activation=None, name='value', 40 | init_b = tf.constant_initializer(0.0)), 41 | shape=(-1,)) 42 | 43 | self.net['logits'] = linear(self.net['feature'], self.n_outputs, activation=None, name='logits', 44 | init_b = tf.constant_initializer(0.0)) 45 | 46 | self.net['policy'] = tf.nn.softmax(self.net['logits'], name='policy') 47 | self.net['log_policy'] = tf.nn.log_softmax(self.net['logits'], name='log_policy') 48 | 49 | self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 50 | 51 | def build_gradient_op(self, clip_grad=None): 52 | 53 | self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.n_outputs), name='action') 54 | self.reward = tf.placeholder(dtype=tf.float32, shape=(None,), name='reward') 55 | self.advantage = tf.placeholder(dtype=tf.float32, shape=(None,), name='advantage') 56 | 57 | value = self.net['value'] 58 | policy = self.net['policy'] 59 | log_policy = self.net['log_policy'] 60 | 61 | entropy = -tf.reduce_sum(policy * log_policy, axis=1) 62 | p_loss = -tf.reduce_sum(tf.reduce_sum(log_policy * self.action, axis=1) * self.advantage + self.entropy_beta * entropy) 63 | v_loss = 0.5 * tf.reduce_sum((value - self.reward) ** 2) 64 | total_loss = p_loss + v_loss 65 | 66 | self.gradients = tf.gradients(total_loss, self.vars) 67 | if clip_grad is not None: 68 | self.gradients, _ = tf.clip_by_global_norm(self.gradients, clip_grad) 69 | 70 | # Add summaries 71 | tf.summary.scalar("policy_loss", p_loss, collections=['policy_network']) 72 | tf.summary.scalar("value_loss", v_loss, collections=['policy_network']) 73 | tf.summary.scalar("entropy", tf.reduce_mean(entropy), collections=['policy_network']) 74 | 
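# The loss built above is the standard A3C objective: p_loss is the policy-gradient term
# weighted by the advantage plus an entropy bonus (entropy_beta) that discourages
# premature convergence, and v_loss is a squared-error critic loss against the sampled return.
# Note that build_gradient_op returns the (optionally clipped) gradients instead of applying
# them, so the caller can push them to the shared global network, as A3C requires.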
# tf.summary.scalar("grad_global_norm", tf.global_norm(self.gradients), collections=['policy_network']) 75 | self.summary_op = tf.summary.merge_all('policy_network') 76 | 77 | return self.gradients 78 | 79 | def run_initial_state(self, sess): 80 | return None 81 | 82 | def run_value(self, sess, state, *args): 83 | value = sess.run(self.net['value'], 84 | feed_dict={self.x: state}) 85 | return value 86 | 87 | def run_policy_and_value(self, sess, state, *args): 88 | policy, value = sess.run([self.net['policy'], self.net['value']], 89 | feed_dict={self.x: state}) 90 | return policy, value 91 | 92 | def get_feed_dict(self, states, actions, rewards, advantages, *args): 93 | feed_dict={self.x: states, self.action: actions, 94 | self.reward: rewards, self.advantage: advantages} 95 | return feed_dict 96 | 97 | 98 | -------------------------------------------------------------------------------- /Chapter05/helper/tmux: -------------------------------------------------------------------------------- 1 | tmux can be controlled using a prefix key (by default, Ctrl-b) and a command key. The command key to split into two panes is %. From within tmux: 2 | Ctrl-b % 3 | 4 | We can split our second pane horizontally: 5 | Ctrl-b " 6 | 7 | To switch to the next pane (panes are numbered left-to-right, top-down): 8 | Ctrl-b o 9 | 10 | A step in context above panes are windows. Windows behave similarly to tabs in a browser. 11 | When tmux starts up, it gives you a window and a single pane inside the window. 12 | To create a new window: 13 | Ctrl-b c 14 | 15 | tmux will switch to the new window automatically. You can see the new window indicated in the status-line. Windows are numbered from 0, so our new window is number 1. 16 | Now you can create panes and treat this window like we did before. We can even create another window. Our three windows are numbered 0, 1, and 2. 
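For example, to start a detached session named a3c (the session name used by train.py in this chapter) and attach to it later:
tmux new-session -s a3c -d
tmux attach -t a3c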
17 | To move to the next window in the index: 18 | Ctrl-b n 19 | 20 | To move backwards in the index: 21 | Ctrl-b p 22 | 23 | -------------------------------------------------------------------------------- /Chapter05/layer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 29 May 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy 7 | import tensorflow as tf 8 | 9 | 10 | def leaky_relu(x, leak=0.0, name="lrelu"): 11 | return tf.maximum(leak * x, x, name=name) 12 | 13 | def add_regularization(var, weight): 14 | weight_decay = tf.multiply(tf.nn.l2_loss(var), weight, name='weight_loss') 15 | tf.add_to_collection('losses', weight_decay) 16 | 17 | def get_variable_on_cpu(shape, initializer, name, dtype=tf.float32, trainable=True): 18 | with tf.device('/cpu:0'): 19 | var = tf.get_variable(shape=shape, initializer=initializer, 20 | dtype=dtype, name=name, trainable=trainable) 21 | return var 22 | 23 | def HeUniform(shape): 24 | 25 | if len(shape) > 2: 26 | w = shape[0] 27 | h = shape[1] 28 | input_channels = shape[2] 29 | d = 1.0 / numpy.sqrt(input_channels * w * h) 30 | else: 31 | d = 1.0 / numpy.sqrt(shape[0]) 32 | 33 | init_W = tf.random_uniform_initializer(-d, d) 34 | init_b = tf.random_uniform_initializer(-d, d) 35 | return init_W, init_b 36 | 37 | def conv2d(x, output_dim, kernel=(5, 5), stride=(2, 2), 38 | activation=tf.nn.relu, init_W=None, init_b=None, name='conv', padding='VALID'): 39 | 40 | assert len(x.get_shape().as_list()) == 4 41 | shape = (kernel[0], kernel[1], x.get_shape().as_list()[-1], output_dim) 42 | _W, _b = HeUniform(shape) 43 | if init_W is None: init_W = _W 44 | if init_b is None: init_b = _b 45 | 46 | with tf.variable_scope(name): 47 | W = get_variable_on_cpu(shape=shape, initializer=init_W, dtype=tf.float32, name='weight') 48 | b = get_variable_on_cpu(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias') 49 | 50 | conv = tf.nn.conv2d(input=x, filter=W, strides=(1, stride[0], stride[1], 1), padding=padding) 51 | if activation: 52 | conv = activation(tf.nn.bias_add(conv, b)) 53 | else: 54 | conv = tf.nn.bias_add(conv, b) 55 | 56 | return conv 57 | 58 | def linear(x, output_dim, activation=tf.nn.relu, init_W=None, init_b=None, name='linear'): 59 | 60 | if len(x.get_shape().as_list()) > 2: 61 | shape = x.get_shape().as_list() 62 | x = tf.reshape(x, shape=(-1, numpy.prod(shape[1:]))) 63 | 64 | shape = (x.get_shape().as_list()[-1], output_dim) 65 | _W, _b = HeUniform(shape) 66 | if init_W is None: init_W = _W 67 | if init_b is None: init_b = _b 68 | 69 | with tf.variable_scope(name): 70 | W = get_variable_on_cpu(shape=shape, initializer=init_W, dtype=tf.float32, name='weight') 71 | b = get_variable_on_cpu(shape=(output_dim,), initializer=init_b, dtype=tf.float32, name='bias') 72 | 73 | linear = tf.matmul(x, W) + b 74 | if activation: 75 | linear = activation(linear) 76 | 77 | return linear 78 | 79 | 80 | -------------------------------------------------------------------------------- /Chapter05/lstm_policy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 31 May 2017 3 | 4 | @author: ywz 5 | ''' 6 | import tensorflow as tf 7 | from layer import conv2d, linear 8 | 9 | 10 | class LSTMPolicy: 11 | 12 | def __init__(self, input_shape=(84, 84, 4), n_outputs=4, network_type='cnn'): 13 | 14 | self.width = input_shape[0] 15 | self.height = input_shape[1] 16 | self.channel = input_shape[2] 17 | self.n_outputs = n_outputs 18 | self.network_type = 
network_type 19 | self.entropy_beta = 0.01 20 | 21 | self.x = tf.placeholder(dtype=tf.float32, 22 | shape=(None, self.channel, self.width, self.height)) 23 | self.build_model() 24 | 25 | def build_model(self): 26 | 27 | self.net = {} 28 | self.net['input'] = tf.transpose(self.x, perm=(0, 2, 3, 1)) 29 | 30 | if self.network_type == 'cnn': 31 | self.net['conv1'] = conv2d(self.net['input'], 16, kernel=(8, 8), stride=(4, 4), name='conv1') 32 | self.net['conv2'] = conv2d(self.net['conv1'], 32, kernel=(4, 4), stride=(2, 2), name='conv2') 33 | self.net['feature'] = linear(self.net['conv2'], 256, name='fc1') 34 | else: 35 | # MLP for testing 36 | self.net['fc1'] = linear(self.net['input'], 50, init_b = tf.constant_initializer(0.0), name='fc1') 37 | self.net['feature'] = linear(self.net['fc1'], 50, init_b = tf.constant_initializer(0.0), name='fc2') 38 | 39 | num_units = self.net['feature'].get_shape().as_list()[-1] 40 | self.lstm = tf.contrib.rnn.BasicLSTMCell(num_units=num_units, forget_bias=0.0, state_is_tuple=True) 41 | self.init_state = self.lstm.zero_state(batch_size=1, dtype=tf.float32) 42 | 43 | step_size = tf.shape(self.x)[:1] 44 | feature = tf.expand_dims(self.net['feature'], axis=0) 45 | lstm_outputs, lstm_state = tf.nn.dynamic_rnn(self.lstm, feature, 46 | initial_state=self.init_state, 47 | sequence_length=step_size, 48 | time_major=False) 49 | outputs = tf.reshape(lstm_outputs, shape=(-1, num_units)) 50 | self.final_state = lstm_state 51 | 52 | self.net['value'] = tf.reshape(linear(outputs, 1, activation=None, name='value', 53 | init_b = tf.constant_initializer(0.0)), 54 | shape=(-1,)) 55 | 56 | self.net['logits'] = linear(outputs, self.n_outputs, activation=None, name='logits', 57 | init_b = tf.constant_initializer(0.0)) 58 | 59 | self.net['policy'] = tf.nn.softmax(self.net['logits'], name='policy') 60 | self.net['log_policy'] = tf.nn.log_softmax(self.net['logits'], name='log_policy') 61 | 62 | self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) 63 | 64 | def build_gradient_op(self, clip_grad=None): 65 | 66 | self.action = tf.placeholder(dtype=tf.float32, shape=(None, self.n_outputs), name='action') 67 | self.reward = tf.placeholder(dtype=tf.float32, shape=(None,), name='reward') 68 | self.advantage = tf.placeholder(dtype=tf.float32, shape=(None,), name='advantage') 69 | 70 | value = self.net['value'] 71 | policy = self.net['policy'] 72 | log_policy = self.net['log_policy'] 73 | 74 | entropy = -tf.reduce_sum(policy * log_policy, axis=1) 75 | p_loss = -tf.reduce_sum(tf.reduce_sum(log_policy * self.action, axis=1) * self.advantage + 76 | self.entropy_beta * entropy) 77 | v_loss = 0.5 * tf.reduce_sum((value - self.reward) ** 2) 78 | total_loss = p_loss + v_loss 79 | 80 | self.gradients = tf.gradients(total_loss, self.vars) 81 | if clip_grad is not None: 82 | self.gradients, _ = tf.clip_by_global_norm(self.gradients, clip_grad) 83 | 84 | # Add summaries 85 | tf.summary.scalar("policy_loss", p_loss, collections=['policy_network']) 86 | tf.summary.scalar("value_loss", v_loss, collections=['policy_network']) 87 | tf.summary.scalar("entropy", tf.reduce_mean(entropy), collections=['policy_network']) 88 | # tf.summary.scalar("grad_global_norm", tf.global_norm(self.gradients), collections=['policy_network']) 89 | self.summary_op = tf.summary.merge_all('policy_network') 90 | 91 | return self.gradients 92 | 93 | def run_initial_state(self, sess): 94 | return sess.run(self.init_state) 95 | 96 | def run_value(self, sess, state, cell, *args): 97 | 
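# Unlike FFPolicy.run_value, the LSTM variants also take the recurrent cell state
# (c, h) returned by run_initial_state or by the previous run_policy_and_value call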
feed_dict={self.x: state, self.init_state[0]: cell[0], self.init_state[1]: cell[1]} 98 | value = sess.run(self.net['value'], feed_dict=feed_dict) 99 | return value 100 | 101 | def run_policy_and_value(self, sess, state, cell, *args): 102 | feed_dict={self.x: state, self.init_state[0]: cell[0], self.init_state[1]: cell[1]} 103 | policy, value, final_state = sess.run([self.net['policy'], self.net['value'], self.final_state], 104 | feed_dict=feed_dict) 105 | return policy, value, final_state 106 | 107 | def get_feed_dict(self, states, actions, rewards, advantages, cell, *args): 108 | feed_dict={self.x: states, self.action: actions, 109 | self.reward: rewards, self.advantage: advantages, 110 | self.init_state[0]: cell[0], self.init_state[1]: cell[1]} 111 | return feed_dict 112 | 113 | -------------------------------------------------------------------------------- /Chapter05/minecraft/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 19, 2018 3 | 4 | @author: ywz 5 | ''' 6 | -------------------------------------------------------------------------------- /Chapter05/minecraft/game.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 10, 2018 3 | 4 | @author: ywz 5 | ''' 6 | import gym 7 | import gym_minecraft 8 | import minecraft_py 9 | import numpy, time 10 | from utils import cv2_resize_image 11 | 12 | 13 | class Game: 14 | 15 | def __init__(self, name='MinecraftBasic-v0', discrete_movement=False): 16 | 17 | self.env = gym.make(name) 18 | if discrete_movement: 19 | self.env.init(start_minecraft=True, allowDiscreteMovement=["move", "turn"]) 20 | else: 21 | self.env.init(start_minecraft=True, allowContinuousMovement=["move", "turn"]) 22 | self.actions = list(range(self.env.action_space.n)) 23 | frame = self.env.reset() 24 | 25 | self.frame_skip = 1 26 | self.total_reward = 0 27 | self.crop_size = 84 28 | 29 | # Frame buffer 30 | self.buffer_size = 8 31 | self.buffer_index = 0 32 | self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)] 33 | self.last_frame = frame 34 | 35 | def rgb_to_gray(self, im): 36 | return numpy.dot(im, [0.2126, 0.7152, 0.0722]) 37 | 38 | def set_params(self, crop_size=84, frame_skip=4): 39 | 40 | self.crop_size = crop_size 41 | self.frame_skip = frame_skip 42 | 43 | frame = self.env.reset() 44 | self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)] 45 | self.last_frame = frame 46 | 47 | def reset(self): 48 | frame = self.env.reset() 49 | self.total_reward = 0 50 | self.buffer_index = 0 51 | self.buffer = [self.crop(self.rgb_to_gray(frame)) for _ in range(self.buffer_size)] 52 | self.last_frame = frame 53 | 54 | def add_frame_to_buffer(self, frame): 55 | self.buffer_index = self.buffer_index % self.buffer_size 56 | self.buffer[self.buffer_index] = frame 57 | self.buffer_index += 1 58 | 59 | def get_available_actions(self): 60 | return list(range(len(self.actions))) 61 | 62 | def get_feedback_size(self): 63 | return (self.crop_size, self.crop_size) 64 | 65 | def crop(self, frame): 66 | feedback = cv2_resize_image(frame, 67 | resized_shape=(self.crop_size, self.crop_size), 68 | method='scale', crop_offset=0) 69 | return feedback 70 | 71 | def get_current_feedback(self, num_frames=4): 72 | assert num_frames < self.buffer_size, "Frame buffer is not large enough." 
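# buffer_index points one slot past the most recently written frame, so the code below
# returns the last num_frames grayscale frames newest-first from the ring buffer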
73 | index = self.buffer_index - 1 74 | frames = [numpy.expand_dims(self.buffer[index - k], axis=0) for k in range(num_frames)] 75 | if num_frames > 1: 76 | return numpy.concatenate(frames, axis=0) 77 | else: 78 | return frames[0] 79 | 80 | def get_total_reward(self): 81 | return self.total_reward 82 | 83 | def play_action(self, action, num_frames=4): 84 | 85 | reward = 0 86 | termination = 0 87 | for i in range(self.frame_skip): 88 | a = self.actions[action] 89 | frame, r, done, _ = self.env.step(a) 90 | reward += r 91 | if i == self.frame_skip - 2: 92 | self.last_frame = frame 93 | if done: 94 | termination = 1 95 | self.add_frame_to_buffer(self.crop(numpy.maximum(self.rgb_to_gray(frame), self.rgb_to_gray(self.last_frame)))) 96 | 97 | r = numpy.clip(reward, -1, 1) 98 | self.total_reward += reward 99 | 100 | return r, self.get_current_feedback(num_frames), termination 101 | 102 | def draw(self): 103 | time.sleep(1 / 120.0) 104 | self.env.render(mode='human') 105 | -------------------------------------------------------------------------------- /Chapter05/parameter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jan 24, 2017 3 | 4 | @author: ywz 5 | ''' 6 | import pickle 7 | from utils import log_uniform 8 | 9 | class Parameter: 10 | 11 | def __init__(self, lr, directory=None): 12 | 13 | self.directory = directory 14 | 15 | if isinstance(lr, tuple): 16 | assert len(lr) == 2 17 | assert lr[0] < lr[1] 18 | self.learning_rate = log_uniform(lr[0], lr[1]) 19 | else: 20 | self.learning_rate = lr 21 | 22 | self.gamma = 0.99 23 | self.num_history_frames = 4 24 | self.iteration_num = 100000 25 | self.async_update_interval = 5 26 | 27 | self.rho = 0.99 28 | self.rmsprop_epsilon = 1e-6 29 | self.update_method = 'rmsprop' 30 | self.clip_delta = 0 31 | self.max_iter_num = 10 ** 8 32 | self.network_type = 'cnn' 33 | self.input_scale = 255.0 34 | 35 | def get(self): 36 | 37 | param = {} 38 | param['directory'] = self.directory 39 | param['learning_rate'] = self.learning_rate 40 | 41 | param['gamma'] = self.gamma 42 | param['num_frames'] = self.num_history_frames 43 | param['iteration_num'] = self.iteration_num 44 | param['async_update_interval'] = self.async_update_interval 45 | 46 | param['rho'] = self.rho 47 | param['rmsprop_epsilon'] = self.rmsprop_epsilon 48 | param['update_method'] = self.update_method 49 | param['clip_delta'] = self.clip_delta 50 | param['max_iter_num'] = self.max_iter_num 51 | param['network_type'] = self.network_type 52 | param['input_scale'] = self.input_scale 53 | 54 | return param 55 | 56 | def __str__(self): 57 | param = self.get() 58 | strs = ["{}: {}".format(key, value) for key, value in param.items()] 59 | return "\n".join(strs) 60 | 61 | def save(self, filename): 62 | assert self.directory is not None 63 | filename = '{}/{}'.format(self.directory, filename) 64 | with open(filename, 'wb') as f: 65 | pickle.dump(self.get(), f) 66 | -------------------------------------------------------------------------------- /Chapter05/save/breakout/train/log_0/events.out.tfevents.1532007719.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/breakout/train/log_0/events.out.tfevents.1532007719.ywz-PC -------------------------------------------------------------------------------- 
/Chapter05/save/breakout/train/log_1/events.out.tfevents.1532007719.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/breakout/train/log_1/events.out.tfevents.1532007719.ywz-PC -------------------------------------------------------------------------------- /Chapter05/save/demo/train/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "a3c_model.ckpt" 2 | all_model_checkpoint_paths: "a3c_model.ckpt" 3 | -------------------------------------------------------------------------------- /Chapter05/save/demo/train/log_0/events.out.tfevents.1532007504.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/demo/train/log_0/events.out.tfevents.1532007504.ywz-PC -------------------------------------------------------------------------------- /Chapter05/save/demo/train/log_1/events.out.tfevents.1532007504.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/demo/train/log_1/events.out.tfevents.1532007504.ywz-PC -------------------------------------------------------------------------------- /Chapter05/save/minecraftbasic-v0/train/log_0/events.out.tfevents.1532007895.ywz-PC: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter05/save/minecraftbasic-v0/train/log_0/events.out.tfevents.1532007895.ywz-PC -------------------------------------------------------------------------------- /Chapter05/test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 31 May 2017 3 | 4 | @author: ywz 5 | ''' 6 | import time 7 | import argparse, os, sys, signal 8 | import tensorflow as tf 9 | from a3c import A3C 10 | from cluster import cluster_spec 11 | from environment import new_environment 12 | 13 | def shutdown(signal, frame): 14 | print('Received signal {}: exiting'.format(signal)) 15 | sys.exit(128 + signal) 16 | 17 | def test(args, server): 18 | 19 | log_dir = os.path.join(args.log_dir, '{}/train'.format(args.env)) 20 | game, parameter = new_environment(name=args.env, test=True) 21 | a3c = A3C(game, log_dir, parameter.get(), agent_index=args.task, callback=game.draw) 22 | 23 | config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)]) 24 | with tf.Session(target=server.target, config=config) as sess: 25 | saver = tf.train.Saver() 26 | a3c.load(sess, saver, model_name='best_a3c_model.ckpt') 27 | a3c.evaluate(sess, n_episode=10, saver=None, verbose=True) 28 | 29 | def main(): 30 | 31 | parser = argparse.ArgumentParser(description=None) 32 | parser.add_argument('-t', '--task', default=0, type=int, help='Task index') 33 | parser.add_argument('-j', '--job_name', default="worker", type=str, help='worker or ps') 34 | parser.add_argument('-w', '--num_workers', default=1, type=int, help='Number of workers') 35 | parser.add_argument('-l', '--log_dir', default="save", 
type=str, help='Log directory path') 36 | parser.add_argument('-e', '--env', default="demo", type=str, help='Environment') 37 | 38 | args = parser.parse_args() 39 | spec = cluster_spec(args.num_workers, 1) 40 | cluster = tf.train.ClusterSpec(spec) 41 | 42 | signal.signal(signal.SIGHUP, shutdown) 43 | signal.signal(signal.SIGINT, shutdown) 44 | signal.signal(signal.SIGTERM, shutdown) 45 | 46 | if args.job_name == "worker": 47 | server = tf.train.Server(cluster, 48 | job_name="worker", 49 | task_index=args.task, 50 | config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2)) 51 | test(args, server) 52 | else: 53 | server = tf.train.Server(cluster, 54 | job_name="ps", 55 | task_index=args.task, 56 | config=tf.ConfigProto(device_filters=["/job:ps"])) 57 | # server.join() 58 | while True: 59 | time.sleep(1000) 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /Chapter05/timer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jan 20, 2017 3 | 4 | @author: ywz 5 | ''' 6 | import time 7 | 8 | class Timer: 9 | 10 | def __init__(self): 11 | self.total_time = 0 12 | self.current_time = 0 13 | self.name = '' 14 | 15 | def reset(self): 16 | self.total_time = 0 17 | self.current_time = 0 18 | 19 | def set_name(self, name): 20 | self.name = name 21 | 22 | def begin(self): 23 | self.current_time = time.time() 24 | 25 | def end(self): 26 | self.total_time += time.time() - self.current_time 27 | 28 | def total_time(self): 29 | return self.total_time 30 | 31 | def print(self): 32 | print('{} took {}s'.format(self.name, self.total_time)) 33 | -------------------------------------------------------------------------------- /Chapter05/train.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 31 May 2017 3 | 4 | @author: ywz 5 | ''' 6 | import argparse, os, sys, cluster 7 | from six.moves import shlex_quote #@UnresolvedImport 8 | 9 | parser = argparse.ArgumentParser(description="Run commands") 10 | parser.add_argument('-w', '--num_workers', default=1, type=int, 11 | help="Number of workers") 12 | parser.add_argument('-e', '--env', type=str, default="demo", 13 | help="Environment") 14 | parser.add_argument('-l', '--log_dir', type=str, default="save", 15 | help="Log directory path") 16 | 17 | def new_cmd(session, name, cmd, logdir, shell): 18 | if isinstance(cmd, (list, tuple)): 19 | cmd = " ".join(shlex_quote(str(v)) for v in cmd) 20 | return name, "tmux send-keys -t {}:{} {} Enter".format(session, name, shlex_quote(cmd)) 21 | 22 | def create_commands(session, num_workers, logdir, env, shell='bash'): 23 | 24 | base_cmd = ['CUDA_VISIBLE_DEVICES=', 25 | sys.executable, 26 | 'worker.py', 27 | '--log_dir', logdir, 28 | '--num_workers', str(num_workers), 29 | '--env', env] 30 | 31 | cmds_map = [new_cmd(session, "ps", base_cmd + ["--job_name", "ps"], logdir, shell)] 32 | for i in range(num_workers): 33 | cmd = base_cmd + ["--job_name", "worker", "--task", str(i)] 34 | cmds_map.append(new_cmd(session, "w-%d" % i, cmd, logdir, shell)) 35 | cmds_map.append(new_cmd(session, "htop", ["htop"], logdir, shell)) 36 | 37 | windows = [v[0] for v in cmds_map] 38 | notes = ["Use `tmux attach -t {}` to watch process output".format(session), 39 | "Use `tmux kill-session -t {}` to kill the job".format(session), 40 | "Use `ssh -L PORT:SERVER_IP:SERVER_PORT username@server_ip` to remote Tensorboard"] 41 | 42 | 
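# e.g. `python train.py -w 4 -e breakout` builds one ps task, four workers and an
# htop window inside a tmux session named a3c; the commands below first free the
# worker ports and kill any previous session before recreating it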
cmds = ["kill $(lsof -i:{}-{} -t) > /dev/null 2>&1".format(cluster.PORT, num_workers+cluster.PORT), 43 | "tmux kill-session -t {}".format(session), 44 | "tmux new-session -s {} -n {} -d {}".format(session, windows[0], shell)] 45 | 46 | for w in windows[1:]: 47 | cmds.append("tmux new-window -t {} -n {} {}".format(session, w, shell)) 48 | cmds.append("sleep 1") 49 | 50 | for _, cmd in cmds_map: 51 | cmds.append(cmd) 52 | return cmds, notes 53 | 54 | def main(): 55 | 56 | args = parser.parse_args() 57 | cmds, notes = create_commands("a3c", args.num_workers, args.log_dir, args.env) 58 | 59 | print("Executing the following commands:") 60 | print("\n".join(cmds)) 61 | 62 | os.environ["TMUX"] = "" 63 | os.system("\n".join(cmds)) 64 | 65 | print("Notes:") 66 | print('\n'.join(notes)) 67 | 68 | if __name__ == "__main__": 69 | main() 70 | 71 | -------------------------------------------------------------------------------- /Chapter05/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 8, 2016 3 | 4 | @author: a0096049 5 | ''' 6 | import math, random 7 | import numpy, cv2 8 | import skimage.transform 9 | import tensorflow as tf 10 | 11 | 12 | def preprocess_image(im, image_shape=(110, 84), crop_shape=84, crop_part='down'): 13 | 14 | im = skimage.transform.resize(im, image_shape, preserve_range=True) 15 | 16 | half = int(crop_shape / 2) 17 | h, w = im.shape 18 | if crop_part == 'center': 19 | im = im[h//2-half:h//2+half, w//2-half:w//2+half] 20 | if crop_part == 'down': 21 | im = im[h-crop_shape:h, w//2-half:w//2+half] 22 | 23 | return numpy.asarray(im, dtype=numpy.uint8) 24 | 25 | def cv2_resize_image(image, resized_shape=(84, 84), method='crop', crop_offset=8): 26 | 27 | height, width = image.shape 28 | resized_height, resized_width = resized_shape 29 | 30 | if method == 'crop': 31 | h = int(round(float(height) * resized_width / width)) 32 | resized = cv2.resize(image, (resized_width, h), interpolation=cv2.INTER_LINEAR) 33 | crop_y_cutoff = h - crop_offset - resized_height 34 | cropped = resized[crop_y_cutoff : crop_y_cutoff + resized_height, :] 35 | return numpy.asarray(cropped, dtype=numpy.uint8) 36 | elif method == 'scale': 37 | return numpy.asarray(cv2.resize(image, (resized_width, resized_height), 38 | interpolation=cv2.INTER_LINEAR), dtype=numpy.uint8) 39 | else: 40 | raise ValueError('Unrecognized image resize method.') 41 | 42 | def log_uniform(low, high): 43 | return math.exp(random.uniform(math.log(low), math.log(high))) 44 | 45 | def update_target_graph(from_vars, to_vars): 46 | 47 | op_holder = [] 48 | for from_var, to_var in zip(from_vars, to_vars): 49 | op_holder.append(to_var.assign(from_var)) 50 | 51 | return op_holder 52 | 53 | def create_optimizer(method, learning_rate, rho, epsilon): 54 | 55 | if method == 'rmsprop': 56 | opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate, 57 | decay=rho, 58 | epsilon=epsilon) 59 | elif method == 'adam': 60 | opt = tf.train.AdamOptimizer(learning_rate=learning_rate, 61 | beta1=rho) 62 | else: 63 | raise 64 | 65 | return opt 66 | 67 | -------------------------------------------------------------------------------- /Chapter05/worker.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 29 May 2017 3 | 4 | @author: ywz 5 | ''' 6 | import numpy, time, random 7 | import argparse, os, sys, signal 8 | import tensorflow as tf 9 | from a3c import A3C 10 | from cluster import cluster_spec 11 | from environment import 
new_environment 12 | 13 | def set_random_seed(seed): 14 | random.seed(seed) 15 | numpy.random.seed(seed) 16 | 17 | def delete_dir(path): 18 | if tf.gfile.Exists(path): 19 | tf.gfile.DeleteRecursively(path) 20 | tf.gfile.MakeDirs(path) 21 | return path 22 | 23 | def shutdown(signal, frame): 24 | print('Received signal {}: exiting'.format(signal)) 25 | sys.exit(128 + signal) 26 | 27 | def train(args, server): 28 | 29 | os.environ['OMP_NUM_THREADS'] = '1' 30 | set_random_seed(args.task * 17) 31 | log_dir = os.path.join(args.log_dir, '{}/train'.format(args.env)) 32 | if not tf.gfile.Exists(log_dir): 33 | tf.gfile.MakeDirs(log_dir) 34 | 35 | game, parameter = new_environment(args.env) 36 | a3c = A3C(game, log_dir, parameter.get(), agent_index=args.task, callback=None) 37 | 38 | global_vars = [v for v in tf.global_variables() if not v.name.startswith("local")] 39 | ready_op = tf.report_uninitialized_variables(global_vars) 40 | config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)]) 41 | 42 | with tf.Session(target=server.target, config=config) as sess: 43 | saver = tf.train.Saver() 44 | path = os.path.join(log_dir, 'log_%d' % args.task) 45 | writer = tf.summary.FileWriter(delete_dir(path), sess.graph_def) 46 | a3c.set_summary_writer(writer) 47 | 48 | if args.task == 0: 49 | sess.run(tf.global_variables_initializer()) 50 | else: 51 | while len(sess.run(ready_op)) > 0: 52 | print("Waiting for task 0 initializing the global variables.") 53 | time.sleep(1) 54 | a3c.run(sess, saver) 55 | 56 | def main(): 57 | 58 | parser = argparse.ArgumentParser(description=None) 59 | parser.add_argument('-t', '--task', default=0, type=int, help='Task index') 60 | parser.add_argument('-j', '--job_name', default="worker", type=str, help='worker or ps') 61 | parser.add_argument('-w', '--num_workers', default=1, type=int, help='Number of workers') 62 | parser.add_argument('-l', '--log_dir', default="save", type=str, help='Log directory path') 63 | parser.add_argument('-e', '--env', default="demo", type=str, help='Environment') 64 | 65 | args = parser.parse_args() 66 | spec = cluster_spec(args.num_workers, 1) 67 | cluster = tf.train.ClusterSpec(spec) 68 | 69 | signal.signal(signal.SIGHUP, shutdown) 70 | signal.signal(signal.SIGINT, shutdown) 71 | signal.signal(signal.SIGTERM, shutdown) 72 | 73 | if args.job_name == "worker": 74 | server = tf.train.Server(cluster, 75 | job_name="worker", 76 | task_index=args.task, 77 | config=tf.ConfigProto(intra_op_parallelism_threads=0, 78 | inter_op_parallelism_threads=0)) # Use default op_parallelism_threads 79 | train(args, server) 80 | else: 81 | server = tf.train.Server(cluster, 82 | job_name="ps", 83 | task_index=args.task, 84 | config=tf.ConfigProto(device_filters=["/job:ps"])) 85 | # server.join() 86 | while True: 87 | time.sleep(1000) 88 | 89 | if __name__ == "__main__": 90 | main() 91 | 92 | -------------------------------------------------------------------------------- /Chapter06/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter06/__init__.py -------------------------------------------------------------------------------- /Chapter06/commands.txt: -------------------------------------------------------------------------------- 1 | ### TRAINING 2 | 3 | # First epoch 4 | 5 | # Bootstrapping 6 | 7 | export MODEL_NAME=000000-bootstrap 8 | python main.py 
bootstrap /tmp/minigo_working_dir models/$MODEL_NAME 9 | 10 | # Selfplay 11 | python main.py selfplay models/$MODEL_NAME 12 | 13 | # Gather training data and shuffle 14 | python main.py gather 15 | 16 | # Train 17 | python main.py train /tmp/minigo_working_dir \ 18 | data/training_chunks models/000001-bootstrap -g 1 19 | 20 | # Second epoch onwards 21 | 22 | # Increment params 23 | export MODEL_NAME=000001-bootstrap 24 | python main.py selfplay models/$MODEL_NAME 25 | python main.py gather 26 | python main.py train /tmp/minigo_working_dir \ 27 | data/training_chunks models/000002-bootstrap -g 2 28 | 29 | ### TESTING 30 | 31 | # Export models 32 | export MINIGO_MODELS=/tmp/minigo-models 33 | cp models/000001* $MINIGO_MODELS 34 | 35 | # Execute selfplay 36 | python rl_loop.py selfplay --readouts=5 -v 3 37 | -------------------------------------------------------------------------------- /Chapter06/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter06/src/__init__.py -------------------------------------------------------------------------------- /Chapter06/src/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration store 3 | """ 4 | from constants import HYPERPARAMS 5 | 6 | class GOPARAMETERS: 7 | N = 9 8 | WHITE = -1 9 | EMPTY = 0 10 | BLACK = 1 11 | FILL = 2 12 | KO = 3 13 | UNKNOWN = 4 14 | MISSING_GROUP_ID = -1 15 | COL_NAMES = 'ABCDEFGHJKLMNOPQRST' 16 | SGF_COLUMNS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 17 | KGS_COLUMNS = 'ABCDEFGHJKLMNOPQRSTUVWXYZ' 18 | 19 | class GLOBAL_PARAMETER_STORE: 20 | # How many positions we should aggregate per 'chunk'. 21 | EXAMPLES_PER_RECORD = 10000 22 | # How many positions to draw from for our training window. 
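# (older self-play positions fall out of this window as new generations of games are added)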
23 | # AGZ used the most recent 500k games, which, assuming 250 moves/game = 125M 24 | WINDOW_SIZE = 125000000 25 | # Number of positions to look at per generation 26 | EXAMPLES_PER_GENERATION = 2000000 27 | # Number of selfplay games 28 | NUM_SELFPLAY_GAMES = 100 29 | # Positions per batch 30 | TRAIN_BATCH_SIZE = 16 31 | # Number of games before the selfplay workers will stop 32 | MAX_GAMES_PER_GENERATION = 10000 33 | # Proportion of games to holdout from training per generation 34 | HOLDOUT = 0.05 35 | # Number of leaves to consider simultaneously 36 | SIMULTANEOUS_LEAVES = 8 37 | # Step boundaries for changing the learning rate 38 | BOUNDARIES = [int(1e6), int(2e6)] 39 | # Learning rates corresponding to boundaries 40 | LEARNING_RATE = [1e-2, 1e-3, 1e-4] 41 | SGF_TEMPLATE = '''(;GM[1]FF[4]CA[UTF-8]AP[Minigo_sgfgenerator]RU[{ruleset}] 42 | SZ[{boardsize}]KM[{komi}]PW[{white_name}]PB[{black_name}]RE[{result}] 43 | {game_moves})''' 44 | PROGRAM_IDENTIFIER = "AlphaGoZero" 45 | TEMPERATURE_CUTOFF = int((GOPARAMETERS.N * GOPARAMETERS.N) / 12) 46 | # TFRecords related parameters 47 | SHUFFLE_BUFFER_SIZE = int(2*1e4) 48 | CYCLE_LENGTH = 16 49 | BLOCK_LENGTH = 64 50 | # Number of MCTS readouts we do during selfplay 51 | SELFPLAY_READOUTS = 1600 52 | # Default resign threshold 53 | RESIGN_THRESHOLD = -0.90 54 | # Number of MCTS readouts we do during evaluation 55 | EVALUATION_READOUTS = 400 56 | # Number of games to play during evaluation 57 | EVALUATION_GAMES = 16 58 | # Buffer size for when validating model 59 | VALIDATION_BUFFER_SIZE = 1000 60 | # Number of global steps when validating model 61 | VALIDATION_NUMBER_OF_STEPS = 1000 62 | 63 | class MCTSPARAMETERS: 64 | # 505 moves for 19x19, 113 for 9x9 65 | MAX_DEPTH = (GOPARAMETERS.N ** 2) * 1.4 66 | # Exploration constant 67 | c_PUCT = 1.38 68 | # Dirichlet noise, as a function of GOPARAMETERS.N 69 | DIRICHLET_NOISE = 0.03 * 361 / (GOPARAMETERS.N ** 2) 70 | 71 | class AGENTPARAMETERS: 72 | SECONDS_PER_MOVE = 5 73 | 74 | ALL_POSITIONS = [(i, j) for i in range(GOPARAMETERS.N) for j in range(GOPARAMETERS.N)] 75 | NEIGHBORS = {(x, y): list(filter(lambda c: c[0] % GOPARAMETERS.N == c[0] and c[1] % GOPARAMETERS.N == c[1], [ 76 | (x+1, y), (x-1, y), (x, y+1), (x, y-1)])) for x, y in ALL_POSITIONS} 77 | DIAGONALS = {(x, y): list(filter(lambda c: c[0] % GOPARAMETERS.N == c[0] and c[1] % GOPARAMETERS.N == c[1], [ 78 | (x+1, y+1), (x+1, y-1), (x-1, y+1), (x-1, y-1)])) for x, y in ALL_POSITIONS} 79 | 80 | """ 81 | k: number of filters (AlphaGoZero used 256). We use 128 by 82 | default for a 19x19 go board. 83 | fc_width: Dimensionality of the fully connected linear layer 84 | num_shared_layers: number of shared residual blocks. AGZ used both 19 85 | and 39. Here we use 19 because it's faster to train. 86 | l2_strength: The L2 regularization parameter. 
87 | momentum: The momentum parameter for training 88 | """ 89 | NETWORK_HYPERPARAMETERS = { 90 | HYPERPARAMS.NUM_FILTERS: 128, # Width of each conv layer 91 | HYPERPARAMS.FC_WIDTH: 2 * 128, # Width of each fully connected layer 92 | HYPERPARAMS.NUMSHAREDLAYERS: 19, # Number of shared trunk layers 93 | HYPERPARAMS.BETA: 1e-4, # Regularization strength 94 | HYPERPARAMS.MOMENTUM: 0.9, # Momentum used in SGD 95 | } 96 | -------------------------------------------------------------------------------- /Chapter06/src/constants.py: -------------------------------------------------------------------------------- 1 | MODEL_NUM_REGEX = "^\d{6}" 2 | MODEL_NAME_REGEX = "^\d{6}(-\w+)+" 3 | 4 | class HYPERPARAMS: 5 | BETA = 'beta' 6 | MOMENTUM = 'momentum' 7 | NUMSHAREDLAYERS = 'num_shared_layers' 8 | FC_WIDTH = 'fc_width' 9 | NUM_FILTERS = 'k' 10 | EPSILON = "epsilon" 11 | 12 | class PATHS: 13 | MODELS_DIR = "models/" 14 | SELFPLAY_DIR = 'data/selfplay/' 15 | HOLDOUT_DIR = "data/holdout/" 16 | SGF_DIR = "data/sgf/" 17 | TRAINING_CHUNK_DIR = "data/training_chunks/" 18 | ESTIMATOR_WORKING_DIR = 'estimator_working_dir/' 19 | INITIAL_CHECKPOINT_NAME = "model.ckpt-1" 20 | 21 | class FEATUREPARAMETERS: 22 | NUM_CHANNELS = 17 23 | -------------------------------------------------------------------------------- /Chapter06/src/features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from config import GOPARAMETERS 4 | 5 | def stone_features(board_state): 6 | # 16 planes, where every other plane represents the stones of a particular color 7 | # which means we track the stones of the last 8 moves. 8 | features = np.zeros([16, GOPARAMETERS.N, GOPARAMETERS.N], dtype=np.uint8) 9 | 10 | num_deltas_avail = board_state.board_deltas.shape[0] 11 | cumulative_deltas = np.cumsum(board_state.board_deltas, axis=0) 12 | last_eight = np.tile(board_state.board, [8, 1, 1]) 13 | last_eight[1:num_deltas_avail + 1] -= cumulative_deltas 14 | last_eight[num_deltas_avail +1:] = last_eight[num_deltas_avail].reshape(1, GOPARAMETERS.N, GOPARAMETERS.N) 15 | 16 | features[::2] = last_eight == board_state.to_play 17 | features[1::2] = last_eight == -board_state.to_play 18 | return np.rollaxis(features, 0, 3) 19 | 20 | def color_to_play_feature(board_state): 21 | # 1 plane representing which color is to play 22 | # The plane is filled with 1's if the color to play is black; 0's otherwise 23 | if board_state.to_play == GOPARAMETERS.BLACK: 24 | return np.ones([GOPARAMETERS.N, GOPARAMETERS.N, 1], dtype=np.uint8) 25 | else: 26 | return np.zeros([GOPARAMETERS.N, GOPARAMETERS.N, 1], dtype=np.uint8) 27 | 28 | def extract_features(board_state): 29 | stone_feat = stone_features(board_state=board_state) 30 | turn_feat = color_to_play_feature(board_state=board_state) 31 | all_features = np.concatenate([stone_feat, turn_feat], axis=2) 32 | return all_features 33 | -------------------------------------------------------------------------------- /Chapter06/src/preprocessing.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from config import GOPARAMETERS, GLOBAL_PARAMETER_STORE 8 | from constants import FEATUREPARAMETERS 9 | from features import extract_features 10 | 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.INFO) 13 | 14 | TF_RECORD_CONFIG = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB) 15 | 16 | def 
_one_hot(index): 17 | onehot = np.zeros([GOPARAMETERS.N * GOPARAMETERS.N + 1], dtype=np.float32) 18 | onehot[index] = 1 19 | return onehot 20 | 21 | 22 | def get_input_tensors(list_tf_records, buffer_size=GLOBAL_PARAMETER_STORE.SHUFFLE_BUFFER_SIZE): 23 | logger.info("Getting input data and tensors") 24 | dataset = process_tf_records(list_tf_records=list_tf_records, 25 | buffer_size=buffer_size) 26 | dataset = dataset.filter(lambda input_tensor: tf.equal(tf.shape(input_tensor)[0], 27 | GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE)) 28 | dataset = dataset.map(parse_batch_tf_example) 29 | logger.info("Finished parsing") 30 | return dataset.make_one_shot_iterator().get_next() 31 | 32 | 33 | def create_dataset_from_selfplay(data_extracts): 34 | return (create_tf_train_example(extract_features(board_state), pi, result) 35 | for board_state, pi, result in data_extracts) 36 | 37 | 38 | def shuffle_tf_examples(batch_size, records_to_shuffle): 39 | tf_dataset = process_tf_records(records_to_shuffle, batch_size=batch_size) 40 | iterator = tf_dataset.make_one_shot_iterator() 41 | next_dataset_batch = iterator.get_next() 42 | sess = tf.Session() 43 | while True: 44 | try: 45 | result = sess.run(next_dataset_batch) 46 | yield list(result) 47 | except tf.errors.OutOfRangeError: 48 | break 49 | 50 | 51 | def create_tf_train_example(board_state, pi, result): 52 | board_state_as_tf_feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[board_state.tostring()])) 53 | pi_as_tf_feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[pi.tostring()])) 54 | value_as_tf_feature = tf.train.Feature(float_list=tf.train.FloatList(value=[result])) 55 | 56 | tf_example = tf.train.Example(features=tf.train.Features(feature={ 57 | 'x': board_state_as_tf_feature, 58 | 'pi': pi_as_tf_feature, 59 | 'z': value_as_tf_feature 60 | })) 61 | 62 | return tf_example 63 | 64 | def write_tf_examples(record_path, tf_examples, serialize=True): 65 | with tf.python_io.TFRecordWriter(record_path, options=TF_RECORD_CONFIG) as tf_record_writer: 66 | for tf_example in tf_examples: 67 | if serialize: 68 | tf_record_writer.write(tf_example.SerializeToString()) 69 | else: 70 | tf_record_writer.write(tf_example) 71 | 72 | def parse_batch_tf_example(example_batch): 73 | features = { 74 | 'x': tf.FixedLenFeature([], tf.string), 75 | 'pi': tf.FixedLenFeature([], tf.string), 76 | 'z': tf.FixedLenFeature([], tf.float32), 77 | } 78 | parsed_tensors = tf.parse_example(example_batch, features) 79 | 80 | # Get the board state 81 | x = tf.cast(tf.decode_raw(parsed_tensors['x'], tf.uint8), tf.float32) 82 | x = tf.reshape(x, [GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE, GOPARAMETERS.N, 83 | GOPARAMETERS.N, FEATUREPARAMETERS.NUM_CHANNELS]) 84 | 85 | # Get the policy target, which is the distribution of possible moves 86 | # Each target is a vector of length of board * length of board + 1 87 | distribution_of_moves = tf.decode_raw(parsed_tensors['pi'], tf.float32) 88 | distribution_of_moves = tf.reshape(distribution_of_moves, 89 | [GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE, GOPARAMETERS.N * GOPARAMETERS.N + 1]) 90 | 91 | # Get the result of the game 92 | # The result is simply a scalar 93 | result_of_game = parsed_tensors['z'] 94 | result_of_game.set_shape([GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE]) 95 | 96 | return (x, {'pi_label': distribution_of_moves, 'z_label': result_of_game}) 97 | 98 | 99 | def process_tf_records(list_tf_records, shuffle_records=True, 100 | buffer_size=GLOBAL_PARAMETER_STORE.SHUFFLE_BUFFER_SIZE, 101 | 
batch_size=GLOBAL_PARAMETER_STORE.TRAIN_BATCH_SIZE): 102 | 103 | if shuffle_records: 104 | random.shuffle(list_tf_records) 105 | 106 | list_dataset = tf.data.Dataset.from_tensor_slices(list_tf_records) 107 | 108 | tensors_dataset = list_dataset.interleave(map_func=lambda x: tf.data.TFRecordDataset(x, compression_type='ZLIB'), 109 | cycle_length=GLOBAL_PARAMETER_STORE.CYCLE_LENGTH, 110 | block_length=GLOBAL_PARAMETER_STORE.BLOCK_LENGTH) 111 | tensors_dataset = tensors_dataset.repeat(1).shuffle(buffer_size=buffer_size).batch(batch_size) 112 | 113 | return tensors_dataset 114 | 115 | -------------------------------------------------------------------------------- /Chapter06/src/train.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | from utils import timer 4 | 5 | import os 6 | 7 | from constants import PATHS 8 | 9 | import logging 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | def main(): 14 | 15 | if not os.path.exists(PATHS.SELFPLAY_DIR): 16 | with timer("Initialize"): 17 | logger.info('==========================================') 18 | logger.info("============ Initializing...==============") 19 | logger.info('==========================================') 20 | res = subprocess.call("python controller.py initialize-random-model", shell=True) 21 | 22 | with timer('Initial Selfplay'): 23 | logger.info('=======================================') 24 | logger.info('============ Selfplaying...=============') 25 | logger.info('=======================================') 26 | subprocess.call('python controller.py selfplay', shell=True) 27 | 28 | while True: 29 | with timer("Aggregate"): 30 | logger.info('=========================================') 31 | logger.info("============ Aggregating...==============") 32 | logger.info('=========================================') 33 | res = subprocess.call("python controller.py aggregate", shell=True) 34 | if res != 0: 35 | logger.info("Failed to aggregate") 36 | sys.exit(1) 37 | 38 | with timer("Train"): 39 | logger.info('=======================================') 40 | logger.info("============ Training...===============") 41 | logger.info('=======================================') 42 | subprocess.call("python controller.py train", shell=True) 43 | 44 | with timer('Selfplay'): 45 | logger.info('=======================================') 46 | logger.info('============ Selfplaying...=============') 47 | logger.info('=======================================') 48 | subprocess.call('python controller.py selfplay', shell=True) 49 | 50 | with timer("Validate"): 51 | logger.info('=======================================') 52 | logger.info("============ Validating...=============") 53 | logger.info('=======================================') 54 | subprocess.call("python controller.py validate", shell=True) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /Chapter07/RL chatbot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run data_parser.py\n", 10 | "%run feature_extracter.py \n", 11 | "from train import train\n", 12 | "from test import test\n", 13 | "train(False)\n", 14 | "test() #Argument: model path to be used for testing, if None, the default model path is used" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 |
"execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.6.6" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 2 46 | } 47 | -------------------------------------------------------------------------------- /Chapter07/convert_checkpoint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter07/convert_checkpoint.py -------------------------------------------------------------------------------- /Chapter07/data_parser.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import pickle 3 | import codecs 4 | import re 5 | import os 6 | import time 7 | import numpy as np 8 | 9 | """ This module cleans and preprocesses the text in the training dataset 10 | """ 11 | 12 | def preProBuildWordVocab(word_count_threshold=5, all_words_path='data/all_words.txt'): 13 | # borrowed this function from NeuralTalk 14 | 15 | if not os.path.exists(all_words_path): 16 | parse_all_words(all_words_path) 17 | 18 | corpus = open(all_words_path, 'r').read().split('\n')[:-1] 19 | captions = np.asarray(corpus, dtype=np.object) 20 | 21 | captions = map(lambda x: x.replace('.', ''), captions) 22 | captions = map(lambda x: x.replace(',', ''), captions) 23 | captions = map(lambda x: x.replace('"', ''), captions) 24 | captions = map(lambda x: x.replace('\n', ''), captions) 25 | captions = map(lambda x: x.replace('?', ''), captions) 26 | captions = map(lambda x: x.replace('!', ''), captions) 27 | captions = map(lambda x: x.replace('\\', ''), captions) 28 | captions = map(lambda x: x.replace('/', ''), captions) 29 | 30 | print('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold)) 31 | word_counts = {} 32 | nsents = 0 33 | for sent in captions: 34 | nsents += 1 35 | for w in sent.lower().split(' '): 36 | 37 | word_counts[w] = word_counts.get(w, 0) + 1 38 | vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold] 39 | print('filtered words from %d to %d' % (len(word_counts), len(vocab))) 40 | 41 | ixtoword = {} 42 | ixtoword[0] = '' 43 | ixtoword[1] = '' 44 | ixtoword[2] = '' 45 | ixtoword[3] = '' 46 | 47 | wordtoix = {} 48 | wordtoix[''] = 0 49 | wordtoix[''] = 1 50 | wordtoix[''] = 2 51 | wordtoix[''] = 3 52 | 53 | for idx, w in enumerate(vocab): 54 | wordtoix[w] = idx+4 55 | ixtoword[idx+4] = w 56 | 57 | word_counts[''] = nsents 58 | word_counts[''] = nsents 59 | word_counts[''] = nsents 60 | word_counts[''] = nsents 61 | 62 | bias_init_vector = np.array([1.0 * word_counts[ixtoword[i]] for i in ixtoword]) 63 | bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies 64 | bias_init_vector = np.log(bias_init_vector) 65 | bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range 66 | 67 | return wordtoix, ixtoword, bias_init_vector 68 | 69 | def parse_all_words(all_words_path): 70 | raw_movie_lines = open('data/movie_lines.txt', 'r', 
encoding='utf-8', errors='ignore').read().split('\n')[:-1] 71 | 72 | with codecs.open(all_words_path, "w", encoding='utf-8', errors='ignore') as f: 73 | for line in raw_movie_lines: 74 | line = line.split(' +++$+++ ') 75 | utterance = line[-1] 76 | f.write(utterance + '\n') 77 | 78 | """ Extract only the vocabulary part of the data """ 79 | def refine(data): 80 | words = re.findall("[a-zA-Z'-]+", data) 81 | words = ["".join(word.split("'")) for word in words] 82 | # words = ["".join(word.split("-")) for word in words] 83 | data = ' '.join(words) 84 | return data 85 | 86 | if __name__ == '__main__': 87 | parse_all_words('data/all_words.txt') 88 | 89 | raw_movie_lines = open('data/movie_lines.txt', 'r', encoding='utf-8', errors='ignore').read().split('\n')[:-1] 90 | 91 | utterance_dict = {} 92 | with codecs.open('data/tokenized_all_words.txt', "w", encoding='utf-8', errors='ignore') as f: 93 | for line in raw_movie_lines: 94 | line = line.split(' +++$+++ ') 95 | line_ID = line[0] 96 | utterance = line[-1] 97 | utterance_dict[line_ID] = utterance 98 | utterance = " ".join([refine(w) for w in utterance.lower().split()]) 99 | f.write(utterance + '\n') 100 | pickle.dump(utterance_dict, open('data/utterance_dict', 'wb'), True) -------------------------------------------------------------------------------- /Chapter07/data_reader.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import pickle 3 | import random 4 | 5 | """ This helper module helps generate trainable batches from the preprocessed training text 6 | """ 7 | 8 | 9 | class Data_Reader: 10 | def __init__(self, cur_train_index=0, load_list=False): 11 | self.training_data = pickle.load(open('data/conversations_lenmax22_formersents2_with_former', 'rb')) 12 | self.data_size = len(self.training_data) 13 | if load_list: 14 | self.shuffle_list = pickle.load(open('data/shuffle_index_list', 'rb')) 15 | else: 16 | self.shuffle_list = self.shuffle_index() 17 | self.train_index = cur_train_index 18 | 19 | def get_batch_num(self, batch_size): 20 | return self.data_size // batch_size 21 | 22 | def shuffle_index(self): 23 | shuffle_index_list = random.sample(range(self.data_size), self.data_size) 24 | pickle.dump(shuffle_index_list, open('data/shuffle_index_list', 'wb'), True) 25 | return shuffle_index_list 26 | 27 | def generate_batch_index(self, batch_size): 28 | if self.train_index + batch_size > self.data_size: 29 | batch_index = self.shuffle_list[self.train_index:self.data_size] 30 | self.shuffle_list = self.shuffle_index() 31 | remain_size = batch_size - (self.data_size - self.train_index) 32 | batch_index += self.shuffle_list[:remain_size] 33 | self.train_index = remain_size 34 | else: 35 | batch_index = self.shuffle_list[self.train_index:self.train_index+batch_size] 36 | self.train_index += batch_size 37 | 38 | return batch_index 39 | 40 | def generate_training_batch(self, batch_size): 41 | batch_index = self.generate_batch_index(batch_size) 42 | batch_X = [self.training_data[i][0] for i in batch_index] # batch_size of conv_a 43 | batch_Y = [self.training_data[i][1] for i in batch_index] # batch_size of conv_b 44 | 45 | return batch_X, batch_Y 46 | 47 | def generate_training_batch_with_former(self, batch_size): 48 | batch_index = self.generate_batch_index(batch_size) 49 | batch_X = [self.training_data[i][0] for i in batch_index] # batch_size of conv_a 50 | batch_Y = [self.training_data[i][1] for i in batch_index] # batch_size of conv_b 51 | former = 
[self.training_data[i][2] for i in batch_index] # batch_size of former utterance 52 | 53 | return batch_X, batch_Y, former 54 | 55 | def generate_testing_batch(self, batch_size): 56 | batch_index = self.generate_batch_index(batch_size) 57 | batch_X = [self.training_data[i][0] for i in batch_index] # batch_size of conv_a 58 | 59 | return batch_X -------------------------------------------------------------------------------- /Chapter07/model/Reversed/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model-63" 2 | all_model_checkpoint_paths: "model-45" 3 | all_model_checkpoint_paths: "model-46" 4 | all_model_checkpoint_paths: "model-47" 5 | all_model_checkpoint_paths: "model-48" 6 | all_model_checkpoint_paths: "model-49" 7 | all_model_checkpoint_paths: "model-50" 8 | all_model_checkpoint_paths: "model-51" 9 | all_model_checkpoint_paths: "model-52" 10 | all_model_checkpoint_paths: "model-53" 11 | all_model_checkpoint_paths: "model-54" 12 | all_model_checkpoint_paths: "model-55" 13 | all_model_checkpoint_paths: "model-56" 14 | all_model_checkpoint_paths: "model-57" 15 | all_model_checkpoint_paths: "model-58" 16 | all_model_checkpoint_paths: "model-59" 17 | all_model_checkpoint_paths: "model-60" 18 | all_model_checkpoint_paths: "model-61" 19 | all_model_checkpoint_paths: "model-62" 20 | all_model_checkpoint_paths: "model-63" -------------------------------------------------------------------------------- /Chapter07/model/model-56-3000/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "new_model-56-3000" 2 | all_model_checkpoint_paths: "new_model-56-3000" 3 | -------------------------------------------------------------------------------- /Chapter07/results/sample_input.txt: -------------------------------------------------------------------------------- 1 | Have you heard about 'machine learning and having it deep and structured'? 2 | How are you? 3 | What's your name? 4 | Hello 5 | Thank God! If I had to hear one more story about your coiffure 6 | You never wanted to go out with me, did you? 7 | I guess I thought I was protecting you. 8 | Forget his reputation. Do you think we've got a plan or not? 9 | You didn't have a choice? 10 | Can you do me a favor? 11 | So I have to have a motive to be with you? 12 | What's next? 13 | You played for the Red Sox? 14 | Are you saying that someone is paying you to be our maid and doesn't want us to know who he is? 15 | What did he say? 16 | How do you do? 17 | Where do you live? 18 | If she can't pay, I'll have to foreclose, won't I? 19 | I want to see a movie. 20 | I didn't kill him! I had nothing to do with that, I tell you! 21 | What do you mean? 22 | For a celebrated bounder, that is an awful admission. 23 | Besides, I never knew that any female could do this to you 24 | Maybe. But I'm taking no chances. Why, this kid's got a record. 25 | Did you get the case for the securities? 26 | I'd like to take a crack at that guy. 27 | I didn't do it! I haven't got a gun! 28 | BASTARDS! Come back here and face me! 29 | I told you you'd get your money back. 30 | Let's get back on the road. You gotta be at that convention in the morning. 31 | You ought to take up crap shooting. Talk about luck! 32 | Don't you think you should call a backup? 33 | Motherfucker you. 34 | Machine learning. 35 | Why you gotta talk about my moms? 36 | You've got to be kidding me ! His lazy ass couldn't win the special Olympics. 
37 | Sir, this is not like firing any employee. We can't predict what will happen. 38 | I don't know. Maybe we should watch the tape to be sure. 39 | Listen man, I don't need this shit. 40 | Will you stand up for me? 41 | I had a feeling you would say something like that. So I brought us dinner. 42 | They can't be serious. The ship's in pieces and we've less than a skeleton aboard. 43 | How do you trun this on? 44 | Thank God it's Friday! 45 | I don't give a shit! 46 | WHAT KIND OF PLAN IS THAT!?? 47 | No weirder than a sharp, young, good-looking woman working in a lumberyard. 48 | The witness need not be hesitant to say anything before this committee, as long as it's the truth. 49 | I'm sure a lot of people down in L.A. are worried sick about you. 50 | Find the rockets. If they're guarded, kill the men guarding them. 51 | what single thing would you want the next President of this country to do most? 52 | I forgot to get the Coca-Cola. 53 | Whoa!... The government. They control everybody's mind. You're too fucking stupid to know that? 54 | How about you graduation thesis? -------------------------------------------------------------------------------- /Chapter07/results/sample_output_RL.txt: -------------------------------------------------------------------------------- 1 | Danger misunderstood say: connection danger. 2 | Victor. 3 | Medal question medal medal buddy they. 4 | Jeremy. 5 | During. 6 | Assuming. 7 | Assuming infection say: neo say: wishes or say: say: benny chief curiosity or joined victor. 8 | Invited. 9 | Sneak error they sneak misunderstood swana question toast connection studying treasure curiosity witches. 10 | Buddy mystery effort plenty carryin' misery miami marvelous. 11 | Assuming. 12 | Assuming infection cia treasure curiosity curiosity invited. 13 | Say: say: connection breakdown say: connection. 14 | Invited. 15 | Wise covers curiosity plenty funeral mac kent rope. 16 | Wishes say: base. 17 | Assuming. 18 | Medal order feature tag lit. 19 | Assuming jack's misunderstood curiosity invited pills toast his neo toast. 20 | Sneak. 21 | Children treasure. 22 | Times. 23 | Funeral mac illness victor max wishes 22 faggot or buddy. 24 | Dearest. 25 | Johnnie. 26 | Assuming. 27 | Illness neo treasure buddy effort plenty mac buddy mystery neo neo they neo buddy mystery neo plot neo plot treasure pills. 28 | During. 29 | are. 30 | Nail treasure they they neo neo neo neo. 31 | Medal. 32 | Swana funeral or buddy vietnam they jury. 33 | Assuming say: error voices infection say: did jack's extra mac burned or witches. 34 | Infection. 35 | Invited card mac santa say: pulse. 36 | Times invited they treasure funeral they problem invited. 37 | Agreement extra. 38 | They. 39 | Medal criminal buttons. 40 | During 22 jack's snow zoo infection neo sean. 41 | Invited. 42 | During. 43 | Assuming. 44 | Assuming. 45 | Toast ending lingerie walks buddy mystery stupidity benny drove swana heels swana heels heels. 46 | Invited. 47 | Cell buddy snow they they they. 48 | Johnnie infection invited during somethin'. 49 | Johnnie. 50 | They they buddy mary they loans. 51 | They sneak kent destiny poisoned they. 52 | Toast. 53 | During criminal burn neo miami rooms rooms parking seriously rooms recommend curiosity treasure gathering they. 54 | Medal criminal burn plenty victor neighbor. 
55 | -------------------------------------------------------------------------------- /Chapter08/README.md: -------------------------------------------------------------------------------- 1 | # Auto Generating a Deep Neural Network 2 | 3 | # TODO 4 | * Specifications for neural network DNA 5 | * Generate TF estimator based on DNA 6 | * Train TF estimator on CIFAR-10 and return validation accuracy 7 | * Use tf.contrib.rnn.NASCell 8 | 9 | # Basic algorithm 10 | * Initialize controller 11 | * Generate m child networks 12 | * Write checkpoint 13 | * Train m child networks and get m validation accuracies 14 | * Calculate gradient from mean loss across child networks according to REINFORCE 15 | * This requires the EMA of previous architecture validation accuracies as a baseline function 16 | * Update controller 17 | * Repeat 18 | 19 | # Parameters 20 | * m - number of child networks to generate in one episode 21 | * l - number of layers the child NN will have 22 | * controller_lr - learning rate of controller 23 | * child_lr - learning rate of child NN 24 | * beta - weight decay parameter of child_NN for L2 25 | * momentum - for Nesterov momentum of SGD 26 | 27 | # Tokens generated by controller 28 | * Filter size 29 | * Stride size 30 | * Nb. filters 31 | * Max-pooling size 32 | * anchor point -------------------------------------------------------------------------------- /Chapter08/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter08/__init__.py -------------------------------------------------------------------------------- /Chapter08/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/Chapter08/src/__init__.py -------------------------------------------------------------------------------- /Chapter08/src/child_network.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import tensorflow as tf 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | class ChildCNN(object): 8 | 9 | def __init__(self, cnn_dna, child_id, beta=1e-4, drop_rate=0.2, **kwargs): 10 | self.cnn_dna = self.process_raw_controller_output(cnn_dna) 11 | self.child_id = child_id 12 | self.beta = beta 13 | self.drop_rate = drop_rate 14 | self.is_training = tf.placeholder_with_default(True, shape=None, name="is_training_{}".format(self.child_id)) 15 | self.num_classes = 10 16 | 17 | def process_raw_controller_output(self, output): 18 | """ 19 | A helper function for preprocessing the output of the NASCell 20 | Args: 21 | output (numpy.ndarray) The output of the NASCell 22 | 23 | Returns: 24 | (list) The child network's architecture 25 | """ 26 | output = output.ravel() 27 | cnn_dna = [list(output[x:x+4]) for x in range(0, len(output), 4)] 28 | return cnn_dna 29 | 30 | def build(self, input_tensor): 31 | """ 32 | Method for creating the child neural network 33 | Args: 34 | input_tensor: The tensor which represents the input 35 | 36 | Returns: 37 | The tensor which represents the output logit (pre-softmax activation) 38 | 39 | """ 40 | logger.info("DNA is: {}".format(self.cnn_dna)) 41 | output = input_tensor 42 | for idx in range(len(self.cnn_dna)): 43 | # Get the configuration for the layer 44 | kernel_size, stride, 
num_filters, max_pool_size = self.cnn_dna[idx] 45 | with tf.name_scope("child_{}_conv_layer_{}".format(self.child_id, idx)): 46 | output = tf.layers.conv2d(output, 47 | # Specify the number of filters the convolutional layer will output 48 | filters=num_filters, 49 | # This specifies the size (height, width) of the convolutional kernel 50 | kernel_size=(kernel_size, kernel_size), 51 | # The size of the stride of the kernel 52 | strides=(stride, stride), 53 | # We add padding to the image 54 | padding="SAME", 55 | # It is good practice to name your layers 56 | name="conv_layer_{}".format(idx), 57 | activation=tf.nn.relu, 58 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 59 | bias_initializer=tf.zeros_initializer(), 60 | kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=self.beta)) 61 | # We apply 2D max pooling on the output of the conv layer 62 | output = tf.layers.max_pooling2d( 63 | output, pool_size=(max_pool_size, max_pool_size), strides=1, 64 | padding="SAME", name="pool_out_{}".format(idx) 65 | ) 66 | # Dropout to regularize the network further 67 | output = tf.layers.dropout(output, rate=self.drop_rate, training=self.is_training) 68 | 69 | # Lastly, we flatten the outputs and add a fully-connected layer 70 | with tf.name_scope("child_{}_fully_connected".format(self.child_id)): 71 | output = tf.layers.flatten(output, name="flatten") 72 | logits = tf.layers.dense(output, self.num_classes) 73 | 74 | return logits 75 | -------------------------------------------------------------------------------- /Chapter08/src/cifar10_processor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from keras.datasets import cifar10 6 | from keras.utils import np_utils 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | def _create_tf_dataset(x, y, batch_size): 11 | return tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(x), 12 | tf.data.Dataset.from_tensor_slices(y))).shuffle(500).repeat().batch(batch_size) 13 | 14 | def get_tf_datasets_from_numpy(batch_size, validation_split=0.1): 15 | """ 16 | Main function getting tf.Data.datasets for training, validation, and testing 17 | 18 | Args: 19 | batch_size (int): Batch size 20 | validation_split (float): Split for partitioning training and validation sets. Between 0.0 and 1.0. 21 | """ 22 | # Load data from keras datasets api 23 | (X, y), (X_test, y_test) = cifar10.load_data() 24 | 25 | logger.info("Dividing pixels by 255") 26 | X = X / 255. 27 | X_test = X_test / 255. 
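# The division above rescales the raw pixel values from [0, 255] to [0, 1]; the casts that follow
# keep the images and labels in float32 (TensorFlow's default float dtype), and the labels are then
# one-hot encoded over the 10 CIFAR-10 classes before the tf.data pipelines are built.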
28 | 29 | X = X.astype(np.float32) 30 | X_test = X_test.astype(np.float32) 31 | y = y.astype(np.float32) 32 | y_test = y_test.astype(np.float32) 33 | 34 | # Turn labels into onehot encodings 35 | if y.shape[1] != 10: 36 | y = np_utils.to_categorical(y, num_classes=10) 37 | y_test = np_utils.to_categorical(y_test, num_classes=10) 38 | 39 | logger.info("Loaded data from keras") 40 | 41 | split_idx = int((1.0 - validation_split) * len(X)) 42 | X_train, y_train = X[:split_idx], y[:split_idx] 43 | X_valid, y_valid = X[split_idx:], y[split_idx:] 44 | 45 | train_dataset = _create_tf_dataset(X_train, y_train, batch_size) 46 | valid_dataset = _create_tf_dataset(X_valid, y_valid, batch_size) 47 | test_dataset = _create_tf_dataset(X_test, y_test, batch_size) 48 | 49 | # Get the batch sizes for the train, valid, and test datasets 50 | num_train_batches = int(X_train.shape[0] // batch_size) 51 | num_valid_batches = int(X_valid.shape[0] // batch_size) 52 | num_test_batches = int(X_test.shape[0] // batch_size) 53 | 54 | return train_dataset, valid_dataset, test_dataset, num_train_batches, num_valid_batches, num_test_batches 55 | -------------------------------------------------------------------------------- /Chapter08/src/config.py: -------------------------------------------------------------------------------- 1 | child_network_params = { 2 | "learning_rate": 3e-5, 3 | "max_epochs": 100, 4 | "beta": 1e-3, 5 | "batch_size": 20 6 | } 7 | 8 | controller_params = { 9 | "max_layers": 3, 10 | "components_per_layer": 4, 11 | 'beta': 1e-4, 12 | 'max_episodes': 2000, 13 | "num_children_per_episode": 10 14 | } 15 | -------------------------------------------------------------------------------- /Chapter08/src/constants.py: -------------------------------------------------------------------------------- 1 | class PATHS: 2 | DATA_DIR = "data" 3 | SAVE_DIR = "saves" 4 | -------------------------------------------------------------------------------- /Chapter08/src/train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | from .controller import Controller 5 | 6 | if __name__ == '__main__': 7 | # Configure the logger 8 | logging.basicConfig(stream=sys.stdout, 9 | level=logging.DEBUG, 10 | format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s') 11 | controller = Controller() 12 | controller.train_controller() 13 | -------------------------------------------------------------------------------- /Chapter09/actor.py: -------------------------------------------------------------------------------- 1 | from keras import layers, models, optimizers 2 | from keras import backend as K 3 | 4 | 5 | class Actor: 6 | 7 | 8 | # """Actor (policy) Model. 
""" 9 | 10 | def __init__(self, state_size, action_size): 11 | 12 | self.state_size = state_size 13 | self.action_size = action_size 14 | 15 | self.build_model() 16 | 17 | def build_model(self): 18 | states = layers.Input(shape=(self.state_size,), name='states') 19 | 20 | net = layers.Dense(units=16,kernel_regularizer=layers.regularizers.l2(1e-6))(states) 21 | net = layers.BatchNormalization()(net) 22 | net = layers.Activation("relu")(net) 23 | net = layers.Dense(units=32,kernel_regularizer=layers.regularizers.l2(1e-6))(net) 24 | net = layers.BatchNormalization()(net) 25 | net = layers.Activation("relu")(net) 26 | 27 | actions = layers.Dense(units=self.action_size, activation='softmax', name = 'actions')(net) 28 | 29 | self.model = models.Model(inputs=states, outputs=actions) 30 | 31 | action_gradients = layers.Input(shape=(self.action_size,)) 32 | loss = K.mean(-action_gradients * actions) 33 | 34 | optimizer = optimizers.Adam(lr=.00001) 35 | updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss) 36 | self.train_fn = K.function( 37 | inputs=[self.model.input, action_gradients, K.learning_phase()], 38 | outputs=[], 39 | updates=updates_op) -------------------------------------------------------------------------------- /Chapter09/agent.py: -------------------------------------------------------------------------------- 1 | from actor import Actor 2 | from critic import Critic 3 | 4 | import numpy as np 5 | from numpy.random import choice 6 | import random 7 | from collections import namedtuple, deque 8 | 9 | 10 | class ReplayBuffer: 11 | def __init__(self, buffer_size, batch_size): 12 | 13 | self.memory = deque(maxlen=buffer_size) 14 | self.batch_size = batch_size 15 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 16 | 17 | def add(self, state, action, reward, next_state, done): 18 | e = self.experience(state, action, reward, next_state, done) 19 | self.memory.append(e) 20 | 21 | def sample(self, batch_size=32): 22 | return random.sample(self.memory, k=self.batch_size) 23 | 24 | def __len__(self): 25 | return len(self.memory) 26 | 27 | 28 | class Agent: 29 | def __init__(self, state_size, batch_size, is_eval = False): 30 | self.state_size = state_size 31 | self.action_size = 3 32 | self.buffer_size = 1000000 33 | self.batch_size = batch_size 34 | self.memory = ReplayBuffer(self.buffer_size, self.batch_size) 35 | self.inventory = [] 36 | self.is_eval = is_eval 37 | 38 | self.gamma = 0.99 39 | self.tau = 0.001 40 | 41 | self.actor_local = Actor(self.state_size, self.action_size) 42 | self.actor_target = Actor(self.state_size, self.action_size) 43 | 44 | self.critic_local = Critic(self.state_size, self.action_size) 45 | self.critic_target = Critic(self.state_size, self.action_size) 46 | 47 | self.critic_target.model.set_weights(self.critic_local.model.get_weights()) 48 | self.actor_target.model.set_weights(self.actor_local.model.get_weights()) 49 | 50 | def act(self, state): 51 | options = self.actor_local.model.predict(state) 52 | self.last_state = state 53 | if not self.is_eval: 54 | return choice(range(3), p = options[0]) 55 | return np.argmax(options[0]) 56 | 57 | def step(self, action, reward, next_state, done): 58 | self.memory.add(self.last_state, action, reward, next_state, done) 59 | if len(self.memory) > self.batch_size: 60 | experiences = self.memory.sample(self.batch_size) 61 | self.learn(experiences) 62 | self.last_state = next_state 63 | 64 | def learn(self, experiences): 65 | states = 
np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size) 66 | actions = np.vstack([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.action_size) 67 | rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1,1) 68 | dones = np.array([e.done for e in experiences if e is not None]).astype(np.float32).reshape(-1,1) 69 | next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size) 70 | 71 | actions_next = self.actor_target.model.predict_on_batch(next_states) 72 | Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) 73 | 74 | Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) 75 | self.critic_local.model.train_on_batch(x = [states, actions], y=Q_targets) 76 | 77 | action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]),(-1, self.action_size)) 78 | self.actor_local.train_fn([states, action_gradients, 1]) 79 | self.soft_update(self.critic_local.model, self.critic_target.model) 80 | self.soft_update(self.actor_local.model, self.actor_target.model) 81 | 82 | def soft_update(self, local_model, target_model): 83 | local_weights = np.array(local_model.get_weights()) 84 | target_weights = np.array(target_model.get_weights()) 85 | 86 | assert len(local_weights) == len(target_weights) 87 | 88 | new_weights = self.tau * local_weights + (1 - self.tau) * target_weights 89 | target_model.set_weights(new_weights) 90 | -------------------------------------------------------------------------------- /Chapter09/critic.py: -------------------------------------------------------------------------------- 1 | from keras import layers, models, optimizers 2 | from keras import backend as K 3 | 4 | 5 | class Critic: 6 | """Critic (Value) Model.""" 7 | 8 | def __init__(self, state_size, action_size): 9 | """Initialize parameters and build model. 
10 | Params 11 | ====== 12 | state_size (int): Dimension of each state 13 | action_size (int): Dimension of each action 14 | """ 15 | self.state_size = state_size 16 | self.action_size = action_size 17 | 18 | self.build_model() 19 | 20 | def build_model(self): 21 | """Build a critic (value) network that maps (state, action) pairs -> Q-values.""" 22 | # Define input layers 23 | states = layers.Input(shape=(self.state_size,), name='states') 24 | actions = layers.Input(shape=(self.action_size,), name='actions') 25 | 26 | net_states = layers.Dense(units=16,kernel_regularizer=layers.regularizers.l2(1e-6))(states) 27 | net_states = layers.BatchNormalization()(net_states) 28 | net_states = layers.Activation("relu")(net_states) 29 | 30 | net_states = layers.Dense(units=32, kernel_regularizer=layers.regularizers.l2(1e-6))(net_states) 31 | 32 | net_actions = layers.Dense(units=32,kernel_regularizer=layers.regularizers.l2(1e-6))(actions) 33 | 34 | net = layers.Add()([net_states, net_actions]) 35 | net = layers.Activation('relu')(net) 36 | 37 | Q_values = layers.Dense(units=1, name='q_values',kernel_initializer=layers.initializers.RandomUniform(minval=-0.003, maxval=0.003))(net) 38 | 39 | self.model = models.Model(inputs=[states, actions], outputs=Q_values) 40 | 41 | optimizer = optimizers.Adam(lr=0.001) 42 | self.model.compile(optimizer=optimizer, loss='mse') 43 | 44 | action_gradients = K.gradients(Q_values, actions) 45 | 46 | self.get_action_gradients = K.function( 47 | inputs=[*self.model.input, K.learning_phase()], 48 | outputs=action_gradients) 49 | -------------------------------------------------------------------------------- /Chapter09/helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | 5 | def formatPrice(n): 6 | if n >= 0: 7 | curr = "$" 8 | else: 9 | curr = "-$" 10 | return curr + "{0:.2f}".format(abs(n)) 11 | 12 | 13 | def getStockData(key): 14 | datavec = [] 15 | lines = open("data/" + key + ".csv", "r").read().splitlines() 16 | 17 | for line in lines[1:]: 18 | datavec.append(float(line.split(",")[4])) 19 | 20 | return datavec 21 | 22 | 23 | def getState(data, t, window): 24 | if t - window >= -1: 25 | vec = data[t - window + 1:t + 1] 26 | else: 27 | vec = -(t-window+1)*[data[0]]+data[0: t + 1] 28 | scaled_state = [] 29 | for i in range(window - 1): 30 | scaled_state.append(1/(1 + math.exp(vec[i] - vec[i+1]))) 31 | 32 | return np.array([scaled_state]) 33 | -------------------------------------------------------------------------------- /Chapter09/train.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | from helper import getStockData, getState, formatPrice 3 | 4 | window_size = 50 5 | batch_size = 32 6 | agent = Agent(window_size, batch_size) 7 | data = getStockData("^GSPC") 8 | l = len(data) - 1 9 | episode_count = 300 10 | 11 | for e in range(episode_count): 12 | print("Episode " + str(e) + "/" + str(episode_count)) 13 | state = getState(data, 0, window_size + 1) 14 | 15 | agent.inventory = [] 16 | total_profit = 0 17 | done = False 18 | for t in range(l): 19 | action = agent.act(state) 20 | action_prob = agent.actor_local.model.predict(state) 21 | 22 | next_state = getState(data, t + 1, window_size + 1) 23 | reward = 0 24 | 25 | if action == 1: 26 | agent.inventory.append(data[t]) 27 | print("Buy:" + formatPrice(data[t])) 28 | 29 | elif action == 2 and len(agent.inventory) > 0: 30 | bought_price = agent.inventory.pop(0) 31 | reward = 
max(data[t] - bought_price, 0) 32 | total_profit += data[t] - bought_price 33 | print("Sell: " + formatPrice(data[t]) + " | profit: " + formatPrice(data[t] - bought_price)) 34 | 35 | if t == l - 1: 36 | done = True 37 | agent.step(action_prob, reward, next_state, done) 38 | state = next_state 39 | 40 | if done: 41 | print("------------------------------------------") 42 | print("Total Profit: " + formatPrice(total_profit)) 43 | print("------------------------------------------") 44 | 45 | test_data = getStockData("^GSPC Test") 46 | l_test = len(test_data) - 1 47 | state = getState(test_data, 0, window_size + 1) 48 | total_profit = 0 49 | agent.inventory = [] 50 | agent.is_eval = False 51 | done = False 52 | for t in range(l_test): 53 | action = agent.act(state) 54 | action_prob = agent.actor_local.model.predict(state)  # action probabilities for the current test state, passed to agent.step below 55 | next_state = getState(test_data, t + 1, window_size + 1) 56 | reward = 0 57 | 58 | if action == 1: 59 | 60 | agent.inventory.append(test_data[t]) 61 | print("Buy: " + formatPrice(test_data[t])) 62 | 63 | elif action == 2 and len(agent.inventory) > 0: 64 | bought_price = agent.inventory.pop(0) 65 | reward = max(test_data[t] - bought_price, 0) 66 | total_profit += test_data[t] - bought_price 67 | print("Sell: " + formatPrice(test_data[t]) + " | profit: " + formatPrice(test_data[t] - bought_price)) 68 | 69 | if t == l_test - 1: 70 | done = True 71 | agent.step(action_prob, reward, next_state, done) 72 | state = next_state 73 | 74 | if done: 75 | print("------------------------------------------") 76 | print("Total Profit: " + formatPrice(total_profit)) 77 | print("------------------------------------------") 78 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.9.0-py3 2 | 3 | 4 | RUN apt-get update -yqq \ 5 | && apt-get install -y locales\ 6 | && apt-get install -yqq \ 7 | && pip3 install --upgrade pip \ 8 | && locale-gen en_US.UTF-8 9 | 10 | RUN pip3 install keras 11 | 12 | COPY Chapter09 Chapter09 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /artifacts.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Reinforcement-Learning-Projects/9c52fc77b298f34b7bc126b988262ce4a9826c6e/artifacts.pptx -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.1.10 2 | argh==0.26.2 3 | astroid==1.6.1 4 | autopep8==1.3.4 5 | bleach==1.5.0 6 | cachetools==2.0.1 7 | certifi==2018.1.18 8 | chardet==3.0.4 9 | google-api-core==0.1.4 10 | google-auth==1.4.1 11 | google-cloud-core==0.28.0 12 | google-cloud-logging==1.5.0 13 | googleapis-common-protos==1.5.3 14 | grpcio==1.9.1 15 | h5py==2.7.1 16 | html5lib==0.9999999 17 | idna==2.6 18 | isort==4.3.4 19 | Keras==2.1.6 20 | lazy-object-proxy==1.3.1 21 | Markdown==2.6.11 22 | mccabe==0.6.1 23 | numpy==1.14.1 24 | petname==2.2 25 | protobuf==3.5.1 26 | pyasn1==0.4.2 27 | pyasn1-modules==0.2.1 28 | pycodestyle==2.3.1 29 | pygtp==0.4 30 | pylint==1.8.2 31 | pytz==2018.3 32 | PyYAML==3.12 33 | requests==2.18.4 34 | rsa==3.4.2 35 | scipy==1.1.0 36 | sgf==0.5 37 | six==1.11.0 38 | tensorflow==1.5.0 39 | tensorflow-tensorboard==1.5.1 40 | tqdm==4.19.6 41 | urllib3==1.22 42 | Werkzeug==0.14.1 43 | wrapt==1.10.11 44 | --------------------------------------------------------------------------------
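The pins above define the TensorFlow 1.x-era environment the code in this bundle targets; note that the Dockerfile builds from the tensorflow/tensorflow:1.9.0-py3 image while requirements.txt pins tensorflow==1.5.0, so the two setups differ slightly. A minimal sanity check, assuming the packages from requirements.txt are installed, is to print the versions the interpreter actually sees:

import numpy as np
import tensorflow as tf
import keras

# Compare against the pins in requirements.txt (numpy==1.14.1, tensorflow==1.5.0, Keras==2.1.6).
print("numpy:", np.__version__)
print("tensorflow:", tf.__version__)
print("keras:", keras.__version__)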