├── DQN.py ├── Pipfile ├── README.md ├── car_constraint_values_wo_band.png ├── car_main_value.png ├── car_main_value_wo_band.png ├── car_racing.py ├── config_car.py ├── config_lake.py ├── env_dqns.py ├── env_nn.py ├── exact_policy_evaluation.py ├── experimental_results ├── hyperparam_2018_12_18_22_20.png ├── lspi.csv ├── policy_improvement_grid.h5 └── results_grid.csv ├── experimental_results_car ├── experiment_results_2019_01_02_22_00.csv ├── experiment_results_2019_01_03_11_00.csv └── experiment_results_2019_01_03_15_00.csv ├── exponentiated_gradient.py ├── fitted_algo.py ├── fitted_off_policy_evaluation.py ├── fittedq.py ├── fixed_policy.py ├── fqe_quality_test.py ├── fqe_quality_test_generalization.py ├── fqi_grid_search.py ├── fqi_seed_2_new.py ├── frozen_lake.py ├── inverse_propensity_scoring.py ├── lake_primal_dual_gap.png ├── lake_values.png ├── lake_values_wo_band.png ├── layer_visualizer.py ├── mdp_approximator.py ├── model.py ├── models ├── pi_1.hdf5 ├── pi_2.hdf5 ├── pi_old_car_cnn.h5 ├── pi_old_car_cnn.hdf5 ├── pi_old_car_cnn1.hdf5 ├── pi_old_car_cnn_good.hdf5 ├── pi_old_car_cnn_random_seed.hdf5 ├── pi_old_car_cnn_seed_2.hdf5 ├── pi_old_map_size_8_mlp.h5 ├── weights.01-2362.66.hdf5 ├── weights.01-2542.47.hdf5 └── weights.01-2635.64.hdf5 ├── neural_network.py ├── optimization_problem.py ├── pi_old_car_cnn_main.hdf5 ├── play_car_racing.py ├── plot_fqe_quality_test.py ├── plot_grid_search.py ├── plot_policy_improvement.py ├── plot_policy_improvement_v2.py ├── plot_results.py ├── print_policy.py ├── replay_buffer.py ├── run.py ├── seed_2_data ├── car_data_actions_seed_2.h5 ├── car_data_frames_seed_2.h5 ├── car_data_is_done_seed_2.h5 ├── car_data_next_states_seed_2.h5 ├── car_data_prev_states_seed_2.h5 └── car_data_rewards_seed_2.h5 ├── stochastic_policy.py ├── tests ├── car_fqe.py └── fqe_test.py ├── thread_safe.py └── value_function.py /DQN.py: -------------------------------------------------------------------------------- 1 | import keras 2 | import numpy as np 3 | from replay_buffer import Buffer 4 | import time 5 | from keras.callbacks import ModelCheckpoint 6 | import os 7 | 8 | class DeepQLearning(object): 9 | def __init__(self, env, 10 | gamma, 11 | model_type='mlp', 12 | action_space_map = None, 13 | num_iterations = 5000, 14 | sample_every_N_transitions = 10, 15 | batchsize = 1000, 16 | copy_over_target_every_M_training_iterations = 100, 17 | max_time_spent_in_episode = 100, 18 | buffer_size = 10000, 19 | num_frame_stack=1, 20 | min_buffer_size_to_train=1000, 21 | frame_skip = 1, 22 | pic_size = (96, 96), 23 | models_path = None, 24 | ): 25 | 26 | self.models_path = models_path 27 | self.env = env 28 | self.num_iterations = num_iterations 29 | self.gamma = gamma 30 | self.frame_skip = frame_skip 31 | _ = self.env.reset() 32 | if self.env.env_type in ['car']: 33 | self.env.render() 34 | _, r, _, _ = self.env.step(action_space_map[0]) 35 | self.buffer = Buffer(buffer_size=buffer_size, num_frame_stack=num_frame_stack, min_buffer_size_to_train=min_buffer_size_to_train, pic_size = pic_size, n_costs = (len(np.hstack(r)),)) 36 | else: 37 | self.buffer = Buffer(buffer_size=buffer_size, num_frame_stack=num_frame_stack, min_buffer_size_to_train=min_buffer_size_to_train, pic_size = (1,), n_costs = (1,)) 38 | self.sample_every_N_transitions = sample_every_N_transitions 39 | self.batchsize = batchsize 40 | self.copy_over_target_every_M_training_iterations = copy_over_target_every_M_training_iterations 41 | self.max_time_spent_in_episode = max_time_spent_in_episode 42 | 
self.action_space_map = action_space_map 43 | 44 | def min_over_a(self, *args, **kw): 45 | return self.Q.min_over_a(*args, **kw) 46 | 47 | def all_actions(self, *args, **kw): 48 | return self.Q.all_actions(*args, **kw) 49 | 50 | # def representation(self, *args, **kw): 51 | # return self.Q.representation(*args, **kw) 52 | 53 | def learn(self): 54 | 55 | more_callbacks = [ModelCheckpointExtended(self.models_path)] 56 | self.time_steps = 0 57 | training_iteration = -1 58 | perf = Performance() 59 | main_tic = time.time() 60 | training_complete = False 61 | for i in range(self.num_iterations): 62 | if training_complete: continue 63 | tic = time.time() 64 | x = self.env.reset() 65 | if self.env.env_type in ['car']: self.env.render() 66 | self.buffer.start_new_episode(x) 67 | done = False 68 | time_spent_in_episode = 0 69 | episode_cost = 0 70 | while not done: 71 | #if self.env.env_type in ['car']: self.env.render() 72 | 73 | time_spent_in_episode += 1 74 | self.time_steps += 1 75 | # print time_spent_in_episode 76 | 77 | use_random = np.random.rand(1) < self.epsilon(epoch=i, total_steps=self.time_steps) 78 | if use_random: 79 | action = self.sample_random_action() 80 | else: 81 | action = self.Q(self.buffer.current_state())[0] 82 | 83 | if (i % 50) == 0: print use_random, action, self.Q(self.buffer.current_state())[0], self.Q.all_actions(self.buffer.current_state()) 84 | 85 | # import pdb; pdb.set_trace() 86 | # state = self.buffer.current_state() 87 | # import matplotlib.pyplot as plt 88 | # plt.imshow(state[-1]) 89 | # plt.show() 90 | # self.Q.all_actions(state) 91 | 92 | cost = [] 93 | for _ in range(self.frame_skip): 94 | if done: continue 95 | x_prime, costs, done, _ = self.env.step(self.action_space_map[action]) 96 | # import pdb; pdb.set_trace() 97 | cost.append(costs) 98 | 99 | cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0) 100 | early_done, punishment = self.env.is_early_episode_termination(cost=cost[0], time_steps=time_spent_in_episode, total_cost=episode_cost) 101 | 102 | if early_done: 103 | cost[0] = cost[0] + punishment 104 | done = done or early_done 105 | 106 | # self.buffer.append([x,action,x_prime, cost[0], done]) 107 | self.buffer.append(action, x_prime, cost, done) 108 | 109 | # train 110 | is_train = ((self.time_steps % self.sample_every_N_transitions) == 0) and self.buffer.is_enough() 111 | 112 | if is_train: 113 | # for _ in range(len(self.buffer.data)/self.sample_every_N_transitions): 114 | training_iteration += 1 115 | if (training_iteration % self.copy_over_target_every_M_training_iterations) == 0: 116 | self.Q.copy_over_to(self.Q_target) 117 | batch_x, batch_a, batch_x_prime, batch_cost, batch_done = self.buffer.sample(self.batchsize) 118 | 119 | target = batch_cost[:,0] + self.gamma*self.Q_target.min_over_a(np.stack(batch_x_prime))[0]*(1-batch_done) 120 | X = [batch_x, batch_a] 121 | 122 | evaluation = self.Q.fit(X,target,epochs=1, batch_size=32, evaluate=False,verbose=False,tqdm_verbose=False, additional_callbacks=more_callbacks) 123 | 124 | x = x_prime 125 | 126 | episode_cost += cost[0] 127 | 128 | if self.env.env_type == 'car': 129 | perf.append(float(self.env.tile_visited_count)/len(self.env.track)) 130 | else: 131 | perf.append(episode_cost/self.env.min_cost) 132 | 133 | if (i % 1) == 0: 134 | print 'Episode %s' % i 135 | episode_time = time.time()-tic 136 | print 'Total Time: %s. Episode time: %s. 
Time/Frame: %s' % (np.round(time.time() - main_tic,2), np.round(episode_time, 2), np.round(episode_time/time_spent_in_episode, 2)) 137 | print 'Episode frames: %s. Total frames: %s. Total train steps: %s' % (time_spent_in_episode, self.time_steps, training_iteration) 138 | if self.env.env_type in ['car']: 139 | print 'Performance: %s/%s. Score out of 1: %s. Average Score: %s' % (self.env.tile_visited_count, len(self.env.track), perf.last(), perf.get_avg_performance()) 140 | else: 141 | print 'Score out of 1: %s. Average Score: %s' % (perf.last(), perf.get_avg_performance()) 142 | print '*'*20 143 | if perf.reached_goal(): 144 | #return more_callbacks[0].all_filepaths[-1] 145 | training_complete = True#return self.Q #more_callbacks[0].all_filepaths[-1] 146 | self.buffer.save(os.path.join(os.getcwd(),'%s_data_{0}.h5' % self.env.env_type)) 147 | 148 | def __call__(self,*args): 149 | return self.Q.__call__(*args) 150 | 151 | def __deepcopy__(self, memo): 152 | return self 153 | 154 | class Performance(object): 155 | def __init__(self): 156 | self.goal = .85 157 | self.avg_over = 20 158 | self.costs = [] 159 | 160 | def reached_goal(self): 161 | if self.get_avg_performance() >= self.goal: 162 | return True 163 | else: 164 | return False 165 | 166 | def append(self, cost): 167 | self.costs.append(cost) 168 | 169 | def last(self): 170 | return np.round(self.costs[-1], 3) 171 | 172 | def get_avg_performance(self): 173 | num_iters = min(self.avg_over, len(self.costs)) 174 | return np.round(sum(self.costs[-num_iters:])/ float(num_iters), 3) 175 | 176 | 177 | class ModelCheckpointExtended(ModelCheckpoint): 178 | def __init__(self, filepath, max_to_keep=5, monitor='loss', *args, **kw): 179 | super(ModelCheckpointExtended, self).__init__(filepath, *args, **kw) 180 | self.max_to_keep = max_to_keep 181 | self.all_filepaths = [] 182 | 183 | def on_epoch_end(self, epoch, logs=None): 184 | 185 | super(ModelCheckpointExtended, self).on_epoch_end(epoch, logs) 186 | logs = logs or {} 187 | filepath = self.filepath.format(epoch=epoch + 1, **logs) 188 | 189 | self.all_filepaths.append(filepath) 190 | if len(self.all_filepaths) > self.max_to_keep: 191 | try: 192 | os.remove(self.all_filepaths.pop(0)) 193 | except: 194 | pass 195 | 196 | 197 | # class Buffer(object): 198 | # def __init__(self, buffer_size=10000): 199 | # self.data = [] 200 | # self.size = buffer_size 201 | # self.idx = -1 202 | 203 | # def append(self, datum): 204 | # self.idx = (self.idx + 1) % self.size 205 | 206 | # if len(self.data) > self.idx: 207 | # self.data[self.idx] = datum 208 | # else: 209 | # self.data.append(datum) 210 | 211 | # def sample(self, N): 212 | # N = min(N, len(self.data)) 213 | # rows = np.random.choice(len(self.data), size=N, replace=False) 214 | # return np.array(self.data)[rows] 215 | 216 | 217 | 218 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | numpy = "*" 10 | tensorflow-gpu = "*" 11 | gym = "*" 12 | keras = "*" 13 | pyyaml = "*" 14 | scipy = "*" 15 | tqdm = "*" 16 | keras-tqdm = "*" 17 | pandas = "*" 18 | matplotlib = "*" 19 | argparse = "*" 20 | "box2d-py" = "*" 21 | pyglet = "*" 22 | pyvirtualdisplay = "*" 23 | scikit-image = "*" 24 | deepdish = "*" 25 | seaborn = "*" 26 | 27 | [requires] 28 | python_version = "2.7" 29 | 
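For reference, a minimal sketch of the two pieces at the core of DeepQLearning.learn in DQN.py above, written with plain numpy and purely illustrative names (nothing below is repository code): epsilon-greedy action selection over a cost-minimizing Q-network, and the one-step regression target c + gamma * min_a' Q_target(x', a'), zeroed out for terminal transitions.

```python
import numpy as np

def epsilon_greedy_action(q_values, epsilon, rng=np.random):
    # q_values holds per-action costs; the greedy choice is the argmin
    # (the repo's Q-networks are trained on costs and expose min_over_a).
    if rng.rand() < epsilon:
        return rng.randint(len(q_values))
    return int(np.argmin(q_values))

def dqn_targets(batch_cost, min_q_target_next, batch_done, gamma):
    # Mirrors the line in DQN.py:
    #   target = batch_cost[:,0] + gamma*Q_target.min_over_a(x')[0]*(1-batch_done)
    return batch_cost + gamma * min_q_target_next * (1.0 - batch_done.astype(float))

# Example: dqn_targets(np.array([1., 0.]), np.array([0.5, 2.0]),
#                      np.array([False, True]), 0.95)
# returns array([1.475, 0.]) because the second transition is terminal.
```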
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # constrained_batch_policy_learning 2 | 3 | *Note: Use the --headless flag if using a server without a display. 4 | 5 | Otherwise, to run the main algorithm: 6 | ```python 7 | pip install pipenv 8 | pipenv install 9 | pipenv run python run.py -env car --headless 10 | ``` 11 | 12 | or, for lake, 13 | 14 | ```python 15 | pipenv run python run.py -env lake --headless 16 | ``` 17 | -------------------------------------------------------------------------------- /car_constraint_values_wo_band.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/car_constraint_values_wo_band.png -------------------------------------------------------------------------------- /car_main_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/car_main_value.png -------------------------------------------------------------------------------- /car_main_value_wo_band.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/car_main_value_wo_band.png -------------------------------------------------------------------------------- /config_car.py: -------------------------------------------------------------------------------- 1 | #### Setup Gym 2 | from car_racing import ExtendedCarRacing 3 | import itertools 4 | 5 | # env = gym.make('CarRacing-v0') 6 | init_seed = 2 7 | stochastic_env = False # = not deterministic 8 | max_pos_costs = 12 # The maximum allowable positive cost before ending episode early 9 | max_time_spent_in_episode = 2000 10 | env = ExtendedCarRacing(init_seed, stochastic_env, max_pos_costs) 11 | 12 | #### Hyperparam 13 | gamma = .95 14 | max_epochs = 3000 # max number of epochs over which to collect data 15 | max_Q_fitting_epochs = 50 #max number of epochs over which to converge to Q^\ast. Fitted Q Iter 16 | max_eval_fitting_epochs = 50 #max number of epochs over which to converge to Q^\pi. Off Policy Eval 17 | lambda_bound = 30. # l1 bound on lagrange multipliers 18 | epsilon = .01 # termination condition for two-player game 19 | deviation_from_old_policy_eps = 0.0 #With what probabaility to deviate from the old policy 20 | # convergence_epsilon = 1e-6 # termination condition for model convergence 21 | # action_space_dim = env.nA # action space dimension 22 | # state_space_dim = env.nS # state space dimension 23 | eta = .01 # param for exponentiated gradient algorithm 24 | # initial_states = [[0]] #The only initial state is [1,0...,0]. In general, this should be a list of initial states 25 | # non_terminal_states = np.nonzero(((env.desc == 'S') + (env.desc == 'F')).reshape(-1))[0] # Used for dynamic programming. this is an optimization to make the algorithm run faster. 
In general, you may not have this 26 | max_number_of_main_algo_iterations = 100 # After how many iterations to cut off the main algorithm 27 | model_type = 'cnn' 28 | # old_policy_name = 'pi_old_car_{0}.hdf5'.format(model_type) 29 | old_policy_name = 'pi_old_car_{0}_seed_2.hdf5'.format(model_type) 30 | freeze_cnn_layers = False 31 | starting_lambda = [1.,1.,28.] 32 | 33 | 34 | # Constraint 1: We'd like the number of times you brake to be less than 10% of the time 35 | # Constraint 2: We'd like the car to stay within 15 units of the center of the track 90% of the time 36 | constraint_thresholds = [1., 5.] + [1] 37 | constraints_cared_about = [-1,2] 38 | constraints = [5.8, 85.] + [0] 39 | 40 | ## DQN Param 41 | num_iterations = 3000 42 | sample_every_N_transitions = 4 43 | batchsize = 64 44 | copy_over_target_every_M_training_iterations = 250 45 | buffer_size = 20000 46 | min_epsilon = .01 47 | initial_epsilon = 1. 48 | epsilon_decay_steps = 1000 #num_iterations 49 | num_frame_stack=3 50 | min_buffer_size_to_train = 2000 51 | frame_skip=3 52 | pic_size = (96, 96, 3) 53 | 54 | # Other 55 | 56 | state_space_dim = (96, 96, num_frame_stack) 57 | 58 | # action_space_map = { 59 | # 0: [0.0, 0.0, 0.0], # Brake 60 | # 1: [-0.6, 0.05, 0.0], # Sharp left 61 | # 2: [0.6, 0.05, 0.0], # Sharp right 62 | # 3: [0.0, 0.3, 0.0] } # Staight 63 | 64 | action_space_map = {} 65 | for i, action in enumerate([k for k in itertools.product([-1, 0, 1], [1, 0], [0.2, 0])]): 66 | action_space_map[i] = action 67 | 68 | action_space_dim = len(action_space_map) 69 | prob = [1/float(action_space_dim)]*action_space_dim # Probability with which to explore space when deviating from old policy 70 | 71 | calculate_gap = False # Run Main algo. If False, it skips calc of primal-dual gap 72 | infinite_loop = True # Stop script if reached primal-dual gap threshold 73 | policy_improvement_name = 'car_policy_improvement.h5' 74 | results_name = 'car_results.csv' 75 | -------------------------------------------------------------------------------- /config_lake.py: -------------------------------------------------------------------------------- 1 | 2 | #### Setup Gym 3 | from frozen_lake import ExtendedFrozenLake 4 | import numpy as np 5 | 6 | map_size = 8 7 | # register( id='FrozenLake-no-slip-v0', entry_point='gym.envs.toy_text:FrozenLakeEnv', kwargs={'is_slippery': False, 'map_name':'{0}x{0}'.format(map_size)} ) 8 | # env = gym.make('FrozenLake-no-slip-v0') 9 | max_time_spent_in_episode = 100 10 | env = ExtendedFrozenLake(max_time_spent_in_episode, map_name = '{0}x{0}'.format(map_size), is_slippery= False) 11 | position_of_holes = np.arange(env.desc.shape[0]*env.desc.shape[1]).reshape(env.desc.shape)[np.nonzero(env.desc == 'H')] 12 | position_of_goals = np.arange(env.desc.shape[0]*env.desc.shape[1]).reshape(env.desc.shape)[np.nonzero(env.desc == 'G')] 13 | 14 | 15 | 16 | #### Hyperparam 17 | gamma = 0.9 18 | max_epochs = 5000 # max number of epochs over which to collect data 19 | max_Q_fitting_epochs = 30 #max number of epochs over which to converge to Q^\ast. Fitted Q Iter 20 | max_eval_fitting_epochs = 30 #max number of epochs over which to converge to Q^\pi. Off Policy Eval 21 | lambda_bound = 30. 
# l1 bound on lagrange multipliers 22 | epsilon = .01 # termination condition for two-player game 23 | deviation_from_old_policy_eps = .95 #With what probabaility to deviate from the old policy 24 | # convergence_epsilon = 1e-6 # termination condition for model convergence 25 | action_space_dim = env.nA # action space dimension 26 | state_space_dim = env.nS # state space dimension 27 | eta = 50. # param for exponentiated gradient algorithm 28 | initial_states = [[0]] #The only initial state is [1,0...,0]. In general, this should be a list of initial states 29 | non_terminal_states = np.nonzero(((env.desc == 'S') + (env.desc == 'F')).reshape(-1))[0] # Used for dynamic programming. this is an optimization to make the algorithm run faster. In general, you may not have this 30 | max_number_of_main_algo_iterations = 100 # After how many iterations to cut off the main algorithm 31 | model_type = 'mlp' 32 | old_policy_name = 'pi_old_map_size_{0}_{1}.h5'.format(map_size, model_type) 33 | constraints = [.1, 0] 34 | starting_lambda = 'uniform' 35 | 36 | ## DQN Param 37 | num_iterations = 5000 38 | sample_every_N_transitions = 10 39 | batchsize = 1000 40 | copy_over_target_every_M_training_iterations = 100 41 | buffer_size = 10000 42 | num_frame_stack=1 43 | min_buffer_size_to_train=0 44 | frame_skip = 1 45 | pic_size = tuple() 46 | min_epsilon = .02 47 | initial_epsilon = .3 48 | epsilon_decay_steps = 1000 #num_iterations 49 | min_buffer_size_to_train = 2000 50 | 51 | # Other 52 | stochastic_env = False 53 | action_space_map = { 54 | 0: 0, 55 | 1: 1, 56 | 2: 2, 57 | 3: 3 } 58 | 59 | prob = [1/float(action_space_dim)]*action_space_dim # Probability with which to explore space when deviating from old policy 60 | 61 | 62 | calculate_gap = True # Run Main algo. If False, it skips calc of primal-dual gap 63 | infinite_loop = True # Stop script if reached primal-dual gap threshold 64 | policy_improvement_name = 'car_policy_improvement.h5' 65 | results_name = 'car_results.csv' -------------------------------------------------------------------------------- /env_dqns.py: -------------------------------------------------------------------------------- 1 | 2 | from DQN import DeepQLearning 3 | from env_nn import * 4 | 5 | class LakeDQN(DeepQLearning): 6 | def __init__(self, *args, **kw): 7 | holes, goals = kw['position_of_holes'], kw['position_of_goals'] 8 | del kw['position_of_holes'] 9 | del kw['position_of_goals'] 10 | 11 | self.min_epsilon = kw['min_epsilon'] 12 | self.initial_epsilon = kw['initial_epsilon'] 13 | self.epsilon_decay_steps = kw['epsilon_decay_steps'] 14 | for key in ['min_epsilon', 'initial_epsilon', 'epsilon_decay_steps']: 15 | if key in kw: del kw[key] 16 | 17 | super(LakeDQN, self).__init__(*args, **kw) 18 | 19 | for key in ['action_space_map','max_time_spent_in_episode','num_iterations','sample_every_N_transitions','batchsize','copy_over_target_every_M_training_iterations', 'buffer_size', 'min_buffer_size_to_train', 'models_path']: 20 | if key in kw: del kw[key] 21 | 22 | kw['position_of_holes'],kw['position_of_goals'] = holes, goals 23 | self.state_space_dim = self.env.nS 24 | self.action_space_dim = self.env.nA 25 | self.Q = LakeNN(self.state_space_dim+self.action_space_dim, 1, [self.env.desc.shape[0], self.env.desc.shape[1]], self.action_space_dim, self.gamma, **kw) 26 | self.Q_target = LakeNN(self.state_space_dim+self.action_space_dim, 1, [self.env.desc.shape[0], self.env.desc.shape[1]], self.action_space_dim, self.gamma, **kw) 27 | 28 | def sample_random_action(self): 29 | ''' 30 | 
Uniform random 31 | ''' 32 | return np.random.choice(self.action_space_dim) 33 | 34 | # def epsilon(self, epoch=None, total_steps=None): 35 | # return 1./(total_steps/100 + 3) 36 | def epsilon(self, epoch=None, total_steps=None): 37 | if epoch >= self.epsilon_decay_steps: 38 | return self.min_epsilon 39 | else: 40 | alpha = epoch / float(self.epsilon_decay_steps) 41 | current_epsilon = self.initial_epsilon * (1-alpha) + self.min_epsilon * (alpha) 42 | return current_epsilon 43 | 44 | class CarDQN(DeepQLearning): 45 | def __init__(self, *args, **kw): 46 | 47 | self.gas_actions = None 48 | 49 | self.min_epsilon = kw['min_epsilon'] 50 | self.initial_epsilon = kw['initial_epsilon'] 51 | self.epsilon_decay_steps = kw['epsilon_decay_steps'] 52 | self.action_space_dim = kw['action_space_dim'] 53 | for key in ['action_space_dim', 'min_epsilon', 'initial_epsilon', 'epsilon_decay_steps']: 54 | if key in kw: del kw[key] 55 | 56 | super(CarDQN, self).__init__(*args, **kw) 57 | for key in ['action_space_map','max_time_spent_in_episode','num_iterations','sample_every_N_transitions','batchsize','copy_over_target_every_M_training_iterations', 'buffer_size', 'min_buffer_size_to_train', 'models_path']: 58 | if key in kw: del kw[key] 59 | 60 | from config_car import state_space_dim 61 | self.state_space_dim = state_space_dim 62 | self.Q = CarNN(self.state_space_dim, self.action_space_dim, self.gamma, **kw) 63 | self.Q_target = CarNN(self.state_space_dim, self.action_space_dim, self.gamma, **kw) 64 | 65 | def sample_random_action(self): 66 | ''' 67 | Biased (toward movement) random 68 | ''' 69 | if self.gas_actions is None: 70 | self.gas_actions = {key:val[1] == 1 and val[2] == 0 for key,val in self.action_space_map.iteritems()} 71 | 72 | action_weights = 14. * np.array(self.gas_actions.values()) + 1.0 73 | action_weights /= np.sum(action_weights) 74 | 75 | return np.random.choice(self.gas_actions.keys(), p=action_weights) 76 | # return np.random.choice(self.action_space_dim) 77 | 78 | def epsilon(self, epoch=None, total_steps=None): 79 | if epoch >= self.epsilon_decay_steps: 80 | # return max(.08*((2000-epoch)/1000), 0.) 
+ .02 81 | return self.min_epsilon 82 | else: 83 | alpha = epoch / float(self.epsilon_decay_steps) 84 | current_epsilon = self.initial_epsilon * (1-alpha) + self.min_epsilon * (alpha) 85 | return current_epsilon 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /exact_policy_evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import scipy.signal as signal 5 | from replay_buffer import Buffer 6 | import os 7 | 8 | 9 | class ExactPolicyEvaluator(object): 10 | def __init__(self, action_space_map, gamma, env=None, num_frame_stack=None, frame_skip = None, pic_size = None, constraint_thresholds=None, constraints_cared_about=None): 11 | ''' 12 | An implementation of Exact Policy Evaluation through Monte Carlo 13 | 14 | In this case since the environment is fixed and initial states are fixed 15 | then this will be exact 16 | ''' 17 | self.gamma = gamma 18 | self.action_space_map = action_space_map 19 | self.constraint_thresholds = constraint_thresholds 20 | self.constraints_cared_about = constraints_cared_about 21 | 22 | self.num_frame_stack = num_frame_stack 23 | self.frame_skip = frame_skip 24 | self.pic_size = pic_size 25 | self.buffer_size = int(2000) 26 | self.min_buffer_size_to_train = 0 27 | 28 | # self.initial_states = initial_states 29 | # self.state_space_dim = state_space_dim 30 | if env is not None: 31 | self.env = env 32 | else: 33 | raise 34 | 35 | self.monitor = Monitor(self.env, 'videos') 36 | 37 | def run(self, policy, *args, **kw): 38 | 39 | environment_is_dynamic = not self.env.deterministic 40 | 41 | if 'policy_is_greedy' not in kw: 42 | kw['policy_is_greedy']=True 43 | policy_is_greedy=True 44 | else: 45 | policy_is_greedy= kw['policy_is_greedy'] 46 | 47 | if not isinstance(policy,(list,)): 48 | policy = [policy] 49 | 50 | 51 | if not environment_is_dynamic and policy_is_greedy: 52 | c,g,perf = self.determinstic_env_and_greedy_policy(policy, **kw) 53 | if len(args) > 0: 54 | if args[0] == 'c': 55 | return c 56 | else: 57 | try: 58 | return g[i] 59 | except: 60 | if isinstance(g,(list,)) and len(g) > 1: 61 | assert False, 'Index error' 62 | else: 63 | return g 64 | else: 65 | return c,g,perf 66 | 67 | else: 68 | return self.stochastic_env_or_policy(policy, **kw) 69 | 70 | def get_Qs(self, policy, initial_states, state_space_dim, idx=0): 71 | Q = [] 72 | for initial_state in initial_states: 73 | self.env.isd = np.eye(state_space_dim)[initial_state] 74 | 75 | if not isinstance(policy,(list,)): 76 | policy = [policy] 77 | Q.append(self.determinstic_env_and_greedy_policy(policy, render=False, verbose=False)[idx]) 78 | 79 | self.env.isd = np.eye(state_space_dim)[0] 80 | return Q 81 | 82 | def stochastic_env_or_policy(self, policy, render=False, verbose=False, **kw): 83 | ''' 84 | Run the evaluator 85 | ''' 86 | 87 | all_c = [] 88 | all_g = [] 89 | if len(policy) > 1: import pdb; pdb.set_trace() 90 | for pi in policy: 91 | trial_c = [] 92 | trial_g = [] 93 | for i in range(1): 94 | c = [] 95 | g = [] 96 | self.buffer = Buffer(num_frame_stack= self.num_frame_stack,buffer_size= self.buffer_size,min_buffer_size_to_train= self.min_buffer_size_to_train,pic_size = self.pic_size,) 97 | x = self.env.reset() 98 | self.buffer.start_new_episode(x) 99 | done = False 100 | time_steps = 0 101 | 102 | while not done: 103 | time_steps += 1 104 | if (self.env.env_type in ['car']) or render: self.env.render() 105 | 106 | action = 
pi([self.buffer.current_state()])[0] 107 | 108 | cost = [] 109 | for _ in range(self.frame_skip): 110 | x_prime, costs, done, _ = self.env.step(self.action_space_map[action]) 111 | # if self.render: 112 | # self.env.render() 113 | cost.append(costs) 114 | if done: 115 | break 116 | 117 | cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0) 118 | if self.constraint_thresholds is not None: 119 | cost[1:][self.constraints_cared_about] = np.array(cost[1:])[self.constraints_cared_about] >= self.constraint_thresholds[:-1] 120 | 121 | 122 | early_done, _ = self.env.is_early_episode_termination(cost=cost[0], time_steps=time_steps, total_cost=sum(c)) 123 | done = done or early_done 124 | self.buffer.append(action, x_prime, cost[0], done) 125 | 126 | if verbose: print x,action,x_prime,cost 127 | 128 | c.append(cost[0].tolist()) 129 | g.append(cost[1:].tolist()) 130 | 131 | x = x_prime 132 | trial_c.append(c) 133 | trial_g.append(g) 134 | 135 | all_c.append(np.mean([self.discounted_sum(x, self.gamma) for x in trial_c])) 136 | all_g.append(np.mean([ [self.discounted_sum(cost, self.gamma) for cost in np.array(x).T] for x in trial_g], axis=0).tolist()) 137 | # all_g.append(np.mean([self.discounted_sum(x, self.gamma) for x in trial_g])) 138 | 139 | c = np.mean(all_c, axis=0) 140 | g = np.mean(all_g, axis=0) 141 | 142 | return c,g 143 | 144 | 145 | def determinstic_env_and_greedy_policy(self, policy, render=False, verbose=False, to_monitor=False, **kw): 146 | ''' 147 | Run the evaluator 148 | ''' 149 | 150 | all_c = [] 151 | all_g = [] 152 | for pi in policy: 153 | c = [] 154 | g = [] 155 | self.buffer = Buffer(num_frame_stack= self.num_frame_stack, 156 | buffer_size= self.buffer_size, 157 | min_buffer_size_to_train= self.min_buffer_size_to_train, 158 | pic_size = self.pic_size,) 159 | x = self.env.reset() 160 | if (self.env.env_type in ['car']) or render: self.env.render() 161 | self.buffer.start_new_episode(x) 162 | done = False 163 | time_steps = 0 164 | if to_monitor: 165 | self.monitor.delete() 166 | while not done: 167 | if (self.env.env_type in ['car']) or render: 168 | if to_monitor: self.monitor.save() 169 | # self.env.render() 170 | time_steps += 1 171 | 172 | action = pi(self.buffer.current_state())[0] 173 | # action = np.argmin(pi.model.predict(np.rollaxis(np.dot(self.buffer.current_state()/255. 
, [0.299, 0.587, 0.114])[np.newaxis,...],1,4))) 174 | # print self.action_space_map[action] 175 | # import pdb; pdb.set_trace() 176 | cost = [] 177 | for _ in range(self.frame_skip): 178 | x_prime, costs, done, _ = self.env.step(self.action_space_map[action]) 179 | # if self.render: 180 | if (self.env.env_type in ['car']) or render: self.env.render() 181 | cost.append(costs) 182 | if done: 183 | break 184 | 185 | cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0) 186 | if self.constraint_thresholds is not None: 187 | pass 188 | #cost[1:][self.constraints_cared_about] = np.array(cost[1:])[self.constraints_cared_about] >= self.constraint_thresholds[:-1] 189 | 190 | 191 | early_done, punishment = self.env.is_early_episode_termination(cost=cost[0], time_steps=time_steps, total_cost=sum(c)) 192 | done = done or early_done 193 | 194 | self.buffer.append(action, x_prime, cost[0]+punishment, done) 195 | 196 | # if verbose: print x,action,x_prime,cost 197 | #print time_steps, cost[0], action 198 | # if (time_steps % 50) ==0 : print time_steps, cost[0]+punishment, action 199 | # print cost[0] + punishment 200 | c.append(cost[0] + punishment) 201 | g.append(cost[1:]) 202 | 203 | # x_prime , cost, done, _ = self.env.step(self.action_space_map[action]) 204 | # done = done or self.env.is_early_episode_termination(cost=cost[0], time_steps=time_steps) 205 | # self.buffer.append(action, x_prime, cost[0], done) 206 | 207 | # if verbose: print x,action,x_prime,cost 208 | # if render: self.env.render() 209 | # c.append(cost[0]) 210 | # g.append(cost[1]) 211 | 212 | x = x_prime 213 | all_c.append(c) 214 | all_g.append(g) 215 | 216 | if to_monitor: self.monitor.make_video() 217 | if self.env.env_type in ['car']: 218 | print 'Performance: %s/%s = %s' % (self.env.tile_visited_count, len(self.env.track), self.env.tile_visited_count/float(len(self.env.track))) 219 | # import pdb; pdb.set_trace() 220 | c = np.mean([self.discounted_sum(x, self.gamma) for x in all_c]) 221 | g = np.mean([ [self.discounted_sum(cost, self.gamma) for cost in np.array(x).T] for x in all_g], axis=0).tolist() 222 | # g = np.mean([self.discounted_sum(np.array(x), self.gamma) for x in all_g], axis=0).tolist() 223 | 224 | if not isinstance(g,(list,)): 225 | g = [g] 226 | 227 | if self.env.env_type in ['car']: 228 | return c,g, self.env.tile_visited_count/float(len(self.env.track)) 229 | else: 230 | return c,g, -c 231 | 232 | @staticmethod 233 | def discounted_sum(costs, discount): 234 | ''' 235 | Calculate discounted sum of costs 236 | ''' 237 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 238 | return y[::-1][0] 239 | 240 | class Monitor(object): 241 | def __init__(self, env, filepath): 242 | self.frame_num = 0 243 | self.vid_num = 0 244 | self.filepath = os.path.join(os.getcwd(), filepath) 245 | if not os.path.exists(self.filepath): 246 | os.makedirs(self.filepath) 247 | self.image_name = "image%05d.png" 248 | self.env = env 249 | self.images = [] 250 | 251 | def save(self): 252 | import matplotlib.pyplot as plt 253 | full_path = os.path.join(self.filepath, self.image_name % self.frame_num) 254 | self.images.append(full_path) 255 | # plt.imsave(full_path, self.env.render('rgb_array')) 256 | im = self.env.render('human', render_human=True) 257 | plt.imsave(full_path, im) 258 | self.frame_num += 1 259 | 260 | def make_video(self): 261 | import subprocess 262 | current_dir = os.getcwd() 263 | os.chdir(self.filepath) 264 | # #'ffmpeg -framerate 8 -i image%05d.png -r 30 -pix_fmt yuv420p car_vid_0.mp4' 265 | subprocess.call([ 266 
| 'ffmpeg', '-hide_banner', '-loglevel', 'panic', '-framerate', '8', '-i', self.image_name, '-r', '30', '-pix_fmt', 'yuv420p', 267 | 'car_vid_%s.mp4' % self.vid_num 268 | ]) 269 | 270 | self.vid_num += 1 271 | self.frame_num = 0 272 | os.chdir(current_dir) 273 | 274 | def delete(self): 275 | self.frame_num = 0 276 | current_dir = os.getcwd() 277 | os.chdir(self.filepath) 278 | 279 | for file_name in [f for f in os.listdir(os.getcwd()) if '.png' in f]: 280 | os.remove(file_name) 281 | 282 | os.chdir(current_dir) 283 | 284 | 285 | 286 | 287 | -------------------------------------------------------------------------------- /experimental_results/hyperparam_2018_12_18_22_20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/experimental_results/hyperparam_2018_12_18_22_20.png -------------------------------------------------------------------------------- /experimental_results/lspi.csv: -------------------------------------------------------------------------------- 1 | "(0.0, 0.0, -30.87238959928826, 12.867082852069641, 148.164007478981, 0.7107142857142857)","(0.0, 0.1, -25.94545768997468, 21.70199027024002, 89.24081946770697, 0.65)","(0.0, 0.2, -30.168532518598116, 22.257021833854374, 107.39076865114626, 0.65)","(0.0, 0.30000000000000004, -1.8828082912671142, 23.397868442728914, 14.194376691321793, 0.02142857142857143)","(0.0, 0.4, -4.590344009192178, 30.983625620907457, 3.4567222606865475, 0.02142857142857143)","(0.0, 0.5, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.0, 0.6000000000000001, 4.6413991292960235, 28.22368726462355, 9.465169261876653, 0.014285714285714285)","(0.0, 0.7000000000000001, 4.6413991292960235, 28.22368726462355, 9.465169261876653, 0.014285714285714285)","(0.0, 0.8, 2.297939913687321, 35.56502472794603, 9.664349912267248, 0.014285714285714285)","(0.0, 0.9, 1.9727973822886717, 36.24283135092373, 9.055573661191975, 0.014285714285714285)","(0.0, 1.0, 10.891434020469823, 24.150004214974345, 4.705301261323525, 0.010714285714285714)","(0.1, 0.0, -24.748313381309416, 13.469845336577208, 114.67437549922263, 0.10357142857142858)","(0.1, 0.1, -32.67756129871718, 18.849802000450236, 86.13297012296938, 0.9285714285714286)","(0.1, 0.2, -20.91840426304937, 16.0054493056005, 122.95385125055645, 0.95)","(0.1, 0.30000000000000004, -18.83435329655354, 24.141972805866068, 69.66527454287201, 0.6321428571428571)","(0.1, 0.4, -2.7137844892848833, 25.569733788701384, 10.119662766900444, 0.02142857142857143)","(0.1, 0.5, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.1, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.1, 0.7000000000000001, 4.6413991292960235, 28.22368726462355, 9.465169261876653, 0.014285714285714285)","(0.1, 0.8, 2.297939913687321, 35.56502472794603, 9.664349912267248, 0.014285714285714285)","(0.1, 0.9, 1.9727973822886717, 36.24283135092373, 9.055573661191975, 0.014285714285714285)","(0.1, 1.0, 10.891434020469823, 24.150004214974345, 4.705301261323525, 0.010714285714285714)","(0.2, 0.0, -24.748313381309416, 13.469845336577208, 114.67437549922263, 0.10357142857142858)","(0.2, 0.1, -32.6975504649815, 18.844508484929076, 85.99521166871655, 0.7464285714285714)","(0.2, 0.2, -16.14420423787202, 18.482310503597816, 135.3242842447658, 0.7535714285714286)","(0.2, 0.30000000000000004, 
-17.96785285098834, 21.592840875710646, 68.27198666070595, 0.95)","(0.2, 0.4, -1.8828082912671142, 23.514053630877942, 14.0929711230479, 0.02142857142857143)","(0.2, 0.5, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.2, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.2, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.2, 0.8, -0.9988878403025545, 35.29319244811491, 20.86673618851739, 0.017857142857142856)","(0.2, 0.9, 1.9727973822886717, 36.24283135092373, 9.055573661191975, 0.014285714285714285)","(0.2, 1.0, 2.640195209896431, 34.96761121280503, 11.512898307490847, 0.014285714285714285)","(0.30000000000000004, 0.0, -37.366352683512886, 8.655131082877158, 130.78663469709892, 0.4607142857142857)","(0.30000000000000004, 0.1, -35.23633331836208, 15.116653410287167, 104.24597832621448, 0.4785714285714286)","(0.30000000000000004, 0.2, -16.778659768121216, 18.383569331456822, 135.50375610072064, 0.95)","(0.30000000000000004, 0.30000000000000004, -22.79139400282686, 16.18029772432275, 94.60950347085678, 0.7714285714285715)","(0.30000000000000004, 0.4, -6.526389520136453, 26.876610715963384, 21.941088013242688, 0.025)","(0.30000000000000004, 0.5, 10.891434020469823, 21.577879214974345, 3.7486161816997186, 0.010714285714285714)","(0.30000000000000004, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.30000000000000004, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.30000000000000004, 0.8, 1.9727973822886717, 36.24283135092373, 12.04864268380703, 0.014285714285714285)","(0.30000000000000004, 0.9, 5.106994384221383, 27.503868817202427, 14.004486089260535, 0.014285714285714285)","(0.30000000000000004, 1.0, 2.640195209896431, 34.96761121280503, 11.512898307490847, 0.014285714285714285)","(0.4, 0.0, -49.51808233133347, 8.402237596363767, 130.08088253489376, 0.8321428571428572)","(0.4, 0.1, -35.23633331836208, 15.124203849946902, 104.12115167308805, 0.4785714285714286)","(0.4, 0.2, -38.47822503478941, 5.915208989147174, 140.33440044802362, 0.7464285714285714)","(0.4, 0.30000000000000004, -29.374755790747926, 13.309517746562904, 94.9673478042272, 0.95)","(0.4, 0.4, -13.178527186331424, 30.335514310987765, 39.514564863047845, 0.14642857142857144)","(0.4, 0.5, 10.891434020469823, 18.26723791515186, 4.089257249059158, 0.010714285714285714)","(0.4, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.4, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.4, 0.8, 1.9727973822886717, 36.24283135092373, 12.04864268380703, 0.014285714285714285)","(0.4, 0.9, 10.891434020469823, 24.033937074349346, 6.274179040273764, 0.010714285714285714)","(0.4, 1.0, 10.891434020469823, 24.033937074349346, 6.251521015690073, 0.010714285714285714)","(0.5, 0.0, -30.998114130432857, 7.516518168786301, 166.9666230677311, 0.10357142857142858)","(0.5, 0.1, -35.08117877648681, 15.111430052428831, 116.87681106485796, 0.6642857142857143)","(0.5, 0.2, -38.43576004754668, 5.903004129312425, 140.22778928687947, 0.6535714285714286)","(0.5, 0.30000000000000004, -31.501277382990466, 15.928346140297581, 120.07952966692767, 0.9285714285714286)","(0.5, 0.4, -1.8828082912671142, 22.511819094699007, 13.489989370595756, 0.02142857142857143)","(0.5, 0.5, -0.5869787715565078, 
26.56534165044393, 1.9626366011276195, 0.017857142857142856)","(0.5, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.5, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.5, 0.8, 5.106994384221383, 27.503868817202427, 14.967075475005078, 0.014285714285714285)","(0.5, 0.9, 1.9727973822886717, 36.24283135092373, 12.04864268380703, 0.014285714285714285)","(0.5, 1.0, 10.891434020469823, 24.033937074349346, 6.251521015690073, 0.010714285714285714)","(0.6000000000000001, 0.0, -30.998114130432857, 7.516518168786301, 166.95827914277348, 0.10357142857142858)","(0.6000000000000001, 0.1, -49.77443204139867, 3.1525241053713087, 142.88979119642954, 0.7571428571428571)","(0.6000000000000001, 0.2, -36.37118181046612, 2.168736141789653, 125.12577496981973, 0.7535714285714286)","(0.6000000000000001, 0.30000000000000004, -23.917619918151, 18.612682590090827, 109.51548998648326, 0.7571428571428571)","(0.6000000000000001, 0.4, -16.80912853057475, 25.051325386796755, 55.33028450551636, 0.7392857142857143)","(0.6000000000000001, 0.5, -2.1114579498688713, 23.533503973563352, 11.433697403414804, 0.02142857142857143)","(0.6000000000000001, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.6000000000000001, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.6000000000000001, 0.8, 5.106994384221383, 27.503868817202427, 14.967075475005078, 0.014285714285714285)","(0.6000000000000001, 0.9, 1.9727973822886717, 36.24283135092373, 12.04864268380703, 0.014285714285714285)","(0.6000000000000001, 1.0, 10.891434020469823, 24.033937074349346, 6.251521015690073, 0.010714285714285714)","(0.7000000000000001, 0.0, -30.517809626923295, 3.115532712000771, 140.22804519004828, 0.10714285714285714)","(0.7000000000000001, 0.1, -53.59647932118728, 0.6339452797313537, 157.22639231995117, 0.9285714285714286)","(0.7000000000000001, 0.2, -34.3907008089484, 2.831270113650506, 102.43360109954152, 0.625)","(0.7000000000000001, 0.30000000000000004, -23.885364246748072, 18.655017987744426, 109.6178053788328, 0.6535714285714286)","(0.7000000000000001, 0.4, -23.172040167102544, 19.593291455152045, 93.3109526893351, 0.6357142857142857)","(0.7000000000000001, 0.5, -1.8828082912671142, 23.533503973563352, 13.489989370595756, 0.02142857142857143)","(0.7000000000000001, 0.6000000000000001, 10.891434020469823, 21.577879214974345, 4.915735172714833, 0.010714285714285714)","(0.7000000000000001, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.7000000000000001, 0.8, 5.106994384221383, 27.503868817202427, 14.967075475005078, 0.014285714285714285)","(0.7000000000000001, 0.9, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.7000000000000001, 1.0, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.8, 0.0, -30.517809626923295, 1.8575583526703745, 150.84933819779647, 0.10714285714285714)","(0.8, 0.1, -53.59328533339544, 0.6241644732013664, 157.20796735572, 0.9142857142857143)","(0.8, 0.2, -44.350115646278844, 2.370291860876593, 150.79763802436585, 0.6535714285714286)","(0.8, 0.30000000000000004, -36.65360783356822, 6.85555252981692, 127.66890081348451, 0.6535714285714286)","(0.8, 0.4, -28.69023667233265, 13.513905397977704, 95.1969742458172, 0.7464285714285714)","(0.8, 0.5, -1.8828082912671142, 23.533503973563352, 
13.489989370595756, 0.02142857142857143)","(0.8, 0.6000000000000001, 10.891434020469823, 21.577879214974345, 4.915735172714833, 0.010714285714285714)","(0.8, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.8, 0.8, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.8, 0.9, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.8, 1.0, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.9, 0.0, -30.517809626923295, 1.476668335005992, 152.9233128473738, 0.10714285714285714)","(0.9, 0.1, -53.35815769429593, 0.6317049705640946, 156.96606713556662, 0.6571428571428571)","(0.9, 0.2, -44.48193015587891, 2.297478229398147, 150.35972803712596, 0.9214285714285714)","(0.9, 0.30000000000000004, -26.24859128746181, 5.450342796473387, 190.0706381948163, 0.09642857142857143)","(0.9, 0.4, -22.94466852822807, 17.705861613273452, 96.6822227929777, 0.75)","(0.9, 0.5, -1.8828082912671142, 22.511819094699007, 13.489989370595756, 0.02142857142857143)","(0.9, 0.6000000000000001, -0.8804199061437927, 23.36313968711469, 13.5047025718302, 0.017857142857142856)","(0.9, 0.7000000000000001, 5.106994384221383, 27.253086629702423, 12.641385213197506, 0.014285714285714285)","(0.9, 0.8, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.9, 0.9, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.9, 1.0, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(1.0, 0.0, -30.517809626923295, 1.4401887435416998, 152.96870715709716, 0.10714285714285714)","(1.0, 0.1, -51.77812949566279, 0.6536474492784501, 159.9790481111361, 0.4035714285714286)","(1.0, 0.2, -27.626541265577274, 1.531876264717034, 187.6652679786134, 0.09642857142857143)","(1.0, 0.30000000000000004, -34.373835628250184, 2.002533434179407, 147.72391148128506, 0.46785714285714286)","(1.0, 0.4, -19.091133223784908, 20.983568683604602, 123.45859743847552, 0.9178571428571428)","(1.0, 0.5, -1.760592374266131, 21.781018854056324, 16.58261445985894, 0.02142857142857143)","(1.0, 0.6000000000000001, -0.8804199061437927, 23.36313968711469, 13.5047025718302, 0.017857142857142856)","(1.0, 0.7000000000000001, 5.106994384221383, 27.253086629702423, 12.641385213197506, 0.014285714285714285)","(1.0, 0.8, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(1.0, 0.9, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(1.0, 1.0, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)" 2 | -------------------------------------------------------------------------------- /experimental_results/policy_improvement_grid.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/experimental_results/policy_improvement_grid.h5 -------------------------------------------------------------------------------- /experimental_results_car/experiment_results_2019_01_03_11_00.csv: -------------------------------------------------------------------------------- 1 | iteration,max_L,min_L,c_exact_avg,g_exact_avg_0,g_exact_avg_1,c_avg,g_avg_0,g_avg_1,c_pi_exact,g_pi_exact_0,g_pi_exact_1,c_pi,g_pi_0,g_pi_1,lambda_0,lambda_1,c_br_exact,g_br_exact_0,g_br_exact_1,c_br,g_br_0,g_br_1 2 | 
2.0,6.148661136627197,-574.5399553239504,-198.1867753728582,0.9244763240899999,14.042931089729445,6.148661136627197,6.882289762794971,5.676133691892028,-140.75793204722302,0.9342286880693633,13.112063586052079,10.269734382629395,12.21979808807373,12.74373722076416,9.969514664921242,9.962982068916636,-134.8524506497504,0.0,13.243908279465264,2.394774913787842,0.8382666707038879,1.3645581007003784 3 | 3.0,6.425106048583984,-584.576169644116,-165.98762484239143,0.6501060330924572,14.21169368156832,6.425106048583984,4.148929041624069,3.4076230419799685,-153.6947530179227,0.998001,18.619457899348593,15.05801010131836,1.6015316247940063,0.35395070910453796,9.925027444371498,9.918910075482357,-101.32163468214613,0.0,24.72728498336814,1.4825304746627808,-0.08765091747045517,1.2559328079223633 4 | 4.0,6.676574230194092,-569.3260919943367,-147.7454015120166,0.5654166130502738,15.375663136097678,6.676574230194092,3.9049627065658568,3.539063963169853,-23.490450456600467,0.9801888648295347,11.68609128093784,9.921900749206543,3.4011237621307373,4.371957302093506,9.879761311445243,9.87338172331725,-126.48704510723287,0.0,12.922694045301075,3.5963618755340576,2.5682566165924072,-0.21287555992603302 5 | 5.0,6.462437629699707,-574.2233519092224,-140.63167187065525,0.4723902128926595,15.73495434720103,6.462437629699707,3.7522922288626432,3.3337234503589572,-130.00775776067255,0.0,21.10419176407419,9.533533096313477,3.3523638248443604,3.174515962600708,9.835181814902072,9.827126631899771,-2.5386004956734625,0.0,0.0,4.74336576461792,1.9225209951400757,-0.30694302916526794 6 | 6.0,5.546809196472168,-592.8923824372823,-163.9243814309871,0.37791217031412755,15.587451973055595,5.546809196472168,3.62703645080328,2.845789700075984,-148.0896022302071,0.0,13.257165444910175,-0.44830286502838135,5.215179443359375,2.8287134170532227,9.79177020002498,9.774996883869964,-217.62097308094042,0.0,7.742500215588536,-0.10064613074064255,0.0935686007142067,-0.009326789528131485 7 | -------------------------------------------------------------------------------- /experimental_results_car/experiment_results_2019_01_03_15_00.csv: -------------------------------------------------------------------------------- 1 | iteration,max_L,min_L,c_exact_avg,g_exact_avg_0,g_exact_avg_1,c_avg,g_avg_0,g_avg_1,c_pi_exact,g_pi_exact_0,g_pi_exact_1,c_pi,g_pi_0,g_pi_1,lambda_0,lambda_1,c_br_exact,g_br_exact_0,g_br_exact_1,c_br,g_br_0,g_br_1 2 | 2.0,6.148661136627197,-574.5399553239504,-198.1867753728582,0.9244763240899999,14.042931089729445,6.148661136627197,6.882289762794971,5.676133691892028,-140.75793204722302,0.9342286880693633,13.112063586052079,10.269734382629395,12.21979808807373,12.74373722076416,9.969514664921242,9.962982068916636,-134.8524506497504,0.0,13.243908279465264,2.394774913787842,0.8382666707038879,1.3645581007003784 3 | 3.0,6.425106048583984,-584.576169644116,-165.98762484239143,0.6501060330924572,14.21169368156832,6.425106048583984,4.148929041624069,3.4076230419799685,-153.6947530179227,0.998001,18.619457899348593,15.05801010131836,1.6015316247940063,0.35395070910453796,9.925027444371498,9.918910075482357,-101.32163468214613,0.0,24.72728498336814,1.4825304746627808,-0.08765091747045517,1.2559328079223633 4 | 
4.0,6.676574230194092,-569.3260919943367,-147.7454015120166,0.5654166130502738,15.375663136097678,6.676574230194092,3.9049627065658568,3.539063963169853,-23.490450456600467,0.9801888648295347,11.68609128093784,9.921900749206543,3.4011237621307373,4.371957302093506,9.879761311445243,9.87338172331725,-126.48704510723287,0.0,12.922694045301075,3.5963618755340576,2.5682566165924072,-0.21287555992603302 5 | 5.0,6.462437629699707,-574.2233519092224,-140.63167187065525,0.4723902128926595,15.73495434720103,6.462437629699707,3.7522922288626432,3.3337234503589572,-130.00775776067255,0.0,21.10419176407419,9.533533096313477,3.3523638248443604,3.174515962600708,9.835181814902072,9.827126631899771,-2.5386004956734625,0.0,0.0,4.74336576461792,1.9225209951400757,-0.30694302916526794 6 | 6.0,5.546809196472168,-592.8923824372823,-163.9243814309871,0.37791217031412755,15.587451973055595,5.546809196472168,3.62703645080328,2.845789700075984,-148.0896022302071,0.0,13.257165444910175,-0.44830286502838135,5.215179443359375,2.8287134170532227,9.79177020002498,9.774996883869964,-217.62097308094042,0.0,7.742500215588536,-0.10064613074064255,0.0935686007142067,-0.009326789528131485 7 | 7.0,3.827444553375244,-591.6153939177314,-162.18717787838338,0.3472098444537572,16.372180663182757,3.827444553375244,4.382718875755867,3.091123585837583,-147.51519705540926,0.0,15.621867960993018,-14.129043579101562,16.55801773071289,18.87964630126953,9.75821439310982,9.718220090422738,-113.76937185658383,0.0,24.657626784931256,0.13024234771728516,-0.016394317150115967,0.06715112924575806 8 | 8.0,3.0648725032806396,-501.46193763842234,-162.46840066296284,0.29760843810322046,16.00592572188844,3.0648725032806396,4.796520133316517,3.1554629401969057,-338.85815308597677,0.0,11.121831116525849,-1.2769678831100464,8.990571022033691,7.094027042388916,9.726399901907964,9.67252877144249,-87.02938452345659,0.0,20.770772745096483,-0.606652021408081,1.2419638633728027,7.908010005950928 9 | 9.0,2.7094998359680176,-569.1150850238164,-171.525873743171,0.355815195191935,15.498535665696489,2.7094998359680176,5.233984155859798,3.252672763634473,-165.41534154458387,0.0,11.678721787001823,2.0317790508270264,14.147116661071777,7.994142055511475,9.692725269397314,9.622130287830839,-136.9522744322237,0.0,15.791085469438224,-1.2493207454681396,0.956859290599823,1.256043791770935 10 | 10.0,2.4874157905578613,-557.3489573717159,-165.57653206627097,0.3363665591698666,15.424112272134662,2.4874157905578613,4.853659489378333,3.0225775541530715,-38.919015507108824,0.0,6.769833000615041,2.765155553817749,5.4268574714660645,3.130974769592285,9.649147925210638,9.572164286242321,-67.00296205336274,0.9029834676116293,13.576669165799476,0.1681966334581375,1.9059392213821411,1.2325890064239502 11 | 11.0,2.563368320465088,-556.9338330950056,-168.31466905274178,0.3170199628355614,15.74150682802861,2.563368320465088,4.789618060551584,3.2093044956028463,-43.477502465370364,0.0,12.540185809619366,0.6309247612953186,6.421274185180664,9.964588165283203,9.604771170052947,9.525801578257058,-147.296070173987,0.0,16.477174533949274,3.9456264972686768,1.4120573997497559,1.2542179822921753 12 | 12.0,2.4370322227478027,-527.423361767437,-172.48643098335398,0.30432482417025697,15.853650768603806,2.4370322227478027,5.0258558272976765,3.3600188912315803,-150.0063790306519,0.0,17.822908796974087,-6.6260833740234375,13.684722900390625,13.825427055358887,9.565612396055803,9.478238170086911,-44.128799941426564,0.0,14.512974359373565,-4.980665683746338,1.489221453666687,4.991724491119385 13 | 
13.0,1.02047860622406,-543.5842780136225,-173.80114758790893,0.34818484257911464,16.387899557940642,1.02047860622406,5.1424289941477275,3.4068895703802506,-137.33306714979042,0.0,24.57957710003057,-31.677888870239258,9.981054306030273,8.623197555541992,9.528966004662346,9.43153405836201,-110.30751104064173,0.0,12.249665961879503,1.4286595582962036,2.4452579021453857,1.5892701148986816 14 | 14.0,-0.36623436212539673,-566.9795281820868,-171.58695901705673,0.3358173468247523,16.572092958476333,-0.36623436212539673,5.055496841181929,3.507722426033937,-137.1271940787331,0.9370369888620198,14.985253027358416,-30.010042190551758,6.458477020263672,10.072896957397461,9.48499368693826,9.38665464694548,-66.22942833376419,0.0,9.590799591462146,2.6964595317840576,0.9590728282928467,0.4115790128707886 15 | -------------------------------------------------------------------------------- /exponentiated_gradient.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | 5 | class ExponentiatedGradient(object): 6 | def __init__(self, lambda_bound, number_of_constraints, eta=1., starting_lambda='uniform'): 7 | ''' 8 | ''' 9 | self.eta = eta 10 | self.lambda_bound = lambda_bound 11 | self.number_of_constraints = number_of_constraints 12 | if starting_lambda == 'uniform': 13 | self.w_t = self.lambda_bound*np.ones(self.number_of_constraints)/self.number_of_constraints 14 | else: 15 | self.w_t = starting_lambda 16 | self.lambda_bound = np.sum(starting_lambda) 17 | 18 | def run(self, gradient): 19 | self.w_t = self.w_t/self.lambda_bound 20 | unnormalized_wt = self.w_t*np.exp(self.eta*gradient) # positive since working w/ costs. 21 | self.w_t = self.lambda_bound*unnormalized_wt/sum(unnormalized_wt) 22 | return self.w_t 23 | 24 | def get(self): 25 | return self.w_t -------------------------------------------------------------------------------- /fitted_algo.py: -------------------------------------------------------------------------------- 1 | 2 | from keras import backend as K 3 | import numpy as np 4 | 5 | class FittedAlgo(object): 6 | def __init__(self): 7 | ''' 8 | An implementation of fitted Q iteration 9 | 10 | num_inputs: number of inputs 11 | dim_of_actions: dimension of action space 12 | max_epochs: positive int, specifies how many iterations to run the algorithm 13 | gamma: discount factor 14 | ''' 15 | 16 | def init_Q(self): 17 | ''' 18 | Absract function 19 | ''' 20 | pass 21 | 22 | def fit(self, X, y, epsilon=1e-10, **kw): 23 | # D_k = {(X,y)} is the dataset of the kth iteration of Fitted Q 24 | # self.Q_k = self.init_Q(epsilon) 25 | # K.set_value(self.Q_k.model.optimizer.iterations, 0) 26 | self.Q_k.epsilon = epsilon 27 | self.Q_k.fit(X, y, **kw) 28 | 29 | def fit_generator(self, generator, epsilon=1e-10, **kw): 30 | # D_k = {(X,y)} is the dataset of the kth iteration of Fitted Q 31 | # self.Q_k = self.init_Q(epsilon) 32 | # K.set_value(self.Q_k.model.optimizer.iterations, 0) 33 | self.Q_k.epsilon = epsilon 34 | self.Q_k.fit_generator(generator, **kw) 35 | 36 | def skim(self, X_a, x_prime): 37 | full_set = np.hstack([X_a, x_prime.reshape(1,-1).T]) 38 | idxs = np.unique(full_set, axis=0, return_index=True)[1] 39 | return idxs 40 | 41 | def run(self, dataset): 42 | ''' 43 | Abstract function 44 | ''' 45 | pass 46 | 47 | 48 | -------------------------------------------------------------------------------- /fitted_off_policy_evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | from fitted_algo 
import FittedAlgo 3 | import numpy as np 4 | from tqdm import tqdm 5 | from env_nn import * 6 | from thread_safe import threadsafe_generator 7 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 8 | 9 | class LakeFittedQEvaluation(FittedAlgo): 10 | def __init__(self, initial_states, num_inputs, grid_shape, dim_of_actions, max_epochs, gamma,model_type='mlp', position_of_goals=None, position_of_holes=None, num_frame_stack=None): 11 | 12 | ''' 13 | An implementation of fitted Q iteration 14 | 15 | num_inputs: number of inputs 16 | dim_of_actions: dimension of action space 17 | max_epochs: positive int, specifies how many iterations to run the algorithm 18 | gamma: discount factor 19 | ''' 20 | self.model_type = model_type 21 | self.initial_states = initial_states 22 | self.num_inputs = num_inputs 23 | self.dim_of_actions = dim_of_actions 24 | self.max_epochs = max_epochs 25 | self.gamma = gamma 26 | self.grid_shape = grid_shape 27 | self.position_of_holes = position_of_holes 28 | self.position_of_goals = position_of_goals 29 | self.num_frame_stack = num_frame_stack 30 | 31 | super(LakeFittedQEvaluation, self).__init__() 32 | 33 | def run(self, policy, which_cost, dataset, epochs=500, epsilon=1e-8, desc='FQE', g_idx=None, **kw): 34 | # dataset is the original dataset generated by pi_{old} to which we will find 35 | # an approximately optimal Q 36 | 37 | self.Q_k = self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 38 | 39 | X_a = np.hstack(dataset.get_state_action_pairs('lake')) 40 | x_prime = dataset['x_prime'] 41 | 42 | index_of_skim = self.skim(X_a, x_prime) 43 | X_a = X_a[index_of_skim] 44 | x_prime = x_prime[index_of_skim] 45 | dataset.set_cost(which_cost, idx=g_idx) 46 | dataset_costs = dataset['cost'][index_of_skim] 47 | dones = dataset['done'][index_of_skim] 48 | pi_of_x_prime = policy(x_prime) 49 | x_prime = x_prime.reshape(-1) 50 | 51 | values = [] 52 | for k in tqdm(range(self.max_epochs), desc=desc): 53 | 54 | # {((x,a), r+gamma* Q(x',pi(x')))} 55 | 56 | # if k == 0: 57 | # # Q_0 = 0 everywhere 58 | # costs = dataset_costs 59 | # else: 60 | costs = dataset_costs + (self.gamma*self.Q_k(x_prime, pi_of_x_prime).reshape(-1)*(1-dones.astype(int))).reshape(-1) 61 | 62 | # if (k >= (self.max_epochs-100)): K.set_value(self.Q_k.model.optimizer.lr, 0.00001) 63 | self.fit(X_a, costs, epochs=epochs, batch_size=X_a.shape[0], epsilon=epsilon, evaluate=False, verbose=0) 64 | values.append(np.mean([self.Q_k(state, policy(state)) for state in self.initial_states])) 65 | print values[-1] 66 | # if not self.Q_k.callbacks_list[0].converged: 67 | # print 'Continuing training due to lack of convergence' 68 | # self.fit(X_a, costs, epochs=epochs, batch_size=X_a.shape[0], epsilon=epsilon, evaluate=False, verbose=0) 69 | 70 | return np.mean(values[-10:]), values #np.mean([self.Q_k(state, policy(state)) for state in self.initial_states]) 71 | 72 | def init_Q(self, epsilon=1e-10, **kw): 73 | return LakeNN(self.num_inputs, 1, self.grid_shape, self.dim_of_actions, self.gamma, epsilon, **kw) 74 | 75 | class CarFittedQEvaluation(FittedAlgo): 76 | def __init__(self, state_space_dim, 77 | dim_of_actions, 78 | max_epochs, 79 | gamma, 80 | model_type='cnn', 81 | num_frame_stack=None): 82 | 83 | ''' 84 | An implementation of fitted Q iteration 85 | 86 | num_inputs: number of inputs 87 | dim_of_actions: dimension of action space 88 | max_epochs: positive int, specifies how many iterations to run the algorithm 89 | gamma: discount factor 90 | ''' 91 | 
'''
self.model_type = model_type 92 | 93 | 94 | self.state_space_dim = state_space_dim 95 | self.dim_of_actions = dim_of_actions 96 | self.max_epochs = max_epochs 97 | self.gamma = gamma 98 | self.num_frame_stack = num_frame_stack 99 | self.Q_k = None 100 | self.Q_k_minus_1 = None 101 | 102 | earlyStopping = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='min', restore_best_weights=True) 103 | mcp_save = ModelCheckpoint('fqi.hdf5', save_best_only=True, monitor='val_loss', mode='min') 104 | reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min') 105 | 106 | self.more_callbacks = [earlyStopping, mcp_save, reduce_lr_loss] 107 | 108 | super(CarFittedQEvaluation, self).__init__() 109 | 110 | def run(self, policy, which_cost, dataset, epochs=1, epsilon=1e-8, desc='FQE', g_idx=None, testing=True, **kw): 111 | # dataset is the original dataset generated by pi_{old} to which we will find 112 | # an approximately optimal Q 113 | 114 | dataset.set_cost(which_cost, idx=g_idx) 115 | print 'Scale: ', dataset.scale 116 | # try: 117 | # initial_states = np.unique([episode.frames[[0]*episode.num_frame_stack] for episode in dataset.episodes], axis=0) 118 | # except: 119 | # initial_states = np.rollaxis(dataset['frames'][dataset['prev_states'][[0]]],1,4) 120 | 121 | initial_states = np.rollaxis(dataset['frames'][dataset['prev_states'][[0]]],1,4) 122 | 123 | # if self.Q_k is None: 124 | self.Q_k = self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 125 | self.Q_k_minus_1 = self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 126 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][[0]]], 1,4) 127 | self.Q_k.min_over_a([x_prime], x_preprocessed=True)[0] 128 | self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0] 129 | self.Q_k.copy_over_to(self.Q_k_minus_1) 130 | values = [] 131 | 132 | for k in tqdm(range(self.max_epochs), desc=desc): 133 | batch_size = 32 134 | 135 | dataset_length = len(dataset) 136 | perm = np.random.permutation(range(dataset_length)) 137 | eighty_percent_of_set = int(1.*len(perm)) 138 | training_idxs = perm[:eighty_percent_of_set] 139 | validation_idxs = perm[eighty_percent_of_set:] 140 | training_steps_per_epoch = int(.3 * np.ceil(len(training_idxs)/float(batch_size))) 141 | validation_steps_per_epoch = int(np.ceil(len(validation_idxs)/float(batch_size))) 142 | # steps_per_epoch = 1 #int(np.ceil(len(dataset)/float(batch_size))) 143 | train_gen = self.generator(policy, dataset, training_idxs, fixed_permutation=True, batch_size=batch_size) 144 | # val_gen = self.generator(policy, dataset, validation_idxs, fixed_permutation=True, batch_size=batch_size) 145 | 146 | self.fit_generator(train_gen, 147 | steps_per_epoch=training_steps_per_epoch, 148 | #validation_data=val_gen, 149 | #validation_steps=validation_steps_per_epoch, 150 | epochs=epochs, 151 | max_queue_size=10, 152 | workers=4, 153 | use_multiprocessing=False, 154 | epsilon=epsilon, 155 | evaluate=False, 156 | verbose=0, 157 | additional_callbacks = self.more_callbacks) 158 | self.Q_k.copy_over_to(self.Q_k_minus_1) 159 | if testing: 160 | actions = policy(initial_states[:,np.newaxis,...], x_preprocessed=True) 161 | assert len(actions) == initial_states.shape[0] 162 | Q_val = self.Q_k.all_actions([initial_states], x_preprocessed=True)[np.arange(len(actions)), actions] 163 | values.append(np.mean(Q_val)*dataset.scale) 164 | 165 | # initial_states = 
self.Q_k.representation(initial_states) 166 | if testing: 167 | return np.mean(values[-10:]), values 168 | actions = policy(initial_states[:,np.newaxis,...], x_preprocessed=True) 169 | Q_val = self.Q_k.all_actions([initial_states], x_preprocessed=True)[np.arange(len(actions)), actions] 170 | return np.mean(Q_val)*dataset.scale, values 171 | 172 | @threadsafe_generator 173 | def generator(self, policy, dataset, training_idxs, fixed_permutation=False, batch_size = 64): 174 | data_length = len(training_idxs) 175 | steps = int(np.ceil(data_length/float(batch_size))) 176 | i = -1 177 | amount_of_data_calcd = 0 178 | if fixed_permutation: 179 | calcd_costs = np.empty((len(training_idxs),), dtype='float64') 180 | while True: 181 | i = (i + 1) % steps 182 | # print 'Getting batch: %s to %s' % ((i*batch_size),((i+1)*batch_size)) 183 | if fixed_permutation: 184 | if i == 0: perm = np.random.permutation(training_idxs) 185 | batch_idxs = perm[(i*batch_size):((i+1)*batch_size)] 186 | else: 187 | batch_idxs = np.random.choice(training_idxs, batch_size) 188 | # amount_of_data_calcd += len(batch_idxs) 189 | # import pdb; pdb.set_trace() 190 | 191 | X = np.rollaxis(dataset['frames'][dataset['prev_states'][batch_idxs]],1,4) 192 | actions = np.atleast_2d(dataset['a'][batch_idxs]).T 193 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][batch_idxs]],1,4) 194 | dataset_costs = dataset['cost'][batch_idxs] 195 | dones = dataset['done'][batch_idxs] 196 | policy_action = dataset['pi_of_x_prime'][batch_idxs] 197 | 198 | # if fixed_permutation: 199 | # if amount_of_data_calcd <= data_length: 200 | # costs = dataset_costs + self.gamma*self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0]*(1-dones.astype(int)) 201 | # calcd_costs[(i*batch_size):((i+1)*batch_size)] = costs 202 | # else: 203 | # costs = calcd_costs[(i*batch_size):((i+1)*batch_size)] 204 | # else: 205 | # policy_action = policy(x_prime[:,np.newaxis,...], x_preprocessed=True) 206 | Q_val = self.Q_k_minus_1.all_actions([x_prime], x_preprocessed=True)[np.arange(len(policy_action)), policy_action] 207 | costs = dataset_costs + (self.gamma*Q_val.reshape(-1)*(1-dones.astype(int))).reshape(-1) 208 | 209 | X = self.Q_k_minus_1.representation([X], actions, x_preprocessed=True) 210 | 211 | yield (X, costs) 212 | 213 | def init_Q(self, epsilon=1e-10, **kw): 214 | return CarNN(self.state_space_dim, self.dim_of_actions, self.gamma, convergence_of_model_epsilon=epsilon, **kw) 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /fittedq.py: -------------------------------------------------------------------------------- 1 | 2 | from fitted_algo import FittedAlgo 3 | import numpy as np 4 | from tqdm import tqdm 5 | from env_nn import * 6 | from thread_safe import threadsafe_generator 7 | from keras import backend as K 8 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 9 | 10 | class LakeFittedQIteration(FittedAlgo): 11 | def __init__(self, num_inputs, grid_shape, dim_of_actions, max_epochs, gamma, model_type='mlp', position_of_goals=None, position_of_holes=None, num_frame_stack=None): 12 | ''' 13 | An implementation of fitted Q iteration 14 | 15 | num_inputs: number of inputs 16 | dim_of_actions: dimension of action space 17 | max_epochs: positive int, specifies how many iterations to run the algorithm 18 | gamma: discount factor 19 | ''' 20 | self.model_type = model_type 21 | self.num_inputs = num_inputs 22 | self.grid_shape= grid_shape 23 | 
self.dim_of_actions = dim_of_actions 24 | self.max_epochs = max_epochs 25 | self.gamma = gamma 26 | self.position_of_goals = position_of_goals 27 | self.position_of_holes = position_of_holes 28 | self.num_frame_stack = num_frame_stack 29 | 30 | super(LakeFittedQIteration, self).__init__() 31 | 32 | 33 | def run(self, dataset, epochs=3000, epsilon=1e-8, desc='FQI', **kw): 34 | # dataset is the original dataset generated by pi_{old} to which we will find 35 | # an approximately optimal Q 36 | 37 | self.Q_k = self.init_Q(model_type=self.model_type, position_of_holes=self.position_of_holes, position_of_goals=self.position_of_goals, num_frame_stack=self.num_frame_stack, **kw) 38 | 39 | X_a = np.hstack(dataset.get_state_action_pairs()) 40 | x_prime = dataset['x_prime'] 41 | 42 | index_of_skim = self.skim(X_a, x_prime) 43 | X_a = X_a[index_of_skim] 44 | x_prime = x_prime[index_of_skim] 45 | dataset_costs = dataset['cost'][index_of_skim] 46 | dones = dataset['done'][index_of_skim] 47 | 48 | for k in tqdm(range(self.max_epochs), desc=desc): 49 | 50 | # {((x,a), c+gamma*min_a Q(x',a))} 51 | costs = dataset_costs + self.gamma*self.Q_k.min_over_a(x_prime)[0]*(1-dones.astype(int)) 52 | 53 | self.fit(X_a, costs, epochs=epochs, batch_size=X_a.shape[0], epsilon=epsilon, evaluate=False, verbose=0) 54 | # import pdb; pdb.set_trace() 55 | 56 | # if not self.Q_k.callbacks_list[0].converged: 57 | # print 'Continuing training due to lack of convergence' 58 | # self.fit(X_a, costs, epochs=epochs, batch_size=X_a.shape[0], epsilon=epsilon, evaluate=False, verbose=0) 59 | 60 | return self.Q_k, [] 61 | 62 | def init_Q(self, epsilon=1e-10, **kw): 63 | return LakeNN(self.num_inputs, 1, self.grid_shape, self.dim_of_actions, self.gamma, convergence_of_model_epsilon=epsilon, **kw) 64 | 65 | 66 | class CarFittedQIteration(FittedAlgo): 67 | def __init__(self, state_space_dim, 68 | dim_of_actions, 69 | max_epochs, 70 | gamma, 71 | model_type='cnn', 72 | num_frame_stack=None, 73 | initialization=None, 74 | freeze_cnn_layers=False): 75 | ''' 76 | An implementation of fitted Q iteration 77 | 78 | num_inputs: number of inputs 79 | dim_of_actions: dimension of action space 80 | max_epochs: positive int, specifies how many iterations to run the algorithm 81 | gamma: discount factor 82 | ''' 83 | self.initialization = initialization 84 | self.freeze_cnn_layers = freeze_cnn_layers 85 | self.model_type = model_type 86 | self.state_space_dim = state_space_dim 87 | self.dim_of_actions = dim_of_actions 88 | self.max_epochs = max_epochs 89 | self.gamma = gamma 90 | self.num_frame_stack = num_frame_stack 91 | self.Q_k = None 92 | self.Q_k_minus_1 = None 93 | 94 | earlyStopping = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='min', restore_best_weights=True) 95 | mcp_save = ModelCheckpoint('fqi.hdf5', save_best_only=True, monitor='val_loss', mode='min') 96 | reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min') 97 | 98 | self.more_callbacks = [earlyStopping, mcp_save, reduce_lr_loss] 99 | 100 | super(CarFittedQIteration, self).__init__() 101 | 102 | 103 | def run(self, dataset, epochs=1, epsilon=1e-8, desc='FQI', exact=None, **kw): 104 | # dataset is the original dataset generated by pi_{old} to which we will find 105 | # an approximately optimal Q 106 | 107 | # if self.Q_k is None: 108 | self.Q_k = self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 109 | self.Q_k_minus_1 = 
self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 110 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][[0]]], 1,4) 111 | self.Q_k.min_over_a([x_prime], x_preprocessed=True)[0] 112 | self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0] 113 | self.Q_k.copy_over_to(self.Q_k_minus_1) 114 | values = [] 115 | 116 | for k in tqdm(range(self.max_epochs), desc=desc): 117 | batch_size = 64 118 | 119 | dataset_length = len(dataset) 120 | perm = np.random.permutation(range(dataset_length)) 121 | eighty_percent_of_set = int(1.*len(perm)) 122 | training_idxs = perm[:eighty_percent_of_set] 123 | validation_idxs = perm[eighty_percent_of_set:] 124 | training_steps_per_epoch = int(np.ceil(len(training_idxs)/float(batch_size))) 125 | validation_steps_per_epoch = int(np.ceil(len(validation_idxs)/float(batch_size))) 126 | # steps_per_epoch = 1 #int(np.ceil(len(dataset)/float(batch_size))) 127 | train_gen = self.generator(dataset, training_idxs, fixed_permutation=True, batch_size=batch_size) 128 | # val_gen = self.generator(dataset, validation_idxs, fixed_permutation=True, batch_size=batch_size) 129 | if (k >= (self.max_epochs-10)): K.set_value(self.Q_k.model.optimizer.lr, 0.0001) 130 | self.fit_generator(train_gen, 131 | steps_per_epoch=training_steps_per_epoch, 132 | #validation_data=val_gen, 133 | #validation_steps=validation_steps_per_epoch, 134 | epochs=epochs, 135 | max_queue_size=10, 136 | workers=4, 137 | use_multiprocessing=False, 138 | epsilon=epsilon, 139 | evaluate=False, 140 | verbose=0, 141 | additional_callbacks = self.more_callbacks) 142 | self.Q_k.copy_over_to(self.Q_k_minus_1) 143 | if k >= (self.max_epochs-10): 144 | c,g,perf = exact.run(self.Q_k,to_monitor=k==self.max_epochs) 145 | values.append([c,perf]) 146 | 147 | return self.Q_k, values 148 | 149 | @threadsafe_generator 150 | def generator(self, dataset, training_idxs, fixed_permutation=False, batch_size = 64): 151 | data_length = len(training_idxs) 152 | steps = int(np.ceil(data_length/float(batch_size))) 153 | i = -1 154 | amount_of_data_calcd = 0 155 | if fixed_permutation: 156 | calcd_costs = np.empty((len(training_idxs),), dtype='float64') 157 | while True: 158 | i = (i + 1) % steps 159 | # print 'Getting batch: %s to %s' % ((i*batch_size),((i+1)*batch_size)) 160 | if fixed_permutation: 161 | if i == 0: perm = np.random.permutation(training_idxs) 162 | batch_idxs = perm[(i*batch_size):((i+1)*batch_size)] 163 | else: 164 | batch_idxs = np.random.choice(training_idxs, batch_size) 165 | # amount_of_data_calcd += len(batch_idxs) 166 | # import pdb; pdb.set_trace() 167 | 168 | X = np.rollaxis(dataset['frames'][dataset['prev_states'][batch_idxs]],1,4) 169 | actions = np.atleast_2d(dataset['a'][batch_idxs]).T 170 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][batch_idxs]],1,4) 171 | dataset_costs = dataset['cost'][batch_idxs] 172 | dones = dataset['done'][batch_idxs] 173 | 174 | # if fixed_permutation: 175 | # if amount_of_data_calcd <= data_length: 176 | # costs = dataset_costs + self.gamma*self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0]*(1-dones.astype(int)) 177 | # calcd_costs[(i*batch_size):((i+1)*batch_size)] = costs 178 | # else: 179 | # costs = calcd_costs[(i*batch_size):((i+1)*batch_size)] 180 | # else: 181 | costs = dataset_costs + self.gamma*self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0]*(1-dones.astype(int)) 182 | 183 | X = self.Q_k_minus_1.representation([X], actions, x_preprocessed=True) 184 | 185 | yield (X, costs) 186 
| 187 | def init_Q(self, epsilon=1e-10, **kw): 188 | model = CarNN(self.state_space_dim, self.dim_of_actions, self.gamma, convergence_of_model_epsilon=epsilon, freeze_cnn_layers=self.freeze_cnn_layers, **kw) 189 | if (self.initialization is not None) and self.freeze_cnn_layers: 190 | self.initialization.Q.copy_over_to(model) 191 | for layer in model.model.layers: 192 | if layer.trainable: 193 | try: 194 | layer.kernel.initializer.run( session = K.get_session() ) 195 | except: 196 | pass 197 | try: 198 | layer.bias.initializer.run( session = K.get_session() ) 199 | except: 200 | pass 201 | return model 202 | -------------------------------------------------------------------------------- /fixed_policy.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | 3 | 4 | import numpy as np 5 | from copy import deepcopy 6 | 7 | class FixedPolicy(Model): 8 | def __init__(self, policy, action_space_dim, policy_evalutor): 9 | ''' 10 | A fixed manual policy 11 | ''' 12 | super(FixedPolicy, self).__init__() 13 | self.policy = policy 14 | self.action_space_dim = action_space_dim 15 | 16 | #debug purposes 17 | self.policy_evalutor = deepcopy(policy_evalutor) 18 | self.Q = None 19 | self.get_Q_val() 20 | 21 | def get_Q_val(self): 22 | self.policy_evalutor.initial_states = np.hstack([np.nonzero((self.policy_evalutor.env.desc == 'S').reshape(-1))[0], np.nonzero((self.policy_evalutor.env.desc == 'F').reshape(-1))[0]]) 23 | self.Q_tmp = self.policy_evalutor.get_Qs(self, self.policy_evalutor.initial_states, 64) 24 | 25 | self.Q = {} 26 | for idx, state in enumerate(self.policy_evalutor.initial_states): 27 | self.Q[state] = np.eye(self.action_space_dim)[self.policy[state]]*(self.Q_tmp[idx]-1e-7) 28 | 29 | def copy_over_to(self, to_): 30 | pass 31 | 32 | def predict(self, X_a): 33 | pass # return [self.model[np.argmax(x_a[:-self.action_space_dim], axis = 1)] == np.argmax(x_a[-self.action_space_dim:], axis=1) for x_a in X_a] 34 | 35 | def fit(self, X, y, verbose=0): 36 | pass 37 | 38 | def representation(self, *args, **kw): 39 | if len(args) == 1: 40 | return args[0] 41 | elif len(args) == 2: 42 | return args[0], args[1] 43 | else: 44 | raise NotImplemented 45 | 46 | def all_actions(self, X, **kw): 47 | if self.Q is None: 48 | return np.array([-np.eye(self.action_space_dim)[self.policy[x]] for x in X]) 49 | else: 50 | arr = [] 51 | for x in X: 52 | try: 53 | arr.append(self.Q[x]) 54 | except: 55 | arr.append([0]*self.action_space_dim) 56 | return np.array(arr) 57 | 58 | -------------------------------------------------------------------------------- /fqe_quality_test_generalization.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | np.set_printoptions(suppress=True) 4 | np.random.seed(314) 5 | import tensorflow as tf 6 | from optimization_problem import Dataset 7 | from fittedq import FittedQIteration 8 | from fixed_policy import FixedPolicy 9 | from fitted_off_policy_evaluation import FittedQEvaluation 10 | from exact_policy_evaluation import ExactPolicyEvaluator 11 | from inverse_propensity_scoring import InversePropensityScorer 12 | from exact_policy_evaluation import ExactPolicyEvaluator 13 | from optimal_policy import DeepQLearning 14 | from print_policy import PrintPolicy 15 | from keras.models import load_model 16 | import pandas as pd 17 | import matplotlib 18 | matplotlib.use('TkAgg') 19 | import matplotlib.pyplot as plt 20 | 21 | ### 22 | #paths 23 | import os 24 | model_dir = 
os.path.join(os.getcwd(), 'models') 25 | if not os.path.exists(model_dir): 26 | os.makedirs(model_dir) 27 | ### 28 | 29 | #### Setup Gym 30 | import gym 31 | from gym.envs.registration import register 32 | map_size = [4,4] 33 | register( id='FrozenLake-no-slip-v0', entry_point='gym.envs.toy_text:FrozenLakeEnv', kwargs={'is_slippery': False, 'map_name':'{0}x{1}'.format(map_size[0], map_size[1])} ) 34 | env = gym.make('FrozenLake-no-slip-v0') 35 | position_of_holes = np.arange(env.desc.shape[0]*env.desc.shape[1]).reshape(env.desc.shape)[np.nonzero(env.desc == 'H')] 36 | position_of_goals = np.arange(env.desc.shape[0]*env.desc.shape[1]).reshape(env.desc.shape)[np.nonzero(env.desc == 'G')] 37 | 38 | #### Hyperparam 39 | gamma = 0.9 40 | max_fitting_epochs = 10 #max number of epochs over which to converge to Q^\ast 41 | lambda_bound = 10. # l1 bound on lagrange multipliers 42 | epsilon = .01 # termination condition for two-player game 43 | deviation_from_old_policy_eps = .7 #With what probabaility to deviate from the old policy 44 | # convergence_epsilon = 1e-6 # termination condition for model convergence 45 | action_space_dim = env.nA # action space dimension 46 | state_space_dim = env.nS # state space dimension 47 | eta = 10. # param for exponentiated gradient algorithm 48 | initial_states = [[0]] #The only initial state is [1,0...,0]. In general, this should be a list of initial states 49 | policy_evaluator = ExactPolicyEvaluator(initial_states, state_space_dim, gamma) 50 | 51 | #### Get a decent policy. Called pi_old because this will be the policy we use to gather data 52 | policy_old = None 53 | old_policy_path = os.path.join(model_dir, 'pi_old.h5') 54 | policy_old = DeepQLearning(env, gamma) 55 | if not os.path.isfile(old_policy_path): 56 | print 'Learning a policy using DQN' 57 | policy_old.learn() 58 | policy_old.Q.model.save(old_policy_path) 59 | print policy_old.Q.evaluate(render=True) 60 | else: 61 | print 'Loading a policy' 62 | policy_old.Q.model = load_model(old_policy_path) 63 | print policy_old.Q.evaluate(render=True) 64 | 65 | print 'Old Policy' 66 | PrintPolicy(env=env).pprint(policy_old) 67 | 68 | # model_dict = {0: 1, 4: 1, 8: 0} 69 | # for i in range(grid_size*grid_size): 70 | # if i not in model_dict: 71 | # model_dict[i] = np.random.randint(action_space_dim) 72 | # policy_old = FixedPolicy(model_dict, action_space_dim, policy_evaluator) 73 | # PrintPolicy().pprint(policy_old) 74 | 75 | ### Policy to evaluate 76 | model_dict = {0: 1, 4: 1, 8: 2, 9: 1, 13: 2, 14: 2} 77 | for i in range(map_size[0]*map_size[1]): 78 | if i not in model_dict: 79 | model_dict[i] = np.random.randint(action_space_dim) 80 | policy = FixedPolicy(model_dict, action_space_dim, policy_evaluator) 81 | 82 | print 'Evaluate this policy:' 83 | PrintPolicy(env=env).pprint(policy) 84 | 85 | #### Problem setup 86 | 87 | def main(policy_old, policy, model_type='cnn'): 88 | 89 | fqi = FittedQIteration(state_space_dim + action_space_dim, map_size, action_space_dim, max_fitting_epochs, gamma,model_type =model_type ) 90 | fqe = FittedQEvaluation(initial_states, state_space_dim + action_space_dim, map_size, action_space_dim, max_fitting_epochs, gamma,model_type =model_type ) 91 | ips = InversePropensityScorer(action_space_dim) 92 | exact_evaluation = ExactPolicyEvaluator(initial_states, state_space_dim, gamma, env) 93 | 94 | max_epochs = np.array([1000]) # np.arange(50,1060,100) # max number of epochs over which to collect data 95 | epsilons = np.array([.25]) # np.array([.5]) 96 | trials = np.array([1,2]) # 
np.arange(20) 97 | eps_epochs_trials = cartesian_product(epsilons, max_epochs,trials) 98 | 99 | all_trials_estimators = [] 100 | for epsilon in epsilons: 101 | 102 | trials_estimators = [] 103 | for epochs in max_epochs: 104 | 105 | trial_estimators = [] 106 | for trial in trials: 107 | estimators = run_trial(policy_old, policy, epochs, epsilon, fqi, fqe, ips, exact_evaluation) 108 | 109 | trial_estimators.append(estimators) 110 | trials_estimators.append(trial_estimators) 111 | 112 | all_trials_estimators.append(trials_estimators) 113 | 114 | # print epsilon, np.mean(all_trials_evaluated[-1]), np.mean(all_trials_approx_ips[-1]), np.mean(all_trials_exact_ips[-1]), np.mean(all_trials_exact[-1]) 115 | 116 | results = np.hstack([eps_epochs_trials, np.array(all_trials_estimators).reshape(-1, np.array(all_trials_estimators).shape[-1])]) 117 | df = pd.DataFrame(results, columns=['epsilon', 'num_trajectories', 'trial_num', 'exact','fqe']) 118 | df.to_csv('fqe_quality.csv', index=False) 119 | 120 | def run_trial(policy_old, policy, epochs, epsilon, fqi, fqe, ips, exact_evaluation): 121 | #### Collect Data 122 | num_goal = 0 123 | num_hole = 0 124 | dataset = Dataset([0], action_space_dim) 125 | dataset_removed = Dataset([0], action_space_dim) 126 | 127 | data = [] 128 | mapping = {0:np.array([0,-1]), 2:np.array([0,1]), 1:np.array([1,0]), 3:np.array([-1,0])} 129 | for x in set(np.nonzero(env.desc.reshape(-1) == 'F')[0]).union(set(np.nonzero(env.desc.reshape(-1) == 'S')[0])) : 130 | for action in range(4): 131 | 132 | # if x == 4: import pdb; pdb.set_trace() 133 | row = int(x/map_size[1]) 134 | col = int(x - row*int(map_size[1])) 135 | 136 | new_row, new_col = np.array([row, col]) + mapping[action] 137 | if (new_row < 0) or (new_row > (map_size[0]-1)): 138 | new_row, new_col = row, col 139 | elif (new_col < 0) or (new_col > (map_size[1]-1)): 140 | new_row, new_col = row, col 141 | else: 142 | pass 143 | x_prime = new_row*map_size[1] + new_col 144 | 145 | if (env.desc[new_row, new_col] == 'H') or (env.desc[new_row, new_col] == 'G'): 146 | done = True 147 | else: 148 | done = False 149 | 150 | if env.desc[new_row, new_col] == 'G': 151 | goal = True 152 | else: 153 | goal = False 154 | 155 | data.append([x,action,x_prime,-goal,done and not goal,done]) 156 | 157 | 158 | for idx, datum in enumerate(data): 159 | count = idx % 4 160 | 161 | if count == 0: 162 | must_keep = 0 163 | kept = 0 164 | 165 | 166 | if (count == 3) and (kept == 0): 167 | must_keep = 1 168 | 169 | if (not must_keep) and (np.random.choice([0,1], p=[epsilon, 1-epsilon])): 170 | kept += 1 171 | dataset.append(*datum) 172 | else: 173 | dataset_removed.append(*datum) 174 | 175 | 176 | dataset.preprocess() 177 | dataset_removed.preprocess() 178 | 179 | print 'Distribution:' 180 | print np.histogram(dataset['x'], bins=np.arange(map_size[0]*map_size[1]+1)-.5)[0].reshape(map_size) 181 | 182 | print 'Distribution:' 183 | print np.histogram(dataset['x_prime'], bins=np.arange(map_size[0]*map_size[1]+1)-.5)[0].reshape(map_size) 184 | 185 | 186 | dataset.set_cost('c') 187 | dataset_removed.set_cost('c') 188 | 189 | # Exact 190 | exact = exact_evaluation.run(policy)[0] 191 | print exact 192 | 193 | # Importance Sampling 194 | # approx_ips, exact_ips, approx_pdis, exact_pdis = ips.run(dataset, policy, policy_old, epsilon, gamma) 195 | 196 | # FQE 197 | 198 | for eps in [1e-3]: 199 | 200 | evaluated = [] 201 | for i in range(1): 202 | evaluated.append(fqe.run(dataset, policy, epochs=5000, epsilon=eps, desc='FQE epsilon %s' % 
np.round(epsilon,2),position_of_holes=position_of_holes, position_of_goals=position_of_goals)) 203 | PrintPolicy(env=env).pprint(fqe.Q_k) 204 | 205 | print evaluated[-1] 206 | 207 | evaluated = np.mean(evaluated) 208 | print evaluated 209 | 210 | print np.mean((fqe.Q_k(dataset['x'], dataset['a']).T - (dataset['cost'] + gamma*fqe.Q_k(dataset['x_prime'], policy(dataset['x_prime']) )[0]*(1-dataset['done'])))**2) 211 | print np.vstack([dataset['x'], dataset['a'], np.round((fqe.Q_k(dataset['x'], dataset['a']).T - (dataset['cost'] + gamma*fqe.Q_k(dataset['x_prime'], policy(dataset['x_prime']) )[0]*(1-dataset['done'])))**2, 2)]).T 212 | if len(dataset_removed['x']) > 0: 213 | print np.mean((fqe.Q_k(dataset_removed['x'], dataset_removed['a']).T - (dataset_removed['cost'] + gamma*fqe.Q_k(dataset_removed['x_prime'], policy(dataset_removed['x_prime']))[0]*(1-dataset_removed['done'])))**2) 214 | 215 | df = pd.DataFrame(np.vstack([dataset['x'], dataset['a'], dataset['x_prime'], dataset['cost'], dataset['done'], np.round(fqe.Q_k(dataset['x'], dataset['a']),3).T, np.around(dataset['cost'] + gamma*fqe.Q_k(dataset['x_prime'], policy(dataset['x_prime'])).T*(1-dataset['done']),2) , (fqe.Q_k(dataset['x'], dataset['a']).T - (dataset['cost'] + gamma*fqe.Q_k(dataset['x_prime'], policy(dataset['x_prime'])).T*(1-dataset['done']) )) ]).T, columns = ['x','a','x_prime','c','done','Q(x,a)', 'Q(x_,pi(x_))', 'diff']) 216 | df_outside = pd.DataFrame(np.vstack([dataset_removed['x'], dataset_removed['a'], dataset_removed['x_prime'], dataset_removed['cost'], dataset_removed['done'], np.round(fqe.Q_k(dataset_removed['x'], dataset_removed['a']),3).T, np.around(dataset_removed['cost'] + gamma*fqe.Q_k(dataset_removed['x_prime'], policy(dataset_removed['x_prime'])).T*(1-dataset_removed['done']),2) , (fqe.Q_k(dataset_removed['x'], dataset_removed['a']).T - (dataset_removed['cost'] + gamma*fqe.Q_k(dataset_removed['x_prime'], policy(dataset_removed['x_prime'])).T*(1-dataset_removed['done']) )) ]).T, columns = ['x','a','x_prime','c','done','Q(x,a)', 'Q(x_,pi(x_))', 'diff']) 217 | print exact, evaluated 218 | 219 | return exact-exact, evaluated-exact 220 | 221 | def cartesian_product(*arrays): 222 | la = len(arrays) 223 | dtype = np.result_type(*arrays) 224 | arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype) 225 | for i, a in enumerate(np.ix_(*arrays)): 226 | arr[...,i] = a 227 | return arr.reshape(-1, la) 228 | 229 | def create_df(array, **kw): 230 | return pd.DataFrame(array, **kw) 231 | 232 | 233 | def custom_plot(x, y, minimum, maximum, **kwargs): 234 | ax = kwargs.pop('ax', plt.gca()) 235 | base, = ax.plot(x, y, **kwargs) 236 | ax.fill_between(x, minimum, maximum, facecolor=base.get_color(), alpha=0.15) 237 | 238 | main(policy_old, policy) 239 | df = pd.read_csv('fqe_quality.csv') 240 | for epsilon, group in df.groupby('epsilon'): 241 | del group['epsilon'] 242 | # group.set_index('num_trajectories').plot() 243 | # import pdb; pdb.set_trace() 244 | means = group.groupby('num_trajectories').mean() 245 | stds = group.groupby('num_trajectories').std() 246 | 247 | 248 | del means['trial_num'] 249 | del stds['trial_num'] 250 | 251 | print '*'*20 252 | print 'Epsilon: %s' % epsilon 253 | print means 254 | print stds 255 | 256 | fig, ax = plt.subplots(1) 257 | colors = ['red', 'green', 'blue'] 258 | for i, col in enumerate(['fqe']): 259 | # import pdb; pdb.set_trace() 260 | 261 | x = np.array(means.index) 262 | mu = np.array(means[col]) 263 | sigma = np.array(stds[col]) 264 | 265 | lower_bound = mu + sigma 266 | upper_bound 
= mu - sigma 267 | 268 | custom_plot(x, mu, lower_bound, upper_bound, marker='o', label=col, color=colors[i]) 269 | 270 | 271 | 272 | # means.plot(yerr=stds) 273 | 274 | # plt.title(epsilon) 275 | ax.legend() 276 | ax.set_title('Probability of exploration: %s' % epsilon) 277 | ax.set_xlabel('Number of trajectories in dataset') 278 | ax.set_ylabel('Policy Evaluation Error') 279 | plt.show() -------------------------------------------------------------------------------- /fqi_seed_2_new.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import deepdish as dd 4 | #from thread_safe import threadsafe_generator 5 | import threading 6 | 7 | import keras 8 | from keras.models import Sequential, Model, load_model, model_from_config 9 | from keras.layers import Dense, Conv2D, Flatten, Input, concatenate, Lambda, MaxPooling2D, Dropout, dot 10 | from keras import optimizers 11 | from keras import initializers 12 | from keras import regularizers 13 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 14 | from keras.callbacks import Callback, TensorBoard 15 | from keras.backend import eval 16 | 17 | from car_racing import ExtendedCarRacing 18 | import itertools 19 | from exact_policy_evaluation import ExactPolicyEvaluator 20 | 21 | from pyvirtualdisplay import Display 22 | display = Display(visible=0, size=(1280, 1024)) 23 | display.start() 24 | 25 | # env = gym.make('CarRacing-v0') 26 | constraint_thresholds = [1., 15.] + [1] 27 | constraints_cared_about = [-1,2] 28 | constraints = [300*.1, 300*.1] + [0,0,0,0,0] 29 | pic_size = (96, 96,3) 30 | num_frame_stack=3 31 | frame_skip=3 32 | gamma=.95 33 | action_space_map = {} 34 | for i, action in enumerate([k for k in itertools.product([-1, 0, 1], [1, 0], [0.2, 0])]): 35 | action_space_map[i] = action 36 | 37 | init_seed = 2 38 | stochastic_env = False # = not deterministic 39 | max_pos_costs = 12 # The maximum allowable positive cost before ending episode early 40 | max_time_spent_in_episode = 2000 41 | env = ExtendedCarRacing(init_seed, stochastic_env, max_pos_costs) 42 | exact_policy_algorithm = ExactPolicyEvaluator(action_space_map, gamma, env=env, frame_skip=frame_skip, num_frame_stack=num_frame_stack, pic_size = pic_size, constraint_thresholds=constraint_thresholds, constraints_cared_about=constraints_cared_about) 43 | env.reset() 44 | 45 | 46 | GPU = 0 47 | SEED = 0 48 | np.random.seed(SEED) 49 | import tensorflow as tf 50 | tf.set_random_seed(SEED) 51 | import random 52 | random.seed(SEED) 53 | session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) 54 | session_conf.gpu_options.allow_growth = True 55 | from keras import backend as K 56 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) 57 | K.set_session(sess) 58 | 59 | LEARNING_RATE = 0.0005 60 | dim_of_actions = 12 61 | input_shape = (96,96,3) 62 | gamma = 0.95 63 | 64 | class threadsafe_iter: 65 | """Takes an iterator/generator and makes it thread-safe by 66 | serializing call to the `next` method of given iterator/generator. 67 | """ 68 | def __init__(self, it): 69 | self.it = it 70 | self.lock = threading.Lock() 71 | 72 | def __iter__(self): 73 | return self 74 | 75 | def next(self): 76 | with self.lock: 77 | return self.it.next() 78 | 79 | 80 | def threadsafe_generator(f): 81 | """A decorator that takes a generator function and makes it thread-safe. 
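    Needed because fit_generator is invoked further down with workers=8 and
    use_multiprocessing=False, so several Keras worker threads pull batches
    from the same generator object concurrently.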
82 | """ 83 | def g(*a, **kw): 84 | return threadsafe_iter(f(*a, **kw)) 85 | return g 86 | 87 | class NN: 88 | def __init__(self, gpu=0): 89 | self.gpu = gpu 90 | rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0) 91 | 92 | def init(): return keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=np.random.randint(2**32)) 93 | with tf.device('/gpu:'+str(self.gpu)): 94 | model = Sequential() 95 | model.add(Conv2D(8, (7,7), strides = 3, activation = 'relu', padding = 'same', input_shape = (96,96,3),kernel_initializer=init(), bias_initializer=init(), kernel_regularizer=regularizers.l2(1e-6))) 96 | model.add(MaxPooling2D()) 97 | #model.add(Dropout(0.25)) 98 | model.add(Conv2D(16,(3,3), strides = 1, activation = 'relu', padding = 'same',kernel_initializer=init(), bias_initializer=init(), kernel_regularizer=regularizers.l2(1e-6))) 99 | model.add(MaxPooling2D()) 100 | #model.add(Dropout(0.25)) 101 | model.add(Flatten()) 102 | model.add(Dense(256, activation='relu',kernel_initializer=init(), bias_initializer=init(), kernel_regularizer=regularizers.l2(1e-6))) 103 | #model.add(Dropout(0.5)) 104 | model.add(Dense(dim_of_actions, name='all_actions', activation="linear",kernel_initializer=init(), bias_initializer=init(), kernel_regularizer=regularizers.l2(1e-6))) 105 | 106 | self.model = model 107 | self.compile() 108 | self.model._make_predict_function() 109 | #self.model.summary() 110 | 111 | def compile(self): 112 | def huber_loss(y_true, y_pred, clip_value): 113 | # Huber loss, see https://en.wikipedia.org/wiki/Huber_loss and 114 | # https://medium.com/@karpathy/yes-you-should-understand-backprop-e2f06eab496b 115 | # for details. 116 | assert clip_value > 0. 117 | 118 | x = y_true - y_pred 119 | if np.isinf(clip_value): 120 | # Spacial case for infinity since Tensorflow does have problems 121 | # if we compare `K.abs(x) < np.inf`. 122 | return .5 * K.square(x) 123 | 124 | condition = K.abs(x) < clip_value 125 | squared_loss = .5 * K.square(x) 126 | linear_loss = clip_value * (K.abs(x) - .5 * clip_value) 127 | if K.backend() == 'tensorflow': 128 | import tensorflow as tf 129 | if hasattr(tf, 'select'): 130 | return tf.select(condition, squared_loss, linear_loss) # condition, true, false 131 | else: 132 | return tf.where(condition, squared_loss, linear_loss) # condition, true, false 133 | elif K.backend() == 'theano': 134 | from theano import tensor as T 135 | return T.switch(condition, squared_loss, linear_loss) 136 | else: 137 | raise RuntimeError('Unknown backend "{}".'.format(K.backend())) 138 | 139 | def mean_pred(y_true, y_pred): 140 | return K.mean(y_pred) 141 | 142 | def min_pred(y_true, y_pred): 143 | return K.min(y_pred) 144 | 145 | def clipped_masked_error(args): 146 | y_true, y_pred, mask = args 147 | loss = huber_loss(y_true, y_pred, 10) 148 | loss *= mask # apply element-wise mask 149 | return K.sum(loss, axis=-1) 150 | # Create trainable model. The problem is that we need to mask the output since we only 151 | # ever want to update the Q values for a certain action. The way we achieve this is by 152 | # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility 153 | # to mask out certain parameters by passing in multiple inputs to the Lambda layer. 
154 | y_pred = self.model.output 155 | y_true = Input(name='y_true', shape=(dim_of_actions,)) 156 | mask = Input(name='mask', shape=(dim_of_actions,)) 157 | loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='huber')([y_pred, y_true, mask]) 158 | #predicted_value = Lambda(value_pred, output_shape=(1,), name='predicted_value')([y_pred, mask]) 159 | #ins = [self.model.input] if type(self.model.input) is not list else self.model.input 160 | ins = self.model.input 161 | #trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred]) 162 | trainable_model = Model(inputs=[ins,y_true, mask], outputs=[loss_out, y_pred]) 163 | assert len(trainable_model.output_names) == 2 164 | #combined_metrics = {trainable_model.output_names[1]: metrics} 165 | losses = [ 166 | lambda y_true, y_pred: y_pred, # loss is computed in Lambda layer 167 | lambda y_true, y_pred: K.zeros_like(y_pred), # we only include this for the metrics 168 | ] 169 | #trainable_model.compile(optimizer=optimizer, loss=losses, metrics=combined_metrics) 170 | rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0) 171 | #opt = optimizers.Adam(lr=0.0001, clipnorm = 10) 172 | #trainable_model.compile(optimizer=rmsProp, loss=losses) 173 | trainable_model.compile(optimizer=rmsProp, loss=losses, metrics = [min_pred]) 174 | #trainable_model.compile(optimizer='adam', loss=losses, metrics = [min_pred]) 175 | self.trainable_model = trainable_model 176 | #print self.trainable_model.summary() 177 | #print self.trainable_model.metrics_names 178 | #time.sleep(5) 179 | 180 | self.compiled = True 181 | 182 | def saveWeight(self): 183 | self.model.save_weights('fqi_model.h5') 184 | 185 | def loadWeight(self): 186 | #path = 'weight/' 187 | self.model.load_weights('fqi_model.h5') 188 | self.model.reset_states() 189 | 190 | def clear_memory(self): 191 | del self.model 192 | 193 | 194 | @threadsafe_generator 195 | def data_generator(indices, fixed_permutation=False, batch_size = 64): 196 | #data_length = len(dataset['done']) - 1 ## Maybe throw out the very last data point to avoid out of range index error 197 | data_length = len(indices) 198 | number_of_batches = int(np.floor(data_length/float(batch_size))) 199 | #random_permutation = np.random.permutation(np.arange(data_length)) 200 | random_permutation = np.random.permutation(indices) 201 | i= -1 202 | while True: 203 | i = (i+1) % number_of_batches 204 | idxs = random_permutation[(i*batch_size):((i+1)*batch_size)] 205 | 206 | #print idxs 207 | x = np.rollaxis(dataset['frames'][dataset['prev_states'][idxs]],1,4) 208 | a = dataset['a'][idxs] ## need to make it 2d? 209 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][idxs]],1,4) 210 | c = dataset['c'][idxs] ## scaling the cost back? 
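            # 'c' is the scalar task cost, already rescaled (divided by 20.3) where the
            # dataset dict is assembled below; 'g' carries the constraint costs, which are
            # loaded here but never enter the regression target built in this generator.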
211 | g = dataset['g'][idxs] 212 | dones = dataset['done'][idxs] 213 | 214 | target_q_values = Q_k_minus_1.model.predict(x_prime) 215 | assert target_q_values.shape == (batch_size, dim_of_actions) 216 | q_batch = np.min(target_q_values, axis=1) ## we're minimizing cost 217 | assert q_batch.shape == (batch_size,) 218 | 219 | targets = np.zeros((batch_size, dim_of_actions)) 220 | dummy_targets = np.zeros((batch_size,)) 221 | masks = np.zeros((batch_size, dim_of_actions)) 222 | 223 | discounted_q_batch = gamma * q_batch 224 | terminalBatch = np.array([1-float(done) for done in dones]) 225 | assert terminalBatch.shape == (batch_size,) 226 | discounted_q_batch *= terminalBatch 227 | assert c.shape == discounted_q_batch.shape 228 | cost_to_go_batch = c + discounted_q_batch 229 | 230 | for idx, (target, mask, value, action) in enumerate(zip(targets, masks, cost_to_go_batch, a)): 231 | target[action] = value # update action with estimated accumulated reward 232 | dummy_targets[idx] = value 233 | mask[action] = 1. # enable loss for this specific action 234 | 235 | assert x.shape == (batch_size, 96,96,3) 236 | assert targets.shape == (batch_size, 12) 237 | #assert sum(masks) == batch_size 238 | 239 | yield ([x, targets, masks], [dummy_targets, targets]) 240 | 241 | @threadsafe_generator 242 | def validation_generator(indices, fixed_permutation=False, batch_size = 64): 243 | #data_length = len(dataset['done']) - 1 ## Maybe throw out the very last data point to avoid out of range index error 244 | data_length = len(indices) 245 | number_of_batches = int(np.floor(data_length/float(batch_size))) 246 | #random_permutation = np.random.permutation(np.arange(data_length)) 247 | random_permutation = np.random.permutation(indices) 248 | i= -1 249 | while True: 250 | i = (i+1) % number_of_batches 251 | idxs = random_permutation[(i*batch_size):((i+1)*batch_size)] 252 | 253 | #print idxs 254 | x = np.rollaxis(dataset['frames'][dataset['prev_states'][idxs]],1,4) 255 | a = dataset['a'][idxs] ## need to make it 2d? 256 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][idxs]],1,4) 257 | c = dataset['c'][idxs]## scaling the cost back? 258 | g = dataset['g'][idxs] 259 | dones = dataset['done'][idxs] 260 | 261 | target_q_values = Q_k_minus_1.model.predict(x_prime) 262 | assert target_q_values.shape == (batch_size, dim_of_actions) 263 | q_batch = np.min(target_q_values, axis=1) ## we're minimizing cost 264 | assert q_batch.shape == (batch_size,) 265 | 266 | targets = np.zeros((batch_size, dim_of_actions)) 267 | dummy_targets = np.zeros((batch_size,)) 268 | masks = np.zeros((batch_size, dim_of_actions)) 269 | 270 | discounted_q_batch = gamma * q_batch 271 | terminalBatch = np.array([1-float(done) for done in dones]) 272 | assert terminalBatch.shape == (batch_size,) 273 | discounted_q_batch *= terminalBatch 274 | assert c.shape == discounted_q_batch.shape 275 | cost_to_go_batch = c + discounted_q_batch 276 | 277 | for idx, (target, mask, value, action) in enumerate(zip(targets, masks, cost_to_go_batch, a)): 278 | target[action] = value # update action with estimated accumulated reward 279 | dummy_targets[idx] = value 280 | mask[action] = 1. # enable loss for this specific action 281 | 282 | assert x.shape == (batch_size, 96,96,3) 283 | assert targets.shape == (batch_size, 12) 284 | #assert sum(masks) == batch_size 285 | 286 | yield ([x, targets, masks], [dummy_targets, targets]) 287 | 288 | def clone_model(model, custom_objects={}): 289 | # Requires Keras 1.0.7 since get_config has breaking changes. 
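    # Serialize the architecture to a config dict, rebuild it, and copy the weights
    # across, so the clone shares no variables with `model`. The training loop below
    # ends up syncing the target network via set_weights() directly, so this helper
    # stays unused in this script.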
290 | config = { 291 | 'class_name': model.__class__.__name__, 292 | 'config': model.get_config(), 293 | } 294 | clone = model_from_config(config, custom_objects=custom_objects) 295 | clone._make_predict_function() 296 | clone.set_weights(model.get_weights()) 297 | return clone 298 | 299 | def weight_change_norm(model, target_model): 300 | norm_list = [] 301 | number_of_layers = len(model.layers) 302 | for i in range(number_of_layers): 303 | model_matrix = model.layers[i].get_weights() 304 | target_model_matrix = target_model.layers[i].get_weights() 305 | if len(model_matrix) >0: 306 | #print "layer ", i, " has shape ", model_matrix[0].shape 307 | if model_matrix[0].shape[0] > 0: 308 | norm_change = np.linalg.norm(model_matrix[0]-target_model_matrix[0]) 309 | norm_list.append(norm_change) 310 | return sum(norm_list)*1.0/len(norm_list) 311 | 312 | 313 | class LossHistory(keras.callbacks.Callback): 314 | def on_train_begin(self, logs={}): 315 | self.losses = [] 316 | 317 | def on_batch_end(self, batch, logs={}): 318 | self.losses.append(logs.get('loss')) 319 | 320 | 321 | action_data = dd.io.load('./seed_2/car_data_actions_seed_2.h5') 322 | frame_data = dd.io.load('./seed_2/car_data_frames_seed_2.h5') 323 | done_data = dd.io.load('./seed_2/car_data_is_done_seed_2.h5') 324 | next_state_data = dd.io.load('./seed_2/car_data_next_states_seed_2.h5') 325 | current_state_data = dd.io.load('./seed_2/car_data_prev_states_seed_2.h5') 326 | cost_data = dd.io.load('./seed_2/car_data_rewards_seed_2.h5') 327 | 328 | frame_gray_scale = np.zeros((len(frame_data),96,96)).astype('float32') 329 | for i in range(len(frame_data)): 330 | frame_gray_scale[i,:,:] = np.dot(frame_data[i,:,:,:]/255. , [0.299, 0.587, 0.114]) 331 | 332 | dataset = {'frames':frame_gray_scale, 333 | 'prev_states': current_state_data, 334 | 'next_states': next_state_data, 335 | 'a': action_data, 336 | 'c':cost_data[:,0]/20.3, ## Divide by the largest one 337 | 'g':cost_data[:,1:], 338 | 'done': done_data 339 | } 340 | 341 | 342 | ### Load data set 343 | #dataset = dd.io.load('car_racing_data.h5') 344 | data_length = len(frame_data)-1 345 | ### Start training 346 | 347 | 348 | Q_k_minus_1 = NN(gpu = GPU) ## This is the target network, initialize it with something 349 | Q_k = NN(gpu=GPU) ### Initialize the value network with something 350 | #Q_k_minus_1.loadWeight() ### cheat: loading in DQN weights 351 | ## Form the data set? 
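# Illustrative sketch only (a hypothetical helper, not called anywhere in this
# script): it restates the masked-target construction that data_generator /
# validation_generator perform inline, so the Bellman backup can be unit-tested
# in isolation. `predict_fn` stands in for Q_k_minus_1.model.predict.
import numpy as np  # already imported at the top of the file; repeated so the sketch is copy-paste runnable

def fqi_targets_sketch(predict_fn, x_prime, c, a, dones, n_actions=dim_of_actions, discount=gamma):
    """Return (targets, masks, backup) for the masked Huber regression used by the generators above."""
    q_next = np.min(predict_fn(x_prime), axis=1)                    # min over actions: costs are minimized
    backup = c + discount * q_next * (1. - np.asarray(dones, dtype='float64'))
    targets = np.zeros((len(a), n_actions))
    masks = np.zeros((len(a), n_actions))
    targets[np.arange(len(a)), np.asarray(a)] = backup              # only the taken action gets a target
    masks[np.arange(len(a)), np.asarray(a)] = 1.                    # and only its loss is enabled
    return targets, masks, backup
# e.g. (hypothetical batch names): targets, masks, _ = fqi_targets_sketch(
#     Q_k_minus_1.model.predict, x_prime_batch, c_batch, a_batch, done_batch)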
352 | 353 | number_of_iter = 100 354 | batch_size = 32 355 | epochs_per_iter = 1 ## per_iter 356 | #steps_per_epoch = data_length / batch_size 357 | 358 | #mcp_save = ModelCheckpoint('fqi_test_model.hdf5', save_best_only=False, mode='auto', period=1) 359 | 360 | #reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, epsilon=1e-4, mode='min') 361 | history = LossHistory() 362 | iteration_losses = [] 363 | stop_training = False 364 | lr_counter = 0 365 | train_iter = 0 366 | #while not stop_training: 367 | for iteration in range(number_of_iter): 368 | print "------------" 369 | print "Iteration: ", train_iter 370 | lr = eval(Q_k.trainable_model.optimizer.lr) 371 | print "Current learning rate: ", lr 372 | lr_counter += 1 373 | ## training validation split 374 | indices = np.random.permutation(np.arange(data_length)) 375 | cutoff = int(1*data_length) 376 | train_idx = indices[:cutoff] 377 | valid_idx = indices[cutoff:] 378 | steps_per_epoch = len(train_idx) / batch_size 379 | valid_steps = len(valid_idx) / batch_size 380 | #gen = data_generator(dataset, fixed_permutation=False, batch_size=batch_size) 381 | gen = data_generator(train_idx, fixed_permutation=False, batch_size=batch_size) 382 | #valid_gen = validation_generator(valid_idx, fixed_permutation=False, batch_size=batch_size) 383 | 384 | #mcp_save = ModelCheckpoint('FQI_models/fqi_model_1epoch_gamma095_lr00025_'+str(iteration)+'.hdf5', save_best_only=True, monitor='val_loss', mode='min') 385 | #Q_k.trainable_model.fit_generator(gen, epochs=epochs_per_iter, steps_per_epoch=steps_per_epoch, max_queue_size=10, workers=8, use_multiprocessing=False, verbose=1, validation_data = valid_gen, validation_steps = valid_steps, callbacks=[history]) 386 | Q_k.trainable_model.fit_generator(gen, epochs=epochs_per_iter, steps_per_epoch=steps_per_epoch, max_queue_size=10, workers=8, use_multiprocessing=False, verbose=1, callbacks=[history]) 387 | iter_loss = sum(history.losses) *1.0/ len(history.losses) 388 | #print "This iteration loss: ", iter_loss 389 | iteration_losses.append(iter_loss) 390 | """ 391 | if len(iteration_losses) > 5 and iteration_losses[-1]>max(iteration_losses[-6:-1]) and lr_counter >=5: 392 | if lr > 0.0001: 393 | lr = max(0.0001, lr*0.5) 394 | K.set_value(Q_k.trainable_model.optimizer.lr,lr) 395 | lr_counter = 0 396 | else: 397 | stop_training = True 398 | """ 399 | #Q_k.trainable_model.fit_generator(gen, epochs=epochs_per_iter, steps_per_epoch=steps_per_epoch, max_queue_size=10, workers=3, use_multiprocessing=False, verbose=0, validation_data = valid_gen, validation_steps = valid_steps) 400 | #Q_k_minus_1.model = clone_model(Q_k.model) 401 | ## Test weight change in last layer 402 | old_matrix = Q_k_minus_1.model.layers[-1].get_weights() 403 | new_matrix = Q_k.model.layers[-1].get_weights() 404 | #print "dimension of weight layer ", new_matrix[0].shape 405 | #print "Norm of weight change is ", np.linalg.norm(new_matrix[0]-old_matrix[0]) 406 | print "Norm of weight change is ", weight_change_norm(Q_k.model, Q_k_minus_1.model) 407 | print 408 | print exact_policy_algorithm.run(Q_k) 409 | Q_k_minus_1.model.set_weights(Q_k.model.get_weights()) 410 | Q_k.model.save('FQI_models/fqi_model_1epoch_gamma095_lr0005_fixed_'+str(train_iter)+'.hdf5') 411 | train_iter += 1 412 | 413 | #Q_k.compile() ## reset optimizer state 414 | #Q_k.model.reset_states() 415 | #Q_k.trainable_model.reset_states() 416 | 417 | 418 | ### Copying model of Q_k over to Q_k_minus_1 before repeating 419 | 
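# Optional post-run diagnostic sketch (assumes matplotlib is installed; it is
# not imported elsewhere in this script): plot the mean per-iteration training
# loss accumulated in iteration_losses above. The Agg backend keeps the call
# headless-safe, consistent with the pyvirtualdisplay setup at the top.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

plt.figure()
plt.plot(range(len(iteration_losses)), iteration_losses, marker='o')
plt.xlabel('FQI iteration')
plt.ylabel('Mean masked Huber loss')
plt.savefig('fqi_iteration_losses.png')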
-------------------------------------------------------------------------------- /frozen_lake.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from gym.envs.registration import register 4 | from gym.envs.toy_text import FrozenLakeEnv 5 | 6 | 7 | class ExtendedFrozenLake(FrozenLakeEnv): 8 | def __init__(self, early_termination, desc=None, map_name="4x4",is_slippery=True): 9 | super(ExtendedFrozenLake, self).__init__(desc=desc, map_name=map_name, is_slippery=is_slippery) 10 | self.deterministic = True 11 | self.max_time_steps = early_termination 12 | self.min_cost = -1. #set by env 13 | self.env_type = 'lake' 14 | 15 | def is_early_episode_termination(self, cost=None, time_steps=None, total_cost=None): 16 | if time_steps > self.max_time_steps: 17 | return True, 0. 18 | else: 19 | return False, 0. 20 | 21 | def step(self, a): 22 | transitions = self.P[self.s][a] 23 | i = self.categorical_sample([t[0] for t in transitions], self.np_random) 24 | p, s, r, d= transitions[i] 25 | self.s = s 26 | self.lastaction=a 27 | 28 | c = -r 29 | g = [int(d and not r)] 30 | return (s, (c,g), d, {"prob" : p}) 31 | 32 | @staticmethod 33 | def categorical_sample(prob_n, np_random): 34 | """ 35 | Sample from categorical distribution 36 | Each row specifies class probabilities 37 | """ 38 | prob_n = np.asarray(prob_n) 39 | csprob_n = np.cumsum(prob_n) 40 | return (csprob_n > np_random.rand()).argmax() 41 | -------------------------------------------------------------------------------- /inverse_propensity_scoring.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from fitted_algo import FittedAlgo 5 | from mdp_approximator import MDPApproximator 6 | from model import Model 7 | import numpy as np 8 | from tqdm import tqdm 9 | import scipy.signal as signal 10 | 11 | class InversePropensityScorer(object): 12 | def __init__(self, env, state_space_dim, action_space_dim, grid_shape): 13 | ''' 14 | An implementation of fitted Q iteration 15 | 16 | num_inputs: number of inputs 17 | dim_of_actions: dimension of action space 18 | max_epochs: positive int, specifies how many iterations to run the algorithm 19 | gamma: discount factor 20 | ''' 21 | self.env = env 22 | self.action_space_dim = action_space_dim 23 | self.state_space_dim = state_space_dim 24 | self.grid_shape = grid_shape 25 | # self.initial_states = initial_states 26 | 27 | def run(self, *args, **kw): 28 | ''' 29 | V^pi(s) = sum_{i = 1}^n p(h_j| pi_new, s_0 = s)/p(h_j| pi_old, s_0 = s) H(h_j) 30 | h = (s_1, a_1, r_1, s_2, ...) 31 | p(h_j | pi, s) = pi(a_0 | s_0)p(r_0 | s_0, a_0)p(s_1 | s_0, a_0)pi(a_1 |s_1) ... 32 | = prod_j pi(a_j | x_j)p(r_j | x_j, a_j)p(s_{j+1} | x_j, a_j) 33 | deterministic = prod_j pi(a_j | x_j) * 1 * 1 34 | = prod_j pi(a_j | x_j) 35 | H(h_j) = r_0 + gamma * r_1 + gamma^2 r_2 + ... 
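        The estimators below average over the n logged trajectories, i.e.
        V^pi ~ (1/n) sum_j [prod_t pi_new(a_t | x_t) / pi_old(a_t | x_t)] H(h_j)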
36 | 37 | ''' 38 | 39 | 40 | approx_ips = self.approx_ips(*args) 41 | exact_ips = self.exact_ips(*args) 42 | approx_pdis = self.approx_pdis(*args) 43 | exact_pdis = self.exact_pdis(*args) 44 | dr, wdr, am = self.doubly_robust_approx(*args, **kw) 45 | 46 | 47 | return approx_ips, exact_ips, approx_pdis, exact_pdis, dr, wdr, am 48 | 49 | def approx_pdis(self, dataset, pi_new, pi_old, epsilon, gamma): 50 | ''' 51 | Per decision importance sampling 52 | 53 | sum_{t=1}^{max L} gamma^t 1/n sum_{i=1}^n (PI_{tau=1}^t p_new/p_old) R^i_t 54 | ''' 55 | 56 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 57 | 58 | # approx IPS, pi_old_a_given_x is approximated by the dataset 59 | actions = np.eye(self.action_space_dim)[dataset['a']] 60 | unique_states_seen = np.unique(dataset['x']) 61 | probabilities = [np.mean(actions[dataset['x'] == x], axis=0) for x in unique_states_seen] 62 | 63 | prob = {} 64 | for idx, state in enumerate(unique_states_seen): 65 | prob[state] = probabilities[idx] 66 | 67 | pi_old_a_given_x = [[ prob[x][a] for x,a in zip(episode['x'],episode['a']) ] for episode in dataset.episodes] 68 | 69 | pi_new_cumprod = np.array([np.pad(np.cumprod(x), (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,0)) for x in pi_new_a_given_x]) 70 | pi_old_cumprod = np.array([np.pad(np.cumprod(x), (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,1)) for x in pi_old_a_given_x]) 71 | costs = [episode['cost'] for episode in dataset.episodes] 72 | costs = np.array([np.pad(x, (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,0)) for x in costs]) 73 | 74 | return self.discounted_sum(np.mean(pi_new_cumprod / pi_old_cumprod * costs, axis=0), gamma) 75 | 76 | # pi_new_cumprod = [np.cumprod(x) for x in pi_new_a_given_x] 77 | # pi_old_cumprod = [np.cumprod(x) for x in pi_old_a_given_x] 78 | # costs = [episode['cost'] for episode in dataset.episodes] 79 | 80 | # per_decision = [] 81 | # for i in range(dataset.get_max_trajectory_length()): 82 | # s = 0 83 | # count = 0 84 | # for trajectory in range(len(costs)): 85 | # try: 86 | # s += pi_new_cumprod[trajectory][i] / pi_old_cumprod[trajectory][i] * costs[trajectory][i] 87 | # count += 1 88 | # except: 89 | # pass 90 | # per_decision.append(s/float(count)) 91 | 92 | 93 | # return self.discounted_sum(per_decision, gamma) 94 | 95 | def exact_pdis(self, dataset, pi_new, pi_old, epsilon, gamma): 96 | ''' 97 | Per decision importance sampling 98 | 99 | sum_{t=1}^{max L} gamma^t 1/n sum_{i=1}^n (PI_{tau=1}^t p_new/p_old) R^i_t 100 | ''' 101 | 102 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 103 | pi_old_a_given_x = [(pi_old(episode['x']) == episode['a'])*(1-epsilon) + (1./self.action_space_dim)*epsilon for episode in dataset.episodes] 104 | 105 | pi_new_cumprod = np.array([np.pad(np.cumprod(x), (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,0)) for x in pi_new_a_given_x]) 106 | pi_old_cumprod = np.array([np.pad(np.cumprod(x), (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,1)) for x in pi_old_a_given_x]) 107 | costs = [episode['cost'] for episode in dataset.episodes] 108 | costs = np.array([np.pad(x, (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,0)) for x in costs]) 109 | 110 | return self.discounted_sum(np.mean(pi_new_cumprod / pi_old_cumprod * costs, axis=0), gamma) 111 | 
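        # Note on the padding above: beyond each episode's length the new-policy
        # cumulative product is padded with 0 and the old-policy one with 1, so padded
        # timesteps contribute (0/1)*0 = 0 to the per-decision average instead of a
        # division by zero. The commented block below computes the same per-decision
        # sum with an explicit loop.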
112 | # pi_new_cumprod = [np.cumprod(x) for x in pi_new_a_given_x] 113 | # pi_old_cumprod = [np.cumprod(x) for x in pi_old_a_given_x] 114 | # costs = [episode['cost'] for episode in dataset.episodes] 115 | 116 | # per_decision = [] 117 | # for t in range(dataset.get_max_trajectory_length()): 118 | # s = 0 119 | # count = 0 120 | # for trajectory in range(len(costs)): 121 | # try: 122 | # s += pi_new_cumprod[trajectory][t] / pi_old_cumprod[trajectory][t] * costs[trajectory][t] 123 | # count += 1 124 | # except: 125 | # pass 126 | # per_decision.append(s/count) 127 | 128 | # return self.discounted_sum(per_decision, gamma) 129 | 130 | def approx_ips(self, dataset, pi_new, pi_old, epsilon, gamma): 131 | ''' 132 | Inverse propensity scoring (Importance sampling) 133 | ''' 134 | H_h_j = [self.discounted_sum(episode['cost'], gamma) for episode in dataset.episodes] 135 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 136 | 137 | # approx IPS, pi_old_a_given_x is approximated by the dataset 138 | actions = np.eye(self.action_space_dim)[dataset['a']] 139 | unique_states_seen = np.unique(dataset['x']) 140 | probabilities = [np.mean(actions[dataset['x'] == x], axis=0) for x in unique_states_seen] 141 | 142 | prob = {} 143 | for idx, state in enumerate(unique_states_seen): 144 | prob[state] = probabilities[idx] 145 | 146 | pi_old_a_given_x = [[ prob[x][a] for x,a in zip(episode['x'],episode['a'])] for episode in dataset.episodes] 147 | 148 | approx_ips= 0 149 | for i in range(len(H_h_j)): 150 | prob_new = np.prod(pi_new_a_given_x[i]) 151 | prob_old = np.prod(pi_old_a_given_x[i]) 152 | if (prob_new > 0) and (prob_old == 0): 153 | return np.inf 154 | approx_ips += prob_new/prob_old * H_h_j[i] 155 | 156 | if np.isnan(approx_ips): 157 | approx_ips = np.inf 158 | else: 159 | approx_ips /= len(H_h_j) 160 | 161 | return approx_ips 162 | 163 | 164 | def exact_ips(self, dataset, pi_new, pi_old, epsilon, gamma): 165 | H_h_j = [self.discounted_sum(episode['cost'], gamma) for episode in dataset.episodes] 166 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 167 | 168 | # exact IPS. 
If you know pi_old, can calculate exactly 169 | pi_old_a_given_x = [(pi_old(episode['x']) == episode['a'])*(1-epsilon) + (1./self.action_space_dim)*epsilon for episode in dataset.episodes] 170 | 171 | exact_ips = 0 172 | for i in range(len(H_h_j)): 173 | prob_new = np.prod(pi_new_a_given_x[i]) 174 | prob_old = np.prod(pi_old_a_given_x[i]) 175 | if (prob_new > 0) and (prob_old == 0): 176 | return np.inf 177 | exact_ips += prob_new/prob_old * H_h_j[i] 178 | 179 | 180 | if np.isnan(exact_ips): 181 | exact_ips = np.inf 182 | else: 183 | exact_ips /= len(H_h_j) 184 | 185 | return exact_ips 186 | 187 | def doubly_robust_approx(self, dataset, pi_new, pi_old, epsilon, gamma, MDP_approximator=None): 188 | ''' 189 | sum_{i=0}^n sum_{t=0}^\infty gamma^t w_t^i R_t^{H_i} - 190 | sum_{i=0}^n sum_{t=0}^\infty gamma^t (w_t^i \hat{Q}(S^{H_i}_t,A^{H_i}_t) - w_{t-1}^i \hat{V}(S^{H_i}_t,A^{H_i}_{t-1}) 191 | 192 | w_t^i = rho_t^i / n = 1/n * prod_{n=0}^t pi_new(a_n|x_n) / pi_old(a_n|x_n) 193 | 194 | ''' 195 | if MDP_approximator is None: 196 | mdp = MDPApproximator(self.env, self.state_space_dim + self.action_space_dim, self.grid_shape, self.action_space_dim, 500, gamma) 197 | else: 198 | mdp = MDP_approximator 199 | 200 | mdp.run(dataset) 201 | 202 | actions = np.eye(self.action_space_dim)[dataset['a']] 203 | unique_states_seen = np.unique(dataset['x']) 204 | probabilities = [np.mean(actions[dataset['x'] == x], axis=0) for x in unique_states_seen] 205 | 206 | prob = {} 207 | for idx, state in enumerate(unique_states_seen): 208 | prob[state] = probabilities[idx] 209 | 210 | 211 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 212 | pi_old_a_given_x = [[ prob[x][a] for x,a in zip(episode['x'],episode['a'])] for episode in dataset.episodes] 213 | pi_new_cumprod = [np.cumprod(x) for x in pi_new_a_given_x] 214 | pi_old_cumprod = [np.cumprod(x) for x in pi_old_a_given_x] 215 | w_t = [pi_new_cumprod[i]/pi_old_cumprod[i] for i in range(len(pi_new_cumprod))] 216 | def sum_arrays(x,y): 217 | max_len = max(len(x), len(y)) 218 | x = np.pad(x, (0,max_len-len(x)), mode='constant', constant_values=0) 219 | y = np.pad(y, (0,max_len-len(y)), mode='constant', constant_values=0) 220 | return x+y 221 | 222 | norms = reduce(lambda x,y,s_a=sum_arrays: s_a(x,y), w_t) 223 | how_many_non_zero = np.sum(norms>0) 224 | 225 | drs = [] 226 | wdrs = [] 227 | Q_hat = {} 228 | V_hat = {} 229 | 230 | print mdp.V(pi_new, 0) 231 | for idx, episode in enumerate(dataset.episodes): 232 | cost = w_t[idx]*episode['cost'] 233 | first_term = self.discounted_sum(cost, gamma) 234 | 235 | Q_hats = [] 236 | V_hats = [] 237 | for x,a in zip(episode['x'], episode['a']): 238 | if tuple([x,a]) not in Q_hat: 239 | Q_ = mdp.Q(pi_new, x, a) 240 | Q_hat[tuple([x,a])] = Q_ 241 | if tuple([x]) not in V_hat: 242 | V_ = mdp.V(pi_new, x) 243 | V_hat[tuple([x])] = V_ 244 | 245 | Q_hats.append(Q_hat[tuple([x,a])]) 246 | V_hats.append(V_hat[tuple([x])]) 247 | 248 | # DR 249 | w_t_minus_1 = np.hstack([1, w_t[idx][:-1]]) 250 | cost = w_t[idx]*Q_hats - w_t_minus_1*V_hats 251 | second_term = self.discounted_sum(cost, gamma) 252 | drs.append(first_term - second_term) 253 | 254 | #WDR 255 | #normalize w_t 256 | how_many = min(len(w_t[idx]), how_many_non_zero) 257 | w_t_ = w_t[idx][:how_many] / np.array(norms[:how_many]) 258 | w_t_ = np.hstack([w_t_, np.zeros(len(w_t[idx])-how_many) ]) 259 | cost = w_t_*episode['cost'] 260 | first_term = self.discounted_sum(cost, gamma) 261 | 262 | w_t_minus_1 = np.hstack([1./len(w_t), 
w_t_[:-1]]) 263 | cost = w_t_*Q_hats - w_t_minus_1*V_hats 264 | second_term = self.discounted_sum(cost, gamma) 265 | wdrs.append(first_term - second_term) 266 | 267 | if tuple([0]) not in V_hat: 268 | AM = mdp.V(pi_new, 0) 269 | else: 270 | AM = V_hat[tuple([0])] 271 | 272 | return np.mean(drs), np.sum(wdrs), AM 273 | 274 | @staticmethod 275 | def discounted_sum(costs, discount): 276 | ''' 277 | Calculate discounted sum of costs 278 | ''' 279 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 280 | return y[::-1][0] 281 | -------------------------------------------------------------------------------- /lake_primal_dual_gap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/lake_primal_dual_gap.png -------------------------------------------------------------------------------- /lake_values.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/lake_values.png -------------------------------------------------------------------------------- /lake_values_wo_band.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/lake_values_wo_band.png -------------------------------------------------------------------------------- /layer_visualizer.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | import matplotlib.pyplot as plt 3 | 4 | class LayerVisualizer(object): 5 | def __init__(self, model): 6 | 7 | self.layer_outputs = [layer.output for layer in model.layers if layer.name not in ['mask', 'inp']] 8 | self.activation_model = Model(inputs=model.input, outputs=self.layer_outputs) 9 | 10 | def display_activation(self, datum, col_size, row_size, act_index): 11 | activation = self.activation_model.predict(datum)[act_index] 12 | activation_index=0 13 | fig, ax = plt.subplots(row_size, col_size, figsize=(row_size*2.5,col_size*1.5)) 14 | for row in range(0,row_size): 15 | for col in range(0,col_size): 16 | ax[row][col].imshow(activation[0, :, :, activation_index], cmap='gray') 17 | activation_index += 1 18 | 19 | plt.show() -------------------------------------------------------------------------------- /mdp_approximator.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import keras 4 | from keras.models import Sequential, Model as KerasModel 5 | from keras.layers import Input, Dense, Flatten, concatenate, dot, MaxPooling2D 6 | from keras.losses import mean_squared_error 7 | import scipy.signal as signal 8 | from env_nn import LakeNN 9 | from keras import optimizers 10 | 11 | import gym 12 | 13 | 14 | 15 | class MDPApproximator(LakeNN): 16 | def __init__(self, env, *args, **kw): 17 | ''' 18 | Approximate P(s'| s,a) 19 | ''' 20 | self.env = env 21 | 22 | self.model_type = kw['model_type'] if 'model_type' in kw else 'mlp' 23 | self.gamma = .9 24 | super(MDPApproximator, self).__init__(68, 1, [8,8], 4, self.gamma, convergence_of_model_epsilon=1e-10, model_type='mlp', num_frame_stack=(1,), frame_skip=1, pic_size = (1,)) 25 | self.create_model(68,1) 26 | 27 | def create_model(self, num_inputs, num_outputs): 28 | if 
self.model_type == 'mlp': 29 | model = Sequential() 30 | def init(): return keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=np.random.randint(2**32)) 31 | model.add(Dense(64, activation='tanh', input_shape=(num_inputs,),kernel_initializer=init(), bias_initializer=init())) 32 | model.add(Dense(num_outputs, activation='linear',kernel_initializer=init(), bias_initializer=init())) 33 | model.compile(loss='mean_squared_error', optimizer='Adam', metrics=['accuracy']) 34 | self.model = model 35 | else: 36 | self.model = super(MDPApproximator, self).create_model(num_inputs, num_outputs) 37 | 38 | def run(self, dataset): 39 | ''' 40 | probability of 41 | transitioning from s to s' 42 | given action a is the number of 43 | times this transition was observed divided by the number 44 | of times action a was taken in state s. If D contains no examples 45 | of action a being taken in state s, then we assume 46 | that taking action a in state s always causes a transition to 47 | the terminal absorbing state. 48 | 49 | Since everything is deterministic then P(s'|s,a) = 0 or 1. 50 | ''' 51 | transitions = np.vstack([dataset['x'],dataset['a'],dataset['x_prime']]).T 52 | unique, idx, count = np.unique(transitions, return_index=True, return_counts=True, axis=0) 53 | 54 | partial_transitions = np.vstack([dataset['x'],dataset['a']]).T 55 | unique_a_given_x, idx_a_given_x, count_a_given_x = np.unique(partial_transitions, return_index=True, return_counts=True, axis=0) 56 | 57 | # key=(state, action). value= number of times a was taking in state 58 | all_counts_a_given_x = {tuple(key):value for key,value in zip(unique_a_given_x,count_a_given_x)} 59 | 60 | prob = {} 61 | for idx,row in enumerate(unique): 62 | if tuple(row[:-1]) in prob: 63 | prob[tuple(row[:-1])][row[-1]] = count[idx] / all_counts_a_given_x[(row[0],row[1])] 64 | else: 65 | prob[tuple(row[:-1])] = {} 66 | prob[tuple(row[:-1])][row[-1]] = count[idx] / all_counts_a_given_x[(row[0],row[1])] 67 | 68 | all_transitions = np.vstack([dataset['x'],dataset['a'],dataset['x_prime'], dataset['done']]).T 69 | self.terminal_transitions = {tuple([x,a,x_prime]):1 for x,a,x_prime in all_transitions[all_transitions[:,-1] == True][:,:-1]} 70 | 71 | # Actually fitting R, not Q_k 72 | self.Q_k = self.model #init_Q(model_type=self.model_type) 73 | X_a = np.array(zip(dataset['x'],dataset['a']))#dataset['state_action'] 74 | x_prime = dataset['x_prime'] 75 | index_of_skim = self.skim(X_a, x_prime) 76 | self.fit(X_a[index_of_skim], dataset['cost'][index_of_skim], batch_size=len(index_of_skim), verbose=0, epochs=1000) 77 | self.reward = self 78 | self.P = prob 79 | 80 | def skim(self, X_a, x_prime): 81 | full_set = np.hstack([X_a, x_prime.reshape(1,-1).T]) 82 | idxs = np.unique(full_set, axis=0, return_index=True)[1] 83 | return idxs 84 | 85 | def R(self, *args): 86 | # Exact R 87 | mapping = {0:[0,-1], 1:[1,0], 2:[0,1], 3:[-1,0]} 88 | x = args[0] 89 | x, y = np.where(np.arange(np.prod(self.env.desc.shape)).reshape(self.env.desc.shape) == x) 90 | x,y = x[0], y[0] 91 | delta_x,delta_y = mapping[args[1][0]] 92 | new_x = x + delta_x 93 | new_y = y + delta_y 94 | new_x,new_y = (new_x,new_y) if (0 <= new_x < self.env.desc.shape[0] and 0 <= new_y < self.env.desc.shape[1]) else (x,y) 95 | return [[1]] if self.env.desc[new_x,new_y]=='H' else [[0]] 96 | 97 | # Approximated Rewards 98 | # return self.reward(*args) 99 | 100 | def transition(self, x, a): 101 | # Exact MDP dynamics 102 | # mapping = {0:[0,-1], 1:[1,0], 2:[0,1], 3:[-1,0]} 103 | # x, y = 
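# A minimal sketch of the count-based dynamics estimate that run() builds
# above, on hypothetical toy (s, a, s') samples (names below are made up
# for illustration only): P_hat(s'|s,a) = count(s,a,s') / count(s,a), and
# any (s, a) pair never seen in the data is treated as jumping to the
# terminal absorbing state.
from collections import Counter, defaultdict
toy_samples = [(0, 1, 4), (0, 1, 4), (0, 2, 1), (4, 1, 8)]
sas_counts = Counter(toy_samples)
sa_counts = Counter((s, a) for s, a, _ in toy_samples)
P_hat = defaultdict(dict)
for (s, a, s_prime), c in sas_counts.items():
    P_hat[(s, a)][s_prime] = float(c) / sa_counts[(s, a)]
# P_hat[(0, 1)] == {4: 1.0}; a key absent from P_hat means "terminal".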
np.where(np.arange(np.prod(self.env.desc.shape)).reshape(self.env.desc.shape) == x) 104 | # x,y = x[0], y[0] 105 | # delta_x,delta_y = mapping[a] 106 | # new_x = x + delta_x 107 | # new_y = y + delta_y 108 | # new_x,new_y = (new_x,new_y) if (0 <= new_x < self.env.desc.shape[0] and 0 <= new_y < self.env.desc.shape[1]) else (x,y) 109 | # done = True if self.env.desc[new_x,new_y]=='H' else False 110 | # done = done or (True if self.env.desc[new_x,new_y]=='G' else False) 111 | # return np.arange(np.prod(self.env.desc.shape)).reshape(self.env.desc.shape)[new_x,new_y], done 112 | 113 | #Approximated dynamics 114 | if tuple([x,a]) in self.P: 115 | try: 116 | state = np.random.choice(self.P[(x,a)].keys(), p=self.P[(x,a)].values()) 117 | except: 118 | import pdb; pdb.set_trace() 119 | done = False 120 | else: 121 | state = None 122 | done = True 123 | 124 | return state, done 125 | 126 | def Q(self, policy, x, a): 127 | 128 | Qs = [] 129 | 130 | state = x 131 | original_a = a 132 | done = False 133 | costs = [] 134 | trajectory_length = -1 135 | # Q 136 | while not done and trajectory_length < 200: 137 | trajectory_length += 1 138 | if trajectory_length > 0: 139 | a = policy([state])[0] 140 | 141 | 142 | new_state, done = self.transition(state, a) 143 | costs.append( self.R([state], [a])[0][0] ) 144 | if (tuple([state,a,new_state]) in self.terminal_transitions): 145 | done = True 146 | 147 | 148 | state = new_state 149 | 150 | return self.discounted_sum(costs, self.gamma) 151 | 152 | def V(self, policy, x): 153 | state = x 154 | done = False 155 | weighted_costs = [] 156 | trajectory_length = -1 157 | # V 158 | while not done and trajectory_length < 200: 159 | trajectory_length += 1 160 | # Because greedy deterministic policy 161 | a = policy([state])[0] 162 | 163 | new_state, done = self.transition(state, a) 164 | weighted_costs.append( self.R([state], [a])[0][0] ) 165 | if (tuple([state,a,new_state]) in self.terminal_transitions): 166 | done = True 167 | 168 | state = new_state 169 | 170 | return self.discounted_sum(weighted_costs, self.gamma) 171 | 172 | @staticmethod 173 | def discounted_sum(costs, discount): 174 | ''' 175 | Calculate discounted sum of costs 176 | ''' 177 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 178 | return y[::-1][0] 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class Model(object): 5 | def __init__(self): 6 | ''' 7 | Abstract class defining which functions a model should have 8 | ''' 9 | self.model = None 10 | 11 | def fit(self, X, y, verbose=0): 12 | raise NotImplemented 13 | 14 | def predict(self, X, a): 15 | raise NotImplemented 16 | 17 | def all_actions(self, X): 18 | raise NotImplemented 19 | 20 | def representation(*args): 21 | raise NotImplemented 22 | 23 | def copy_over_to(self, to_): 24 | to_.model.set_weights(self.model.get_weights()) 25 | 26 | def evaluate(self, verbose=False, render=False, **kw): 27 | return self.policy_evalutor.run(self, verbose=verbose, render=render, **kw) 28 | 29 | def min_over_a(self, X, randomized_tiebreaking=False, **kw): 30 | ''' 31 | Returns min_a Q(X,a), argmin_a Q(X,a) 32 | ''' 33 | Q_x_a = self.all_actions(X, **kw) 34 | return self.min_and_argmin(Q_x_a, randomized_tiebreaking, axis=1) 35 | 36 | def max_over_a(self, X, randomized_tiebreaking=False, **kw): 37 | ''' 38 | Returns min_a Q(X,a), argmin_a Q(X,a) 39 | ''' 40 | 41 | 
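# Descriptive note: this method returns max_a Q(X,a) and argmax_a Q(X,a)
# along the action axis (the mirror image of min_over_a above). With
# randomized_tiebreaking=True, max_and_argmax below multiplies a uniform
# random array by the boolean mask (Q == Q.max()) and takes the argmax of
# that, so when several entries attain the maximum one of them is picked
# at random rather than always the lowest index; min_and_argmin does the
# same with a negated random mask.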
Q_x_a = self.all_actions(X, **kw) 42 | return self.max_and_argmax(Q_x_a, randomized_tiebreaking, axis=1) 43 | 44 | @staticmethod 45 | def max_and_argmax(Q, randomized_tiebreaking=False, **kw): 46 | ''' max + Argmax + Breaks max/argmax ties randomly''' 47 | if not randomized_tiebreaking: 48 | return np.max(Q, **kw), np.argmax(Q, **kw) 49 | else: 50 | tie_breaker = np.random.random(Q.shape) * (Q==Q.max()) 51 | argmax = np.argmax(tie_breaker, **kw) # this is counter intuitive. 52 | return Q[np.arange(Q.shape[0]), argmax], argmax 53 | 54 | @staticmethod 55 | def min_and_argmin(Q, randomized_tiebreaking=False, **kw): 56 | ''' min + Argmin + Breaks min/argmin ties randomly''' 57 | if not randomized_tiebreaking: 58 | return np.min(Q, **kw), np.argmin(Q, **kw) 59 | else: 60 | tie_breaker = - np.random.random(Q.shape) * (Q==Q.min()) 61 | argmin = np.argmin(tie_breaker, **kw) 62 | return Q[np.arange(Q.shape[0]), argmin], argmin 63 | 64 | def __call__(self, *args, **kw): 65 | x_preprocessed = kw['x_preprocessed'] if 'x_preprocessed' in kw else False 66 | if len(args) == 1: 67 | ''' 68 | Run policy: pi = argmin_a Q(x,a) 69 | ''' 70 | x = args[0] 71 | return self.min_over_a(x, False, x_preprocessed=x_preprocessed)[1] 72 | elif len(args) == 2: 73 | ''' 74 | Evaluate Q(x,a) 75 | ''' 76 | x,a = args 77 | return self.predict(x,a, x_preprocessed=x_preprocessed) 78 | else: 79 | raise 80 | 81 | @staticmethod 82 | def cartesian_product(*arrays): 83 | la = len(arrays) 84 | dtype = np.result_type(*arrays) 85 | arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype) 86 | for i, a in enumerate(np.ix_(*arrays)): 87 | arr[...,i] = a 88 | return arr.reshape(-1, la) 89 | 90 | # def cartesian_product(x,y): 91 | # return np.hstack([np.tile(x.T, y.shape[1]).T, np.tile(y,x.shape[0]).reshape(-1,y.shape[1])]) 92 | -------------------------------------------------------------------------------- /models/pi_1.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_1.hdf5 -------------------------------------------------------------------------------- /models/pi_2.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_2.hdf5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn.h5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn.hdf5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn1.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn1.hdf5 -------------------------------------------------------------------------------- 
/models/pi_old_car_cnn_good.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn_good.hdf5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn_random_seed.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn_random_seed.hdf5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn_seed_2.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn_seed_2.hdf5 -------------------------------------------------------------------------------- /models/pi_old_map_size_8_mlp.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_map_size_8_mlp.h5 -------------------------------------------------------------------------------- /models/weights.01-2362.66.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/weights.01-2362.66.hdf5 -------------------------------------------------------------------------------- /models/weights.01-2542.47.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/weights.01-2542.47.hdf5 -------------------------------------------------------------------------------- /models/weights.01-2635.64.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/weights.01-2635.64.hdf5 -------------------------------------------------------------------------------- /neural_network.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import keras 4 | from keras.models import Sequential, Model as KerasModel 5 | from keras.layers import Input, Dense, Flatten, concatenate, dot 6 | from keras.losses import mean_squared_error 7 | from keras import optimizers 8 | from keras.callbacks import Callback, TensorBoard 9 | from exact_policy_evaluation import ExactPolicyEvaluator 10 | from keras_tqdm import TQDMCallback 11 | from model import Model 12 | 13 | from keras.layers.convolutional import Conv2D 14 | 15 | 16 | class NN(Model): 17 | def __init__(self, num_inputs, num_outputs, grid_shape, dim_of_actions, gamma, convergence_of_model_epsilon=1e-10, model_type='mlp', position_of_holes=None, position_of_goals=None): 18 | ''' 19 | An implementation of fitted Q iteration 20 | 21 | num_inputs: number of inputs 22 | num_outputs: number of outputs 23 | dim_of_actions: dimension of action space 24 | convergence_of_model_epsilon: small float. Defines when the model has converged. 
25 | ''' 26 | super(NN, self).__init__() 27 | self.convergence_of_model_epsilon = convergence_of_model_epsilon 28 | self.model_type = model_type 29 | self.dim_of_actions = dim_of_actions 30 | self.dim_of_state = grid_shape[0] * grid_shape[1] 31 | self.grid_shape = grid_shape 32 | 33 | if self.model_type == 'cnn': 34 | assert position_of_holes is not None 35 | assert position_of_goals is not None 36 | 37 | 38 | self.position_of_goals = position_of_goals 39 | 40 | if position_of_holes is not None: 41 | self.position_of_holes = np.zeros(self.dim_of_state) 42 | self.position_of_holes[position_of_holes] = 1 43 | self.position_of_holes = self.position_of_holes.reshape(self.grid_shape) 44 | else: 45 | self.position_of_holes = position_of_holes 46 | 47 | if position_of_goals is not None: 48 | self.position_of_goals = np.zeros(self.dim_of_state) 49 | self.position_of_goals[position_of_goals] = 1 50 | self.position_of_goals = self.position_of_goals.reshape(self.grid_shape) 51 | else: 52 | self.position_of_goals = position_of_goals 53 | 54 | self.model = self.create_model(num_inputs, num_outputs) 55 | #debug purposes 56 | self.policy_evalutor = ExactPolicyEvaluator([0], num_inputs-dim_of_actions, gamma) 57 | 58 | def copy_over_to(self, to_): 59 | to_.model.set_weights(self.model.get_weights()) 60 | 61 | def create_model(self, num_inputs, num_outputs): 62 | if self.model_type == 'mlp': 63 | model = Sequential() 64 | def init(): return keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001, seed=np.random.randint(2**32)) 65 | model.add(Dense(64, activation='tanh', input_shape=(num_inputs,),kernel_initializer=init(), bias_initializer=init())) 66 | model.add(Dense(num_outputs, activation='linear',kernel_initializer=init(), bias_initializer=init())) 67 | # adam = optimizers.Adam(clipnorm=1.) 
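# A minimal sketch of the input this MLP consumes (see representation()
# further down): a one-hot state concatenated with a one-hot action, e.g.
# 64 + 4 = 68 inputs for the 8x8 lake. The states/actions below are toy
# values for illustration; np is already imported at the top of this file.
dim_of_state, dim_of_actions = 64, 4
toy_states = np.array([0, 9])      # flattened grid positions
toy_actions = np.array([2, 3])
X_toy = np.hstack([np.eye(dim_of_state)[toy_states],
                   np.eye(dim_of_actions)[toy_actions]])
# X_toy.shape == (2, 68); each row feeds the Dense(64, 'tanh') layer above.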
68 | model.compile(loss='mean_squared_error', optimizer='Adam', metrics=['accuracy']) 69 | elif self.model_type == 'cnn': 70 | # input layer 71 | # 3 channels: holes, goals, player 72 | # and actions 73 | def init(): seed=np.random.randint(2**32); return keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001, seed=seed) 74 | inp = Input(shape=(self.grid_shape[0],self.grid_shape[1],1), name='grid') 75 | actions = Input(shape=(self.dim_of_actions,), name='mask') 76 | neighbors = Input(shape=(2*self.dim_of_actions,), name='holes_and_goals') 77 | 78 | # Grid feature extraction 79 | 80 | seed = np.random.randint(2**32) 81 | 82 | conv1 = Conv2D(16, kernel_size=2, activation='elu', padding='SAME', data_format='channels_last',kernel_initializer=init(), bias_initializer=init())(inp) 83 | # conv2 = Conv2D(16, kernel_size=3, activation='elu', padding='SAME', data_format='channels_last',kernel_initializer=init(), bias_initializer=init())(conv1) 84 | flat1 = Flatten()(conv1) 85 | 86 | # Holes + goals feature extractor 87 | # flat2 = Dense(20, activation='elu',kernel_initializer=init(), bias_initializer=init())(neighbors) 88 | 89 | # merge feature extractors 90 | # merge = concatenate([flat1, flat2]) 91 | 92 | # interpret 93 | hidden1 = Dense(10, activation='elu',kernel_initializer=init(), bias_initializer=init())(flat1) 94 | hidden2 = Dense(self.dim_of_actions, activation='linear',kernel_initializer=init(), bias_initializer=init())(hidden1) 95 | 96 | output = dot([hidden2, actions], 1) 97 | # predict 98 | # output = Dense(1, activation='linear',kernel_initializer=init(), bias_initializer=init())(hidden1) 99 | model = KerasModel(inputs=[inp, neighbors, actions], outputs=output) 100 | model.compile(loss='mean_squared_error', optimizer='Adam', metrics=['accuracy']) 101 | else: 102 | raise NotImplemented 103 | 104 | # model.summary() 105 | return model 106 | 107 | 108 | def fit(self, X, y, verbose=0, batch_size=512, epochs=1000, evaluate=False, tqdm_verbose=True, **kw): 109 | 110 | X = self.representation(X[:,0], X[:, 1]) 111 | self.callbacks_list = [EarlyStoppingByConvergence(epsilon=self.convergence_of_model_epsilon, diff =1e-10, verbose=verbose)]#, TQDMCallback(show_inner=False, show_outer=tqdm_verbose)] 112 | self.model.fit(X,y,verbose=verbose==2, batch_size=batch_size, epochs=epochs, callbacks=self.callbacks_list, **kw) 113 | 114 | if evaluate: 115 | return self.evaluate() 116 | else: 117 | return None 118 | 119 | def representation(self, *args): 120 | if self.model_type == 'mlp': 121 | if len(args) == 1: 122 | return np.eye(self.dim_of_state)[np.array(args[0]).astype(int)] 123 | elif len(args) == 2: 124 | return np.hstack([np.eye(self.dim_of_state)[np.array(args[0]).astype(int)], np.eye(self.dim_of_actions)[np.array(args[1]).astype(int)] ]) 125 | else: 126 | raise NotImplemented 127 | elif self.model_type == 'cnn': 128 | if len(args) == 1: 129 | position = np.eye(self.dim_of_state)[np.array(args[0]).astype(int)].reshape(-1,self.grid_shape[0],self.grid_shape[1]) 130 | X, surrounding = self.create_cnn_rep_helper(position) 131 | return [X, surrounding] 132 | elif len(args) == 2: 133 | position = np.eye(self.dim_of_state)[np.array(args[0]).astype(int)].reshape(-1,self.grid_shape[0],self.grid_shape[1]) 134 | X, surrounding = self.create_cnn_rep_helper(position) 135 | return [X, surrounding, np.eye(self.dim_of_actions)[np.array(args[1]).astype(int)] ] 136 | else: 137 | raise NotImplemented 138 | else: 139 | raise NotImplemented 140 | 141 | def create_cnn_rep_helper(self, position): 142 | how_many = 
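# Note on the CNN head above (descriptive comment): the network outputs a
# vector of Q-values, one per action, and the one-hot `actions` mask input
# selects a single Q(s,a) through the dot product. In NumPy terms, with
# toy numbers:
#   q_all = np.array([[0.1, -0.3, 0.2, 0.0]])   # Q(s, .) for one state
#   mask  = np.eye(4)[[2]]                       # one-hot action a = 2
#   np.sum(q_all * mask, axis=1)                 # -> array([0.2])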
position.shape[0] 143 | holes = np.repeat(self.position_of_holes[np.newaxis, :, :], how_many, axis=0) 144 | goals = np.repeat(self.position_of_goals[np.newaxis, :, :], how_many, axis=0) 145 | 146 | ix_x, ix_y, ix_z = np.where(position) 147 | surrounding = self.is_next_to([self.position_of_holes, self.position_of_goals], ix_y, ix_z) 148 | 149 | return np.sum([position*.5, holes*1, goals*(-1)], axis = 0)[:,:,:,np.newaxis], np.hstack(surrounding) 150 | 151 | def is_next_to(self, obstacles, x, y): 152 | # obstacles must be list 153 | assert np.all(np.array([obstacle.shape for obstacle in obstacles]) == obstacles[0].shape) 154 | surround = lambda x,y: [(x, y-1), (x+1, y), (x, y+1), (x-1, y)] 155 | 156 | ret = [] 157 | for idx in range(len(x)): 158 | neighbors = [] 159 | for a,b in surround(x[idx], y[idx]): 160 | # only works if all obstacles are same shape 161 | neighbor = np.vstack([obstacle[a, b] for obstacle in obstacles]) if 0 <= a < obstacles[0].shape[0] and 0 <= b < obstacles[0].shape[1] else np.array([0.]*len(obstacles)).reshape(1,-1).T 162 | neighbors.append(neighbor) 163 | 164 | ret.append(np.hstack(neighbors)) 165 | 166 | return np.stack(ret, axis=1) 167 | 168 | def predict(self, X, a): 169 | return self.model.predict(self.representation(X,a)) 170 | 171 | def all_actions(self, X): 172 | # X_a = ((x_1, a_1) 173 | # (x_1, a_2) 174 | # .... 175 | # (x_1, a_m) 176 | # ... 177 | # (x_N, a_1) 178 | # (x_N, a_2) 179 | # ... 180 | # ... 181 | # (x_N, a_m)) 182 | X = np.array(X) 183 | X_a = self.cartesian_product(X, np.arange(self.dim_of_actions)) 184 | 185 | 186 | # Q_x_a = ((Q_x1_a1, Q_x1_a2,... Q_x1_am) 187 | # (Q_x2_a1, Q_x2_a2,... Q_x2_am) 188 | # ... 189 | # (Q_xN_a1, Q_xN_a2,... Q_xN_am) 190 | # by reshaping using C ordering 191 | 192 | Q_x_a = self.predict(X_a[:,0], X_a[:,1]).reshape(X.shape[0],self.dim_of_actions,order='C') 193 | return Q_x_a 194 | 195 | class EarlyStoppingByConvergence(Callback): 196 | def __init__(self, monitor='loss', epsilon=0.01, diff=.001, use_both=True, verbose=0): 197 | super(Callback, self).__init__() 198 | self.monitor = monitor 199 | self.epsilon = epsilon 200 | self.diff = diff 201 | self.use_both = use_both 202 | self.verbose = verbose 203 | self.losses_so_far = [] 204 | self.converged = False 205 | 206 | def on_epoch_end(self, epoch, logs={}): 207 | self.epoch = epoch 208 | 209 | current = logs.get(self.monitor) 210 | if current is None: 211 | print("Early stopping requires %s available!" % self.monitor) 212 | exit() 213 | else: 214 | self.losses_so_far.append(current) 215 | 216 | if self.verbose: 217 | if (self.epoch % 100) == 0: 218 | print 'Epoch %s, loss: %s' % (epoch, self.losses_so_far[-1]) 219 | 220 | if self.use_both: 221 | if ((len(self.losses_so_far) > 1) and (np.abs(self.losses_so_far[-2] - self.losses_so_far[-1]) < self.epsilon)) or (self.losses_so_far[-1] < self.diff): 222 | self.model.stop_training = True 223 | self.converged = True 224 | else: 225 | pass 226 | else: 227 | if ((len(self.losses_so_far) > 1) and (np.abs(self.losses_so_far[-2] - self.losses_so_far[-1]) < self.epsilon)): 228 | self.model.stop_training = True 229 | self.converged = True 230 | else: 231 | pass 232 | 233 | 234 | def on_train_end(self, logs=None): 235 | if self.epoch > 1: 236 | if self.verbose > 0: 237 | if self.converged: 238 | print 'Epoch %s: early stopping. Converged. Delta: %s. Loss: %s' % (self.epoch, np.abs(self.losses_so_far[-2] - self.losses_so_far[-1]), self.losses_so_far[-1]) 239 | else: 240 | print 'Epoch %s. NOT converged. Delta: %s. 
Loss: %s' % (self.epoch, np.abs(self.losses_so_far[-2] - self.losses_so_far[-1]), self.losses_so_far[-1]) 241 | 242 | def on_train_begin(self, logs=None): 243 | # Allow instances to be re-used 244 | self.losses_so_far = [] 245 | self.converged = False 246 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /pi_old_car_cnn_main.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/pi_old_car_cnn_main.hdf5 -------------------------------------------------------------------------------- /play_car_racing.py: -------------------------------------------------------------------------------- 1 | from car_racing import ExtendedCarRacing 2 | import numpy as np 3 | 4 | if __name__=="__main__": 5 | from pyglet.window import key 6 | a = np.array( [0.0, 0.0, 0.0] ) 7 | def key_press(k, mod): 8 | global restart 9 | if k==0xff0d: restart = True 10 | if k==key.LEFT: a[0] = -1.0 11 | if k==key.RIGHT: a[0] = +1.0 12 | if k==key.UP: a[1] = +1.0 13 | if k==key.DOWN: a[2] = +0.8 # set 1.0 for wheels to block to zero rotation 14 | def key_release(k, mod): 15 | if k==key.LEFT and a[0]==-1.0: a[0] = 0 16 | if k==key.RIGHT and a[0]==+1.0: a[0] = 0 17 | if k==key.UP: a[1] = 0 18 | if k==key.DOWN: a[2] = 0 19 | env = ExtendedCarRacing(0, False, 12) 20 | env.render() 21 | record_video = False 22 | if record_video: 23 | env.monitor.start('/tmp/video-test', force=True) 24 | env.viewer.window.on_key_press = key_press 25 | env.viewer.window.on_key_release = key_release 26 | while True: 27 | env.reset() 28 | total_reward = 0.0 29 | steps = 0 30 | restart = False 31 | while True: 32 | s, r, done, info = env.step(a) 33 | print r[1][1], r[1][3], r[1][4] 34 | total_reward += r[0] 35 | if steps % 200 == 0 or done: 36 | pass 37 | # print("\naction " + str(["{:+0.2f}".format(x) for x in a])) 38 | # print("step {} total_reward {:+0.2f}".format(steps, total_reward)) 39 | #import matplotlib.pyplot as plt 40 | #plt.imshow(s) 41 | #plt.savefig("test.jpeg") 42 | steps += 1 43 | if not record_video: # Faster, but you can as well call env.render() every time to play full window. 
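# Descriptive note: `a` is the CarRacing action vector
# [steering in [-1, 1], gas in [0, 1], brake in [0, 1]] driven by the
# arrow-key handlers above, and float(env.tile_visited_count)/len(env.track)
# (printed after each episode below) is the fraction of track tiles covered.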
44 | env.render() 45 | if done or restart or float(env.tile_visited_count)>139: break 46 | print steps, float(env.tile_visited_count), len(env.track), float(env.tile_visited_count)/len(env.track) 47 | env.close() -------------------------------------------------------------------------------- /plot_fqe_quality_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | import matplotlib.pyplot as plt 5 | import os 6 | import numpy as np 7 | import seaborn as sns; sns.set(color_codes=True) 8 | from matplotlib.ticker import FuncFormatter 9 | def percent(x, pos): 10 | return '%1d%%' % (x) 11 | percent_formatter = FuncFormatter(percent) 12 | 13 | # Colors 14 | alpha = 0.15 15 | sns.set(style="whitegrid", palette="Paired") 16 | 17 | colorSet = sns.color_palette("Paired", 10); 18 | def color_gen(): 19 | 20 | 21 | colors = [ "dusty purple", "faded green", "amber", "windows blue", "coral"] 22 | colors = sns.xkcd_palette(colors) 23 | idx = -1 24 | while 1: 25 | idx = (idx + 1) % len(colors) 26 | yield colors[idx] 27 | 28 | 29 | path = os.path.join(os.getcwd(), 'experimental_results') 30 | files = os.listdir(path) 31 | csvs = [f for f in files if 'fqe_quality' in f] 32 | 33 | # tmp = pd.DataFrame([csv.split('.csv')[0].split('_')[2:] for csv in csvs], columns=['year','month','day','hour','minute','a','b']) 34 | # results_file = 'fqe_quality_' + '_'.join(tmp.sort_values(by=['year','month','day','hour','minute'], ascending=False).iloc[0]) + '.csv' 35 | # results_file = 'fqe_quality_2018_12_23_11_00_g_cnn.csv' 36 | # dr_fix = 'fqe_quality_fixed_dr.csv' 37 | results_file = 'fqe_quality_fixed_dr_tabular_4.csv' 38 | df = pd.read_csv(os.path.join(path, results_file)) 39 | df['trial_num'] = np.array([[i]*10 for i in range(int(1+max(df['trial_num'])))]).reshape(-1) 40 | df['num_trajectories'] = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9]*int(max(df['trial_num'])+1) 41 | # df_dr_fix = pd.read_csv(os.path.join(path, dr_fix)) 42 | 43 | # df = df.merge(df_dr_fix, left_on=['epsilon','num_trajectories','trial_num'], right_on=['epsilon','num_trajectories','trial_num'], how='left') 44 | # for col in [col for col in df.columns if '_y' in col]: 45 | # if 'doubly_robust' not in col: 46 | # del df[col] 47 | 48 | # for col in [col for col in df.columns if ('_x' in col) and ('doubly_robust' in col)]: 49 | # del df[col] 50 | # df.columns = [col.replace('_x', '') for col in df.columns] 51 | # df.columns = [col.replace('_y', '') for col in df.columns] 52 | 53 | def custom_plot(x, y, minimum, maximum, plot_band=True, zorder=11, alpha=.15, **kwargs): 54 | ax = kwargs.pop('ax', plt.gca()) 55 | base, = ax.plot(x, y, **kwargs) 56 | if plot_band: 57 | ax.fill_between(x, minimum, maximum, facecolor=base.get_color(), alpha=alpha, zorder=zorder) 58 | 59 | for epsilon, group in df.groupby('epsilon'): 60 | del group['epsilon'] 61 | # group.set_index('num_trajectories').plot() 62 | # import pdb; pdb.set_trace() 63 | small_value = 1e-10 64 | exact = group['approx_pdis'].iloc[0]+small_value 65 | print list(group.apply(lambda x: x+exact).groupby('num_trajectories'))[-1][1][['trial_num', 'exact', 'fqe']] 66 | means = group.apply(lambda x: x+exact).groupby('num_trajectories').mean() 67 | stds = group.apply(lambda x: x+exact).groupby('num_trajectories').std() 68 | 69 | medians = group.apply(lambda x: x+exact).groupby('num_trajectories').median() 70 | lower_quants = group.apply(lambda x: x+exact).groupby('num_trajectories').quantile(.25) 71 | 
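# Descriptive note: for each epsilon the estimates are grouped by
# `num_trajectories` (the fraction of data sub-sampled), shifted by the
# exact/true value, and summarized by mean/std as well as median and
# quartiles across trials; custom_plot() above then draws the mean curve
# with a +/- one-std band via fill_between.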
upper_quants = group.apply(lambda x: x+exact).groupby('num_trajectories').quantile(.75) 72 | 73 | del means['trial_num'] 74 | del stds['trial_num'] 75 | del medians['trial_num'] 76 | del lower_quants['trial_num'] 77 | del upper_quants['trial_num'] 78 | 79 | print '*'*20 80 | print 'Epsilon: %s' % epsilon 81 | print means 82 | print stds 83 | 84 | fig, ax = plt.subplots(1) 85 | colors = color_gen() 86 | for i, col in enumerate(['fqe', 'approx_pdis', 'doubly_robust', 'weighted_doubly_robust']): 87 | # import pdb; pdb.set_trace() 88 | 89 | x = np.array(np.unique(group['num_trajectories'])) 90 | mu = np.array(means[col]) 91 | sigma = np.array(stds[col]) 92 | lower_bound = mu + sigma 93 | upper_bound = mu - sigma 94 | # mu = np.array(medians[col]) 95 | # lower_bound = np.array(lower_quants[col]) 96 | # upper_bound = np.array(upper_quants[col]) 97 | 98 | 99 | 100 | col = ['Fitted Q Evaluation (FQE)', 'Per-Decision IS (PDIS)', 'Doubly Robust (DR)', 'Weighted Doubly Robust (WDR)', 'AM'][i] 101 | if (i == 0) or (i == 3): 102 | custom_plot(x*100, mu, lower_bound, upper_bound, plot_band=True,zorder=11, marker='o', label=col, color=colors.next()) 103 | else: 104 | custom_plot(x*100, mu, lower_bound, upper_bound, plot_band=False,zorder=11, marker='o', label=col, color=colors.next()) 105 | 106 | custom_plot(x*100, [exact]*len(x), lower_bound, upper_bound, plot_band=False, marker='o', label='True Value', color=colors.next()) 107 | 108 | 109 | # means.plot(yerr=stds) 110 | 111 | # plt.title(epsilon) 112 | col = color_gen().next() 113 | print 'Number of Trials: ', max(df['trial_num'])+1 114 | ax.legend(loc='upper right') 115 | ax.grid(alpha=.35) 116 | # ax.set_title('Probability of exploration: %s' % epsilon) 117 | ax.set_xlabel('Percentage of Data Sub-Sampled for Evaluation') 118 | ax.set_ylabel('Estimated Constraint Value') 119 | ax.set_title('Off-Policy Evaluation - Standalone Comparison', fontsize=16) 120 | ax.xaxis.set_major_formatter(percent_formatter) 121 | ax.set_ylim(bottom=-1, top=0) 122 | plt.tight_layout() 123 | plt.savefig('lake_fqe_vs_others.png', format='png', dpi=300) 124 | plt.show() 125 | 126 | -------------------------------------------------------------------------------- /plot_grid_search.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.lines as mlines 8 | from matplotlib.legend import Legend 9 | import seaborn as sns 10 | import deepdish as dd 11 | from mpl_toolkits.axes_grid1 import make_axes_locatable 12 | sns.set(context="paper")#, font="monospace") 13 | plt.rc('text', usetex=True) 14 | #sns.set(style="darkgrid", palette="Paired") 15 | 16 | # Load the datset of correlations between cortical brain networks 17 | #df = sns.load_dataset("brain_networks", header=[0, 1, 2], index_col=0) 18 | #corrmat = df.corr() 19 | 20 | # matrix = np.load('policy_role_freq.npy') 21 | import os 22 | df = pd.read_csv(os.path.join(os.getcwd(),'experimental_results','lspi.csv')) 23 | df = pd.DataFrame(np.array([[y.strip('(').strip(')') for y in x] for x in [x.split(',') for x in np.array(df.columns)]]).astype(float), 24 | columns = ['lambda_0', 'lambda_1', 'c_pi_exact', 'g_pi_exact_0', 'g_pi_exact_1', 'performance']) 25 | # import pdb; pdb.set_trace() 26 | df = pd.read_csv(os.path.join(os.getcwd(),'experimental_results','results_grid.csv')) 27 | data = 
dd.io.load(os.path.join(os.getcwd(),'experimental_results','policy_improvement_grid.h5')) 28 | # performance = np.array(df['performance']) 29 | performance = np.array(data['c_performance']) 30 | df = df[['c_pi_exact','g_pi_exact_0','g_pi_exact_1','lambda_0','lambda_1']] 31 | #labels=['c_pi_exact','g_pi_exact_0','g_pi_exact_1','lambda_0','lambda_1','performance']) 32 | 33 | 34 | main = np.array(df['c_pi_exact']).reshape(11,11) 35 | braking = np.array(df['g_pi_exact_0']).reshape(11,11) 36 | lane = np.array(df['g_pi_exact_1']).reshape(11,11) 37 | 38 | 39 | # import pdb; pdb.set_trace() 40 | 41 | 42 | # # Set up the matplotlib figure 43 | # f, axarr = plt.subplots(nrows=1,ncols=3, figsize=(12, 9)) 44 | # sns.set(font_scale=2) 45 | # upper_bound = [-60, 8, 135.]#[1.5, 5.] 46 | # lower_bound = [-30, 0., 0.]#[1.5, 5.] 47 | # for i, matrix in enumerate([main, braking, lane]): 48 | # sns.heatmap(matrix, cmap = 'summer', ax=axarr[i], vmin= lower_bound[i], vmax =upper_bound[i], square=True) 49 | # axarr[i].tick_params(axis='x', labelsize=18) 50 | # axarr[i].tick_params(axis='y', labelsize=18) 51 | # axarr[i].set_xlabel(r'$\lambda_0$' + ' (Braking Penalty)', fontsize = 18) 52 | # axarr[i].set_ylabel(r'$\lambda_1$' + ' (Center of Lane Penalty)', fontsize = 18) 53 | 54 | 55 | 56 | # #g.axes[0,0].set_xlabel('axes label 1') 57 | 58 | # # Use matplotlib directly to emphasize known networks 59 | # """ 60 | # networks = corrmat.columns.get_level_values("network") 61 | # for i, network in enumerate(networks): 62 | # if i and network != networks[i - 1]: 63 | # ax.axhline(len(networks) - i, c="w") 64 | # ax.axvline(i, c="w") 65 | # """ 66 | # f.tight_layout() 67 | # #f.savefig('role_frequency.png', format='png', dpi=300) 68 | # plt.show() 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | # generate data 78 | constraints = [5.8, 85.] 79 | use_rewards = True 80 | # x = np.linspace(0,1, num=11) 81 | # y = np.linspace(0,1, num=11) 82 | # X,Y = np.meshgrid(x,y) 83 | # signal = main.reshape(-1) 84 | det = (braking.reshape(-1) < constraints[0]) & (lane.reshape(-1) < constraints[1]) & (performance.reshape(-1) >= .95) #np.random.poisson(lam=0.5,size=len(x)*len(y)) 85 | det = det.astype(int) 86 | 87 | df_signal = df[['c_pi_exact', 'lambda_0', 'lambda_1']]#pd.DataFrame({"y":df.flatten(), "x":X.flatten(), "intensity":signal}) 88 | df_signal.columns = ['intensity', 'x', 'y'] 89 | df_det = pd.DataFrame({"y":df['lambda_1'], "x":df['lambda_0'], "det":det}) 90 | df_signal['intensity'] = -use_rewards*df_signal['intensity'] 91 | 92 | # prepare Dataframes 93 | dfmark = df_det[df_det["det"]>0] 94 | 95 | #plotting 96 | fig, ax=plt.subplots() 97 | divider = make_axes_locatable(ax) 98 | cax = divider.append_axes('right', size='5%', pad=0.05) 99 | 100 | x = df_signal["x"].unique() 101 | y = df_signal["y"].unique() 102 | ext = [x.min()-np.diff(x)[0]/2.,x.max()+np.diff(x)[0]/2., 103 | y.min()-np.diff(y)[0]/2.,y.max()+np.diff(y)[0]/2. 
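# Descriptive note: `det` above marks the (lambda_0, lambda_1) grid cells
# whose exact constraint values satisfy the thresholds (braking < 5.8,
# lane-center < 85) with performance >= .95. A sketch of selecting the
# best feasible cell, assuming toy arrays shaped like those above:
#   feasible = (braking.reshape(-1) < 5.8) & (lane.reshape(-1) < 85.)
#   objective = -main.reshape(-1)            # rewards are negated costs
#   best_idx = np.argmax(np.where(feasible, objective, -np.inf))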
] 104 | 105 | # df_signal['y'] += 1 106 | # df_signal['y'] = 1/df_signal['y'] 107 | df = df_signal.pivot(index="y", columns="x") 108 | im = ax.imshow(df, extent=ext, cmap=plt.get_cmap('YlGnBu'), origin='lower') 109 | ax.set_xticks(x) 110 | ax.set_xticklabels([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], fontsize=14) 111 | ax.set_yticks(y) 112 | ax.set_yticklabels([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], fontsize=14) 113 | 114 | # ax.scatter(dfmark["x"], dfmark["y"], marker="s", s=100, c="crimson") 115 | dx = np.diff(x)[0]; dy = np.diff(y)[0] 116 | recs = [] 117 | for (xi,yi), in zip(dfmark[["x","y"]].values): 118 | rec = plt.Rectangle((xi-dx/2.,yi-dy/2.),dx,dy, fill=False, 119 | edgecolor="black", lw=2, hatch='\\') 120 | recs.append(rec) 121 | ax.add_artist(rec) 122 | 123 | rec = plt.Rectangle((0.,0.),0,0, fill=False, 124 | edgecolor="black", lw=2, hatch='\\') 125 | recs.append(rec) 126 | 127 | df = df_signal.merge(df_det, how='left') 128 | 129 | # good_policies = -use_rewards*np.array(main).reshape(11,11) * det.reshape(11,11) 130 | for (xi, yi) in df[df['det']>0][df[df['det'] > 0]['intensity'] == df[df['det'] > 0]['intensity'].max()][['x','y']].values: 131 | 132 | best = plt.Rectangle((xi-dx/2.,yi-dy/2.),dx,dy, fill=False, 133 | edgecolor="crimson", lw=2, hatch='*' ) 134 | ax.add_artist(best) 135 | 136 | best = plt.Rectangle((0.,0.),0,0, fill=False, 137 | edgecolor="crimson", lw=2, hatch='*' ) 138 | 139 | # plt.legend([recs[-1], best]) 140 | cbar = fig.colorbar(im, cax=cax, orientation='vertical', ticks=np.arange(-10, 60, 10)[::-1]) 141 | cbar.ax.set_yticklabels(np.arange(10, -60, -10)[::-1], fontsize=16) 142 | cax.set_ylabel('Main Objective Value', fontsize=18) 143 | ax.set_xlabel(r'$\lambda_0$' + ' (Braking Penalty)', fontsize = 18) 144 | ax.set_ylabel(r'$\lambda_1$' + ' (Center of Lane Penalty)', fontsize = 18) 145 | ax.legend([recs[-1], best], ['Satisfies Constraints', 'Best, Satisfies Constraints'], fontsize=16, loc='upper left', framealpha=.4) 146 | ax.set_title('Regularized FQI Grid Search', fontsize=18) 147 | plt.tight_layout() 148 | plt.savefig('fqi_grid_search.png', format='png', dpi=300) 149 | plt.show() 150 | 151 | 152 | -------------------------------------------------------------------------------- /plot_policy_improvement_v2.py: -------------------------------------------------------------------------------- 1 | import deepdish as dd 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.lines as mlines 8 | from matplotlib.legend import Legend 9 | import seaborn as sns; sns.set(color_codes=True) 10 | import os 11 | import scipy.signal as signal 12 | 13 | # Colors 14 | alpha = 0.15 15 | sns.set(style="whitegrid", palette="Paired") 16 | colorSet = sns.color_palette("Paired", 10); 17 | 18 | def discounted_sum(costs, discount): 19 | ''' 20 | Calculate discounted sum of costs 21 | ''' 22 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 23 | return y[::-1][0] 24 | 25 | def color_gen(): 26 | 27 | colors = [ "dusty purple", "windows blue", "faded green", "dark pink", "amber"] 28 | colors = sns.xkcd_palette(colors) 29 | idx = -1 30 | while 1: 31 | idx = (idx + 1) % len(colors) 32 | yield colors[idx] 33 | 34 | # Data setup 35 | 36 | dones = dd.io.load(os.path.join('seed_2_data', 'car_data_is_done_seed_2.h5')) 37 | costs = dd.io.load(os.path.join('seed_2_data', 'car_data_rewards_seed_2.h5')) 38 | dones = np.hstack([0,1+np.where(dones)[0]]) 39 | episodes = [] 40 | for 
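# Descriptive note: discounted_sum() above evaluates
# sum_t discount**t * costs[t] with an IIR filter over the reversed costs,
# e.g. discounted_sum([1., 2., 3.], 0.5) == 1 + 0.5*2 + 0.25*3 == 2.75.
# The `dones` flags are converted into episode boundary indices
# (np.hstack([0, 1 + np.where(dones)[0]])), and the loop below slices the
# flat cost array into one episode per consecutive (low_, high_) pair.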
low_, high_ in zip(dones[:-1], dones[1:]): 41 | new_episode ={ 42 | 'c': costs[low_:high_, 0].reshape(-1), 43 | 'brake': costs[low_:high_, -1].reshape(-1), 44 | 'lane': costs[low_:high_, 3].reshape(-1), 45 | } 46 | 47 | episodes.append(new_episode) 48 | 49 | discounted_costs = np.array([[discounted_sum(x['c'],.95),discounted_sum(x['brake'],.95),discounted_sum(x['lane'],.95)] for x in episodes]) 50 | data = dd.io.load('car_policy_improvement.h5') 51 | DQN = [-39.61397106365249, 7.703194041056963, 115.62071639160499] 52 | LSPI = pd.read_csv('lspi_results.csv') 53 | plt.rc('text', usetex=True) 54 | 55 | 56 | lines, fill_betweens= [], [] 57 | plt.rc_context({'axes.edgecolor':'k'}) 58 | 59 | 60 | 61 | 62 | fig = plt.figure(figsize=(12, 6)) 63 | grid = plt.GridSpec(6, 4, wspace=0.6, hspace=0.5) 64 | ax1 = fig.add_subplot(grid[0:4, :2]) 65 | 66 | 67 | 68 | # fig, ax1 = plt.subplots() 69 | ax1.grid(alpha=.35) 70 | max_iterations = 27 71 | iterations = range(len(data['g_eval'][0][:max_iterations])) 72 | colors = color_gen() 73 | constraint_names = ['Braking', 'Center of Lane'] 74 | constraint_upper_bound = [5.8, 85.]#[1.5, 5.] 75 | locations = ['lower left', 'lower center', 'lower right'] 76 | fontsize = 16 77 | legend_fontsize = 16 78 | legend_title_fontsize = 16 79 | major_tick_mark_size = 14 80 | 81 | 82 | def derandomize(data, constraints, min_iteration): 83 | 84 | fqe_c = np.array(data['c_eval'][0])[:,-1] 85 | fqe_g_0 = np.array(data['g_eval'][0])[:,-1] 86 | fqe_g_1 = np.array(data['g_eval'][1])[:,-1] 87 | out = [] 88 | for iteration in range(min_iteration, len(fqe_c)): 89 | 90 | df_tmp = pd.DataFrame(np.hstack([np.arange(min_iteration,iteration+1).reshape(1,-1).T, fqe_c[min_iteration:(iteration+1)].reshape(1,-1).T, fqe_g_0[min_iteration:(iteration+1)].reshape(1,-1).T, fqe_g_1[min_iteration:(iteration+1)].reshape(1,-1).T ]), columns=['iteration', 'fqe_c', 'fqe_g_0', 'fqe_g_1']) 91 | df_tmp = df_tmp[(df_tmp['fqe_g_0'] < constraints[0]) & (df_tmp['fqe_g_1'] < constraints[1]) ] 92 | try: 93 | argmin = np.argmin(np.array(df_tmp['fqe_c'])) 94 | it = int(df_tmp.iloc[argmin]['iteration']) 95 | except: 96 | argmin = 0 97 | it = 0 98 | out.append(np.hstack([iteration, np.hstack([data['c_exacts'][it], np.array(data['g_exacts'])[it,:-1]]) ])) 99 | 100 | return pd.DataFrame(out, columns=['iterations', 'c_derandomized', 'g_0_derandomized', 'g_1_derandomized']) 101 | 102 | 103 | df_derandom = derandomize(data, np.array(constraint_upper_bound)*.8, 0) 104 | 105 | legend = [] 106 | car_color = colors.next() 107 | derandom_color = colors.next() 108 | c_values = np.array(data['c_eval_actuals'])[:max_iterations,-10:,:] # shape = (iteration #, k, performance) 109 | last = np.cumsum(c_values[:,-1,0])/np.arange(1,1+len(c_values[:,-1,0]))#*100 110 | evaluation = np.array(pd.DataFrame(c_values[:,-1,0]).expanding().mean()).reshape(-1) 111 | std = np.array(pd.DataFrame(c_values[:,-1,0]).expanding().std()).reshape(-1) 112 | 113 | #evaluation = np.mean(c_values, axis=1)[:,0]#*100 114 | #std = np.std(c_values, axis=1)[:,0]#*100 115 | lines.append( ax1.plot(iterations, last, color = car_color,linestyle='-',markersize=7, label='Exact') ) 116 | 117 | lines.append( ax1.plot(df_derandom['iterations'], df_derandom['c_derandomized'], color = derandom_color,linestyle='-',markersize=7, label='Exact') ) 118 | 119 | # lines.append( ax1.plot(iterations, evaluation, color = car_color,marker='s',markersize=7, label='Mean of last 10') ) 120 | # fill_betweens.append( ax1.fill_between(iterations,evaluation-std, evaluation+std, alpha = 
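# Descriptive note on derandomize() above: at each iteration t it takes
# the FQE estimates for iterations 0..t, keeps only those whose estimated
# constraint values fall below 0.8x the thresholds, picks the iteration
# with the smallest estimated main objective, and reports that iteration's
# exact C and G values -- i.e. a single policy chosen from the mixture via
# the off-policy estimates ("Algo 2 (Derandomized)" in the legend).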
alpha, color = car_color, zorder = 10) ) 121 | 122 | y_err_lower = last - np.min(c_values, axis=1)[:,0] 123 | y_err_higher = last - np.max(c_values, axis=1)[:,0] 124 | legend.append( mlines.Line2D([], [], color=car_color, linestyle='-', 125 | markersize=7, label='Percent of Track Covered') ) 126 | legend.append( mlines.Line2D([], [], color=derandom_color, linestyle='-', 127 | markersize=7, label='Percent of Track Covered') ) 128 | # legend.append( mlines.Line2D([], [], color=car_color, marker='s', 129 | # markersize=7, label='Percent of Track Covered') ) 130 | 131 | ## Baselines 132 | # LSPI 133 | lspi_color = colors.next() 134 | lspi = np.array(LSPI.iloc[:,0]) 135 | evaluation = np.array(pd.DataFrame(lspi).expanding().mean()).reshape(-1) 136 | lines.append( ax1.plot(iterations, evaluation, color = lspi_color,markersize=7,linestyle='-' , label='Exact') ) 137 | legend.append( mlines.Line2D([], [], color=lspi_color, linestyle='-' , 138 | markersize=7, label='Percent of Track Covered') ) 139 | 140 | # DQN 141 | dqn_color = colors.next() 142 | dqn = np.array([DQN[0]]*len(last)) 143 | evaluation = np.array(pd.DataFrame(dqn).expanding().mean()).reshape(-1) 144 | lines.append( ax1.plot(iterations, evaluation, color = dqn_color,markersize=7,linestyle='-' , label='Exact') ) 145 | legend.append( mlines.Line2D([], [], color=dqn_color, linestyle='-' , 146 | markersize=7, label='Percent of Track Covered') ) 147 | 148 | # Pi_D 149 | pi_d_color = colors.next() 150 | evaluation = np.mean(discounted_costs[:,0]).reshape(-1) 151 | lines.append( ax1.plot(iterations, [evaluation]*len(iterations), color = pi_d_color,markersize=7,linestyle='-' , label='Exact') ) 152 | legend.append( mlines.Line2D([], [], color=pi_d_color, linestyle='-' , 153 | markersize=7, label='Percent of Track Covered') ) 154 | 155 | legend.append( mlines.Line2D([], [], color='k', linestyle='--' , 156 | markersize=7, label='Percent of Track Covered') ) 157 | 158 | ax1.set_xlabel('Iteration (t)', fontsize=fontsize) 159 | ax1.set_ylabel('Value (Main Objective)', color='k', fontsize=fontsize+2) 160 | ax1.tick_params(axis='y', labelcolor='k') 161 | ax1.set_ylim(bottom=-55, top=-15) 162 | # ax1.set_ylim(bottom=20, top=55) 163 | ax1.set_xlim(-.5, 25) 164 | labels = np.array(['FQE', 'Algo 2', 'Mean', 'Regularized LSPI', 'Online-RL \n(no constraint)', 'Algo 2 \n(Derandomized)', r'$\pi_D$', 'Constraint Threshold']) 165 | leg = Legend(ax1, 166 | np.array(legend)[[0,1,3,2,4,5]], 167 | labels[[1,5,4,3,6,7]], 168 | loc='lower left', 169 | bbox_to_anchor=(.05,.02), 170 | bbox_transform=fig.transFigure, 171 | ncol = 2, 172 | frameon=True, 173 | fontsize = legend_fontsize-1) 174 | ax1.add_artist(leg) 175 | 176 | plt.tick_params(axis='both', which='major', labelsize=major_tick_mark_size) 177 | plt.tick_params(axis='both', which='minor', labelsize=8) 178 | ax1.set_title('Main Objective - Accumulated Cost', fontsize=fontsize+2) 179 | # plt.subplots_adjust(right=0.7) 180 | # plt.tight_layout()#rect=[0,.2,1,1]) 181 | # plt.tight_layout(rect=[-.025,-.025,.675,1.025]) 182 | # plt.savefig('car_main_value_wo_band.png', format='png', dpi=300) 183 | # plt.show() 184 | # import pdb; pdb.set_trace() 185 | 186 | 187 | 188 | 189 | # car_color = colors.next() 190 | # c_values = np.array(data['c_eval'][0])[:,-10:] # shape = (iteration #, k) 191 | # last = np.mean(c_values, axis=1)#c_values[:,-1,0]#*100 192 | # evaluation = np.mean(c_values, axis=1)#*100 193 | # std = np.std(c_values, axis=1)#*100 194 | # lines.append( ax1.plot(iterations, last, color = 
car_color,marker='o',markersize=7, label='Percent of Track Covered') ) 195 | # fill_betweens.append( ax1.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = car_color, zorder = 10) ) 196 | # legend.append( mlines.Line2D([], [], color=car_color, marker='o', 197 | # markersize=7, label='Percent of Track Covered') ) 198 | 199 | #labels = np.array([r"$FQE: \;\; \frac{1}{10}\sum_{i=40}^{50}\widehat{C^{i}}(\pi_{Q_{50}})$", r"$Exact: \;\; C(\pi_{Q_{50}})$", r'$Mean: \;\; \frac{1}{10}\sum_{i=40}^{50} C(\pi_{Q_i})$']) 200 | # labels = np.array(['FQE', r'$Our \; C$', 'Mean', r'$DQN \; C$', r'$LSPI \; C$']) 201 | # leg = Legend(ax1, np.array(legend)[[0,1,2]], labels[[1,3,4]], title='Main Objective', loc=locations[0], frameon=False, fontsize = legend_fontsize) #shadow=True, fancybox=True, 202 | # ax1.add_artist(leg) 203 | # ax = ax1.twinx() 204 | 205 | # tex_labels = [[r'$FQE: \;\; \frac{1}{10}\sum_{i=40}^{50} \widehat{G^{i}_0}(\pi_{Q_{50}})$', r'$Exact: \;\; G_0(\pi_{Q_{50}})$'], 206 | # [r'$FQE: \;\; \frac{1}{10}\sum_{i=40}^{50} \widehat{G^{i}_1}(\pi_{Q_{50}})$', r'$Exact: \;\; G_1(\pi_{Q_{50}})$']] 207 | # tex_labels = [[r'$FQE \; G_0$', r'$Our \; G_0$'], [r'$FQE \; G_1$', r'$Our \; G_1$']] 208 | # baseline_labels = [[r'$DQN \; G_0$', r'$LSPI \; G_0$'], [r'$DQN \; G_1$', r'$LSPI \; G_1$']] 209 | tex_labels = [[r'$FQE \; G_0$', 'Algo 2', 'Algo 2 (Derandomized)'], [r'$FQE \; G_1$', 'Algo 2', 'Algo 2 (Derandomized)']] 210 | # baseline_labels = [['DDQN', 'LSPI'], ['DDQN', 'LSPI']] 211 | baseline_labels = [['Online-RL (no constraint)', 'Regularized LSPI', r'$\pi_D$'], ['Online-RL (no constraint)', 'Regularized LSPI', r'$\pi_D$']] 212 | # plt.clf() 213 | # fig, axs = plt.subplots(2, sharex=True) 214 | axs = [] 215 | axs.append(fig.add_subplot(grid[:3, 2:])) 216 | axs.append(fig.add_subplot(grid[3:, 2:])) 217 | 218 | 219 | for idx in data['g_eval'].keys(): 220 | colors = color_gen() 221 | ax = axs[idx] 222 | ax.grid(alpha=.35) 223 | legend = [] 224 | # FQE 225 | constraint = np.array(data['g_eval'][idx]) # shape = (iteration #, k) referring to Q_k 226 | constraint = constraint[:max_iterations,-10:] 227 | 228 | # evaluation = np.array(pd.DataFrame(np.mean(constraint, axis = 1)/constraint_upper_bound[idx]).expanding().mean()).reshape(-1) 229 | # std = np.array(pd.DataFrame(np.mean(constraint, axis = 1)/constraint_upper_bound[idx]).expanding().std()).reshape(-1) 230 | 231 | # evaluation = np.mean(constraint, axis = 1)/constraint_upper_bound[idx] 232 | # std = np.std(constraint, axis=1)/constraint_upper_bound[idx] 233 | 234 | label = constraint_names[idx] 235 | 236 | # lines.append( ax.plot(iterations, evaluation, color = color,marker='o',markersize=7, label=tex_labels[idx][0]) ) 237 | # fill_betweens.append( ax.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = color, zorder = 11+2*idx) ) 238 | 239 | #legend.append( mlines.Line2D([], [], color=color, marker='o', 240 | # markersize=7, label=tex_labels[idx][0]) ) 241 | 242 | # EXACT 243 | g_exacts = np.array(data['g_exacts'])[:max_iterations,idx]#/constraint_upper_bound[idx] 244 | evaluation = np.array(pd.DataFrame(g_exacts).expanding().mean()).reshape(-1) 245 | std = np.array(pd.DataFrame(g_exacts).expanding().std()).reshape(-1) 246 | 247 | lines.append( ax.plot(iterations, evaluation, color = car_color ,linestyle='-' ,linewidth=2.0, label=tex_labels[idx][1]) ) 248 | legend.append( mlines.Line2D([], [], color=car_color, linestyle='-' ,linewidth=2.0, 249 | markersize=7, label=tex_labels[idx][1]) ) 250 | 
251 | # Derandomized 252 | lines.append( ax.plot(df_derandom['iterations'], df_derandom['g_%s_derandomized' % idx], linewidth=2.0,color = derandom_color,linestyle='-',markersize=7, label='Exact') ) 253 | legend.append( mlines.Line2D([], [], color=derandom_color, linestyle='-' ,linewidth=2.0, 254 | markersize=7, label=tex_labels[idx][2]) ) 255 | 256 | 257 | # fill_betweens.append( ax.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = color, zorder = 11+2*idx+1) ) 258 | 259 | ## BASELINES 260 | 261 | # fill_betweens.append( ax.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = color, zorder = 11+2*idx+1) ) 262 | # LSPI 263 | baseline = np.array(LSPI.iloc[:,idx+1])*constraint_upper_bound[idx] 264 | evaluation = np.array(pd.DataFrame(baseline).expanding().mean()).reshape(-1) 265 | std = np.array(pd.DataFrame(baseline).expanding().std()).reshape(-1) 266 | 267 | lines.append( ax.plot(iterations, evaluation, color = lspi_color , linestyle='-', linewidth=2.0,label=baseline_labels[idx][1]) ) 268 | legend.append( mlines.Line2D([], [], color=lspi_color, linestyle='-', linewidth=2.0, 269 | markersize=7, label=baseline_labels[idx][1]) ) 270 | # fill_betweens.append( ax.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = color, zorder = 11+2*idx+1) ) 271 | # DQN 272 | baseline = np.array([DQN[idx+1]]*len(evaluation))#/constraint_upper_bound[idx] 273 | evaluation = np.array(pd.DataFrame(baseline).expanding().mean()).reshape(-1) 274 | std = np.array(pd.DataFrame(baseline).expanding().std()).reshape(-1) 275 | 276 | lines.append( ax.plot(iterations, evaluation, color = dqn_color , linestyle='-' ,linewidth=2.0, label=baseline_labels[idx][0]) ) 277 | legend.append( mlines.Line2D([], [], color=dqn_color, linestyle='-',linewidth=2.0, 278 | markersize=7, label=baseline_labels[idx][0]) ) 279 | 280 | # Pi_D 281 | evaluation = np.mean(discounted_costs[:,idx+1]).reshape(-1) 282 | lines.append( ax.plot(iterations, [evaluation]*len(iterations), color = pi_d_color,markersize=7,linewidth=2.0,linestyle='-' , label=baseline_labels[idx][2]) ) 283 | legend.append( mlines.Line2D([], [], color=pi_d_color, linestyle='-' ,linewidth=2.0, 284 | markersize=7, label=baseline_labels[idx][2]) ) 285 | 286 | 287 | # THRESHOLD 288 | constraint_violation = [constraint_upper_bound[idx]]*len(iterations) 289 | lines.append( ax.plot(iterations, constraint_violation, color = 'k', linestyle=':', linewidth=2.0, marker=None) ) 290 | legend.append( mlines.Line2D([], [], color='k', linestyle=':', marker=None, linewidth=2.0, label='Constraint Threshold') ) 291 | 292 | 293 | labels = [] 294 | for i in range(len(legend)): 295 | line = legend[i] 296 | try: 297 | labels += [line.label] 298 | except: 299 | labels += [line.get_label()] 300 | 301 | if idx == 1: 302 | # leg = Legend(ax, 303 | # legend, 304 | # labels, 305 | # title=label, 306 | # # loc='center left', 307 | # bbox_to_anchor=(0.5, -0.05), 308 | # ncol = 3, 309 | # frameon=False, 310 | # # bbox_to_anchor=(1, 0.5), 311 | # fontsize = legend_fontsize-3) 312 | leg = Legend(ax, 313 | legend, 314 | labels, 315 | loc='lower center', 316 | bbox_to_anchor=(.5,0), 317 | bbox_transform=fig.transFigure, 318 | ncol = 2, 319 | frameon=True, 320 | fontsize = legend_fontsize-2) 321 | 322 | plt.setp(leg.get_title(),fontsize='%s' % legend_title_fontsize) 323 | # ax.add_artist(leg) 324 | 325 | if idx == 1: 326 | ax.set_xlabel('Iteration (t)', fontsize=fontsize) 327 | lab = ['Value (Braking)', 'Value (Lane Center)'][idx] 328 | 
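# Descriptive note: the dotted 'Constraint Threshold' lines correspond to
# constraint_upper_bound = [5.8, 85.] (braking and center-of-lane budgets);
# each panel shows the running mean of the exact constraint value for
# Algo 2 and its derandomized variant against the regularized LSPI,
# online-RL (DQN) and pi_D baselines.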
ax.set_ylabel(lab, color='k', fontsize=fontsize+2) 329 | # ax.set_ylim(bottom=-1, top=3) 330 | ax.tick_params(axis='y', labelcolor='k') 331 | if idx == 0: ax.set_ylim(bottom=-2, top=15) 332 | ax.set_xlim(-.5, 25) 333 | axs[0].tick_params(axis='both', which='major', labelsize=major_tick_mark_size) 334 | axs[0].set_xticklabels([]) 335 | axs[1].tick_params(axis='both', which='major', labelsize=major_tick_mark_size) 336 | axs[0].set_title('Constraints - Accumulated Cost', fontsize=fontsize+2) 337 | plt.tight_layout(rect=[-.025,-.025,1.025,1.025])#rect=[0,.2,1,1]) 338 | plt.savefig('car_all_values_wo_band.png', format='png', dpi=300) 339 | plt.show() 340 | 341 | import pdb; pdb.set_trace() 342 | 343 | 344 | -------------------------------------------------------------------------------- /plot_results.py: -------------------------------------------------------------------------------- 1 | import deepdish as dd 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.lines as mlines 8 | from matplotlib.legend import Legend 9 | import seaborn as sns; sns.set(color_codes=True) 10 | import os 11 | from exponentiated_gradient import ExponentiatedGradient 12 | from matplotlib.lines import Line2D 13 | 14 | # Colors 15 | alpha = 0.15 16 | sns.set(style="whitegrid", palette="Paired") 17 | colorSet = sns.color_palette("Paired", 10); 18 | 19 | def color_gen(): 20 | 21 | colors = ["dark pink","dusty purple", "amber", "faded green", "windows blue", ] 22 | colors = sns.xkcd_palette(colors) 23 | idx = -1 24 | while 1: 25 | idx = (idx + 1) % len(colors) 26 | yield colors[idx] 27 | 28 | plt.rc('text', usetex=True) 29 | EG = ExponentiatedGradient(5., 2, 10.) 30 | 31 | path = os.path.join(os.getcwd(), 'experimental_results') 32 | files = os.listdir(path) 33 | csvs = [f for f in files if 'experiment_results' in f] 34 | tmp = pd.DataFrame([csv.split('.csv')[0].split('_')[2:] for csv in csvs], columns=['year','month','day','hour','minute']) 35 | results_file = 'experiment_results_' + '_'.join(tmp.sort_values(by=['year','month','day','hour','minute'], ascending=False).iloc[0]) + '.csv' 36 | 37 | # results_file = 'experiment_results_12_18_2018_22_20.csv' 38 | df = pd.read_csv(os.path.join(path, results_file)) 39 | df['iteration'] -= 2 40 | 41 | df['primal_dual_gap'] = df['max_L'] - df['min_L'] 42 | 43 | # plt.plot(df['iteration'], df['primal_dual_gap']) 44 | # plt.show() 45 | def unrandomize(df, constraints, min_iteration): 46 | df = df[df['iteration'] >= min_iteration] 47 | 48 | out = [] 49 | for iteration in range(min_iteration, int(max(df['iteration']))): 50 | df_tmp = df[df['iteration'] <= iteration] 51 | df_tmp = df_tmp[df_tmp['g_pi'] < constraints[0]] 52 | argmin = np.argmin(np.array(df_tmp['c_pi'])) 53 | out.append(np.hstack([iteration, np.array(df_tmp.iloc[argmin][['c_pi_exact', 'g_pi_exact']]) ])) 54 | 55 | return pd.DataFrame(out, columns=['iteration', 'c_unrandomized', 'g_unrandomized']) 56 | 57 | 58 | df_unrandom = unrandomize(df, [.1], 5) 59 | 60 | 61 | # f, ax = plt.subplots() 62 | # ax.plot(df['iteration'], df['primal_dual_gap']) 63 | # ax.set_title('Primal Dual Gap') 64 | 65 | # import pdb; pdb.set_trace() 66 | 67 | fontsize=20 68 | colors = color_gen() 69 | color_optimal = colors.next() 70 | plt.plot(df['iteration'], df['primal_dual_gap'], color='b', label='Empirical Gap') 71 | plt.plot(df['iteration'], [0]*len(df['iteration']), color=color_optimal, label='Minimum/Optimal Gap') 72 | plt.xlabel('Iteration ' 
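# Descriptive note: primal_dual_gap = max_L - min_L is the empirical gap of
# the Lagrangian game driven by the exponentiated-gradient updates; the plot
# tracks it per iteration against the zero ('Minimum/Optimal Gap') line, a
# shrinking gap indicating convergence of the returned mixed policy.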
+ r'$(t)$', fontsize=fontsize) 73 | plt.ylabel('Primal-Dual Gap ' + r'$(\widehat{L}_{max} - \widehat{L}_{min})$', fontsize=fontsize) 74 | plt.legend(fontsize=fontsize) 75 | plt.tick_params(axis='both', which='major', labelsize=fontsize) 76 | plt.xlim((-1,150)) 77 | plt.ylim((-.02,2)) 78 | plt.tight_layout() 79 | plt.title('Convergence Behavior of Algo 2', fontsize=fontsize) 80 | plt.savefig('lake_primal_dual_gap.png', format='png', dpi=300) 81 | plt.clf() 82 | plt.show() 83 | 84 | # W BANDS 85 | fontsize=16 86 | f, axarr = plt.subplots(2, sharex=True) 87 | colors = color_gen() 88 | color_optimal = colors.next() 89 | color_main = colors.next() 90 | vals = pd.DataFrame(df['c_pi']) 91 | evaluation = np.array(vals.expanding().mean()).reshape(-1) 92 | std = np.array(vals.expanding().std()).reshape(-1) 93 | axarr[0].plot(df['iteration'], evaluation, color=color_main, label='Algo 2') 94 | axarr[0].fill_between(df['iteration'],evaluation-std, evaluation+std, alpha = alpha, color = color_main, zorder = 11) 95 | color_pi_d = colors.next() 96 | axarr[0].plot(df['iteration'], [-9.94428910084026e-05]*len(df['iteration']), color=color_pi_d, label=r'$\pi_D$') 97 | axarr[0].fill_between(df['iteration'],[-9.94428910084026e-05-0.002297397386833141]*len(df['iteration']), [-9.94428910084026e-05+0.002297397386833141]*len(df['iteration']), alpha = alpha, color = color_main, zorder = 11) 98 | # axarr[0].set_ylabel('Main Objective Value \n of ' + r'$\widehat{\pi_t}$', fontsize=fontsize) 99 | axarr[0].plot(df['iteration'], [-(.9**13)]*len(df['iteration']), color=color_optimal, label='Optimal Value') 100 | axarr[0].set_ylabel('Main Objective Value', fontsize=fontsize-2) 101 | line0 = Line2D([0,1],[0,1],linestyle='-', color=color_main) 102 | line2 = Line2D([0,1],[0,1],linestyle='-', color=color_pi_d) 103 | line4 = Line2D([0,1],[0,1],linestyle='-', color=color_optimal) 104 | # axarr[0].legend([line0, line2, line4], ['Algo 2', r'$\pi_D$', 'Optimal Value'], loc='lower right', fontsize=12, frameon=True) 105 | # axarr[0].legend(fontsize=fontsize, loc='lower right', frameon=True) 106 | axarr[0].grid(alpha=.35) 107 | 108 | 109 | evaluation = np.array(pd.DataFrame(df['g_pi_exact']).expanding().mean()).reshape(-1) 110 | std = np.array(pd.DataFrame(df['g_pi_exact']).expanding().std()).reshape(-1) 111 | axarr[1].plot(df['iteration'], evaluation, linewidth=2.0, linestyle=(0,[8,8]), color=color_main, label=r'$G(\widehat{\pi_t})$') 112 | axarr[1].fill_between(df['iteration'],evaluation-std, evaluation+std, alpha = alpha, color = color_main, zorder = 11) 113 | axarr[1].plot(df['iteration'], [0.15173932921437544]*len(df['iteration']), color=color_pi_d, label=r'$pi_D$') 114 | axarr[1].fill_between(df['iteration'],[0.15173932921437544-0.162341876715503]*len(df['iteration']),[0.15173932921437544+0.162341876715503]*len(df['iteration']), alpha = alpha, color = color_pi_d, zorder = 11) 115 | axarr[1].plot(df['iteration'], [0]*len(df['iteration']), color=color_optimal, linewidth=2., linestyle=(8,[8,8]), label='Optimal value') 116 | axarr[1].plot(df['iteration'], [.1]*len(df['iteration']), color = 'k', linestyle=':', linewidth=2.0, label='Threshold', marker=None) 117 | 118 | line0 = Line2D([0,1],[0,1],linestyle='-', color=color_main) 119 | line2 = Line2D([0,1],[0,1],linestyle='-', color=color_pi_d) 120 | line4 = Line2D([0,1],[0,1],linestyle='-', color=color_optimal) 121 | line5 = Line2D([0,1],[0,1],linestyle='--', color='k') 122 | axarr[1].legend([line0, line2, line4, line5], ['Algo 2', r'$\pi_D$', 'Optimal Value', 'Constraint Threshold'], 
loc='lower center', bbox_to_anchor=(.5,0), bbox_transform=f.transFigure, ncol = 2, fontsize=fontsize, frameon=True) 123 | 124 | # axarr[1].set_ylabel('Constraint Value \n of ' + r'$\widehat{\pi_t}$', fontsize=fontsize) 125 | axarr[1].set_ylabel('Constraint Value', fontsize=fontsize-2) 126 | # axarr[1].legend([line0, line1, line2, line3], ['Our Algorithm', 'DDQN (no constraint)', 'Optimal Value', 'Threshold'], loc='lower right', fontsize=fontsize, frameon=True) 127 | axarr[1].grid(alpha=.35) 128 | axarr[1].set_ylim(-.05, .35) 129 | 130 | plt.xlabel('Iteration ' + r'$(t)$', fontsize=fontsize) 131 | plt.xlim((-1,150)) 132 | # plt.ylim((-.02,2)) 133 | plt.tight_layout(rect=[0,.15,1,1]) 134 | fig = plt.gcf() 135 | size = fig.get_size_inches() 136 | # fig.set_size_inches(size[0], size[1]+.75) 137 | axarr[0].set_title('Main Objective and Constraint - Accumulated Cost', fontsize=fontsize) 138 | plt.savefig('lake_values.png', format='png', dpi=300) 139 | plt.show() 140 | 141 | # WO BANDS 142 | f, axarr = plt.subplots(2, sharex=True) 143 | colors = color_gen() 144 | color_optimal = colors.next() 145 | color_main = colors.next() 146 | vals = pd.DataFrame(df['c_pi']) 147 | evaluation = np.array(vals.expanding().mean()).reshape(-1) 148 | std = np.array(vals.expanding().std()).reshape(-1) 149 | axarr[0].plot(df['iteration'], evaluation, color=color_main, label='Algo 2') 150 | # axarr[0].fill_between(df['iteration'],evaluation-std, evaluation+std, alpha = alpha, color = color_main, zorder = 11) 151 | color_pi_d = colors.next() 152 | axarr[0].plot(df['iteration'], [-9.94428910084026e-05]*len(df['iteration']), color=color_pi_d, label=r'$\pi_D$') 153 | spacing = 8 154 | axarr[0].plot(df['iteration'], [-(.9**13)]*len(df['iteration']), linestyle=(0*spacing,[spacing,spacing*2]), color=color_optimal, label='Optimal Value') 155 | color_ddqn = colors.next() 156 | axarr[0].plot(df['iteration'], [-(.9**13)]*len(df['iteration']), linestyle=(1*spacing,[spacing,spacing*2]), color=color_ddqn, label='DDQN (no constraint)') 157 | color_unrandomized = colors.next() 158 | # axarr[0].plot(df_unrandom['iteration'], df_unrandom['c_unrandomized'], linestyle=(16,[8,24]), color=color_unrandomized, label='Algo 2 (Unrandomized)') 159 | axarr[0].plot(df['iteration'], [-(.9**13)]*len(df['iteration']), linestyle=(2*spacing,[spacing,spacing*2]), color=color_unrandomized, label='Algo 2 (Unrandomized)') 160 | 161 | # axarr[0].set_ylabel('Main Objective Value \n of ' + r'$\widehat{\pi_t}$', fontsize=fontsize) 162 | axarr[0].set_ylabel('Main Objective Value', fontsize=fontsize-2) 163 | line0 = Line2D([0,1],[0,1],linestyle='-', color=color_main) 164 | line1 = Line2D([0,1],[0,1],linestyle='-', color=color_unrandomized) 165 | line2 = Line2D([0,1],[0,1],linestyle='-', color=color_pi_d) 166 | line3 = Line2D([0,1],[0,1],linestyle='-', color=color_ddqn) 167 | line4 = Line2D([0,1],[0,1],linestyle='-', color=color_optimal) 168 | # axarr[0].legend([line0, line1, line2, line3, line4], ['Algo 2', 'Algo 2 (Derandomized)', r'$\pi_D$', 'Online-RL (no constraint)', 'Optimal Value' ], loc='lower right', fontsize=12, frameon=True) 169 | # axarr[0].legend(fontsize=fontsize, loc='lower right', frameon=True) 170 | axarr[0].grid(alpha=.35) 171 | 172 | 173 | evaluation = np.array(pd.DataFrame(df['g_pi_exact']).expanding().mean()).reshape(-1) 174 | std = np.array(pd.DataFrame(df['g_pi_exact']).expanding().std()).reshape(-1) 175 | axarr[1].plot(df['iteration'], evaluation, linewidth=2.0, linestyle=(0*spacing,[spacing,spacing*3]), color=color_main, 
label=r'$G(\widehat{\pi_t})$') 176 | # axarr[1].fill_between(df['iteration'],evaluation-std, evaluation+std, alpha = alpha, color = color_main, zorder = 11) 177 | axarr[1].plot(df['iteration'], [0.15173932921437544]*len(df['iteration']), color=color_pi_d, label=r'$pi_D$') 178 | axarr[1].plot(df['iteration'], [0]*len(df['iteration']), color=color_optimal, linewidth=2.0, linestyle=(1*spacing,[spacing,spacing*3]), label='Optimal value') 179 | axarr[1].plot(df['iteration'], [0]*len(df['iteration']), color=color_unrandomized, linewidth=2.0, linestyle=(2*spacing,[spacing,spacing*3]), label='Unrandomized') 180 | axarr[1].plot(df['iteration'], [0]*len(df['iteration']), color=color_ddqn, linewidth=2.0, linestyle=(3*spacing,[spacing,spacing*3]), label='DDQN') 181 | axarr[1].plot(df['iteration'], [.1]*len(df['iteration']), color = 'k', linestyle=':', linewidth=2.0, label='Threshold', marker=None) 182 | 183 | line0 = Line2D([0,1],[0,1],linestyle='-', color=color_main) 184 | line1 = Line2D([0,1],[0,1],linestyle='-', color=color_unrandomized) 185 | line2 = Line2D([0,1],[0,1],linestyle='-', color=color_pi_d) 186 | line3 = Line2D([0,1],[0,1],linestyle='-', color=color_ddqn) 187 | line4 = Line2D([0,1],[0,1],linestyle='-', color=color_optimal) 188 | line5 = Line2D([0,1],[0,1],linestyle='--', color='k') 189 | axarr[1].legend([line0, line1, line2, line3, line4, line5], ['Algo 2', 'Algo 2 (Derandomized)', r'$\pi_D$', 'Online-RL (no constraint)', 'Optimal Value', 'Constraint Threshold'], loc='lower center', bbox_to_anchor=(.5,0), bbox_transform=f.transFigure, ncol = 2, fontsize=fontsize-2, frameon=True) 190 | 191 | # axarr[1].set_ylabel('Constraint Value \n of ' + r'$\widehat{\pi_t}$', fontsize=fontsize) 192 | axarr[1].set_ylabel('Constraint Value', fontsize=fontsize-2) 193 | # axarr[1].legend([line0, line1, line2, line3], ['Our Algorithm', 'DDQN (no constraint)', 'Optimal Value', 'Threshold'], loc='lower right', fontsize=fontsize, frameon=True) 194 | axarr[1].grid(alpha=.35) 195 | 196 | plt.xlabel('Iteration ' + r'$(t)$', fontsize=fontsize) 197 | plt.xlim((-1,150)) 198 | # plt.ylim((-.02,2)) 199 | plt.tight_layout(rect=[0,.18,1,1]) 200 | fig = plt.gcf() 201 | size = fig.get_size_inches() 202 | axarr[0].set_title('Main Objective and Constraint - Accumulated Cost', fontsize=fontsize) 203 | # fig.set_size_inches(size[0], size[1]+.75) 204 | plt.savefig('lake_values_wo_band.png', format='png', dpi=300) 205 | plt.show() 206 | 207 | 208 | 209 | 210 | 211 | 212 | # # Two subplots, the axes array is 1-d 213 | # number_of_constraints = len([col for col in df.columns if 'g_avg_' in col]) 214 | # f, axarr = plt.subplots(2+number_of_constraints, sharex=True) 215 | # axarr[0].plot(df['iteration'], df['primal_dual_gap'], label='gap') 216 | # # axarr[0].plot(df['iteration'], pd.ewma(df['primal_dual_gap'], span=100), label='moving average') 217 | # axarr[0].plot(df['iteration'], [0]*len(df['iteration']), color='g', label='Minimum/Optimal Gap') 218 | # axarr[0].set_title('Primal Dual Gap') 219 | # axarr[0].legend() 220 | # axarr[1].plot(df['iteration'], df['c_avg'], color='b', label='C fqe') 221 | # axarr[1].plot(df['iteration'], df['c_exact_avg'], color = 'r', label='C exact') 222 | # # axarr[1].plot(df['iteration'], [-0.254186583]*len(df['iteration']), color='g', label='C optimal') 223 | # # axarr[1].scatter(0, -2.763302804497763e-05, marker='x', color='k', label='C pi_old') 224 | # axarr[1].set_title('Value of C of mean policy') 225 | # axarr[1].legend() 226 | # for col in range(number_of_constraints): 227 | # 
axarr[2+col].plot(df['iteration'], df['g_avg_%s' % col], color='b', label='G_%s fqe' % col) 228 | # axarr[2+col].plot(df['iteration'], df['g_exact_avg_%s' % col], color='r', label='G_%s exact' % col) 229 | # # axarr[2].plot(df['iteration'], [0.]*len(df['iteration']), color='g', label='G optimal') 230 | # # axarr[2].scatter(0, 0.13755537388963082, marker='x', color='k', label='G pi_old') 231 | # axarr[2+col].set_title('Value of G_%s of mean policy' % col) 232 | # axarr[2+col].legend() 233 | # plt.show() 234 | 235 | # # Number episodes achieved goal: 5. Number episodes fell in hole: 4890 236 | # # C(pi_old): -7.560596707938992e-06. G(pi_old): 0.13777596062648703 237 | # # Percentage of State/Action space seen: 0.943396226415 238 | 239 | -------------------------------------------------------------------------------- /print_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class PrintPolicy(object): 4 | def __init__(self, size=[4,4], env=None): 5 | self.mapping = {0:'<', 1:'v', 2:'>', 3:'^'} 6 | self.size = size 7 | self.action_space_dim = len(self.mapping.keys()) 8 | self.env = env 9 | 10 | def pprint(self, *args): 11 | if len(args) == 1: 12 | pi = args[0] 13 | size = self.size[0]*self.size[1] 14 | if not isinstance(pi,(list,)): 15 | pi = [pi] 16 | 17 | if len(pi) == 0: return 18 | 19 | 20 | states = np.array(range(size)).reshape(1,-1).T 21 | actions_for_each_pi = np.hstack([[np.eye(self.action_space_dim)[p.min_over_a(np.arange(size))[1]] for p in pi]]) 22 | policy = np.hstack([states, np.argmax(actions_for_each_pi.mean(0), 1).reshape(1,-1).T]) 23 | 24 | Qs_for_each_pi = np.vstack([np.array([p.all_actions(np.arange(size))]) for p in pi]) 25 | Q = np.hstack([states, np.mean(Qs_for_each_pi,axis=0)[np.arange(Qs_for_each_pi.shape[1]),policy[:,1]].reshape(-1,1)]) 26 | else: 27 | raise 28 | 29 | direction_grid = [['H' for x in range(self.size[1])] for y in range(self.size[0])] 30 | direction_grid[-1][-1] = 'G' 31 | # direction_grid[0][0] = ' S ' 32 | 33 | Q_grid = [[' H ' for x in range(self.size[1])] for y in range(self.size[0])] 34 | Q_grid[-1][-1] = ' G ' 35 | # Q_grid[0][0] = ' S ' 36 | 37 | 38 | 39 | 40 | for direction in policy: 41 | row = int(direction[0]/self.size[1]) 42 | col = int(direction[0] - row*int(self.size[1])) 43 | direction_grid[row][col] = self.mapping[direction[1]] 44 | 45 | for value in Q: 46 | row = int(value[0]/self.size[1]) 47 | col = int(value[0] - row*int(self.size[1])) 48 | Q_grid[row][col] = value[1] 49 | 50 | if self.env is not None: 51 | direction_grid = np.array(direction_grid) 52 | Q_grid = np.array(Q_grid).astype(str) 53 | 54 | holes = np.where(self.env.desc == 'H') 55 | starts = np.where(self.env.desc == 'S') 56 | goals = np.where(self.env.desc == 'G') 57 | 58 | 59 | direction_grid[holes] = 'H' 60 | # direction_grid[starts] = 'S' 61 | direction_grid[goals] = 'G' 62 | Q_grid[holes] = ' H ' 63 | # Q_grid[starts] = ' S ' 64 | Q_grid[goals] = ' G ' 65 | 66 | direction_grid = direction_grid.tolist() 67 | Q_grid = Q_grid.tolist() 68 | 69 | 70 | for i in range(2*len(direction_grid)+1): 71 | row = [] 72 | for j in range(2*len(direction_grid[0])+1): 73 | if (i % 2) == 1 & (j % 2) == 1: 74 | row.append(direction_grid[(i-1)/2][(j-1)/2]) 75 | elif (j % 2) == 0: 76 | row.append('|') 77 | else: 78 | row.append('_') 79 | print ' '.join(row) 80 | print 81 | 82 | for i in range(2*len(Q_grid)+1): 83 | row = [] 84 | for j in range(2*len(Q_grid[0])+1): 85 | if (i % 2) == 1 & (j % 2) == 1: 86 | try: 87 | val = 
float(Q_grid[(i-1)/2][(j-1)/2]) 88 | sign = '+'*(val > 0) + '-'*(val<=0) 89 | val = str(np.abs(round(val,2))) 90 | row.append(sign + val) 91 | except: 92 | val = Q_grid[(i-1)/2][(j-1)/2] 93 | row.append(val) 94 | elif (j % 2) == 0: 95 | row.append('|') 96 | else: 97 | row.append('_____') 98 | print ' '.join(row) 99 | print 100 | -------------------------------------------------------------------------------- /replay_buffer.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import deepdish as dd 4 | 5 | class Buffer(object): 6 | """ 7 | This saves the agent's experience in windowed cache. 8 | Each frame is saved only once but state is stack of num_frame_stack frames 9 | 10 | In the beginning of an episode the frame-stack is padded 11 | with the beginning frame 12 | """ 13 | 14 | def __init__(self, 15 | num_frame_stack=1, 16 | buffer_size=10000, 17 | min_buffer_size_to_train=1000, 18 | pic_size = (96,96), 19 | action_space_dim = 4, 20 | n_costs = (), 21 | ): 22 | self.n_costs = n_costs 23 | self.pic_size = pic_size 24 | self.action_space_dim = action_space_dim 25 | self.num_frame_stack = num_frame_stack 26 | self.capacity = buffer_size 27 | self.counter = -1 28 | self.exp_idx = -1 29 | self.frame_window = None 30 | self.max_frame_cache = self.capacity + 2 * self.num_frame_stack + 1 31 | self.frame_idx = self.counter % self.max_frame_cache 32 | self.init_caches() 33 | self.expecting_new_episode = True 34 | self.min_buffer_size_to_train = min_buffer_size_to_train 35 | self.data = {'x':[], 'a':[], 'x_prime':[], 'c':[], 'g':[], 'done':[], 'cost':[]} 36 | 37 | def append(self, action, frame, reward, done): 38 | assert self.frame_window is not None, "start episode first" 39 | self.counter += 1 40 | self.frame_idx = self.counter % self.max_frame_cache 41 | self.exp_idx = (self.exp_idx + 1) % self.capacity 42 | 43 | exp_idx = self.exp_idx 44 | self.prev_states[exp_idx] = self.frame_window 45 | self.frame_window = np.append(self.frame_window[1:], self.frame_idx) 46 | self.next_states[exp_idx] = self.frame_window 47 | self.actions[exp_idx] = action 48 | self.is_done[exp_idx] = done 49 | self.frames[self.frame_idx] = frame 50 | self.rewards[exp_idx] = reward 51 | if done: 52 | self.expecting_new_episode = True 53 | 54 | def start_new_episode(self, frame): 55 | # it should be okay not to increment counter here 56 | # because episode ending frames are not used 57 | assert self.expecting_new_episode, "previous episode didn't end yet" 58 | self.counter += 1 59 | self.frame_idx = self.counter % self.max_frame_cache 60 | self.frame_window = np.repeat(self.frame_idx, self.num_frame_stack) 61 | self.frames[self.frame_idx] = frame 62 | self.expecting_new_episode = False 63 | 64 | def is_over(self): 65 | return self.expecting_new_episode 66 | 67 | def get_length(self): 68 | return min(self.capacity, self.exp_idx) 69 | 70 | def sample(self, N): 71 | count = min(self.capacity, self.exp_idx) 72 | minimum = max(count-40000, 0) # UNHARDCODE THIS. 
THIS IS FOR USING BUFFER AS SAVER + Exp Replay 73 | batchidx = np.random.randint(minimum, count, size=N) 74 | 75 | x = self.frames[self.prev_states[batchidx]] 76 | action = self.actions[batchidx] 77 | x_prime = self.frames[self.next_states[batchidx]] 78 | reward = self.rewards[batchidx] 79 | done = self.is_done[batchidx] 80 | 81 | return [x, action, x_prime, reward, done] 82 | 83 | def get_all(self, key): 84 | valid_states = min(self.capacity, self.exp_idx) 85 | if key == 'x': 86 | return self.frames[self.prev_states[:valid_states]] 87 | elif key == 'a': 88 | return self.actions[:valid_states] 89 | elif key == 'x_prime': 90 | return self.frames[self.next_states[:valid_states]] 91 | elif key == 'c': 92 | return self.rewards[:valid_states][:, 0] 93 | elif key == 'g': 94 | return self.rewards[:valid_states][:, 1:] 95 | elif key == 'done': 96 | return self.is_done[:valid_states] 97 | elif key == 'cost': 98 | return [] 99 | elif key == 'frames': 100 | maximum = max(np.max(self.prev_states[:valid_states]), np.max(self.next_states[:valid_states])) + 1 101 | return self.frames[:maximum] 102 | elif key == 'prev_states': 103 | return self.prev_states[:valid_states] 104 | elif key == 'next_states': 105 | return self.next_states[:valid_states] 106 | else: 107 | raise 108 | 109 | def is_enough(self): 110 | return self.exp_idx > self.min_buffer_size_to_train 111 | 112 | def current_state(self): 113 | # assert not self.expecting_new_episode, "start new episode first"' 114 | assert self.frame_window is not None, "do something first" 115 | if len(self.pic_size) == 2: 116 | return np.rollaxis(self.frames[self.frame_window], 0,3) 117 | else: 118 | return self.frames[self.frame_window] 119 | 120 | def init_caches(self): 121 | self.rewards = np.empty((self.capacity,) + self.n_costs, dtype="float64") 122 | self.prev_states = np.empty((self.capacity, self.num_frame_stack), dtype="uint32") 123 | self.next_states = np.empty((self.capacity, self.num_frame_stack), dtype="uint32") 124 | self.is_done = np.empty(self.capacity, "uint8") 125 | self.actions = np.empty((self.capacity), dtype="uint8") 126 | self.frames = np.empty((self.max_frame_cache,) + self.pic_size, dtype="uint8") 127 | 128 | def get_state_action_pairs(self, env_type='lake'): 129 | if 'state_action' in self.data: 130 | return self.data['state_action'] 131 | else: 132 | if env_type == 'lake': 133 | pairs = [np.array(self.data['x']), np.array(self.data['a']).reshape(1,-1).T ] 134 | elif env_type == 'car': 135 | pairs = [np.array(self.data['x']), np.array(self.data['a']).reshape(1,-1).T ] 136 | self.data['state_action'] = pairs 137 | 138 | def calculate_cost(self, lamb): 139 | self.scale = np.max(np.abs(np.array(self.data['c'] + np.dot(lamb[:-1], np.array(self.data['g']).T)))) 140 | costs = np.array(self.data['c'] + np.dot(lamb[:-1], np.array(self.data['g']).T))/self.scale 141 | 142 | 143 | # costs = costs/np.max(np.abs(costs)) 144 | self.data['cost'] = costs.tolist() 145 | 146 | def set_cost(self, key, idx=None): 147 | if key == 'g': assert idx is not None, 'Evaluation must be done per constraint until parallelized' 148 | 149 | if key == 'c': 150 | self.scale = np.max(np.abs(self.data['c'])) 151 | self.data['cost'] = self.data['c']/self.scale 152 | elif key == 'g': 153 | # Pick the idx'th constraint 154 | self.scale = np.max(np.abs(self.data['g'][:,idx])) 155 | self.data['cost'] = self.data['g'][:,idx]/self.scale 156 | else: 157 | raise 158 | 159 | def preprocess(self, env_type): 160 | 161 | for key in self.data: 162 | self.data[key] = self.get_all(key) 
163 | 164 | def save(self, path): 165 | #data = {'frames':self.frames, 'prev_states':self.prev_states, 'next_states':self.next_states, 'rewards':self.rewards, 'is_done':self.is_done, 'actions':self.actions} 166 | #for data, key in zip([self.frames, self.prev_states, self.next_states, self.rewards, self.is_done, self.actions],['frames', 'prev_astates', 'next_states', 'costs', 'is_done', 'actions']) 167 | # dd.io.save(path % key, data) 168 | count = min(self.capacity, self.counter) 169 | dd.io.save(path.format('frames'), self.frames[:count]) 170 | dd.io.save(path.format('prev_states'), self.prev_states[:count]) 171 | dd.io.save(path.format('next_states'), self.next_states[:count]) 172 | dd.io.save(path.format('rewards'), self.rewards[:count]) 173 | dd.io.save(path.format('is_done'), self.is_done[:count]) 174 | dd.io.save(path.format('actions'), self.actions[:count]) 175 | 176 | 177 | 178 | class Dataset(Buffer): 179 | def __init__(self, num_frame_stack, pic_size, n_costs): 180 | 181 | self.pic_size = pic_size 182 | self.num_frame_stack = num_frame_stack 183 | self.data = {'frames':[], 'prev_states':[], 'a':[], 'next_states':[], 'c':[], 'g':[], 'done':[], 'cost':[], 'x_prime_repr':[], 'x_repr':[]} 184 | self.max_trajectory_length = 0 185 | self.n_costs = n_costs 186 | self.episodes = [Buffer(num_frame_stack=self.num_frame_stack,buffer_size=int(200000),min_buffer_size_to_train=0,pic_size = self.pic_size, n_costs = self.n_costs)] 187 | 188 | def append(self, *args): 189 | self.episodes[-1].append(*args) 190 | 191 | # update max_trajectory_length 192 | if self.episodes[-1].get_length() > self.max_trajectory_length: 193 | self.max_trajectory_length = self.episodes[-1].get_length() 194 | 195 | def start_new_episode(self, *args): 196 | # self.episodes.append(Buffer(num_frame_stack=self.num_frame_stack,buffer_size=int(2000),min_buffer_size_to_train=0,pic_size = self.pic_size, n_costs = self.n_costs)) 197 | self.episodes[-1].start_new_episode(args[0]) 198 | 199 | def current_state(self): 200 | return self.episodes[-1].current_state() 201 | 202 | def get_max_trajectory_length(self): 203 | return self.max_trajectory_length 204 | 205 | def __getitem__(self, key): 206 | return self.data[key] 207 | 208 | def __setitem__(self, key, item): 209 | self.data[key] = item 210 | 211 | def __len__(self): 212 | return len(self.data['a'])-5 213 | 214 | def preprocess(self, env_type): 215 | 216 | for key in ['frames', 'prev_states', 'next_states', 'a', 'done', 'c', 'g']: 217 | self.data[key] = self.episodes[-1].get_all(key) 218 | 219 | # [x.preprocess(env_type) for x in self.episodes] 220 | 221 | # for key in self.data: 222 | # if key in ['g', 'prev_states', 'next_states', 'frames']: 223 | # try: 224 | # self.data[key] = np.vstack([x.get_all[key] for x in self.episodes])#.tolist() 225 | # except: 226 | # self.data[key] = np.hstack([x.get_all[key] for x in self.episodes])#.tolist() 227 | # else: 228 | # self.data[key] = np.hstack([x.get_all[key] for x in self.episodes])#.tolist() 229 | 230 | # if env_type == 'lake': 231 | # if key in ['g']: 232 | # try: 233 | # self.data[key] = np.vstack([x[key] for x in self.episodes]).tolist() 234 | # except: 235 | # self.data[key] = np.hstack([x[key] for x in self.episodes]).tolist() 236 | # else: 237 | # self.data[key] = np.hstack([x[key] for x in self.episodes]).tolist() 238 | # elif env_type == 'car': 239 | # if key in ['g', 'x', 'x_prime']: 240 | # try: 241 | # self.data[key] = np.vstack([x[key] for x in self.episodes]).tolist() 242 | # except: 243 | # self.data[key] = 
np.hstack([x[key] for x in self.episodes]).tolist() 244 | # else: 245 | # self.data[key] = np.hstack([x[key] for x in self.episodes]).tolist() 246 | # else: 247 | # raise 248 | # [x.get_state_action_pairs(env_type) for x in self.episodes] 249 | # self.get_state_action_pairs(env_type) 250 | 251 | def get_state_action_pairs(self, env_type='lake'): 252 | # if 'state_action' in self.data: 253 | # return self.data['state_action'] 254 | # else: 255 | if env_type == 'lake': 256 | pairs = [np.array(self.data['x']).reshape(1,-1).T, np.array(self.data['a']).reshape(1,-1).T ] 257 | elif env_type == 'car': 258 | pairs = [np.array(self('x_repr')), np.array(self.data['a']).reshape(1,-1).T ] 259 | return pairs 260 | 261 | def calculate_cost(self, lamb): 262 | self.scale = np.max(np.abs(np.array(self.data['c'] + np.dot(lamb[:-1], np.array(self.data['g']).T)))) 263 | costs = np.array(self.data['c'] + np.dot(lamb[:-1], np.array(self.data['g']).T))/self.scale 264 | 265 | # costs = costs/np.max(np.abs(costs)) 266 | self.data['cost'] = costs 267 | 268 | # [x.calculate_cost(lamb) for x in self.episodes] 269 | 270 | def set_cost(self, key, idx=None): 271 | if key == 'g': assert idx is not None, 'Evaluation must be done per constraint until parallelized' 272 | 273 | if key == 'c': 274 | self.scale = np.max(np.abs(self.data['c'])) 275 | self.data['cost'] = self.data['c']/self.scale 276 | # [x.set_cost('c') for x in self.episodes] 277 | elif key == 'g': 278 | # Pick the idx'th constraint 279 | self.scale = np.max(np.abs(np.array(self.data['g'])[:,idx])) 280 | self.data['cost'] = np.array(self.data['g'])[:,idx]/self.scale 281 | # [x.set_cost('g', idx) for x in self.episodes] 282 | else: 283 | raise 284 | -------------------------------------------------------------------------------- /seed_2_data/car_data_actions_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_actions_seed_2.h5 -------------------------------------------------------------------------------- /seed_2_data/car_data_frames_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_frames_seed_2.h5 -------------------------------------------------------------------------------- /seed_2_data/car_data_is_done_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_is_done_seed_2.h5 -------------------------------------------------------------------------------- /seed_2_data/car_data_next_states_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_next_states_seed_2.h5 -------------------------------------------------------------------------------- /seed_2_data/car_data_prev_states_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_prev_states_seed_2.h5 
-------------------------------------------------------------------------------- /seed_2_data/car_data_rewards_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_rewards_seed_2.h5 -------------------------------------------------------------------------------- /stochastic_policy.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | 3 | from keras import backend as K 4 | 5 | import numpy as np 6 | from copy import deepcopy 7 | 8 | class StochasticPolicy(Model): 9 | def __init__(self, policy, action_space_dim, policy_evalutor, epsilon=0., prob=None): 10 | ''' 11 | A fixed manual policy 12 | ''' 13 | super(StochasticPolicy, self).__init__() 14 | self.policy = policy 15 | 16 | try: 17 | has_layer = self.policy.Q.model.get_layer('inp').input 18 | except: 19 | has_layer = None 20 | 21 | if has_layer is not None: 22 | try: 23 | self.policy.Q.all_actions_func = K.function([self.policy.Q.model.get_layer('inp').input], [self.policy.Q.model.get_layer('dense_2').output]) 24 | except: 25 | self.policy.Q.all_actions_func = K.function([self.policy.Q.model.get_layer('inp').input], [self.policy.Q.model.get_layer('all_actions').output]) 26 | 27 | self.action_space_dim = action_space_dim 28 | 29 | self.epsilon = epsilon 30 | if prob is not None: 31 | self.prob = prob 32 | else: 33 | self.prob = np.ones(self.action_space_dim)/self.action_space_dim 34 | 35 | 36 | #debug purposes 37 | self.policy_evalutor = policy_evalutor 38 | 39 | 40 | def copy_over_to(self, to_): 41 | pass 42 | 43 | def predict(self, X_a): 44 | pass # return [self.model[np.argmax(x_a[:-self.action_space_dim], axis = 1)] == np.argmax(x_a[-self.action_space_dim:], axis=1) for x_a in X_a] 45 | 46 | def fit(self, X, y, verbose=0): 47 | pass 48 | 49 | def representation(self, *args): 50 | if len(args) == 1: 51 | return args[0] 52 | elif len(args) == 2: 53 | return args[0], args[1] 54 | else: 55 | raise NotImplemented 56 | 57 | def all_actions(self, X, x_preprocessed=False,**kw): 58 | 59 | try: 60 | shape_correct = len(self.policy.Q.model.get_layer('inp').input_shape) == (len(np.array(X).shape)) 61 | except: 62 | shape_correct = False 63 | 64 | if shape_correct: 65 | 66 | if np.random.random() < self.epsilon: 67 | arr = -np.eye(self.action_space_dim)[np.random.choice(self.action_space_dim, p=self.prob)] 68 | else: 69 | arr = -np.eye(self.action_space_dim)[self.policy.Q([X], x_preprocessed=x_preprocessed)[0]] 70 | 71 | return np.atleast_2d(arr) 72 | else: 73 | arr = [] 74 | for x in X: 75 | if np.random.random() < self.epsilon: 76 | arr.append(-np.eye(self.action_space_dim)[np.random.choice(self.action_space_dim, p=self.prob)]) 77 | else: 78 | arr.append(-np.eye(self.action_space_dim)[self.policy.Q([x], x_preprocessed=x_preprocessed)[0]]) 79 | 80 | return np.atleast_2d(np.array(arr)) 81 | 82 | -------------------------------------------------------------------------------- /tests/car_fqe.py: -------------------------------------------------------------------------------- 1 | # from pyvirtualdisplay import Display 2 | # display = Display(visible=0, size=(1280, 1024)) 3 | # display.start() 4 | import deepdish as dd 5 | from replay_buffer import Dataset 6 | from config_car import * 7 | import os 8 | import numpy as np 9 | import scipy.signal as signal 10 | from env_nn import CarNN 11 | from keras.models import 
load_model 12 | np.random.seed(2718) 13 | 14 | which_pi = './videos/ohio/run_1/pi_1.hdf5' 15 | directory = 'seed_2_data' 16 | action_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_actions_seed_2.h5')) 17 | frame_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_frames_seed_2.h5')) 18 | done_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_is_done_seed_2.h5')) 19 | next_state_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_next_states_seed_2.h5')) 20 | current_state_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_prev_states_seed_2.h5')) 21 | cost_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_rewards_seed_2.h5')) 22 | 23 | 24 | frame_gray_scale = np.zeros((len(frame_data),96,96)).astype('float32') 25 | for i in range(len(frame_data)): 26 | frame_gray_scale[i,:,:] = np.dot(frame_data[i,:,:,:]/255. , [0.299, 0.587, 0.114]) 27 | 28 | dic = {'frames':frame_gray_scale, 29 | 'prev_states': current_state_data, 30 | 'next_states': next_state_data, 31 | 'a': action_data, 32 | 'c':cost_data[:,0], 33 | 'g':cost_data[:,1:], 34 | 'done': done_data 35 | } 36 | 37 | data = Dataset(num_frame_stack, pic_size, (len(constraints) + 1,) ) 38 | data.data = dic 39 | EVALUATING = 'c' 40 | 41 | 42 | def sample_N_trajectories(dataset, N): 43 | dones = np.where(dataset['done'])[0] 44 | dones = np.hstack([[0], dones]) 45 | trajectory_idxs = zip(dones[:-1], dones[1:]) 46 | N = min(len(trajectory_idxs), N) 47 | idxs = np.random.choice(len(trajectory_idxs), size=N, replace=False) 48 | return np.array(trajectory_idxs)[idxs] 49 | 50 | 51 | def create_trajectories(dataset, N): 52 | idxs = sample_N_trajectories(dataset, N) 53 | episodes = [] 54 | for low, high in idxs: 55 | x = np.rollaxis(dataset['frames'][dataset['prev_states'][low:high]],1,4) 56 | actions = np.atleast_2d(dataset['a'][low:high]).T 57 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][low:high]],1,4) 58 | dataset_costs = dataset[EVALUATING][low:high] 59 | dones = dataset['done'][low:high] 60 | episode = { 61 | 'x': x, 62 | 'a': actions, 63 | 'x_prime': x_prime, 64 | 'cost': dataset_costs, 65 | 'done': dones, 66 | } 67 | episodes.append(episode) 68 | return episodes 69 | 70 | def pdis(episodes, pi_new, pi_old, gamma): 71 | ''' 72 | Per decision importance sampling 73 | 74 | sum_{t=1}^{max L} gamma^t 1/n sum_{i=1}^n (PI_{tau=1}^t p_new/p_old) R^i_t 75 | ''' 76 | values = [] 77 | for episode in episodes: 78 | 79 | numerator = pi_new.all_actions([episode['x']], x_preprocessed=True)[np.arange(len(episode['a'])), episode['a'].reshape(-1)] 80 | denominator = pi_old.all_actions([episode['x']], x_preprocessed=True)[np.arange(len(episode['a'])), episode['a'].reshape(-1)] 81 | importance_weight = np.cumprod(numerator/denominator) 82 | 83 | values.append( discounted_sum(importance_weight * episode['cost'], gamma) ) 84 | 85 | return np.mean(values) 86 | 87 | def WDR(episodes, pi_new, pi_old, gamma): 88 | # \hat{v}^pi(s) = \sum_t gamma^t * \hat{r}^pi(s,t) 89 | # = \sum_t * \sum_a pi(a|s) \hat{r}^pi(s,a,t) 90 | # = \sum_t * \hat{r}^\pi (s, A, t) where A = argmin_a pi(s), since our pi_new is deterministic 91 | 92 | # WDR = 1/n \sum_i \hat{v}^\pi_new (S_0^{H_i}) 93 | # + \sum_i \sum_t gamma^t w_t^i [R_t^{H_i} + gamma \hat{v}^\pi_new (S_{t+1}^{H_i}) - \hat{q}(S_t^{H_i}, A_t^{H_i})] 94 | 95 | # since pi_new, pi_old, ..etc, deterministic then: 96 | # Thus, WDR = \hat{v}^\pi_new (S_0) + \sum_i \sum_t gamma^t w_t^i [R_t^{H_i} - \hat{r}(S_t^{H_i},A_t^{H_i},0)] 97 | 
# 98 | # w_t^i = p_t^i / sum_{j=1}^n p_t^j 99 | # 100 | # p_t^i = prod_{i=0}^t pi_new(A_i|S_i) / pi_old(A_i|S_i) 101 | raise NotImplementedError('WDR estimator is not implemented; see the derivation in the comments above.') 102 | 103 | def discounted_sum(costs, discount): 104 | ''' 105 | Calculate discounted sum of costs 106 | ''' 107 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 108 | return y[::-1][0] 109 | 110 | 111 | def main(): 112 | from tqdm import tqdm 113 | 114 | episodes = create_trajectories(data, 50) 115 | 116 | model = load_model(which_pi) 117 | pi_new = CarNN(state_space_dim, 118 | action_space_dim, 119 | max_Q_fitting_epochs, 120 | gamma, 121 | model_type=model_type, 122 | num_frame_stack=num_frame_stack) 123 | pi_new.model.set_weights(model.get_weights()) 124 | 125 | # Sanity check of the per-step importance ratios. No separate behavior policy 126 | # is loaded in this script, so pi_new stands in for pi_old and every ratio is 1. 127 | for episode in tqdm(episodes): 128 | numerator = pi_new.all_actions([episode['x']], x_preprocessed=True)[np.arange(len(episode['a'])), episode['a'].reshape(-1)] 129 | denominator = pi_new.all_actions([episode['x']], x_preprocessed=True)[np.arange(len(episode['a'])), episode['a'].reshape(-1)] 130 | 131 | pdis_output = pdis(episodes, pi_new, pi_new, gamma) 132 | import pdb; pdb.set_trace() 133 | 134 | main() 135 | 136 | 137 | -------------------------------------------------------------------------------- /tests/fqe_test.py: -------------------------------------------------------------------------------- 1 | from pyvirtualdisplay import Display 2 | display = Display(visible=1, size=(1280, 1024)) 3 | display.start() 4 | from fitted_off_policy_evaluation import CarFittedQEvaluation 5 | from exact_policy_evaluation import ExactPolicyEvaluator 6 | from config_car import * 7 | from fitted_algo import FittedAlgo 8 | import numpy as np 9 | from tqdm import tqdm 10 | from env_nn import * 11 | from thread_safe import threadsafe_generator 12 | from keras import backend as K 13 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 14 | from env_dqns import CarDQN 15 | import deepdish as dd 16 | from keras.models import load_model 17 | import time 18 | from replay_buffer import Dataset 19 | from stochastic_policy import StochasticPolicy 20 | 21 | 22 | model_dir = os.path.join(os.getcwd(), 'models') 23 | old_policy_path = os.path.join(model_dir, old_policy_name) 24 | policy_old = CarDQN(env, 25 | gamma, 26 | action_space_map = action_space_map, 27 | action_space_dim=action_space_dim, 28 | model_type=model_type, 29 | max_time_spent_in_episode=max_time_spent_in_episode, 30 | num_iterations = num_iterations, 31 | sample_every_N_transitions = sample_every_N_transitions, 32 | batchsize = batchsize, 33 | copy_over_target_every_M_training_iterations = copy_over_target_every_M_training_iterations, 34 | buffer_size = buffer_size, 35 | min_epsilon = min_epsilon, 36 | initial_epsilon = initial_epsilon, 37 | epsilon_decay_steps = epsilon_decay_steps, 38 | num_frame_stack=num_frame_stack, 39 | min_buffer_size_to_train=min_buffer_size_to_train, 40 | frame_skip = frame_skip, 41 | pic_size = pic_size, 42 | models_path = os.path.join(model_dir,'weights.{epoch:02d}-{loss:.2f}.hdf5'), 43 | ) 44 | policy_old.Q.model = load_model(old_policy_path) 45 | policy_old.Q.all_actions_func = K.function([policy_old.Q.model.get_layer('inp').input], [policy_old.Q.model.get_layer('all_actions').output]) 46 | print 'Exact Evaluation: ' 47 | exact_policy_algorithm = ExactPolicyEvaluator(action_space_map, gamma, env=env, frame_skip=frame_skip, num_frame_stack=num_frame_stack, pic_size = pic_size, constraint_thresholds=constraint_thresholds, constraints_cared_about=constraints_cared_about) 48 | #policy_old.Q.evaluate(render=True,
environment_is_dynamic=False, to_monitor=True) 49 | print exact_policy_algorithm.run(policy_old.Q, to_monitor=False) 50 | 51 | 52 | # policy_to_test = StochasticPolicy(policy_old, action_space_dim, exact_policy_algorithm, epsilon=0., prob=prob) 53 | 54 | tic = time.time() 55 | action_data = dd.io.load('./seed_2/car_data_actions_seed_2.h5') 56 | frame_data = dd.io.load('./seed_2/car_data_frames_seed_2.h5') 57 | done_data = dd.io.load('./seed_2/car_data_is_done_seed_2.h5') 58 | next_state_data = dd.io.load('./seed_2/car_data_next_states_seed_2.h5') 59 | current_state_data = dd.io.load('./seed_2/car_data_prev_states_seed_2.h5') 60 | cost_data = dd.io.load('./seed_2/car_data_rewards_seed_2.h5') 61 | 62 | 63 | frame_gray_scale = np.zeros((len(frame_data),96,96)).astype('float32') 64 | for i in range(len(frame_data)): 65 | frame_gray_scale[i,:,:] = np.dot(frame_data[i,:,:,:]/255. , [0.299, 0.587, 0.114]) 66 | 67 | dic = {'frames':frame_gray_scale, 68 | 'prev_states': current_state_data, 69 | 'next_states': next_state_data, 70 | 'a': action_data, 71 | 'c':cost_data[:,0], 72 | 'g':cost_data[:,1:], 73 | 'done': done_data 74 | } 75 | 76 | data = Dataset(num_frame_stack, pic_size, (len(constraints) + 1,) ) 77 | data.data = dic 78 | 79 | data.data['g'] = data.data['g'][:,constraints_cared_about] 80 | data.data['g'] = (data.data['g'] >= constraint_thresholds[:-1]).astype(int) 81 | 82 | FQE = CarFittedQEvaluation(state_space_dim, action_space_dim, max_eval_fitting_epochs, gamma, model_type=model_type,num_frame_stack=num_frame_stack) 83 | 84 | 85 | FQE.run(policy_old.Q,'c', data, desc='FQE C', g_idx=1, testing=True, epochs=1) 86 | 87 | 88 | def rolling_sum(a, n=4) : ret = np.cumsum(a, axis=1, dtype=float); ret[:, n:] = ret[:, n:] - ret[:, :-n]; return ret[:, n - 1:]; -------------------------------------------------------------------------------- /thread_safe.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | class ThreadSafe: 4 | """Takes an iterator/generator and makes it thread-safe by 5 | serializing call to the `next` method of given iterator/generator. 6 | """ 7 | def __init__(self, it): 8 | self.it = it 9 | self.lock = threading.Lock() 10 | 11 | def __iter__(self): 12 | return self 13 | 14 | def next(self): 15 | with self.lock: 16 | return self.it.next() 17 | 18 | 19 | def threadsafe_generator(f): 20 | """A decorator that takes a generator function and makes it thread-safe. 
21 | """ 22 | def g(*a, **kw): 23 | return ThreadSafe(f(*a, **kw)) 24 | return g 25 | 26 | -------------------------------------------------------------------------------- /value_function.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class ValueFunction(object): 5 | def __init__(self): 6 | ''' 7 | ''' 8 | self.prev_values = [] 9 | self.exact_values = [] 10 | self.eval_values = {} 11 | # self.V = {} 12 | # self.dim_state_space = dim_state_space 13 | # self.non_terminal_states = non_terminal_states 14 | 15 | def append(self, *args): 16 | if len(args) == 1: 17 | value = args[0] 18 | self.prev_values.append(value) 19 | elif len(args) == 2: 20 | value, policy = args 21 | self.prev_values.append(value) 22 | # self.V[self.vectorize(policy)] = value 23 | 24 | def avg(self, append_zero=False): 25 | if append_zero: 26 | return np.hstack([np.mean(self.prev_values, 0), np.array([0])]) 27 | else: 28 | return np.mean(self.prev_values, 0) 29 | 30 | def last(self, append_zero=False): 31 | if append_zero: 32 | return np.hstack([self.prev_values[-1], np.array([0])]) 33 | else: 34 | return np.array(self.prev_values[-1]) 35 | 36 | def add_exact_values(self, values): 37 | self.exact_values.append(values) 38 | 39 | def add_eval_values(self, eval_values, idx): 40 | if idx not in self.eval_values: 41 | self.eval_values[idx] = [] 42 | 43 | self.eval_values[idx].append(eval_values) 44 | 45 | 46 | # def vectorize(self, policy): 47 | # # Can be done for low dim discrete spaces 48 | # return tuple(policy(self.non_terminal_states)) 49 | 50 | # def __getitem__(self, policy): 51 | # pi = self.vectorize(policy) 52 | # if pi in self.V: 53 | # return np.array(self.V[pi]) 54 | # else: 55 | # raise KeyError 56 | 57 | # def __contains__(self, policy): 58 | # pi = self.vectorize(policy) 59 | # if pi in self.V: 60 | # return True 61 | # else: 62 | # return False 63 | --------------------------------------------------------------------------------
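
The WDR function in tests/car_fqe.py stops at its comment derivation and never builds the estimator. The sketch below is a minimal, self-contained version of the standard weighted doubly robust (WDR) estimator that those comments describe (Thomas & Brunskill, 2016). It is an illustration only: the wdr name, the episodes layout, and the per-step model estimates q_hat/v_hat (which could, for example, come from an FQE model such as CarFittedQEvaluation) are assumptions of this sketch, not part of the repository's code.

import numpy as np

def wdr(episodes, gamma):
    # Illustrative sketch only -- not the repository's API. Each episode is
    # assumed to be a dict of equal-length 1-d arrays, padded to a common
    # horizon H with rho=1, cost=0, q_hat=0, v_hat=0 after termination:
    #   'rho'   : pi_new(A_t|S_t) / pi_old(A_t|S_t) per step
    #   'cost'  : observed per-step cost
    #   'q_hat' : model estimate of Q^{pi_new}(S_t, A_t)
    #   'v_hat' : model estimate of V^{pi_new}(S_t)
    rho = np.vstack([ep['rho'] for ep in episodes]).astype(float)     # (n, H)
    costs = np.vstack([ep['cost'] for ep in episodes]).astype(float)
    q_hat = np.vstack([ep['q_hat'] for ep in episodes]).astype(float)
    v_hat = np.vstack([ep['v_hat'] for ep in episodes]).astype(float)
    n, H = costs.shape

    # Cumulative importance ratios rho_{0:t}, self-normalized across episodes
    # to obtain the weights w_t^i; w_{-1}^i is 1/n by convention.
    cum_rho = np.cumprod(rho, axis=1)
    w = cum_rho / np.maximum(cum_rho.sum(axis=0, keepdims=True), 1e-12)
    w_prev = np.hstack([np.full((n, 1), 1.0 / n), w[:, :-1]])

    # WDR = sum_i sum_t gamma^t [ w_t^i * cost_t^i
    #                             - w_t^i * q_hat_t^i + w_{t-1}^i * v_hat_t^i ]
    discounts = gamma ** np.arange(H)
    per_step = w * costs - w * q_hat + w_prev * v_hat
    return float(np.sum(discounts * per_step))

Self-normalizing the cumulative ratios trades a small amount of bias for much lower variance than the plain per-decision importance sampling in pdis, which is the usual reason to prefer WDR once approximate q_hat/v_hat estimates are available.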