├── DQN.py ├── Pipfile ├── README.md ├── car_constraint_values_wo_band.png ├── car_main_value.png ├── car_main_value_wo_band.png ├── car_racing.py ├── config_car.py ├── config_lake.py ├── env_dqns.py ├── env_nn.py ├── exact_policy_evaluation.py ├── experimental_results ├── hyperparam_2018_12_18_22_20.png ├── lspi.csv ├── policy_improvement_grid.h5 └── results_grid.csv ├── experimental_results_car ├── experiment_results_2019_01_02_22_00.csv ├── experiment_results_2019_01_03_11_00.csv └── experiment_results_2019_01_03_15_00.csv ├── exponentiated_gradient.py ├── fitted_algo.py ├── fitted_off_policy_evaluation.py ├── fittedq.py ├── fixed_policy.py ├── fqe_quality_test.py ├── fqe_quality_test_generalization.py ├── fqi_grid_search.py ├── fqi_seed_2_new.py ├── frozen_lake.py ├── inverse_propensity_scoring.py ├── lake_primal_dual_gap.png ├── lake_values.png ├── lake_values_wo_band.png ├── layer_visualizer.py ├── mdp_approximator.py ├── model.py ├── models ├── pi_1.hdf5 ├── pi_2.hdf5 ├── pi_old_car_cnn.h5 ├── pi_old_car_cnn.hdf5 ├── pi_old_car_cnn1.hdf5 ├── pi_old_car_cnn_good.hdf5 ├── pi_old_car_cnn_random_seed.hdf5 ├── pi_old_car_cnn_seed_2.hdf5 ├── pi_old_map_size_8_mlp.h5 ├── weights.01-2362.66.hdf5 ├── weights.01-2542.47.hdf5 └── weights.01-2635.64.hdf5 ├── neural_network.py ├── optimization_problem.py ├── pi_old_car_cnn_main.hdf5 ├── play_car_racing.py ├── plot_fqe_quality_test.py ├── plot_grid_search.py ├── plot_policy_improvement.py ├── plot_policy_improvement_v2.py ├── plot_results.py ├── print_policy.py ├── replay_buffer.py ├── run.py ├── seed_2_data ├── car_data_actions_seed_2.h5 ├── car_data_frames_seed_2.h5 ├── car_data_is_done_seed_2.h5 ├── car_data_next_states_seed_2.h5 ├── car_data_prev_states_seed_2.h5 └── car_data_rewards_seed_2.h5 ├── stochastic_policy.py ├── tests ├── car_fqe.py └── fqe_test.py ├── thread_safe.py └── value_function.py /DQN.py: -------------------------------------------------------------------------------- 1 | import keras 2 | import numpy as np 3 | from replay_buffer import Buffer 4 | import time 5 | from keras.callbacks import ModelCheckpoint 6 | import os 7 | 8 | class DeepQLearning(object): 9 | def __init__(self, env, 10 | gamma, 11 | model_type='mlp', 12 | action_space_map = None, 13 | num_iterations = 5000, 14 | sample_every_N_transitions = 10, 15 | batchsize = 1000, 16 | copy_over_target_every_M_training_iterations = 100, 17 | max_time_spent_in_episode = 100, 18 | buffer_size = 10000, 19 | num_frame_stack=1, 20 | min_buffer_size_to_train=1000, 21 | frame_skip = 1, 22 | pic_size = (96, 96), 23 | models_path = None, 24 | ): 25 | 26 | self.models_path = models_path 27 | self.env = env 28 | self.num_iterations = num_iterations 29 | self.gamma = gamma 30 | self.frame_skip = frame_skip 31 | _ = self.env.reset() 32 | if self.env.env_type in ['car']: 33 | self.env.render() 34 | _, r, _, _ = self.env.step(action_space_map[0]) 35 | self.buffer = Buffer(buffer_size=buffer_size, num_frame_stack=num_frame_stack, min_buffer_size_to_train=min_buffer_size_to_train, pic_size = pic_size, n_costs = (len(np.hstack(r)),)) 36 | else: 37 | self.buffer = Buffer(buffer_size=buffer_size, num_frame_stack=num_frame_stack, min_buffer_size_to_train=min_buffer_size_to_train, pic_size = (1,), n_costs = (1,)) 38 | self.sample_every_N_transitions = sample_every_N_transitions 39 | self.batchsize = batchsize 40 | self.copy_over_target_every_M_training_iterations = copy_over_target_every_M_training_iterations 41 | self.max_time_spent_in_episode = max_time_spent_in_episode 42 | 
self.action_space_map = action_space_map 43 | 44 | def min_over_a(self, *args, **kw): 45 | return self.Q.min_over_a(*args, **kw) 46 | 47 | def all_actions(self, *args, **kw): 48 | return self.Q.all_actions(*args, **kw) 49 | 50 | # def representation(self, *args, **kw): 51 | # return self.Q.representation(*args, **kw) 52 | 53 | def learn(self): 54 | 55 | more_callbacks = [ModelCheckpointExtended(self.models_path)] 56 | self.time_steps = 0 57 | training_iteration = -1 58 | perf = Performance() 59 | main_tic = time.time() 60 | training_complete = False 61 | for i in range(self.num_iterations): 62 | if training_complete: continue 63 | tic = time.time() 64 | x = self.env.reset() 65 | if self.env.env_type in ['car']: self.env.render() 66 | self.buffer.start_new_episode(x) 67 | done = False 68 | time_spent_in_episode = 0 69 | episode_cost = 0 70 | while not done: 71 | #if self.env.env_type in ['car']: self.env.render() 72 | 73 | time_spent_in_episode += 1 74 | self.time_steps += 1 75 | # print time_spent_in_episode 76 | 77 | use_random = np.random.rand(1) < self.epsilon(epoch=i, total_steps=self.time_steps) 78 | if use_random: 79 | action = self.sample_random_action() 80 | else: 81 | action = self.Q(self.buffer.current_state())[0] 82 | 83 | if (i % 50) == 0: print use_random, action, self.Q(self.buffer.current_state())[0], self.Q.all_actions(self.buffer.current_state()) 84 | 85 | # import pdb; pdb.set_trace() 86 | # state = self.buffer.current_state() 87 | # import matplotlib.pyplot as plt 88 | # plt.imshow(state[-1]) 89 | # plt.show() 90 | # self.Q.all_actions(state) 91 | 92 | cost = [] 93 | for _ in range(self.frame_skip): 94 | if done: continue 95 | x_prime, costs, done, _ = self.env.step(self.action_space_map[action]) 96 | # import pdb; pdb.set_trace() 97 | cost.append(costs) 98 | 99 | cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0) 100 | early_done, punishment = self.env.is_early_episode_termination(cost=cost[0], time_steps=time_spent_in_episode, total_cost=episode_cost) 101 | 102 | if early_done: 103 | cost[0] = cost[0] + punishment 104 | done = done or early_done 105 | 106 | # self.buffer.append([x,action,x_prime, cost[0], done]) 107 | self.buffer.append(action, x_prime, cost, done) 108 | 109 | # train 110 | is_train = ((self.time_steps % self.sample_every_N_transitions) == 0) and self.buffer.is_enough() 111 | 112 | if is_train: 113 | # for _ in range(len(self.buffer.data)/self.sample_every_N_transitions): 114 | training_iteration += 1 115 | if (training_iteration % self.copy_over_target_every_M_training_iterations) == 0: 116 | self.Q.copy_over_to(self.Q_target) 117 | batch_x, batch_a, batch_x_prime, batch_cost, batch_done = self.buffer.sample(self.batchsize) 118 | 119 | target = batch_cost[:,0] + self.gamma*self.Q_target.min_over_a(np.stack(batch_x_prime))[0]*(1-batch_done) 120 | X = [batch_x, batch_a] 121 | 122 | evaluation = self.Q.fit(X,target,epochs=1, batch_size=32, evaluate=False,verbose=False,tqdm_verbose=False, additional_callbacks=more_callbacks) 123 | 124 | x = x_prime 125 | 126 | episode_cost += cost[0] 127 | 128 | if self.env.env_type == 'car': 129 | perf.append(float(self.env.tile_visited_count)/len(self.env.track)) 130 | else: 131 | perf.append(episode_cost/self.env.min_cost) 132 | 133 | if (i % 1) == 0: 134 | print 'Episode %s' % i 135 | episode_time = time.time()-tic 136 | print 'Total Time: %s. Episode time: %s. 
Time/Frame: %s' % (np.round(time.time() - main_tic,2), np.round(episode_time, 2), np.round(episode_time/time_spent_in_episode, 2)) 137 | print 'Episode frames: %s. Total frames: %s. Total train steps: %s' % (time_spent_in_episode, self.time_steps, training_iteration) 138 | if self.env.env_type in ['car']: 139 | print 'Performance: %s/%s. Score out of 1: %s. Average Score: %s' % (self.env.tile_visited_count, len(self.env.track), perf.last(), perf.get_avg_performance()) 140 | else: 141 | print 'Score out of 1: %s. Average Score: %s' % (perf.last(), perf.get_avg_performance()) 142 | print '*'*20 143 | if perf.reached_goal(): 144 | #return more_callbacks[0].all_filepaths[-1] 145 | training_complete = True#return self.Q #more_callbacks[0].all_filepaths[-1] 146 | self.buffer.save(os.path.join(os.getcwd(),'%s_data_{0}.h5' % self.env.env_type)) 147 | 148 | def __call__(self,*args): 149 | return self.Q.__call__(*args) 150 | 151 | def __deepcopy__(self, memo): 152 | return self 153 | 154 | class Performance(object): 155 | def __init__(self): 156 | self.goal = .85 157 | self.avg_over = 20 158 | self.costs = [] 159 | 160 | def reached_goal(self): 161 | if self.get_avg_performance() >= self.goal: 162 | return True 163 | else: 164 | return False 165 | 166 | def append(self, cost): 167 | self.costs.append(cost) 168 | 169 | def last(self): 170 | return np.round(self.costs[-1], 3) 171 | 172 | def get_avg_performance(self): 173 | num_iters = min(self.avg_over, len(self.costs)) 174 | return np.round(sum(self.costs[-num_iters:])/ float(num_iters), 3) 175 | 176 | 177 | class ModelCheckpointExtended(ModelCheckpoint): 178 | def __init__(self, filepath, max_to_keep=5, monitor='loss', *args, **kw): 179 | super(ModelCheckpointExtended, self).__init__(filepath, *args, **kw) 180 | self.max_to_keep = max_to_keep 181 | self.all_filepaths = [] 182 | 183 | def on_epoch_end(self, epoch, logs=None): 184 | 185 | super(ModelCheckpointExtended, self).on_epoch_end(epoch, logs) 186 | logs = logs or {} 187 | filepath = self.filepath.format(epoch=epoch + 1, **logs) 188 | 189 | self.all_filepaths.append(filepath) 190 | if len(self.all_filepaths) > self.max_to_keep: 191 | try: 192 | os.remove(self.all_filepaths.pop(0)) 193 | except: 194 | pass 195 | 196 | 197 | # class Buffer(object): 198 | # def __init__(self, buffer_size=10000): 199 | # self.data = [] 200 | # self.size = buffer_size 201 | # self.idx = -1 202 | 203 | # def append(self, datum): 204 | # self.idx = (self.idx + 1) % self.size 205 | 206 | # if len(self.data) > self.idx: 207 | # self.data[self.idx] = datum 208 | # else: 209 | # self.data.append(datum) 210 | 211 | # def sample(self, N): 212 | # N = min(N, len(self.data)) 213 | # rows = np.random.choice(len(self.data), size=N, replace=False) 214 | # return np.array(self.data)[rows] 215 | 216 | 217 | 218 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | numpy = "*" 10 | tensorflow-gpu = "*" 11 | gym = "*" 12 | keras = "*" 13 | pyyaml = "*" 14 | scipy = "*" 15 | tqdm = "*" 16 | keras-tqdm = "*" 17 | pandas = "*" 18 | matplotlib = "*" 19 | argparse = "*" 20 | "box2d-py" = "*" 21 | pyglet = "*" 22 | pyvirtualdisplay = "*" 23 | scikit-image = "*" 24 | deepdish = "*" 25 | seaborn = "*" 26 | 27 | [requires] 28 | python_version = "2.7" 29 | 
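For reference, a minimal sketch of the two pieces at the core of DeepQLearning.learn in DQN.py above, written with plain numpy and purely illustrative names (nothing below is repository code): epsilon-greedy action selection over a cost-minimizing Q-network, and the one-step regression target c + gamma * min_a' Q_target(x', a'), zeroed out for terminal transitions.

```python
import numpy as np

def epsilon_greedy_action(q_values, epsilon, rng=np.random):
    # q_values holds per-action costs; the greedy choice is the argmin
    # (the repo's Q-networks are trained on costs and expose min_over_a).
    if rng.rand() < epsilon:
        return rng.randint(len(q_values))
    return int(np.argmin(q_values))

def dqn_targets(batch_cost, min_q_target_next, batch_done, gamma):
    # Mirrors the line in DQN.py:
    #   target = batch_cost[:,0] + gamma*Q_target.min_over_a(x')[0]*(1-batch_done)
    return batch_cost + gamma * min_q_target_next * (1.0 - batch_done.astype(float))

# Example: dqn_targets(np.array([1., 0.]), np.array([0.5, 2.0]),
#                      np.array([False, True]), 0.95)
# returns array([1.475, 0.]) because the second transition is terminal.
```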
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # constrained_batch_policy_learning 2 | 3 | *Note: Use the --headless flag if using a server without a display. 4 | 5 | Otherwise, to run the main algorithm: 6 | ```python 7 | pip install pipenv 8 | pipenv install 9 | pipenv run python run.py -env car --headless 10 | ``` 11 | 12 | or, for lake, 13 | 14 | ```python 15 | pipenv run python run.py -env lake --headless 16 | ``` 17 | -------------------------------------------------------------------------------- /car_constraint_values_wo_band.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/car_constraint_values_wo_band.png -------------------------------------------------------------------------------- /car_main_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/car_main_value.png -------------------------------------------------------------------------------- /car_main_value_wo_band.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/car_main_value_wo_band.png -------------------------------------------------------------------------------- /config_car.py: -------------------------------------------------------------------------------- 1 | #### Setup Gym 2 | from car_racing import ExtendedCarRacing 3 | import itertools 4 | 5 | # env = gym.make('CarRacing-v0') 6 | init_seed = 2 7 | stochastic_env = False # = not deterministic 8 | max_pos_costs = 12 # The maximum allowable positive cost before ending episode early 9 | max_time_spent_in_episode = 2000 10 | env = ExtendedCarRacing(init_seed, stochastic_env, max_pos_costs) 11 | 12 | #### Hyperparam 13 | gamma = .95 14 | max_epochs = 3000 # max number of epochs over which to collect data 15 | max_Q_fitting_epochs = 50 #max number of epochs over which to converge to Q^\ast. Fitted Q Iter 16 | max_eval_fitting_epochs = 50 #max number of epochs over which to converge to Q^\pi. Off Policy Eval 17 | lambda_bound = 30. # l1 bound on lagrange multipliers 18 | epsilon = .01 # termination condition for two-player game 19 | deviation_from_old_policy_eps = 0.0 #With what probabaility to deviate from the old policy 20 | # convergence_epsilon = 1e-6 # termination condition for model convergence 21 | # action_space_dim = env.nA # action space dimension 22 | # state_space_dim = env.nS # state space dimension 23 | eta = .01 # param for exponentiated gradient algorithm 24 | # initial_states = [[0]] #The only initial state is [1,0...,0]. In general, this should be a list of initial states 25 | # non_terminal_states = np.nonzero(((env.desc == 'S') + (env.desc == 'F')).reshape(-1))[0] # Used for dynamic programming. this is an optimization to make the algorithm run faster. 
In general, you may not have this 26 | max_number_of_main_algo_iterations = 100 # After how many iterations to cut off the main algorithm 27 | model_type = 'cnn' 28 | # old_policy_name = 'pi_old_car_{0}.hdf5'.format(model_type) 29 | old_policy_name = 'pi_old_car_{0}_seed_2.hdf5'.format(model_type) 30 | freeze_cnn_layers = False 31 | starting_lambda = [1.,1.,28.] 32 | 33 | 34 | # Constraint 1: We'd like the number of times you brake to be less than 10% of the time 35 | # Constraint 2: We'd like the car to stay within 15 units of the center of the track 90% of the time 36 | constraint_thresholds = [1., 5.] + [1] 37 | constraints_cared_about = [-1,2] 38 | constraints = [5.8, 85.] + [0] 39 | 40 | ## DQN Param 41 | num_iterations = 3000 42 | sample_every_N_transitions = 4 43 | batchsize = 64 44 | copy_over_target_every_M_training_iterations = 250 45 | buffer_size = 20000 46 | min_epsilon = .01 47 | initial_epsilon = 1. 48 | epsilon_decay_steps = 1000 #num_iterations 49 | num_frame_stack=3 50 | min_buffer_size_to_train = 2000 51 | frame_skip=3 52 | pic_size = (96, 96, 3) 53 | 54 | # Other 55 | 56 | state_space_dim = (96, 96, num_frame_stack) 57 | 58 | # action_space_map = { 59 | # 0: [0.0, 0.0, 0.0], # Brake 60 | # 1: [-0.6, 0.05, 0.0], # Sharp left 61 | # 2: [0.6, 0.05, 0.0], # Sharp right 62 | # 3: [0.0, 0.3, 0.0] } # Staight 63 | 64 | action_space_map = {} 65 | for i, action in enumerate([k for k in itertools.product([-1, 0, 1], [1, 0], [0.2, 0])]): 66 | action_space_map[i] = action 67 | 68 | action_space_dim = len(action_space_map) 69 | prob = [1/float(action_space_dim)]*action_space_dim # Probability with which to explore space when deviating from old policy 70 | 71 | calculate_gap = False # Run Main algo. If False, it skips calc of primal-dual gap 72 | infinite_loop = True # Stop script if reached primal-dual gap threshold 73 | policy_improvement_name = 'car_policy_improvement.h5' 74 | results_name = 'car_results.csv' 75 | -------------------------------------------------------------------------------- /config_lake.py: -------------------------------------------------------------------------------- 1 | 2 | #### Setup Gym 3 | from frozen_lake import ExtendedFrozenLake 4 | import numpy as np 5 | 6 | map_size = 8 7 | # register( id='FrozenLake-no-slip-v0', entry_point='gym.envs.toy_text:FrozenLakeEnv', kwargs={'is_slippery': False, 'map_name':'{0}x{0}'.format(map_size)} ) 8 | # env = gym.make('FrozenLake-no-slip-v0') 9 | max_time_spent_in_episode = 100 10 | env = ExtendedFrozenLake(max_time_spent_in_episode, map_name = '{0}x{0}'.format(map_size), is_slippery= False) 11 | position_of_holes = np.arange(env.desc.shape[0]*env.desc.shape[1]).reshape(env.desc.shape)[np.nonzero(env.desc == 'H')] 12 | position_of_goals = np.arange(env.desc.shape[0]*env.desc.shape[1]).reshape(env.desc.shape)[np.nonzero(env.desc == 'G')] 13 | 14 | 15 | 16 | #### Hyperparam 17 | gamma = 0.9 18 | max_epochs = 5000 # max number of epochs over which to collect data 19 | max_Q_fitting_epochs = 30 #max number of epochs over which to converge to Q^\ast. Fitted Q Iter 20 | max_eval_fitting_epochs = 30 #max number of epochs over which to converge to Q^\pi. Off Policy Eval 21 | lambda_bound = 30. 
# l1 bound on lagrange multipliers 22 | epsilon = .01 # termination condition for two-player game 23 | deviation_from_old_policy_eps = .95 #With what probabaility to deviate from the old policy 24 | # convergence_epsilon = 1e-6 # termination condition for model convergence 25 | action_space_dim = env.nA # action space dimension 26 | state_space_dim = env.nS # state space dimension 27 | eta = 50. # param for exponentiated gradient algorithm 28 | initial_states = [[0]] #The only initial state is [1,0...,0]. In general, this should be a list of initial states 29 | non_terminal_states = np.nonzero(((env.desc == 'S') + (env.desc == 'F')).reshape(-1))[0] # Used for dynamic programming. this is an optimization to make the algorithm run faster. In general, you may not have this 30 | max_number_of_main_algo_iterations = 100 # After how many iterations to cut off the main algorithm 31 | model_type = 'mlp' 32 | old_policy_name = 'pi_old_map_size_{0}_{1}.h5'.format(map_size, model_type) 33 | constraints = [.1, 0] 34 | starting_lambda = 'uniform' 35 | 36 | ## DQN Param 37 | num_iterations = 5000 38 | sample_every_N_transitions = 10 39 | batchsize = 1000 40 | copy_over_target_every_M_training_iterations = 100 41 | buffer_size = 10000 42 | num_frame_stack=1 43 | min_buffer_size_to_train=0 44 | frame_skip = 1 45 | pic_size = tuple() 46 | min_epsilon = .02 47 | initial_epsilon = .3 48 | epsilon_decay_steps = 1000 #num_iterations 49 | min_buffer_size_to_train = 2000 50 | 51 | # Other 52 | stochastic_env = False 53 | action_space_map = { 54 | 0: 0, 55 | 1: 1, 56 | 2: 2, 57 | 3: 3 } 58 | 59 | prob = [1/float(action_space_dim)]*action_space_dim # Probability with which to explore space when deviating from old policy 60 | 61 | 62 | calculate_gap = True # Run Main algo. If False, it skips calc of primal-dual gap 63 | infinite_loop = True # Stop script if reached primal-dual gap threshold 64 | policy_improvement_name = 'car_policy_improvement.h5' 65 | results_name = 'car_results.csv' -------------------------------------------------------------------------------- /env_dqns.py: -------------------------------------------------------------------------------- 1 | 2 | from DQN import DeepQLearning 3 | from env_nn import * 4 | 5 | class LakeDQN(DeepQLearning): 6 | def __init__(self, *args, **kw): 7 | holes, goals = kw['position_of_holes'], kw['position_of_goals'] 8 | del kw['position_of_holes'] 9 | del kw['position_of_goals'] 10 | 11 | self.min_epsilon = kw['min_epsilon'] 12 | self.initial_epsilon = kw['initial_epsilon'] 13 | self.epsilon_decay_steps = kw['epsilon_decay_steps'] 14 | for key in ['min_epsilon', 'initial_epsilon', 'epsilon_decay_steps']: 15 | if key in kw: del kw[key] 16 | 17 | super(LakeDQN, self).__init__(*args, **kw) 18 | 19 | for key in ['action_space_map','max_time_spent_in_episode','num_iterations','sample_every_N_transitions','batchsize','copy_over_target_every_M_training_iterations', 'buffer_size', 'min_buffer_size_to_train', 'models_path']: 20 | if key in kw: del kw[key] 21 | 22 | kw['position_of_holes'],kw['position_of_goals'] = holes, goals 23 | self.state_space_dim = self.env.nS 24 | self.action_space_dim = self.env.nA 25 | self.Q = LakeNN(self.state_space_dim+self.action_space_dim, 1, [self.env.desc.shape[0], self.env.desc.shape[1]], self.action_space_dim, self.gamma, **kw) 26 | self.Q_target = LakeNN(self.state_space_dim+self.action_space_dim, 1, [self.env.desc.shape[0], self.env.desc.shape[1]], self.action_space_dim, self.gamma, **kw) 27 | 28 | def sample_random_action(self): 29 | ''' 30 | 
Uniform random 31 | ''' 32 | return np.random.choice(self.action_space_dim) 33 | 34 | # def epsilon(self, epoch=None, total_steps=None): 35 | # return 1./(total_steps/100 + 3) 36 | def epsilon(self, epoch=None, total_steps=None): 37 | if epoch >= self.epsilon_decay_steps: 38 | return self.min_epsilon 39 | else: 40 | alpha = epoch / float(self.epsilon_decay_steps) 41 | current_epsilon = self.initial_epsilon * (1-alpha) + self.min_epsilon * (alpha) 42 | return current_epsilon 43 | 44 | class CarDQN(DeepQLearning): 45 | def __init__(self, *args, **kw): 46 | 47 | self.gas_actions = None 48 | 49 | self.min_epsilon = kw['min_epsilon'] 50 | self.initial_epsilon = kw['initial_epsilon'] 51 | self.epsilon_decay_steps = kw['epsilon_decay_steps'] 52 | self.action_space_dim = kw['action_space_dim'] 53 | for key in ['action_space_dim', 'min_epsilon', 'initial_epsilon', 'epsilon_decay_steps']: 54 | if key in kw: del kw[key] 55 | 56 | super(CarDQN, self).__init__(*args, **kw) 57 | for key in ['action_space_map','max_time_spent_in_episode','num_iterations','sample_every_N_transitions','batchsize','copy_over_target_every_M_training_iterations', 'buffer_size', 'min_buffer_size_to_train', 'models_path']: 58 | if key in kw: del kw[key] 59 | 60 | from config_car import state_space_dim 61 | self.state_space_dim = state_space_dim 62 | self.Q = CarNN(self.state_space_dim, self.action_space_dim, self.gamma, **kw) 63 | self.Q_target = CarNN(self.state_space_dim, self.action_space_dim, self.gamma, **kw) 64 | 65 | def sample_random_action(self): 66 | ''' 67 | Biased (toward movement) random 68 | ''' 69 | if self.gas_actions is None: 70 | self.gas_actions = {key:val[1] == 1 and val[2] == 0 for key,val in self.action_space_map.iteritems()} 71 | 72 | action_weights = 14. * np.array(self.gas_actions.values()) + 1.0 73 | action_weights /= np.sum(action_weights) 74 | 75 | return np.random.choice(self.gas_actions.keys(), p=action_weights) 76 | # return np.random.choice(self.action_space_dim) 77 | 78 | def epsilon(self, epoch=None, total_steps=None): 79 | if epoch >= self.epsilon_decay_steps: 80 | # return max(.08*((2000-epoch)/1000), 0.) 
+ .02 81 | return self.min_epsilon 82 | else: 83 | alpha = epoch / float(self.epsilon_decay_steps) 84 | current_epsilon = self.initial_epsilon * (1-alpha) + self.min_epsilon * (alpha) 85 | return current_epsilon 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /exact_policy_evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import scipy.signal as signal 5 | from replay_buffer import Buffer 6 | import os 7 | 8 | 9 | class ExactPolicyEvaluator(object): 10 | def __init__(self, action_space_map, gamma, env=None, num_frame_stack=None, frame_skip = None, pic_size = None, constraint_thresholds=None, constraints_cared_about=None): 11 | ''' 12 | An implementation of Exact Policy Evaluation through Monte Carlo 13 | 14 | In this case since the environment is fixed and initial states are fixed 15 | then this will be exact 16 | ''' 17 | self.gamma = gamma 18 | self.action_space_map = action_space_map 19 | self.constraint_thresholds = constraint_thresholds 20 | self.constraints_cared_about = constraints_cared_about 21 | 22 | self.num_frame_stack = num_frame_stack 23 | self.frame_skip = frame_skip 24 | self.pic_size = pic_size 25 | self.buffer_size = int(2000) 26 | self.min_buffer_size_to_train = 0 27 | 28 | # self.initial_states = initial_states 29 | # self.state_space_dim = state_space_dim 30 | if env is not None: 31 | self.env = env 32 | else: 33 | raise 34 | 35 | self.monitor = Monitor(self.env, 'videos') 36 | 37 | def run(self, policy, *args, **kw): 38 | 39 | environment_is_dynamic = not self.env.deterministic 40 | 41 | if 'policy_is_greedy' not in kw: 42 | kw['policy_is_greedy']=True 43 | policy_is_greedy=True 44 | else: 45 | policy_is_greedy= kw['policy_is_greedy'] 46 | 47 | if not isinstance(policy,(list,)): 48 | policy = [policy] 49 | 50 | 51 | if not environment_is_dynamic and policy_is_greedy: 52 | c,g,perf = self.determinstic_env_and_greedy_policy(policy, **kw) 53 | if len(args) > 0: 54 | if args[0] == 'c': 55 | return c 56 | else: 57 | try: 58 | return g[i] 59 | except: 60 | if isinstance(g,(list,)) and len(g) > 1: 61 | assert False, 'Index error' 62 | else: 63 | return g 64 | else: 65 | return c,g,perf 66 | 67 | else: 68 | return self.stochastic_env_or_policy(policy, **kw) 69 | 70 | def get_Qs(self, policy, initial_states, state_space_dim, idx=0): 71 | Q = [] 72 | for initial_state in initial_states: 73 | self.env.isd = np.eye(state_space_dim)[initial_state] 74 | 75 | if not isinstance(policy,(list,)): 76 | policy = [policy] 77 | Q.append(self.determinstic_env_and_greedy_policy(policy, render=False, verbose=False)[idx]) 78 | 79 | self.env.isd = np.eye(state_space_dim)[0] 80 | return Q 81 | 82 | def stochastic_env_or_policy(self, policy, render=False, verbose=False, **kw): 83 | ''' 84 | Run the evaluator 85 | ''' 86 | 87 | all_c = [] 88 | all_g = [] 89 | if len(policy) > 1: import pdb; pdb.set_trace() 90 | for pi in policy: 91 | trial_c = [] 92 | trial_g = [] 93 | for i in range(1): 94 | c = [] 95 | g = [] 96 | self.buffer = Buffer(num_frame_stack= self.num_frame_stack,buffer_size= self.buffer_size,min_buffer_size_to_train= self.min_buffer_size_to_train,pic_size = self.pic_size,) 97 | x = self.env.reset() 98 | self.buffer.start_new_episode(x) 99 | done = False 100 | time_steps = 0 101 | 102 | while not done: 103 | time_steps += 1 104 | if (self.env.env_type in ['car']) or render: self.env.render() 105 | 106 | action = 
pi([self.buffer.current_state()])[0] 107 | 108 | cost = [] 109 | for _ in range(self.frame_skip): 110 | x_prime, costs, done, _ = self.env.step(self.action_space_map[action]) 111 | # if self.render: 112 | # self.env.render() 113 | cost.append(costs) 114 | if done: 115 | break 116 | 117 | cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0) 118 | if self.constraint_thresholds is not None: 119 | cost[1:][self.constraints_cared_about] = np.array(cost[1:])[self.constraints_cared_about] >= self.constraint_thresholds[:-1] 120 | 121 | 122 | early_done, _ = self.env.is_early_episode_termination(cost=cost[0], time_steps=time_steps, total_cost=sum(c)) 123 | done = done or early_done 124 | self.buffer.append(action, x_prime, cost[0], done) 125 | 126 | if verbose: print x,action,x_prime,cost 127 | 128 | c.append(cost[0].tolist()) 129 | g.append(cost[1:].tolist()) 130 | 131 | x = x_prime 132 | trial_c.append(c) 133 | trial_g.append(g) 134 | 135 | all_c.append(np.mean([self.discounted_sum(x, self.gamma) for x in trial_c])) 136 | all_g.append(np.mean([ [self.discounted_sum(cost, self.gamma) for cost in np.array(x).T] for x in trial_g], axis=0).tolist()) 137 | # all_g.append(np.mean([self.discounted_sum(x, self.gamma) for x in trial_g])) 138 | 139 | c = np.mean(all_c, axis=0) 140 | g = np.mean(all_g, axis=0) 141 | 142 | return c,g 143 | 144 | 145 | def determinstic_env_and_greedy_policy(self, policy, render=False, verbose=False, to_monitor=False, **kw): 146 | ''' 147 | Run the evaluator 148 | ''' 149 | 150 | all_c = [] 151 | all_g = [] 152 | for pi in policy: 153 | c = [] 154 | g = [] 155 | self.buffer = Buffer(num_frame_stack= self.num_frame_stack, 156 | buffer_size= self.buffer_size, 157 | min_buffer_size_to_train= self.min_buffer_size_to_train, 158 | pic_size = self.pic_size,) 159 | x = self.env.reset() 160 | if (self.env.env_type in ['car']) or render: self.env.render() 161 | self.buffer.start_new_episode(x) 162 | done = False 163 | time_steps = 0 164 | if to_monitor: 165 | self.monitor.delete() 166 | while not done: 167 | if (self.env.env_type in ['car']) or render: 168 | if to_monitor: self.monitor.save() 169 | # self.env.render() 170 | time_steps += 1 171 | 172 | action = pi(self.buffer.current_state())[0] 173 | # action = np.argmin(pi.model.predict(np.rollaxis(np.dot(self.buffer.current_state()/255. 
, [0.299, 0.587, 0.114])[np.newaxis,...],1,4))) 174 | # print self.action_space_map[action] 175 | # import pdb; pdb.set_trace() 176 | cost = [] 177 | for _ in range(self.frame_skip): 178 | x_prime, costs, done, _ = self.env.step(self.action_space_map[action]) 179 | # if self.render: 180 | if (self.env.env_type in ['car']) or render: self.env.render() 181 | cost.append(costs) 182 | if done: 183 | break 184 | 185 | cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0) 186 | if self.constraint_thresholds is not None: 187 | pass 188 | #cost[1:][self.constraints_cared_about] = np.array(cost[1:])[self.constraints_cared_about] >= self.constraint_thresholds[:-1] 189 | 190 | 191 | early_done, punishment = self.env.is_early_episode_termination(cost=cost[0], time_steps=time_steps, total_cost=sum(c)) 192 | done = done or early_done 193 | 194 | self.buffer.append(action, x_prime, cost[0]+punishment, done) 195 | 196 | # if verbose: print x,action,x_prime,cost 197 | #print time_steps, cost[0], action 198 | # if (time_steps % 50) ==0 : print time_steps, cost[0]+punishment, action 199 | # print cost[0] + punishment 200 | c.append(cost[0] + punishment) 201 | g.append(cost[1:]) 202 | 203 | # x_prime , cost, done, _ = self.env.step(self.action_space_map[action]) 204 | # done = done or self.env.is_early_episode_termination(cost=cost[0], time_steps=time_steps) 205 | # self.buffer.append(action, x_prime, cost[0], done) 206 | 207 | # if verbose: print x,action,x_prime,cost 208 | # if render: self.env.render() 209 | # c.append(cost[0]) 210 | # g.append(cost[1]) 211 | 212 | x = x_prime 213 | all_c.append(c) 214 | all_g.append(g) 215 | 216 | if to_monitor: self.monitor.make_video() 217 | if self.env.env_type in ['car']: 218 | print 'Performance: %s/%s = %s' % (self.env.tile_visited_count, len(self.env.track), self.env.tile_visited_count/float(len(self.env.track))) 219 | # import pdb; pdb.set_trace() 220 | c = np.mean([self.discounted_sum(x, self.gamma) for x in all_c]) 221 | g = np.mean([ [self.discounted_sum(cost, self.gamma) for cost in np.array(x).T] for x in all_g], axis=0).tolist() 222 | # g = np.mean([self.discounted_sum(np.array(x), self.gamma) for x in all_g], axis=0).tolist() 223 | 224 | if not isinstance(g,(list,)): 225 | g = [g] 226 | 227 | if self.env.env_type in ['car']: 228 | return c,g, self.env.tile_visited_count/float(len(self.env.track)) 229 | else: 230 | return c,g, -c 231 | 232 | @staticmethod 233 | def discounted_sum(costs, discount): 234 | ''' 235 | Calculate discounted sum of costs 236 | ''' 237 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 238 | return y[::-1][0] 239 | 240 | class Monitor(object): 241 | def __init__(self, env, filepath): 242 | self.frame_num = 0 243 | self.vid_num = 0 244 | self.filepath = os.path.join(os.getcwd(), filepath) 245 | if not os.path.exists(self.filepath): 246 | os.makedirs(self.filepath) 247 | self.image_name = "image%05d.png" 248 | self.env = env 249 | self.images = [] 250 | 251 | def save(self): 252 | import matplotlib.pyplot as plt 253 | full_path = os.path.join(self.filepath, self.image_name % self.frame_num) 254 | self.images.append(full_path) 255 | # plt.imsave(full_path, self.env.render('rgb_array')) 256 | im = self.env.render('human', render_human=True) 257 | plt.imsave(full_path, im) 258 | self.frame_num += 1 259 | 260 | def make_video(self): 261 | import subprocess 262 | current_dir = os.getcwd() 263 | os.chdir(self.filepath) 264 | # #'ffmpeg -framerate 8 -i image%05d.png -r 30 -pix_fmt yuv420p car_vid_0.mp4' 265 | subprocess.call([ 266 
| 'ffmpeg', '-hide_banner', '-loglevel', 'panic', '-framerate', '8', '-i', self.image_name, '-r', '30', '-pix_fmt', 'yuv420p', 267 | 'car_vid_%s.mp4' % self.vid_num 268 | ]) 269 | 270 | self.vid_num += 1 271 | self.frame_num = 0 272 | os.chdir(current_dir) 273 | 274 | def delete(self): 275 | self.frame_num = 0 276 | current_dir = os.getcwd() 277 | os.chdir(self.filepath) 278 | 279 | for file_name in [f for f in os.listdir(os.getcwd()) if '.png' in f]: 280 | os.remove(file_name) 281 | 282 | os.chdir(current_dir) 283 | 284 | 285 | 286 | 287 | -------------------------------------------------------------------------------- /experimental_results/hyperparam_2018_12_18_22_20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/experimental_results/hyperparam_2018_12_18_22_20.png -------------------------------------------------------------------------------- /experimental_results/lspi.csv: -------------------------------------------------------------------------------- 1 | "(0.0, 0.0, -30.87238959928826, 12.867082852069641, 148.164007478981, 0.7107142857142857)","(0.0, 0.1, -25.94545768997468, 21.70199027024002, 89.24081946770697, 0.65)","(0.0, 0.2, -30.168532518598116, 22.257021833854374, 107.39076865114626, 0.65)","(0.0, 0.30000000000000004, -1.8828082912671142, 23.397868442728914, 14.194376691321793, 0.02142857142857143)","(0.0, 0.4, -4.590344009192178, 30.983625620907457, 3.4567222606865475, 0.02142857142857143)","(0.0, 0.5, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.0, 0.6000000000000001, 4.6413991292960235, 28.22368726462355, 9.465169261876653, 0.014285714285714285)","(0.0, 0.7000000000000001, 4.6413991292960235, 28.22368726462355, 9.465169261876653, 0.014285714285714285)","(0.0, 0.8, 2.297939913687321, 35.56502472794603, 9.664349912267248, 0.014285714285714285)","(0.0, 0.9, 1.9727973822886717, 36.24283135092373, 9.055573661191975, 0.014285714285714285)","(0.0, 1.0, 10.891434020469823, 24.150004214974345, 4.705301261323525, 0.010714285714285714)","(0.1, 0.0, -24.748313381309416, 13.469845336577208, 114.67437549922263, 0.10357142857142858)","(0.1, 0.1, -32.67756129871718, 18.849802000450236, 86.13297012296938, 0.9285714285714286)","(0.1, 0.2, -20.91840426304937, 16.0054493056005, 122.95385125055645, 0.95)","(0.1, 0.30000000000000004, -18.83435329655354, 24.141972805866068, 69.66527454287201, 0.6321428571428571)","(0.1, 0.4, -2.7137844892848833, 25.569733788701384, 10.119662766900444, 0.02142857142857143)","(0.1, 0.5, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.1, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.1, 0.7000000000000001, 4.6413991292960235, 28.22368726462355, 9.465169261876653, 0.014285714285714285)","(0.1, 0.8, 2.297939913687321, 35.56502472794603, 9.664349912267248, 0.014285714285714285)","(0.1, 0.9, 1.9727973822886717, 36.24283135092373, 9.055573661191975, 0.014285714285714285)","(0.1, 1.0, 10.891434020469823, 24.150004214974345, 4.705301261323525, 0.010714285714285714)","(0.2, 0.0, -24.748313381309416, 13.469845336577208, 114.67437549922263, 0.10357142857142858)","(0.2, 0.1, -32.6975504649815, 18.844508484929076, 85.99521166871655, 0.7464285714285714)","(0.2, 0.2, -16.14420423787202, 18.482310503597816, 135.3242842447658, 0.7535714285714286)","(0.2, 0.30000000000000004, 
-17.96785285098834, 21.592840875710646, 68.27198666070595, 0.95)","(0.2, 0.4, -1.8828082912671142, 23.514053630877942, 14.0929711230479, 0.02142857142857143)","(0.2, 0.5, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.2, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.2, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.2, 0.8, -0.9988878403025545, 35.29319244811491, 20.86673618851739, 0.017857142857142856)","(0.2, 0.9, 1.9727973822886717, 36.24283135092373, 9.055573661191975, 0.014285714285714285)","(0.2, 1.0, 2.640195209896431, 34.96761121280503, 11.512898307490847, 0.014285714285714285)","(0.30000000000000004, 0.0, -37.366352683512886, 8.655131082877158, 130.78663469709892, 0.4607142857142857)","(0.30000000000000004, 0.1, -35.23633331836208, 15.116653410287167, 104.24597832621448, 0.4785714285714286)","(0.30000000000000004, 0.2, -16.778659768121216, 18.383569331456822, 135.50375610072064, 0.95)","(0.30000000000000004, 0.30000000000000004, -22.79139400282686, 16.18029772432275, 94.60950347085678, 0.7714285714285715)","(0.30000000000000004, 0.4, -6.526389520136453, 26.876610715963384, 21.941088013242688, 0.025)","(0.30000000000000004, 0.5, 10.891434020469823, 21.577879214974345, 3.7486161816997186, 0.010714285714285714)","(0.30000000000000004, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.30000000000000004, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.30000000000000004, 0.8, 1.9727973822886717, 36.24283135092373, 12.04864268380703, 0.014285714285714285)","(0.30000000000000004, 0.9, 5.106994384221383, 27.503868817202427, 14.004486089260535, 0.014285714285714285)","(0.30000000000000004, 1.0, 2.640195209896431, 34.96761121280503, 11.512898307490847, 0.014285714285714285)","(0.4, 0.0, -49.51808233133347, 8.402237596363767, 130.08088253489376, 0.8321428571428572)","(0.4, 0.1, -35.23633331836208, 15.124203849946902, 104.12115167308805, 0.4785714285714286)","(0.4, 0.2, -38.47822503478941, 5.915208989147174, 140.33440044802362, 0.7464285714285714)","(0.4, 0.30000000000000004, -29.374755790747926, 13.309517746562904, 94.9673478042272, 0.95)","(0.4, 0.4, -13.178527186331424, 30.335514310987765, 39.514564863047845, 0.14642857142857144)","(0.4, 0.5, 10.891434020469823, 18.26723791515186, 4.089257249059158, 0.010714285714285714)","(0.4, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.4, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.4, 0.8, 1.9727973822886717, 36.24283135092373, 12.04864268380703, 0.014285714285714285)","(0.4, 0.9, 10.891434020469823, 24.033937074349346, 6.274179040273764, 0.010714285714285714)","(0.4, 1.0, 10.891434020469823, 24.033937074349346, 6.251521015690073, 0.010714285714285714)","(0.5, 0.0, -30.998114130432857, 7.516518168786301, 166.9666230677311, 0.10357142857142858)","(0.5, 0.1, -35.08117877648681, 15.111430052428831, 116.87681106485796, 0.6642857142857143)","(0.5, 0.2, -38.43576004754668, 5.903004129312425, 140.22778928687947, 0.6535714285714286)","(0.5, 0.30000000000000004, -31.501277382990466, 15.928346140297581, 120.07952966692767, 0.9285714285714286)","(0.5, 0.4, -1.8828082912671142, 22.511819094699007, 13.489989370595756, 0.02142857142857143)","(0.5, 0.5, -0.5869787715565078, 
26.56534165044393, 1.9626366011276195, 0.017857142857142856)","(0.5, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.5, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.5, 0.8, 5.106994384221383, 27.503868817202427, 14.967075475005078, 0.014285714285714285)","(0.5, 0.9, 1.9727973822886717, 36.24283135092373, 12.04864268380703, 0.014285714285714285)","(0.5, 1.0, 10.891434020469823, 24.033937074349346, 6.251521015690073, 0.010714285714285714)","(0.6000000000000001, 0.0, -30.998114130432857, 7.516518168786301, 166.95827914277348, 0.10357142857142858)","(0.6000000000000001, 0.1, -49.77443204139867, 3.1525241053713087, 142.88979119642954, 0.7571428571428571)","(0.6000000000000001, 0.2, -36.37118181046612, 2.168736141789653, 125.12577496981973, 0.7535714285714286)","(0.6000000000000001, 0.30000000000000004, -23.917619918151, 18.612682590090827, 109.51548998648326, 0.7571428571428571)","(0.6000000000000001, 0.4, -16.80912853057475, 25.051325386796755, 55.33028450551636, 0.7392857142857143)","(0.6000000000000001, 0.5, -2.1114579498688713, 23.533503973563352, 11.433697403414804, 0.02142857142857143)","(0.6000000000000001, 0.6000000000000001, 3.0004639427481266, 31.644453943577666, 12.796745528901027, 0.014285714285714285)","(0.6000000000000001, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.6000000000000001, 0.8, 5.106994384221383, 27.503868817202427, 14.967075475005078, 0.014285714285714285)","(0.6000000000000001, 0.9, 1.9727973822886717, 36.24283135092373, 12.04864268380703, 0.014285714285714285)","(0.6000000000000001, 1.0, 10.891434020469823, 24.033937074349346, 6.251521015690073, 0.010714285714285714)","(0.7000000000000001, 0.0, -30.517809626923295, 3.115532712000771, 140.22804519004828, 0.10714285714285714)","(0.7000000000000001, 0.1, -53.59647932118728, 0.6339452797313537, 157.22639231995117, 0.9285714285714286)","(0.7000000000000001, 0.2, -34.3907008089484, 2.831270113650506, 102.43360109954152, 0.625)","(0.7000000000000001, 0.30000000000000004, -23.885364246748072, 18.655017987744426, 109.6178053788328, 0.6535714285714286)","(0.7000000000000001, 0.4, -23.172040167102544, 19.593291455152045, 93.3109526893351, 0.6357142857142857)","(0.7000000000000001, 0.5, -1.8828082912671142, 23.533503973563352, 13.489989370595756, 0.02142857142857143)","(0.7000000000000001, 0.6000000000000001, 10.891434020469823, 21.577879214974345, 4.915735172714833, 0.010714285714285714)","(0.7000000000000001, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.7000000000000001, 0.8, 5.106994384221383, 27.503868817202427, 14.967075475005078, 0.014285714285714285)","(0.7000000000000001, 0.9, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.7000000000000001, 1.0, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.8, 0.0, -30.517809626923295, 1.8575583526703745, 150.84933819779647, 0.10714285714285714)","(0.8, 0.1, -53.59328533339544, 0.6241644732013664, 157.20796735572, 0.9142857142857143)","(0.8, 0.2, -44.350115646278844, 2.370291860876593, 150.79763802436585, 0.6535714285714286)","(0.8, 0.30000000000000004, -36.65360783356822, 6.85555252981692, 127.66890081348451, 0.6535714285714286)","(0.8, 0.4, -28.69023667233265, 13.513905397977704, 95.1969742458172, 0.7464285714285714)","(0.8, 0.5, -1.8828082912671142, 23.533503973563352, 
13.489989370595756, 0.02142857142857143)","(0.8, 0.6000000000000001, 10.891434020469823, 21.577879214974345, 4.915735172714833, 0.010714285714285714)","(0.8, 0.7000000000000001, 3.0004639427481266, 31.644453943577666, 12.72664986916376, 0.014285714285714285)","(0.8, 0.8, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.8, 0.9, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.8, 1.0, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.9, 0.0, -30.517809626923295, 1.476668335005992, 152.9233128473738, 0.10714285714285714)","(0.9, 0.1, -53.35815769429593, 0.6317049705640946, 156.96606713556662, 0.6571428571428571)","(0.9, 0.2, -44.48193015587891, 2.297478229398147, 150.35972803712596, 0.9214285714285714)","(0.9, 0.30000000000000004, -26.24859128746181, 5.450342796473387, 190.0706381948163, 0.09642857142857143)","(0.9, 0.4, -22.94466852822807, 17.705861613273452, 96.6822227929777, 0.75)","(0.9, 0.5, -1.8828082912671142, 22.511819094699007, 13.489989370595756, 0.02142857142857143)","(0.9, 0.6000000000000001, -0.8804199061437927, 23.36313968711469, 13.5047025718302, 0.017857142857142856)","(0.9, 0.7000000000000001, 5.106994384221383, 27.253086629702423, 12.641385213197506, 0.014285714285714285)","(0.9, 0.8, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.9, 0.9, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(0.9, 1.0, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(1.0, 0.0, -30.517809626923295, 1.4401887435416998, 152.96870715709716, 0.10714285714285714)","(1.0, 0.1, -51.77812949566279, 0.6536474492784501, 159.9790481111361, 0.4035714285714286)","(1.0, 0.2, -27.626541265577274, 1.531876264717034, 187.6652679786134, 0.09642857142857143)","(1.0, 0.30000000000000004, -34.373835628250184, 2.002533434179407, 147.72391148128506, 0.46785714285714286)","(1.0, 0.4, -19.091133223784908, 20.983568683604602, 123.45859743847552, 0.9178571428571428)","(1.0, 0.5, -1.760592374266131, 21.781018854056324, 16.58261445985894, 0.02142857142857143)","(1.0, 0.6000000000000001, -0.8804199061437927, 23.36313968711469, 13.5047025718302, 0.017857142857142856)","(1.0, 0.7000000000000001, 5.106994384221383, 27.253086629702423, 12.641385213197506, 0.014285714285714285)","(1.0, 0.8, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(1.0, 0.9, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)","(1.0, 1.0, 5.106994384221383, 27.503868817202427, 16.537344058235494, 0.014285714285714285)" 2 | -------------------------------------------------------------------------------- /experimental_results/policy_improvement_grid.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/experimental_results/policy_improvement_grid.h5 -------------------------------------------------------------------------------- /experimental_results_car/experiment_results_2019_01_03_11_00.csv: -------------------------------------------------------------------------------- 1 | iteration,max_L,min_L,c_exact_avg,g_exact_avg_0,g_exact_avg_1,c_avg,g_avg_0,g_avg_1,c_pi_exact,g_pi_exact_0,g_pi_exact_1,c_pi,g_pi_0,g_pi_1,lambda_0,lambda_1,c_br_exact,g_br_exact_0,g_br_exact_1,c_br,g_br_0,g_br_1 2 | 
2.0,6.148661136627197,-574.5399553239504,-198.1867753728582,0.9244763240899999,14.042931089729445,6.148661136627197,6.882289762794971,5.676133691892028,-140.75793204722302,0.9342286880693633,13.112063586052079,10.269734382629395,12.21979808807373,12.74373722076416,9.969514664921242,9.962982068916636,-134.8524506497504,0.0,13.243908279465264,2.394774913787842,0.8382666707038879,1.3645581007003784 3 | 3.0,6.425106048583984,-584.576169644116,-165.98762484239143,0.6501060330924572,14.21169368156832,6.425106048583984,4.148929041624069,3.4076230419799685,-153.6947530179227,0.998001,18.619457899348593,15.05801010131836,1.6015316247940063,0.35395070910453796,9.925027444371498,9.918910075482357,-101.32163468214613,0.0,24.72728498336814,1.4825304746627808,-0.08765091747045517,1.2559328079223633 4 | 4.0,6.676574230194092,-569.3260919943367,-147.7454015120166,0.5654166130502738,15.375663136097678,6.676574230194092,3.9049627065658568,3.539063963169853,-23.490450456600467,0.9801888648295347,11.68609128093784,9.921900749206543,3.4011237621307373,4.371957302093506,9.879761311445243,9.87338172331725,-126.48704510723287,0.0,12.922694045301075,3.5963618755340576,2.5682566165924072,-0.21287555992603302 5 | 5.0,6.462437629699707,-574.2233519092224,-140.63167187065525,0.4723902128926595,15.73495434720103,6.462437629699707,3.7522922288626432,3.3337234503589572,-130.00775776067255,0.0,21.10419176407419,9.533533096313477,3.3523638248443604,3.174515962600708,9.835181814902072,9.827126631899771,-2.5386004956734625,0.0,0.0,4.74336576461792,1.9225209951400757,-0.30694302916526794 6 | 6.0,5.546809196472168,-592.8923824372823,-163.9243814309871,0.37791217031412755,15.587451973055595,5.546809196472168,3.62703645080328,2.845789700075984,-148.0896022302071,0.0,13.257165444910175,-0.44830286502838135,5.215179443359375,2.8287134170532227,9.79177020002498,9.774996883869964,-217.62097308094042,0.0,7.742500215588536,-0.10064613074064255,0.0935686007142067,-0.009326789528131485 7 | -------------------------------------------------------------------------------- /experimental_results_car/experiment_results_2019_01_03_15_00.csv: -------------------------------------------------------------------------------- 1 | iteration,max_L,min_L,c_exact_avg,g_exact_avg_0,g_exact_avg_1,c_avg,g_avg_0,g_avg_1,c_pi_exact,g_pi_exact_0,g_pi_exact_1,c_pi,g_pi_0,g_pi_1,lambda_0,lambda_1,c_br_exact,g_br_exact_0,g_br_exact_1,c_br,g_br_0,g_br_1 2 | 2.0,6.148661136627197,-574.5399553239504,-198.1867753728582,0.9244763240899999,14.042931089729445,6.148661136627197,6.882289762794971,5.676133691892028,-140.75793204722302,0.9342286880693633,13.112063586052079,10.269734382629395,12.21979808807373,12.74373722076416,9.969514664921242,9.962982068916636,-134.8524506497504,0.0,13.243908279465264,2.394774913787842,0.8382666707038879,1.3645581007003784 3 | 3.0,6.425106048583984,-584.576169644116,-165.98762484239143,0.6501060330924572,14.21169368156832,6.425106048583984,4.148929041624069,3.4076230419799685,-153.6947530179227,0.998001,18.619457899348593,15.05801010131836,1.6015316247940063,0.35395070910453796,9.925027444371498,9.918910075482357,-101.32163468214613,0.0,24.72728498336814,1.4825304746627808,-0.08765091747045517,1.2559328079223633 4 | 
4.0,6.676574230194092,-569.3260919943367,-147.7454015120166,0.5654166130502738,15.375663136097678,6.676574230194092,3.9049627065658568,3.539063963169853,-23.490450456600467,0.9801888648295347,11.68609128093784,9.921900749206543,3.4011237621307373,4.371957302093506,9.879761311445243,9.87338172331725,-126.48704510723287,0.0,12.922694045301075,3.5963618755340576,2.5682566165924072,-0.21287555992603302 5 | 5.0,6.462437629699707,-574.2233519092224,-140.63167187065525,0.4723902128926595,15.73495434720103,6.462437629699707,3.7522922288626432,3.3337234503589572,-130.00775776067255,0.0,21.10419176407419,9.533533096313477,3.3523638248443604,3.174515962600708,9.835181814902072,9.827126631899771,-2.5386004956734625,0.0,0.0,4.74336576461792,1.9225209951400757,-0.30694302916526794 6 | 6.0,5.546809196472168,-592.8923824372823,-163.9243814309871,0.37791217031412755,15.587451973055595,5.546809196472168,3.62703645080328,2.845789700075984,-148.0896022302071,0.0,13.257165444910175,-0.44830286502838135,5.215179443359375,2.8287134170532227,9.79177020002498,9.774996883869964,-217.62097308094042,0.0,7.742500215588536,-0.10064613074064255,0.0935686007142067,-0.009326789528131485 7 | 7.0,3.827444553375244,-591.6153939177314,-162.18717787838338,0.3472098444537572,16.372180663182757,3.827444553375244,4.382718875755867,3.091123585837583,-147.51519705540926,0.0,15.621867960993018,-14.129043579101562,16.55801773071289,18.87964630126953,9.75821439310982,9.718220090422738,-113.76937185658383,0.0,24.657626784931256,0.13024234771728516,-0.016394317150115967,0.06715112924575806 8 | 8.0,3.0648725032806396,-501.46193763842234,-162.46840066296284,0.29760843810322046,16.00592572188844,3.0648725032806396,4.796520133316517,3.1554629401969057,-338.85815308597677,0.0,11.121831116525849,-1.2769678831100464,8.990571022033691,7.094027042388916,9.726399901907964,9.67252877144249,-87.02938452345659,0.0,20.770772745096483,-0.606652021408081,1.2419638633728027,7.908010005950928 9 | 9.0,2.7094998359680176,-569.1150850238164,-171.525873743171,0.355815195191935,15.498535665696489,2.7094998359680176,5.233984155859798,3.252672763634473,-165.41534154458387,0.0,11.678721787001823,2.0317790508270264,14.147116661071777,7.994142055511475,9.692725269397314,9.622130287830839,-136.9522744322237,0.0,15.791085469438224,-1.2493207454681396,0.956859290599823,1.256043791770935 10 | 10.0,2.4874157905578613,-557.3489573717159,-165.57653206627097,0.3363665591698666,15.424112272134662,2.4874157905578613,4.853659489378333,3.0225775541530715,-38.919015507108824,0.0,6.769833000615041,2.765155553817749,5.4268574714660645,3.130974769592285,9.649147925210638,9.572164286242321,-67.00296205336274,0.9029834676116293,13.576669165799476,0.1681966334581375,1.9059392213821411,1.2325890064239502 11 | 11.0,2.563368320465088,-556.9338330950056,-168.31466905274178,0.3170199628355614,15.74150682802861,2.563368320465088,4.789618060551584,3.2093044956028463,-43.477502465370364,0.0,12.540185809619366,0.6309247612953186,6.421274185180664,9.964588165283203,9.604771170052947,9.525801578257058,-147.296070173987,0.0,16.477174533949274,3.9456264972686768,1.4120573997497559,1.2542179822921753 12 | 12.0,2.4370322227478027,-527.423361767437,-172.48643098335398,0.30432482417025697,15.853650768603806,2.4370322227478027,5.0258558272976765,3.3600188912315803,-150.0063790306519,0.0,17.822908796974087,-6.6260833740234375,13.684722900390625,13.825427055358887,9.565612396055803,9.478238170086911,-44.128799941426564,0.0,14.512974359373565,-4.980665683746338,1.489221453666687,4.991724491119385 13 | 
13.0,1.02047860622406,-543.5842780136225,-173.80114758790893,0.34818484257911464,16.387899557940642,1.02047860622406,5.1424289941477275,3.4068895703802506,-137.33306714979042,0.0,24.57957710003057,-31.677888870239258,9.981054306030273,8.623197555541992,9.528966004662346,9.43153405836201,-110.30751104064173,0.0,12.249665961879503,1.4286595582962036,2.4452579021453857,1.5892701148986816 14 | 14.0,-0.36623436212539673,-566.9795281820868,-171.58695901705673,0.3358173468247523,16.572092958476333,-0.36623436212539673,5.055496841181929,3.507722426033937,-137.1271940787331,0.9370369888620198,14.985253027358416,-30.010042190551758,6.458477020263672,10.072896957397461,9.48499368693826,9.38665464694548,-66.22942833376419,0.0,9.590799591462146,2.6964595317840576,0.9590728282928467,0.4115790128707886 15 | -------------------------------------------------------------------------------- /exponentiated_gradient.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | 5 | class ExponentiatedGradient(object): 6 | def __init__(self, lambda_bound, number_of_constraints, eta=1., starting_lambda='uniform'): 7 | ''' 8 | ''' 9 | self.eta = eta 10 | self.lambda_bound = lambda_bound 11 | self.number_of_constraints = number_of_constraints 12 | if starting_lambda == 'uniform': 13 | self.w_t = self.lambda_bound*np.ones(self.number_of_constraints)/self.number_of_constraints 14 | else: 15 | self.w_t = starting_lambda 16 | self.lambda_bound = np.sum(starting_lambda) 17 | 18 | def run(self, gradient): 19 | self.w_t = self.w_t/self.lambda_bound 20 | unnormalized_wt = self.w_t*np.exp(self.eta*gradient) # positive since working w/ costs. 21 | self.w_t = self.lambda_bound*unnormalized_wt/sum(unnormalized_wt) 22 | return self.w_t 23 | 24 | def get(self): 25 | return self.w_t -------------------------------------------------------------------------------- /fitted_algo.py: -------------------------------------------------------------------------------- 1 | 2 | from keras import backend as K 3 | import numpy as np 4 | 5 | class FittedAlgo(object): 6 | def __init__(self): 7 | ''' 8 | An implementation of fitted Q iteration 9 | 10 | num_inputs: number of inputs 11 | dim_of_actions: dimension of action space 12 | max_epochs: positive int, specifies how many iterations to run the algorithm 13 | gamma: discount factor 14 | ''' 15 | 16 | def init_Q(self): 17 | ''' 18 | Absract function 19 | ''' 20 | pass 21 | 22 | def fit(self, X, y, epsilon=1e-10, **kw): 23 | # D_k = {(X,y)} is the dataset of the kth iteration of Fitted Q 24 | # self.Q_k = self.init_Q(epsilon) 25 | # K.set_value(self.Q_k.model.optimizer.iterations, 0) 26 | self.Q_k.epsilon = epsilon 27 | self.Q_k.fit(X, y, **kw) 28 | 29 | def fit_generator(self, generator, epsilon=1e-10, **kw): 30 | # D_k = {(X,y)} is the dataset of the kth iteration of Fitted Q 31 | # self.Q_k = self.init_Q(epsilon) 32 | # K.set_value(self.Q_k.model.optimizer.iterations, 0) 33 | self.Q_k.epsilon = epsilon 34 | self.Q_k.fit_generator(generator, **kw) 35 | 36 | def skim(self, X_a, x_prime): 37 | full_set = np.hstack([X_a, x_prime.reshape(1,-1).T]) 38 | idxs = np.unique(full_set, axis=0, return_index=True)[1] 39 | return idxs 40 | 41 | def run(self, dataset): 42 | ''' 43 | Abstract function 44 | ''' 45 | pass 46 | 47 | 48 | -------------------------------------------------------------------------------- /fitted_off_policy_evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | from fitted_algo 
import FittedAlgo 3 | import numpy as np 4 | from tqdm import tqdm 5 | from env_nn import * 6 | from thread_safe import threadsafe_generator 7 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 8 | 9 | class LakeFittedQEvaluation(FittedAlgo): 10 | def __init__(self, initial_states, num_inputs, grid_shape, dim_of_actions, max_epochs, gamma,model_type='mlp', position_of_goals=None, position_of_holes=None, num_frame_stack=None): 11 | 12 | ''' 13 | An implementation of fitted Q iteration 14 | 15 | num_inputs: number of inputs 16 | dim_of_actions: dimension of action space 17 | max_epochs: positive int, specifies how many iterations to run the algorithm 18 | gamma: discount factor 19 | ''' 20 | self.model_type = model_type 21 | self.initial_states = initial_states 22 | self.num_inputs = num_inputs 23 | self.dim_of_actions = dim_of_actions 24 | self.max_epochs = max_epochs 25 | self.gamma = gamma 26 | self.grid_shape = grid_shape 27 | self.position_of_holes = position_of_holes 28 | self.position_of_goals = position_of_goals 29 | self.num_frame_stack = num_frame_stack 30 | 31 | super(LakeFittedQEvaluation, self).__init__() 32 | 33 | def run(self, policy, which_cost, dataset, epochs=500, epsilon=1e-8, desc='FQE', g_idx=None, **kw): 34 | # dataset is the original dataset generated by pi_{old} to which we will find 35 | # an approximately optimal Q 36 | 37 | self.Q_k = self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 38 | 39 | X_a = np.hstack(dataset.get_state_action_pairs('lake')) 40 | x_prime = dataset['x_prime'] 41 | 42 | index_of_skim = self.skim(X_a, x_prime) 43 | X_a = X_a[index_of_skim] 44 | x_prime = x_prime[index_of_skim] 45 | dataset.set_cost(which_cost, idx=g_idx) 46 | dataset_costs = dataset['cost'][index_of_skim] 47 | dones = dataset['done'][index_of_skim] 48 | pi_of_x_prime = policy(x_prime) 49 | x_prime = x_prime.reshape(-1) 50 | 51 | values = [] 52 | for k in tqdm(range(self.max_epochs), desc=desc): 53 | 54 | # {((x,a), r+gamma* Q(x',pi(x')))} 55 | 56 | # if k == 0: 57 | # # Q_0 = 0 everywhere 58 | # costs = dataset_costs 59 | # else: 60 | costs = dataset_costs + (self.gamma*self.Q_k(x_prime, pi_of_x_prime).reshape(-1)*(1-dones.astype(int))).reshape(-1) 61 | 62 | # if (k >= (self.max_epochs-100)): K.set_value(self.Q_k.model.optimizer.lr, 0.00001) 63 | self.fit(X_a, costs, epochs=epochs, batch_size=X_a.shape[0], epsilon=epsilon, evaluate=False, verbose=0) 64 | values.append(np.mean([self.Q_k(state, policy(state)) for state in self.initial_states])) 65 | print values[-1] 66 | # if not self.Q_k.callbacks_list[0].converged: 67 | # print 'Continuing training due to lack of convergence' 68 | # self.fit(X_a, costs, epochs=epochs, batch_size=X_a.shape[0], epsilon=epsilon, evaluate=False, verbose=0) 69 | 70 | return np.mean(values[-10:]), values #np.mean([self.Q_k(state, policy(state)) for state in self.initial_states]) 71 | 72 | def init_Q(self, epsilon=1e-10, **kw): 73 | return LakeNN(self.num_inputs, 1, self.grid_shape, self.dim_of_actions, self.gamma, epsilon, **kw) 74 | 75 | class CarFittedQEvaluation(FittedAlgo): 76 | def __init__(self, state_space_dim, 77 | dim_of_actions, 78 | max_epochs, 79 | gamma, 80 | model_type='cnn', 81 | num_frame_stack=None): 82 | 83 | ''' 84 | An implementation of fitted Q iteration 85 | 86 | num_inputs: number of inputs 87 | dim_of_actions: dimension of action space 88 | max_epochs: positive int, specifies how many iterations to run the algorithm 89 | gamma: discount factor 90 | ''' 91 | 
'''
self.model_type = model_type 92 | 93 | 94 | self.state_space_dim = state_space_dim 95 | self.dim_of_actions = dim_of_actions 96 | self.max_epochs = max_epochs 97 | self.gamma = gamma 98 | self.num_frame_stack = num_frame_stack 99 | self.Q_k = None 100 | self.Q_k_minus_1 = None 101 | 102 | earlyStopping = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='min', restore_best_weights=True) 103 | mcp_save = ModelCheckpoint('fqi.hdf5', save_best_only=True, monitor='val_loss', mode='min') 104 | reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min') 105 | 106 | self.more_callbacks = [earlyStopping, mcp_save, reduce_lr_loss] 107 | 108 | super(CarFittedQEvaluation, self).__init__() 109 | 110 | def run(self, policy, which_cost, dataset, epochs=1, epsilon=1e-8, desc='FQE', g_idx=None, testing=True, **kw): 111 | # dataset is the original dataset generated by pi_{old} to which we will find 112 | # an approximately optimal Q 113 | 114 | dataset.set_cost(which_cost, idx=g_idx) 115 | print 'Scale: ', dataset.scale 116 | # try: 117 | # initial_states = np.unique([episode.frames[[0]*episode.num_frame_stack] for episode in dataset.episodes], axis=0) 118 | # except: 119 | # initial_states = np.rollaxis(dataset['frames'][dataset['prev_states'][[0]]],1,4) 120 | 121 | initial_states = np.rollaxis(dataset['frames'][dataset['prev_states'][[0]]],1,4) 122 | 123 | # if self.Q_k is None: 124 | self.Q_k = self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 125 | self.Q_k_minus_1 = self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 126 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][[0]]], 1,4) 127 | self.Q_k.min_over_a([x_prime], x_preprocessed=True)[0] 128 | self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0] 129 | self.Q_k.copy_over_to(self.Q_k_minus_1) 130 | values = [] 131 | 132 | for k in tqdm(range(self.max_epochs), desc=desc): 133 | batch_size = 32 134 | 135 | dataset_length = len(dataset) 136 | perm = np.random.permutation(range(dataset_length)) 137 | eighty_percent_of_set = int(1.*len(perm)) 138 | training_idxs = perm[:eighty_percent_of_set] 139 | validation_idxs = perm[eighty_percent_of_set:] 140 | training_steps_per_epoch = int(.3 * np.ceil(len(training_idxs)/float(batch_size))) 141 | validation_steps_per_epoch = int(np.ceil(len(validation_idxs)/float(batch_size))) 142 | # steps_per_epoch = 1 #int(np.ceil(len(dataset)/float(batch_size))) 143 | train_gen = self.generator(policy, dataset, training_idxs, fixed_permutation=True, batch_size=batch_size) 144 | # val_gen = self.generator(policy, dataset, validation_idxs, fixed_permutation=True, batch_size=batch_size) 145 | 146 | self.fit_generator(train_gen, 147 | steps_per_epoch=training_steps_per_epoch, 148 | #validation_data=val_gen, 149 | #validation_steps=validation_steps_per_epoch, 150 | epochs=epochs, 151 | max_queue_size=10, 152 | workers=4, 153 | use_multiprocessing=False, 154 | epsilon=epsilon, 155 | evaluate=False, 156 | verbose=0, 157 | additional_callbacks = self.more_callbacks) 158 | self.Q_k.copy_over_to(self.Q_k_minus_1) 159 | if testing: 160 | actions = policy(initial_states[:,np.newaxis,...], x_preprocessed=True) 161 | assert len(actions) == initial_states.shape[0] 162 | Q_val = self.Q_k.all_actions([initial_states], x_preprocessed=True)[np.arange(len(actions)), actions] 163 | values.append(np.mean(Q_val)*dataset.scale) 164 | 165 | # initial_states = 
self.Q_k.representation(initial_states) 166 | if testing: 167 | return np.mean(values[-10:]), values 168 | actions = policy(initial_states[:,np.newaxis,...], x_preprocessed=True) 169 | Q_val = self.Q_k.all_actions([initial_states], x_preprocessed=True)[np.arange(len(actions)), actions] 170 | return np.mean(Q_val)*dataset.scale, values 171 | 172 | @threadsafe_generator 173 | def generator(self, policy, dataset, training_idxs, fixed_permutation=False, batch_size = 64): 174 | data_length = len(training_idxs) 175 | steps = int(np.ceil(data_length/float(batch_size))) 176 | i = -1 177 | amount_of_data_calcd = 0 178 | if fixed_permutation: 179 | calcd_costs = np.empty((len(training_idxs),), dtype='float64') 180 | while True: 181 | i = (i + 1) % steps 182 | # print 'Getting batch: %s to %s' % ((i*batch_size),((i+1)*batch_size)) 183 | if fixed_permutation: 184 | if i == 0: perm = np.random.permutation(training_idxs) 185 | batch_idxs = perm[(i*batch_size):((i+1)*batch_size)] 186 | else: 187 | batch_idxs = np.random.choice(training_idxs, batch_size) 188 | # amount_of_data_calcd += len(batch_idxs) 189 | # import pdb; pdb.set_trace() 190 | 191 | X = np.rollaxis(dataset['frames'][dataset['prev_states'][batch_idxs]],1,4) 192 | actions = np.atleast_2d(dataset['a'][batch_idxs]).T 193 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][batch_idxs]],1,4) 194 | dataset_costs = dataset['cost'][batch_idxs] 195 | dones = dataset['done'][batch_idxs] 196 | policy_action = dataset['pi_of_x_prime'][batch_idxs] 197 | 198 | # if fixed_permutation: 199 | # if amount_of_data_calcd <= data_length: 200 | # costs = dataset_costs + self.gamma*self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0]*(1-dones.astype(int)) 201 | # calcd_costs[(i*batch_size):((i+1)*batch_size)] = costs 202 | # else: 203 | # costs = calcd_costs[(i*batch_size):((i+1)*batch_size)] 204 | # else: 205 | # policy_action = policy(x_prime[:,np.newaxis,...], x_preprocessed=True) 206 | Q_val = self.Q_k_minus_1.all_actions([x_prime], x_preprocessed=True)[np.arange(len(policy_action)), policy_action] 207 | costs = dataset_costs + (self.gamma*Q_val.reshape(-1)*(1-dones.astype(int))).reshape(-1) 208 | 209 | X = self.Q_k_minus_1.representation([X], actions, x_preprocessed=True) 210 | 211 | yield (X, costs) 212 | 213 | def init_Q(self, epsilon=1e-10, **kw): 214 | return CarNN(self.state_space_dim, self.dim_of_actions, self.gamma, convergence_of_model_epsilon=epsilon, **kw) 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /fittedq.py: -------------------------------------------------------------------------------- 1 | 2 | from fitted_algo import FittedAlgo 3 | import numpy as np 4 | from tqdm import tqdm 5 | from env_nn import * 6 | from thread_safe import threadsafe_generator 7 | from keras import backend as K 8 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 9 | 10 | class LakeFittedQIteration(FittedAlgo): 11 | def __init__(self, num_inputs, grid_shape, dim_of_actions, max_epochs, gamma, model_type='mlp', position_of_goals=None, position_of_holes=None, num_frame_stack=None): 12 | ''' 13 | An implementation of fitted Q iteration 14 | 15 | num_inputs: number of inputs 16 | dim_of_actions: dimension of action space 17 | max_epochs: positive int, specifies how many iterations to run the algorithm 18 | gamma: discount factor 19 | ''' 20 | self.model_type = model_type 21 | self.num_inputs = num_inputs 22 | self.grid_shape= grid_shape 23 | 
self.dim_of_actions = dim_of_actions 24 | self.max_epochs = max_epochs 25 | self.gamma = gamma 26 | self.position_of_goals = position_of_goals 27 | self.position_of_holes = position_of_holes 28 | self.num_frame_stack = num_frame_stack 29 | 30 | super(LakeFittedQIteration, self).__init__() 31 | 32 | 33 | def run(self, dataset, epochs=3000, epsilon=1e-8, desc='FQI', **kw): 34 | # dataset is the original dataset generated by pi_{old} to which we will find 35 | # an approximately optimal Q 36 | 37 | self.Q_k = self.init_Q(model_type=self.model_type, position_of_holes=self.position_of_holes, position_of_goals=self.position_of_goals, num_frame_stack=self.num_frame_stack, **kw) 38 | 39 | X_a = np.hstack(dataset.get_state_action_pairs()) 40 | x_prime = dataset['x_prime'] 41 | 42 | index_of_skim = self.skim(X_a, x_prime) 43 | X_a = X_a[index_of_skim] 44 | x_prime = x_prime[index_of_skim] 45 | dataset_costs = dataset['cost'][index_of_skim] 46 | dones = dataset['done'][index_of_skim] 47 | 48 | for k in tqdm(range(self.max_epochs), desc=desc): 49 | 50 | # {((x,a), c+gamma*min_a Q(x',a))} 51 | costs = dataset_costs + self.gamma*self.Q_k.min_over_a(x_prime)[0]*(1-dones.astype(int)) 52 | 53 | self.fit(X_a, costs, epochs=epochs, batch_size=X_a.shape[0], epsilon=epsilon, evaluate=False, verbose=0) 54 | # import pdb; pdb.set_trace() 55 | 56 | # if not self.Q_k.callbacks_list[0].converged: 57 | # print 'Continuing training due to lack of convergence' 58 | # self.fit(X_a, costs, epochs=epochs, batch_size=X_a.shape[0], epsilon=epsilon, evaluate=False, verbose=0) 59 | 60 | return self.Q_k, [] 61 | 62 | def init_Q(self, epsilon=1e-10, **kw): 63 | return LakeNN(self.num_inputs, 1, self.grid_shape, self.dim_of_actions, self.gamma, convergence_of_model_epsilon=epsilon, **kw) 64 | 65 | 66 | class CarFittedQIteration(FittedAlgo): 67 | def __init__(self, state_space_dim, 68 | dim_of_actions, 69 | max_epochs, 70 | gamma, 71 | model_type='cnn', 72 | num_frame_stack=None, 73 | initialization=None, 74 | freeze_cnn_layers=False): 75 | ''' 76 | An implementation of fitted Q iteration 77 | 78 | num_inputs: number of inputs 79 | dim_of_actions: dimension of action space 80 | max_epochs: positive int, specifies how many iterations to run the algorithm 81 | gamma: discount factor 82 | ''' 83 | self.initialization = initialization 84 | self.freeze_cnn_layers = freeze_cnn_layers 85 | self.model_type = model_type 86 | self.state_space_dim = state_space_dim 87 | self.dim_of_actions = dim_of_actions 88 | self.max_epochs = max_epochs 89 | self.gamma = gamma 90 | self.num_frame_stack = num_frame_stack 91 | self.Q_k = None 92 | self.Q_k_minus_1 = None 93 | 94 | earlyStopping = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=10, verbose=1, mode='min', restore_best_weights=True) 95 | mcp_save = ModelCheckpoint('fqi.hdf5', save_best_only=True, monitor='val_loss', mode='min') 96 | reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, min_delta=1e-4, mode='min') 97 | 98 | self.more_callbacks = [earlyStopping, mcp_save, reduce_lr_loss] 99 | 100 | super(CarFittedQIteration, self).__init__() 101 | 102 | 103 | def run(self, dataset, epochs=1, epsilon=1e-8, desc='FQI', exact=None, **kw): 104 | # dataset is the original dataset generated by pi_{old} to which we will find 105 | # an approximately optimal Q 106 | 107 | # if self.Q_k is None: 108 | self.Q_k = self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 109 | self.Q_k_minus_1 = 
self.init_Q(model_type=self.model_type, num_frame_stack=self.num_frame_stack, **kw) 110 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][[0]]], 1,4) 111 | self.Q_k.min_over_a([x_prime], x_preprocessed=True)[0] 112 | self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0] 113 | self.Q_k.copy_over_to(self.Q_k_minus_1) 114 | values = [] 115 | 116 | for k in tqdm(range(self.max_epochs), desc=desc): 117 | batch_size = 64 118 | 119 | dataset_length = len(dataset) 120 | perm = np.random.permutation(range(dataset_length)) 121 | eighty_percent_of_set = int(1.*len(perm)) 122 | training_idxs = perm[:eighty_percent_of_set] 123 | validation_idxs = perm[eighty_percent_of_set:] 124 | training_steps_per_epoch = int(np.ceil(len(training_idxs)/float(batch_size))) 125 | validation_steps_per_epoch = int(np.ceil(len(validation_idxs)/float(batch_size))) 126 | # steps_per_epoch = 1 #int(np.ceil(len(dataset)/float(batch_size))) 127 | train_gen = self.generator(dataset, training_idxs, fixed_permutation=True, batch_size=batch_size) 128 | # val_gen = self.generator(dataset, validation_idxs, fixed_permutation=True, batch_size=batch_size) 129 | if (k >= (self.max_epochs-10)): K.set_value(self.Q_k.model.optimizer.lr, 0.0001) 130 | self.fit_generator(train_gen, 131 | steps_per_epoch=training_steps_per_epoch, 132 | #validation_data=val_gen, 133 | #validation_steps=validation_steps_per_epoch, 134 | epochs=epochs, 135 | max_queue_size=10, 136 | workers=4, 137 | use_multiprocessing=False, 138 | epsilon=epsilon, 139 | evaluate=False, 140 | verbose=0, 141 | additional_callbacks = self.more_callbacks) 142 | self.Q_k.copy_over_to(self.Q_k_minus_1) 143 | if k >= (self.max_epochs-10): 144 | c,g,perf = exact.run(self.Q_k,to_monitor=k==self.max_epochs) 145 | values.append([c,perf]) 146 | 147 | return self.Q_k, values 148 | 149 | @threadsafe_generator 150 | def generator(self, dataset, training_idxs, fixed_permutation=False, batch_size = 64): 151 | data_length = len(training_idxs) 152 | steps = int(np.ceil(data_length/float(batch_size))) 153 | i = -1 154 | amount_of_data_calcd = 0 155 | if fixed_permutation: 156 | calcd_costs = np.empty((len(training_idxs),), dtype='float64') 157 | while True: 158 | i = (i + 1) % steps 159 | # print 'Getting batch: %s to %s' % ((i*batch_size),((i+1)*batch_size)) 160 | if fixed_permutation: 161 | if i == 0: perm = np.random.permutation(training_idxs) 162 | batch_idxs = perm[(i*batch_size):((i+1)*batch_size)] 163 | else: 164 | batch_idxs = np.random.choice(training_idxs, batch_size) 165 | # amount_of_data_calcd += len(batch_idxs) 166 | # import pdb; pdb.set_trace() 167 | 168 | X = np.rollaxis(dataset['frames'][dataset['prev_states'][batch_idxs]],1,4) 169 | actions = np.atleast_2d(dataset['a'][batch_idxs]).T 170 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][batch_idxs]],1,4) 171 | dataset_costs = dataset['cost'][batch_idxs] 172 | dones = dataset['done'][batch_idxs] 173 | 174 | # if fixed_permutation: 175 | # if amount_of_data_calcd <= data_length: 176 | # costs = dataset_costs + self.gamma*self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0]*(1-dones.astype(int)) 177 | # calcd_costs[(i*batch_size):((i+1)*batch_size)] = costs 178 | # else: 179 | # costs = calcd_costs[(i*batch_size):((i+1)*batch_size)] 180 | # else: 181 | costs = dataset_costs + self.gamma*self.Q_k_minus_1.min_over_a([x_prime], x_preprocessed=True)[0]*(1-dones.astype(int)) 182 | 183 | X = self.Q_k_minus_1.representation([X], actions, x_preprocessed=True) 184 | 185 | yield (X, costs) 186 
| 187 | def init_Q(self, epsilon=1e-10, **kw): 188 | model = CarNN(self.state_space_dim, self.dim_of_actions, self.gamma, convergence_of_model_epsilon=epsilon, freeze_cnn_layers=self.freeze_cnn_layers, **kw) 189 | if (self.initialization is not None) and self.freeze_cnn_layers: 190 | self.initialization.Q.copy_over_to(model) 191 | for layer in model.model.layers: 192 | if layer.trainable: 193 | try: 194 | layer.kernel.initializer.run( session = K.get_session() ) 195 | except: 196 | pass 197 | try: 198 | layer.bias.initializer.run( session = K.get_session() ) 199 | except: 200 | pass 201 | return model 202 | -------------------------------------------------------------------------------- /fixed_policy.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | 3 | 4 | import numpy as np 5 | from copy import deepcopy 6 | 7 | class FixedPolicy(Model): 8 | def __init__(self, policy, action_space_dim, policy_evalutor): 9 | ''' 10 | A fixed manual policy 11 | ''' 12 | super(FixedPolicy, self).__init__() 13 | self.policy = policy 14 | self.action_space_dim = action_space_dim 15 | 16 | #debug purposes 17 | self.policy_evalutor = deepcopy(policy_evalutor) 18 | self.Q = None 19 | self.get_Q_val() 20 | 21 | def get_Q_val(self): 22 | self.policy_evalutor.initial_states = np.hstack([np.nonzero((self.policy_evalutor.env.desc == 'S').reshape(-1))[0], np.nonzero((self.policy_evalutor.env.desc == 'F').reshape(-1))[0]]) 23 | self.Q_tmp = self.policy_evalutor.get_Qs(self, self.policy_evalutor.initial_states, 64) 24 | 25 | self.Q = {} 26 | for idx, state in enumerate(self.policy_evalutor.initial_states): 27 | self.Q[state] = np.eye(self.action_space_dim)[self.policy[state]]*(self.Q_tmp[idx]-1e-7) 28 | 29 | def copy_over_to(self, to_): 30 | pass 31 | 32 | def predict(self, X_a): 33 | pass # return [self.model[np.argmax(x_a[:-self.action_space_dim], axis = 1)] == np.argmax(x_a[-self.action_space_dim:], axis=1) for x_a in X_a] 34 | 35 | def fit(self, X, y, verbose=0): 36 | pass 37 | 38 | def representation(self, *args, **kw): 39 | if len(args) == 1: 40 | return args[0] 41 | elif len(args) == 2: 42 | return args[0], args[1] 43 | else: 44 | raise NotImplemented 45 | 46 | def all_actions(self, X, **kw): 47 | if self.Q is None: 48 | return np.array([-np.eye(self.action_space_dim)[self.policy[x]] for x in X]) 49 | else: 50 | arr = [] 51 | for x in X: 52 | try: 53 | arr.append(self.Q[x]) 54 | except: 55 | arr.append([0]*self.action_space_dim) 56 | return np.array(arr) 57 | 58 | -------------------------------------------------------------------------------- /fqe_quality_test_generalization.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | np.set_printoptions(suppress=True) 4 | np.random.seed(314) 5 | import tensorflow as tf 6 | from optimization_problem import Dataset 7 | from fittedq import FittedQIteration 8 | from fixed_policy import FixedPolicy 9 | from fitted_off_policy_evaluation import FittedQEvaluation 10 | from exact_policy_evaluation import ExactPolicyEvaluator 11 | from inverse_propensity_scoring import InversePropensityScorer 12 | from exact_policy_evaluation import ExactPolicyEvaluator 13 | from optimal_policy import DeepQLearning 14 | from print_policy import PrintPolicy 15 | from keras.models import load_model 16 | import pandas as pd 17 | import matplotlib 18 | matplotlib.use('TkAgg') 19 | import matplotlib.pyplot as plt 20 | 21 | ### 22 | #paths 23 | import os 24 | model_dir = 
os.path.join(os.getcwd(), 'models') 25 | if not os.path.exists(model_dir): 26 | os.makedirs(model_dir) 27 | ### 28 | 29 | #### Setup Gym 30 | import gym 31 | from gym.envs.registration import register 32 | map_size = [4,4] 33 | register( id='FrozenLake-no-slip-v0', entry_point='gym.envs.toy_text:FrozenLakeEnv', kwargs={'is_slippery': False, 'map_name':'{0}x{1}'.format(map_size[0], map_size[1])} ) 34 | env = gym.make('FrozenLake-no-slip-v0') 35 | position_of_holes = np.arange(env.desc.shape[0]*env.desc.shape[1]).reshape(env.desc.shape)[np.nonzero(env.desc == 'H')] 36 | position_of_goals = np.arange(env.desc.shape[0]*env.desc.shape[1]).reshape(env.desc.shape)[np.nonzero(env.desc == 'G')] 37 | 38 | #### Hyperparam 39 | gamma = 0.9 40 | max_fitting_epochs = 10 #max number of epochs over which to converge to Q^\ast 41 | lambda_bound = 10. # l1 bound on lagrange multipliers 42 | epsilon = .01 # termination condition for two-player game 43 | deviation_from_old_policy_eps = .7 #With what probabaility to deviate from the old policy 44 | # convergence_epsilon = 1e-6 # termination condition for model convergence 45 | action_space_dim = env.nA # action space dimension 46 | state_space_dim = env.nS # state space dimension 47 | eta = 10. # param for exponentiated gradient algorithm 48 | initial_states = [[0]] #The only initial state is [1,0...,0]. In general, this should be a list of initial states 49 | policy_evaluator = ExactPolicyEvaluator(initial_states, state_space_dim, gamma) 50 | 51 | #### Get a decent policy. Called pi_old because this will be the policy we use to gather data 52 | policy_old = None 53 | old_policy_path = os.path.join(model_dir, 'pi_old.h5') 54 | policy_old = DeepQLearning(env, gamma) 55 | if not os.path.isfile(old_policy_path): 56 | print 'Learning a policy using DQN' 57 | policy_old.learn() 58 | policy_old.Q.model.save(old_policy_path) 59 | print policy_old.Q.evaluate(render=True) 60 | else: 61 | print 'Loading a policy' 62 | policy_old.Q.model = load_model(old_policy_path) 63 | print policy_old.Q.evaluate(render=True) 64 | 65 | print 'Old Policy' 66 | PrintPolicy(env=env).pprint(policy_old) 67 | 68 | # model_dict = {0: 1, 4: 1, 8: 0} 69 | # for i in range(grid_size*grid_size): 70 | # if i not in model_dict: 71 | # model_dict[i] = np.random.randint(action_space_dim) 72 | # policy_old = FixedPolicy(model_dict, action_space_dim, policy_evaluator) 73 | # PrintPolicy().pprint(policy_old) 74 | 75 | ### Policy to evaluate 76 | model_dict = {0: 1, 4: 1, 8: 2, 9: 1, 13: 2, 14: 2} 77 | for i in range(map_size[0]*map_size[1]): 78 | if i not in model_dict: 79 | model_dict[i] = np.random.randint(action_space_dim) 80 | policy = FixedPolicy(model_dict, action_space_dim, policy_evaluator) 81 | 82 | print 'Evaluate this policy:' 83 | PrintPolicy(env=env).pprint(policy) 84 | 85 | #### Problem setup 86 | 87 | def main(policy_old, policy, model_type='cnn'): 88 | 89 | fqi = FittedQIteration(state_space_dim + action_space_dim, map_size, action_space_dim, max_fitting_epochs, gamma,model_type =model_type ) 90 | fqe = FittedQEvaluation(initial_states, state_space_dim + action_space_dim, map_size, action_space_dim, max_fitting_epochs, gamma,model_type =model_type ) 91 | ips = InversePropensityScorer(action_space_dim) 92 | exact_evaluation = ExactPolicyEvaluator(initial_states, state_space_dim, gamma, env) 93 | 94 | max_epochs = np.array([1000]) # np.arange(50,1060,100) # max number of epochs over which to collect data 95 | epsilons = np.array([.25]) # np.array([.5]) 96 | trials = np.array([1,2]) # 
np.arange(20) 97 | eps_epochs_trials = cartesian_product(epsilons, max_epochs,trials) 98 | 99 | all_trials_estimators = [] 100 | for epsilon in epsilons: 101 | 102 | trials_estimators = [] 103 | for epochs in max_epochs: 104 | 105 | trial_estimators = [] 106 | for trial in trials: 107 | estimators = run_trial(policy_old, policy, epochs, epsilon, fqi, fqe, ips, exact_evaluation) 108 | 109 | trial_estimators.append(estimators) 110 | trials_estimators.append(trial_estimators) 111 | 112 | all_trials_estimators.append(trials_estimators) 113 | 114 | # print epsilon, np.mean(all_trials_evaluated[-1]), np.mean(all_trials_approx_ips[-1]), np.mean(all_trials_exact_ips[-1]), np.mean(all_trials_exact[-1]) 115 | 116 | results = np.hstack([eps_epochs_trials, np.array(all_trials_estimators).reshape(-1, np.array(all_trials_estimators).shape[-1])]) 117 | df = pd.DataFrame(results, columns=['epsilon', 'num_trajectories', 'trial_num', 'exact','fqe']) 118 | df.to_csv('fqe_quality.csv', index=False) 119 | 120 | def run_trial(policy_old, policy, epochs, epsilon, fqi, fqe, ips, exact_evaluation): 121 | #### Collect Data 122 | num_goal = 0 123 | num_hole = 0 124 | dataset = Dataset([0], action_space_dim) 125 | dataset_removed = Dataset([0], action_space_dim) 126 | 127 | data = [] 128 | mapping = {0:np.array([0,-1]), 2:np.array([0,1]), 1:np.array([1,0]), 3:np.array([-1,0])} 129 | for x in set(np.nonzero(env.desc.reshape(-1) == 'F')[0]).union(set(np.nonzero(env.desc.reshape(-1) == 'S')[0])) : 130 | for action in range(4): 131 | 132 | # if x == 4: import pdb; pdb.set_trace() 133 | row = int(x/map_size[1]) 134 | col = int(x - row*int(map_size[1])) 135 | 136 | new_row, new_col = np.array([row, col]) + mapping[action] 137 | if (new_row < 0) or (new_row > (map_size[0]-1)): 138 | new_row, new_col = row, col 139 | elif (new_col < 0) or (new_col > (map_size[1]-1)): 140 | new_row, new_col = row, col 141 | else: 142 | pass 143 | x_prime = new_row*map_size[1] + new_col 144 | 145 | if (env.desc[new_row, new_col] == 'H') or (env.desc[new_row, new_col] == 'G'): 146 | done = True 147 | else: 148 | done = False 149 | 150 | if env.desc[new_row, new_col] == 'G': 151 | goal = True 152 | else: 153 | goal = False 154 | 155 | data.append([x,action,x_prime,-goal,done and not goal,done]) 156 | 157 | 158 | for idx, datum in enumerate(data): 159 | count = idx % 4 160 | 161 | if count == 0: 162 | must_keep = 0 163 | kept = 0 164 | 165 | 166 | if (count == 3) and (kept == 0): 167 | must_keep = 1 168 | 169 | if (not must_keep) and (np.random.choice([0,1], p=[epsilon, 1-epsilon])): 170 | kept += 1 171 | dataset.append(*datum) 172 | else: 173 | dataset_removed.append(*datum) 174 | 175 | 176 | dataset.preprocess() 177 | dataset_removed.preprocess() 178 | 179 | print 'Distribution:' 180 | print np.histogram(dataset['x'], bins=np.arange(map_size[0]*map_size[1]+1)-.5)[0].reshape(map_size) 181 | 182 | print 'Distribution:' 183 | print np.histogram(dataset['x_prime'], bins=np.arange(map_size[0]*map_size[1]+1)-.5)[0].reshape(map_size) 184 | 185 | 186 | dataset.set_cost('c') 187 | dataset_removed.set_cost('c') 188 | 189 | # Exact 190 | exact = exact_evaluation.run(policy)[0] 191 | print exact 192 | 193 | # Importance Sampling 194 | # approx_ips, exact_ips, approx_pdis, exact_pdis = ips.run(dataset, policy, policy_old, epsilon, gamma) 195 | 196 | # FQE 197 | 198 | for eps in [1e-3]: 199 | 200 | evaluated = [] 201 | for i in range(1): 202 | evaluated.append(fqe.run(dataset, policy, epochs=5000, epsilon=eps, desc='FQE epsilon %s' % 
np.round(epsilon,2),position_of_holes=position_of_holes, position_of_goals=position_of_goals)) 203 | PrintPolicy(env=env).pprint(fqe.Q_k) 204 | 205 | print evaluated[-1] 206 | 207 | evaluated = np.mean(evaluated) 208 | print evaluated 209 | 210 | print np.mean((fqe.Q_k(dataset['x'], dataset['a']).T - (dataset['cost'] + gamma*fqe.Q_k(dataset['x_prime'], policy(dataset['x_prime']) )[0]*(1-dataset['done'])))**2) 211 | print np.vstack([dataset['x'], dataset['a'], np.round((fqe.Q_k(dataset['x'], dataset['a']).T - (dataset['cost'] + gamma*fqe.Q_k(dataset['x_prime'], policy(dataset['x_prime']) )[0]*(1-dataset['done'])))**2, 2)]).T 212 | if len(dataset_removed['x']) > 0: 213 | print np.mean((fqe.Q_k(dataset_removed['x'], dataset_removed['a']).T - (dataset_removed['cost'] + gamma*fqe.Q_k(dataset_removed['x_prime'], policy(dataset_removed['x_prime']))[0]*(1-dataset_removed['done'])))**2) 214 | 215 | df = pd.DataFrame(np.vstack([dataset['x'], dataset['a'], dataset['x_prime'], dataset['cost'], dataset['done'], np.round(fqe.Q_k(dataset['x'], dataset['a']),3).T, np.around(dataset['cost'] + gamma*fqe.Q_k(dataset['x_prime'], policy(dataset['x_prime'])).T*(1-dataset['done']),2) , (fqe.Q_k(dataset['x'], dataset['a']).T - (dataset['cost'] + gamma*fqe.Q_k(dataset['x_prime'], policy(dataset['x_prime'])).T*(1-dataset['done']) )) ]).T, columns = ['x','a','x_prime','c','done','Q(x,a)', 'Q(x_,pi(x_))', 'diff']) 216 | df_outside = pd.DataFrame(np.vstack([dataset_removed['x'], dataset_removed['a'], dataset_removed['x_prime'], dataset_removed['cost'], dataset_removed['done'], np.round(fqe.Q_k(dataset_removed['x'], dataset_removed['a']),3).T, np.around(dataset_removed['cost'] + gamma*fqe.Q_k(dataset_removed['x_prime'], policy(dataset_removed['x_prime'])).T*(1-dataset_removed['done']),2) , (fqe.Q_k(dataset_removed['x'], dataset_removed['a']).T - (dataset_removed['cost'] + gamma*fqe.Q_k(dataset_removed['x_prime'], policy(dataset_removed['x_prime'])).T*(1-dataset_removed['done']) )) ]).T, columns = ['x','a','x_prime','c','done','Q(x,a)', 'Q(x_,pi(x_))', 'diff']) 217 | print exact, evaluated 218 | 219 | return exact-exact, evaluated-exact 220 | 221 | def cartesian_product(*arrays): 222 | la = len(arrays) 223 | dtype = np.result_type(*arrays) 224 | arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype) 225 | for i, a in enumerate(np.ix_(*arrays)): 226 | arr[...,i] = a 227 | return arr.reshape(-1, la) 228 | 229 | def create_df(array, **kw): 230 | return pd.DataFrame(array, **kw) 231 | 232 | 233 | def custom_plot(x, y, minimum, maximum, **kwargs): 234 | ax = kwargs.pop('ax', plt.gca()) 235 | base, = ax.plot(x, y, **kwargs) 236 | ax.fill_between(x, minimum, maximum, facecolor=base.get_color(), alpha=0.15) 237 | 238 | main(policy_old, policy) 239 | df = pd.read_csv('fqe_quality.csv') 240 | for epsilon, group in df.groupby('epsilon'): 241 | del group['epsilon'] 242 | # group.set_index('num_trajectories').plot() 243 | # import pdb; pdb.set_trace() 244 | means = group.groupby('num_trajectories').mean() 245 | stds = group.groupby('num_trajectories').std() 246 | 247 | 248 | del means['trial_num'] 249 | del stds['trial_num'] 250 | 251 | print '*'*20 252 | print 'Epsilon: %s' % epsilon 253 | print means 254 | print stds 255 | 256 | fig, ax = plt.subplots(1) 257 | colors = ['red', 'green', 'blue'] 258 | for i, col in enumerate(['fqe']): 259 | # import pdb; pdb.set_trace() 260 | 261 | x = np.array(means.index) 262 | mu = np.array(means[col]) 263 | sigma = np.array(stds[col]) 264 | 265 | lower_bound = mu + sigma 266 | upper_bound 
= mu - sigma 267 | 268 | custom_plot(x, mu, lower_bound, upper_bound, marker='o', label=col, color=colors[i]) 269 | 270 | 271 | 272 | # means.plot(yerr=stds) 273 | 274 | # plt.title(epsilon) 275 | ax.legend() 276 | ax.set_title('Probability of exploration: %s' % epsilon) 277 | ax.set_xlabel('Number of trajectories in dataset') 278 | ax.set_ylabel('Policy Evaluation Error') 279 | plt.show() -------------------------------------------------------------------------------- /fqi_seed_2_new.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import deepdish as dd 4 | #from thread_safe import threadsafe_generator 5 | import threading 6 | 7 | import keras 8 | from keras.models import Sequential, Model, load_model, model_from_config 9 | from keras.layers import Dense, Conv2D, Flatten, Input, concatenate, Lambda, MaxPooling2D, Dropout, dot 10 | from keras import optimizers 11 | from keras import initializers 12 | from keras import regularizers 13 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 14 | from keras.callbacks import Callback, TensorBoard 15 | from keras.backend import eval 16 | 17 | from car_racing import ExtendedCarRacing 18 | import itertools 19 | from exact_policy_evaluation import ExactPolicyEvaluator 20 | 21 | from pyvirtualdisplay import Display 22 | display = Display(visible=0, size=(1280, 1024)) 23 | display.start() 24 | 25 | # env = gym.make('CarRacing-v0') 26 | constraint_thresholds = [1., 15.] + [1] 27 | constraints_cared_about = [-1,2] 28 | constraints = [300*.1, 300*.1] + [0,0,0,0,0] 29 | pic_size = (96, 96,3) 30 | num_frame_stack=3 31 | frame_skip=3 32 | gamma=.95 33 | action_space_map = {} 34 | for i, action in enumerate([k for k in itertools.product([-1, 0, 1], [1, 0], [0.2, 0])]): 35 | action_space_map[i] = action 36 | 37 | init_seed = 2 38 | stochastic_env = False # = not deterministic 39 | max_pos_costs = 12 # The maximum allowable positive cost before ending episode early 40 | max_time_spent_in_episode = 2000 41 | env = ExtendedCarRacing(init_seed, stochastic_env, max_pos_costs) 42 | exact_policy_algorithm = ExactPolicyEvaluator(action_space_map, gamma, env=env, frame_skip=frame_skip, num_frame_stack=num_frame_stack, pic_size = pic_size, constraint_thresholds=constraint_thresholds, constraints_cared_about=constraints_cared_about) 43 | env.reset() 44 | 45 | 46 | GPU = 0 47 | SEED = 0 48 | np.random.seed(SEED) 49 | import tensorflow as tf 50 | tf.set_random_seed(SEED) 51 | import random 52 | random.seed(SEED) 53 | session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) 54 | session_conf.gpu_options.allow_growth = True 55 | from keras import backend as K 56 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) 57 | K.set_session(sess) 58 | 59 | LEARNING_RATE = 0.0005 60 | dim_of_actions = 12 61 | input_shape = (96,96,3) 62 | gamma = 0.95 63 | 64 | class threadsafe_iter: 65 | """Takes an iterator/generator and makes it thread-safe by 66 | serializing call to the `next` method of given iterator/generator. 67 | """ 68 | def __init__(self, it): 69 | self.it = it 70 | self.lock = threading.Lock() 71 | 72 | def __iter__(self): 73 | return self 74 | 75 | def next(self): 76 | with self.lock: 77 | return self.it.next() 78 | 79 | 80 | def threadsafe_generator(f): 81 | """A decorator that takes a generator function and makes it thread-safe. 
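    Needed because fit_generator is invoked further down with workers=8 and
    use_multiprocessing=False, so several Keras worker threads pull batches
    from the same generator object concurrently.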
82 | """ 83 | def g(*a, **kw): 84 | return threadsafe_iter(f(*a, **kw)) 85 | return g 86 | 87 | class NN: 88 | def __init__(self, gpu=0): 89 | self.gpu = gpu 90 | rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0) 91 | 92 | def init(): return keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=np.random.randint(2**32)) 93 | with tf.device('/gpu:'+str(self.gpu)): 94 | model = Sequential() 95 | model.add(Conv2D(8, (7,7), strides = 3, activation = 'relu', padding = 'same', input_shape = (96,96,3),kernel_initializer=init(), bias_initializer=init(), kernel_regularizer=regularizers.l2(1e-6))) 96 | model.add(MaxPooling2D()) 97 | #model.add(Dropout(0.25)) 98 | model.add(Conv2D(16,(3,3), strides = 1, activation = 'relu', padding = 'same',kernel_initializer=init(), bias_initializer=init(), kernel_regularizer=regularizers.l2(1e-6))) 99 | model.add(MaxPooling2D()) 100 | #model.add(Dropout(0.25)) 101 | model.add(Flatten()) 102 | model.add(Dense(256, activation='relu',kernel_initializer=init(), bias_initializer=init(), kernel_regularizer=regularizers.l2(1e-6))) 103 | #model.add(Dropout(0.5)) 104 | model.add(Dense(dim_of_actions, name='all_actions', activation="linear",kernel_initializer=init(), bias_initializer=init(), kernel_regularizer=regularizers.l2(1e-6))) 105 | 106 | self.model = model 107 | self.compile() 108 | self.model._make_predict_function() 109 | #self.model.summary() 110 | 111 | def compile(self): 112 | def huber_loss(y_true, y_pred, clip_value): 113 | # Huber loss, see https://en.wikipedia.org/wiki/Huber_loss and 114 | # https://medium.com/@karpathy/yes-you-should-understand-backprop-e2f06eab496b 115 | # for details. 116 | assert clip_value > 0. 117 | 118 | x = y_true - y_pred 119 | if np.isinf(clip_value): 120 | # Spacial case for infinity since Tensorflow does have problems 121 | # if we compare `K.abs(x) < np.inf`. 122 | return .5 * K.square(x) 123 | 124 | condition = K.abs(x) < clip_value 125 | squared_loss = .5 * K.square(x) 126 | linear_loss = clip_value * (K.abs(x) - .5 * clip_value) 127 | if K.backend() == 'tensorflow': 128 | import tensorflow as tf 129 | if hasattr(tf, 'select'): 130 | return tf.select(condition, squared_loss, linear_loss) # condition, true, false 131 | else: 132 | return tf.where(condition, squared_loss, linear_loss) # condition, true, false 133 | elif K.backend() == 'theano': 134 | from theano import tensor as T 135 | return T.switch(condition, squared_loss, linear_loss) 136 | else: 137 | raise RuntimeError('Unknown backend "{}".'.format(K.backend())) 138 | 139 | def mean_pred(y_true, y_pred): 140 | return K.mean(y_pred) 141 | 142 | def min_pred(y_true, y_pred): 143 | return K.min(y_pred) 144 | 145 | def clipped_masked_error(args): 146 | y_true, y_pred, mask = args 147 | loss = huber_loss(y_true, y_pred, 10) 148 | loss *= mask # apply element-wise mask 149 | return K.sum(loss, axis=-1) 150 | # Create trainable model. The problem is that we need to mask the output since we only 151 | # ever want to update the Q values for a certain action. The way we achieve this is by 152 | # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility 153 | # to mask out certain parameters by passing in multiple inputs to the Lambda layer. 
154 | y_pred = self.model.output 155 | y_true = Input(name='y_true', shape=(dim_of_actions,)) 156 | mask = Input(name='mask', shape=(dim_of_actions,)) 157 | loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='huber')([y_pred, y_true, mask]) 158 | #predicted_value = Lambda(value_pred, output_shape=(1,), name='predicted_value')([y_pred, mask]) 159 | #ins = [self.model.input] if type(self.model.input) is not list else self.model.input 160 | ins = self.model.input 161 | #trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred]) 162 | trainable_model = Model(inputs=[ins,y_true, mask], outputs=[loss_out, y_pred]) 163 | assert len(trainable_model.output_names) == 2 164 | #combined_metrics = {trainable_model.output_names[1]: metrics} 165 | losses = [ 166 | lambda y_true, y_pred: y_pred, # loss is computed in Lambda layer 167 | lambda y_true, y_pred: K.zeros_like(y_pred), # we only include this for the metrics 168 | ] 169 | #trainable_model.compile(optimizer=optimizer, loss=losses, metrics=combined_metrics) 170 | rmsProp = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.95, epsilon=1e-08, decay=0.0) 171 | #opt = optimizers.Adam(lr=0.0001, clipnorm = 10) 172 | #trainable_model.compile(optimizer=rmsProp, loss=losses) 173 | trainable_model.compile(optimizer=rmsProp, loss=losses, metrics = [min_pred]) 174 | #trainable_model.compile(optimizer='adam', loss=losses, metrics = [min_pred]) 175 | self.trainable_model = trainable_model 176 | #print self.trainable_model.summary() 177 | #print self.trainable_model.metrics_names 178 | #time.sleep(5) 179 | 180 | self.compiled = True 181 | 182 | def saveWeight(self): 183 | self.model.save_weights('fqi_model.h5') 184 | 185 | def loadWeight(self): 186 | #path = 'weight/' 187 | self.model.load_weights('fqi_model.h5') 188 | self.model.reset_states() 189 | 190 | def clear_memory(self): 191 | del self.model 192 | 193 | 194 | @threadsafe_generator 195 | def data_generator(indices, fixed_permutation=False, batch_size = 64): 196 | #data_length = len(dataset['done']) - 1 ## Maybe throw out the very last data point to avoid out of range index error 197 | data_length = len(indices) 198 | number_of_batches = int(np.floor(data_length/float(batch_size))) 199 | #random_permutation = np.random.permutation(np.arange(data_length)) 200 | random_permutation = np.random.permutation(indices) 201 | i= -1 202 | while True: 203 | i = (i+1) % number_of_batches 204 | idxs = random_permutation[(i*batch_size):((i+1)*batch_size)] 205 | 206 | #print idxs 207 | x = np.rollaxis(dataset['frames'][dataset['prev_states'][idxs]],1,4) 208 | a = dataset['a'][idxs] ## need to make it 2d? 209 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][idxs]],1,4) 210 | c = dataset['c'][idxs] ## scaling the cost back? 
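            # 'c' is the scalar task cost, already rescaled (divided by 20.3) where the
            # dataset dict is assembled below; 'g' carries the constraint costs, which are
            # loaded here but never enter the regression target built in this generator.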
211 | g = dataset['g'][idxs] 212 | dones = dataset['done'][idxs] 213 | 214 | target_q_values = Q_k_minus_1.model.predict(x_prime) 215 | assert target_q_values.shape == (batch_size, dim_of_actions) 216 | q_batch = np.min(target_q_values, axis=1) ## we're minimizing cost 217 | assert q_batch.shape == (batch_size,) 218 | 219 | targets = np.zeros((batch_size, dim_of_actions)) 220 | dummy_targets = np.zeros((batch_size,)) 221 | masks = np.zeros((batch_size, dim_of_actions)) 222 | 223 | discounted_q_batch = gamma * q_batch 224 | terminalBatch = np.array([1-float(done) for done in dones]) 225 | assert terminalBatch.shape == (batch_size,) 226 | discounted_q_batch *= terminalBatch 227 | assert c.shape == discounted_q_batch.shape 228 | cost_to_go_batch = c + discounted_q_batch 229 | 230 | for idx, (target, mask, value, action) in enumerate(zip(targets, masks, cost_to_go_batch, a)): 231 | target[action] = value # update action with estimated accumulated reward 232 | dummy_targets[idx] = value 233 | mask[action] = 1. # enable loss for this specific action 234 | 235 | assert x.shape == (batch_size, 96,96,3) 236 | assert targets.shape == (batch_size, 12) 237 | #assert sum(masks) == batch_size 238 | 239 | yield ([x, targets, masks], [dummy_targets, targets]) 240 | 241 | @threadsafe_generator 242 | def validation_generator(indices, fixed_permutation=False, batch_size = 64): 243 | #data_length = len(dataset['done']) - 1 ## Maybe throw out the very last data point to avoid out of range index error 244 | data_length = len(indices) 245 | number_of_batches = int(np.floor(data_length/float(batch_size))) 246 | #random_permutation = np.random.permutation(np.arange(data_length)) 247 | random_permutation = np.random.permutation(indices) 248 | i= -1 249 | while True: 250 | i = (i+1) % number_of_batches 251 | idxs = random_permutation[(i*batch_size):((i+1)*batch_size)] 252 | 253 | #print idxs 254 | x = np.rollaxis(dataset['frames'][dataset['prev_states'][idxs]],1,4) 255 | a = dataset['a'][idxs] ## need to make it 2d? 256 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][idxs]],1,4) 257 | c = dataset['c'][idxs]## scaling the cost back? 258 | g = dataset['g'][idxs] 259 | dones = dataset['done'][idxs] 260 | 261 | target_q_values = Q_k_minus_1.model.predict(x_prime) 262 | assert target_q_values.shape == (batch_size, dim_of_actions) 263 | q_batch = np.min(target_q_values, axis=1) ## we're minimizing cost 264 | assert q_batch.shape == (batch_size,) 265 | 266 | targets = np.zeros((batch_size, dim_of_actions)) 267 | dummy_targets = np.zeros((batch_size,)) 268 | masks = np.zeros((batch_size, dim_of_actions)) 269 | 270 | discounted_q_batch = gamma * q_batch 271 | terminalBatch = np.array([1-float(done) for done in dones]) 272 | assert terminalBatch.shape == (batch_size,) 273 | discounted_q_batch *= terminalBatch 274 | assert c.shape == discounted_q_batch.shape 275 | cost_to_go_batch = c + discounted_q_batch 276 | 277 | for idx, (target, mask, value, action) in enumerate(zip(targets, masks, cost_to_go_batch, a)): 278 | target[action] = value # update action with estimated accumulated reward 279 | dummy_targets[idx] = value 280 | mask[action] = 1. # enable loss for this specific action 281 | 282 | assert x.shape == (batch_size, 96,96,3) 283 | assert targets.shape == (batch_size, 12) 284 | #assert sum(masks) == batch_size 285 | 286 | yield ([x, targets, masks], [dummy_targets, targets]) 287 | 288 | def clone_model(model, custom_objects={}): 289 | # Requires Keras 1.0.7 since get_config has breaking changes. 
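    # Serialize the architecture to a config dict, rebuild it, and copy the weights
    # across, so the clone shares no variables with `model`. The training loop below
    # ends up syncing the target network via set_weights() directly, so this helper
    # stays unused in this script.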
290 | config = { 291 | 'class_name': model.__class__.__name__, 292 | 'config': model.get_config(), 293 | } 294 | clone = model_from_config(config, custom_objects=custom_objects) 295 | clone._make_predict_function() 296 | clone.set_weights(model.get_weights()) 297 | return clone 298 | 299 | def weight_change_norm(model, target_model): 300 | norm_list = [] 301 | number_of_layers = len(model.layers) 302 | for i in range(number_of_layers): 303 | model_matrix = model.layers[i].get_weights() 304 | target_model_matrix = target_model.layers[i].get_weights() 305 | if len(model_matrix) >0: 306 | #print "layer ", i, " has shape ", model_matrix[0].shape 307 | if model_matrix[0].shape[0] > 0: 308 | norm_change = np.linalg.norm(model_matrix[0]-target_model_matrix[0]) 309 | norm_list.append(norm_change) 310 | return sum(norm_list)*1.0/len(norm_list) 311 | 312 | 313 | class LossHistory(keras.callbacks.Callback): 314 | def on_train_begin(self, logs={}): 315 | self.losses = [] 316 | 317 | def on_batch_end(self, batch, logs={}): 318 | self.losses.append(logs.get('loss')) 319 | 320 | 321 | action_data = dd.io.load('./seed_2/car_data_actions_seed_2.h5') 322 | frame_data = dd.io.load('./seed_2/car_data_frames_seed_2.h5') 323 | done_data = dd.io.load('./seed_2/car_data_is_done_seed_2.h5') 324 | next_state_data = dd.io.load('./seed_2/car_data_next_states_seed_2.h5') 325 | current_state_data = dd.io.load('./seed_2/car_data_prev_states_seed_2.h5') 326 | cost_data = dd.io.load('./seed_2/car_data_rewards_seed_2.h5') 327 | 328 | frame_gray_scale = np.zeros((len(frame_data),96,96)).astype('float32') 329 | for i in range(len(frame_data)): 330 | frame_gray_scale[i,:,:] = np.dot(frame_data[i,:,:,:]/255. , [0.299, 0.587, 0.114]) 331 | 332 | dataset = {'frames':frame_gray_scale, 333 | 'prev_states': current_state_data, 334 | 'next_states': next_state_data, 335 | 'a': action_data, 336 | 'c':cost_data[:,0]/20.3, ## Divide by the largest one 337 | 'g':cost_data[:,1:], 338 | 'done': done_data 339 | } 340 | 341 | 342 | ### Load data set 343 | #dataset = dd.io.load('car_racing_data.h5') 344 | data_length = len(frame_data)-1 345 | ### Start training 346 | 347 | 348 | Q_k_minus_1 = NN(gpu = GPU) ## This is the target network, initialize it with something 349 | Q_k = NN(gpu=GPU) ### Initialize the value network with something 350 | #Q_k_minus_1.loadWeight() ### cheat: loading in DQN weights 351 | ## Form the data set? 
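# Illustrative sketch only (a hypothetical helper, not called anywhere in this
# script): it restates the masked-target construction that data_generator /
# validation_generator perform inline, so the Bellman backup can be unit-tested
# in isolation. `predict_fn` stands in for Q_k_minus_1.model.predict.
import numpy as np  # already imported at the top of the file; repeated so the sketch is copy-paste runnable

def fqi_targets_sketch(predict_fn, x_prime, c, a, dones, n_actions=dim_of_actions, discount=gamma):
    """Return (targets, masks, backup) for the masked Huber regression used by the generators above."""
    q_next = np.min(predict_fn(x_prime), axis=1)                    # min over actions: costs are minimized
    backup = c + discount * q_next * (1. - np.asarray(dones, dtype='float64'))
    targets = np.zeros((len(a), n_actions))
    masks = np.zeros((len(a), n_actions))
    targets[np.arange(len(a)), np.asarray(a)] = backup              # only the taken action gets a target
    masks[np.arange(len(a)), np.asarray(a)] = 1.                    # and only its loss is enabled
    return targets, masks, backup
# e.g. (hypothetical batch names): targets, masks, _ = fqi_targets_sketch(
#     Q_k_minus_1.model.predict, x_prime_batch, c_batch, a_batch, done_batch)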
352 | 353 | number_of_iter = 100 354 | batch_size = 32 355 | epochs_per_iter = 1 ## per_iter 356 | #steps_per_epoch = data_length / batch_size 357 | 358 | #mcp_save = ModelCheckpoint('fqi_test_model.hdf5', save_best_only=False, mode='auto', period=1) 359 | 360 | #reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, epsilon=1e-4, mode='min') 361 | history = LossHistory() 362 | iteration_losses = [] 363 | stop_training = False 364 | lr_counter = 0 365 | train_iter = 0 366 | #while not stop_training: 367 | for iteration in range(number_of_iter): 368 | print "------------" 369 | print "Iteration: ", train_iter 370 | lr = eval(Q_k.trainable_model.optimizer.lr) 371 | print "Current learning rate: ", lr 372 | lr_counter += 1 373 | ## training validation split 374 | indices = np.random.permutation(np.arange(data_length)) 375 | cutoff = int(1*data_length) 376 | train_idx = indices[:cutoff] 377 | valid_idx = indices[cutoff:] 378 | steps_per_epoch = len(train_idx) / batch_size 379 | valid_steps = len(valid_idx) / batch_size 380 | #gen = data_generator(dataset, fixed_permutation=False, batch_size=batch_size) 381 | gen = data_generator(train_idx, fixed_permutation=False, batch_size=batch_size) 382 | #valid_gen = validation_generator(valid_idx, fixed_permutation=False, batch_size=batch_size) 383 | 384 | #mcp_save = ModelCheckpoint('FQI_models/fqi_model_1epoch_gamma095_lr00025_'+str(iteration)+'.hdf5', save_best_only=True, monitor='val_loss', mode='min') 385 | #Q_k.trainable_model.fit_generator(gen, epochs=epochs_per_iter, steps_per_epoch=steps_per_epoch, max_queue_size=10, workers=8, use_multiprocessing=False, verbose=1, validation_data = valid_gen, validation_steps = valid_steps, callbacks=[history]) 386 | Q_k.trainable_model.fit_generator(gen, epochs=epochs_per_iter, steps_per_epoch=steps_per_epoch, max_queue_size=10, workers=8, use_multiprocessing=False, verbose=1, callbacks=[history]) 387 | iter_loss = sum(history.losses) *1.0/ len(history.losses) 388 | #print "This iteration loss: ", iter_loss 389 | iteration_losses.append(iter_loss) 390 | """ 391 | if len(iteration_losses) > 5 and iteration_losses[-1]>max(iteration_losses[-6:-1]) and lr_counter >=5: 392 | if lr > 0.0001: 393 | lr = max(0.0001, lr*0.5) 394 | K.set_value(Q_k.trainable_model.optimizer.lr,lr) 395 | lr_counter = 0 396 | else: 397 | stop_training = True 398 | """ 399 | #Q_k.trainable_model.fit_generator(gen, epochs=epochs_per_iter, steps_per_epoch=steps_per_epoch, max_queue_size=10, workers=3, use_multiprocessing=False, verbose=0, validation_data = valid_gen, validation_steps = valid_steps) 400 | #Q_k_minus_1.model = clone_model(Q_k.model) 401 | ## Test weight change in last layer 402 | old_matrix = Q_k_minus_1.model.layers[-1].get_weights() 403 | new_matrix = Q_k.model.layers[-1].get_weights() 404 | #print "dimension of weight layer ", new_matrix[0].shape 405 | #print "Norm of weight change is ", np.linalg.norm(new_matrix[0]-old_matrix[0]) 406 | print "Norm of weight change is ", weight_change_norm(Q_k.model, Q_k_minus_1.model) 407 | print 408 | print exact_policy_algorithm.run(Q_k) 409 | Q_k_minus_1.model.set_weights(Q_k.model.get_weights()) 410 | Q_k.model.save('FQI_models/fqi_model_1epoch_gamma095_lr0005_fixed_'+str(train_iter)+'.hdf5') 411 | train_iter += 1 412 | 413 | #Q_k.compile() ## reset optimizer state 414 | #Q_k.model.reset_states() 415 | #Q_k.trainable_model.reset_states() 416 | 417 | 418 | ### Copying model of Q_k over to Q_k_minus_1 before repeating 419 | 
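# Optional post-run diagnostic sketch (assumes matplotlib is installed; it is
# not imported elsewhere in this script): plot the mean per-iteration training
# loss accumulated in iteration_losses above. The Agg backend keeps the call
# headless-safe, consistent with the pyvirtualdisplay setup at the top.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

plt.figure()
plt.plot(range(len(iteration_losses)), iteration_losses, marker='o')
plt.xlabel('FQI iteration')
plt.ylabel('Mean masked Huber loss')
plt.savefig('fqi_iteration_losses.png')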
-------------------------------------------------------------------------------- /frozen_lake.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from gym.envs.registration import register 4 | from gym.envs.toy_text import FrozenLakeEnv 5 | 6 | 7 | class ExtendedFrozenLake(FrozenLakeEnv): 8 | def __init__(self, early_termination, desc=None, map_name="4x4",is_slippery=True): 9 | super(ExtendedFrozenLake, self).__init__(desc=desc, map_name=map_name, is_slippery=is_slippery) 10 | self.deterministic = True 11 | self.max_time_steps = early_termination 12 | self.min_cost = -1. #set by env 13 | self.env_type = 'lake' 14 | 15 | def is_early_episode_termination(self, cost=None, time_steps=None, total_cost=None): 16 | if time_steps > self.max_time_steps: 17 | return True, 0. 18 | else: 19 | return False, 0. 20 | 21 | def step(self, a): 22 | transitions = self.P[self.s][a] 23 | i = self.categorical_sample([t[0] for t in transitions], self.np_random) 24 | p, s, r, d= transitions[i] 25 | self.s = s 26 | self.lastaction=a 27 | 28 | c = -r 29 | g = [int(d and not r)] 30 | return (s, (c,g), d, {"prob" : p}) 31 | 32 | @staticmethod 33 | def categorical_sample(prob_n, np_random): 34 | """ 35 | Sample from categorical distribution 36 | Each row specifies class probabilities 37 | """ 38 | prob_n = np.asarray(prob_n) 39 | csprob_n = np.cumsum(prob_n) 40 | return (csprob_n > np_random.rand()).argmax() 41 | -------------------------------------------------------------------------------- /inverse_propensity_scoring.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from fitted_algo import FittedAlgo 5 | from mdp_approximator import MDPApproximator 6 | from model import Model 7 | import numpy as np 8 | from tqdm import tqdm 9 | import scipy.signal as signal 10 | 11 | class InversePropensityScorer(object): 12 | def __init__(self, env, state_space_dim, action_space_dim, grid_shape): 13 | ''' 14 | An implementation of fitted Q iteration 15 | 16 | num_inputs: number of inputs 17 | dim_of_actions: dimension of action space 18 | max_epochs: positive int, specifies how many iterations to run the algorithm 19 | gamma: discount factor 20 | ''' 21 | self.env = env 22 | self.action_space_dim = action_space_dim 23 | self.state_space_dim = state_space_dim 24 | self.grid_shape = grid_shape 25 | # self.initial_states = initial_states 26 | 27 | def run(self, *args, **kw): 28 | ''' 29 | V^pi(s) = sum_{i = 1}^n p(h_j| pi_new, s_0 = s)/p(h_j| pi_old, s_0 = s) H(h_j) 30 | h = (s_1, a_1, r_1, s_2, ...) 31 | p(h_j | pi, s) = pi(a_0 | s_0)p(r_0 | s_0, a_0)p(s_1 | s_0, a_0)pi(a_1 |s_1) ... 32 | = prod_j pi(a_j | x_j)p(r_j | x_j, a_j)p(s_{j+1} | x_j, a_j) 33 | deterministic = prod_j pi(a_j | x_j) * 1 * 1 34 | = prod_j pi(a_j | x_j) 35 | H(h_j) = r_0 + gamma * r_1 + gamma^2 r_2 + ... 
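        The estimators below average over the n logged trajectories, i.e.
        V^pi ~ (1/n) sum_j [prod_t pi_new(a_t | x_t) / pi_old(a_t | x_t)] H(h_j)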
36 | 37 | ''' 38 | 39 | 40 | approx_ips = self.approx_ips(*args) 41 | exact_ips = self.exact_ips(*args) 42 | approx_pdis = self.approx_pdis(*args) 43 | exact_pdis = self.exact_pdis(*args) 44 | dr, wdr, am = self.doubly_robust_approx(*args, **kw) 45 | 46 | 47 | return approx_ips, exact_ips, approx_pdis, exact_pdis, dr, wdr, am 48 | 49 | def approx_pdis(self, dataset, pi_new, pi_old, epsilon, gamma): 50 | ''' 51 | Per decision importance sampling 52 | 53 | sum_{t=1}^{max L} gamma^t 1/n sum_{i=1}^n (PI_{tau=1}^t p_new/p_old) R^i_t 54 | ''' 55 | 56 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 57 | 58 | # approx IPS, pi_old_a_given_x is approximated by the dataset 59 | actions = np.eye(self.action_space_dim)[dataset['a']] 60 | unique_states_seen = np.unique(dataset['x']) 61 | probabilities = [np.mean(actions[dataset['x'] == x], axis=0) for x in unique_states_seen] 62 | 63 | prob = {} 64 | for idx, state in enumerate(unique_states_seen): 65 | prob[state] = probabilities[idx] 66 | 67 | pi_old_a_given_x = [[ prob[x][a] for x,a in zip(episode['x'],episode['a']) ] for episode in dataset.episodes] 68 | 69 | pi_new_cumprod = np.array([np.pad(np.cumprod(x), (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,0)) for x in pi_new_a_given_x]) 70 | pi_old_cumprod = np.array([np.pad(np.cumprod(x), (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,1)) for x in pi_old_a_given_x]) 71 | costs = [episode['cost'] for episode in dataset.episodes] 72 | costs = np.array([np.pad(x, (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,0)) for x in costs]) 73 | 74 | return self.discounted_sum(np.mean(pi_new_cumprod / pi_old_cumprod * costs, axis=0), gamma) 75 | 76 | # pi_new_cumprod = [np.cumprod(x) for x in pi_new_a_given_x] 77 | # pi_old_cumprod = [np.cumprod(x) for x in pi_old_a_given_x] 78 | # costs = [episode['cost'] for episode in dataset.episodes] 79 | 80 | # per_decision = [] 81 | # for i in range(dataset.get_max_trajectory_length()): 82 | # s = 0 83 | # count = 0 84 | # for trajectory in range(len(costs)): 85 | # try: 86 | # s += pi_new_cumprod[trajectory][i] / pi_old_cumprod[trajectory][i] * costs[trajectory][i] 87 | # count += 1 88 | # except: 89 | # pass 90 | # per_decision.append(s/float(count)) 91 | 92 | 93 | # return self.discounted_sum(per_decision, gamma) 94 | 95 | def exact_pdis(self, dataset, pi_new, pi_old, epsilon, gamma): 96 | ''' 97 | Per decision importance sampling 98 | 99 | sum_{t=1}^{max L} gamma^t 1/n sum_{i=1}^n (PI_{tau=1}^t p_new/p_old) R^i_t 100 | ''' 101 | 102 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 103 | pi_old_a_given_x = [(pi_old(episode['x']) == episode['a'])*(1-epsilon) + (1./self.action_space_dim)*epsilon for episode in dataset.episodes] 104 | 105 | pi_new_cumprod = np.array([np.pad(np.cumprod(x), (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,0)) for x in pi_new_a_given_x]) 106 | pi_old_cumprod = np.array([np.pad(np.cumprod(x), (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,1)) for x in pi_old_a_given_x]) 107 | costs = [episode['cost'] for episode in dataset.episodes] 108 | costs = np.array([np.pad(x, (0,dataset.get_max_trajectory_length()-len(x)), 'constant', constant_values=(0,0)) for x in costs]) 109 | 110 | return self.discounted_sum(np.mean(pi_new_cumprod / pi_old_cumprod * costs, axis=0), gamma) 111 | 
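        # Note on the padding above: beyond each episode's length the new-policy
        # cumulative product is padded with 0 and the old-policy one with 1, so padded
        # timesteps contribute (0/1)*0 = 0 to the per-decision average instead of a
        # division by zero. The commented block below computes the same per-decision
        # sum with an explicit loop.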
112 | # pi_new_cumprod = [np.cumprod(x) for x in pi_new_a_given_x] 113 | # pi_old_cumprod = [np.cumprod(x) for x in pi_old_a_given_x] 114 | # costs = [episode['cost'] for episode in dataset.episodes] 115 | 116 | # per_decision = [] 117 | # for t in range(dataset.get_max_trajectory_length()): 118 | # s = 0 119 | # count = 0 120 | # for trajectory in range(len(costs)): 121 | # try: 122 | # s += pi_new_cumprod[trajectory][t] / pi_old_cumprod[trajectory][t] * costs[trajectory][t] 123 | # count += 1 124 | # except: 125 | # pass 126 | # per_decision.append(s/count) 127 | 128 | # return self.discounted_sum(per_decision, gamma) 129 | 130 | def approx_ips(self, dataset, pi_new, pi_old, epsilon, gamma): 131 | ''' 132 | Inverse propensity scoring (Importance sampling) 133 | ''' 134 | H_h_j = [self.discounted_sum(episode['cost'], gamma) for episode in dataset.episodes] 135 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 136 | 137 | # approx IPS, pi_old_a_given_x is approximated by the dataset 138 | actions = np.eye(self.action_space_dim)[dataset['a']] 139 | unique_states_seen = np.unique(dataset['x']) 140 | probabilities = [np.mean(actions[dataset['x'] == x], axis=0) for x in unique_states_seen] 141 | 142 | prob = {} 143 | for idx, state in enumerate(unique_states_seen): 144 | prob[state] = probabilities[idx] 145 | 146 | pi_old_a_given_x = [[ prob[x][a] for x,a in zip(episode['x'],episode['a'])] for episode in dataset.episodes] 147 | 148 | approx_ips= 0 149 | for i in range(len(H_h_j)): 150 | prob_new = np.prod(pi_new_a_given_x[i]) 151 | prob_old = np.prod(pi_old_a_given_x[i]) 152 | if (prob_new > 0) and (prob_old == 0): 153 | return np.inf 154 | approx_ips += prob_new/prob_old * H_h_j[i] 155 | 156 | if np.isnan(approx_ips): 157 | approx_ips = np.inf 158 | else: 159 | approx_ips /= len(H_h_j) 160 | 161 | return approx_ips 162 | 163 | 164 | def exact_ips(self, dataset, pi_new, pi_old, epsilon, gamma): 165 | H_h_j = [self.discounted_sum(episode['cost'], gamma) for episode in dataset.episodes] 166 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 167 | 168 | # exact IPS. 
If you know pi_old, can calculate exactly 169 | pi_old_a_given_x = [(pi_old(episode['x']) == episode['a'])*(1-epsilon) + (1./self.action_space_dim)*epsilon for episode in dataset.episodes] 170 | 171 | exact_ips = 0 172 | for i in range(len(H_h_j)): 173 | prob_new = np.prod(pi_new_a_given_x[i]) 174 | prob_old = np.prod(pi_old_a_given_x[i]) 175 | if (prob_new > 0) and (prob_old == 0): 176 | return np.inf 177 | exact_ips += prob_new/prob_old * H_h_j[i] 178 | 179 | 180 | if np.isnan(exact_ips): 181 | exact_ips = np.inf 182 | else: 183 | exact_ips /= len(H_h_j) 184 | 185 | return exact_ips 186 | 187 | def doubly_robust_approx(self, dataset, pi_new, pi_old, epsilon, gamma, MDP_approximator=None): 188 | ''' 189 | sum_{i=0}^n sum_{t=0}^\infty gamma^t w_t^i R_t^{H_i} - 190 | sum_{i=0}^n sum_{t=0}^\infty gamma^t (w_t^i \hat{Q}(S^{H_i}_t,A^{H_i}_t) - w_{t-1}^i \hat{V}(S^{H_i}_t,A^{H_i}_{t-1}) 191 | 192 | w_t^i = rho_t^i / n = 1/n * prod_{n=0}^t pi_new(a_n|x_n) / pi_old(a_n|x_n) 193 | 194 | ''' 195 | if MDP_approximator is None: 196 | mdp = MDPApproximator(self.env, self.state_space_dim + self.action_space_dim, self.grid_shape, self.action_space_dim, 500, gamma) 197 | else: 198 | mdp = MDP_approximator 199 | 200 | mdp.run(dataset) 201 | 202 | actions = np.eye(self.action_space_dim)[dataset['a']] 203 | unique_states_seen = np.unique(dataset['x']) 204 | probabilities = [np.mean(actions[dataset['x'] == x], axis=0) for x in unique_states_seen] 205 | 206 | prob = {} 207 | for idx, state in enumerate(unique_states_seen): 208 | prob[state] = probabilities[idx] 209 | 210 | 211 | pi_new_a_given_x = [(pi_new(episode['x']) == episode['a']).astype(float) for episode in dataset.episodes] 212 | pi_old_a_given_x = [[ prob[x][a] for x,a in zip(episode['x'],episode['a'])] for episode in dataset.episodes] 213 | pi_new_cumprod = [np.cumprod(x) for x in pi_new_a_given_x] 214 | pi_old_cumprod = [np.cumprod(x) for x in pi_old_a_given_x] 215 | w_t = [pi_new_cumprod[i]/pi_old_cumprod[i] for i in range(len(pi_new_cumprod))] 216 | def sum_arrays(x,y): 217 | max_len = max(len(x), len(y)) 218 | x = np.pad(x, (0,max_len-len(x)), mode='constant', constant_values=0) 219 | y = np.pad(y, (0,max_len-len(y)), mode='constant', constant_values=0) 220 | return x+y 221 | 222 | norms = reduce(lambda x,y,s_a=sum_arrays: s_a(x,y), w_t) 223 | how_many_non_zero = np.sum(norms>0) 224 | 225 | drs = [] 226 | wdrs = [] 227 | Q_hat = {} 228 | V_hat = {} 229 | 230 | print mdp.V(pi_new, 0) 231 | for idx, episode in enumerate(dataset.episodes): 232 | cost = w_t[idx]*episode['cost'] 233 | first_term = self.discounted_sum(cost, gamma) 234 | 235 | Q_hats = [] 236 | V_hats = [] 237 | for x,a in zip(episode['x'], episode['a']): 238 | if tuple([x,a]) not in Q_hat: 239 | Q_ = mdp.Q(pi_new, x, a) 240 | Q_hat[tuple([x,a])] = Q_ 241 | if tuple([x]) not in V_hat: 242 | V_ = mdp.V(pi_new, x) 243 | V_hat[tuple([x])] = V_ 244 | 245 | Q_hats.append(Q_hat[tuple([x,a])]) 246 | V_hats.append(V_hat[tuple([x])]) 247 | 248 | # DR 249 | w_t_minus_1 = np.hstack([1, w_t[idx][:-1]]) 250 | cost = w_t[idx]*Q_hats - w_t_minus_1*V_hats 251 | second_term = self.discounted_sum(cost, gamma) 252 | drs.append(first_term - second_term) 253 | 254 | #WDR 255 | #normalize w_t 256 | how_many = min(len(w_t[idx]), how_many_non_zero) 257 | w_t_ = w_t[idx][:how_many] / np.array(norms[:how_many]) 258 | w_t_ = np.hstack([w_t_, np.zeros(len(w_t[idx])-how_many) ]) 259 | cost = w_t_*episode['cost'] 260 | first_term = self.discounted_sum(cost, gamma) 261 | 262 | w_t_minus_1 = np.hstack([1./len(w_t), 
w_t_[:-1]]) 263 | cost = w_t_*Q_hats - w_t_minus_1*V_hats 264 | second_term = self.discounted_sum(cost, gamma) 265 | wdrs.append(first_term - second_term) 266 | 267 | if tuple([0]) not in V_hat: 268 | AM = mdp.V(pi_new, 0) 269 | else: 270 | AM = V_hat[tuple([0])] 271 | 272 | return np.mean(drs), np.sum(wdrs), AM 273 | 274 | @staticmethod 275 | def discounted_sum(costs, discount): 276 | ''' 277 | Calculate discounted sum of costs 278 | ''' 279 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 280 | return y[::-1][0] 281 | -------------------------------------------------------------------------------- /lake_primal_dual_gap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/lake_primal_dual_gap.png -------------------------------------------------------------------------------- /lake_values.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/lake_values.png -------------------------------------------------------------------------------- /lake_values_wo_band.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/lake_values_wo_band.png -------------------------------------------------------------------------------- /layer_visualizer.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | import matplotlib.pyplot as plt 3 | 4 | class LayerVisualizer(object): 5 | def __init__(self, model): 6 | 7 | self.layer_outputs = [layer.output for layer in model.layers if layer.name not in ['mask', 'inp']] 8 | self.activation_model = Model(inputs=model.input, outputs=self.layer_outputs) 9 | 10 | def display_activation(self, datum, col_size, row_size, act_index): 11 | activation = self.activation_model.predict(datum)[act_index] 12 | activation_index=0 13 | fig, ax = plt.subplots(row_size, col_size, figsize=(row_size*2.5,col_size*1.5)) 14 | for row in range(0,row_size): 15 | for col in range(0,col_size): 16 | ax[row][col].imshow(activation[0, :, :, activation_index], cmap='gray') 17 | activation_index += 1 18 | 19 | plt.show() -------------------------------------------------------------------------------- /mdp_approximator.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import keras 4 | from keras.models import Sequential, Model as KerasModel 5 | from keras.layers import Input, Dense, Flatten, concatenate, dot, MaxPooling2D 6 | from keras.losses import mean_squared_error 7 | import scipy.signal as signal 8 | from env_nn import LakeNN 9 | from keras import optimizers 10 | 11 | import gym 12 | 13 | 14 | 15 | class MDPApproximator(LakeNN): 16 | def __init__(self, env, *args, **kw): 17 | ''' 18 | Approximate P(s'| s,a) 19 | ''' 20 | self.env = env 21 | 22 | self.model_type = kw['model_type'] if 'model_type' in kw else 'mlp' 23 | self.gamma = .9 24 | super(MDPApproximator, self).__init__(68, 1, [8,8], 4, self.gamma, convergence_of_model_epsilon=1e-10, model_type='mlp', num_frame_stack=(1,), frame_skip=1, pic_size = (1,)) 25 | self.create_model(68,1) 26 | 27 | def create_model(self, num_inputs, num_outputs): 28 | if 
self.model_type == 'mlp': 29 | model = Sequential() 30 | def init(): return keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=np.random.randint(2**32)) 31 | model.add(Dense(64, activation='tanh', input_shape=(num_inputs,),kernel_initializer=init(), bias_initializer=init())) 32 | model.add(Dense(num_outputs, activation='linear',kernel_initializer=init(), bias_initializer=init())) 33 | model.compile(loss='mean_squared_error', optimizer='Adam', metrics=['accuracy']) 34 | self.model = model 35 | else: 36 | self.model = super(MDPApproximator, self).create_model(num_inputs, num_outputs) 37 | 38 | def run(self, dataset): 39 | ''' 40 | probability of 41 | transitioning from s to s' 42 | given action a is the number of 43 | times this transition was observed divided by the number 44 | of times action a was taken in state s. If D contains no examples 45 | of action a being taken in state s, then we assume 46 | that taking action a in state s always causes a transition to 47 | the terminal absorbing state. 48 | 49 | Since everything is deterministic then P(s'|s,a) = 0 or 1. 50 | ''' 51 | transitions = np.vstack([dataset['x'],dataset['a'],dataset['x_prime']]).T 52 | unique, idx, count = np.unique(transitions, return_index=True, return_counts=True, axis=0) 53 | 54 | partial_transitions = np.vstack([dataset['x'],dataset['a']]).T 55 | unique_a_given_x, idx_a_given_x, count_a_given_x = np.unique(partial_transitions, return_index=True, return_counts=True, axis=0) 56 | 57 | # key=(state, action). value= number of times a was taking in state 58 | all_counts_a_given_x = {tuple(key):value for key,value in zip(unique_a_given_x,count_a_given_x)} 59 | 60 | prob = {} 61 | for idx,row in enumerate(unique): 62 | if tuple(row[:-1]) in prob: 63 | prob[tuple(row[:-1])][row[-1]] = count[idx] / all_counts_a_given_x[(row[0],row[1])] 64 | else: 65 | prob[tuple(row[:-1])] = {} 66 | prob[tuple(row[:-1])][row[-1]] = count[idx] / all_counts_a_given_x[(row[0],row[1])] 67 | 68 | all_transitions = np.vstack([dataset['x'],dataset['a'],dataset['x_prime'], dataset['done']]).T 69 | self.terminal_transitions = {tuple([x,a,x_prime]):1 for x,a,x_prime in all_transitions[all_transitions[:,-1] == True][:,:-1]} 70 | 71 | # Actually fitting R, not Q_k 72 | self.Q_k = self.model #init_Q(model_type=self.model_type) 73 | X_a = np.array(zip(dataset['x'],dataset['a']))#dataset['state_action'] 74 | x_prime = dataset['x_prime'] 75 | index_of_skim = self.skim(X_a, x_prime) 76 | self.fit(X_a[index_of_skim], dataset['cost'][index_of_skim], batch_size=len(index_of_skim), verbose=0, epochs=1000) 77 | self.reward = self 78 | self.P = prob 79 | 80 | def skim(self, X_a, x_prime): 81 | full_set = np.hstack([X_a, x_prime.reshape(1,-1).T]) 82 | idxs = np.unique(full_set, axis=0, return_index=True)[1] 83 | return idxs 84 | 85 | def R(self, *args): 86 | # Exact R 87 | mapping = {0:[0,-1], 1:[1,0], 2:[0,1], 3:[-1,0]} 88 | x = args[0] 89 | x, y = np.where(np.arange(np.prod(self.env.desc.shape)).reshape(self.env.desc.shape) == x) 90 | x,y = x[0], y[0] 91 | delta_x,delta_y = mapping[args[1][0]] 92 | new_x = x + delta_x 93 | new_y = y + delta_y 94 | new_x,new_y = (new_x,new_y) if (0 <= new_x < self.env.desc.shape[0] and 0 <= new_y < self.env.desc.shape[1]) else (x,y) 95 | return [[1]] if self.env.desc[new_x,new_y]=='H' else [[0]] 96 | 97 | # Approximated Rewards 98 | # return self.reward(*args) 99 | 100 | def transition(self, x, a): 101 | # Exact MDP dynamics 102 | # mapping = {0:[0,-1], 1:[1,0], 2:[0,1], 3:[-1,0]} 103 | # x, y = 
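# A minimal sketch of the count-based dynamics estimate that run() builds
# above, on hypothetical toy (s, a, s') samples (names below are made up
# for illustration only): P_hat(s'|s,a) = count(s,a,s') / count(s,a), and
# any (s, a) pair never seen in the data is treated as jumping to the
# terminal absorbing state.
from collections import Counter, defaultdict
toy_samples = [(0, 1, 4), (0, 1, 4), (0, 2, 1), (4, 1, 8)]
sas_counts = Counter(toy_samples)
sa_counts = Counter((s, a) for s, a, _ in toy_samples)
P_hat = defaultdict(dict)
for (s, a, s_prime), c in sas_counts.items():
    P_hat[(s, a)][s_prime] = float(c) / sa_counts[(s, a)]
# P_hat[(0, 1)] == {4: 1.0}; a key absent from P_hat means "terminal".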
np.where(np.arange(np.prod(self.env.desc.shape)).reshape(self.env.desc.shape) == x) 104 | # x,y = x[0], y[0] 105 | # delta_x,delta_y = mapping[a] 106 | # new_x = x + delta_x 107 | # new_y = y + delta_y 108 | # new_x,new_y = (new_x,new_y) if (0 <= new_x < self.env.desc.shape[0] and 0 <= new_y < self.env.desc.shape[1]) else (x,y) 109 | # done = True if self.env.desc[new_x,new_y]=='H' else False 110 | # done = done or (True if self.env.desc[new_x,new_y]=='G' else False) 111 | # return np.arange(np.prod(self.env.desc.shape)).reshape(self.env.desc.shape)[new_x,new_y], done 112 | 113 | #Approximated dynamics 114 | if tuple([x,a]) in self.P: 115 | try: 116 | state = np.random.choice(self.P[(x,a)].keys(), p=self.P[(x,a)].values()) 117 | except: 118 | import pdb; pdb.set_trace() 119 | done = False 120 | else: 121 | state = None 122 | done = True 123 | 124 | return state, done 125 | 126 | def Q(self, policy, x, a): 127 | 128 | Qs = [] 129 | 130 | state = x 131 | original_a = a 132 | done = False 133 | costs = [] 134 | trajectory_length = -1 135 | # Q 136 | while not done and trajectory_length < 200: 137 | trajectory_length += 1 138 | if trajectory_length > 0: 139 | a = policy([state])[0] 140 | 141 | 142 | new_state, done = self.transition(state, a) 143 | costs.append( self.R([state], [a])[0][0] ) 144 | if (tuple([state,a,new_state]) in self.terminal_transitions): 145 | done = True 146 | 147 | 148 | state = new_state 149 | 150 | return self.discounted_sum(costs, self.gamma) 151 | 152 | def V(self, policy, x): 153 | state = x 154 | done = False 155 | weighted_costs = [] 156 | trajectory_length = -1 157 | # V 158 | while not done and trajectory_length < 200: 159 | trajectory_length += 1 160 | # Because greedy deterministic policy 161 | a = policy([state])[0] 162 | 163 | new_state, done = self.transition(state, a) 164 | weighted_costs.append( self.R([state], [a])[0][0] ) 165 | if (tuple([state,a,new_state]) in self.terminal_transitions): 166 | done = True 167 | 168 | state = new_state 169 | 170 | return self.discounted_sum(weighted_costs, self.gamma) 171 | 172 | @staticmethod 173 | def discounted_sum(costs, discount): 174 | ''' 175 | Calculate discounted sum of costs 176 | ''' 177 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 178 | return y[::-1][0] 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class Model(object): 5 | def __init__(self): 6 | ''' 7 | Abstract class defining which functions a model should have 8 | ''' 9 | self.model = None 10 | 11 | def fit(self, X, y, verbose=0): 12 | raise NotImplemented 13 | 14 | def predict(self, X, a): 15 | raise NotImplemented 16 | 17 | def all_actions(self, X): 18 | raise NotImplemented 19 | 20 | def representation(*args): 21 | raise NotImplemented 22 | 23 | def copy_over_to(self, to_): 24 | to_.model.set_weights(self.model.get_weights()) 25 | 26 | def evaluate(self, verbose=False, render=False, **kw): 27 | return self.policy_evalutor.run(self, verbose=verbose, render=render, **kw) 28 | 29 | def min_over_a(self, X, randomized_tiebreaking=False, **kw): 30 | ''' 31 | Returns min_a Q(X,a), argmin_a Q(X,a) 32 | ''' 33 | Q_x_a = self.all_actions(X, **kw) 34 | return self.min_and_argmin(Q_x_a, randomized_tiebreaking, axis=1) 35 | 36 | def max_over_a(self, X, randomized_tiebreaking=False, **kw): 37 | ''' 38 | Returns min_a Q(X,a), argmin_a Q(X,a) 39 | ''' 40 | 41 | 
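# Descriptive note: this method returns max_a Q(X,a) and argmax_a Q(X,a)
# along the action axis (the mirror image of min_over_a above). With
# randomized_tiebreaking=True, max_and_argmax below multiplies a uniform
# random array by the boolean mask (Q == Q.max()) and takes the argmax of
# that, so when several entries attain the maximum one of them is picked
# at random rather than always the lowest index; min_and_argmin does the
# same with a negated random mask.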
Q_x_a = self.all_actions(X, **kw) 42 | return self.max_and_argmax(Q_x_a, randomized_tiebreaking, axis=1) 43 | 44 | @staticmethod 45 | def max_and_argmax(Q, randomized_tiebreaking=False, **kw): 46 | ''' max + Argmax + Breaks max/argmax ties randomly''' 47 | if not randomized_tiebreaking: 48 | return np.max(Q, **kw), np.argmax(Q, **kw) 49 | else: 50 | tie_breaker = np.random.random(Q.shape) * (Q==Q.max()) 51 | argmax = np.argmax(tie_breaker, **kw) # this is counter intuitive. 52 | return Q[np.arange(Q.shape[0]), argmax], argmax 53 | 54 | @staticmethod 55 | def min_and_argmin(Q, randomized_tiebreaking=False, **kw): 56 | ''' min + Argmin + Breaks min/argmin ties randomly''' 57 | if not randomized_tiebreaking: 58 | return np.min(Q, **kw), np.argmin(Q, **kw) 59 | else: 60 | tie_breaker = - np.random.random(Q.shape) * (Q==Q.min()) 61 | argmin = np.argmin(tie_breaker, **kw) 62 | return Q[np.arange(Q.shape[0]), argmin], argmin 63 | 64 | def __call__(self, *args, **kw): 65 | x_preprocessed = kw['x_preprocessed'] if 'x_preprocessed' in kw else False 66 | if len(args) == 1: 67 | ''' 68 | Run policy: pi = argmin_a Q(x,a) 69 | ''' 70 | x = args[0] 71 | return self.min_over_a(x, False, x_preprocessed=x_preprocessed)[1] 72 | elif len(args) == 2: 73 | ''' 74 | Evaluate Q(x,a) 75 | ''' 76 | x,a = args 77 | return self.predict(x,a, x_preprocessed=x_preprocessed) 78 | else: 79 | raise 80 | 81 | @staticmethod 82 | def cartesian_product(*arrays): 83 | la = len(arrays) 84 | dtype = np.result_type(*arrays) 85 | arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype) 86 | for i, a in enumerate(np.ix_(*arrays)): 87 | arr[...,i] = a 88 | return arr.reshape(-1, la) 89 | 90 | # def cartesian_product(x,y): 91 | # return np.hstack([np.tile(x.T, y.shape[1]).T, np.tile(y,x.shape[0]).reshape(-1,y.shape[1])]) 92 | -------------------------------------------------------------------------------- /models/pi_1.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_1.hdf5 -------------------------------------------------------------------------------- /models/pi_2.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_2.hdf5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn.h5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn.hdf5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn1.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn1.hdf5 -------------------------------------------------------------------------------- 
/models/pi_old_car_cnn_good.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn_good.hdf5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn_random_seed.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn_random_seed.hdf5 -------------------------------------------------------------------------------- /models/pi_old_car_cnn_seed_2.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_car_cnn_seed_2.hdf5 -------------------------------------------------------------------------------- /models/pi_old_map_size_8_mlp.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/pi_old_map_size_8_mlp.h5 -------------------------------------------------------------------------------- /models/weights.01-2362.66.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/weights.01-2362.66.hdf5 -------------------------------------------------------------------------------- /models/weights.01-2542.47.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/weights.01-2542.47.hdf5 -------------------------------------------------------------------------------- /models/weights.01-2635.64.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/models/weights.01-2635.64.hdf5 -------------------------------------------------------------------------------- /neural_network.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import keras 4 | from keras.models import Sequential, Model as KerasModel 5 | from keras.layers import Input, Dense, Flatten, concatenate, dot 6 | from keras.losses import mean_squared_error 7 | from keras import optimizers 8 | from keras.callbacks import Callback, TensorBoard 9 | from exact_policy_evaluation import ExactPolicyEvaluator 10 | from keras_tqdm import TQDMCallback 11 | from model import Model 12 | 13 | from keras.layers.convolutional import Conv2D 14 | 15 | 16 | class NN(Model): 17 | def __init__(self, num_inputs, num_outputs, grid_shape, dim_of_actions, gamma, convergence_of_model_epsilon=1e-10, model_type='mlp', position_of_holes=None, position_of_goals=None): 18 | ''' 19 | An implementation of fitted Q iteration 20 | 21 | num_inputs: number of inputs 22 | num_outputs: number of outputs 23 | dim_of_actions: dimension of action space 24 | convergence_of_model_epsilon: small float. Defines when the model has converged. 
25 | ''' 26 | super(NN, self).__init__() 27 | self.convergence_of_model_epsilon = convergence_of_model_epsilon 28 | self.model_type = model_type 29 | self.dim_of_actions = dim_of_actions 30 | self.dim_of_state = grid_shape[0] * grid_shape[1] 31 | self.grid_shape = grid_shape 32 | 33 | if self.model_type == 'cnn': 34 | assert position_of_holes is not None 35 | assert position_of_goals is not None 36 | 37 | 38 | self.position_of_goals = position_of_goals 39 | 40 | if position_of_holes is not None: 41 | self.position_of_holes = np.zeros(self.dim_of_state) 42 | self.position_of_holes[position_of_holes] = 1 43 | self.position_of_holes = self.position_of_holes.reshape(self.grid_shape) 44 | else: 45 | self.position_of_holes = position_of_holes 46 | 47 | if position_of_goals is not None: 48 | self.position_of_goals = np.zeros(self.dim_of_state) 49 | self.position_of_goals[position_of_goals] = 1 50 | self.position_of_goals = self.position_of_goals.reshape(self.grid_shape) 51 | else: 52 | self.position_of_goals = position_of_goals 53 | 54 | self.model = self.create_model(num_inputs, num_outputs) 55 | #debug purposes 56 | self.policy_evalutor = ExactPolicyEvaluator([0], num_inputs-dim_of_actions, gamma) 57 | 58 | def copy_over_to(self, to_): 59 | to_.model.set_weights(self.model.get_weights()) 60 | 61 | def create_model(self, num_inputs, num_outputs): 62 | if self.model_type == 'mlp': 63 | model = Sequential() 64 | def init(): return keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001, seed=np.random.randint(2**32)) 65 | model.add(Dense(64, activation='tanh', input_shape=(num_inputs,),kernel_initializer=init(), bias_initializer=init())) 66 | model.add(Dense(num_outputs, activation='linear',kernel_initializer=init(), bias_initializer=init())) 67 | # adam = optimizers.Adam(clipnorm=1.) 
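# A minimal sketch of the input this MLP consumes (see representation()
# further down): a one-hot state concatenated with a one-hot action, e.g.
# 64 + 4 = 68 inputs for the 8x8 lake. The states/actions below are toy
# values for illustration; np is already imported at the top of this file.
dim_of_state, dim_of_actions = 64, 4
toy_states = np.array([0, 9])      # flattened grid positions
toy_actions = np.array([2, 3])
X_toy = np.hstack([np.eye(dim_of_state)[toy_states],
                   np.eye(dim_of_actions)[toy_actions]])
# X_toy.shape == (2, 68); each row feeds the Dense(64, 'tanh') layer above.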
68 | model.compile(loss='mean_squared_error', optimizer='Adam', metrics=['accuracy']) 69 | elif self.model_type == 'cnn': 70 | # input layer 71 | # 3 channels: holes, goals, player 72 | # and actions 73 | def init(): seed=np.random.randint(2**32); return keras.initializers.TruncatedNormal(mean=0.0, stddev=0.001, seed=seed) 74 | inp = Input(shape=(self.grid_shape[0],self.grid_shape[1],1), name='grid') 75 | actions = Input(shape=(self.dim_of_actions,), name='mask') 76 | neighbors = Input(shape=(2*self.dim_of_actions,), name='holes_and_goals') 77 | 78 | # Grid feature extraction 79 | 80 | seed = np.random.randint(2**32) 81 | 82 | conv1 = Conv2D(16, kernel_size=2, activation='elu', padding='SAME', data_format='channels_last',kernel_initializer=init(), bias_initializer=init())(inp) 83 | # conv2 = Conv2D(16, kernel_size=3, activation='elu', padding='SAME', data_format='channels_last',kernel_initializer=init(), bias_initializer=init())(conv1) 84 | flat1 = Flatten()(conv1) 85 | 86 | # Holes + goals feature extractor 87 | # flat2 = Dense(20, activation='elu',kernel_initializer=init(), bias_initializer=init())(neighbors) 88 | 89 | # merge feature extractors 90 | # merge = concatenate([flat1, flat2]) 91 | 92 | # interpret 93 | hidden1 = Dense(10, activation='elu',kernel_initializer=init(), bias_initializer=init())(flat1) 94 | hidden2 = Dense(self.dim_of_actions, activation='linear',kernel_initializer=init(), bias_initializer=init())(hidden1) 95 | 96 | output = dot([hidden2, actions], 1) 97 | # predict 98 | # output = Dense(1, activation='linear',kernel_initializer=init(), bias_initializer=init())(hidden1) 99 | model = KerasModel(inputs=[inp, neighbors, actions], outputs=output) 100 | model.compile(loss='mean_squared_error', optimizer='Adam', metrics=['accuracy']) 101 | else: 102 | raise NotImplemented 103 | 104 | # model.summary() 105 | return model 106 | 107 | 108 | def fit(self, X, y, verbose=0, batch_size=512, epochs=1000, evaluate=False, tqdm_verbose=True, **kw): 109 | 110 | X = self.representation(X[:,0], X[:, 1]) 111 | self.callbacks_list = [EarlyStoppingByConvergence(epsilon=self.convergence_of_model_epsilon, diff =1e-10, verbose=verbose)]#, TQDMCallback(show_inner=False, show_outer=tqdm_verbose)] 112 | self.model.fit(X,y,verbose=verbose==2, batch_size=batch_size, epochs=epochs, callbacks=self.callbacks_list, **kw) 113 | 114 | if evaluate: 115 | return self.evaluate() 116 | else: 117 | return None 118 | 119 | def representation(self, *args): 120 | if self.model_type == 'mlp': 121 | if len(args) == 1: 122 | return np.eye(self.dim_of_state)[np.array(args[0]).astype(int)] 123 | elif len(args) == 2: 124 | return np.hstack([np.eye(self.dim_of_state)[np.array(args[0]).astype(int)], np.eye(self.dim_of_actions)[np.array(args[1]).astype(int)] ]) 125 | else: 126 | raise NotImplemented 127 | elif self.model_type == 'cnn': 128 | if len(args) == 1: 129 | position = np.eye(self.dim_of_state)[np.array(args[0]).astype(int)].reshape(-1,self.grid_shape[0],self.grid_shape[1]) 130 | X, surrounding = self.create_cnn_rep_helper(position) 131 | return [X, surrounding] 132 | elif len(args) == 2: 133 | position = np.eye(self.dim_of_state)[np.array(args[0]).astype(int)].reshape(-1,self.grid_shape[0],self.grid_shape[1]) 134 | X, surrounding = self.create_cnn_rep_helper(position) 135 | return [X, surrounding, np.eye(self.dim_of_actions)[np.array(args[1]).astype(int)] ] 136 | else: 137 | raise NotImplemented 138 | else: 139 | raise NotImplemented 140 | 141 | def create_cnn_rep_helper(self, position): 142 | how_many = 
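# Note on the CNN head above (descriptive comment): the network outputs a
# vector of Q-values, one per action, and the one-hot `actions` mask input
# selects a single Q(s,a) through the dot product. In NumPy terms, with
# toy numbers:
#   q_all = np.array([[0.1, -0.3, 0.2, 0.0]])   # Q(s, .) for one state
#   mask  = np.eye(4)[[2]]                       # one-hot action a = 2
#   np.sum(q_all * mask, axis=1)                 # -> array([0.2])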
position.shape[0] 143 | holes = np.repeat(self.position_of_holes[np.newaxis, :, :], how_many, axis=0) 144 | goals = np.repeat(self.position_of_goals[np.newaxis, :, :], how_many, axis=0) 145 | 146 | ix_x, ix_y, ix_z = np.where(position) 147 | surrounding = self.is_next_to([self.position_of_holes, self.position_of_goals], ix_y, ix_z) 148 | 149 | return np.sum([position*.5, holes*1, goals*(-1)], axis = 0)[:,:,:,np.newaxis], np.hstack(surrounding) 150 | 151 | def is_next_to(self, obstacles, x, y): 152 | # obstacles must be list 153 | assert np.all(np.array([obstacle.shape for obstacle in obstacles]) == obstacles[0].shape) 154 | surround = lambda x,y: [(x, y-1), (x+1, y), (x, y+1), (x-1, y)] 155 | 156 | ret = [] 157 | for idx in range(len(x)): 158 | neighbors = [] 159 | for a,b in surround(x[idx], y[idx]): 160 | # only works if all obstacles are same shape 161 | neighbor = np.vstack([obstacle[a, b] for obstacle in obstacles]) if 0 <= a < obstacles[0].shape[0] and 0 <= b < obstacles[0].shape[1] else np.array([0.]*len(obstacles)).reshape(1,-1).T 162 | neighbors.append(neighbor) 163 | 164 | ret.append(np.hstack(neighbors)) 165 | 166 | return np.stack(ret, axis=1) 167 | 168 | def predict(self, X, a): 169 | return self.model.predict(self.representation(X,a)) 170 | 171 | def all_actions(self, X): 172 | # X_a = ((x_1, a_1) 173 | # (x_1, a_2) 174 | # .... 175 | # (x_1, a_m) 176 | # ... 177 | # (x_N, a_1) 178 | # (x_N, a_2) 179 | # ... 180 | # ... 181 | # (x_N, a_m)) 182 | X = np.array(X) 183 | X_a = self.cartesian_product(X, np.arange(self.dim_of_actions)) 184 | 185 | 186 | # Q_x_a = ((Q_x1_a1, Q_x1_a2,... Q_x1_am) 187 | # (Q_x2_a1, Q_x2_a2,... Q_x2_am) 188 | # ... 189 | # (Q_xN_a1, Q_xN_a2,... Q_xN_am) 190 | # by reshaping using C ordering 191 | 192 | Q_x_a = self.predict(X_a[:,0], X_a[:,1]).reshape(X.shape[0],self.dim_of_actions,order='C') 193 | return Q_x_a 194 | 195 | class EarlyStoppingByConvergence(Callback): 196 | def __init__(self, monitor='loss', epsilon=0.01, diff=.001, use_both=True, verbose=0): 197 | super(Callback, self).__init__() 198 | self.monitor = monitor 199 | self.epsilon = epsilon 200 | self.diff = diff 201 | self.use_both = use_both 202 | self.verbose = verbose 203 | self.losses_so_far = [] 204 | self.converged = False 205 | 206 | def on_epoch_end(self, epoch, logs={}): 207 | self.epoch = epoch 208 | 209 | current = logs.get(self.monitor) 210 | if current is None: 211 | print("Early stopping requires %s available!" % self.monitor) 212 | exit() 213 | else: 214 | self.losses_so_far.append(current) 215 | 216 | if self.verbose: 217 | if (self.epoch % 100) == 0: 218 | print 'Epoch %s, loss: %s' % (epoch, self.losses_so_far[-1]) 219 | 220 | if self.use_both: 221 | if ((len(self.losses_so_far) > 1) and (np.abs(self.losses_so_far[-2] - self.losses_so_far[-1]) < self.epsilon)) or (self.losses_so_far[-1] < self.diff): 222 | self.model.stop_training = True 223 | self.converged = True 224 | else: 225 | pass 226 | else: 227 | if ((len(self.losses_so_far) > 1) and (np.abs(self.losses_so_far[-2] - self.losses_so_far[-1]) < self.epsilon)): 228 | self.model.stop_training = True 229 | self.converged = True 230 | else: 231 | pass 232 | 233 | 234 | def on_train_end(self, logs=None): 235 | if self.epoch > 1: 236 | if self.verbose > 0: 237 | if self.converged: 238 | print 'Epoch %s: early stopping. Converged. Delta: %s. Loss: %s' % (self.epoch, np.abs(self.losses_so_far[-2] - self.losses_so_far[-1]), self.losses_so_far[-1]) 239 | else: 240 | print 'Epoch %s. NOT converged. Delta: %s. 
Loss: %s' % (self.epoch, np.abs(self.losses_so_far[-2] - self.losses_so_far[-1]), self.losses_so_far[-1]) 241 | 242 | def on_train_begin(self, logs=None): 243 | # Allow instances to be re-used 244 | self.losses_so_far = [] 245 | self.converged = False 246 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /pi_old_car_cnn_main.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/pi_old_car_cnn_main.hdf5 -------------------------------------------------------------------------------- /play_car_racing.py: -------------------------------------------------------------------------------- 1 | from car_racing import ExtendedCarRacing 2 | import numpy as np 3 | 4 | if __name__=="__main__": 5 | from pyglet.window import key 6 | a = np.array( [0.0, 0.0, 0.0] ) 7 | def key_press(k, mod): 8 | global restart 9 | if k==0xff0d: restart = True 10 | if k==key.LEFT: a[0] = -1.0 11 | if k==key.RIGHT: a[0] = +1.0 12 | if k==key.UP: a[1] = +1.0 13 | if k==key.DOWN: a[2] = +0.8 # set 1.0 for wheels to block to zero rotation 14 | def key_release(k, mod): 15 | if k==key.LEFT and a[0]==-1.0: a[0] = 0 16 | if k==key.RIGHT and a[0]==+1.0: a[0] = 0 17 | if k==key.UP: a[1] = 0 18 | if k==key.DOWN: a[2] = 0 19 | env = ExtendedCarRacing(0, False, 12) 20 | env.render() 21 | record_video = False 22 | if record_video: 23 | env.monitor.start('/tmp/video-test', force=True) 24 | env.viewer.window.on_key_press = key_press 25 | env.viewer.window.on_key_release = key_release 26 | while True: 27 | env.reset() 28 | total_reward = 0.0 29 | steps = 0 30 | restart = False 31 | while True: 32 | s, r, done, info = env.step(a) 33 | print r[1][1], r[1][3], r[1][4] 34 | total_reward += r[0] 35 | if steps % 200 == 0 or done: 36 | pass 37 | # print("\naction " + str(["{:+0.2f}".format(x) for x in a])) 38 | # print("step {} total_reward {:+0.2f}".format(steps, total_reward)) 39 | #import matplotlib.pyplot as plt 40 | #plt.imshow(s) 41 | #plt.savefig("test.jpeg") 42 | steps += 1 43 | if not record_video: # Faster, but you can as well call env.render() every time to play full window. 
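# Descriptive note: `a` is the CarRacing action vector
# [steering in [-1, 1], gas in [0, 1], brake in [0, 1]] driven by the
# arrow-key handlers above, and float(env.tile_visited_count)/len(env.track)
# (printed after each episode below) is the fraction of track tiles covered.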
44 | env.render() 45 | if done or restart or float(env.tile_visited_count)>139: break 46 | print steps, float(env.tile_visited_count), len(env.track), float(env.tile_visited_count)/len(env.track) 47 | env.close() -------------------------------------------------------------------------------- /plot_fqe_quality_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | import matplotlib.pyplot as plt 5 | import os 6 | import numpy as np 7 | import seaborn as sns; sns.set(color_codes=True) 8 | from matplotlib.ticker import FuncFormatter 9 | def percent(x, pos): 10 | return '%1d%%' % (x) 11 | percent_formatter = FuncFormatter(percent) 12 | 13 | # Colors 14 | alpha = 0.15 15 | sns.set(style="whitegrid", palette="Paired") 16 | 17 | colorSet = sns.color_palette("Paired", 10); 18 | def color_gen(): 19 | 20 | 21 | colors = [ "dusty purple", "faded green", "amber", "windows blue", "coral"] 22 | colors = sns.xkcd_palette(colors) 23 | idx = -1 24 | while 1: 25 | idx = (idx + 1) % len(colors) 26 | yield colors[idx] 27 | 28 | 29 | path = os.path.join(os.getcwd(), 'experimental_results') 30 | files = os.listdir(path) 31 | csvs = [f for f in files if 'fqe_quality' in f] 32 | 33 | # tmp = pd.DataFrame([csv.split('.csv')[0].split('_')[2:] for csv in csvs], columns=['year','month','day','hour','minute','a','b']) 34 | # results_file = 'fqe_quality_' + '_'.join(tmp.sort_values(by=['year','month','day','hour','minute'], ascending=False).iloc[0]) + '.csv' 35 | # results_file = 'fqe_quality_2018_12_23_11_00_g_cnn.csv' 36 | # dr_fix = 'fqe_quality_fixed_dr.csv' 37 | results_file = 'fqe_quality_fixed_dr_tabular_4.csv' 38 | df = pd.read_csv(os.path.join(path, results_file)) 39 | df['trial_num'] = np.array([[i]*10 for i in range(int(1+max(df['trial_num'])))]).reshape(-1) 40 | df['num_trajectories'] = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9]*int(max(df['trial_num'])+1) 41 | # df_dr_fix = pd.read_csv(os.path.join(path, dr_fix)) 42 | 43 | # df = df.merge(df_dr_fix, left_on=['epsilon','num_trajectories','trial_num'], right_on=['epsilon','num_trajectories','trial_num'], how='left') 44 | # for col in [col for col in df.columns if '_y' in col]: 45 | # if 'doubly_robust' not in col: 46 | # del df[col] 47 | 48 | # for col in [col for col in df.columns if ('_x' in col) and ('doubly_robust' in col)]: 49 | # del df[col] 50 | # df.columns = [col.replace('_x', '') for col in df.columns] 51 | # df.columns = [col.replace('_y', '') for col in df.columns] 52 | 53 | def custom_plot(x, y, minimum, maximum, plot_band=True, zorder=11, alpha=.15, **kwargs): 54 | ax = kwargs.pop('ax', plt.gca()) 55 | base, = ax.plot(x, y, **kwargs) 56 | if plot_band: 57 | ax.fill_between(x, minimum, maximum, facecolor=base.get_color(), alpha=alpha, zorder=zorder) 58 | 59 | for epsilon, group in df.groupby('epsilon'): 60 | del group['epsilon'] 61 | # group.set_index('num_trajectories').plot() 62 | # import pdb; pdb.set_trace() 63 | small_value = 1e-10 64 | exact = group['approx_pdis'].iloc[0]+small_value 65 | print list(group.apply(lambda x: x+exact).groupby('num_trajectories'))[-1][1][['trial_num', 'exact', 'fqe']] 66 | means = group.apply(lambda x: x+exact).groupby('num_trajectories').mean() 67 | stds = group.apply(lambda x: x+exact).groupby('num_trajectories').std() 68 | 69 | medians = group.apply(lambda x: x+exact).groupby('num_trajectories').median() 70 | lower_quants = group.apply(lambda x: x+exact).groupby('num_trajectories').quantile(.25) 71 | 
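# Descriptive note: for each epsilon the estimates are grouped by
# `num_trajectories` (the fraction of data sub-sampled), shifted by the
# exact/true value, and summarized by mean/std as well as median and
# quartiles across trials; custom_plot() above then draws the mean curve
# with a +/- one-std band via fill_between.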
upper_quants = group.apply(lambda x: x+exact).groupby('num_trajectories').quantile(.75) 72 | 73 | del means['trial_num'] 74 | del stds['trial_num'] 75 | del medians['trial_num'] 76 | del lower_quants['trial_num'] 77 | del upper_quants['trial_num'] 78 | 79 | print '*'*20 80 | print 'Epsilon: %s' % epsilon 81 | print means 82 | print stds 83 | 84 | fig, ax = plt.subplots(1) 85 | colors = color_gen() 86 | for i, col in enumerate(['fqe', 'approx_pdis', 'doubly_robust', 'weighted_doubly_robust']): 87 | # import pdb; pdb.set_trace() 88 | 89 | x = np.array(np.unique(group['num_trajectories'])) 90 | mu = np.array(means[col]) 91 | sigma = np.array(stds[col]) 92 | lower_bound = mu + sigma 93 | upper_bound = mu - sigma 94 | # mu = np.array(medians[col]) 95 | # lower_bound = np.array(lower_quants[col]) 96 | # upper_bound = np.array(upper_quants[col]) 97 | 98 | 99 | 100 | col = ['Fitted Q Evaluation (FQE)', 'Per-Decision IS (PDIS)', 'Doubly Robust (DR)', 'Weighted Doubly Robust (WDR)', 'AM'][i] 101 | if (i == 0) or (i == 3): 102 | custom_plot(x*100, mu, lower_bound, upper_bound, plot_band=True,zorder=11, marker='o', label=col, color=colors.next()) 103 | else: 104 | custom_plot(x*100, mu, lower_bound, upper_bound, plot_band=False,zorder=11, marker='o', label=col, color=colors.next()) 105 | 106 | custom_plot(x*100, [exact]*len(x), lower_bound, upper_bound, plot_band=False, marker='o', label='True Value', color=colors.next()) 107 | 108 | 109 | # means.plot(yerr=stds) 110 | 111 | # plt.title(epsilon) 112 | col = color_gen().next() 113 | print 'Number of Trials: ', max(df['trial_num'])+1 114 | ax.legend(loc='upper right') 115 | ax.grid(alpha=.35) 116 | # ax.set_title('Probability of exploration: %s' % epsilon) 117 | ax.set_xlabel('Percentage of Data Sub-Sampled for Evaluation') 118 | ax.set_ylabel('Estimated Constraint Value') 119 | ax.set_title('Off-Policy Evaluation - Standalone Comparison', fontsize=16) 120 | ax.xaxis.set_major_formatter(percent_formatter) 121 | ax.set_ylim(bottom=-1, top=0) 122 | plt.tight_layout() 123 | plt.savefig('lake_fqe_vs_others.png', format='png', dpi=300) 124 | plt.show() 125 | 126 | -------------------------------------------------------------------------------- /plot_grid_search.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.lines as mlines 8 | from matplotlib.legend import Legend 9 | import seaborn as sns 10 | import deepdish as dd 11 | from mpl_toolkits.axes_grid1 import make_axes_locatable 12 | sns.set(context="paper")#, font="monospace") 13 | plt.rc('text', usetex=True) 14 | #sns.set(style="darkgrid", palette="Paired") 15 | 16 | # Load the datset of correlations between cortical brain networks 17 | #df = sns.load_dataset("brain_networks", header=[0, 1, 2], index_col=0) 18 | #corrmat = df.corr() 19 | 20 | # matrix = np.load('policy_role_freq.npy') 21 | import os 22 | df = pd.read_csv(os.path.join(os.getcwd(),'experimental_results','lspi.csv')) 23 | df = pd.DataFrame(np.array([[y.strip('(').strip(')') for y in x] for x in [x.split(',') for x in np.array(df.columns)]]).astype(float), 24 | columns = ['lambda_0', 'lambda_1', 'c_pi_exact', 'g_pi_exact_0', 'g_pi_exact_1', 'performance']) 25 | # import pdb; pdb.set_trace() 26 | df = pd.read_csv(os.path.join(os.getcwd(),'experimental_results','results_grid.csv')) 27 | data = 
dd.io.load(os.path.join(os.getcwd(),'experimental_results','policy_improvement_grid.h5')) 28 | # performance = np.array(df['performance']) 29 | performance = np.array(data['c_performance']) 30 | df = df[['c_pi_exact','g_pi_exact_0','g_pi_exact_1','lambda_0','lambda_1']] 31 | #labels=['c_pi_exact','g_pi_exact_0','g_pi_exact_1','lambda_0','lambda_1','performance']) 32 | 33 | 34 | main = np.array(df['c_pi_exact']).reshape(11,11) 35 | braking = np.array(df['g_pi_exact_0']).reshape(11,11) 36 | lane = np.array(df['g_pi_exact_1']).reshape(11,11) 37 | 38 | 39 | # import pdb; pdb.set_trace() 40 | 41 | 42 | # # Set up the matplotlib figure 43 | # f, axarr = plt.subplots(nrows=1,ncols=3, figsize=(12, 9)) 44 | # sns.set(font_scale=2) 45 | # upper_bound = [-60, 8, 135.]#[1.5, 5.] 46 | # lower_bound = [-30, 0., 0.]#[1.5, 5.] 47 | # for i, matrix in enumerate([main, braking, lane]): 48 | # sns.heatmap(matrix, cmap = 'summer', ax=axarr[i], vmin= lower_bound[i], vmax =upper_bound[i], square=True) 49 | # axarr[i].tick_params(axis='x', labelsize=18) 50 | # axarr[i].tick_params(axis='y', labelsize=18) 51 | # axarr[i].set_xlabel(r'$\lambda_0$' + ' (Braking Penalty)', fontsize = 18) 52 | # axarr[i].set_ylabel(r'$\lambda_1$' + ' (Center of Lane Penalty)', fontsize = 18) 53 | 54 | 55 | 56 | # #g.axes[0,0].set_xlabel('axes label 1') 57 | 58 | # # Use matplotlib directly to emphasize known networks 59 | # """ 60 | # networks = corrmat.columns.get_level_values("network") 61 | # for i, network in enumerate(networks): 62 | # if i and network != networks[i - 1]: 63 | # ax.axhline(len(networks) - i, c="w") 64 | # ax.axvline(i, c="w") 65 | # """ 66 | # f.tight_layout() 67 | # #f.savefig('role_frequency.png', format='png', dpi=300) 68 | # plt.show() 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | # generate data 78 | constraints = [5.8, 85.] 79 | use_rewards = True 80 | # x = np.linspace(0,1, num=11) 81 | # y = np.linspace(0,1, num=11) 82 | # X,Y = np.meshgrid(x,y) 83 | # signal = main.reshape(-1) 84 | det = (braking.reshape(-1) < constraints[0]) & (lane.reshape(-1) < constraints[1]) & (performance.reshape(-1) >= .95) #np.random.poisson(lam=0.5,size=len(x)*len(y)) 85 | det = det.astype(int) 86 | 87 | df_signal = df[['c_pi_exact', 'lambda_0', 'lambda_1']]#pd.DataFrame({"y":df.flatten(), "x":X.flatten(), "intensity":signal}) 88 | df_signal.columns = ['intensity', 'x', 'y'] 89 | df_det = pd.DataFrame({"y":df['lambda_1'], "x":df['lambda_0'], "det":det}) 90 | df_signal['intensity'] = -use_rewards*df_signal['intensity'] 91 | 92 | # prepare Dataframes 93 | dfmark = df_det[df_det["det"]>0] 94 | 95 | #plotting 96 | fig, ax=plt.subplots() 97 | divider = make_axes_locatable(ax) 98 | cax = divider.append_axes('right', size='5%', pad=0.05) 99 | 100 | x = df_signal["x"].unique() 101 | y = df_signal["y"].unique() 102 | ext = [x.min()-np.diff(x)[0]/2.,x.max()+np.diff(x)[0]/2., 103 | y.min()-np.diff(y)[0]/2.,y.max()+np.diff(y)[0]/2. 
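# Descriptive note: `det` above marks the (lambda_0, lambda_1) grid cells
# whose exact constraint values satisfy the thresholds (braking < 5.8,
# lane-center < 85) with performance >= .95. A sketch of selecting the
# best feasible cell, assuming toy arrays shaped like those above:
#   feasible = (braking.reshape(-1) < 5.8) & (lane.reshape(-1) < 85.)
#   objective = -main.reshape(-1)            # rewards are negated costs
#   best_idx = np.argmax(np.where(feasible, objective, -np.inf))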
] 104 | 105 | # df_signal['y'] += 1 106 | # df_signal['y'] = 1/df_signal['y'] 107 | df = df_signal.pivot(index="y", columns="x") 108 | im = ax.imshow(df, extent=ext, cmap=plt.get_cmap('YlGnBu'), origin='lower') 109 | ax.set_xticks(x) 110 | ax.set_xticklabels([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], fontsize=14) 111 | ax.set_yticks(y) 112 | ax.set_yticklabels([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], fontsize=14) 113 | 114 | # ax.scatter(dfmark["x"], dfmark["y"], marker="s", s=100, c="crimson") 115 | dx = np.diff(x)[0]; dy = np.diff(y)[0] 116 | recs = [] 117 | for (xi,yi), in zip(dfmark[["x","y"]].values): 118 | rec = plt.Rectangle((xi-dx/2.,yi-dy/2.),dx,dy, fill=False, 119 | edgecolor="black", lw=2, hatch='\\') 120 | recs.append(rec) 121 | ax.add_artist(rec) 122 | 123 | rec = plt.Rectangle((0.,0.),0,0, fill=False, 124 | edgecolor="black", lw=2, hatch='\\') 125 | recs.append(rec) 126 | 127 | df = df_signal.merge(df_det, how='left') 128 | 129 | # good_policies = -use_rewards*np.array(main).reshape(11,11) * det.reshape(11,11) 130 | for (xi, yi) in df[df['det']>0][df[df['det'] > 0]['intensity'] == df[df['det'] > 0]['intensity'].max()][['x','y']].values: 131 | 132 | best = plt.Rectangle((xi-dx/2.,yi-dy/2.),dx,dy, fill=False, 133 | edgecolor="crimson", lw=2, hatch='*' ) 134 | ax.add_artist(best) 135 | 136 | best = plt.Rectangle((0.,0.),0,0, fill=False, 137 | edgecolor="crimson", lw=2, hatch='*' ) 138 | 139 | # plt.legend([recs[-1], best]) 140 | cbar = fig.colorbar(im, cax=cax, orientation='vertical', ticks=np.arange(-10, 60, 10)[::-1]) 141 | cbar.ax.set_yticklabels(np.arange(10, -60, -10)[::-1], fontsize=16) 142 | cax.set_ylabel('Main Objective Value', fontsize=18) 143 | ax.set_xlabel(r'$\lambda_0$' + ' (Braking Penalty)', fontsize = 18) 144 | ax.set_ylabel(r'$\lambda_1$' + ' (Center of Lane Penalty)', fontsize = 18) 145 | ax.legend([recs[-1], best], ['Satisfies Constraints', 'Best, Satisfies Constraints'], fontsize=16, loc='upper left', framealpha=.4) 146 | ax.set_title('Regularized FQI Grid Search', fontsize=18) 147 | plt.tight_layout() 148 | plt.savefig('fqi_grid_search.png', format='png', dpi=300) 149 | plt.show() 150 | 151 | 152 | -------------------------------------------------------------------------------- /plot_policy_improvement_v2.py: -------------------------------------------------------------------------------- 1 | import deepdish as dd 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.lines as mlines 8 | from matplotlib.legend import Legend 9 | import seaborn as sns; sns.set(color_codes=True) 10 | import os 11 | import scipy.signal as signal 12 | 13 | # Colors 14 | alpha = 0.15 15 | sns.set(style="whitegrid", palette="Paired") 16 | colorSet = sns.color_palette("Paired", 10); 17 | 18 | def discounted_sum(costs, discount): 19 | ''' 20 | Calculate discounted sum of costs 21 | ''' 22 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 23 | return y[::-1][0] 24 | 25 | def color_gen(): 26 | 27 | colors = [ "dusty purple", "windows blue", "faded green", "dark pink", "amber"] 28 | colors = sns.xkcd_palette(colors) 29 | idx = -1 30 | while 1: 31 | idx = (idx + 1) % len(colors) 32 | yield colors[idx] 33 | 34 | # Data setup 35 | 36 | dones = dd.io.load(os.path.join('seed_2_data', 'car_data_is_done_seed_2.h5')) 37 | costs = dd.io.load(os.path.join('seed_2_data', 'car_data_rewards_seed_2.h5')) 38 | dones = np.hstack([0,1+np.where(dones)[0]]) 39 | episodes = [] 40 | for 
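# Descriptive note: discounted_sum() above evaluates
# sum_t discount**t * costs[t] with an IIR filter over the reversed costs,
# e.g. discounted_sum([1., 2., 3.], 0.5) == 1 + 0.5*2 + 0.25*3 == 2.75.
# The `dones` flags are converted into episode boundary indices
# (np.hstack([0, 1 + np.where(dones)[0]])), and the loop below slices the
# flat cost array into one episode per consecutive (low_, high_) pair.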
low_, high_ in zip(dones[:-1], dones[1:]): 41 | new_episode ={ 42 | 'c': costs[low_:high_, 0].reshape(-1), 43 | 'brake': costs[low_:high_, -1].reshape(-1), 44 | 'lane': costs[low_:high_, 3].reshape(-1), 45 | } 46 | 47 | episodes.append(new_episode) 48 | 49 | discounted_costs = np.array([[discounted_sum(x['c'],.95),discounted_sum(x['brake'],.95),discounted_sum(x['lane'],.95)] for x in episodes]) 50 | data = dd.io.load('car_policy_improvement.h5') 51 | DQN = [-39.61397106365249, 7.703194041056963, 115.62071639160499] 52 | LSPI = pd.read_csv('lspi_results.csv') 53 | plt.rc('text', usetex=True) 54 | 55 | 56 | lines, fill_betweens= [], [] 57 | plt.rc_context({'axes.edgecolor':'k'}) 58 | 59 | 60 | 61 | 62 | fig = plt.figure(figsize=(12, 6)) 63 | grid = plt.GridSpec(6, 4, wspace=0.6, hspace=0.5) 64 | ax1 = fig.add_subplot(grid[0:4, :2]) 65 | 66 | 67 | 68 | # fig, ax1 = plt.subplots() 69 | ax1.grid(alpha=.35) 70 | max_iterations = 27 71 | iterations = range(len(data['g_eval'][0][:max_iterations])) 72 | colors = color_gen() 73 | constraint_names = ['Braking', 'Center of Lane'] 74 | constraint_upper_bound = [5.8, 85.]#[1.5, 5.] 75 | locations = ['lower left', 'lower center', 'lower right'] 76 | fontsize = 16 77 | legend_fontsize = 16 78 | legend_title_fontsize = 16 79 | major_tick_mark_size = 14 80 | 81 | 82 | def derandomize(data, constraints, min_iteration): 83 | 84 | fqe_c = np.array(data['c_eval'][0])[:,-1] 85 | fqe_g_0 = np.array(data['g_eval'][0])[:,-1] 86 | fqe_g_1 = np.array(data['g_eval'][1])[:,-1] 87 | out = [] 88 | for iteration in range(min_iteration, len(fqe_c)): 89 | 90 | df_tmp = pd.DataFrame(np.hstack([np.arange(min_iteration,iteration+1).reshape(1,-1).T, fqe_c[min_iteration:(iteration+1)].reshape(1,-1).T, fqe_g_0[min_iteration:(iteration+1)].reshape(1,-1).T, fqe_g_1[min_iteration:(iteration+1)].reshape(1,-1).T ]), columns=['iteration', 'fqe_c', 'fqe_g_0', 'fqe_g_1']) 91 | df_tmp = df_tmp[(df_tmp['fqe_g_0'] < constraints[0]) & (df_tmp['fqe_g_1'] < constraints[1]) ] 92 | try: 93 | argmin = np.argmin(np.array(df_tmp['fqe_c'])) 94 | it = int(df_tmp.iloc[argmin]['iteration']) 95 | except: 96 | argmin = 0 97 | it = 0 98 | out.append(np.hstack([iteration, np.hstack([data['c_exacts'][it], np.array(data['g_exacts'])[it,:-1]]) ])) 99 | 100 | return pd.DataFrame(out, columns=['iterations', 'c_derandomized', 'g_0_derandomized', 'g_1_derandomized']) 101 | 102 | 103 | df_derandom = derandomize(data, np.array(constraint_upper_bound)*.8, 0) 104 | 105 | legend = [] 106 | car_color = colors.next() 107 | derandom_color = colors.next() 108 | c_values = np.array(data['c_eval_actuals'])[:max_iterations,-10:,:] # shape = (iteration #, k, performance) 109 | last = np.cumsum(c_values[:,-1,0])/np.arange(1,1+len(c_values[:,-1,0]))#*100 110 | evaluation = np.array(pd.DataFrame(c_values[:,-1,0]).expanding().mean()).reshape(-1) 111 | std = np.array(pd.DataFrame(c_values[:,-1,0]).expanding().std()).reshape(-1) 112 | 113 | #evaluation = np.mean(c_values, axis=1)[:,0]#*100 114 | #std = np.std(c_values, axis=1)[:,0]#*100 115 | lines.append( ax1.plot(iterations, last, color = car_color,linestyle='-',markersize=7, label='Exact') ) 116 | 117 | lines.append( ax1.plot(df_derandom['iterations'], df_derandom['c_derandomized'], color = derandom_color,linestyle='-',markersize=7, label='Exact') ) 118 | 119 | # lines.append( ax1.plot(iterations, evaluation, color = car_color,marker='s',markersize=7, label='Mean of last 10') ) 120 | # fill_betweens.append( ax1.fill_between(iterations,evaluation-std, evaluation+std, alpha = 
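# Descriptive note on derandomize() above: at each iteration t it takes
# the FQE estimates for iterations 0..t, keeps only those whose estimated
# constraint values fall below 0.8x the thresholds, picks the iteration
# with the smallest estimated main objective, and reports that iteration's
# exact C and G values -- i.e. a single policy chosen from the mixture via
# the off-policy estimates ("Algo 2 (Derandomized)" in the legend).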
alpha, color = car_color, zorder = 10) ) 121 | 122 | y_err_lower = last - np.min(c_values, axis=1)[:,0] 123 | y_err_higher = last - np.max(c_values, axis=1)[:,0] 124 | legend.append( mlines.Line2D([], [], color=car_color, linestyle='-', 125 | markersize=7, label='Percent of Track Covered') ) 126 | legend.append( mlines.Line2D([], [], color=derandom_color, linestyle='-', 127 | markersize=7, label='Percent of Track Covered') ) 128 | # legend.append( mlines.Line2D([], [], color=car_color, marker='s', 129 | # markersize=7, label='Percent of Track Covered') ) 130 | 131 | ## Baselines 132 | # LSPI 133 | lspi_color = colors.next() 134 | lspi = np.array(LSPI.iloc[:,0]) 135 | evaluation = np.array(pd.DataFrame(lspi).expanding().mean()).reshape(-1) 136 | lines.append( ax1.plot(iterations, evaluation, color = lspi_color,markersize=7,linestyle='-' , label='Exact') ) 137 | legend.append( mlines.Line2D([], [], color=lspi_color, linestyle='-' , 138 | markersize=7, label='Percent of Track Covered') ) 139 | 140 | # DQN 141 | dqn_color = colors.next() 142 | dqn = np.array([DQN[0]]*len(last)) 143 | evaluation = np.array(pd.DataFrame(dqn).expanding().mean()).reshape(-1) 144 | lines.append( ax1.plot(iterations, evaluation, color = dqn_color,markersize=7,linestyle='-' , label='Exact') ) 145 | legend.append( mlines.Line2D([], [], color=dqn_color, linestyle='-' , 146 | markersize=7, label='Percent of Track Covered') ) 147 | 148 | # Pi_D 149 | pi_d_color = colors.next() 150 | evaluation = np.mean(discounted_costs[:,0]).reshape(-1) 151 | lines.append( ax1.plot(iterations, [evaluation]*len(iterations), color = pi_d_color,markersize=7,linestyle='-' , label='Exact') ) 152 | legend.append( mlines.Line2D([], [], color=pi_d_color, linestyle='-' , 153 | markersize=7, label='Percent of Track Covered') ) 154 | 155 | legend.append( mlines.Line2D([], [], color='k', linestyle='--' , 156 | markersize=7, label='Percent of Track Covered') ) 157 | 158 | ax1.set_xlabel('Iteration (t)', fontsize=fontsize) 159 | ax1.set_ylabel('Value (Main Objective)', color='k', fontsize=fontsize+2) 160 | ax1.tick_params(axis='y', labelcolor='k') 161 | ax1.set_ylim(bottom=-55, top=-15) 162 | # ax1.set_ylim(bottom=20, top=55) 163 | ax1.set_xlim(-.5, 25) 164 | labels = np.array(['FQE', 'Algo 2', 'Mean', 'Regularized LSPI', 'Online-RL \n(no constraint)', 'Algo 2 \n(Derandomized)', r'$\pi_D$', 'Constraint Threshold']) 165 | leg = Legend(ax1, 166 | np.array(legend)[[0,1,3,2,4,5]], 167 | labels[[1,5,4,3,6,7]], 168 | loc='lower left', 169 | bbox_to_anchor=(.05,.02), 170 | bbox_transform=fig.transFigure, 171 | ncol = 2, 172 | frameon=True, 173 | fontsize = legend_fontsize-1) 174 | ax1.add_artist(leg) 175 | 176 | plt.tick_params(axis='both', which='major', labelsize=major_tick_mark_size) 177 | plt.tick_params(axis='both', which='minor', labelsize=8) 178 | ax1.set_title('Main Objective - Accumulated Cost', fontsize=fontsize+2) 179 | # plt.subplots_adjust(right=0.7) 180 | # plt.tight_layout()#rect=[0,.2,1,1]) 181 | # plt.tight_layout(rect=[-.025,-.025,.675,1.025]) 182 | # plt.savefig('car_main_value_wo_band.png', format='png', dpi=300) 183 | # plt.show() 184 | # import pdb; pdb.set_trace() 185 | 186 | 187 | 188 | 189 | # car_color = colors.next() 190 | # c_values = np.array(data['c_eval'][0])[:,-10:] # shape = (iteration #, k) 191 | # last = np.mean(c_values, axis=1)#c_values[:,-1,0]#*100 192 | # evaluation = np.mean(c_values, axis=1)#*100 193 | # std = np.std(c_values, axis=1)#*100 194 | # lines.append( ax1.plot(iterations, last, color = 
car_color,marker='o',markersize=7, label='Percent of Track Covered') ) 195 | # fill_betweens.append( ax1.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = car_color, zorder = 10) ) 196 | # legend.append( mlines.Line2D([], [], color=car_color, marker='o', 197 | # markersize=7, label='Percent of Track Covered') ) 198 | 199 | #labels = np.array([r"$FQE: \;\; \frac{1}{10}\sum_{i=40}^{50}\widehat{C^{i}}(\pi_{Q_{50}})$", r"$Exact: \;\; C(\pi_{Q_{50}})$", r'$Mean: \;\; \frac{1}{10}\sum_{i=40}^{50} C(\pi_{Q_i})$']) 200 | # labels = np.array(['FQE', r'$Our \; C$', 'Mean', r'$DQN \; C$', r'$LSPI \; C$']) 201 | # leg = Legend(ax1, np.array(legend)[[0,1,2]], labels[[1,3,4]], title='Main Objective', loc=locations[0], frameon=False, fontsize = legend_fontsize) #shadow=True, fancybox=True, 202 | # ax1.add_artist(leg) 203 | # ax = ax1.twinx() 204 | 205 | # tex_labels = [[r'$FQE: \;\; \frac{1}{10}\sum_{i=40}^{50} \widehat{G^{i}_0}(\pi_{Q_{50}})$', r'$Exact: \;\; G_0(\pi_{Q_{50}})$'], 206 | # [r'$FQE: \;\; \frac{1}{10}\sum_{i=40}^{50} \widehat{G^{i}_1}(\pi_{Q_{50}})$', r'$Exact: \;\; G_1(\pi_{Q_{50}})$']] 207 | # tex_labels = [[r'$FQE \; G_0$', r'$Our \; G_0$'], [r'$FQE \; G_1$', r'$Our \; G_1$']] 208 | # baseline_labels = [[r'$DQN \; G_0$', r'$LSPI \; G_0$'], [r'$DQN \; G_1$', r'$LSPI \; G_1$']] 209 | tex_labels = [[r'$FQE \; G_0$', 'Algo 2', 'Algo 2 (Derandomized)'], [r'$FQE \; G_1$', 'Algo 2', 'Algo 2 (Derandomized)']] 210 | # baseline_labels = [['DDQN', 'LSPI'], ['DDQN', 'LSPI']] 211 | baseline_labels = [['Online-RL (no constraint)', 'Regularized LSPI', r'$\pi_D$'], ['Online-RL (no constraint)', 'Regularized LSPI', r'$\pi_D$']] 212 | # plt.clf() 213 | # fig, axs = plt.subplots(2, sharex=True) 214 | axs = [] 215 | axs.append(fig.add_subplot(grid[:3, 2:])) 216 | axs.append(fig.add_subplot(grid[3:, 2:])) 217 | 218 | 219 | for idx in data['g_eval'].keys(): 220 | colors = color_gen() 221 | ax = axs[idx] 222 | ax.grid(alpha=.35) 223 | legend = [] 224 | # FQE 225 | constraint = np.array(data['g_eval'][idx]) # shape = (iteration #, k) referring to Q_k 226 | constraint = constraint[:max_iterations,-10:] 227 | 228 | # evaluation = np.array(pd.DataFrame(np.mean(constraint, axis = 1)/constraint_upper_bound[idx]).expanding().mean()).reshape(-1) 229 | # std = np.array(pd.DataFrame(np.mean(constraint, axis = 1)/constraint_upper_bound[idx]).expanding().std()).reshape(-1) 230 | 231 | # evaluation = np.mean(constraint, axis = 1)/constraint_upper_bound[idx] 232 | # std = np.std(constraint, axis=1)/constraint_upper_bound[idx] 233 | 234 | label = constraint_names[idx] 235 | 236 | # lines.append( ax.plot(iterations, evaluation, color = color,marker='o',markersize=7, label=tex_labels[idx][0]) ) 237 | # fill_betweens.append( ax.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = color, zorder = 11+2*idx) ) 238 | 239 | #legend.append( mlines.Line2D([], [], color=color, marker='o', 240 | # markersize=7, label=tex_labels[idx][0]) ) 241 | 242 | # EXACT 243 | g_exacts = np.array(data['g_exacts'])[:max_iterations,idx]#/constraint_upper_bound[idx] 244 | evaluation = np.array(pd.DataFrame(g_exacts).expanding().mean()).reshape(-1) 245 | std = np.array(pd.DataFrame(g_exacts).expanding().std()).reshape(-1) 246 | 247 | lines.append( ax.plot(iterations, evaluation, color = car_color ,linestyle='-' ,linewidth=2.0, label=tex_labels[idx][1]) ) 248 | legend.append( mlines.Line2D([], [], color=car_color, linestyle='-' ,linewidth=2.0, 249 | markersize=7, label=tex_labels[idx][1]) ) 250 | 
251 | # Derandomized 252 | lines.append( ax.plot(df_derandom['iterations'], df_derandom['g_%s_derandomized' % idx], linewidth=2.0,color = derandom_color,linestyle='-',markersize=7, label='Exact') ) 253 | legend.append( mlines.Line2D([], [], color=derandom_color, linestyle='-' ,linewidth=2.0, 254 | markersize=7, label=tex_labels[idx][2]) ) 255 | 256 | 257 | # fill_betweens.append( ax.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = color, zorder = 11+2*idx+1) ) 258 | 259 | ## BASELINES 260 | 261 | # fill_betweens.append( ax.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = color, zorder = 11+2*idx+1) ) 262 | # LSPI 263 | baseline = np.array(LSPI.iloc[:,idx+1])*constraint_upper_bound[idx] 264 | evaluation = np.array(pd.DataFrame(baseline).expanding().mean()).reshape(-1) 265 | std = np.array(pd.DataFrame(baseline).expanding().std()).reshape(-1) 266 | 267 | lines.append( ax.plot(iterations, evaluation, color = lspi_color , linestyle='-', linewidth=2.0,label=baseline_labels[idx][1]) ) 268 | legend.append( mlines.Line2D([], [], color=lspi_color, linestyle='-', linewidth=2.0, 269 | markersize=7, label=baseline_labels[idx][1]) ) 270 | # fill_betweens.append( ax.fill_between(iterations,evaluation-std, evaluation+std, alpha = alpha, color = color, zorder = 11+2*idx+1) ) 271 | # DQN 272 | baseline = np.array([DQN[idx+1]]*len(evaluation))#/constraint_upper_bound[idx] 273 | evaluation = np.array(pd.DataFrame(baseline).expanding().mean()).reshape(-1) 274 | std = np.array(pd.DataFrame(baseline).expanding().std()).reshape(-1) 275 | 276 | lines.append( ax.plot(iterations, evaluation, color = dqn_color , linestyle='-' ,linewidth=2.0, label=baseline_labels[idx][0]) ) 277 | legend.append( mlines.Line2D([], [], color=dqn_color, linestyle='-',linewidth=2.0, 278 | markersize=7, label=baseline_labels[idx][0]) ) 279 | 280 | # Pi_D 281 | evaluation = np.mean(discounted_costs[:,idx+1]).reshape(-1) 282 | lines.append( ax.plot(iterations, [evaluation]*len(iterations), color = pi_d_color,markersize=7,linewidth=2.0,linestyle='-' , label=baseline_labels[idx][2]) ) 283 | legend.append( mlines.Line2D([], [], color=pi_d_color, linestyle='-' ,linewidth=2.0, 284 | markersize=7, label=baseline_labels[idx][2]) ) 285 | 286 | 287 | # THRESHOLD 288 | constraint_violation = [constraint_upper_bound[idx]]*len(iterations) 289 | lines.append( ax.plot(iterations, constraint_violation, color = 'k', linestyle=':', linewidth=2.0, marker=None) ) 290 | legend.append( mlines.Line2D([], [], color='k', linestyle=':', marker=None, linewidth=2.0, label='Constraint Threshold') ) 291 | 292 | 293 | labels = [] 294 | for i in range(len(legend)): 295 | line = legend[i] 296 | try: 297 | labels += [line.label] 298 | except: 299 | labels += [line.get_label()] 300 | 301 | if idx == 1: 302 | # leg = Legend(ax, 303 | # legend, 304 | # labels, 305 | # title=label, 306 | # # loc='center left', 307 | # bbox_to_anchor=(0.5, -0.05), 308 | # ncol = 3, 309 | # frameon=False, 310 | # # bbox_to_anchor=(1, 0.5), 311 | # fontsize = legend_fontsize-3) 312 | leg = Legend(ax, 313 | legend, 314 | labels, 315 | loc='lower center', 316 | bbox_to_anchor=(.5,0), 317 | bbox_transform=fig.transFigure, 318 | ncol = 2, 319 | frameon=True, 320 | fontsize = legend_fontsize-2) 321 | 322 | plt.setp(leg.get_title(),fontsize='%s' % legend_title_fontsize) 323 | # ax.add_artist(leg) 324 | 325 | if idx == 1: 326 | ax.set_xlabel('Iteration (t)', fontsize=fontsize) 327 | lab = ['Value (Braking)', 'Value (Lane Center)'][idx] 328 | 
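# Descriptive note: the dotted 'Constraint Threshold' lines correspond to
# constraint_upper_bound = [5.8, 85.] (braking and center-of-lane budgets);
# each panel shows the running mean of the exact constraint value for
# Algo 2 and its derandomized variant against the regularized LSPI,
# online-RL (DQN) and pi_D baselines.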
ax.set_ylabel(lab, color='k', fontsize=fontsize+2) 329 | # ax.set_ylim(bottom=-1, top=3) 330 | ax.tick_params(axis='y', labelcolor='k') 331 | if idx == 0: ax.set_ylim(bottom=-2, top=15) 332 | ax.set_xlim(-.5, 25) 333 | axs[0].tick_params(axis='both', which='major', labelsize=major_tick_mark_size) 334 | axs[0].set_xticklabels([]) 335 | axs[1].tick_params(axis='both', which='major', labelsize=major_tick_mark_size) 336 | axs[0].set_title('Constraints - Accumulated Cost', fontsize=fontsize+2) 337 | plt.tight_layout(rect=[-.025,-.025,1.025,1.025])#rect=[0,.2,1,1]) 338 | plt.savefig('car_all_values_wo_band.png', format='png', dpi=300) 339 | plt.show() 340 | 341 | import pdb; pdb.set_trace() 342 | 343 | 344 | -------------------------------------------------------------------------------- /plot_results.py: -------------------------------------------------------------------------------- 1 | import deepdish as dd 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.lines as mlines 8 | from matplotlib.legend import Legend 9 | import seaborn as sns; sns.set(color_codes=True) 10 | import os 11 | from exponentiated_gradient import ExponentiatedGradient 12 | from matplotlib.lines import Line2D 13 | 14 | # Colors 15 | alpha = 0.15 16 | sns.set(style="whitegrid", palette="Paired") 17 | colorSet = sns.color_palette("Paired", 10); 18 | 19 | def color_gen(): 20 | 21 | colors = ["dark pink","dusty purple", "amber", "faded green", "windows blue", ] 22 | colors = sns.xkcd_palette(colors) 23 | idx = -1 24 | while 1: 25 | idx = (idx + 1) % len(colors) 26 | yield colors[idx] 27 | 28 | plt.rc('text', usetex=True) 29 | EG = ExponentiatedGradient(5., 2, 10.) 30 | 31 | path = os.path.join(os.getcwd(), 'experimental_results') 32 | files = os.listdir(path) 33 | csvs = [f for f in files if 'experiment_results' in f] 34 | tmp = pd.DataFrame([csv.split('.csv')[0].split('_')[2:] for csv in csvs], columns=['year','month','day','hour','minute']) 35 | results_file = 'experiment_results_' + '_'.join(tmp.sort_values(by=['year','month','day','hour','minute'], ascending=False).iloc[0]) + '.csv' 36 | 37 | # results_file = 'experiment_results_12_18_2018_22_20.csv' 38 | df = pd.read_csv(os.path.join(path, results_file)) 39 | df['iteration'] -= 2 40 | 41 | df['primal_dual_gap'] = df['max_L'] - df['min_L'] 42 | 43 | # plt.plot(df['iteration'], df['primal_dual_gap']) 44 | # plt.show() 45 | def unrandomize(df, constraints, min_iteration): 46 | df = df[df['iteration'] >= min_iteration] 47 | 48 | out = [] 49 | for iteration in range(min_iteration, int(max(df['iteration']))): 50 | df_tmp = df[df['iteration'] <= iteration] 51 | df_tmp = df_tmp[df_tmp['g_pi'] < constraints[0]] 52 | argmin = np.argmin(np.array(df_tmp['c_pi'])) 53 | out.append(np.hstack([iteration, np.array(df_tmp.iloc[argmin][['c_pi_exact', 'g_pi_exact']]) ])) 54 | 55 | return pd.DataFrame(out, columns=['iteration', 'c_unrandomized', 'g_unrandomized']) 56 | 57 | 58 | df_unrandom = unrandomize(df, [.1], 5) 59 | 60 | 61 | # f, ax = plt.subplots() 62 | # ax.plot(df['iteration'], df['primal_dual_gap']) 63 | # ax.set_title('Primal Dual Gap') 64 | 65 | # import pdb; pdb.set_trace() 66 | 67 | fontsize=20 68 | colors = color_gen() 69 | color_optimal = colors.next() 70 | plt.plot(df['iteration'], df['primal_dual_gap'], color='b', label='Empirical Gap') 71 | plt.plot(df['iteration'], [0]*len(df['iteration']), color=color_optimal, label='Minimum/Optimal Gap') 72 | plt.xlabel('Iteration ' 
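# Descriptive note: primal_dual_gap = max_L - min_L is the empirical gap of
# the Lagrangian game driven by the exponentiated-gradient updates; the plot
# tracks it per iteration against the zero ('Minimum/Optimal Gap') line, a
# shrinking gap indicating convergence of the returned mixed policy.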
+ r'$(t)$', fontsize=fontsize) 73 | plt.ylabel('Primal-Dual Gap ' + r'$(\widehat{L}_{max} - \widehat{L}_{min})$', fontsize=fontsize) 74 | plt.legend(fontsize=fontsize) 75 | plt.tick_params(axis='both', which='major', labelsize=fontsize) 76 | plt.xlim((-1,150)) 77 | plt.ylim((-.02,2)) 78 | plt.tight_layout() 79 | plt.title('Convergence Behavior of Algo 2', fontsize=fontsize) 80 | plt.savefig('lake_primal_dual_gap.png', format='png', dpi=300) 81 | plt.clf() 82 | plt.show() 83 | 84 | # W BANDS 85 | fontsize=16 86 | f, axarr = plt.subplots(2, sharex=True) 87 | colors = color_gen() 88 | color_optimal = colors.next() 89 | color_main = colors.next() 90 | vals = pd.DataFrame(df['c_pi']) 91 | evaluation = np.array(vals.expanding().mean()).reshape(-1) 92 | std = np.array(vals.expanding().std()).reshape(-1) 93 | axarr[0].plot(df['iteration'], evaluation, color=color_main, label='Algo 2') 94 | axarr[0].fill_between(df['iteration'],evaluation-std, evaluation+std, alpha = alpha, color = color_main, zorder = 11) 95 | color_pi_d = colors.next() 96 | axarr[0].plot(df['iteration'], [-9.94428910084026e-05]*len(df['iteration']), color=color_pi_d, label=r'$\pi_D$') 97 | axarr[0].fill_between(df['iteration'],[-9.94428910084026e-05-0.002297397386833141]*len(df['iteration']), [-9.94428910084026e-05+0.002297397386833141]*len(df['iteration']), alpha = alpha, color = color_main, zorder = 11) 98 | # axarr[0].set_ylabel('Main Objective Value \n of ' + r'$\widehat{\pi_t}$', fontsize=fontsize) 99 | axarr[0].plot(df['iteration'], [-(.9**13)]*len(df['iteration']), color=color_optimal, label='Optimal Value') 100 | axarr[0].set_ylabel('Main Objective Value', fontsize=fontsize-2) 101 | line0 = Line2D([0,1],[0,1],linestyle='-', color=color_main) 102 | line2 = Line2D([0,1],[0,1],linestyle='-', color=color_pi_d) 103 | line4 = Line2D([0,1],[0,1],linestyle='-', color=color_optimal) 104 | # axarr[0].legend([line0, line2, line4], ['Algo 2', r'$\pi_D$', 'Optimal Value'], loc='lower right', fontsize=12, frameon=True) 105 | # axarr[0].legend(fontsize=fontsize, loc='lower right', frameon=True) 106 | axarr[0].grid(alpha=.35) 107 | 108 | 109 | evaluation = np.array(pd.DataFrame(df['g_pi_exact']).expanding().mean()).reshape(-1) 110 | std = np.array(pd.DataFrame(df['g_pi_exact']).expanding().std()).reshape(-1) 111 | axarr[1].plot(df['iteration'], evaluation, linewidth=2.0, linestyle=(0,[8,8]), color=color_main, label=r'$G(\widehat{\pi_t})$') 112 | axarr[1].fill_between(df['iteration'],evaluation-std, evaluation+std, alpha = alpha, color = color_main, zorder = 11) 113 | axarr[1].plot(df['iteration'], [0.15173932921437544]*len(df['iteration']), color=color_pi_d, label=r'$pi_D$') 114 | axarr[1].fill_between(df['iteration'],[0.15173932921437544-0.162341876715503]*len(df['iteration']),[0.15173932921437544+0.162341876715503]*len(df['iteration']), alpha = alpha, color = color_pi_d, zorder = 11) 115 | axarr[1].plot(df['iteration'], [0]*len(df['iteration']), color=color_optimal, linewidth=2., linestyle=(8,[8,8]), label='Optimal value') 116 | axarr[1].plot(df['iteration'], [.1]*len(df['iteration']), color = 'k', linestyle=':', linewidth=2.0, label='Threshold', marker=None) 117 | 118 | line0 = Line2D([0,1],[0,1],linestyle='-', color=color_main) 119 | line2 = Line2D([0,1],[0,1],linestyle='-', color=color_pi_d) 120 | line4 = Line2D([0,1],[0,1],linestyle='-', color=color_optimal) 121 | line5 = Line2D([0,1],[0,1],linestyle='--', color='k') 122 | axarr[1].legend([line0, line2, line4, line5], ['Algo 2', r'$\pi_D$', 'Optimal Value', 'Constraint Threshold'], 
loc='lower center', bbox_to_anchor=(.5,0), bbox_transform=f.transFigure, ncol = 2, fontsize=fontsize, frameon=True) 123 | 124 | # axarr[1].set_ylabel('Constraint Value \n of ' + r'$\widehat{\pi_t}$', fontsize=fontsize) 125 | axarr[1].set_ylabel('Constraint Value', fontsize=fontsize-2) 126 | # axarr[1].legend([line0, line1, line2, line3], ['Our Algorithm', 'DDQN (no constraint)', 'Optimal Value', 'Threshold'], loc='lower right', fontsize=fontsize, frameon=True) 127 | axarr[1].grid(alpha=.35) 128 | axarr[1].set_ylim(-.05, .35) 129 | 130 | plt.xlabel('Iteration ' + r'$(t)$', fontsize=fontsize) 131 | plt.xlim((-1,150)) 132 | # plt.ylim((-.02,2)) 133 | plt.tight_layout(rect=[0,.15,1,1]) 134 | fig = plt.gcf() 135 | size = fig.get_size_inches() 136 | # fig.set_size_inches(size[0], size[1]+.75) 137 | axarr[0].set_title('Main Objective and Constraint - Accumulated Cost', fontsize=fontsize) 138 | plt.savefig('lake_values.png', format='png', dpi=300) 139 | plt.show() 140 | 141 | # WO BANDS 142 | f, axarr = plt.subplots(2, sharex=True) 143 | colors = color_gen() 144 | color_optimal = colors.next() 145 | color_main = colors.next() 146 | vals = pd.DataFrame(df['c_pi']) 147 | evaluation = np.array(vals.expanding().mean()).reshape(-1) 148 | std = np.array(vals.expanding().std()).reshape(-1) 149 | axarr[0].plot(df['iteration'], evaluation, color=color_main, label='Algo 2') 150 | # axarr[0].fill_between(df['iteration'],evaluation-std, evaluation+std, alpha = alpha, color = color_main, zorder = 11) 151 | color_pi_d = colors.next() 152 | axarr[0].plot(df['iteration'], [-9.94428910084026e-05]*len(df['iteration']), color=color_pi_d, label=r'$\pi_D$') 153 | spacing = 8 154 | axarr[0].plot(df['iteration'], [-(.9**13)]*len(df['iteration']), linestyle=(0*spacing,[spacing,spacing*2]), color=color_optimal, label='Optimal Value') 155 | color_ddqn = colors.next() 156 | axarr[0].plot(df['iteration'], [-(.9**13)]*len(df['iteration']), linestyle=(1*spacing,[spacing,spacing*2]), color=color_ddqn, label='DDQN (no constraint)') 157 | color_unrandomized = colors.next() 158 | # axarr[0].plot(df_unrandom['iteration'], df_unrandom['c_unrandomized'], linestyle=(16,[8,24]), color=color_unrandomized, label='Algo 2 (Unrandomized)') 159 | axarr[0].plot(df['iteration'], [-(.9**13)]*len(df['iteration']), linestyle=(2*spacing,[spacing,spacing*2]), color=color_unrandomized, label='Algo 2 (Unrandomized)') 160 | 161 | # axarr[0].set_ylabel('Main Objective Value \n of ' + r'$\widehat{\pi_t}$', fontsize=fontsize) 162 | axarr[0].set_ylabel('Main Objective Value', fontsize=fontsize-2) 163 | line0 = Line2D([0,1],[0,1],linestyle='-', color=color_main) 164 | line1 = Line2D([0,1],[0,1],linestyle='-', color=color_unrandomized) 165 | line2 = Line2D([0,1],[0,1],linestyle='-', color=color_pi_d) 166 | line3 = Line2D([0,1],[0,1],linestyle='-', color=color_ddqn) 167 | line4 = Line2D([0,1],[0,1],linestyle='-', color=color_optimal) 168 | # axarr[0].legend([line0, line1, line2, line3, line4], ['Algo 2', 'Algo 2 (Derandomized)', r'$\pi_D$', 'Online-RL (no constraint)', 'Optimal Value' ], loc='lower right', fontsize=12, frameon=True) 169 | # axarr[0].legend(fontsize=fontsize, loc='lower right', frameon=True) 170 | axarr[0].grid(alpha=.35) 171 | 172 | 173 | evaluation = np.array(pd.DataFrame(df['g_pi_exact']).expanding().mean()).reshape(-1) 174 | std = np.array(pd.DataFrame(df['g_pi_exact']).expanding().std()).reshape(-1) 175 | axarr[1].plot(df['iteration'], evaluation, linewidth=2.0, linestyle=(0*spacing,[spacing,spacing*3]), color=color_main, 
label=r'$G(\widehat{\pi_t})$') 176 | # axarr[1].fill_between(df['iteration'],evaluation-std, evaluation+std, alpha = alpha, color = color_main, zorder = 11) 177 | axarr[1].plot(df['iteration'], [0.15173932921437544]*len(df['iteration']), color=color_pi_d, label=r'$pi_D$') 178 | axarr[1].plot(df['iteration'], [0]*len(df['iteration']), color=color_optimal, linewidth=2.0, linestyle=(1*spacing,[spacing,spacing*3]), label='Optimal value') 179 | axarr[1].plot(df['iteration'], [0]*len(df['iteration']), color=color_unrandomized, linewidth=2.0, linestyle=(2*spacing,[spacing,spacing*3]), label='Unrandomized') 180 | axarr[1].plot(df['iteration'], [0]*len(df['iteration']), color=color_ddqn, linewidth=2.0, linestyle=(3*spacing,[spacing,spacing*3]), label='DDQN') 181 | axarr[1].plot(df['iteration'], [.1]*len(df['iteration']), color = 'k', linestyle=':', linewidth=2.0, label='Threshold', marker=None) 182 | 183 | line0 = Line2D([0,1],[0,1],linestyle='-', color=color_main) 184 | line1 = Line2D([0,1],[0,1],linestyle='-', color=color_unrandomized) 185 | line2 = Line2D([0,1],[0,1],linestyle='-', color=color_pi_d) 186 | line3 = Line2D([0,1],[0,1],linestyle='-', color=color_ddqn) 187 | line4 = Line2D([0,1],[0,1],linestyle='-', color=color_optimal) 188 | line5 = Line2D([0,1],[0,1],linestyle='--', color='k') 189 | axarr[1].legend([line0, line1, line2, line3, line4, line5], ['Algo 2', 'Algo 2 (Derandomized)', r'$\pi_D$', 'Online-RL (no constraint)', 'Optimal Value', 'Constraint Threshold'], loc='lower center', bbox_to_anchor=(.5,0), bbox_transform=f.transFigure, ncol = 2, fontsize=fontsize-2, frameon=True) 190 | 191 | # axarr[1].set_ylabel('Constraint Value \n of ' + r'$\widehat{\pi_t}$', fontsize=fontsize) 192 | axarr[1].set_ylabel('Constraint Value', fontsize=fontsize-2) 193 | # axarr[1].legend([line0, line1, line2, line3], ['Our Algorithm', 'DDQN (no constraint)', 'Optimal Value', 'Threshold'], loc='lower right', fontsize=fontsize, frameon=True) 194 | axarr[1].grid(alpha=.35) 195 | 196 | plt.xlabel('Iteration ' + r'$(t)$', fontsize=fontsize) 197 | plt.xlim((-1,150)) 198 | # plt.ylim((-.02,2)) 199 | plt.tight_layout(rect=[0,.18,1,1]) 200 | fig = plt.gcf() 201 | size = fig.get_size_inches() 202 | axarr[0].set_title('Main Objective and Constraint - Accumulated Cost', fontsize=fontsize) 203 | # fig.set_size_inches(size[0], size[1]+.75) 204 | plt.savefig('lake_values_wo_band.png', format='png', dpi=300) 205 | plt.show() 206 | 207 | 208 | 209 | 210 | 211 | 212 | # # Two subplots, the axes array is 1-d 213 | # number_of_constraints = len([col for col in df.columns if 'g_avg_' in col]) 214 | # f, axarr = plt.subplots(2+number_of_constraints, sharex=True) 215 | # axarr[0].plot(df['iteration'], df['primal_dual_gap'], label='gap') 216 | # # axarr[0].plot(df['iteration'], pd.ewma(df['primal_dual_gap'], span=100), label='moving average') 217 | # axarr[0].plot(df['iteration'], [0]*len(df['iteration']), color='g', label='Minimum/Optimal Gap') 218 | # axarr[0].set_title('Primal Dual Gap') 219 | # axarr[0].legend() 220 | # axarr[1].plot(df['iteration'], df['c_avg'], color='b', label='C fqe') 221 | # axarr[1].plot(df['iteration'], df['c_exact_avg'], color = 'r', label='C exact') 222 | # # axarr[1].plot(df['iteration'], [-0.254186583]*len(df['iteration']), color='g', label='C optimal') 223 | # # axarr[1].scatter(0, -2.763302804497763e-05, marker='x', color='k', label='C pi_old') 224 | # axarr[1].set_title('Value of C of mean policy') 225 | # axarr[1].legend() 226 | # for col in range(number_of_constraints): 227 | # 
axarr[2+col].plot(df['iteration'], df['g_avg_%s' % col], color='b', label='G_%s fqe' % col) 228 | # axarr[2+col].plot(df['iteration'], df['g_exact_avg_%s' % col], color='r', label='G_%s exact' % col) 229 | # # axarr[2].plot(df['iteration'], [0.]*len(df['iteration']), color='g', label='G optimal') 230 | # # axarr[2].scatter(0, 0.13755537388963082, marker='x', color='k', label='G pi_old') 231 | # axarr[2+col].set_title('Value of G_%s of mean policy' % col) 232 | # axarr[2+col].legend() 233 | # plt.show() 234 | 235 | # # Number episodes achieved goal: 5. Number episodes fell in hole: 4890 236 | # # C(pi_old): -7.560596707938992e-06. G(pi_old): 0.13777596062648703 237 | # # Percentage of State/Action space seen: 0.943396226415 238 | 239 | -------------------------------------------------------------------------------- /print_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class PrintPolicy(object): 4 | def __init__(self, size=[4,4], env=None): 5 | self.mapping = {0:'<', 1:'v', 2:'>', 3:'^'} 6 | self.size = size 7 | self.action_space_dim = len(self.mapping.keys()) 8 | self.env = env 9 | 10 | def pprint(self, *args): 11 | if len(args) == 1: 12 | pi = args[0] 13 | size = self.size[0]*self.size[1] 14 | if not isinstance(pi,(list,)): 15 | pi = [pi] 16 | 17 | if len(pi) == 0: return 18 | 19 | 20 | states = np.array(range(size)).reshape(1,-1).T 21 | actions_for_each_pi = np.hstack([[np.eye(self.action_space_dim)[p.min_over_a(np.arange(size))[1]] for p in pi]]) 22 | policy = np.hstack([states, np.argmax(actions_for_each_pi.mean(0), 1).reshape(1,-1).T]) 23 | 24 | Qs_for_each_pi = np.vstack([np.array([p.all_actions(np.arange(size))]) for p in pi]) 25 | Q = np.hstack([states, np.mean(Qs_for_each_pi,axis=0)[np.arange(Qs_for_each_pi.shape[1]),policy[:,1]].reshape(-1,1)]) 26 | else: 27 | raise 28 | 29 | direction_grid = [['H' for x in range(self.size[1])] for y in range(self.size[0])] 30 | direction_grid[-1][-1] = 'G' 31 | # direction_grid[0][0] = ' S ' 32 | 33 | Q_grid = [[' H ' for x in range(self.size[1])] for y in range(self.size[0])] 34 | Q_grid[-1][-1] = ' G ' 35 | # Q_grid[0][0] = ' S ' 36 | 37 | 38 | 39 | 40 | for direction in policy: 41 | row = int(direction[0]/self.size[1]) 42 | col = int(direction[0] - row*int(self.size[1])) 43 | direction_grid[row][col] = self.mapping[direction[1]] 44 | 45 | for value in Q: 46 | row = int(value[0]/self.size[1]) 47 | col = int(value[0] - row*int(self.size[1])) 48 | Q_grid[row][col] = value[1] 49 | 50 | if self.env is not None: 51 | direction_grid = np.array(direction_grid) 52 | Q_grid = np.array(Q_grid).astype(str) 53 | 54 | holes = np.where(self.env.desc == 'H') 55 | starts = np.where(self.env.desc == 'S') 56 | goals = np.where(self.env.desc == 'G') 57 | 58 | 59 | direction_grid[holes] = 'H' 60 | # direction_grid[starts] = 'S' 61 | direction_grid[goals] = 'G' 62 | Q_grid[holes] = ' H ' 63 | # Q_grid[starts] = ' S ' 64 | Q_grid[goals] = ' G ' 65 | 66 | direction_grid = direction_grid.tolist() 67 | Q_grid = Q_grid.tolist() 68 | 69 | 70 | for i in range(2*len(direction_grid)+1): 71 | row = [] 72 | for j in range(2*len(direction_grid[0])+1): 73 | if (i % 2) == 1 & (j % 2) == 1: 74 | row.append(direction_grid[(i-1)/2][(j-1)/2]) 75 | elif (j % 2) == 0: 76 | row.append('|') 77 | else: 78 | row.append('_') 79 | print ' '.join(row) 80 | print 81 | 82 | for i in range(2*len(Q_grid)+1): 83 | row = [] 84 | for j in range(2*len(Q_grid[0])+1): 85 | if (i % 2) == 1 & (j % 2) == 1: 86 | try: 87 | val = 
float(Q_grid[(i-1)/2][(j-1)/2]) 88 | sign = '+'*(val > 0) + '-'*(val<=0) 89 | val = str(np.abs(round(val,2))) 90 | row.append(sign + val) 91 | except: 92 | val = Q_grid[(i-1)/2][(j-1)/2] 93 | row.append(val) 94 | elif (j % 2) == 0: 95 | row.append('|') 96 | else: 97 | row.append('_____') 98 | print ' '.join(row) 99 | print 100 | -------------------------------------------------------------------------------- /replay_buffer.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import deepdish as dd 4 | 5 | class Buffer(object): 6 | """ 7 | This saves the agent's experience in windowed cache. 8 | Each frame is saved only once but state is stack of num_frame_stack frames 9 | 10 | In the beginning of an episode the frame-stack is padded 11 | with the beginning frame 12 | """ 13 | 14 | def __init__(self, 15 | num_frame_stack=1, 16 | buffer_size=10000, 17 | min_buffer_size_to_train=1000, 18 | pic_size = (96,96), 19 | action_space_dim = 4, 20 | n_costs = (), 21 | ): 22 | self.n_costs = n_costs 23 | self.pic_size = pic_size 24 | self.action_space_dim = action_space_dim 25 | self.num_frame_stack = num_frame_stack 26 | self.capacity = buffer_size 27 | self.counter = -1 28 | self.exp_idx = -1 29 | self.frame_window = None 30 | self.max_frame_cache = self.capacity + 2 * self.num_frame_stack + 1 31 | self.frame_idx = self.counter % self.max_frame_cache 32 | self.init_caches() 33 | self.expecting_new_episode = True 34 | self.min_buffer_size_to_train = min_buffer_size_to_train 35 | self.data = {'x':[], 'a':[], 'x_prime':[], 'c':[], 'g':[], 'done':[], 'cost':[]} 36 | 37 | def append(self, action, frame, reward, done): 38 | assert self.frame_window is not None, "start episode first" 39 | self.counter += 1 40 | self.frame_idx = self.counter % self.max_frame_cache 41 | self.exp_idx = (self.exp_idx + 1) % self.capacity 42 | 43 | exp_idx = self.exp_idx 44 | self.prev_states[exp_idx] = self.frame_window 45 | self.frame_window = np.append(self.frame_window[1:], self.frame_idx) 46 | self.next_states[exp_idx] = self.frame_window 47 | self.actions[exp_idx] = action 48 | self.is_done[exp_idx] = done 49 | self.frames[self.frame_idx] = frame 50 | self.rewards[exp_idx] = reward 51 | if done: 52 | self.expecting_new_episode = True 53 | 54 | def start_new_episode(self, frame): 55 | # it should be okay not to increment counter here 56 | # because episode ending frames are not used 57 | assert self.expecting_new_episode, "previous episode didn't end yet" 58 | self.counter += 1 59 | self.frame_idx = self.counter % self.max_frame_cache 60 | self.frame_window = np.repeat(self.frame_idx, self.num_frame_stack) 61 | self.frames[self.frame_idx] = frame 62 | self.expecting_new_episode = False 63 | 64 | def is_over(self): 65 | return self.expecting_new_episode 66 | 67 | def get_length(self): 68 | return min(self.capacity, self.exp_idx) 69 | 70 | def sample(self, N): 71 | count = min(self.capacity, self.exp_idx) 72 | minimum = max(count-40000, 0) # UNHARDCODE THIS. 
THIS IS FOR USING BUFFER AS SAVER + Exp Replay 73 | batchidx = np.random.randint(minimum, count, size=N) 74 | 75 | x = self.frames[self.prev_states[batchidx]] 76 | action = self.actions[batchidx] 77 | x_prime = self.frames[self.next_states[batchidx]] 78 | reward = self.rewards[batchidx] 79 | done = self.is_done[batchidx] 80 | 81 | return [x, action, x_prime, reward, done] 82 | 83 | def get_all(self, key): 84 | valid_states = min(self.capacity, self.exp_idx) 85 | if key == 'x': 86 | return self.frames[self.prev_states[:valid_states]] 87 | elif key == 'a': 88 | return self.actions[:valid_states] 89 | elif key == 'x_prime': 90 | return self.frames[self.next_states[:valid_states]] 91 | elif key == 'c': 92 | return self.rewards[:valid_states][:, 0] 93 | elif key == 'g': 94 | return self.rewards[:valid_states][:, 1:] 95 | elif key == 'done': 96 | return self.is_done[:valid_states] 97 | elif key == 'cost': 98 | return [] 99 | elif key == 'frames': 100 | maximum = max(np.max(self.prev_states[:valid_states]), np.max(self.next_states[:valid_states])) + 1 101 | return self.frames[:maximum] 102 | elif key == 'prev_states': 103 | return self.prev_states[:valid_states] 104 | elif key == 'next_states': 105 | return self.next_states[:valid_states] 106 | else: 107 | raise 108 | 109 | def is_enough(self): 110 | return self.exp_idx > self.min_buffer_size_to_train 111 | 112 | def current_state(self): 113 | # assert not self.expecting_new_episode, "start new episode first"' 114 | assert self.frame_window is not None, "do something first" 115 | if len(self.pic_size) == 2: 116 | return np.rollaxis(self.frames[self.frame_window], 0,3) 117 | else: 118 | return self.frames[self.frame_window] 119 | 120 | def init_caches(self): 121 | self.rewards = np.empty((self.capacity,) + self.n_costs, dtype="float64") 122 | self.prev_states = np.empty((self.capacity, self.num_frame_stack), dtype="uint32") 123 | self.next_states = np.empty((self.capacity, self.num_frame_stack), dtype="uint32") 124 | self.is_done = np.empty(self.capacity, "uint8") 125 | self.actions = np.empty((self.capacity), dtype="uint8") 126 | self.frames = np.empty((self.max_frame_cache,) + self.pic_size, dtype="uint8") 127 | 128 | def get_state_action_pairs(self, env_type='lake'): 129 | if 'state_action' in self.data: 130 | return self.data['state_action'] 131 | else: 132 | if env_type == 'lake': 133 | pairs = [np.array(self.data['x']), np.array(self.data['a']).reshape(1,-1).T ] 134 | elif env_type == 'car': 135 | pairs = [np.array(self.data['x']), np.array(self.data['a']).reshape(1,-1).T ] 136 | self.data['state_action'] = pairs 137 | 138 | def calculate_cost(self, lamb): 139 | self.scale = np.max(np.abs(np.array(self.data['c'] + np.dot(lamb[:-1], np.array(self.data['g']).T)))) 140 | costs = np.array(self.data['c'] + np.dot(lamb[:-1], np.array(self.data['g']).T))/self.scale 141 | 142 | 143 | # costs = costs/np.max(np.abs(costs)) 144 | self.data['cost'] = costs.tolist() 145 | 146 | def set_cost(self, key, idx=None): 147 | if key == 'g': assert idx is not None, 'Evaluation must be done per constraint until parallelized' 148 | 149 | if key == 'c': 150 | self.scale = np.max(np.abs(self.data['c'])) 151 | self.data['cost'] = self.data['c']/self.scale 152 | elif key == 'g': 153 | # Pick the idx'th constraint 154 | self.scale = np.max(np.abs(self.data['g'][:,idx])) 155 | self.data['cost'] = self.data['g'][:,idx]/self.scale 156 | else: 157 | raise 158 | 159 | def preprocess(self, env_type): 160 | 161 | for key in self.data: 162 | self.data[key] = self.get_all(key) 
163 | 164 | def save(self, path): 165 | #data = {'frames':self.frames, 'prev_states':self.prev_states, 'next_states':self.next_states, 'rewards':self.rewards, 'is_done':self.is_done, 'actions':self.actions} 166 | #for data, key in zip([self.frames, self.prev_states, self.next_states, self.rewards, self.is_done, self.actions],['frames', 'prev_astates', 'next_states', 'costs', 'is_done', 'actions']) 167 | # dd.io.save(path % key, data) 168 | count = min(self.capacity, self.counter) 169 | dd.io.save(path.format('frames'), self.frames[:count]) 170 | dd.io.save(path.format('prev_states'), self.prev_states[:count]) 171 | dd.io.save(path.format('next_states'), self.next_states[:count]) 172 | dd.io.save(path.format('rewards'), self.rewards[:count]) 173 | dd.io.save(path.format('is_done'), self.is_done[:count]) 174 | dd.io.save(path.format('actions'), self.actions[:count]) 175 | 176 | 177 | 178 | class Dataset(Buffer): 179 | def __init__(self, num_frame_stack, pic_size, n_costs): 180 | 181 | self.pic_size = pic_size 182 | self.num_frame_stack = num_frame_stack 183 | self.data = {'frames':[], 'prev_states':[], 'a':[], 'next_states':[], 'c':[], 'g':[], 'done':[], 'cost':[], 'x_prime_repr':[], 'x_repr':[]} 184 | self.max_trajectory_length = 0 185 | self.n_costs = n_costs 186 | self.episodes = [Buffer(num_frame_stack=self.num_frame_stack,buffer_size=int(200000),min_buffer_size_to_train=0,pic_size = self.pic_size, n_costs = self.n_costs)] 187 | 188 | def append(self, *args): 189 | self.episodes[-1].append(*args) 190 | 191 | # update max_trajectory_length 192 | if self.episodes[-1].get_length() > self.max_trajectory_length: 193 | self.max_trajectory_length = self.episodes[-1].get_length() 194 | 195 | def start_new_episode(self, *args): 196 | # self.episodes.append(Buffer(num_frame_stack=self.num_frame_stack,buffer_size=int(2000),min_buffer_size_to_train=0,pic_size = self.pic_size, n_costs = self.n_costs)) 197 | self.episodes[-1].start_new_episode(args[0]) 198 | 199 | def current_state(self): 200 | return self.episodes[-1].current_state() 201 | 202 | def get_max_trajectory_length(self): 203 | return self.max_trajectory_length 204 | 205 | def __getitem__(self, key): 206 | return self.data[key] 207 | 208 | def __setitem__(self, key, item): 209 | self.data[key] = item 210 | 211 | def __len__(self): 212 | return len(self.data['a'])-5 213 | 214 | def preprocess(self, env_type): 215 | 216 | for key in ['frames', 'prev_states', 'next_states', 'a', 'done', 'c', 'g']: 217 | self.data[key] = self.episodes[-1].get_all(key) 218 | 219 | # [x.preprocess(env_type) for x in self.episodes] 220 | 221 | # for key in self.data: 222 | # if key in ['g', 'prev_states', 'next_states', 'frames']: 223 | # try: 224 | # self.data[key] = np.vstack([x.get_all[key] for x in self.episodes])#.tolist() 225 | # except: 226 | # self.data[key] = np.hstack([x.get_all[key] for x in self.episodes])#.tolist() 227 | # else: 228 | # self.data[key] = np.hstack([x.get_all[key] for x in self.episodes])#.tolist() 229 | 230 | # if env_type == 'lake': 231 | # if key in ['g']: 232 | # try: 233 | # self.data[key] = np.vstack([x[key] for x in self.episodes]).tolist() 234 | # except: 235 | # self.data[key] = np.hstack([x[key] for x in self.episodes]).tolist() 236 | # else: 237 | # self.data[key] = np.hstack([x[key] for x in self.episodes]).tolist() 238 | # elif env_type == 'car': 239 | # if key in ['g', 'x', 'x_prime']: 240 | # try: 241 | # self.data[key] = np.vstack([x[key] for x in self.episodes]).tolist() 242 | # except: 243 | # self.data[key] = 
np.hstack([x[key] for x in self.episodes]).tolist() 244 | # else: 245 | # self.data[key] = np.hstack([x[key] for x in self.episodes]).tolist() 246 | # else: 247 | # raise 248 | # [x.get_state_action_pairs(env_type) for x in self.episodes] 249 | # self.get_state_action_pairs(env_type) 250 | 251 | def get_state_action_pairs(self, env_type='lake'): 252 | # if 'state_action' in self.data: 253 | # return self.data['state_action'] 254 | # else: 255 | if env_type == 'lake': 256 | pairs = [np.array(self.data['x']).reshape(1,-1).T, np.array(self.data['a']).reshape(1,-1).T ] 257 | elif env_type == 'car': 258 | pairs = [np.array(self('x_repr')), np.array(self.data['a']).reshape(1,-1).T ] 259 | return pairs 260 | 261 | def calculate_cost(self, lamb): 262 | self.scale = np.max(np.abs(np.array(self.data['c'] + np.dot(lamb[:-1], np.array(self.data['g']).T)))) 263 | costs = np.array(self.data['c'] + np.dot(lamb[:-1], np.array(self.data['g']).T))/self.scale 264 | 265 | # costs = costs/np.max(np.abs(costs)) 266 | self.data['cost'] = costs 267 | 268 | # [x.calculate_cost(lamb) for x in self.episodes] 269 | 270 | def set_cost(self, key, idx=None): 271 | if key == 'g': assert idx is not None, 'Evaluation must be done per constraint until parallelized' 272 | 273 | if key == 'c': 274 | self.scale = np.max(np.abs(self.data['c'])) 275 | self.data['cost'] = self.data['c']/self.scale 276 | # [x.set_cost('c') for x in self.episodes] 277 | elif key == 'g': 278 | # Pick the idx'th constraint 279 | self.scale = np.max(np.abs(np.array(self.data['g'])[:,idx])) 280 | self.data['cost'] = np.array(self.data['g'])[:,idx]/self.scale 281 | # [x.set_cost('g', idx) for x in self.episodes] 282 | else: 283 | raise 284 | -------------------------------------------------------------------------------- /seed_2_data/car_data_actions_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_actions_seed_2.h5 -------------------------------------------------------------------------------- /seed_2_data/car_data_frames_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_frames_seed_2.h5 -------------------------------------------------------------------------------- /seed_2_data/car_data_is_done_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_is_done_seed_2.h5 -------------------------------------------------------------------------------- /seed_2_data/car_data_next_states_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_next_states_seed_2.h5 -------------------------------------------------------------------------------- /seed_2_data/car_data_prev_states_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_prev_states_seed_2.h5 
-------------------------------------------------------------------------------- /seed_2_data/car_data_rewards_seed_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clvoloshin/constrained_batch_policy_learning/37ea126556f021046e6db6ea4b7ca994294d14bf/seed_2_data/car_data_rewards_seed_2.h5 -------------------------------------------------------------------------------- /stochastic_policy.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | 3 | from keras import backend as K 4 | 5 | import numpy as np 6 | from copy import deepcopy 7 | 8 | class StochasticPolicy(Model): 9 | def __init__(self, policy, action_space_dim, policy_evalutor, epsilon=0., prob=None): 10 | ''' 11 | A fixed manual policy 12 | ''' 13 | super(StochasticPolicy, self).__init__() 14 | self.policy = policy 15 | 16 | try: 17 | has_layer = self.policy.Q.model.get_layer('inp').input 18 | except: 19 | has_layer = None 20 | 21 | if has_layer is not None: 22 | try: 23 | self.policy.Q.all_actions_func = K.function([self.policy.Q.model.get_layer('inp').input], [self.policy.Q.model.get_layer('dense_2').output]) 24 | except: 25 | self.policy.Q.all_actions_func = K.function([self.policy.Q.model.get_layer('inp').input], [self.policy.Q.model.get_layer('all_actions').output]) 26 | 27 | self.action_space_dim = action_space_dim 28 | 29 | self.epsilon = epsilon 30 | if prob is not None: 31 | self.prob = prob 32 | else: 33 | self.prob = np.ones(self.action_space_dim)/self.action_space_dim 34 | 35 | 36 | #debug purposes 37 | self.policy_evalutor = policy_evalutor 38 | 39 | 40 | def copy_over_to(self, to_): 41 | pass 42 | 43 | def predict(self, X_a): 44 | pass # return [self.model[np.argmax(x_a[:-self.action_space_dim], axis = 1)] == np.argmax(x_a[-self.action_space_dim:], axis=1) for x_a in X_a] 45 | 46 | def fit(self, X, y, verbose=0): 47 | pass 48 | 49 | def representation(self, *args): 50 | if len(args) == 1: 51 | return args[0] 52 | elif len(args) == 2: 53 | return args[0], args[1] 54 | else: 55 | raise NotImplemented 56 | 57 | def all_actions(self, X, x_preprocessed=False,**kw): 58 | 59 | try: 60 | shape_correct = len(self.policy.Q.model.get_layer('inp').input_shape) == (len(np.array(X).shape)) 61 | except: 62 | shape_correct = False 63 | 64 | if shape_correct: 65 | 66 | if np.random.random() < self.epsilon: 67 | arr = -np.eye(self.action_space_dim)[np.random.choice(self.action_space_dim, p=self.prob)] 68 | else: 69 | arr = -np.eye(self.action_space_dim)[self.policy.Q([X], x_preprocessed=x_preprocessed)[0]] 70 | 71 | return np.atleast_2d(arr) 72 | else: 73 | arr = [] 74 | for x in X: 75 | if np.random.random() < self.epsilon: 76 | arr.append(-np.eye(self.action_space_dim)[np.random.choice(self.action_space_dim, p=self.prob)]) 77 | else: 78 | arr.append(-np.eye(self.action_space_dim)[self.policy.Q([x], x_preprocessed=x_preprocessed)[0]]) 79 | 80 | return np.atleast_2d(np.array(arr)) 81 | 82 | -------------------------------------------------------------------------------- /tests/car_fqe.py: -------------------------------------------------------------------------------- 1 | # from pyvirtualdisplay import Display 2 | # display = Display(visible=0, size=(1280, 1024)) 3 | # display.start() 4 | import deepdish as dd 5 | from replay_buffer import Dataset 6 | from config_car import * 7 | import os 8 | import numpy as np 9 | import scipy.signal as signal 10 | from env_nn import CarNN 11 | from keras.models import 
load_model 12 | np.random.seed(2718) 13 | 14 | which_pi = './videos/ohio/run_1/pi_1.hdf5' 15 | directory = 'seed_2_data' 16 | action_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_actions_seed_2.h5')) 17 | frame_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_frames_seed_2.h5')) 18 | done_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_is_done_seed_2.h5')) 19 | next_state_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_next_states_seed_2.h5')) 20 | current_state_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_prev_states_seed_2.h5')) 21 | cost_data = dd.io.load(os.path.join(os.getcwd(), directory, 'car_data_rewards_seed_2.h5')) 22 | 23 | 24 | frame_gray_scale = np.zeros((len(frame_data),96,96)).astype('float32') 25 | for i in range(len(frame_data)): 26 | frame_gray_scale[i,:,:] = np.dot(frame_data[i,:,:,:]/255. , [0.299, 0.587, 0.114]) 27 | 28 | dic = {'frames':frame_gray_scale, 29 | 'prev_states': current_state_data, 30 | 'next_states': next_state_data, 31 | 'a': action_data, 32 | 'c':cost_data[:,0], 33 | 'g':cost_data[:,1:], 34 | 'done': done_data 35 | } 36 | 37 | data = Dataset(num_frame_stack, pic_size, (len(constraints) + 1,) ) 38 | data.data = dic 39 | EVALUATING = 'c' 40 | 41 | 42 | def sample_N_trajectories(dataset, N): 43 | dones = np.where(dataset['done'])[0] 44 | dones = np.hstack([[0], dones]) 45 | trajectory_idxs = zip(dones[:-1], dones[1:]) 46 | N = min(len(trajectory_idxs), N) 47 | idxs = np.random.choice(len(trajectory_idxs), size=N, replace=False) 48 | return np.array(trajectory_idxs)[idxs] 49 | 50 | 51 | def create_trajectories(dataset, N): 52 | idxs = sample_N_trajectories(dataset, N) 53 | episodes = [] 54 | for low, high in idxs: 55 | x = np.rollaxis(dataset['frames'][dataset['prev_states'][low:high]],1,4) 56 | actions = np.atleast_2d(dataset['a'][low:high]).T 57 | x_prime = np.rollaxis(dataset['frames'][dataset['next_states'][low:high]],1,4) 58 | dataset_costs = dataset[EVALUATING][low:high] 59 | dones = dataset['done'][low:high] 60 | episode = { 61 | 'x': x, 62 | 'a': actions, 63 | 'x_prime': x_prime, 64 | 'cost': dataset_costs, 65 | 'done': dones, 66 | } 67 | episodes.append(episode) 68 | return episodes 69 | 70 | def pdis(episodes, pi_new, pi_old, gamma): 71 | ''' 72 | Per decision importance sampling 73 | 74 | sum_{t=1}^{max L} gamma^t 1/n sum_{i=1}^n (PI_{tau=1}^t p_new/p_old) R^i_t 75 | ''' 76 | values = [] 77 | for episode in episodes: 78 | 79 | numerator = pi_new.all_actions([episode['x']], x_preprocessed=True)[np.arange(len(episode['a'])), episode['a'].reshape(-1)] 80 | denominator = pi_old.all_actions([episode['x']], x_preprocessed=True)[np.arange(len(episode['a'])), episode['a'].reshape(-1)] 81 | importance_weight = np.cumprod(numerator/denominator) 82 | 83 | values.append( discounted_sum(importance_weight * episode['cost'], gamma) ) 84 | 85 | return np.mean(values) 86 | 87 | def WDR(episodes, pi_new, pi_old, gamma): 88 | # \hat{v}^pi(s) = \sum_t gamma^t * \hat{r}^pi(s,t) 89 | # = \sum_t * \sum_a pi(a|s) \hat{r}^pi(s,a,t) 90 | # = \sum_t * \hat{r}^\pi (s, A, t) where A = argmin_a pi(s), since our pi_new is deterministic 91 | 92 | # WDR = 1/n \sum_i \hat{v}^\pi_new (S_0^{H_i}) 93 | # + \sum_i \sum_t gamma^t w_t^i [R_t^{H_i} + gamma \hat{v}^\pi_new (S_{t+1}^{H_i}) - \hat{q}(S_t^{H_i}, A_t^{H_i})] 94 | 95 | # since pi_new, pi_old, ..etc, deterministic then: 96 | # Thus, WDR = \hat{v}^\pi_new (S_0) + \sum_i \sum_t gamma^t w_t^i [R_t^{H_i} - \hat{r}(S_t^{H_i},A_t^{H_i},0)] 97 | 
# 98 | # w_t^i = p_t^i / sum_{j=1}^n p_t^j 99 | # 100 | # p_t^i = prod_{i=0}^t pi_new(A_i|S_i) / pi_old(A_i|S_i) 101 | raise NotImplementedError('WDR estimator is not implemented; see the derivation in the comments above.') 102 | 103 | def discounted_sum(costs, discount): 104 | ''' 105 | Calculate discounted sum of costs 106 | ''' 107 | y = signal.lfilter([1], [1, -discount], x=costs[::-1]) 108 | return y[::-1][0] 109 | 110 | 111 | def main(): 112 | from tqdm import tqdm 113 | 114 | episodes = create_trajectories(data, 50) 115 | 116 | model = load_model(which_pi) 117 | pi_new = CarNN(state_space_dim, 118 | action_space_dim, 119 | max_Q_fitting_epochs, 120 | gamma, 121 | model_type=model_type, 122 | num_frame_stack=num_frame_stack) 123 | pi_new.model.set_weights(model.get_weights()) 124 | 125 | # Sanity check of the per-step importance ratios. No separate behavior policy 126 | # is loaded in this script, so pi_new stands in for pi_old and every ratio is 1. 127 | for episode in tqdm(episodes): 128 | numerator = pi_new.all_actions([episode['x']], x_preprocessed=True)[np.arange(len(episode['a'])), episode['a'].reshape(-1)] 129 | denominator = pi_new.all_actions([episode['x']], x_preprocessed=True)[np.arange(len(episode['a'])), episode['a'].reshape(-1)] 130 | 131 | pdis_output = pdis(episodes, pi_new, pi_new, gamma) 132 | import pdb; pdb.set_trace() 133 | 134 | main() 135 | 136 | 137 | -------------------------------------------------------------------------------- /tests/fqe_test.py: -------------------------------------------------------------------------------- 1 | from pyvirtualdisplay import Display 2 | display = Display(visible=1, size=(1280, 1024)) 3 | display.start() 4 | from fitted_off_policy_evaluation import CarFittedQEvaluation 5 | from exact_policy_evaluation import ExactPolicyEvaluator 6 | from config_car import * 7 | from fitted_algo import FittedAlgo 8 | import numpy as np 9 | from tqdm import tqdm 10 | from env_nn import * 11 | from thread_safe import threadsafe_generator 12 | from keras import backend as K 13 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 14 | from env_dqns import CarDQN 15 | import deepdish as dd 16 | from keras.models import load_model 17 | import time 18 | from replay_buffer import Dataset 19 | from stochastic_policy import StochasticPolicy 20 | 21 | 22 | model_dir = os.path.join(os.getcwd(), 'models') 23 | old_policy_path = os.path.join(model_dir, old_policy_name) 24 | policy_old = CarDQN(env, 25 | gamma, 26 | action_space_map = action_space_map, 27 | action_space_dim=action_space_dim, 28 | model_type=model_type, 29 | max_time_spent_in_episode=max_time_spent_in_episode, 30 | num_iterations = num_iterations, 31 | sample_every_N_transitions = sample_every_N_transitions, 32 | batchsize = batchsize, 33 | copy_over_target_every_M_training_iterations = copy_over_target_every_M_training_iterations, 34 | buffer_size = buffer_size, 35 | min_epsilon = min_epsilon, 36 | initial_epsilon = initial_epsilon, 37 | epsilon_decay_steps = epsilon_decay_steps, 38 | num_frame_stack=num_frame_stack, 39 | min_buffer_size_to_train=min_buffer_size_to_train, 40 | frame_skip = frame_skip, 41 | pic_size = pic_size, 42 | models_path = os.path.join(model_dir,'weights.{epoch:02d}-{loss:.2f}.hdf5'), 43 | ) 44 | policy_old.Q.model = load_model(old_policy_path) 45 | policy_old.Q.all_actions_func = K.function([policy_old.Q.model.get_layer('inp').input], [policy_old.Q.model.get_layer('all_actions').output]) 46 | print 'Exact Evaluation: ' 47 | exact_policy_algorithm = ExactPolicyEvaluator(action_space_map, gamma, env=env, frame_skip=frame_skip, num_frame_stack=num_frame_stack, pic_size = pic_size, constraint_thresholds=constraint_thresholds, constraints_cared_about=constraints_cared_about) 48 | #policy_old.Q.evaluate(render=True,
environment_is_dynamic=False, to_monitor=True) 49 | print exact_policy_algorithm.run(policy_old.Q, to_monitor=False) 50 | 51 | 52 | # policy_to_test = StochasticPolicy(policy_old, action_space_dim, exact_policy_algorithm, epsilon=0., prob=prob) 53 | 54 | tic = time.time() 55 | action_data = dd.io.load('./seed_2/car_data_actions_seed_2.h5') 56 | frame_data = dd.io.load('./seed_2/car_data_frames_seed_2.h5') 57 | done_data = dd.io.load('./seed_2/car_data_is_done_seed_2.h5') 58 | next_state_data = dd.io.load('./seed_2/car_data_next_states_seed_2.h5') 59 | current_state_data = dd.io.load('./seed_2/car_data_prev_states_seed_2.h5') 60 | cost_data = dd.io.load('./seed_2/car_data_rewards_seed_2.h5') 61 | 62 | 63 | frame_gray_scale = np.zeros((len(frame_data),96,96)).astype('float32') 64 | for i in range(len(frame_data)): 65 | frame_gray_scale[i,:,:] = np.dot(frame_data[i,:,:,:]/255. , [0.299, 0.587, 0.114]) 66 | 67 | dic = {'frames':frame_gray_scale, 68 | 'prev_states': current_state_data, 69 | 'next_states': next_state_data, 70 | 'a': action_data, 71 | 'c':cost_data[:,0], 72 | 'g':cost_data[:,1:], 73 | 'done': done_data 74 | } 75 | 76 | data = Dataset(num_frame_stack, pic_size, (len(constraints) + 1,) ) 77 | data.data = dic 78 | 79 | data.data['g'] = data.data['g'][:,constraints_cared_about] 80 | data.data['g'] = (data.data['g'] >= constraint_thresholds[:-1]).astype(int) 81 | 82 | FQE = CarFittedQEvaluation(state_space_dim, action_space_dim, max_eval_fitting_epochs, gamma, model_type=model_type,num_frame_stack=num_frame_stack) 83 | 84 | 85 | FQE.run(policy_old.Q,'c', data, desc='FQE C', g_idx=1, testing=True, epochs=1) 86 | 87 | 88 | def rolling_sum(a, n=4) : ret = np.cumsum(a, axis=1, dtype=float); ret[:, n:] = ret[:, n:] - ret[:, :-n]; return ret[:, n - 1:]; -------------------------------------------------------------------------------- /thread_safe.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | class ThreadSafe: 4 | """Takes an iterator/generator and makes it thread-safe by 5 | serializing call to the `next` method of given iterator/generator. 6 | """ 7 | def __init__(self, it): 8 | self.it = it 9 | self.lock = threading.Lock() 10 | 11 | def __iter__(self): 12 | return self 13 | 14 | def next(self): 15 | with self.lock: 16 | return self.it.next() 17 | 18 | 19 | def threadsafe_generator(f): 20 | """A decorator that takes a generator function and makes it thread-safe. 
21 | """ 22 | def g(*a, **kw): 23 | return ThreadSafe(f(*a, **kw)) 24 | return g 25 | 26 | -------------------------------------------------------------------------------- /value_function.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class ValueFunction(object): 5 | def __init__(self): 6 | ''' 7 | ''' 8 | self.prev_values = [] 9 | self.exact_values = [] 10 | self.eval_values = {} 11 | # self.V = {} 12 | # self.dim_state_space = dim_state_space 13 | # self.non_terminal_states = non_terminal_states 14 | 15 | def append(self, *args): 16 | if len(args) == 1: 17 | value = args[0] 18 | self.prev_values.append(value) 19 | elif len(args) == 2: 20 | value, policy = args 21 | self.prev_values.append(value) 22 | # self.V[self.vectorize(policy)] = value 23 | 24 | def avg(self, append_zero=False): 25 | if append_zero: 26 | return np.hstack([np.mean(self.prev_values, 0), np.array([0])]) 27 | else: 28 | return np.mean(self.prev_values, 0) 29 | 30 | def last(self, append_zero=False): 31 | if append_zero: 32 | return np.hstack([self.prev_values[-1], np.array([0])]) 33 | else: 34 | return np.array(self.prev_values[-1]) 35 | 36 | def add_exact_values(self, values): 37 | self.exact_values.append(values) 38 | 39 | def add_eval_values(self, eval_values, idx): 40 | if idx not in self.eval_values: 41 | self.eval_values[idx] = [] 42 | 43 | self.eval_values[idx].append(eval_values) 44 | 45 | 46 | # def vectorize(self, policy): 47 | # # Can be done for low dim discrete spaces 48 | # return tuple(policy(self.non_terminal_states)) 49 | 50 | # def __getitem__(self, policy): 51 | # pi = self.vectorize(policy) 52 | # if pi in self.V: 53 | # return np.array(self.V[pi]) 54 | # else: 55 | # raise KeyError 56 | 57 | # def __contains__(self, policy): 58 | # pi = self.vectorize(policy) 59 | # if pi in self.V: 60 | # return True 61 | # else: 62 | # return False 63 | --------------------------------------------------------------------------------
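
The WDR function in tests/car_fqe.py stops at its comment derivation and never builds the estimator. The sketch below is a minimal, self-contained version of the standard weighted doubly robust (WDR) estimator that those comments describe (Thomas & Brunskill, 2016). It is an illustration only: the wdr name, the episodes layout, and the per-step model estimates q_hat/v_hat (which could, for example, come from an FQE model such as CarFittedQEvaluation) are assumptions of this sketch, not part of the repository's code.

import numpy as np

def wdr(episodes, gamma):
    # Illustrative sketch only -- not the repository's API. Each episode is
    # assumed to be a dict of equal-length 1-d arrays, padded to a common
    # horizon H with rho=1, cost=0, q_hat=0, v_hat=0 after termination:
    #   'rho'   : pi_new(A_t|S_t) / pi_old(A_t|S_t) per step
    #   'cost'  : observed per-step cost
    #   'q_hat' : model estimate of Q^{pi_new}(S_t, A_t)
    #   'v_hat' : model estimate of V^{pi_new}(S_t)
    rho = np.vstack([ep['rho'] for ep in episodes]).astype(float)     # (n, H)
    costs = np.vstack([ep['cost'] for ep in episodes]).astype(float)
    q_hat = np.vstack([ep['q_hat'] for ep in episodes]).astype(float)
    v_hat = np.vstack([ep['v_hat'] for ep in episodes]).astype(float)
    n, H = costs.shape

    # Cumulative importance ratios rho_{0:t}, self-normalized across episodes
    # to obtain the weights w_t^i; w_{-1}^i is 1/n by convention.
    cum_rho = np.cumprod(rho, axis=1)
    w = cum_rho / np.maximum(cum_rho.sum(axis=0, keepdims=True), 1e-12)
    w_prev = np.hstack([np.full((n, 1), 1.0 / n), w[:, :-1]])

    # WDR = sum_i sum_t gamma^t [ w_t^i * cost_t^i
    #                             - w_t^i * q_hat_t^i + w_{t-1}^i * v_hat_t^i ]
    discounts = gamma ** np.arange(H)
    per_step = w * costs - w * q_hat + w_prev * v_hat
    return float(np.sum(discounts * per_step))

Self-normalizing the cumulative ratios trades a small amount of bias for much lower variance than the plain per-decision importance sampling in pdis, which is the usual reason to prefer WDR once approximate q_hat/v_hat estimates are available.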