├── .gitignore ├── OC_theano.py ├── README.md ├── __init__.py ├── nnet.py ├── train.py ├── utils ├── __init__.py ├── helper.py └── plot.py └── watch.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | models/ 4 | temp_files/ 5 | models_oc/ 6 | *.pkl 7 | *results/ 8 | *results_oc/ 9 | temp* 10 | core* 11 | *.swp 12 | -------------------------------------------------------------------------------- /OC_theano.py: -------------------------------------------------------------------------------- 1 | from nnet import Model, MLP3D 2 | import numpy as np 3 | import sys,pickle,os,theano 4 | import theano.tensor as T 5 | from lasagne.updates import norm_constraint 6 | from collections import OrderedDict 7 | 8 | def clip_grads(grads, clip, clip_type): 9 | if clip > 0.1: 10 | if clip_type == "norm": 11 | grads = [norm_constraint(p, clip) if p.ndim > 1 else T.clip(p, -clip, clip) for p in grads] 12 | elif clip_type == "global": 13 | norm = T.sqrt(T.sum([T.sum(T.sqr(g)) for g in grads])*2) + 1e-7 14 | scale = clip * T.min([1/norm,1./clip]).astype("float32") 15 | grads = [g*scale for g in grads] 16 | return grads 17 | 18 | def rmsprop(params, grads, clip=0, rho=0.99, eps=0.1, clip_type="norm"): 19 | grads = clip_grads(grads, clip, clip_type) 20 | updates = OrderedDict() 21 | all_grads, rms_weights = [], [] 22 | for param, grad in zip(params, grads): 23 | acc_rms = theano.shared(param.get_value() * 0) 24 | rms_weights.append(acc_rms) 25 | acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2 26 | updates[acc_rms] = acc_rms_new 27 | all_grads.append(grad / T.sqrt(acc_rms_new + eps)) 28 | return updates, all_grads, rms_weights 29 | 30 | 31 | class AOCAgent_THEANO(): 32 | def __init__(self, num_actions, id_num, shared_arr=None, num_moves=None, args=None): 33 | print "USING OPTION CRITIC" 34 | self.args = args 35 | self.id_num = id_num 36 | self.num_actions = num_actions 37 | self.num_moves = num_moves 38 | self.reset_storing() 39 | self.rng = np.random.RandomState(100+id_num) 40 | model_network = [{"model_type": "conv", "filter_size": [8,8], "pool": [1,1], "stride": [4,4], "out_size": 16, "activation": "relu"}, 41 | {"model_type": "conv", "filter_size": [4,4], "pool": [1,1], "stride": [2,2], "out_size": 32, "activation": "relu"}, 42 | {"model_type": "mlp", "out_size": 256, "activation": "relu"}] 43 | out = [None,model_network[-1]["out_size"]] 44 | self.conv = Model(model_network, input_size=[None,args.concat_frames*(1 if args.grayscale else 3),84,84]) 45 | self.termination_model = Model([{"model_type": "mlp", "out_size": args.num_options, "activation": "sigmoid", "W":0}], input_size=out) 46 | self.Q_val_model = Model([{"model_type": "mlp", "out_size": args.num_options, "activation": "linear", "W":0}], input_size=out) 47 | self.options_model = MLP3D(input_size=out[1], num_options=args.num_options, out_size=num_actions, activation="softmax") 48 | self.params = self.conv.params + self.Q_val_model.params + self.options_model.params + self.termination_model.params 49 | self.set_rms_shared_weights(shared_arr) 50 | 51 | x = T.ftensor4() 52 | y = T.fvector() 53 | a = T.ivector() 54 | o = T.ivector() 55 | delib = T.fscalar() 56 | 57 | s = self.conv.apply(x/np.float32(255)) 58 | intra_option_policy = self.options_model.apply(s, o) 59 | 60 | q_vals = self.Q_val_model.apply(s) 61 | disc_q = theano.gradient.disconnected_grad(q_vals) 62 | current_option_q = q_vals[T.arange(o.shape[0]), o] 63 | disc_opt_q = disc_q[T.arange(o.shape[0]), o] 64 | terms = 
self.termination_model.apply(s) 65 | o_term = terms[T.arange(o.shape[0]), o] 66 | V = T.max(q_vals, axis=1)*(1-self.args.option_epsilon) + (self.args.option_epsilon*T.mean(q_vals, axis=1)) 67 | disc_V = theano.gradient.disconnected_grad(V) 68 | 69 | aggr = T.mean #T.sum 70 | log_eps = 0.0001 71 | 72 | critic_cost = aggr(args.critic_coef*0.5*T.sqr(y-current_option_q)) 73 | termination_grad = aggr(o_term*((disc_opt_q-disc_V)+delib)) 74 | entropy = -aggr(T.sum(intra_option_policy*T.log(intra_option_policy+log_eps), axis=1))*args.entropy_reg 75 | pg = aggr((T.log(intra_option_policy[T.arange(a.shape[0]), a]+log_eps)) * (y-disc_opt_q)) 76 | cost = pg + entropy - critic_cost - termination_grad 77 | 78 | grads = T.grad(cost*args.update_freq, self.params) 79 | #grads = T.grad(cost, self.params) 80 | updates, grad_rms, self.rms_weights = rmsprop(self.params, grads, clip=args.clip, clip_type=args.clip_type) 81 | self.share_rms(shared_arr) 82 | 83 | self.get_state = theano.function([x], s, on_unused_input='warn') 84 | self.get_policy = theano.function([s, o], intra_option_policy) 85 | self.get_termination = theano.function([x], terms) 86 | self.get_q = theano.function([x], q_vals) 87 | self.get_q_from_s = theano.function([s], q_vals) 88 | self.get_V = theano.function([x], V) 89 | 90 | self.rms_grads = theano.function([x,a,y,o, delib], grad_rms, updates=updates, on_unused_input='warn') 91 | print "ALL COMPILED" 92 | 93 | if not self.args.testing: 94 | self.init_tracker() 95 | self.initialized = False 96 | 97 | def update_weights(self, x, a, y, o, moves, delib): 98 | args = self.args 99 | self.num_moves.value += moves 100 | lr = np.max([args.init_lr * (args.max_num_frames-self.num_moves.value)/args.max_num_frames, 0]).astype("float32") 101 | 102 | cumul = self.rms_grads(x,a,y,o,delib) 103 | for i in range(len(cumul)): 104 | self.shared_arr[i] += lr*cumul[i] 105 | self.params[i].set_value(self.shared_arr[i]) 106 | return 107 | 108 | def load_values(self, values): 109 | assert(len(self.params+self.rms_weights) == len(values)) 110 | for p, v in zip(self.params+self.rms_weights, values): p.set_value(v) 111 | 112 | def save_values(self, folder_name): 113 | pickle.dump([p.get_value() for p in self.params+self.rms_weights], open(folder_name+"/tmp_model.pkl", "wb")) 114 | os.system("mv "+folder_name+"/tmp_model.pkl "+folder_name+"/model.pkl") 115 | #try: # server creates too many core files 116 | # os.system("rm ./core*") 117 | #except: 118 | # pass 119 | 120 | def get_param_vals(self): 121 | return [m.get_value() for m in self.params+self.rms_weights] 122 | 123 | def set_rms_shared_weights(self, shared_arr): 124 | if shared_arr is not None: 125 | self.shared_arr = [np.frombuffer(s, dtype="float32").reshape(p.get_value().shape) for s, p in zip(shared_arr, self.params)] 126 | self.rms_shared_arr = shared_arr[len(self.params):] 127 | if self.args.init_num_moves > 0: 128 | for s, p in zip(shared_arr, self.params): 129 | p.set_value(np.frombuffer(s, dtype="float32").reshape(p.get_value().shape)) 130 | print "LOADED VALUES" 131 | 132 | def share_rms(self, shared_arr): 133 | # Ties rms params between threads with borrow=True flag 134 | if self.args.rms_shared and shared_arr is not None: 135 | assert(len(self.rms_weights) == len(self.rms_shared_arr)) 136 | for rms_w, s_rms_w in zip(self.rms_weights, self.rms_shared_arr): 137 | rms_w.set_value(np.frombuffer(s_rms_w, dtype="float32").reshape(rms_w.get_value().shape), borrow=True) 138 | 139 | def get_action(self, x): 140 | p = self.get_policy([self.current_s], 
[self.current_o]) 141 | return self.rng.choice(range(self.num_actions), p=p[-1]) 142 | 143 | def get_policy_over_options(self, s): 144 | return self.get_q_from_s(s)[0].argmax() if self.rng.rand() > self.args.option_epsilon else self.rng.randint(self.args.num_options) 145 | 146 | def update_internal_state(self, x): 147 | self.current_s = self.get_state([x])[0] 148 | self.delib = self.args.delib_cost 149 | 150 | if self.terminated: 151 | self.current_o = self.get_policy_over_options([self.current_s]) 152 | self.o_tracker_chosen[self.current_o] += 1 153 | 154 | self.o_tracker_steps[self.current_o] += 1 155 | 156 | def init_tracker(self): 157 | csv_things = ["moves", "reward", "term_prob"] 158 | csv_things += ["opt_chosen"+str(ccc) for ccc in range(self.args.num_options)] 159 | csv_things += ["opt_steps"+str(ccc) for ccc in range(self.args.num_options)] 160 | with open(self.args.folder_name+"/data.csv", "a") as myfile: 161 | myfile.write(",".join([str(cc) for cc in csv_things])+"\n") 162 | 163 | def tracker(self): 164 | term_prob = float(self.termination_counter)/self.frame_counter*100 165 | csv_things = [self.num_moves.value, self.total_reward, round(term_prob,1)]+list(self.o_tracker_chosen)+list(self.o_tracker_steps) 166 | with open(self.args.folder_name+"/data.csv", "a") as myfile: 167 | myfile.write(",".join([str(cc) for cc in csv_things])+"\n") 168 | 169 | def reset_tracker(self): 170 | self.termination_counter = 0 171 | self.frame_counter = 0 172 | self.o_tracker_chosen = np.zeros(self.args.num_options,) 173 | self.o_tracker_steps = np.zeros(self.args.num_options,) 174 | 175 | def reset(self, x): 176 | if not self.args.testing and self.initialized: self.tracker() 177 | self.total_reward = 0 178 | self.terminated = True 179 | self.reset_tracker() 180 | self.update_internal_state(x) 181 | self.initialized = True 182 | 183 | def reset_storing(self): 184 | self.a_seq = np.zeros((self.args.max_update_freq,), dtype="int32") 185 | self.o_seq = np.zeros((self.args.max_update_freq,), dtype="int32") 186 | self.r_seq = np.zeros((self.args.max_update_freq,), dtype="float32") 187 | self.x_seq = np.zeros((self.args.max_update_freq, self.args.concat_frames*(1 if self.args.grayscale else 3),84,84),dtype="float32") 188 | self.t_counter = 0 189 | 190 | def store(self, x, new_x, action, raw_reward, done, death): 191 | end_ep = done or (death and self.args.death_ends_episode) 192 | self.frame_counter += 1 193 | 194 | self.total_reward += raw_reward 195 | reward = np.clip(raw_reward, -1, 1) 196 | 197 | self.terminated = self.get_termination([new_x])[0][self.current_o] > self.rng.rand() 198 | self.termination_counter += self.terminated 199 | 200 | self.x_seq[self.t_counter] = np.copy(x) 201 | self.o_seq[self.t_counter] = np.copy(self.current_o) 202 | self.a_seq[self.t_counter] = np.copy(action) 203 | self.r_seq[self.t_counter] = np.copy(float(reward)) - (float(self.terminated)*self.delib*(1-float(end_ep))) 204 | 205 | self.t_counter += 1 206 | 207 | # do n-step return to option termination. 208 | # cut off at self.args.max_update_freq 209 | # min steps: self.args.update_freq (usually 5 like a3c) 210 | # this doesn't make option length a minimum of 5 (they can still terminate). 
It only sets a minimum batch size. 211 | option_term = (self.terminated and self.t_counter >= self.args.update_freq) 212 | if self.t_counter == self.args.max_update_freq or end_ep or option_term: 213 | if not self.args.testing: 214 | V = self.get_V([new_x])[0] if self.terminated else self.get_q([new_x])[0][self.current_o] 215 | R = 0 if end_ep else V 216 | V = [] 217 | for j in range(self.t_counter-1,-1,-1): 218 | R = np.float32(self.r_seq[j] + self.args.gamma*R) 219 | V.append(R) 220 | self.update_weights(self.x_seq[:self.t_counter], self.a_seq[:self.t_counter], V[::-1], 221 | self.o_seq[:self.t_counter], self.t_counter, self.delib+self.args.margin_cost) 222 | self.reset_storing() 223 | if not end_ep: 224 | self.update_internal_state(new_x) 225 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # When Waiting is not an Option: Learning Options with a Deliberation Cost 2 | 3 | arXiv link: https://arxiv.org/pdf/1709.04571.pdf 4 | 5 | ## Installation 6 | 7 | Here's a list of all dependencies: 8 | 9 | - Numpy 10 | - Theano 11 | - Lasagne 12 | - Argparse 13 | - OpenAI Gym [Atari] 14 | - matplotlib 15 | - cv2 (OpenCV) 16 | - PIL (Image) 17 | 18 | ## Training 19 | 20 | To train, run the following command: 21 | ``` 22 | python train.py --sub-env Breakout --num-options 8 --num-threads 16 --folder-name Breakout_model 23 | ``` 24 | 25 | To view a list of available parameters, run: 26 | ``` 27 | python train.py --help 28 | ``` 29 | 30 | During training, you can run utils/plot.py to view the training curves. Each argument can be a path to a different run; all given runs are drawn on the same plot. 31 | ``` 32 | python utils/plot.py models/Breakout_model/ models/Breakout_model_v2/ models/Breakout_model_v3/ 33 | ``` 34 | 35 | ## Testing 36 | 37 | To watch a trained model, run watch.py and give it the path to the saved model files, e.g.: 38 | ``` 39 | python watch.py models/Breakout_model/ 40 | ``` 41 | 42 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeanharb/a2oc_delib/42e25ce74c4ccc8e9e2f0667cb511aca4af45066/__init__.py -------------------------------------------------------------------------------- /nnet.py: -------------------------------------------------------------------------------- 1 | import theano, lasagne 2 | import theano.tensor as T 3 | import math, csv, time, sys, os, pdb, copy 4 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 5 | from lasagne.layers import Conv2DLayer, conv, Upscale2DLayer 6 | if theano.config.device.startswith("gpu"): 7 | from lasagne.layers import cuda_convnet 8 | import numpy as np 9 | 10 | def get_init(m, t): 11 | inits = {"zeros": lasagne.init.Constant(0.), "norm": lasagne.init.Normal(0.1)} 12 | if t not in m: 13 | if t == "b": 14 | return lasagne.init.Constant(0.) 
15 | return lasagne.init.GlorotUniform() 16 | elif isinstance(m[t], basestring): 17 | return inits[m[t]] 18 | elif isinstance(m[t], int): 19 | return lasagne.init.Constant(m[t]) 20 | else: 21 | return m[t] 22 | 23 | def get_activation(activation): 24 | if activation == "softmax": 25 | output = T.nnet.softmax 26 | elif activation is None: 27 | output = None 28 | elif activation == "tanh": 29 | output = T.tanh 30 | elif activation == "relu": 31 | output = T.nnet.relu 32 | elif "leaky_relu" in activation: 33 | output = lambda x: T.nnet.relu(x, alpha=float(activation.split(" ")[1])) 34 | elif activation == "linear": 35 | output = None 36 | elif activation == "sigmoid": 37 | output = T.nnet.sigmoid 38 | elif activation == "hard_sigmoid": 39 | output = T.nnet.hard_sigmoid 40 | else: 41 | print "activation not recognized:", activation 42 | raise NotImplementedError 43 | return output 44 | 45 | class MLP3D(): 46 | def __init__(self, input_size=None, num_options=None, out_size=None, activation="softmax"): 47 | option_out_size = out_size 48 | limits = (6./np.sqrt(input_size + option_out_size))/num_options 49 | self.options_W = theano.shared(np.random.uniform(size=(num_options, input_size, option_out_size), high=limits, low=-limits).astype("float32")) 50 | self.options_b = theano.shared(np.zeros((num_options, option_out_size)).astype("float32")) 51 | self.activation = get_activation(activation) 52 | self.params = [self.options_W, self.options_b] 53 | 54 | def apply(self, inputs, option=None): 55 | W = self.options_W[option] 56 | b = self.options_b[option] 57 | 58 | out = T.sum(inputs.dimshuffle(0,1,'x')*W, axis=1) + b 59 | return out if self.activation is None else self.activation(out) 60 | 61 | def save_params(self): 62 | return [i.get_value() for i in self.params] 63 | 64 | def load_params(self, values): 65 | print "LOADING NNET..", 66 | for p, value in zip(self.params, values): 67 | p.set_value(value.astype("float32")) 68 | print "LOADED" 69 | 70 | class Model(): 71 | def __call__(self, *args, **kwargs): 72 | return self.apply(*args, **kwargs) 73 | 74 | def get_activation(self, model): 75 | activation = model["activation"] if "activation" in model else "linear" 76 | return get_activation(activation) 77 | 78 | def create_layer(self, inputs, model, dnn_type=True): 79 | 80 | if model["model_type"] == "conv": 81 | if dnn_type: 82 | import lasagne.layers.dnn as dnn 83 | conv_type = dnn.Conv2DDNNLayer if dnn_type else Conv2DLayer 84 | poolsize = tuple(model["pool"]) if "pool" in model else (1,1) 85 | stride = tuple(model["stride"]) if "stride" in model else (1,1) 86 | layer = conv_type(inputs, 87 | model["out_size"], 88 | filter_size=model["filter_size"], 89 | stride=stride, 90 | nonlinearity=self.get_activation(model), 91 | W=get_init(model, "W"), 92 | b=get_init(model, "b"), 93 | pad="valid" if "pad" not in model else model["pad"]) 94 | elif model["model_type"] == "mlp": 95 | layer = lasagne.layers.DenseLayer(inputs, 96 | num_units=model["out_size"], 97 | nonlinearity=self.get_activation(model), 98 | W=get_init(model, "W"), 99 | b=get_init(model, "b")) 100 | elif model["model_type"] == "option": 101 | layer = MLP3D(model, inputs, nonlinearity=self.get_activation(model)) 102 | else: 103 | print "UNKNOWN LAYER NAME" 104 | raise NotImplementedError 105 | return layer 106 | 107 | def __init__(self, model_in, input_size=None, rng=1234, dnn_type=False): 108 | """ 109 | example model: 110 | model = [{"model_type": "conv", "filter_size": [5,5], "pool": [1,1], "stride": [1,1], "out_size": 5}, 111 | 
{"model_type": "conv", "filter_size": [7,7], "pool": [1,1], "stride": [1,1], "out_size": 15}, 112 | {"model_type": "mlp", "out_size": 300, "activation": "tanh"}, 113 | {"model_type": "mlp", "out_size": 10, "activation": "softmax"}] 114 | """ 115 | self.theano_rng = RandomStreams(rng) 116 | rng = np.random.RandomState(rng) 117 | lasagne.random.set_rng(rng) 118 | 119 | new_layer = tuple(input_size) if isinstance(input_size, list) else input_size 120 | model = [model_in] if isinstance(model_in, dict) else model_in 121 | 122 | print "Building following model..." 123 | print model 124 | 125 | self.model = model 126 | self.input_size = input_size 127 | self.out_size = model_in[-1]["out_size"] 128 | self.dnn_type = dnn_type 129 | 130 | # create neural net layers 131 | self.params = [] 132 | self.layers = [] 133 | for i, m in enumerate(model): 134 | new_layer = self.create_layer(new_layer, m, dnn_type=dnn_type) 135 | self.params += new_layer.get_params() 136 | self.layers.append(new_layer) 137 | 138 | print "Build complete." 139 | print 140 | 141 | def apply(self, x): 142 | last_layer_inputs = x 143 | for i, m in enumerate(self.model): 144 | if m["model_type"] in ["mlp", "logistic", "advantage"] and last_layer_inputs.ndim > 2: 145 | last_layer_inputs = last_layer_inputs.flatten(2) 146 | last_layer_inputs = self.layers[i].get_output_for(last_layer_inputs) 147 | return last_layer_inputs 148 | 149 | def save_params(self): 150 | return [i.get_value() for i in self.params] 151 | 152 | def load_params(self, values): 153 | print "LOADING NNET..", 154 | for p, value in zip(self.params, values): 155 | p.set_value(value.astype("float32")) 156 | print "LOADED" 157 | 158 | if __name__ == "__main__": 159 | pass 160 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Value, Array, RawArray 2 | from collections import OrderedDict 3 | import numpy as np 4 | from OC_theano import AOCAgent_THEANO 5 | import cv2,copy,sys,pickle,os,time,argparse 6 | from PIL import Image 7 | from utils.helper import foldercreation, str2bool, get_folder_name 8 | 9 | class Environment(): 10 | def reset(self): 11 | raise NotImplementedError 12 | 13 | def render(self): 14 | raise NotImplementedError 15 | 16 | def act(self): 17 | raise NotImplementedError 18 | 19 | def get_frame_count(self): 20 | raise NotImplementedError 21 | 22 | class ALE_env(Environment): 23 | def __init__(self, args, rng=None): 24 | import gym 25 | env = gym.make(args.sub_env+"NoFrameskip-v4") 26 | self.args = args 27 | self.rng = rng 28 | self.env = env 29 | self.action_space = self.env.action_space.n 30 | self.obs_space = self.env.observation_space.shape 31 | 32 | if self.args.testing: 33 | import matplotlib.pyplot as plt 34 | plt.ion() 35 | plt.show(block=False) 36 | 37 | def get_lives(self): 38 | return self.env.unwrapped.ale.lives() 39 | 40 | def noops(self): 41 | num_actions = self.rng.randint(1, self.args.max_start_nullops) 42 | for i in range(np.max([num_actions//self.args.frame_skip, self.args.concat_frames])): 43 | self.act(0) 44 | if self.env.unwrapped.get_action_meanings()[1] == 'FIRE': 45 | self.act(1) 46 | 47 | def reset(self): 48 | self.current_x = np.zeros((self.args.concat_frames*(1 if self.args.grayscale else 3), 84, 84), dtype="float32") 49 | self.new_obs = self.env.reset() 50 | self.lives = self.get_lives() 51 | self.noops() 52 | return self.current_x 53 | 54 | def render(self): 55 | a 
= 2 56 | if a == 1: #can see what the agent sees 57 | import matplotlib.pyplot as plt 58 | plt.clf() 59 | if self.args.grayscale: 60 | plt.imshow(self.xx, cmap="Greys_r") 61 | else: 62 | x = np.swapaxes(self.xx, 0,2) 63 | x_ = np.copy(x[0]) 64 | x[0] = x[2] 65 | x[2] = x_ 66 | plt.imshow(x) 67 | plt.draw() 68 | plt.pause(0.0001) 69 | else: 70 | self.env.render() 71 | 72 | def get_new_frame(self, new_frame): 73 | a = 1 if self.args.grayscale else 3 74 | self.current_x[:-a] = self.current_x[a:] 75 | self.current_x[-a:] = new_frame 76 | 77 | def act(self, action): 78 | raw_reward, dones, done = 0, 0, False 79 | for i in range(self.args.frame_skip): 80 | if done: 81 | break 82 | new_obs, rew, done, info = self.env.step(action) 83 | self.old_obs = np.copy(self.new_obs) 84 | self.new_obs = new_obs 85 | raw_reward += rew 86 | dones += done 87 | new_frame = self.preprocess(self.new_obs, self.old_obs) 88 | self.get_new_frame(new_frame) 89 | dones += (self.get_frame_count() > self.args.max_frames_ep) 90 | 91 | new_lives = self.get_lives() 92 | death = new_lives < self.lives 93 | self.lives = new_lives 94 | if death and not bool(int(dones)): 95 | self.noops() 96 | return self.current_x, raw_reward, bool(int(dones)), death 97 | 98 | def preprocess(self, im, last_im): 99 | if self.args.color_max: 100 | im = np.maximum(im, last_im) 101 | if self.args.grayscale: 102 | proportions = [0.299, 0.587, 0.114] 103 | im = np.sum(im * proportions, axis=2) 104 | #im = cv2.resize(im, (84, 110), interpolation=cv2.INTER_AREA)[18:102, :] 105 | im = Image.fromarray(im).resize((84, 84), resample=Image.BILINEAR) 106 | x = np.array(im).astype("int32") 107 | if not self.args.grayscale: 108 | x = np.swapaxes(x, 0, 2) 109 | self.xx = x 110 | return x 111 | 112 | def get_frame_count(self): 113 | return self.env.unwrapped.ale.getEpisodeFrameNumber() 114 | 115 | class Training(): 116 | def __init__(self, rng, id_num, arr, num_moves, args): 117 | self.args = args 118 | self.rng = rng 119 | self.num_moves = num_moves 120 | self.id_num = id_num 121 | 122 | self.env = ALE_env(args, rng=rng) 123 | self.agent = AOCAgent_THEANO(self.env.action_space, id_num, arr, num_moves, args) 124 | 125 | self.train() 126 | 127 | def train(self): 128 | total_reward = 0 129 | x = self.env.reset() 130 | self.agent.reset(x) 131 | timer = time.time() 132 | recent_fps = [] 133 | frame_counter = 0 134 | total_games = 0 135 | done = False 136 | 137 | while self.num_moves.value < self.args.max_num_frames: 138 | if done: 139 | #ugly code, beautiful print 140 | total_games += 1 141 | secs = round(time.time()-timer, 1) 142 | frames = self.env.get_frame_count() 143 | fps = int(frames/secs) 144 | recent_fps = recent_fps[-9:]+[fps] 145 | eta = ((self.args.max_num_frames-self.num_moves.value)*self.args.frame_skip/(self.args.num_threads*np.mean(recent_fps))) 146 | print "id: %d\treward: %d\ttime: %.1f\tframes: %d\t %dfps \tmoves: %d \t ETA: %dh %dm %ds \t%.2f%%" % \ 147 | (self.id_num, total_reward, secs, frames, fps, self.num_moves.value, int(eta/3600), int(eta/60)%60, int(eta%60), 148 | float(self.num_moves.value)/self.args.max_num_frames*100) 149 | timer = time.time() 150 | frame_counter = 0 151 | 152 | if total_games % 1 == 0 and self.id_num == 1 and not self.args.testing: 153 | self.agent.save_values(folder_name) 154 | print "saved model" 155 | total_reward = 0 156 | x = self.env.reset() 157 | self.agent.reset(x) 158 | done = False 159 | 160 | action = self.agent.get_action(x) 161 | new_x, reward, done, death = self.env.act(action) 162 | self.agent.store(x, 
new_x, action, reward, done, death) 163 | if self.args.testing: 164 | self.env.render() 165 | total_reward += reward 166 | x = np.copy(new_x) 167 | 168 | def parse_params(): 169 | parser = argparse.ArgumentParser() 170 | parser.add_argument('--sub-env', type=str, default="Breakout") 171 | parser.add_argument('--testing', type=str2bool, default=False) 172 | parser.add_argument('--update-freq', type=int, default=5) 173 | parser.add_argument('--max-update-freq', type=int, default=30) 174 | parser.add_argument('--num-threads', type=int, default=16) 175 | parser.add_argument('--death-ends-episode', type=str2bool, default=True) 176 | parser.add_argument('--max-start-nullops', type=int, default=30) 177 | parser.add_argument('--frame-skip', type=int, default=4) 178 | parser.add_argument('--concat-frames', type=int, default=4) 179 | parser.add_argument('--entropy-reg', type=float, default=0.01) 180 | parser.add_argument('--gamma', type=float, default=0.99) 181 | parser.add_argument('--clip', type=float, default=40) 182 | parser.add_argument('--clip-type', type=str, default="global", choices=["norm", "global"]) 183 | parser.add_argument('--color-averaging', type=str2bool, default=False) 184 | parser.add_argument('--color-max', type=str2bool, default=True) 185 | parser.add_argument('--grayscale', type=str2bool, default=True) 186 | parser.add_argument('--max-num-frames', type=int, default=80000000) 187 | parser.add_argument('--max-frames-ep', type=int, default=72000) 188 | parser.add_argument('--init-lr', type=float, default=0.0007) 189 | parser.add_argument('--rms-shared', type=str2bool, default=True) 190 | parser.add_argument('--critic-coef', type=float, default=1.) 191 | parser.add_argument('--num-options', type=int, default=8) 192 | parser.add_argument('--option-epsilon', type=float, default=0.1) 193 | parser.add_argument('--delib-cost', type=float, default=0.0) 194 | parser.add_argument('--margin-cost', type=float, default=0.0) 195 | parser.add_argument('--save-path', type=str, default="models") 196 | parser.add_argument('--load-folder', type=str, default="") # if not empty, will load folder to resume training 197 | parser.add_argument('--folder-name', type=str, default="") 198 | parser.add_argument('--resume-if-exists', type=str2bool, default=False) # for server that kills and restarts processes 199 | return parser.parse_known_args()[0] #parser.parse_args() 200 | 201 | 202 | if __name__ == '__main__': 203 | params = parse_params() 204 | 205 | folder_name = get_folder_name(params) if params.folder_name == "" else params.folder_name 206 | attempted_path = "./"+params.save_path+"/"+folder_name 207 | print "->", attempted_path, os.path.isdir(attempted_path) 208 | if params.resume_if_exists and os.path.isdir(attempted_path): 209 | params.load_folder = attempted_path 210 | print "RESUMING TRAINING AUTOMATICALLY" 211 | 212 | init_num_moves = 0 213 | if params.load_folder != "": 214 | folder_name = params.load_folder 215 | with open(folder_name+"/data.csv", "rb") as file: 216 | for last in file: 217 | if last.split(",")[0].isdigit(): 218 | init_num_moves = int(last.split(",")[0]) 219 | init_weights = pickle.load(open(folder_name+"/model.pkl", "rb")) 220 | is_testing = copy.deepcopy(params.testing) 221 | params = pickle.load(open(params.load_folder+"/params.pkl", "rb")) 222 | params.testing = is_testing 223 | if is_testing: 224 | params.num_threads = 1 225 | else: 226 | folder_name = foldercreation(folder_name, params) 227 | pickle.dump(params, open(folder_name+"/params.pkl", "wb")) 228 | 229 | 
setattr(params, "folder_name", folder_name) 230 | 231 | setattr(params, "init_num_moves", init_num_moves) 232 | print "init_num_moves:", init_num_moves 233 | 234 | f = lambda rng, i, shared_arr, num_moves, args: Training(rng, i, shared_arr, num_moves, args) 235 | 236 | env = ALE_env(params) 237 | if init_num_moves == 0: 238 | init_weights = (AOCAgent_THEANO(env.action_space, 0, args=params)).get_param_vals() 239 | 240 | num_moves = Value("i", init_num_moves, lock=False) 241 | arr = [Array('f', m.flatten(), lock=False) for m in init_weights] 242 | seed = np.random.randint(10000) 243 | for i in range(params.num_threads): 244 | Process(target=f, args=(np.random.RandomState(seed+i), i+1, arr, num_moves, params)).start() 245 | 246 | 247 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeanharb/a2oc_delib/42e25ce74c4ccc8e9e2f0667cb511aca4af45066/utils/__init__.py -------------------------------------------------------------------------------- /utils/helper.py: -------------------------------------------------------------------------------- 1 | import os, datetime, pickle as pkl 2 | 3 | def create_dir(p, num_suffix=False): 4 | i = 0 5 | while True: 6 | try: 7 | new_dir = p+(("_v"+str(i) if i > 0 else "") if num_suffix else "") 8 | os.makedirs(new_dir) 9 | break 10 | except OSError, e: 11 | if e.errno != 17: 12 | raise # This was not a "directory exist" error.. 13 | else: 14 | i += 1 15 | if not num_suffix: 16 | break 17 | return new_dir 18 | 19 | def get_folder_name(args): 20 | folder_name = args.sub_env+"_"+str(args.num_options)+"opts_"+str(args.delib_cost)+"delib_"+ \ 21 | str(args.num_threads)+"_"+str(args.max_num_frames//80000000)+"day" 22 | return folder_name 23 | 24 | def foldercreation(folder_name, args): 25 | tempdir = os.path.join(os.getcwd(), args.save_path) 26 | create_dir(tempdir) 27 | #folder_name = folder_name if folder_name is not None else datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 28 | mydir = os.path.join(tempdir, folder_name) 29 | return create_dir(mydir, num_suffix=True) 30 | 31 | def str2bool(v): 32 | if v.lower() not in ("yes", "true", "t", "1", "no", "false", "f", "0"): 33 | print "Inserted unrecognized string for bool value. Must be one of the following:" 34 | print " ".join(["yes", "true", "t", "1", "no", "false", "f", "0"]) 35 | print "Note: Capitalization doesn't matter." 
36 | raise NotImplementedError 37 | return v.lower() in ("yes", "true", "t", "1") 38 | -------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn 2 | import matplotlib.pyplot as plt 3 | import sys, time 4 | import numpy as np 5 | 6 | plt.ion() 7 | fig = plt.figure() 8 | plt.show(block=False) 9 | refresh_rate = 5.0 10 | 11 | def handle_close(evt): 12 | sys.exit() 13 | 14 | show_term = "--term" in sys.argv 15 | while True: 16 | try: 17 | data = [] 18 | indices = [] 19 | plt.clf() 20 | weight_moves = False 21 | for i in range(len(sys.argv[1:])): 22 | if "--" in sys.argv[i+1]: 23 | continue 24 | d = [] 25 | e = [] 26 | filename = sys.argv[i+1] 27 | if ".csv" not in filename: filename += "/data.csv" 28 | f = open(filename, "rb") 29 | for j, line in enumerate(f): 30 | if not line.split(",")[0].isdigit(): continue 31 | if weight_moves or ("," in line): 32 | d.append(float(line.split(",")[1+show_term])) 33 | e.append(int(line.split(",")[0])) 34 | weight_moves = True 35 | else: 36 | d.append(int(line)) 37 | weight_moves = False 38 | f.close() 39 | data.append(d) 40 | indices.append(e) 41 | 42 | a = int(max([len(each) for each in data])/250)+1 43 | #weight_moves = False 44 | if weight_moves: 45 | a = int(float(max([i[-1] for i in indices]))/250) 46 | d2 = [] 47 | all_p = [] 48 | i = -1 49 | for temp_i in range(len(sys.argv[1:])): 50 | if "--" in sys.argv[temp_i+1]: 51 | continue 52 | i += 1 53 | if weight_moves: 54 | frame_interval = a 55 | new_matrix = [] 56 | one_row = [] 57 | counter = 0 58 | count = 0 59 | while count < len(data[i]): 60 | if indices[i][count] > (counter+1)*frame_interval: 61 | if len(one_row) == 0: 62 | if len(new_matrix) == 0: 63 | one_row = [data[i][count]] 64 | else: 65 | one_row = [new_matrix[-1]] 66 | new_matrix.append(np.mean(one_row)) 67 | one_row = [] 68 | counter += 1 69 | else: 70 | one_row.append(data[i][count]) 71 | count += 1 72 | p, = plt.plot(np.array(range(len(new_matrix)))*frame_interval, np.array(new_matrix)) 73 | else: 74 | p, = plt.plot(np.array(data[i][:-(len(data[i])%a)]).reshape(((len(data[i])-(len(data[i])%a))/a,a)).mean(axis=1).flatten()) 75 | all_p.append(p) 76 | legends = [] 77 | for dd in sys.argv[1:]: 78 | if "--" not in dd: legends.append(dd.split("/")[-2]) 79 | plt.legend(all_p, legends, loc=2) 80 | fig.canvas.mpl_connect('close_event', handle_close) 81 | plt.draw() 82 | plt.pause(refresh_rate) 83 | except Exception, e: 84 | print e 85 | time.sleep(2) 86 | pass 87 | -------------------------------------------------------------------------------- /watch.py: -------------------------------------------------------------------------------- 1 | from train import Training, parse_params 2 | import pickle as pkl, sys, cv2 3 | from multiprocessing import Value 4 | import numpy as np 5 | 6 | p = pkl.load(open(sys.argv[1]+"/model.pkl", "rb")) 7 | args = pkl.load(open(sys.argv[1]+"/params.pkl", "rb")) 8 | temp_p = parse_params() 9 | for a in args.__dict__: 10 | setattr(temp_p,a, args.__dict__[a]) 11 | args = temp_p 12 | print args 13 | print 14 | args.testing = True 15 | setattr(args, "init_num_moves", 2) 16 | args.fps = 60 17 | t = Training(np.random.RandomState(), 0, p, Value("i", 0, lock=False), args) 18 | --------------------------------------------------------------------------------
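
Note: the comment block in AOCAgent_THEANO.store() (OC_theano.py, around the rollout cut-off) describes an n-step return computed backwards from option termination, with the deliberation cost already folded into the stored rewards. The snippet below is a standalone NumPy sketch of that target computation; the function name n_step_targets and its arguments are illustrative and not part of the repository.
```python
import numpy as np

def n_step_targets(r_seq, bootstrap, gamma, end_ep):
    # Backward recursion R_j = r_j + gamma * R_{j+1}, mirroring store():
    # r_seq already has the deliberation cost subtracted on steps where the
    # active option terminated; bootstrap is V(s_T) if the option terminated
    # at the cut-off, otherwise Q(s_T, o), and it is dropped entirely when
    # the episode ended.
    R = 0.0 if end_ep else bootstrap
    targets = np.zeros(len(r_seq), dtype="float32")
    for j in range(len(r_seq) - 1, -1, -1):
        R = np.float32(r_seq[j] + gamma * R)
        targets[j] = R
    return targets

# Example: three stored transitions, rollout cut off before the episode ends.
targets = n_step_targets(np.array([0.0, -0.02, 1.0], dtype="float32"),
                         bootstrap=0.5, gamma=0.99, end_ep=False)
```
In the agent these targets are the y passed to update_weights(), so the same values serve both as the critic regression target and as the return in the intra-option policy-gradient term.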