├── .gitignore ├── OC_theano.py ├── README.md ├── __init__.py ├── nnet.py ├── train.py ├── utils ├── __init__.py ├── helper.py └── plot.py └── watch.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | models/ 4 | temp_files/ 5 | models_oc/ 6 | *.pkl 7 | *results/ 8 | *results_oc/ 9 | temp* 10 | core* 11 | *.swp 12 | -------------------------------------------------------------------------------- /OC_theano.py: -------------------------------------------------------------------------------- 1 | from nnet import Model, MLP3D 2 | import numpy as np 3 | import sys,pickle,os,theano 4 | import theano.tensor as T 5 | from lasagne.updates import norm_constraint 6 | from collections import OrderedDict 7 | 8 | def clip_grads(grads, clip, clip_type): 9 | if clip > 0.1: 10 | if clip_type == "norm": 11 | grads = [norm_constraint(p, clip) if p.ndim > 1 else T.clip(p, -clip, clip) for p in grads] 12 | elif clip_type == "global": 13 | norm = T.sqrt(T.sum([T.sum(T.sqr(g)) for g in grads])*2) + 1e-7 14 | scale = clip * T.min([1/norm,1./clip]).astype("float32") 15 | grads = [g*scale for g in grads] 16 | return grads 17 | 18 | def rmsprop(params, grads, clip=0, rho=0.99, eps=0.1, clip_type="norm"): 19 | grads = clip_grads(grads, clip, clip_type) 20 | updates = OrderedDict() 21 | all_grads, rms_weights = [], [] 22 | for param, grad in zip(params, grads): 23 | acc_rms = theano.shared(param.get_value() * 0) 24 | rms_weights.append(acc_rms) 25 | acc_rms_new = rho * acc_rms + (1 - rho) * grad ** 2 26 | updates[acc_rms] = acc_rms_new 27 | all_grads.append(grad / T.sqrt(acc_rms_new + eps)) 28 | return updates, all_grads, rms_weights 29 | 30 | 31 | class AOCAgent_THEANO(): 32 | def __init__(self, num_actions, id_num, shared_arr=None, num_moves=None, args=None): 33 | print "USING OPTION CRITIC" 34 | self.args = args 35 | self.id_num = id_num 36 | self.num_actions = num_actions 37 | self.num_moves = num_moves 38 | self.reset_storing() 39 | self.rng = np.random.RandomState(100+id_num) 40 | model_network = [{"model_type": "conv", "filter_size": [8,8], "pool": [1,1], "stride": [4,4], "out_size": 16, "activation": "relu"}, 41 | {"model_type": "conv", "filter_size": [4,4], "pool": [1,1], "stride": [2,2], "out_size": 32, "activation": "relu"}, 42 | {"model_type": "mlp", "out_size": 256, "activation": "relu"}] 43 | out = [None,model_network[-1]["out_size"]] 44 | self.conv = Model(model_network, input_size=[None,args.concat_frames*(1 if args.grayscale else 3),84,84]) 45 | self.termination_model = Model([{"model_type": "mlp", "out_size": args.num_options, "activation": "sigmoid", "W":0}], input_size=out) 46 | self.Q_val_model = Model([{"model_type": "mlp", "out_size": args.num_options, "activation": "linear", "W":0}], input_size=out) 47 | self.options_model = MLP3D(input_size=out[1], num_options=args.num_options, out_size=num_actions, activation="softmax") 48 | self.params = self.conv.params + self.Q_val_model.params + self.options_model.params + self.termination_model.params 49 | self.set_rms_shared_weights(shared_arr) 50 | 51 | x = T.ftensor4() 52 | y = T.fvector() 53 | a = T.ivector() 54 | o = T.ivector() 55 | delib = T.fscalar() 56 | 57 | s = self.conv.apply(x/np.float32(255)) 58 | intra_option_policy = self.options_model.apply(s, o) 59 | 60 | q_vals = self.Q_val_model.apply(s) 61 | disc_q = theano.gradient.disconnected_grad(q_vals) 62 | current_option_q = q_vals[T.arange(o.shape[0]), o] 63 | disc_opt_q = disc_q[T.arange(o.shape[0]), o] 64 | terms = 
self.termination_model.apply(s) 65 | o_term = terms[T.arange(o.shape[0]), o] 66 | V = T.max(q_vals, axis=1)*(1-self.args.option_epsilon) + (self.args.option_epsilon*T.mean(q_vals, axis=1)) 67 | disc_V = theano.gradient.disconnected_grad(V) 68 | 69 | aggr = T.mean #T.sum 70 | log_eps = 0.0001 71 | 72 | critic_cost = aggr(args.critic_coef*0.5*T.sqr(y-current_option_q)) 73 | termination_grad = aggr(o_term*((disc_opt_q-disc_V)+delib)) 74 | entropy = -aggr(T.sum(intra_option_policy*T.log(intra_option_policy+log_eps), axis=1))*args.entropy_reg 75 | pg = aggr((T.log(intra_option_policy[T.arange(a.shape[0]), a]+log_eps)) * (y-disc_opt_q)) 76 | cost = pg + entropy - critic_cost - termination_grad 77 | 78 | grads = T.grad(cost*args.update_freq, self.params) 79 | #grads = T.grad(cost, self.params) 80 | updates, grad_rms, self.rms_weights = rmsprop(self.params, grads, clip=args.clip, clip_type=args.clip_type) 81 | self.share_rms(shared_arr) 82 | 83 | self.get_state = theano.function([x], s, on_unused_input='warn') 84 | self.get_policy = theano.function([s, o], intra_option_policy) 85 | self.get_termination = theano.function([x], terms) 86 | self.get_q = theano.function([x], q_vals) 87 | self.get_q_from_s = theano.function([s], q_vals) 88 | self.get_V = theano.function([x], V) 89 | 90 | self.rms_grads = theano.function([x,a,y,o, delib], grad_rms, updates=updates, on_unused_input='warn') 91 | print "ALL COMPILED" 92 | 93 | if not self.args.testing: 94 | self.init_tracker() 95 | self.initialized = False 96 | 97 | def update_weights(self, x, a, y, o, moves, delib): 98 | args = self.args 99 | self.num_moves.value += moves 100 | lr = np.max([args.init_lr * (args.max_num_frames-self.num_moves.value)/args.max_num_frames, 0]).astype("float32") 101 | 102 | cumul = self.rms_grads(x,a,y,o,delib) 103 | for i in range(len(cumul)): 104 | self.shared_arr[i] += lr*cumul[i] 105 | self.params[i].set_value(self.shared_arr[i]) 106 | return 107 | 108 | def load_values(self, values): 109 | assert(len(self.params+self.rms_weights) == len(values)) 110 | for p, v in zip(self.params+self.rms_weights, values): p.set_value(v) 111 | 112 | def save_values(self, folder_name): 113 | pickle.dump([p.get_value() for p in self.params+self.rms_weights], open(folder_name+"/tmp_model.pkl", "wb")) 114 | os.system("mv "+folder_name+"/tmp_model.pkl "+folder_name+"/model.pkl") 115 | #try: # server creates too many core files 116 | # os.system("rm ./core*") 117 | #except: 118 | # pass 119 | 120 | def get_param_vals(self): 121 | return [m.get_value() for m in self.params+self.rms_weights] 122 | 123 | def set_rms_shared_weights(self, shared_arr): 124 | if shared_arr is not None: 125 | self.shared_arr = [np.frombuffer(s, dtype="float32").reshape(p.get_value().shape) for s, p in zip(shared_arr, self.params)] 126 | self.rms_shared_arr = shared_arr[len(self.params):] 127 | if self.args.init_num_moves > 0: 128 | for s, p in zip(shared_arr, self.params): 129 | p.set_value(np.frombuffer(s, dtype="float32").reshape(p.get_value().shape)) 130 | print "LOADED VALUES" 131 | 132 | def share_rms(self, shared_arr): 133 | # Ties rms params between threads with borrow=True flag 134 | if self.args.rms_shared and shared_arr is not None: 135 | assert(len(self.rms_weights) == len(self.rms_shared_arr)) 136 | for rms_w, s_rms_w in zip(self.rms_weights, self.rms_shared_arr): 137 | rms_w.set_value(np.frombuffer(s_rms_w, dtype="float32").reshape(rms_w.get_value().shape), borrow=True) 138 | 139 | def get_action(self, x): 140 | p = self.get_policy([self.current_s], 
[self.current_o]) 141 | return self.rng.choice(range(self.num_actions), p=p[-1]) 142 | 143 | def get_policy_over_options(self, s): 144 | return self.get_q_from_s(s)[0].argmax() if self.rng.rand() > self.args.option_epsilon else self.rng.randint(self.args.num_options) 145 | 146 | def update_internal_state(self, x): 147 | self.current_s = self.get_state([x])[0] 148 | self.delib = self.args.delib_cost 149 | 150 | if self.terminated: 151 | self.current_o = self.get_policy_over_options([self.current_s]) 152 | self.o_tracker_chosen[self.current_o] += 1 153 | 154 | self.o_tracker_steps[self.current_o] += 1 155 | 156 | def init_tracker(self): 157 | csv_things = ["moves", "reward", "term_prob"] 158 | csv_things += ["opt_chosen"+str(ccc) for ccc in range(self.args.num_options)] 159 | csv_things += ["opt_steps"+str(ccc) for ccc in range(self.args.num_options)] 160 | with open(self.args.folder_name+"/data.csv", "a") as myfile: 161 | myfile.write(",".join([str(cc) for cc in csv_things])+"\n") 162 | 163 | def tracker(self): 164 | term_prob = float(self.termination_counter)/self.frame_counter*100 165 | csv_things = [self.num_moves.value, self.total_reward, round(term_prob,1)]+list(self.o_tracker_chosen)+list(self.o_tracker_steps) 166 | with open(self.args.folder_name+"/data.csv", "a") as myfile: 167 | myfile.write(",".join([str(cc) for cc in csv_things])+"\n") 168 | 169 | def reset_tracker(self): 170 | self.termination_counter = 0 171 | self.frame_counter = 0 172 | self.o_tracker_chosen = np.zeros(self.args.num_options,) 173 | self.o_tracker_steps = np.zeros(self.args.num_options,) 174 | 175 | def reset(self, x): 176 | if not self.args.testing and self.initialized: self.tracker() 177 | self.total_reward = 0 178 | self.terminated = True 179 | self.reset_tracker() 180 | self.update_internal_state(x) 181 | self.initialized = True 182 | 183 | def reset_storing(self): 184 | self.a_seq = np.zeros((self.args.max_update_freq,), dtype="int32") 185 | self.o_seq = np.zeros((self.args.max_update_freq,), dtype="int32") 186 | self.r_seq = np.zeros((self.args.max_update_freq,), dtype="float32") 187 | self.x_seq = np.zeros((self.args.max_update_freq, self.args.concat_frames*(1 if self.args.grayscale else 3),84,84),dtype="float32") 188 | self.t_counter = 0 189 | 190 | def store(self, x, new_x, action, raw_reward, done, death): 191 | end_ep = done or (death and self.args.death_ends_episode) 192 | self.frame_counter += 1 193 | 194 | self.total_reward += raw_reward 195 | reward = np.clip(raw_reward, -1, 1) 196 | 197 | self.terminated = self.get_termination([new_x])[0][self.current_o] > self.rng.rand() 198 | self.termination_counter += self.terminated 199 | 200 | self.x_seq[self.t_counter] = np.copy(x) 201 | self.o_seq[self.t_counter] = np.copy(self.current_o) 202 | self.a_seq[self.t_counter] = np.copy(action) 203 | self.r_seq[self.t_counter] = np.copy(float(reward)) - (float(self.terminated)*self.delib*(1-float(end_ep))) 204 | 205 | self.t_counter += 1 206 | 207 | # do n-step return to option termination. 208 | # cut off at self.args.max_update_freq 209 | # min steps: self.args.update_freq (usually 5 like a3c) 210 | # this doesn't make option length a minimum of 5 (they can still terminate). 
It only sets a minimum batch size. 211 | option_term = (self.terminated and self.t_counter >= self.args.update_freq) 212 | if self.t_counter == self.args.max_update_freq or end_ep or option_term: 213 | if not self.args.testing: 214 | V = self.get_V([new_x])[0] if self.terminated else self.get_q([new_x])[0][self.current_o] 215 | R = 0 if end_ep else V 216 | V = [] 217 | for j in range(self.t_counter-1,-1,-1): 218 | R = np.float32(self.r_seq[j] + self.args.gamma*R) 219 | V.append(R) 220 | self.update_weights(self.x_seq[:self.t_counter], self.a_seq[:self.t_counter], V[::-1], 221 | self.o_seq[:self.t_counter], self.t_counter, self.delib+self.args.margin_cost) 222 | self.reset_storing() 223 | if not end_ep: 224 | self.update_internal_state(new_x) 225 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # When Waiting is not an Option: Learning Options with a Deliberation Cost 2 | 3 | arXiv link: https://arxiv.org/pdf/1709.04571.pdf 4 | 5 | ## Installation 6 | 7 | Here's a list of all dependencies: 8 | 9 | - Numpy 10 | - Theano 11 | - Lasagne 12 | - Argparse 13 | - OpenAI Gym [Atari] 14 | - matplotlib 15 | - cv2 (OpenCV) 16 | - PIL (Image) 17 | 18 | ## Training 19 | 20 | To train, run the following command: 21 | ``` 22 | python train.py --sub-env Breakout --num-options 8 --num-threads 16 --folder-name Breakout_model 23 | ``` 24 | 25 | To view a list of available parameters, run: 26 | ``` 27 | python train.py --help 28 | ``` 29 | 30 | During training, you can run utils/plot.py to view the training curves. Each argument can be a path to a different run; all given runs are drawn on the same plot. 31 | ``` 32 | python utils/plot.py models/Breakout_model/ models/Breakout_model_v2/ models/Breakout_model_v3/ 33 | ``` 34 | 35 | ## Testing 36 | 37 | To watch a trained model, run watch.py and give it the path to the saved model files, e.g.: 38 | ``` 39 | python watch.py models/Breakout_model/ 40 | ``` 41 | 42 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeanharb/a2oc_delib/42e25ce74c4ccc8e9e2f0667cb511aca4af45066/__init__.py -------------------------------------------------------------------------------- /nnet.py: -------------------------------------------------------------------------------- 1 | import theano, lasagne 2 | import theano.tensor as T 3 | import math, csv, time, sys, os, pdb, copy 4 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 5 | from lasagne.layers import Conv2DLayer, conv, Upscale2DLayer 6 | if theano.config.device.startswith("gpu"): 7 | from lasagne.layers import cuda_convnet 8 | import numpy as np 9 | 10 | def get_init(m, t): 11 | inits = {"zeros": lasagne.init.Constant(0.), "norm": lasagne.init.Normal(0.1)} 12 | if t not in m: 13 | if t == "b": 14 | return lasagne.init.Constant(0.) 
15 | return lasagne.init.GlorotUniform() 16 | elif isinstance(m[t], basestring): 17 | return inits[m[t]] 18 | elif isinstance(m[t], int): 19 | return lasagne.init.Constant(m[t]) 20 | else: 21 | return m[t] 22 | 23 | def get_activation(activation): 24 | if activation == "softmax": 25 | output = T.nnet.softmax 26 | elif activation is None: 27 | output = None 28 | elif activation == "tanh": 29 | output = T.tanh 30 | elif activation == "relu": 31 | output = T.nnet.relu 32 | elif "leaky_relu" in activation: 33 | output = lambda x: T.nnet.relu(x, alpha=float(activation.split(" ")[1])) 34 | elif activation == "linear": 35 | output = None 36 | elif activation == "sigmoid": 37 | output = T.nnet.sigmoid 38 | elif activation == "hard_sigmoid": 39 | output = T.nnet.hard_sigmoid 40 | else: 41 | print "activation not recognized:", activation 42 | raise NotImplementedError 43 | return output 44 | 45 | class MLP3D(): 46 | def __init__(self, input_size=None, num_options=None, out_size=None, activation="softmax"): 47 | option_out_size = out_size 48 | limits = (6./np.sqrt(input_size + option_out_size))/num_options 49 | self.options_W = theano.shared(np.random.uniform(size=(num_options, input_size, option_out_size), high=limits, low=-limits).astype("float32")) 50 | self.options_b = theano.shared(np.zeros((num_options, option_out_size)).astype("float32")) 51 | self.activation = get_activation(activation) 52 | self.params = [self.options_W, self.options_b] 53 | 54 | def apply(self, inputs, option=None): 55 | W = self.options_W[option] 56 | b = self.options_b[option] 57 | 58 | out = T.sum(inputs.dimshuffle(0,1,'x')*W, axis=1) + b 59 | return out if self.activation is None else self.activation(out) 60 | 61 | def save_params(self): 62 | return [i.get_value() for i in self.params] 63 | 64 | def load_params(self, values): 65 | print "LOADING NNET..", 66 | for p, value in zip(self.params, values): 67 | p.set_value(value.astype("float32")) 68 | print "LOADED" 69 | 70 | class Model(): 71 | def __call__(self, *args, **kwargs): 72 | return self.apply(*args, **kwargs) 73 | 74 | def get_activation(self, model): 75 | activation = model["activation"] if "activation" in model else "linear" 76 | return get_activation(activation) 77 | 78 | def create_layer(self, inputs, model, dnn_type=True): 79 | 80 | if model["model_type"] == "conv": 81 | if dnn_type: 82 | import lasagne.layers.dnn as dnn 83 | conv_type = dnn.Conv2DDNNLayer if dnn_type else Conv2DLayer 84 | poolsize = tuple(model["pool"]) if "pool" in model else (1,1) 85 | stride = tuple(model["stride"]) if "stride" in model else (1,1) 86 | layer = conv_type(inputs, 87 | model["out_size"], 88 | filter_size=model["filter_size"], 89 | stride=stride, 90 | nonlinearity=self.get_activation(model), 91 | W=get_init(model, "W"), 92 | b=get_init(model, "b"), 93 | pad="valid" if "pad" not in model else model["pad"]) 94 | elif model["model_type"] == "mlp": 95 | layer = lasagne.layers.DenseLayer(inputs, 96 | num_units=model["out_size"], 97 | nonlinearity=self.get_activation(model), 98 | W=get_init(model, "W"), 99 | b=get_init(model, "b")) 100 | elif model["model_type"] == "option": 101 | layer = MLP3D(model, inputs, nonlinearity=self.get_activation(model)) 102 | else: 103 | print "UNKNOWN LAYER NAME" 104 | raise NotImplementedError 105 | return layer 106 | 107 | def __init__(self, model_in, input_size=None, rng=1234, dnn_type=False): 108 | """ 109 | example model: 110 | model = [{"model_type": "conv", "filter_size": [5,5], "pool": [1,1], "stride": [1,1], "out_size": 5}, 111 | 
{"model_type": "conv", "filter_size": [7,7], "pool": [1,1], "stride": [1,1], "out_size": 15}, 112 | {"model_type": "mlp", "out_size": 300, "activation": "tanh"}, 113 | {"model_type": "mlp", "out_size": 10, "activation": "softmax"}] 114 | """ 115 | self.theano_rng = RandomStreams(rng) 116 | rng = np.random.RandomState(rng) 117 | lasagne.random.set_rng(rng) 118 | 119 | new_layer = tuple(input_size) if isinstance(input_size, list) else input_size 120 | model = [model_in] if isinstance(model_in, dict) else model_in 121 | 122 | print "Building following model..." 123 | print model 124 | 125 | self.model = model 126 | self.input_size = input_size 127 | self.out_size = model_in[-1]["out_size"] 128 | self.dnn_type = dnn_type 129 | 130 | # create neural net layers 131 | self.params = [] 132 | self.layers = [] 133 | for i, m in enumerate(model): 134 | new_layer = self.create_layer(new_layer, m, dnn_type=dnn_type) 135 | self.params += new_layer.get_params() 136 | self.layers.append(new_layer) 137 | 138 | print "Build complete." 139 | print 140 | 141 | def apply(self, x): 142 | last_layer_inputs = x 143 | for i, m in enumerate(self.model): 144 | if m["model_type"] in ["mlp", "logistic", "advantage"] and last_layer_inputs.ndim > 2: 145 | last_layer_inputs = last_layer_inputs.flatten(2) 146 | last_layer_inputs = self.layers[i].get_output_for(last_layer_inputs) 147 | return last_layer_inputs 148 | 149 | def save_params(self): 150 | return [i.get_value() for i in self.params] 151 | 152 | def load_params(self, values): 153 | print "LOADING NNET..", 154 | for p, value in zip(self.params, values): 155 | p.set_value(value.astype("float32")) 156 | print "LOADED" 157 | 158 | if __name__ == "__main__": 159 | pass 160 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Value, Array, RawArray 2 | from collections import OrderedDict 3 | import numpy as np 4 | from OC_theano import AOCAgent_THEANO 5 | import cv2,copy,sys,pickle,os,time,argparse 6 | from PIL import Image 7 | from utils.helper import foldercreation, str2bool, get_folder_name 8 | 9 | class Environment(): 10 | def reset(self): 11 | raise NotImplementedError 12 | 13 | def render(self): 14 | raise NotImplementedError 15 | 16 | def act(self): 17 | raise NotImplementedError 18 | 19 | def get_frame_count(self): 20 | raise NotImplementedError 21 | 22 | class ALE_env(Environment): 23 | def __init__(self, args, rng=None): 24 | import gym 25 | env = gym.make(args.sub_env+"NoFrameskip-v4") 26 | self.args = args 27 | self.rng = rng 28 | self.env = env 29 | self.action_space = self.env.action_space.n 30 | self.obs_space = self.env.observation_space.shape 31 | 32 | if self.args.testing: 33 | import matplotlib.pyplot as plt 34 | plt.ion() 35 | plt.show(block=False) 36 | 37 | def get_lives(self): 38 | return self.env.unwrapped.ale.lives() 39 | 40 | def noops(self): 41 | num_actions = self.rng.randint(1, self.args.max_start_nullops) 42 | for i in range(np.max([num_actions//self.args.frame_skip, self.args.concat_frames])): 43 | self.act(0) 44 | if self.env.unwrapped.get_action_meanings()[1] == 'FIRE': 45 | self.act(1) 46 | 47 | def reset(self): 48 | self.current_x = np.zeros((self.args.concat_frames*(1 if self.args.grayscale else 3), 84, 84), dtype="float32") 49 | self.new_obs = self.env.reset() 50 | self.lives = self.get_lives() 51 | self.noops() 52 | return self.current_x 53 | 54 | def render(self): 55 | a 
= 2 56 | if a == 1: #can see what the agent sees 57 | import matplotlib.pyplot as plt 58 | plt.clf() 59 | if self.args.grayscale: 60 | plt.imshow(self.xx, cmap="Greys_r") 61 | else: 62 | x = np.swapaxes(self.xx, 0,2) 63 | x_ = np.copy(x[0]) 64 | x[0] = x[2] 65 | x[2] = x_ 66 | plt.imshow(x) 67 | plt.draw() 68 | plt.pause(0.0001) 69 | else: 70 | self.env.render() 71 | 72 | def get_new_frame(self, new_frame): 73 | a = 1 if self.args.grayscale else 3 74 | self.current_x[:-a] = self.current_x[a:] 75 | self.current_x[-a:] = new_frame 76 | 77 | def act(self, action): 78 | raw_reward, dones, done = 0, 0, False 79 | for i in range(self.args.frame_skip): 80 | if done: 81 | break 82 | new_obs, rew, done, info = self.env.step(action) 83 | self.old_obs = np.copy(self.new_obs) 84 | self.new_obs = new_obs 85 | raw_reward += rew 86 | dones += done 87 | new_frame = self.preprocess(self.new_obs, self.old_obs) 88 | self.get_new_frame(new_frame) 89 | dones += (self.get_frame_count() > self.args.max_frames_ep) 90 | 91 | new_lives = self.get_lives() 92 | death = new_lives < self.lives 93 | self.lives = new_lives 94 | if death and not bool(int(dones)): 95 | self.noops() 96 | return self.current_x, raw_reward, bool(int(dones)), death 97 | 98 | def preprocess(self, im, last_im): 99 | if self.args.color_max: 100 | im = np.maximum(im, last_im) 101 | if self.args.grayscale: 102 | proportions = [0.299, 0.587, 0.114] 103 | im = np.sum(im * proportions, axis=2) 104 | #im = cv2.resize(im, (84, 110), interpolation=cv2.INTER_AREA)[18:102, :] 105 | im = Image.fromarray(im).resize((84, 84), resample=Image.BILINEAR) 106 | x = np.array(im).astype("int32") 107 | if not self.args.grayscale: 108 | x = np.swapaxes(x, 0, 2) 109 | self.xx = x 110 | return x 111 | 112 | def get_frame_count(self): 113 | return self.env.unwrapped.ale.getEpisodeFrameNumber() 114 | 115 | class Training(): 116 | def __init__(self, rng, id_num, arr, num_moves, args): 117 | self.args = args 118 | self.rng = rng 119 | self.num_moves = num_moves 120 | self.id_num = id_num 121 | 122 | self.env = ALE_env(args, rng=rng) 123 | self.agent = AOCAgent_THEANO(self.env.action_space, id_num, arr, num_moves, args) 124 | 125 | self.train() 126 | 127 | def train(self): 128 | total_reward = 0 129 | x = self.env.reset() 130 | self.agent.reset(x) 131 | timer = time.time() 132 | recent_fps = [] 133 | frame_counter = 0 134 | total_games = 0 135 | done = False 136 | 137 | while self.num_moves.value < self.args.max_num_frames: 138 | if done: 139 | #ugly code, beautiful print 140 | total_games += 1 141 | secs = round(time.time()-timer, 1) 142 | frames = self.env.get_frame_count() 143 | fps = int(frames/secs) 144 | recent_fps = recent_fps[-9:]+[fps] 145 | eta = ((self.args.max_num_frames-self.num_moves.value)*self.args.frame_skip/(self.args.num_threads*np.mean(recent_fps))) 146 | print "id: %d\treward: %d\ttime: %.1f\tframes: %d\t %dfps \tmoves: %d \t ETA: %dh %dm %ds \t%.2f%%" % \ 147 | (self.id_num, total_reward, secs, frames, fps, self.num_moves.value, int(eta/3600), int(eta/60)%60, int(eta%60), 148 | float(self.num_moves.value)/self.args.max_num_frames*100) 149 | timer = time.time() 150 | frame_counter = 0 151 | 152 | if total_games % 1 == 0 and self.id_num == 1 and not self.args.testing: 153 | self.agent.save_values(folder_name) 154 | print "saved model" 155 | total_reward = 0 156 | x = self.env.reset() 157 | self.agent.reset(x) 158 | done = False 159 | 160 | action = self.agent.get_action(x) 161 | new_x, reward, done, death = self.env.act(action) 162 | self.agent.store(x, 
new_x, action, reward, done, death) 163 | if self.args.testing: 164 | self.env.render() 165 | total_reward += reward 166 | x = np.copy(new_x) 167 | 168 | def parse_params(): 169 | parser = argparse.ArgumentParser() 170 | parser.add_argument('--sub-env', type=str, default="Breakout") 171 | parser.add_argument('--testing', type=str2bool, default=False) 172 | parser.add_argument('--update-freq', type=int, default=5) 173 | parser.add_argument('--max-update-freq', type=int, default=30) 174 | parser.add_argument('--num-threads', type=int, default=16) 175 | parser.add_argument('--death-ends-episode', type=str2bool, default=True) 176 | parser.add_argument('--max-start-nullops', type=int, default=30) 177 | parser.add_argument('--frame-skip', type=int, default=4) 178 | parser.add_argument('--concat-frames', type=int, default=4) 179 | parser.add_argument('--entropy-reg', type=float, default=0.01) 180 | parser.add_argument('--gamma', type=float, default=0.99) 181 | parser.add_argument('--clip', type=float, default=40) 182 | parser.add_argument('--clip-type', type=str, default="global", choices=["norm", "global"]) 183 | parser.add_argument('--color-averaging', type=str2bool, default=False) 184 | parser.add_argument('--color-max', type=str2bool, default=True) 185 | parser.add_argument('--grayscale', type=str2bool, default=True) 186 | parser.add_argument('--max-num-frames', type=int, default=80000000) 187 | parser.add_argument('--max-frames-ep', type=int, default=72000) 188 | parser.add_argument('--init-lr', type=float, default=0.0007) 189 | parser.add_argument('--rms-shared', type=str2bool, default=True) 190 | parser.add_argument('--critic-coef', type=float, default=1.) 191 | parser.add_argument('--num-options', type=int, default=8) 192 | parser.add_argument('--option-epsilon', type=float, default=0.1) 193 | parser.add_argument('--delib-cost', type=float, default=0.0) 194 | parser.add_argument('--margin-cost', type=float, default=0.0) 195 | parser.add_argument('--save-path', type=str, default="models") 196 | parser.add_argument('--load-folder', type=str, default="") # if not empty, will load folder to resume training 197 | parser.add_argument('--folder-name', type=str, default="") 198 | parser.add_argument('--resume-if-exists', type=str2bool, default=False) # for server that kills and restarts processes 199 | return parser.parse_known_args()[0] #parser.parse_args() 200 | 201 | 202 | if __name__ == '__main__': 203 | params = parse_params() 204 | 205 | folder_name = get_folder_name(params) if params.folder_name == "" else params.folder_name 206 | attempted_path = "./"+params.save_path+"/"+folder_name 207 | print "->", attempted_path, os.path.isdir(attempted_path) 208 | if params.resume_if_exists and os.path.isdir(attempted_path): 209 | params.load_folder = attempted_path 210 | print "RESUMING TRAINING AUTOMATICALLY" 211 | 212 | init_num_moves = 0 213 | if params.load_folder != "": 214 | folder_name = params.load_folder 215 | with open(folder_name+"/data.csv", "rb") as file: 216 | for last in file: 217 | if last.split(",")[0].isdigit(): 218 | init_num_moves = int(last.split(",")[0]) 219 | init_weights = pickle.load(open(folder_name+"/model.pkl", "rb")) 220 | is_testing = copy.deepcopy(params.testing) 221 | params = pickle.load(open(params.load_folder+"/params.pkl", "rb")) 222 | params.testing = is_testing 223 | if is_testing: 224 | params.num_threads = 1 225 | else: 226 | folder_name = foldercreation(folder_name, params) 227 | pickle.dump(params, open(folder_name+"/params.pkl", "wb")) 228 | 229 | 
setattr(params, "folder_name", folder_name) 230 | 231 | setattr(params, "init_num_moves", init_num_moves) 232 | print "init_num_moves:", init_num_moves 233 | 234 | f = lambda rng, i, shared_arr, num_moves, args: Training(rng, i, shared_arr, num_moves, args) 235 | 236 | env = ALE_env(params) 237 | if init_num_moves == 0: 238 | init_weights = (AOCAgent_THEANO(env.action_space, 0, args=params)).get_param_vals() 239 | 240 | num_moves = Value("i", init_num_moves, lock=False) 241 | arr = [Array('f', m.flatten(), lock=False) for m in init_weights] 242 | seed = np.random.randint(10000) 243 | for i in range(params.num_threads): 244 | Process(target=f, args=(np.random.RandomState(seed+i), i+1, arr, num_moves, params)).start() 245 | 246 | 247 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeanharb/a2oc_delib/42e25ce74c4ccc8e9e2f0667cb511aca4af45066/utils/__init__.py -------------------------------------------------------------------------------- /utils/helper.py: -------------------------------------------------------------------------------- 1 | import os, datetime, pickle as pkl 2 | 3 | def create_dir(p, num_suffix=False): 4 | i = 0 5 | while True: 6 | try: 7 | new_dir = p+(("_v"+str(i) if i > 0 else "") if num_suffix else "") 8 | os.makedirs(new_dir) 9 | break 10 | except OSError, e: 11 | if e.errno != 17: 12 | raise # This was not a "directory exist" error.. 13 | else: 14 | i += 1 15 | if not num_suffix: 16 | break 17 | return new_dir 18 | 19 | def get_folder_name(args): 20 | folder_name = args.sub_env+"_"+str(args.num_options)+"opts_"+str(args.delib_cost)+"delib_"+ \ 21 | str(args.num_threads)+"_"+str(args.max_num_frames//80000000)+"day" 22 | return folder_name 23 | 24 | def foldercreation(folder_name, args): 25 | tempdir = os.path.join(os.getcwd(), args.save_path) 26 | create_dir(tempdir) 27 | #folder_name = folder_name if folder_name is not None else datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 28 | mydir = os.path.join(tempdir, folder_name) 29 | return create_dir(mydir, num_suffix=True) 30 | 31 | def str2bool(v): 32 | if v.lower() not in ("yes", "true", "t", "1", "no", "false", "f", "0"): 33 | print "Inserted unrecognized string for bool value. Must be one of the following:" 34 | print " ".join(["yes", "true", "t", "1", "no", "false", "f", "0"]) 35 | print "Note: Capitalization doesn't matter." 
36 | raise NotImplementedError 37 | return v.lower() in ("yes", "true", "t", "1") 38 | -------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn 2 | import matplotlib.pyplot as plt 3 | import sys, time 4 | import numpy as np 5 | 6 | plt.ion() 7 | fig = plt.figure() 8 | plt.show(block=False) 9 | refresh_rate = 5.0 10 | 11 | def handle_close(evt): 12 | sys.exit() 13 | 14 | show_term = "--term" in sys.argv 15 | while True: 16 | try: 17 | data = [] 18 | indices = [] 19 | plt.clf() 20 | weight_moves = False 21 | for i in range(len(sys.argv[1:])): 22 | if "--" in sys.argv[i+1]: 23 | continue 24 | d = [] 25 | e = [] 26 | filename = sys.argv[i+1] 27 | if ".csv" not in filename: filename += "/data.csv" 28 | f = open(filename, "rb") 29 | for j, line in enumerate(f): 30 | if not line.split(",")[0].isdigit(): continue 31 | if weight_moves or ("," in line): 32 | d.append(float(line.split(",")[1+show_term])) 33 | e.append(int(line.split(",")[0])) 34 | weight_moves = True 35 | else: 36 | d.append(int(line)) 37 | weight_moves = False 38 | f.close() 39 | data.append(d) 40 | indices.append(e) 41 | 42 | a = int(max([len(each) for each in data])/250)+1 43 | #weight_moves = False 44 | if weight_moves: 45 | a = int(float(max([i[-1] for i in indices]))/250) 46 | d2 = [] 47 | all_p = [] 48 | i = -1 49 | for temp_i in range(len(sys.argv[1:])): 50 | if "--" in sys.argv[temp_i+1]: 51 | continue 52 | i += 1 53 | if weight_moves: 54 | frame_interval = a 55 | new_matrix = [] 56 | one_row = [] 57 | counter = 0 58 | count = 0 59 | while count < len(data[i]): 60 | if indices[i][count] > (counter+1)*frame_interval: 61 | if len(one_row) == 0: 62 | if len(new_matrix) == 0: 63 | one_row = [data[i][count]] 64 | else: 65 | one_row = [new_matrix[-1]] 66 | new_matrix.append(np.mean(one_row)) 67 | one_row = [] 68 | counter += 1 69 | else: 70 | one_row.append(data[i][count]) 71 | count += 1 72 | p, = plt.plot(np.array(range(len(new_matrix)))*frame_interval, np.array(new_matrix)) 73 | else: 74 | p, = plt.plot(np.array(data[i][:-(len(data[i])%a)]).reshape(((len(data[i])-(len(data[i])%a))/a,a)).mean(axis=1).flatten()) 75 | all_p.append(p) 76 | legends = [] 77 | for dd in sys.argv[1:]: 78 | if "--" not in dd: legends.append(dd.split("/")[-2]) 79 | plt.legend(all_p, legends, loc=2) 80 | fig.canvas.mpl_connect('close_event', handle_close) 81 | plt.draw() 82 | plt.pause(refresh_rate) 83 | except Exception, e: 84 | print e 85 | time.sleep(2) 86 | pass 87 | -------------------------------------------------------------------------------- /watch.py: -------------------------------------------------------------------------------- 1 | from train import Training, parse_params 2 | import pickle as pkl, sys, cv2 3 | from multiprocessing import Value 4 | import numpy as np 5 | 6 | p = pkl.load(open(sys.argv[1]+"/model.pkl", "rb")) 7 | args = pkl.load(open(sys.argv[1]+"/params.pkl", "rb")) 8 | temp_p = parse_params() 9 | for a in args.__dict__: 10 | setattr(temp_p,a, args.__dict__[a]) 11 | args = temp_p 12 | print args 13 | print 14 | args.testing = True 15 | setattr(args, "init_num_moves", 2) 16 | args.fps = 60 17 | t = Training(np.random.RandomState(), 0, p, Value("i", 0, lock=False), args) 18 | --------------------------------------------------------------------------------
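
Note: the comment block in AOCAgent_THEANO.store() (OC_theano.py, around the rollout cut-off) describes an n-step return computed backwards from option termination, with the deliberation cost already folded into the stored rewards. The snippet below is a standalone NumPy sketch of that target computation; the function name n_step_targets and its arguments are illustrative and not part of the repository.
```python
import numpy as np

def n_step_targets(r_seq, bootstrap, gamma, end_ep):
    # Backward recursion R_j = r_j + gamma * R_{j+1}, mirroring store():
    # r_seq already has the deliberation cost subtracted on steps where the
    # active option terminated; bootstrap is V(s_T) if the option terminated
    # at the cut-off, otherwise Q(s_T, o), and it is dropped entirely when
    # the episode ended.
    R = 0.0 if end_ep else bootstrap
    targets = np.zeros(len(r_seq), dtype="float32")
    for j in range(len(r_seq) - 1, -1, -1):
        R = np.float32(r_seq[j] + gamma * R)
        targets[j] = R
    return targets

# Example: three stored transitions, rollout cut off before the episode ends.
targets = n_step_targets(np.array([0.0, -0.02, 1.0], dtype="float32"),
                         bootstrap=0.5, gamma=0.99, end_ep=False)
```
In the agent these targets are the y passed to update_weights(), so the same values serve both as the critic regression target and as the return in the intra-option policy-gradient term.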