├── Doing_RL_with_PPO.pdf
├── README.md
├── openai_baselines_ppo
│   ├── console_util.py
│   ├── dataset.py
│   ├── distributions.py
│   ├── logger.py
│   ├── math_util.py
│   ├── misc_util.py
│   ├── mlp_policy.py
│   ├── mpi_adam.py
│   ├── mpi_moments.py
│   ├── mpi_running_mean_std.py
│   ├── pposgd_simple.py
│   ├── run_roboschool.py
│   ├── save
│   │   ├── Humanoid-v1.data-00000-of-00001
│   │   ├── Humanoid-v1.index
│   │   └── Humanoid-v1.meta
│   ├── test_roboschool.py
│   └── tf_util.py
└── ppo.py
--------------------------------------------------------------------------------
/Doing_RL_with_PPO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wooridle/DeepRL-PPO-tutorial/84687cb24e8c8c68090fd83759acd6eebecbcf03/Doing_RL_with_PPO.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepRL-PPO-tutorial
2 | This repository contains the tutorial material for "Doing Deep RL with PPO" at GDG DevFest 2017 Seoul.
3 | 
4 | ## Slides
5 | 
6 | The slides are available as [Doing_RL_with_PPO.pdf](./Doing_RL_with_PPO.pdf).
7 | 
8 | ## Roboschool installation guide
9 | 
10 | Before you start, note that Roboschool supports only macOS and Linux.
11 | 
12 | First, clone roboschool from GitHub:
13 | 
14 | ```
15 | git clone https://github.com/openai/roboschool
16 | ```
17 | 
18 | 
19 | 
20 | Next, set ROBOSCHOOL_PATH. It is only used during installation, so setting it in the current shell is enough. Replace /path/to/roboschool with the path of the roboschool directory you just cloned:
21 | 
22 | ```
23 | ROBOSCHOOL_PATH=/path/to/roboschool
24 | ```
25 | 
26 | 
27 | 
28 | Now install the packages roboschool depends on. Use the set of commands that matches your operating system.
29 | 
30 | - Linux
31 | 
32 | ```
33 | sudo apt install cmake ffmpeg pkg-config qtbase5-dev libqt5opengl5-dev libpython3.5-dev libboost-python-dev libtinyxml-dev
34 | ```
35 | 
36 | - Mac
37 | 
38 | ```
39 | # Will not work on Mavericks: unsupported by homebrew, some libraries won't compile, upgrade first
40 | brew install python3
41 | brew install cmake tinyxml assimp ffmpeg qt
42 | brew install boost-python --without-python --with-python3 --build-from-source
43 | export PATH=/usr/local/bin:/usr/local/opt/qt5/bin:$PATH
44 | export PKG_CONFIG_PATH=/usr/local/opt/qt5/lib/pkgconfig
45 | ```
46 | 
47 | - Mac, Anaconda with Python 3
48 | 
49 | ```
50 | brew install cmake tinyxml assimp ffmpeg
51 | brew install boost-python --without-python --with-python3 --build-from-source
52 | conda install qt
53 | export PKG_CONFIG_PATH=$(dirname $(dirname $(which python)))/lib/pkgconfig
54 | ```
55 | 
56 | 
57 | 
58 | Next, install bullet3, the physics engine roboschool runs on. Clone it from GitHub and build it.
59 | **Run these commands from inside the roboschool directory you cloned above.**
60 | 
61 | ```
62 | git clone https://github.com/olegklimov/bullet3 -b roboschool_self_collision
63 | mkdir bullet3/build
64 | cd bullet3/build
65 | cmake -DBUILD_SHARED_LIBS=ON -DUSE_DOUBLE_PRECISION=1 -DCMAKE_INSTALL_PREFIX:PATH=$ROBOSCHOOL_PATH/roboschool/cpp-household/bullet_local_install -DBUILD_CPU_DEMOS=OFF -DBUILD_BULLET2_DEMOS=OFF -DBUILD_EXTRAS=OFF -DBUILD_UNIT_TESTS=OFF -DBUILD_CLSOCKET=OFF -DBUILD_ENET=OFF -DBUILD_OPENGL3_DEMOS=OFF ..
66 | make -j4
67 | make install
68 | cd ../..
69 | ```
70 | 
71 | 
72 | 
73 | Finally, install gym and roboschool with the two commands below. (If you are using Python 2, use pip instead of pip3.) A quick way to verify the install is sketched right below.
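Once the two `pip3` commands below have finished, you can sanity-check the installation with a short script. This is a minimal sketch assuming the gym API of that era; the exact environment id (`RoboschoolHumanoid-v1` here) may differ depending on your Roboschool version:

```python
import gym
import roboschool  # importing roboschool registers its environments with gym

env = gym.make("RoboschoolHumanoid-v1")  # any registered Roboschool env id works
obs = env.reset()
for _ in range(100):
    # step the simulation with random actions; if this loop runs without
    # errors, the Roboschool/bullet3 build is working
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()
```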
74 | 75 | ``` 76 | pip3 install gym 77 | pip3 install -e $ROBOSCHOOL_PATH 78 | ``` 79 | -------------------------------------------------------------------------------- /openai_baselines_ppo/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, float): rep = "%g"%x 20 | else: rep = str(x) 21 | return " "*(l - len(rep)) + rep 22 | 23 | color2num = dict( 24 | gray=30, 25 | red=31, 26 | green=32, 27 | yellow=33, 28 | blue=34, 29 | magenta=35, 30 | cyan=36, 31 | white=37, 32 | crimson=38 33 | ) 34 | 35 | def colorize(string, color, bold=False, highlight=False): 36 | attr = [] 37 | num = color2num[color] 38 | if highlight: num += 10 39 | attr.append(str(num)) 40 | if bold: attr.append('1') 41 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 42 | 43 | 44 | MESSAGE_DEPTH = 0 45 | 46 | @contextmanager 47 | def timed(msg): 48 | global MESSAGE_DEPTH #pylint: disable=W0603 49 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 50 | tstart = time.time() 51 | MESSAGE_DEPTH += 1 52 | yield 53 | MESSAGE_DEPTH -= 1 54 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 55 | -------------------------------------------------------------------------------- /openai_baselines_ppo/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide 
num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /openai_baselines_ppo/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from tensorflow.python.ops import nn 6 | 7 | class Pd(object): 8 | """ 9 | A particular probability distribution 10 | """ 11 | def flatparam(self): 12 | raise NotImplementedError 13 | def mode(self): 14 | raise NotImplementedError 15 | def neglogp(self, x): 16 | # Usually it's easier to define the negative logprob 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | def logp(self, x): 25 | return - self.neglogp(x) 26 | 27 | class PdType(object): 28 | """ 29 | Parametrized family of probability distributions 30 | """ 31 | def pdclass(self): 32 | raise NotImplementedError 33 | def pdfromflat(self, flat): 34 | return self.pdclass()(flat) 35 | def param_shape(self): 36 | raise NotImplementedError 37 | def sample_shape(self): 38 | raise NotImplementedError 39 | def sample_dtype(self): 40 | raise NotImplementedError 41 | 42 | def param_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 44 | def sample_placeholder(self, prepend_shape, name=None): 45 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 46 | 47 | class CategoricalPdType(PdType): 48 | def __init__(self, ncat): 49 | self.ncat = ncat 50 | def pdclass(self): 51 | return CategoricalPd 52 | def param_shape(self): 53 | return [self.ncat] 54 | def sample_shape(self): 55 | return [] 56 | def sample_dtype(self): 57 | return tf.int32 58 | 59 | 60 | class MultiCategoricalPdType(PdType): 61 | def __init__(self, low, high): 62 | self.low = low 63 | self.high = high 64 | self.ncats = high - low + 1 65 | def pdclass(self): 66 | return MultiCategoricalPd 67 | def pdfromflat(self, flat): 68 | return MultiCategoricalPd(self.low, self.high, flat) 69 | def param_shape(self): 70 | return [sum(self.ncats)] 71 | def sample_shape(self): 72 | return [len(self.ncats)] 73 | def sample_dtype(self): 74 | return tf.int32 75 | 76 | class DiagGaussianPdType(PdType): 77 | def __init__(self, size): 78 | self.size = size 79 | def pdclass(self): 80 | return DiagGaussianPd 81 | def param_shape(self): 82 | return [2*self.size] 83 | def sample_shape(self): 84 | return [self.size] 85 | def sample_dtype(self): 86 | return tf.float32 87 | 88 | class BernoulliPdType(PdType): 89 | def __init__(self, size): 90 | self.size = size 91 | def pdclass(self): 92 | return BernoulliPd 93 | def param_shape(self): 94 | return [self.size] 95 | def sample_shape(self): 96 | return [self.size] 97 | def sample_dtype(self): 98 | return tf.int32 99 | 100 | # WRONG SECOND DERIVATIVES 101 | # class 
CategoricalPd(Pd): 102 | # def __init__(self, logits): 103 | # self.logits = logits 104 | # self.ps = tf.nn.softmax(logits) 105 | # @classmethod 106 | # def fromflat(cls, flat): 107 | # return cls(flat) 108 | # def flatparam(self): 109 | # return self.logits 110 | # def mode(self): 111 | # return U.argmax(self.logits, axis=-1) 112 | # def logp(self, x): 113 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 114 | # def kl(self, other): 115 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 116 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 117 | # def entropy(self): 118 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 119 | # def sample(self): 120 | # u = tf.random_uniform(tf.shape(self.logits)) 121 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 122 | 123 | class CategoricalPd(Pd): 124 | def __init__(self, logits): 125 | self.logits = logits 126 | def flatparam(self): 127 | return self.logits 128 | def mode(self): 129 | return U.argmax(self.logits, axis=-1) 130 | def neglogp(self, x): 131 | # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 132 | # Note: we can't use sparse_softmax_cross_entropy_with_logits because 133 | # the implementation does not allow second-order derivatives... 134 | one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) 135 | return tf.nn.softmax_cross_entropy_with_logits( 136 | logits=self.logits, 137 | labels=one_hot_actions) 138 | def kl(self, other): 139 | a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True) 140 | a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True) 141 | ea0 = tf.exp(a0) 142 | ea1 = tf.exp(a1) 143 | z0 = U.sum(ea0, axis=-1, keepdims=True) 144 | z1 = U.sum(ea1, axis=-1, keepdims=True) 145 | p0 = ea0 / z0 146 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) 147 | def entropy(self): 148 | a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True) 149 | ea0 = tf.exp(a0) 150 | z0 = U.sum(ea0, axis=-1, keepdims=True) 151 | p0 = ea0 / z0 152 | return U.sum(p0 * (tf.log(z0) - a0), axis=-1) 153 | def sample(self): 154 | u = tf.random_uniform(tf.shape(self.logits)) 155 | return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 156 | @classmethod 157 | def fromflat(cls, flat): 158 | return cls(flat) 159 | 160 | class MultiCategoricalPd(Pd): 161 | def __init__(self, low, high, flat): 162 | self.flat = flat 163 | self.low = tf.constant(low, dtype=tf.int32) 164 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 165 | def flatparam(self): 166 | return self.flat 167 | def mode(self): 168 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 169 | def neglogp(self, x): 170 | return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 171 | def kl(self, other): 172 | return tf.add_n([ 173 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 174 | ]) 175 | def entropy(self): 176 | return tf.add_n([p.entropy() for p in self.categoricals]) 177 | def sample(self): 178 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 179 | @classmethod 180 | def fromflat(cls, flat): 181 | raise NotImplementedError 182 | 183 | class DiagGaussianPd(Pd): 184 | def __init__(self, flat): 185 | self.flat = flat 186 | mean, logstd = tf.split(axis=len(flat.shape)-1, 
num_or_size_splits=2, value=flat) 187 | self.mean = mean 188 | self.logstd = logstd 189 | self.std = tf.exp(logstd) 190 | def flatparam(self): 191 | return self.flat 192 | def mode(self): 193 | return self.mean 194 | def neglogp(self, x): 195 | return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \ 196 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 197 | + U.sum(self.logstd, axis=-1) 198 | def kl(self, other): 199 | assert isinstance(other, DiagGaussianPd) 200 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) 201 | def entropy(self): 202 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 203 | def sample(self): 204 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 205 | @classmethod 206 | def fromflat(cls, flat): 207 | return cls(flat) 208 | 209 | class BernoulliPd(Pd): 210 | def __init__(self, logits): 211 | self.logits = logits 212 | self.ps = tf.sigmoid(logits) 213 | def flatparam(self): 214 | return self.logits 215 | def mode(self): 216 | return tf.round(self.ps) 217 | def neglogp(self, x): 218 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) 219 | def kl(self, other): 220 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 221 | def entropy(self): 222 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 223 | def sample(self): 224 | u = tf.random_uniform(tf.shape(self.ps)) 225 | return tf.to_float(math_ops.less(u, self.ps)) 226 | @classmethod 227 | def fromflat(cls, flat): 228 | return cls(flat) 229 | 230 | def make_pdtype(ac_space): 231 | from gym import spaces 232 | if isinstance(ac_space, spaces.Box): 233 | assert len(ac_space.shape) == 1 234 | return DiagGaussianPdType(ac_space.shape[0]) 235 | elif isinstance(ac_space, spaces.Discrete): 236 | return CategoricalPdType(ac_space.n) 237 | elif isinstance(ac_space, spaces.MultiDiscrete): 238 | return MultiCategoricalPdType(ac_space.low, ac_space.high) 239 | elif isinstance(ac_space, spaces.MultiBinary): 240 | return BernoulliPdType(ac_space.n) 241 | else: 242 | raise NotImplementedError 243 | 244 | def shape_el(v, i): 245 | maybe = v.get_shape()[i] 246 | if maybe is not None: 247 | return maybe 248 | else: 249 | return tf.shape(v)[i] 250 | 251 | @U.in_session 252 | def test_probtypes(): 253 | np.random.seed(0) 254 | 255 | pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) 256 | diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 257 | validate_probtype(diag_gauss, pdparam_diag_gauss) 258 | 259 | pdparam_categorical = np.array([-.2, .3, .5]) 260 | categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 261 | validate_probtype(categorical, pdparam_categorical) 262 | 263 | pdparam_bernoulli = np.array([-.2, .3, .5]) 264 | bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 265 | validate_probtype(bernoulli, pdparam_bernoulli) 266 | 267 | 268 | def validate_probtype(probtype, pdparam): 269 | N = 100000 270 | # Check to see if mean negative log likelihood == differential entropy 271 | Mval = np.repeat(pdparam[None, :], N, axis=0) 272 | M = probtype.param_placeholder([N]) 273 | X = probtype.sample_placeholder([N]) 274 | pd = 
probtype.pdclass()(M) 275 | calcloglik = U.function([X, M], pd.logp(X)) 276 | calcent = U.function([M], pd.entropy()) 277 | Xval = U.eval(pd.sample(), feed_dict={M:Mval}) 278 | logliks = calcloglik(Xval, Mval) 279 | entval_ll = - logliks.mean() #pylint: disable=E1101 280 | entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 281 | entval = calcent(Mval).mean() #pylint: disable=E1101 282 | assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas 283 | 284 | # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] 285 | M2 = probtype.param_placeholder([N]) 286 | pd2 = probtype.pdclass()(M2) 287 | q = pdparam + np.random.randn(pdparam.size) * 0.1 288 | Mval2 = np.repeat(q[None, :], N, axis=0) 289 | calckl = U.function([M, M2], pd.kl(pd2)) 290 | klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 291 | logliks = calcloglik(Xval, Mval2) 292 | klval_ll = - entval - logliks.mean() #pylint: disable=E1101 293 | klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 294 | assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas 295 | -------------------------------------------------------------------------------- /openai_baselines_ppo/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import os.path as osp 5 | import json 6 | import time 7 | import datetime 8 | import tempfile 9 | 10 | LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json'] 11 | 12 | DEBUG = 10 13 | INFO = 20 14 | WARN = 30 15 | ERROR = 40 16 | 17 | DISABLED = 50 18 | 19 | class OutputFormat(object): 20 | def writekvs(self, kvs): 21 | """ 22 | Write key-value pairs 23 | """ 24 | raise NotImplementedError 25 | 26 | def writeseq(self, args): 27 | """ 28 | Write a sequence of other data (e.g. a logging message) 29 | """ 30 | pass 31 | 32 | def close(self): 33 | return 34 | 35 | 36 | class HumanOutputFormat(OutputFormat): 37 | def __init__(self, file): 38 | self.file = file 39 | 40 | def writekvs(self, kvs): 41 | # Create strings for printing 42 | key2str = {} 43 | for (key, val) in sorted(kvs.items()): 44 | if isinstance(val, float): 45 | valstr = '%-8.3g' % (val,) 46 | else: 47 | valstr = str(val) 48 | key2str[self._truncate(key)] = self._truncate(valstr) 49 | 50 | # Find max widths 51 | keywidth = max(map(len, key2str.keys())) 52 | valwidth = max(map(len, key2str.values())) 53 | 54 | # Write out the data 55 | dashes = '-' * (keywidth + valwidth + 7) 56 | lines = [dashes] 57 | for (key, val) in sorted(key2str.items()): 58 | lines.append('| %s%s | %s%s |' % ( 59 | key, 60 | ' ' * (keywidth - len(key)), 61 | val, 62 | ' ' * (valwidth - len(val)), 63 | )) 64 | lines.append(dashes) 65 | self.file.write('\n'.join(lines) + '\n') 66 | 67 | # Flush the output to the file 68 | self.file.flush() 69 | 70 | def _truncate(self, s): 71 | return s[:20] + '...' if len(s) > 23 else s 72 | 73 | def writeseq(self, args): 74 | for arg in args: 75 | self.file.write(arg) 76 | self.file.write('\n') 77 | self.file.flush() 78 | 79 | class JSONOutputFormat(OutputFormat): 80 | def __init__(self, file): 81 | self.file = file 82 | 83 | def writekvs(self, kvs): 84 | for k, v in sorted(kvs.items()): 85 | if hasattr(v, 'dtype'): 86 | v = v.tolist() 87 | kvs[k] = float(v) 88 | self.file.write(json.dumps(kvs) + '\n') 89 | self.file.flush() 90 | 91 | class TensorBoardOutputFormat(OutputFormat): 92 | """ 93 | Dumps key/value pairs into TensorBoard's numeric format. 
94 | """ 95 | def __init__(self, dir): 96 | os.makedirs(dir, exist_ok=True) 97 | self.dir = dir 98 | self.step = 1 99 | prefix = 'events' 100 | path = osp.join(osp.abspath(dir), prefix) 101 | import tensorflow as tf 102 | from tensorflow.python import pywrap_tensorflow 103 | from tensorflow.core.util import event_pb2 104 | from tensorflow.python.util import compat 105 | self.tf = tf 106 | self.event_pb2 = event_pb2 107 | self.pywrap_tensorflow = pywrap_tensorflow 108 | self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) 109 | 110 | def writekvs(self, kvs): 111 | def summary_val(k, v): 112 | kwargs = {'tag': k, 'simple_value': float(v)} 113 | return self.tf.Summary.Value(**kwargs) 114 | summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) 115 | event = self.event_pb2.Event(wall_time=time.time(), summary=summary) 116 | event.step = self.step # is there any reason why you'd want to specify the step? 117 | self.writer.WriteEvent(event) 118 | self.writer.Flush() 119 | self.step += 1 120 | 121 | def close(self): 122 | if self.writer: 123 | self.writer.Close() 124 | self.writer = None 125 | 126 | 127 | def make_output_format(format, ev_dir): 128 | os.makedirs(ev_dir, exist_ok=True) 129 | if format == 'stdout': 130 | return HumanOutputFormat(sys.stdout) 131 | elif format == 'log': 132 | log_file = open(osp.join(ev_dir, 'log.txt'), 'wt') 133 | return HumanOutputFormat(log_file) 134 | elif format == 'json': 135 | json_file = open(osp.join(ev_dir, 'progress.json'), 'wt') 136 | return JSONOutputFormat(json_file) 137 | elif format == 'tensorboard': 138 | return TensorBoardOutputFormat(osp.join(ev_dir, 'tb')) 139 | else: 140 | raise ValueError('Unknown format specified: %s' % (format,)) 141 | 142 | # ================================================================ 143 | # API 144 | # ================================================================ 145 | 146 | def logkv(key, val): 147 | """ 148 | Log a value of some diagnostic 149 | Call this once for each diagnostic quantity, each iteration 150 | """ 151 | Logger.CURRENT.logkv(key, val) 152 | 153 | def logkvs(d): 154 | """ 155 | Log a dictionary of key-value pairs 156 | """ 157 | for (k, v) in d.items(): 158 | logkv(k, v) 159 | 160 | def dumpkvs(): 161 | """ 162 | Write all of the diagnostics from the current iteration 163 | 164 | level: int. (see logger.py docs) If the global logger level is higher than 165 | the level argument here, don't print to stdout. 166 | """ 167 | Logger.CURRENT.dumpkvs() 168 | 169 | def getkvs(): 170 | return Logger.CURRENT.name2val 171 | 172 | 173 | def log(*args, level=INFO): 174 | """ 175 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 176 | """ 177 | Logger.CURRENT.log(*args, level=level) 178 | 179 | 180 | def debug(*args): 181 | log(*args, level=DEBUG) 182 | 183 | 184 | def info(*args): 185 | log(*args, level=INFO) 186 | 187 | 188 | def warn(*args): 189 | log(*args, level=WARN) 190 | 191 | 192 | def error(*args): 193 | log(*args, level=ERROR) 194 | 195 | 196 | def set_level(level): 197 | """ 198 | Set logging threshold on current logger. 199 | """ 200 | Logger.CURRENT.set_level(level) 201 | 202 | def get_dir(): 203 | """ 204 | Get directory that log files are being written to. 
205 | will be None if there is no output directory (i.e., if you didn't call start) 206 | """ 207 | return Logger.CURRENT.get_dir() 208 | 209 | record_tabular = logkv 210 | dump_tabular = dumpkvs 211 | 212 | # ================================================================ 213 | # Backend 214 | # ================================================================ 215 | 216 | class Logger(object): 217 | DEFAULT = None # A logger with no output files. (See right below class definition) 218 | # So that you can still log to the terminal without setting up any output files 219 | CURRENT = None # Current logger being used by the free functions above 220 | 221 | def __init__(self, dir, output_formats): 222 | self.name2val = {} # values this iteration 223 | self.level = INFO 224 | self.dir = dir 225 | self.output_formats = output_formats 226 | 227 | # Logging API, forwarded 228 | # ---------------------------------------- 229 | def logkv(self, key, val): 230 | self.name2val[key] = val 231 | 232 | def dumpkvs(self): 233 | if self.level == DISABLED: return 234 | for fmt in self.output_formats: 235 | fmt.writekvs(self.name2val) 236 | self.name2val.clear() 237 | 238 | def log(self, *args, level=INFO): 239 | if self.level <= level: 240 | self._do_log(args) 241 | 242 | # Configuration 243 | # ---------------------------------------- 244 | def set_level(self, level): 245 | self.level = level 246 | 247 | def get_dir(self): 248 | return self.dir 249 | 250 | def close(self): 251 | for fmt in self.output_formats: 252 | fmt.close() 253 | 254 | # Misc 255 | # ---------------------------------------- 256 | def _do_log(self, args): 257 | for fmt in self.output_formats: 258 | fmt.writeseq(args) 259 | 260 | Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) 261 | 262 | def configure(dir=None, format_strs=None): 263 | assert Logger.CURRENT is Logger.DEFAULT,\ 264 | "Only call logger.configure() when it's in the default state. Try calling logger.reset() first." 
265 |     prevlogger = Logger.CURRENT
266 |     if dir is None:
267 |         dir = os.getenv('OPENAI_LOGDIR')
268 |     if dir is None:
269 |         dir = osp.join(tempfile.gettempdir(),
270 |             datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
271 |     if format_strs is None:
272 |         format_strs = LOG_OUTPUT_FORMATS
273 |     output_formats = [make_output_format(f, dir) for f in format_strs]
274 |     Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
275 |     log('Logging to %s'%dir)
276 | 
277 | if os.getenv('OPENAI_LOGDIR'):
278 |     # if OPENAI_LOGDIR is set, configure the logger on import
279 |     # this is kind of nasty (unexpected to the user), but I don't know how else to inject the logger
280 |     # into a script that's getting run in a subprocess
281 |     configure(dir=os.getenv('OPENAI_LOGDIR'))
282 | 
283 | def reset():
284 |     Logger.CURRENT = Logger.DEFAULT
285 |     log('Reset logger')
286 | 
287 | # ================================================================
288 | 
289 | def _demo():
290 |     info("hi")
291 |     debug("shouldn't appear")
292 |     set_level(DEBUG)
293 |     debug("should appear")
294 |     dir = "/tmp/testlogging"
295 |     if os.path.exists(dir):
296 |         shutil.rmtree(dir)
297 |     configure(dir=dir)  # this file defines no `session` context manager, so configure directly
298 |     logkv("a", 3)
299 |     logkv("b", 2.5)
300 |     dumpkvs()
301 |     logkv("b", -2.5)
302 |     logkv("a", 5.5)
303 |     dumpkvs()
304 |     info("^^^ should see a = 5.5")
305 | 
306 |     logkv("b", -2.5)
307 |     dumpkvs()
308 | 
309 |     logkv("a", "longasslongasslongasslongasslongasslongassvalue")
310 |     dumpkvs()
311 | 
312 | 
313 | if __name__ == "__main__":
314 |     _demo()
315 | 
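A minimal sketch of how this logger module is typically driven from training code (the directory path is illustrative; `configure`, `logkv`, and `dumpkvs` are the functions defined above):

```python
import logger

logger.configure(dir="/tmp/ppo_run")  # sets up stdout, log.txt, and progress.json writers
for iteration in range(3):
    # record one value per diagnostic key, then flush the whole row once per iteration
    logger.logkv("iteration", iteration)
    logger.logkv("EpRewMean", 100.0 + iteration)
    logger.dumpkvs()
```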
--------------------------------------------------------------------------------
/openai_baselines_ppo/math_util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.signal
3 | 
4 | 
5 | def discount(x, gamma):
6 |     """
7 |     computes discounted sums along 0th dimension of x.
8 | 
9 |     inputs
10 |     ------
11 |     x: ndarray
12 |     gamma: float
13 | 
14 |     outputs
15 |     -------
16 |     y: ndarray with same shape as x, satisfying
17 | 
18 |         y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
19 |             where k = len(x) - t - 1
20 | 
21 |     """
22 |     assert x.ndim >= 1
23 |     return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1]
24 | 
25 | def explained_variance(ypred,y):
26 |     """
27 |     Computes fraction of variance that ypred explains about y.
28 |     Returns 1 - Var[y-ypred] / Var[y]
29 | 
30 |     interpretation:
31 |         ev=0  =>  might as well have predicted zero
32 |         ev=1  =>  perfect prediction
33 |         ev<0  =>  worse than just predicting zero
34 | 
35 |     """
36 |     assert y.ndim == 1 and ypred.ndim == 1
37 |     vary = np.var(y)
38 |     return np.nan if vary==0 else 1 - np.var(y-ypred)/vary
39 | 
40 | def explained_variance_2d(ypred, y):
41 |     assert y.ndim == 2 and ypred.ndim == 2
42 |     vary = np.var(y, axis=0)
43 |     out = 1 - np.var(y-ypred, axis=0)/vary  # per-column variance; the original was missing axis=0
44 |     out[vary < 1e-10] = 0
45 |     return out
46 | 
47 | def ncc(ypred, y):
48 |     return np.corrcoef(ypred, y)[1,0]
49 | 
50 | def flatten_arrays(arrs):
51 |     return np.concatenate([arr.flat for arr in arrs])
52 | 
53 | def unflatten_vector(vec, shapes):
54 |     i=0
55 |     arrs = []
56 |     for shape in shapes:
57 |         size = np.prod(shape)
58 |         arr = vec[i:i+size].reshape(shape)
59 |         arrs.append(arr)
60 |         i += size
61 |     return arrs
62 | 
63 | def discount_with_boundaries(X, New, gamma):
64 |     """
65 |     X: 2d array of floats, time x features
66 |     New: 2d array of bools, indicating when a new episode has started
67 |     """
68 |     Y = np.zeros_like(X)
69 |     T = X.shape[0]
70 |     Y[T-1] = X[T-1]
71 |     for t in range(T-2, -1, -1):
72 |         Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1])
73 |     return Y
74 | 
75 | def test_discount_with_boundaries():
76 |     gamma=0.9
77 |     x = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
78 |     starts = [1.0, 0.0, 0.0, 1.0]
79 |     y = discount_with_boundaries(x, starts, gamma)
80 |     assert np.allclose(y, [
81 |         1 + gamma * 2 + gamma**2 * 3,
82 |         2 + gamma * 3,
83 |         3,
84 |         4
85 |     ])
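A small worked check of `discount` and `explained_variance` above (a sketch; the values are chosen by hand to be easy to verify):

```python
import numpy as np
from math_util import discount, explained_variance

# y[t] = x[t] + 0.9*x[t+1] + 0.81*x[t+2] for x = [1, 1, 1]
assert np.allclose(discount(np.array([1.0, 1.0, 1.0]), 0.9), [2.71, 1.9, 1.0])

# predictions close to the targets explain most of the variance (ev near 1)
targets = np.array([1.0, 2.0, 3.0, 4.0])
preds = targets + np.array([0.1, -0.1, 0.05, 0.0])
print(explained_variance(preds, targets))  # slightly below 1.0
```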
56 | """ 57 | 58 | def __init__(self, *args, **kwargs): 59 | self._ezpickle_args = args 60 | self._ezpickle_kwargs = kwargs 61 | 62 | def __getstate__(self): 63 | return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} 64 | 65 | def __setstate__(self, d): 66 | out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) 67 | self.__dict__.update(out.__dict__) 68 | 69 | 70 | def set_global_seeds(i): 71 | try: 72 | import tensorflow as tf 73 | except ImportError: 74 | pass 75 | else: 76 | tf.set_random_seed(i) 77 | np.random.seed(i) 78 | random.seed(i) 79 | 80 | 81 | def pretty_eta(seconds_left): 82 | """Print the number of seconds in human readable format. 83 | 84 | Examples: 85 | 2 days 86 | 2 hours and 37 minutes 87 | less than a minute 88 | 89 | Paramters 90 | --------- 91 | seconds_left: int 92 | Number of seconds to be converted to the ETA 93 | Returns 94 | ------- 95 | eta: str 96 | String representing the pretty ETA. 97 | """ 98 | minutes_left = seconds_left // 60 99 | seconds_left %= 60 100 | hours_left = minutes_left // 60 101 | minutes_left %= 60 102 | days_left = hours_left // 24 103 | hours_left %= 24 104 | 105 | def helper(cnt, name): 106 | return "{} {}{}".format(str(cnt), name, ('s' if cnt > 1 else '')) 107 | 108 | if days_left > 0: 109 | msg = helper(days_left, 'day') 110 | if hours_left > 0: 111 | msg += ' and ' + helper(hours_left, 'hour') 112 | return msg 113 | if hours_left > 0: 114 | msg = helper(hours_left, 'hour') 115 | if minutes_left > 0: 116 | msg += ' and ' + helper(minutes_left, 'minute') 117 | return msg 118 | if minutes_left > 0: 119 | return helper(minutes_left, 'minute') 120 | return 'less than a minute' 121 | 122 | 123 | class RunningAvg(object): 124 | def __init__(self, gamma, init_value=None): 125 | """Keep a running estimate of a quantity. This is a bit like mean 126 | but more sensitive to recent changes. 127 | 128 | Parameters 129 | ---------- 130 | gamma: float 131 | Must be between 0 and 1, where 0 is the most sensitive to recent 132 | changes. 133 | init_value: float or None 134 | Initial value of the estimate. If None, it will be set on the first update. 135 | """ 136 | self._value = init_value 137 | self._gamma = gamma 138 | 139 | def update(self, new_val): 140 | """Update the estimate. 141 | 142 | Parameters 143 | ---------- 144 | new_val: float 145 | new observated value of estimated quantity. 146 | """ 147 | if self._value is None: 148 | self._value = new_val 149 | else: 150 | self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val 151 | 152 | def __float__(self): 153 | """Get the current estimate""" 154 | return self._value 155 | 156 | 157 | class SimpleMonitor(gym.Wrapper): 158 | def __init__(self, env): 159 | """Adds two qunatities to info returned by every step: 160 | 161 | num_steps: int 162 | Number of steps takes so far 163 | rewards: [float] 164 | All the cumulative rewards for the episodes completed so far. 
165 | """ 166 | super().__init__(env) 167 | # current episode state 168 | self._current_reward = None 169 | self._num_steps = None 170 | # temporary monitor state that we do not save 171 | self._time_offset = None 172 | self._total_steps = None 173 | # monitor state 174 | self._episode_rewards = [] 175 | self._episode_lengths = [] 176 | self._episode_end_times = [] 177 | 178 | def _reset(self): 179 | obs = self.env.reset() 180 | # recompute temporary state if needed 181 | if self._time_offset is None: 182 | self._time_offset = time.time() 183 | if len(self._episode_end_times) > 0: 184 | self._time_offset -= self._episode_end_times[-1] 185 | if self._total_steps is None: 186 | self._total_steps = sum(self._episode_lengths) 187 | # update monitor state 188 | if self._current_reward is not None: 189 | self._episode_rewards.append(self._current_reward) 190 | self._episode_lengths.append(self._num_steps) 191 | self._episode_end_times.append(time.time() - self._time_offset) 192 | # reset episode state 193 | self._current_reward = 0 194 | self._num_steps = 0 195 | 196 | return obs 197 | 198 | def _step(self, action): 199 | obs, rew, done, info = self.env.step(action) 200 | self._current_reward += rew 201 | self._num_steps += 1 202 | self._total_steps += 1 203 | info['steps'] = self._total_steps 204 | info['rewards'] = self._episode_rewards 205 | return (obs, rew, done, info) 206 | 207 | def get_state(self): 208 | return { 209 | 'env_id': self.env.unwrapped.spec.id, 210 | 'episode_data': { 211 | 'episode_rewards': self._episode_rewards, 212 | 'episode_lengths': self._episode_lengths, 213 | 'episode_end_times': self._episode_end_times, 214 | 'initial_reset_time': 0, 215 | } 216 | } 217 | 218 | def set_state(self, state): 219 | assert state['env_id'] == self.env.unwrapped.spec.id 220 | ed = state['episode_data'] 221 | self._episode_rewards = ed['episode_rewards'] 222 | self._episode_lengths = ed['episode_lengths'] 223 | self._episode_end_times = ed['episode_end_times'] 224 | 225 | 226 | def boolean_flag(parser, name, default=False, help=None): 227 | """Add a boolean flag to argparse parser. 
228 | 
229 |     Parameters
230 |     ----------
231 |     parser: argparse.Parser
232 |         parser to add the flag to
233 |     name: str
234 |         --<name> will enable the flag, while --no-<name> will disable it
235 |     default: bool or None
236 |         default value of the flag
237 |     help: str
238 |         help string for the flag
239 |     """
240 |     dest = name.replace('-', '_')
241 |     parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help)
242 |     parser.add_argument("--no-" + name, action="store_false", dest=dest)
243 | 
244 | 
245 | def get_wrapper_by_name(env, classname):
246 |     """Given a gym environment possibly wrapped multiple times, returns a wrapper
247 |     of class named classname or raises ValueError if no such wrapper was applied
248 | 
249 |     Parameters
250 |     ----------
251 |     env: gym.Env or gym.Wrapper
252 |         gym environment
253 |     classname: str
254 |         name of the wrapper
255 | 
256 |     Returns
257 |     -------
258 |     wrapper: gym.Wrapper
259 |         wrapper named classname
260 |     """
261 |     currentenv = env
262 |     while True:
263 |         if classname == currentenv.class_name():
264 |             return currentenv
265 |         elif isinstance(currentenv, gym.Wrapper):
266 |             currentenv = currentenv.env
267 |         else:
268 |             raise ValueError("Couldn't find wrapper named %s" % classname)
269 | 
270 | 
271 | def relatively_safe_pickle_dump(obj, path, compression=False):
272 |     """This is just like a regular pickle dump, except that the failure cases are
273 |     different:
274 | 
275 |     - It's never possible that we end up with a pickle in corrupted state.
276 |     - If there was a different file at the path, that file will remain unchanged in the
277 |       event of failure (provided that filesystem rename is atomic).
278 |     - it is sometimes possible that we end up with a useless temp file which needs to be
279 |       deleted manually (it will be removed automatically on the next function call)
280 | 
281 |     The intended use case is periodic checkpoints of experiment state, such that we never
282 |     corrupt previous checkpoints if the current one fails.
283 | 
284 |     Parameters
285 |     ----------
286 |     obj: object
287 |         object to pickle
288 |     path: str
289 |         path to the output file
290 |     compression: bool
291 |         if true pickle will be compressed
292 |     """
293 |     temp_storage = path + ".relatively_safe"
294 |     if compression:
295 |         # Using gzip here would be simpler, but the size is limited to 2GB
296 |         with tempfile.NamedTemporaryFile() as uncompressed_file:
297 |             pickle.dump(obj, uncompressed_file)
298 |             with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip:
299 |                 myzip.write(uncompressed_file.name, "data")
300 |     else:
301 |         with open(temp_storage, "wb") as f:
302 |             pickle.dump(obj, f)
303 |     os.rename(temp_storage, path)
304 | 
305 | 
306 | def pickle_load(path, compression=False):
307 |     """Unpickle a possibly compressed pickle.
308 | 
309 |     Parameters
310 |     ----------
311 |     path: str
312 |         path to the output file
313 |     compression: bool
314 |         if true assumes that pickle was compressed when created and attempts decompression.
315 | 316 | Returns 317 | ------- 318 | obj: object 319 | the unpickled object 320 | """ 321 | 322 | if compression: 323 | with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: 324 | with myzip.open("data") as f: 325 | return pickle.load(f) 326 | else: 327 | with open(path, "rb") as f: 328 | return pickle.load(f) 329 | -------------------------------------------------------------------------------- /openai_baselines_ppo/mlp_policy.py: -------------------------------------------------------------------------------- 1 | from mpi_running_mean_std import RunningMeanStd 2 | import tf_util as U 3 | import tensorflow as tf 4 | import gym 5 | from distributions import make_pdtype 6 | 7 | class MlpPolicy(object): 8 | recurrent = False 9 | def __init__(self, name, *args, **kwargs): 10 | with tf.variable_scope(name): 11 | self._init(*args, **kwargs) 12 | self.scope = tf.get_variable_scope().name 13 | 14 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 15 | assert isinstance(ob_space, gym.spaces.Box) 16 | 17 | self.pdtype = pdtype = make_pdtype(ac_space) 18 | sequence_length = None 19 | 20 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 21 | #obz = ob 22 | 23 | #with tf.variable_scope("obfilter"): 24 | # self.ob_rms = RunningMeanStd(shape=ob_space.shape) 25 | 26 | #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 27 | last_out = ob 28 | for i in range(num_hid_layers): 29 | last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0))) 30 | self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0] 31 | 32 | last_out = ob 33 | for i in range(num_hid_layers): 34 | last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0))) 35 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 36 | mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) 37 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer) 38 | pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1) 39 | else: 40 | pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) 41 | 42 | self.pd = pdtype.pdfromflat(pdparam) 43 | 44 | self.state_in = [] 45 | self.state_out = [] 46 | 47 | stochastic = tf.placeholder(dtype=tf.bool, shape=()) 48 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) 49 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 50 | 51 | def act(self, stochastic, ob): 52 | ac1, vpred1 = self._act(stochastic, [ob]) 53 | return ac1[0], vpred1[0] 54 | def get_variables(self): 55 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 56 | def get_trainable_variables(self): 57 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 58 | def get_initial_state(self): 59 | return [] 60 | 61 | -------------------------------------------------------------------------------- /openai_baselines_ppo/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | 
self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) 80 | -------------------------------------------------------------------------------- /openai_baselines_ppo/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from misc_util import zipsame 4 | 5 | def mpi_moments(x, axis=0): 6 | x = np.asarray(x, dtype='float64') 7 | newshape = list(x.shape) 8 | newshape.pop(axis) 9 | n = np.prod(newshape,dtype=int) 10 | totalvec = np.zeros(n*2+1, 'float64') 11 | addvec = np.concatenate([x.sum(axis=axis).ravel(), 12 | np.square(x).sum(axis=axis).ravel(), 13 | np.array([x.shape[axis]],dtype='float64')]) 14 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 15 | sum = totalvec[:n] 16 | sumsq = totalvec[n:2*n] 17 | count = totalvec[2*n] 18 | if count == 0: 19 | mean = np.empty(newshape); mean[:] = np.nan 20 | std = np.empty(newshape); std[:] = np.nan 21 | else: 22 | mean = sum/count 23 | std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0)) 24 | return mean, std, count 25 | 26 | 27 | def test_runningmeanstd(): 28 | comm = MPI.COMM_WORLD 29 | np.random.seed(0) 30 | for (triple,axis) in [ 
31 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 32 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 33 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 34 | ]: 35 | 36 | 37 | x = np.concatenate(triple, axis=axis) 38 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 39 | 40 | 41 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 42 | 43 | for (a1,a2) in zipsame(ms1, ms2): 44 | print(a1, a2) 45 | assert np.allclose(a1, a2) 46 | print("ok!") 47 | 48 | if __name__ == "__main__": 49 | #mpirun -np 3 python