├── Doing_RL_with_PPO.pdf
├── README.md
├── openai_baselines_ppo
│   ├── console_util.py
│   ├── dataset.py
│   ├── distributions.py
│   ├── logger.py
│   ├── math_util.py
│   ├── misc_util.py
│   ├── mlp_policy.py
│   ├── mpi_adam.py
│   ├── mpi_moments.py
│   ├── mpi_running_mean_std.py
│   ├── pposgd_simple.py
│   ├── run_roboschool.py
│   ├── save
│   │   ├── Humanoid-v1.data-00000-of-00001
│   │   ├── Humanoid-v1.index
│   │   └── Humanoid-v1.meta
│   ├── test_roboschool.py
│   └── tf_util.py
└── ppo.py
--------------------------------------------------------------------------------
/Doing_RL_with_PPO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wooridle/DeepRL-PPO-tutorial/84687cb24e8c8c68090fd83759acd6eebecbcf03/Doing_RL_with_PPO.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepRL-PPO-tutorial
2 | This repository contains the tutorial material for "Doing Deep RL with PPO" at GDG DevFest 2017 Seoul.
3 | 
4 | ## Slides
5 | 
6 | The slides are available as [Doing_RL_with_PPO.pdf](./Doing_RL_with_PPO.pdf).
7 | 
8 | ## Roboschool installation guide
9 | 
10 | Before you start, note that Roboschool supports only macOS and Linux.
11 | 
12 | First, clone roboschool from GitHub:
13 | 
14 | ```
15 | git clone https://github.com/openai/roboschool
16 | ```
17 | 
18 | 
19 | 
20 | Next, set ROBOSCHOOL_PATH. It is only used during installation, so setting it in the current shell is enough. Replace /path/to/roboschool with the path of the roboschool directory you just cloned:
21 | 
22 | ```
23 | ROBOSCHOOL_PATH=/path/to/roboschool
24 | ```
25 | 
26 | 
27 | 
28 | Now install the packages roboschool depends on. Use the set of commands that matches your operating system.
29 | 
30 | - Linux
31 | 
32 | ```
33 | sudo apt install cmake ffmpeg pkg-config qtbase5-dev libqt5opengl5-dev libpython3.5-dev libboost-python-dev libtinyxml-dev
34 | ```
35 | 
36 | - Mac
37 | 
38 | ```
39 | # Will not work on Mavericks: unsupported by homebrew, some libraries won't compile, upgrade first
40 | brew install python3
41 | brew install cmake tinyxml assimp ffmpeg qt
42 | brew install boost-python --without-python --with-python3 --build-from-source
43 | export PATH=/usr/local/bin:/usr/local/opt/qt5/bin:$PATH
44 | export PKG_CONFIG_PATH=/usr/local/opt/qt5/lib/pkgconfig
45 | ```
46 | 
47 | - Mac, Anaconda with Python 3
48 | 
49 | ```
50 | brew install cmake tinyxml assimp ffmpeg
51 | brew install boost-python --without-python --with-python3 --build-from-source
52 | conda install qt
53 | export PKG_CONFIG_PATH=$(dirname $(dirname $(which python)))/lib/pkgconfig
54 | ```
55 | 
56 | 
57 | 
58 | Next, install bullet3, the physics engine roboschool runs on. Clone it from GitHub and build it.
59 | **Run these commands from inside the roboschool directory you cloned above.**
60 | 
61 | ```
62 | git clone https://github.com/olegklimov/bullet3 -b roboschool_self_collision
63 | mkdir bullet3/build
64 | cd bullet3/build
65 | cmake -DBUILD_SHARED_LIBS=ON -DUSE_DOUBLE_PRECISION=1 -DCMAKE_INSTALL_PREFIX:PATH=$ROBOSCHOOL_PATH/roboschool/cpp-household/bullet_local_install -DBUILD_CPU_DEMOS=OFF -DBUILD_BULLET2_DEMOS=OFF -DBUILD_EXTRAS=OFF -DBUILD_UNIT_TESTS=OFF -DBUILD_CLSOCKET=OFF -DBUILD_ENET=OFF -DBUILD_OPENGL3_DEMOS=OFF ..
66 | make -j4
67 | make install
68 | cd ../..
69 | ```
70 | 
71 | 
72 | 
73 | Finally, install gym and roboschool with the two commands below. (If you are using Python 2, use pip instead of pip3.) A quick way to verify the install is sketched right below.
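Once the two `pip3` commands below have finished, you can sanity-check the installation with a short script. This is a minimal sketch assuming the gym API of that era; the exact environment id (`RoboschoolHumanoid-v1` here) may differ depending on your Roboschool version:

```python
import gym
import roboschool  # importing roboschool registers its environments with gym

env = gym.make("RoboschoolHumanoid-v1")  # any registered Roboschool env id works
obs = env.reset()
for _ in range(100):
    # step the simulation with random actions; if this loop runs without
    # errors, the Roboschool/bullet3 build is working
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()
```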
74 | 75 | ``` 76 | pip3 install gym 77 | pip3 install -e $ROBOSCHOOL_PATH 78 | ``` 79 | -------------------------------------------------------------------------------- /openai_baselines_ppo/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, float): rep = "%g"%x 20 | else: rep = str(x) 21 | return " "*(l - len(rep)) + rep 22 | 23 | color2num = dict( 24 | gray=30, 25 | red=31, 26 | green=32, 27 | yellow=33, 28 | blue=34, 29 | magenta=35, 30 | cyan=36, 31 | white=37, 32 | crimson=38 33 | ) 34 | 35 | def colorize(string, color, bold=False, highlight=False): 36 | attr = [] 37 | num = color2num[color] 38 | if highlight: num += 10 39 | attr.append(str(num)) 40 | if bold: attr.append('1') 41 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 42 | 43 | 44 | MESSAGE_DEPTH = 0 45 | 46 | @contextmanager 47 | def timed(msg): 48 | global MESSAGE_DEPTH #pylint: disable=W0603 49 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 50 | tstart = time.time() 51 | MESSAGE_DEPTH += 1 52 | yield 53 | MESSAGE_DEPTH -= 1 54 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 55 | -------------------------------------------------------------------------------- /openai_baselines_ppo/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide 
num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /openai_baselines_ppo/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from tensorflow.python.ops import nn 6 | 7 | class Pd(object): 8 | """ 9 | A particular probability distribution 10 | """ 11 | def flatparam(self): 12 | raise NotImplementedError 13 | def mode(self): 14 | raise NotImplementedError 15 | def neglogp(self, x): 16 | # Usually it's easier to define the negative logprob 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | def logp(self, x): 25 | return - self.neglogp(x) 26 | 27 | class PdType(object): 28 | """ 29 | Parametrized family of probability distributions 30 | """ 31 | def pdclass(self): 32 | raise NotImplementedError 33 | def pdfromflat(self, flat): 34 | return self.pdclass()(flat) 35 | def param_shape(self): 36 | raise NotImplementedError 37 | def sample_shape(self): 38 | raise NotImplementedError 39 | def sample_dtype(self): 40 | raise NotImplementedError 41 | 42 | def param_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 44 | def sample_placeholder(self, prepend_shape, name=None): 45 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 46 | 47 | class CategoricalPdType(PdType): 48 | def __init__(self, ncat): 49 | self.ncat = ncat 50 | def pdclass(self): 51 | return CategoricalPd 52 | def param_shape(self): 53 | return [self.ncat] 54 | def sample_shape(self): 55 | return [] 56 | def sample_dtype(self): 57 | return tf.int32 58 | 59 | 60 | class MultiCategoricalPdType(PdType): 61 | def __init__(self, low, high): 62 | self.low = low 63 | self.high = high 64 | self.ncats = high - low + 1 65 | def pdclass(self): 66 | return MultiCategoricalPd 67 | def pdfromflat(self, flat): 68 | return MultiCategoricalPd(self.low, self.high, flat) 69 | def param_shape(self): 70 | return [sum(self.ncats)] 71 | def sample_shape(self): 72 | return [len(self.ncats)] 73 | def sample_dtype(self): 74 | return tf.int32 75 | 76 | class DiagGaussianPdType(PdType): 77 | def __init__(self, size): 78 | self.size = size 79 | def pdclass(self): 80 | return DiagGaussianPd 81 | def param_shape(self): 82 | return [2*self.size] 83 | def sample_shape(self): 84 | return [self.size] 85 | def sample_dtype(self): 86 | return tf.float32 87 | 88 | class BernoulliPdType(PdType): 89 | def __init__(self, size): 90 | self.size = size 91 | def pdclass(self): 92 | return BernoulliPd 93 | def param_shape(self): 94 | return [self.size] 95 | def sample_shape(self): 96 | return [self.size] 97 | def sample_dtype(self): 98 | return tf.int32 99 | 100 | # WRONG SECOND DERIVATIVES 101 | # class 
CategoricalPd(Pd): 102 | # def __init__(self, logits): 103 | # self.logits = logits 104 | # self.ps = tf.nn.softmax(logits) 105 | # @classmethod 106 | # def fromflat(cls, flat): 107 | # return cls(flat) 108 | # def flatparam(self): 109 | # return self.logits 110 | # def mode(self): 111 | # return U.argmax(self.logits, axis=-1) 112 | # def logp(self, x): 113 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 114 | # def kl(self, other): 115 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 116 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 117 | # def entropy(self): 118 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 119 | # def sample(self): 120 | # u = tf.random_uniform(tf.shape(self.logits)) 121 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 122 | 123 | class CategoricalPd(Pd): 124 | def __init__(self, logits): 125 | self.logits = logits 126 | def flatparam(self): 127 | return self.logits 128 | def mode(self): 129 | return U.argmax(self.logits, axis=-1) 130 | def neglogp(self, x): 131 | # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 132 | # Note: we can't use sparse_softmax_cross_entropy_with_logits because 133 | # the implementation does not allow second-order derivatives... 134 | one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) 135 | return tf.nn.softmax_cross_entropy_with_logits( 136 | logits=self.logits, 137 | labels=one_hot_actions) 138 | def kl(self, other): 139 | a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True) 140 | a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True) 141 | ea0 = tf.exp(a0) 142 | ea1 = tf.exp(a1) 143 | z0 = U.sum(ea0, axis=-1, keepdims=True) 144 | z1 = U.sum(ea1, axis=-1, keepdims=True) 145 | p0 = ea0 / z0 146 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) 147 | def entropy(self): 148 | a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True) 149 | ea0 = tf.exp(a0) 150 | z0 = U.sum(ea0, axis=-1, keepdims=True) 151 | p0 = ea0 / z0 152 | return U.sum(p0 * (tf.log(z0) - a0), axis=-1) 153 | def sample(self): 154 | u = tf.random_uniform(tf.shape(self.logits)) 155 | return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 156 | @classmethod 157 | def fromflat(cls, flat): 158 | return cls(flat) 159 | 160 | class MultiCategoricalPd(Pd): 161 | def __init__(self, low, high, flat): 162 | self.flat = flat 163 | self.low = tf.constant(low, dtype=tf.int32) 164 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 165 | def flatparam(self): 166 | return self.flat 167 | def mode(self): 168 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 169 | def neglogp(self, x): 170 | return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 171 | def kl(self, other): 172 | return tf.add_n([ 173 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 174 | ]) 175 | def entropy(self): 176 | return tf.add_n([p.entropy() for p in self.categoricals]) 177 | def sample(self): 178 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 179 | @classmethod 180 | def fromflat(cls, flat): 181 | raise NotImplementedError 182 | 183 | class DiagGaussianPd(Pd): 184 | def __init__(self, flat): 185 | self.flat = flat 186 | mean, logstd = tf.split(axis=len(flat.shape)-1, 
num_or_size_splits=2, value=flat) 187 | self.mean = mean 188 | self.logstd = logstd 189 | self.std = tf.exp(logstd) 190 | def flatparam(self): 191 | return self.flat 192 | def mode(self): 193 | return self.mean 194 | def neglogp(self, x): 195 | return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \ 196 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 197 | + U.sum(self.logstd, axis=-1) 198 | def kl(self, other): 199 | assert isinstance(other, DiagGaussianPd) 200 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) 201 | def entropy(self): 202 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 203 | def sample(self): 204 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 205 | @classmethod 206 | def fromflat(cls, flat): 207 | return cls(flat) 208 | 209 | class BernoulliPd(Pd): 210 | def __init__(self, logits): 211 | self.logits = logits 212 | self.ps = tf.sigmoid(logits) 213 | def flatparam(self): 214 | return self.logits 215 | def mode(self): 216 | return tf.round(self.ps) 217 | def neglogp(self, x): 218 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) 219 | def kl(self, other): 220 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 221 | def entropy(self): 222 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 223 | def sample(self): 224 | u = tf.random_uniform(tf.shape(self.ps)) 225 | return tf.to_float(math_ops.less(u, self.ps)) 226 | @classmethod 227 | def fromflat(cls, flat): 228 | return cls(flat) 229 | 230 | def make_pdtype(ac_space): 231 | from gym import spaces 232 | if isinstance(ac_space, spaces.Box): 233 | assert len(ac_space.shape) == 1 234 | return DiagGaussianPdType(ac_space.shape[0]) 235 | elif isinstance(ac_space, spaces.Discrete): 236 | return CategoricalPdType(ac_space.n) 237 | elif isinstance(ac_space, spaces.MultiDiscrete): 238 | return MultiCategoricalPdType(ac_space.low, ac_space.high) 239 | elif isinstance(ac_space, spaces.MultiBinary): 240 | return BernoulliPdType(ac_space.n) 241 | else: 242 | raise NotImplementedError 243 | 244 | def shape_el(v, i): 245 | maybe = v.get_shape()[i] 246 | if maybe is not None: 247 | return maybe 248 | else: 249 | return tf.shape(v)[i] 250 | 251 | @U.in_session 252 | def test_probtypes(): 253 | np.random.seed(0) 254 | 255 | pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) 256 | diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 257 | validate_probtype(diag_gauss, pdparam_diag_gauss) 258 | 259 | pdparam_categorical = np.array([-.2, .3, .5]) 260 | categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 261 | validate_probtype(categorical, pdparam_categorical) 262 | 263 | pdparam_bernoulli = np.array([-.2, .3, .5]) 264 | bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 265 | validate_probtype(bernoulli, pdparam_bernoulli) 266 | 267 | 268 | def validate_probtype(probtype, pdparam): 269 | N = 100000 270 | # Check to see if mean negative log likelihood == differential entropy 271 | Mval = np.repeat(pdparam[None, :], N, axis=0) 272 | M = probtype.param_placeholder([N]) 273 | X = probtype.sample_placeholder([N]) 274 | pd = 
probtype.pdclass()(M) 275 | calcloglik = U.function([X, M], pd.logp(X)) 276 | calcent = U.function([M], pd.entropy()) 277 | Xval = U.eval(pd.sample(), feed_dict={M:Mval}) 278 | logliks = calcloglik(Xval, Mval) 279 | entval_ll = - logliks.mean() #pylint: disable=E1101 280 | entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 281 | entval = calcent(Mval).mean() #pylint: disable=E1101 282 | assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas 283 | 284 | # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] 285 | M2 = probtype.param_placeholder([N]) 286 | pd2 = probtype.pdclass()(M2) 287 | q = pdparam + np.random.randn(pdparam.size) * 0.1 288 | Mval2 = np.repeat(q[None, :], N, axis=0) 289 | calckl = U.function([M, M2], pd.kl(pd2)) 290 | klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 291 | logliks = calcloglik(Xval, Mval2) 292 | klval_ll = - entval - logliks.mean() #pylint: disable=E1101 293 | klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 294 | assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas 295 | -------------------------------------------------------------------------------- /openai_baselines_ppo/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import os.path as osp 5 | import json 6 | import time 7 | import datetime 8 | import tempfile 9 | 10 | LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json'] 11 | 12 | DEBUG = 10 13 | INFO = 20 14 | WARN = 30 15 | ERROR = 40 16 | 17 | DISABLED = 50 18 | 19 | class OutputFormat(object): 20 | def writekvs(self, kvs): 21 | """ 22 | Write key-value pairs 23 | """ 24 | raise NotImplementedError 25 | 26 | def writeseq(self, args): 27 | """ 28 | Write a sequence of other data (e.g. a logging message) 29 | """ 30 | pass 31 | 32 | def close(self): 33 | return 34 | 35 | 36 | class HumanOutputFormat(OutputFormat): 37 | def __init__(self, file): 38 | self.file = file 39 | 40 | def writekvs(self, kvs): 41 | # Create strings for printing 42 | key2str = {} 43 | for (key, val) in sorted(kvs.items()): 44 | if isinstance(val, float): 45 | valstr = '%-8.3g' % (val,) 46 | else: 47 | valstr = str(val) 48 | key2str[self._truncate(key)] = self._truncate(valstr) 49 | 50 | # Find max widths 51 | keywidth = max(map(len, key2str.keys())) 52 | valwidth = max(map(len, key2str.values())) 53 | 54 | # Write out the data 55 | dashes = '-' * (keywidth + valwidth + 7) 56 | lines = [dashes] 57 | for (key, val) in sorted(key2str.items()): 58 | lines.append('| %s%s | %s%s |' % ( 59 | key, 60 | ' ' * (keywidth - len(key)), 61 | val, 62 | ' ' * (valwidth - len(val)), 63 | )) 64 | lines.append(dashes) 65 | self.file.write('\n'.join(lines) + '\n') 66 | 67 | # Flush the output to the file 68 | self.file.flush() 69 | 70 | def _truncate(self, s): 71 | return s[:20] + '...' if len(s) > 23 else s 72 | 73 | def writeseq(self, args): 74 | for arg in args: 75 | self.file.write(arg) 76 | self.file.write('\n') 77 | self.file.flush() 78 | 79 | class JSONOutputFormat(OutputFormat): 80 | def __init__(self, file): 81 | self.file = file 82 | 83 | def writekvs(self, kvs): 84 | for k, v in sorted(kvs.items()): 85 | if hasattr(v, 'dtype'): 86 | v = v.tolist() 87 | kvs[k] = float(v) 88 | self.file.write(json.dumps(kvs) + '\n') 89 | self.file.flush() 90 | 91 | class TensorBoardOutputFormat(OutputFormat): 92 | """ 93 | Dumps key/value pairs into TensorBoard's numeric format. 
94 | """ 95 | def __init__(self, dir): 96 | os.makedirs(dir, exist_ok=True) 97 | self.dir = dir 98 | self.step = 1 99 | prefix = 'events' 100 | path = osp.join(osp.abspath(dir), prefix) 101 | import tensorflow as tf 102 | from tensorflow.python import pywrap_tensorflow 103 | from tensorflow.core.util import event_pb2 104 | from tensorflow.python.util import compat 105 | self.tf = tf 106 | self.event_pb2 = event_pb2 107 | self.pywrap_tensorflow = pywrap_tensorflow 108 | self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) 109 | 110 | def writekvs(self, kvs): 111 | def summary_val(k, v): 112 | kwargs = {'tag': k, 'simple_value': float(v)} 113 | return self.tf.Summary.Value(**kwargs) 114 | summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) 115 | event = self.event_pb2.Event(wall_time=time.time(), summary=summary) 116 | event.step = self.step # is there any reason why you'd want to specify the step? 117 | self.writer.WriteEvent(event) 118 | self.writer.Flush() 119 | self.step += 1 120 | 121 | def close(self): 122 | if self.writer: 123 | self.writer.Close() 124 | self.writer = None 125 | 126 | 127 | def make_output_format(format, ev_dir): 128 | os.makedirs(ev_dir, exist_ok=True) 129 | if format == 'stdout': 130 | return HumanOutputFormat(sys.stdout) 131 | elif format == 'log': 132 | log_file = open(osp.join(ev_dir, 'log.txt'), 'wt') 133 | return HumanOutputFormat(log_file) 134 | elif format == 'json': 135 | json_file = open(osp.join(ev_dir, 'progress.json'), 'wt') 136 | return JSONOutputFormat(json_file) 137 | elif format == 'tensorboard': 138 | return TensorBoardOutputFormat(osp.join(ev_dir, 'tb')) 139 | else: 140 | raise ValueError('Unknown format specified: %s' % (format,)) 141 | 142 | # ================================================================ 143 | # API 144 | # ================================================================ 145 | 146 | def logkv(key, val): 147 | """ 148 | Log a value of some diagnostic 149 | Call this once for each diagnostic quantity, each iteration 150 | """ 151 | Logger.CURRENT.logkv(key, val) 152 | 153 | def logkvs(d): 154 | """ 155 | Log a dictionary of key-value pairs 156 | """ 157 | for (k, v) in d.items(): 158 | logkv(k, v) 159 | 160 | def dumpkvs(): 161 | """ 162 | Write all of the diagnostics from the current iteration 163 | 164 | level: int. (see logger.py docs) If the global logger level is higher than 165 | the level argument here, don't print to stdout. 166 | """ 167 | Logger.CURRENT.dumpkvs() 168 | 169 | def getkvs(): 170 | return Logger.CURRENT.name2val 171 | 172 | 173 | def log(*args, level=INFO): 174 | """ 175 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 176 | """ 177 | Logger.CURRENT.log(*args, level=level) 178 | 179 | 180 | def debug(*args): 181 | log(*args, level=DEBUG) 182 | 183 | 184 | def info(*args): 185 | log(*args, level=INFO) 186 | 187 | 188 | def warn(*args): 189 | log(*args, level=WARN) 190 | 191 | 192 | def error(*args): 193 | log(*args, level=ERROR) 194 | 195 | 196 | def set_level(level): 197 | """ 198 | Set logging threshold on current logger. 199 | """ 200 | Logger.CURRENT.set_level(level) 201 | 202 | def get_dir(): 203 | """ 204 | Get directory that log files are being written to. 
205 | will be None if there is no output directory (i.e., if you didn't call start) 206 | """ 207 | return Logger.CURRENT.get_dir() 208 | 209 | record_tabular = logkv 210 | dump_tabular = dumpkvs 211 | 212 | # ================================================================ 213 | # Backend 214 | # ================================================================ 215 | 216 | class Logger(object): 217 | DEFAULT = None # A logger with no output files. (See right below class definition) 218 | # So that you can still log to the terminal without setting up any output files 219 | CURRENT = None # Current logger being used by the free functions above 220 | 221 | def __init__(self, dir, output_formats): 222 | self.name2val = {} # values this iteration 223 | self.level = INFO 224 | self.dir = dir 225 | self.output_formats = output_formats 226 | 227 | # Logging API, forwarded 228 | # ---------------------------------------- 229 | def logkv(self, key, val): 230 | self.name2val[key] = val 231 | 232 | def dumpkvs(self): 233 | if self.level == DISABLED: return 234 | for fmt in self.output_formats: 235 | fmt.writekvs(self.name2val) 236 | self.name2val.clear() 237 | 238 | def log(self, *args, level=INFO): 239 | if self.level <= level: 240 | self._do_log(args) 241 | 242 | # Configuration 243 | # ---------------------------------------- 244 | def set_level(self, level): 245 | self.level = level 246 | 247 | def get_dir(self): 248 | return self.dir 249 | 250 | def close(self): 251 | for fmt in self.output_formats: 252 | fmt.close() 253 | 254 | # Misc 255 | # ---------------------------------------- 256 | def _do_log(self, args): 257 | for fmt in self.output_formats: 258 | fmt.writeseq(args) 259 | 260 | Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) 261 | 262 | def configure(dir=None, format_strs=None): 263 | assert Logger.CURRENT is Logger.DEFAULT,\ 264 | "Only call logger.configure() when it's in the default state. Try calling logger.reset() first." 
265 |     prevlogger = Logger.CURRENT
266 |     if dir is None:
267 |         dir = os.getenv('OPENAI_LOGDIR')
268 |     if dir is None:
269 |         dir = osp.join(tempfile.gettempdir(),
270 |             datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
271 |     if format_strs is None:
272 |         format_strs = LOG_OUTPUT_FORMATS
273 |     output_formats = [make_output_format(f, dir) for f in format_strs]
274 |     Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
275 |     log('Logging to %s'%dir)
276 | 
277 | if os.getenv('OPENAI_LOGDIR'):
278 |     # if OPENAI_LOGDIR is set, configure the logger on import
279 |     # this is kind of nasty (unexpected to the user), but I don't know how else to inject the logger
280 |     # into a script that's getting run in a subprocess
281 |     configure(dir=os.getenv('OPENAI_LOGDIR'))
282 | 
283 | def reset():
284 |     Logger.CURRENT = Logger.DEFAULT
285 |     log('Reset logger')
286 | 
287 | # ================================================================
288 | 
289 | def _demo():
290 |     info("hi")
291 |     debug("shouldn't appear")
292 |     set_level(DEBUG)
293 |     debug("should appear")
294 |     dir = "/tmp/testlogging"
295 |     if os.path.exists(dir):
296 |         shutil.rmtree(dir)
297 |     configure(dir=dir)  # this file defines no `session` context manager, so configure directly
298 |     logkv("a", 3)
299 |     logkv("b", 2.5)
300 |     dumpkvs()
301 |     logkv("b", -2.5)
302 |     logkv("a", 5.5)
303 |     dumpkvs()
304 |     info("^^^ should see a = 5.5")
305 | 
306 |     logkv("b", -2.5)
307 |     dumpkvs()
308 | 
309 |     logkv("a", "longasslongasslongasslongasslongasslongassvalue")
310 |     dumpkvs()
311 | 
312 | 
313 | if __name__ == "__main__":
314 |     _demo()
315 | 
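A minimal sketch of how this logger module is typically driven from training code (the directory path is illustrative; `configure`, `logkv`, and `dumpkvs` are the functions defined above):

```python
import logger

logger.configure(dir="/tmp/ppo_run")  # sets up stdout, log.txt, and progress.json writers
for iteration in range(3):
    # record one value per diagnostic key, then flush the whole row once per iteration
    logger.logkv("iteration", iteration)
    logger.logkv("EpRewMean", 100.0 + iteration)
    logger.dumpkvs()
```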
--------------------------------------------------------------------------------
/openai_baselines_ppo/math_util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.signal
3 | 
4 | 
5 | def discount(x, gamma):
6 |     """
7 |     computes discounted sums along 0th dimension of x.
8 | 
9 |     inputs
10 |     ------
11 |     x: ndarray
12 |     gamma: float
13 | 
14 |     outputs
15 |     -------
16 |     y: ndarray with same shape as x, satisfying
17 | 
18 |         y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
19 |             where k = len(x) - t - 1
20 | 
21 |     """
22 |     assert x.ndim >= 1
23 |     return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1]
24 | 
25 | def explained_variance(ypred,y):
26 |     """
27 |     Computes fraction of variance that ypred explains about y.
28 |     Returns 1 - Var[y-ypred] / Var[y]
29 | 
30 |     interpretation:
31 |         ev=0  =>  might as well have predicted zero
32 |         ev=1  =>  perfect prediction
33 |         ev<0  =>  worse than just predicting zero
34 | 
35 |     """
36 |     assert y.ndim == 1 and ypred.ndim == 1
37 |     vary = np.var(y)
38 |     return np.nan if vary==0 else 1 - np.var(y-ypred)/vary
39 | 
40 | def explained_variance_2d(ypred, y):
41 |     assert y.ndim == 2 and ypred.ndim == 2
42 |     vary = np.var(y, axis=0)
43 |     out = 1 - np.var(y-ypred, axis=0)/vary  # per-column variance; the original was missing axis=0
44 |     out[vary < 1e-10] = 0
45 |     return out
46 | 
47 | def ncc(ypred, y):
48 |     return np.corrcoef(ypred, y)[1,0]
49 | 
50 | def flatten_arrays(arrs):
51 |     return np.concatenate([arr.flat for arr in arrs])
52 | 
53 | def unflatten_vector(vec, shapes):
54 |     i=0
55 |     arrs = []
56 |     for shape in shapes:
57 |         size = np.prod(shape)
58 |         arr = vec[i:i+size].reshape(shape)
59 |         arrs.append(arr)
60 |         i += size
61 |     return arrs
62 | 
63 | def discount_with_boundaries(X, New, gamma):
64 |     """
65 |     X: 2d array of floats, time x features
66 |     New: 2d array of bools, indicating when a new episode has started
67 |     """
68 |     Y = np.zeros_like(X)
69 |     T = X.shape[0]
70 |     Y[T-1] = X[T-1]
71 |     for t in range(T-2, -1, -1):
72 |         Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1])
73 |     return Y
74 | 
75 | def test_discount_with_boundaries():
76 |     gamma=0.9
77 |     x = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
78 |     starts = [1.0, 0.0, 0.0, 1.0]
79 |     y = discount_with_boundaries(x, starts, gamma)
80 |     assert np.allclose(y, [
81 |         1 + gamma * 2 + gamma**2 * 3,
82 |         2 + gamma * 3,
83 |         3,
84 |         4
85 |     ])
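A small worked check of `discount` and `explained_variance` above (a sketch; the values are chosen by hand to be easy to verify):

```python
import numpy as np
from math_util import discount, explained_variance

# y[t] = x[t] + 0.9*x[t+1] + 0.81*x[t+2] for x = [1, 1, 1]
assert np.allclose(discount(np.array([1.0, 1.0, 1.0]), 0.9), [2.71, 1.9, 1.0])

# predictions close to the targets explain most of the variance (ev near 1)
targets = np.array([1.0, 2.0, 3.0, 4.0])
preds = targets + np.array([0.1, -0.1, 0.05, 0.0])
print(explained_variance(preds, targets))  # slightly below 1.0
```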
56 | """ 57 | 58 | def __init__(self, *args, **kwargs): 59 | self._ezpickle_args = args 60 | self._ezpickle_kwargs = kwargs 61 | 62 | def __getstate__(self): 63 | return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} 64 | 65 | def __setstate__(self, d): 66 | out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) 67 | self.__dict__.update(out.__dict__) 68 | 69 | 70 | def set_global_seeds(i): 71 | try: 72 | import tensorflow as tf 73 | except ImportError: 74 | pass 75 | else: 76 | tf.set_random_seed(i) 77 | np.random.seed(i) 78 | random.seed(i) 79 | 80 | 81 | def pretty_eta(seconds_left): 82 | """Print the number of seconds in human readable format. 83 | 84 | Examples: 85 | 2 days 86 | 2 hours and 37 minutes 87 | less than a minute 88 | 89 | Paramters 90 | --------- 91 | seconds_left: int 92 | Number of seconds to be converted to the ETA 93 | Returns 94 | ------- 95 | eta: str 96 | String representing the pretty ETA. 97 | """ 98 | minutes_left = seconds_left // 60 99 | seconds_left %= 60 100 | hours_left = minutes_left // 60 101 | minutes_left %= 60 102 | days_left = hours_left // 24 103 | hours_left %= 24 104 | 105 | def helper(cnt, name): 106 | return "{} {}{}".format(str(cnt), name, ('s' if cnt > 1 else '')) 107 | 108 | if days_left > 0: 109 | msg = helper(days_left, 'day') 110 | if hours_left > 0: 111 | msg += ' and ' + helper(hours_left, 'hour') 112 | return msg 113 | if hours_left > 0: 114 | msg = helper(hours_left, 'hour') 115 | if minutes_left > 0: 116 | msg += ' and ' + helper(minutes_left, 'minute') 117 | return msg 118 | if minutes_left > 0: 119 | return helper(minutes_left, 'minute') 120 | return 'less than a minute' 121 | 122 | 123 | class RunningAvg(object): 124 | def __init__(self, gamma, init_value=None): 125 | """Keep a running estimate of a quantity. This is a bit like mean 126 | but more sensitive to recent changes. 127 | 128 | Parameters 129 | ---------- 130 | gamma: float 131 | Must be between 0 and 1, where 0 is the most sensitive to recent 132 | changes. 133 | init_value: float or None 134 | Initial value of the estimate. If None, it will be set on the first update. 135 | """ 136 | self._value = init_value 137 | self._gamma = gamma 138 | 139 | def update(self, new_val): 140 | """Update the estimate. 141 | 142 | Parameters 143 | ---------- 144 | new_val: float 145 | new observated value of estimated quantity. 146 | """ 147 | if self._value is None: 148 | self._value = new_val 149 | else: 150 | self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val 151 | 152 | def __float__(self): 153 | """Get the current estimate""" 154 | return self._value 155 | 156 | 157 | class SimpleMonitor(gym.Wrapper): 158 | def __init__(self, env): 159 | """Adds two qunatities to info returned by every step: 160 | 161 | num_steps: int 162 | Number of steps takes so far 163 | rewards: [float] 164 | All the cumulative rewards for the episodes completed so far. 
165 | """ 166 | super().__init__(env) 167 | # current episode state 168 | self._current_reward = None 169 | self._num_steps = None 170 | # temporary monitor state that we do not save 171 | self._time_offset = None 172 | self._total_steps = None 173 | # monitor state 174 | self._episode_rewards = [] 175 | self._episode_lengths = [] 176 | self._episode_end_times = [] 177 | 178 | def _reset(self): 179 | obs = self.env.reset() 180 | # recompute temporary state if needed 181 | if self._time_offset is None: 182 | self._time_offset = time.time() 183 | if len(self._episode_end_times) > 0: 184 | self._time_offset -= self._episode_end_times[-1] 185 | if self._total_steps is None: 186 | self._total_steps = sum(self._episode_lengths) 187 | # update monitor state 188 | if self._current_reward is not None: 189 | self._episode_rewards.append(self._current_reward) 190 | self._episode_lengths.append(self._num_steps) 191 | self._episode_end_times.append(time.time() - self._time_offset) 192 | # reset episode state 193 | self._current_reward = 0 194 | self._num_steps = 0 195 | 196 | return obs 197 | 198 | def _step(self, action): 199 | obs, rew, done, info = self.env.step(action) 200 | self._current_reward += rew 201 | self._num_steps += 1 202 | self._total_steps += 1 203 | info['steps'] = self._total_steps 204 | info['rewards'] = self._episode_rewards 205 | return (obs, rew, done, info) 206 | 207 | def get_state(self): 208 | return { 209 | 'env_id': self.env.unwrapped.spec.id, 210 | 'episode_data': { 211 | 'episode_rewards': self._episode_rewards, 212 | 'episode_lengths': self._episode_lengths, 213 | 'episode_end_times': self._episode_end_times, 214 | 'initial_reset_time': 0, 215 | } 216 | } 217 | 218 | def set_state(self, state): 219 | assert state['env_id'] == self.env.unwrapped.spec.id 220 | ed = state['episode_data'] 221 | self._episode_rewards = ed['episode_rewards'] 222 | self._episode_lengths = ed['episode_lengths'] 223 | self._episode_end_times = ed['episode_end_times'] 224 | 225 | 226 | def boolean_flag(parser, name, default=False, help=None): 227 | """Add a boolean flag to argparse parser. 
228 | 
229 |     Parameters
230 |     ----------
231 |     parser: argparse.Parser
232 |         parser to add the flag to
233 |     name: str
234 |         --<name> will enable the flag, while --no-<name> will disable it
235 |     default: bool or None
236 |         default value of the flag
237 |     help: str
238 |         help string for the flag
239 |     """
240 |     dest = name.replace('-', '_')
241 |     parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help)
242 |     parser.add_argument("--no-" + name, action="store_false", dest=dest)
243 | 
244 | 
245 | def get_wrapper_by_name(env, classname):
246 |     """Given a gym environment possibly wrapped multiple times, returns a wrapper
247 |     of class named classname or raises ValueError if no such wrapper was applied
248 | 
249 |     Parameters
250 |     ----------
251 |     env: gym.Env or gym.Wrapper
252 |         gym environment
253 |     classname: str
254 |         name of the wrapper
255 | 
256 |     Returns
257 |     -------
258 |     wrapper: gym.Wrapper
259 |         wrapper named classname
260 |     """
261 |     currentenv = env
262 |     while True:
263 |         if classname == currentenv.class_name():
264 |             return currentenv
265 |         elif isinstance(currentenv, gym.Wrapper):
266 |             currentenv = currentenv.env
267 |         else:
268 |             raise ValueError("Couldn't find wrapper named %s" % classname)
269 | 
270 | 
271 | def relatively_safe_pickle_dump(obj, path, compression=False):
272 |     """This is just like a regular pickle dump, except that the failure cases are
273 |     different:
274 | 
275 |     - It's never possible that we end up with a pickle in corrupted state.
276 |     - If there was a different file at the path, that file will remain unchanged in the
277 |       event of failure (provided that filesystem rename is atomic).
278 |     - it is sometimes possible that we end up with a useless temp file which needs to be
279 |       deleted manually (it will be removed automatically on the next function call)
280 | 
281 |     The intended use case is periodic checkpoints of experiment state, such that we never
282 |     corrupt previous checkpoints if the current one fails.
283 | 
284 |     Parameters
285 |     ----------
286 |     obj: object
287 |         object to pickle
288 |     path: str
289 |         path to the output file
290 |     compression: bool
291 |         if true pickle will be compressed
292 |     """
293 |     temp_storage = path + ".relatively_safe"
294 |     if compression:
295 |         # Using gzip here would be simpler, but the size is limited to 2GB
296 |         with tempfile.NamedTemporaryFile() as uncompressed_file:
297 |             pickle.dump(obj, uncompressed_file)
298 |             with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip:
299 |                 myzip.write(uncompressed_file.name, "data")
300 |     else:
301 |         with open(temp_storage, "wb") as f:
302 |             pickle.dump(obj, f)
303 |     os.rename(temp_storage, path)
304 | 
305 | 
306 | def pickle_load(path, compression=False):
307 |     """Unpickle a possibly compressed pickle.
308 | 
309 |     Parameters
310 |     ----------
311 |     path: str
312 |         path to the output file
313 |     compression: bool
314 |         if true assumes that pickle was compressed when created and attempts decompression.
315 | 316 | Returns 317 | ------- 318 | obj: object 319 | the unpickled object 320 | """ 321 | 322 | if compression: 323 | with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: 324 | with myzip.open("data") as f: 325 | return pickle.load(f) 326 | else: 327 | with open(path, "rb") as f: 328 | return pickle.load(f) 329 | -------------------------------------------------------------------------------- /openai_baselines_ppo/mlp_policy.py: -------------------------------------------------------------------------------- 1 | from mpi_running_mean_std import RunningMeanStd 2 | import tf_util as U 3 | import tensorflow as tf 4 | import gym 5 | from distributions import make_pdtype 6 | 7 | class MlpPolicy(object): 8 | recurrent = False 9 | def __init__(self, name, *args, **kwargs): 10 | with tf.variable_scope(name): 11 | self._init(*args, **kwargs) 12 | self.scope = tf.get_variable_scope().name 13 | 14 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 15 | assert isinstance(ob_space, gym.spaces.Box) 16 | 17 | self.pdtype = pdtype = make_pdtype(ac_space) 18 | sequence_length = None 19 | 20 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 21 | #obz = ob 22 | 23 | #with tf.variable_scope("obfilter"): 24 | # self.ob_rms = RunningMeanStd(shape=ob_space.shape) 25 | 26 | #obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 27 | last_out = ob 28 | for i in range(num_hid_layers): 29 | last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i"%(i+1), weight_init=U.normc_initializer(1.0))) 30 | self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:,0] 31 | 32 | last_out = ob 33 | for i in range(num_hid_layers): 34 | last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i"%(i+1), weight_init=U.normc_initializer(1.0))) 35 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 36 | mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) 37 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer) 38 | pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1) 39 | else: 40 | pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) 41 | 42 | self.pd = pdtype.pdfromflat(pdparam) 43 | 44 | self.state_in = [] 45 | self.state_out = [] 46 | 47 | stochastic = tf.placeholder(dtype=tf.bool, shape=()) 48 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) 49 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 50 | 51 | def act(self, stochastic, ob): 52 | ac1, vpred1 = self._act(stochastic, [ob]) 53 | return ac1[0], vpred1[0] 54 | def get_variables(self): 55 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 56 | def get_trainable_variables(self): 57 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 58 | def get_initial_state(self): 59 | return [] 60 | 61 | -------------------------------------------------------------------------------- /openai_baselines_ppo/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | 
self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) 80 | -------------------------------------------------------------------------------- /openai_baselines_ppo/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from misc_util import zipsame 4 | 5 | def mpi_moments(x, axis=0): 6 | x = np.asarray(x, dtype='float64') 7 | newshape = list(x.shape) 8 | newshape.pop(axis) 9 | n = np.prod(newshape,dtype=int) 10 | totalvec = np.zeros(n*2+1, 'float64') 11 | addvec = np.concatenate([x.sum(axis=axis).ravel(), 12 | np.square(x).sum(axis=axis).ravel(), 13 | np.array([x.shape[axis]],dtype='float64')]) 14 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 15 | sum = totalvec[:n] 16 | sumsq = totalvec[n:2*n] 17 | count = totalvec[2*n] 18 | if count == 0: 19 | mean = np.empty(newshape); mean[:] = np.nan 20 | std = np.empty(newshape); std[:] = np.nan 21 | else: 22 | mean = sum/count 23 | std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0)) 24 | return mean, std, count 25 | 26 | 27 | def test_runningmeanstd(): 28 | comm = MPI.COMM_WORLD 29 | np.random.seed(0) 30 | for (triple,axis) in [ 
31 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 32 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 33 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 34 | ]: 35 | 36 | 37 | x = np.concatenate(triple, axis=axis) 38 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 39 | 40 | 41 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 42 | 43 | for (a1,a2) in zipsame(ms1, ms2): 44 | print(a1, a2) 45 | assert np.allclose(a1, a2) 46 | print("ok!") 47 | 48 | if __name__ == "__main__": 49 | #mpirun -np 3 python