├── baselines ├── __init__.py ├── deepq │ ├── prediction │ │ ├── __init__.py │ │ ├── tool │ │ │ ├── __init__.py │ │ │ ├── visualize.py │ │ │ ├── visualize_all.py │ │ │ ├── dump_tfrecords.py │ │ │ ├── compute_mean.py │ │ │ ├── summary.py │ │ │ ├── episode_reader.py │ │ │ └── episode_collector.py │ │ ├── tfacvp │ │ │ ├── __init__.py │ │ │ ├── tf_ops.py │ │ │ ├── util.py │ │ │ ├── dataset.py │ │ │ ├── old_model.py │ │ │ └── model.py │ │ ├── setup.sh │ │ ├── example │ │ │ ├── pred-00.png │ │ │ ├── pred-01.png │ │ │ ├── pred-02.png │ │ │ ├── pred-03.png │ │ │ ├── pred-04.png │ │ │ ├── pred-05.png │ │ │ ├── pred-06.png │ │ │ ├── pred-07.png │ │ │ ├── pred-08.png │ │ │ ├── atari-gray │ │ │ │ ├── a_t.npy │ │ │ │ ├── s_t.npy │ │ │ │ ├── pred.png │ │ │ │ ├── x_t_1.npy │ │ │ │ ├── README.md │ │ │ │ └── example.py │ │ │ └── atari-rgb │ │ │ │ ├── mean.npy │ │ │ │ ├── example.npy │ │ │ │ ├── example.png │ │ │ │ ├── README.md │ │ │ │ └── example.py │ │ ├── misc │ │ │ ├── check_caffe_weight.py │ │ │ ├── test_conv1.py │ │ │ ├── check.py │ │ │ └── test_caffe.py │ │ ├── old_train.sh │ │ ├── README.md │ │ ├── train.sh │ │ └── train.py │ ├── experiments │ │ ├── __init__.py │ │ ├── atari │ │ │ ├── __init__.py │ │ │ ├── download_model.py │ │ │ ├── model.py │ │ │ ├── wang2015_eval.py │ │ │ ├── enjoy.py │ │ │ └── train.py │ │ ├── enjoy_cartpole.py │ │ ├── enjoy_pong.py │ │ ├── train_cartpole.py │ │ ├── train_pong.py │ │ └── custom_cartpole.py │ ├── __init__.py │ ├── models.py │ ├── replay_buffer.py │ ├── simple.py │ └── build_graph.py ├── common │ ├── __init__.py │ ├── tests │ │ ├── test_schedules.py │ │ ├── test_tf_util.py │ │ └── test_segment_tree.py │ ├── schedules.py │ ├── segment_tree.py │ ├── azure_utils.py │ ├── atari_wrappers_deprecated.py │ ├── misc_util.py │ └── tf_util.py └── logger.py ├── atari-pre-trained-agents └── README.md ├── atari-visual-foresight └── README.md ├── data ├── logo.jpg └── cartpole.gif ├── .gitignore ├── setup.py ├── LICENSE └── README.md /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /atari-pre-trained-agents/README.md: -------------------------------------------------------------------------------- 1 | # Save pre-trained atari agents here -------------------------------------------------------------------------------- /atari-visual-foresight/README.md: 
-------------------------------------------------------------------------------- 1 | # Save pre-trained visual foresight model here -------------------------------------------------------------------------------- /baselines/deepq/prediction/setup.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=$(pwd):$PYTHONPATH 2 | -------------------------------------------------------------------------------- /baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from baselines.common.misc_util import * 5 | -------------------------------------------------------------------------------- /data/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/data/logo.jpg -------------------------------------------------------------------------------- /data/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/data/cartpole.gif -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-00.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-01.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-02.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-03.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-04.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-05.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-06.png -------------------------------------------------------------------------------- 
/baselines/deepq/prediction/example/pred-07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-07.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-08.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/a_t.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-gray/a_t.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/s_t.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-gray/s_t.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/mean.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-rgb/mean.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/pred.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-gray/pred.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/x_t_1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-gray/x_t_1.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/example.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-rgb/example.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-rgb/example.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/visualize.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import sys 4 | 5 | img = np.load(sys.argv[1]).astype(np.uint8) 6 | cv2.imshow('img', img) 7 | cv2.waitKey(0) 8 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/misc/check_caffe_weight.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import sys 4 | 5 | data = pickle.load(open(sys.argv[1], "rb")) 6 | for key in data: 7 | print(key, data[key].shape) 8 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/visualize_all.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import sys 4 | 5 | ss = np.load(sys.argv[1]).astype(np.uint8) 6 | for s in ss: 7 | for i in range(0, 12, 3): 8 | cv2.imshow('img%d' % i, s[:,:,i:i+3]) 9 | cv2.waitKey(0) 10 | -------------------------------------------------------------------------------- /baselines/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.deepq import models # noqa 2 | from baselines.deepq.build_graph import build_act, build_train # noqa 3 | 4 | from baselines.deepq.simple import learn, load # noqa 5 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa 6 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/old_train.sh: -------------------------------------------------------------------------------- 1 | GAME=$1 2 | NUM_ACT=$2 3 | COLOR=$3 4 | TRAIN="${GAME}/train" 5 | TEST="${GAME}/test" 6 | MEAN="${GAME}/mean.npy" 7 | LOG="models/${GAME}-${COLOR}-model" 8 | 9 | python train.py --train ${TRAIN} --test ${TEST} --mean ${MEAN} --num_act ${NUM_ACT} --color ${COLOR} --log ${LOG} 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.py~ 4 | .DS_Store 5 | .idea 6 | 7 | # Setuptools distribution and build folders. 8 | /dist/ 9 | /build 10 | 11 | # Virtualenv 12 | /env 13 | 14 | # Python egg metadata, regenerated from source files by setuptools. 15 | /*.egg-info 16 | 17 | *.sublime-project 18 | *.sublime-workspace 19 | 20 | .idea 21 | 22 | logs/ 23 | 24 | .ipynb_checkpoints 25 | ghostdriver.log 26 | 27 | htmlcov 28 | 29 | junk -------------------------------------------------------------------------------- /baselines/deepq/prediction/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | ``` 3 | cd tensorflow-action-conditional-video-prediction 4 | source setup.sh 5 | ``` 6 | 7 | # Train 8 | ## Atari 9 | ``` 10 | ./train_atari.sh ${game name} ${num_act} ${colorspace [rgb|gray]} {gpu id} 11 | e.g.
./train_atari.sh MsPacman-v0 9 gray 0 12 | ``` 13 | 14 | # Test 15 | 16 | # Model zoo 17 | Since the pre-trained models are too large to include in this repository, please download them from [here](https://drive.google.com/drive/u/0/folders/0B5wysG7CaEswVnNJdUkyZ29DR2s) 18 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/dump_tfrecords.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from episode_reader import EpisodeReader 3 | 4 | import sys, os 5 | 6 | if __name__ == '__main__': 7 | reader = EpisodeReader(path=sys.argv[1], height=84, width=84) 8 | dir = sys.argv[2] 9 | i = 0 10 | for s, a, x_t_1 in reader.read(): 11 | np.save(os.path.join(dir, '%05d-s') % i, s) 12 | np.save(os.path.join(dir, '%05d-x_t_1') % i, x_t_1) 13 | np.save(os.path.join(dir, '%05d-a' % i), np.asarray([a])) 14 | i += 1 15 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("CartPole-v0") 8 | act = deepq.load("cartpole_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -p long 3 | #SBATCH --gres=gpu:k80:2 4 | #SBATCH -J Pong 5 | #SBATCH -o Pong.log 6 | 7 | GAME=$1 8 | NUM_ACT=$2 9 | COLOR=$3 10 | DATA_DIR="${GAME}_episodes" 11 | TRAIN="${DATA_DIR}/train" 12 | TEST="${DATA_DIR}/test" 13 | MEAN="${DATA_DIR}/mean.npy" 14 | LOG="models/${GAME}" 15 | 16 | hostname 17 | echo $CUDA_VISIBLE_DEVICES 18 | source activate tf 19 | export PYTHONHOME="/home/yclin/miniconda2/envs/tf" 20 | srun python train.py --train ${TRAIN} --test ${TEST} --mean ${MEAN} --num_act ${NUM_ACT} --color ${COLOR} --log ${LOG} 21 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This example uses MsPacman-v0 to demonstrate how to infer the next frame from the current four frames and an action. 3 | ```example.npy``` is a [84, 84, 12] numpy array (DQN settings).
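For reference, a minimal sketch (not part of the original example) of how a state with this layout could be assembled from four consecutive preprocessed RGB frames; the placeholder arrays below stand in for real emulator frames:
```
import numpy as np

# Four consecutive preprocessed RGB frames, each of shape [84, 84, 3].
# Zero placeholders are used here; in practice these come from the Atari emulator.
frames = [np.zeros((84, 84, 3), dtype=np.uint8) for _ in range(4)]

# Stack along the channel axis to obtain the [84, 84, 12] state the model expects.
# The temporal ordering of the frames must match the convention used at training time.
s = np.concatenate(frames, axis=2)
np.save('example.npy', s)
```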
 4 | 5 | # Usage 6 | ``` 7 | python example.py --load {tensorflow model checkpoint} 8 | ``` 9 | 10 | # Integrate with your code 11 | ``` 12 | from tfacvp.model import ActionConditionalVideoPredictionModel 13 | from tfacvp.util import post_process_rgb 14 | 15 | model = ActionConditionalVideoPredictionModel(num_act=num_act, is_train=False) 16 | sess = tf.Session() 17 | model.restore(sess, args.load) 18 | model.predict(sess, s, a) 19 | ``` 20 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame 5 | 6 | 7 | def main(): 8 | env = gym.make("PongNoFrameskip-v4") 9 | env = ScaledFloatFrame(wrap_dqn(env)) 10 | act = deepq.load("pong_model.pkl") 11 | 12 | while True: 13 | obs, done = env.reset(), False 14 | episode_rew = 0 15 | while not done: 16 | env.render() 17 | obs, rew, done, _ = env.step(act(obs[None])[0]) 18 | episode_rew += rew 19 | print("Episode reward", episode_rew) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/compute_mean.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os, sys, cv2 3 | import glob 4 | from tqdm import * 5 | 6 | from episode_reader import EpisodeReader 7 | 8 | if __name__ == '__main__': 9 | path = sys.argv[1] 10 | mean_path = sys.argv[2] 11 | 12 | mean = np.zeros([84, 84, 1], dtype=np.float64) 13 | n = 0 14 | for path in tqdm(glob.glob(os.path.join(path, '*.tfrecords'))): 15 | try: 16 | reader = EpisodeReader(path) 17 | for s, a, x in reader.read(): 18 | mean += s[:,:,-1:] 19 | n += 1 20 | except Exception: 21 | print('Failed to load %s' % path) 22 | mean /= n 23 | np.save(mean_path, mean) 24 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def callback(lcl, glb): 7 | # stop training if the average reward over the last 100 episodes reaches 199 8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 9 | return is_solved 10 | 11 | 12 | def main(): 13 | env = gym.make("CartPole-v0") 14 | model = deepq.models.mlp([64]) 15 | act = deepq.learn( 16 | env, 17 | q_func=model, 18 | lr=1e-3, 19 | max_timesteps=100000, 20 | buffer_size=50000, 21 | exploration_fraction=0.1, 22 | exploration_final_eps=0.02, 23 | print_freq=10, 24 | callback=callback 25 | ) 26 | print("Saving model to cartpole_model.pkl") 27 | act.save("cartpole_model.pkl") 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9),
80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | if sys.version_info.major != 3: 5 | print("This Python is only compatible with Python 3, but you are running " 6 | "Python {}. The installation will likely fail.".format(sys.version_info.major)) 7 | 8 | setup(name='baselines', 9 | packages=[package for package in find_packages() 10 | if package.startswith('baselines')], 11 | install_requires=[ 12 | 'gym>=0.9.1', 13 | 'scipy', 14 | 'tqdm', 15 | 'joblib', 16 | 'zmq', 17 | 'dill', 18 | 'tensorflow >= 1.0.0', 19 | 'azure==1.0.3', 20 | 'progressbar2', 21 | ], 22 | description="OpenAI baselines: high quality implementations of reinforcement learning algorithms", 23 | author="OpenAI", 24 | url='https://github.com/openai/baselines', 25 | author_email="gym@openai.com", 26 | version="0.1.3") 27 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame 5 | 6 | 7 | def main(): 8 | env = gym.make("PongNoFrameskip-v4") 9 | env = ScaledFloatFrame(wrap_dqn(env)) 10 | model = deepq.models.cnn_to_mlp( 11 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 12 | hiddens=[256], 13 | dueling=True 14 | ) 15 | act = deepq.learn( 16 | env, 17 | q_func=model, 18 | lr=1e-4, 19 | max_timesteps=2000000, 20 | buffer_size=10000, 21 | exploration_fraction=0.1, 22 | exploration_final_eps=0.01, 23 | train_freq=4, 24 | learning_starts=10000, 25 | target_network_update_freq=1000, 26 | gamma=0.99, 27 | prioritized_replay=True 28 | ) 29 | act.save("pong_model.pkl") 30 | env.close() 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This example demonstrates how to use ActionConditionalVideoPredictionModel to predict the next frame conditioned on the current state and action. 3 | ```s_t.npy``` is a [84, 84, 12] numpy array (DQN settings). 4 | 5 | # Usage 6 | ``` 7 | python example.py --load {tensorflow model checkpoint} --data {state npy file(e.g.
s_t.npy)} --mean {image mean} --num_act {number of actions in the action space} --act {which action you want to take, 0 <= act < num_act} 8 | ``` 9 | 10 | # Integrate with your code 11 | ``` 12 | from tfacvp.model import ActionConditionalVideoPredictionModel 13 | from tfacvp.util import post_process_gray, pre_process_state_gray 14 | 15 | mean = np.load(meanfile_path) 16 | 17 | sess = tf.Session() 18 | 19 | model = ActionConditionalVideoPredictionModel(num_act=num_act, is_train=False) 20 | model.restore(sess, checkpoint_path) 21 | 22 | s = pre_process_state_gray(s, mean, 1.0 / scale, 4) 23 | model.predict(sess, s, a) 24 | ``` 25 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/summary.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | import argparse 6 | import sys, os 7 | import logging 8 | 9 | from model import ActionConditionalVideoPredictionModel 10 | from dataset import Dataset 11 | 12 | def get_config(args): 13 | config = tf.ConfigProto() 14 | config.gpu_options.allow_growth = True 15 | return config 16 | 17 | def main(args): 18 | with tf.Graph().as_default() as graph: 19 | model = ActionConditionalVideoPredictionModel(num_act=args.num_act) 20 | for var in tf.trainable_variables(): 21 | print(var) 22 | with tf.variable_scope('', reuse=True) as scope: 23 | print(tf.get_variable('conv1/w')) 24 | 25 | 26 | if __name__ == '__main__': 27 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--num_act', help='num acts', type=int, required=True) 30 | args = parser.parse_args() 31 | 32 | main(args) 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.
22 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/episode_reader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import os, sys, cv2 4 | 5 | class EpisodeReader(object): 6 | def __init__(self, path, height=84, width=84): 7 | self.reader = tf.python_io.tf_record_iterator(path=path) 8 | self.height = height 9 | self.width = width 10 | 11 | def read(self): 12 | for string_record in self.reader: 13 | example = tf.train.Example() 14 | example.ParseFromString(string_record) 15 | 16 | a_t = int(example.features.feature['a_t'] 17 | .int64_list 18 | .value[0]) 19 | 20 | s_t_string = (example.features.feature['s_t'] 21 | .bytes_list 22 | .value[0]) 23 | 24 | x_t_1_string = (example.features.feature['x_t_1'] 25 | .bytes_list 26 | .value[0]) 27 | 28 | s_t_raw = np.fromstring(s_t_string, dtype=np.uint8) 29 | s_t = s_t_raw.reshape((self.height, self.width, -1)) 30 | 31 | x_t_1_raw = np.fromstring(x_t_1_string, dtype=np.uint8) 32 | x_t_1 = x_t_1_raw.reshape((self.height, self.width, -1)) 33 | 34 | s_t = s_t.astype(np.float32) 35 | x_t_1 = x_t_1.astype(np.float32) 36 | 37 | yield s_t, a_t, x_t_1 38 | 39 | def __call__(self): 40 | yield self.read() 41 | 42 | 43 | -------------------------------------------------------------------------------- /baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | set_value, 7 | single_threaded_session 8 | ) 9 | 10 | 11 | def test_set_value(): 12 | a = tf.Variable(42.) 13 | with single_threaded_session(): 14 | set_value(a, 5) 15 | assert a.eval() == 5 16 | g = tf.get_default_graph() 17 | g.finalize() 18 | set_value(a, 6) 19 | assert a.eval() == 6 20 | 21 | # test the test 22 | try: 23 | assert a.eval() == 7 24 | except AssertionError: 25 | pass 26 | else: 27 | assert False, "assertion should have failed" 28 | 29 | 30 | def test_function(): 31 | tf.reset_default_graph() 32 | x = tf.placeholder(tf.int32, (), name="x") 33 | y = tf.placeholder(tf.int32, (), name="y") 34 | z = 3 * x + 2 * y 35 | lin = function([x, y], z, givens={y: 0}) 36 | 37 | with single_threaded_session(): 38 | initialize() 39 | 40 | assert lin(2) == 6 41 | assert lin(x=3) == 9 42 | assert lin(2, 2) == 10 43 | assert lin(x=2, y=3) == 12 44 | 45 | 46 | def test_multikwargs(): 47 | tf.reset_default_graph() 48 | x = tf.placeholder(tf.int32, (), name="x") 49 | with tf.variable_scope("other"): 50 | x2 = tf.placeholder(tf.int32, (), name="x") 51 | z = 3 * x + 2 * x2 52 | 53 | lin = function([x, x2], z, givens={x2: 0}) 54 | with single_threaded_session(): 55 | initialize() 56 | assert lin(2) == 6 57 | assert lin(2, 2) == 10 58 | expt_caught = False 59 | try: 60 | lin(x=2) 61 | except AssertionError: 62 | expt_caught = True 63 | assert expt_caught 64 | 65 | 66 | if __name__ == '__main__': 67 | test_set_value() 68 | test_function() 69 | test_multikwargs() 70 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/episode_collector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import os, sys, cv2 4 | 5 | class EpisodeCollector(object): 6 | ''' 7 | Episode logger, this class is designed to collect state, action for 
ActionConditionalVideoPrediction training data 8 | ''' 9 | def __init__(self, path, preprocess_func, skip=4): 10 | # path: Where to save the .tfrecords file. (str) 11 | # preprocess_func: Frame preprocess function. (function) 12 | # skip: Drop the first $skip frames, since common RL algorithms stack 4 frames into one state. 13 | # The first 3 frames of an episode are black, and we don't want to record states that include them. (int) 14 | self.timestep = 0 15 | self.preprocess_func = preprocess_func 16 | self.writer = tf.python_io.TFRecordWriter(path) 17 | self.skip = skip 18 | 19 | def _int64_feature(self, value): 20 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 21 | 22 | def _bytes_feature(self, value): 23 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 24 | 25 | def save(self, s, a, x_next): 26 | # s: RL state, usually 4 stacked frames (e.g. ndarray, shape=[84, 84, 12]) 27 | # a: action (int) 28 | # x_next: next frame. (e.g. ndarray, shape=[84, 84, 3], [210, 160, 3]) 29 | 30 | self.timestep += 1 31 | if self.timestep > self.skip: 32 | s_raw = s.tostring() 33 | 34 | x_next = self.preprocess_func(x_next) 35 | x_next_raw = x_next.tostring() 36 | 37 | example = tf.train.Example(features=tf.train.Features(feature={ 38 | 'a_t': self._int64_feature(a), 39 | 's_t': self._bytes_feature(s_raw), 40 | 'x_t_1': self._bytes_feature(x_next_raw)})) 41 | self.writer.write(example.SerializeToString()) 42 | 43 | def close(self): 44 | self.writer.close() 45 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/download_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import progressbar 3 | 4 | from baselines.common.azure_utils import Container 5 | 6 | 7 | def parse_args(): 8 | parser = argparse.ArgumentParser("Download a pretrained model from Azure.") 9 | # Environment 10 | parser.add_argument("--model-dir", type=str, default=None, 11 | help="save model in this directory.
") 12 | parser.add_argument("--account-name", type=str, default="openaisciszymon", 13 | help="account name for Azure Blob Storage") 14 | parser.add_argument("--account-key", type=str, default=None, 15 | help="account key for Azure Blob Storage") 16 | parser.add_argument("--container", type=str, default="dqn-blogpost", 17 | help="container name and blob name separated by colon serparated by colon") 18 | parser.add_argument("--blob", type=str, default=None, help="blob with the model") 19 | return parser.parse_args() 20 | 21 | 22 | def main(): 23 | args = parse_args() 24 | c = Container(account_name=args.account_name, 25 | account_key=args.account_key, 26 | container_name=args.container) 27 | 28 | if args.blob is None: 29 | print("Listing available models:") 30 | print() 31 | for blob in sorted(c.list(prefix="model-")): 32 | print(blob) 33 | else: 34 | print("Downloading {} to {}...".format(args.blob, args.model_dir)) 35 | bar = None 36 | 37 | def callback(current, total): 38 | nonlocal bar 39 | if bar is None: 40 | bar = progressbar.ProgressBar(max_value=total) 41 | bar.update(current) 42 | 43 | assert c.exists(args.blob), "model {} does not exist".format(args.blob) 44 | 45 | assert args.model_dir is not None 46 | 47 | c.get(args.model_dir, args.blob, callback=callback) 48 | 49 | 50 | if __name__ == '__main__': 51 | main() 52 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | import argparse 5 | import sys, os 6 | import logging 7 | 8 | def get_config(args): 9 | config = tf.ConfigProto() 10 | config.gpu_options.allow_growth = True 11 | return config 12 | 13 | def get_cv_image(img, mean, scale): 14 | return img 15 | 16 | def main(args): 17 | from tfacvp.model import ActionConditionalVideoPredictionModel 18 | from tfacvp.util import post_process_rgb 19 | 20 | with tf.Graph().as_default() as graph: 21 | logging.info('Create model [num_act = %d] for testing' % (args.num_act)) 22 | model = ActionConditionalVideoPredictionModel(num_act=args.num_act, is_train=False) 23 | 24 | config = get_config(args) 25 | s = np.load(args.data) 26 | mean = np.load(args.mean) 27 | scale = 255.0 28 | 29 | with tf.Session(config=config) as sess: 30 | logging.info('Loading weights from %s' % (args.load)) 31 | model.restore(sess, args.load) 32 | 33 | for i in range(args.num_act): 34 | logging.info('Predict next frame condition on action %d' % (i)) 35 | a = np.identity(args.num_act)[i] 36 | x_t_1_pred_batch = model.predict(sess, s[np.newaxis, :], a[np.newaxis, :])[0] 37 | 38 | img = x_t_1_pred_batch[0] 39 | img = post_process(img, mean, scale) 40 | img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) 41 | cv2.imwrite('pred-%02d.png' % i, img) 42 | 43 | 44 | if __name__ == '__main__': 45 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('--data', help='testing data npy', type=str, default='example.npy') 48 | parser.add_argument('--mean', help='image mean path', type=str, default='mean.npy') 49 | parser.add_argument('--load', help='model weight path', type=str, required=True) 50 | parser.add_argument('--num_act', help='num acts', type=int, default=9) 51 | args = parser.parse_args() 52 | main(args) 53 | 54 | 55 | 56 | 
-------------------------------------------------------------------------------- /baselines/deepq/prediction/misc/test_conv1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | import argparse 6 | import sys, os 7 | import logging 8 | 9 | import cPickle as pickle 10 | 11 | from model import ActionConditionalVideoPredictionModel 12 | from dataset import Dataset, CaffeDataset 13 | from util import post_process 14 | 15 | def get_config(args): 16 | config = tf.ConfigProto() 17 | config.gpu_options.allow_growth = True 18 | return config 19 | 20 | def load_caffe_model(x, path): 21 | with open(path, 'rb') as f: 22 | data = pickle.load(f) 23 | w = tf.Variable(data['conv1/w'], dtype=tf.float32) 24 | b = tf.Variable(data['conv1/b'], dtype=tf.float32) 25 | l = tf.nn.conv2d(x, w, strides=[1, 2, 2, 1], padding='VALID', name='conv2d') 26 | l = tf.nn.bias_add(l, b, name='bias_add') 27 | return l 28 | 29 | def main(args): 30 | with tf.Graph().as_default() as graph: 31 | # Create dataset 32 | logging.info('Create data flow from %s' % args.data) 33 | caffe_dataset = CaffeDataset(dir=args.data, num_act=args.num_act, mean_path=args.mean) 34 | 35 | # Config session 36 | config = get_config(args) 37 | 38 | x = tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, 12]) 39 | op = load_caffe_model(x, args.load) 40 | 41 | init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 42 | 43 | # Start session 44 | with tf.Session(config=config) as sess: 45 | sess.run(init) 46 | i = 0 47 | for s, a in caffe_dataset(5): 48 | pred_data = sess.run([op], feed_dict={x: [s]})[0] 49 | print pred_data.shape 50 | np.save('tf-%03d.npy' % i, pred_data) 51 | i += 1 52 | 53 | if __name__ == '__main__': 54 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--log', help='summary directory', type=str, default='caffe-test') 57 | parser.add_argument('--data', help='testing data directory', type=str, required=True) 58 | parser.add_argument('--mean', help='image mean path', type=str, required=True) 59 | parser.add_argument('--load', help='caffe-dumped model path', type=str, required=True) 60 | parser.add_argument('--num_act', help='num acts', type=int, required=True) 61 | args = parser.parse_args() 62 | 63 | main(args) 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.layers as layers 3 | 4 | 5 | def model(img_in, num_actions, scope, reuse=False, concat_softmax=False): 6 | """As described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf""" 7 | with tf.variable_scope(scope, reuse=reuse): 8 | out = img_in 9 | with tf.variable_scope("convnet"): 10 | # original architecture 11 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 12 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 13 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 14 | out = layers.flatten(out) 15 | 16 | with tf.variable_scope("action_value"): 17 | out = layers.fully_connected(out, num_outputs=512, 
activation_fn=tf.nn.relu) 18 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 19 | if concat_softmax: 20 | out = tf.nn.softmax(out) 21 | 22 | return out 23 | 24 | 25 | def dueling_model(img_in, num_actions, scope, reuse=False): 26 | """As described in https://arxiv.org/abs/1511.06581""" 27 | with tf.variable_scope(scope, reuse=reuse): 28 | out = img_in 29 | with tf.variable_scope("convnet"): 30 | # original architecture 31 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 32 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 33 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 34 | out = layers.flatten(out) 35 | 36 | with tf.variable_scope("state_value"): 37 | state_hidden = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 38 | state_score = layers.fully_connected(state_hidden, num_outputs=1, activation_fn=None) 39 | with tf.variable_scope("action_value"): 40 | actions_hidden = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 41 | action_scores = layers.fully_connected(actions_hidden, num_outputs=num_actions, activation_fn=None) 42 | action_scores_mean = tf.reduce_mean(action_scores, 1) 43 | action_scores = action_scores - tf.expand_dims(action_scores_mean, 1) 44 | 45 | return state_score + action_scores 46 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/tf_ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def ReLu(x, name, reuse=False): 5 | with tf.variable_scope(name, reuse=reuse) as scope: 6 | l = tf.nn.relu(x) 7 | return l 8 | 9 | def Conv2D(x, filter_shape, out_dim, strides, padding, name, reuse=False): 10 | # x: input tensor (float32)[n, w, h, c] 11 | # filter_shape: conv2d filter (int)[w, h] 12 | # out_dim: output channels (int) 13 | # strides: conv2d stride size (int) 14 | # padding: padding type (str) 15 | # name: variable scope (str) 16 | 17 | with tf.variable_scope(name, reuse=reuse) as scope: 18 | in_dim = x.get_shape()[-1] 19 | w = tf.get_variable('w', shape=filter_shape + [in_dim, out_dim], initializer=tf.contrib.layers.xavier_initializer_conv2d(uniform=True)) 20 | b = tf.get_variable('b', shape=[out_dim], initializer=tf.constant_initializer(0.0)) 21 | l = tf.nn.conv2d(x, w, strides=[1, strides, strides, 1], padding=padding, name='conv2d') 22 | l = tf.nn.bias_add(l, b, name='bias_add') 23 | return l 24 | 25 | def FC(x, out_dim, name, initializer=tf.contrib.layers.xavier_initializer(uniform=True), reuse=False): 26 | # x: input tensor (float32)[n, in_dim] 27 | # out_dim: output channels (int) 28 | # name: variable scope (str) 29 | 30 | x = tf.contrib.layers.flatten(x) 31 | with tf.variable_scope(name, reuse=reuse) as scope: 32 | in_dim = x.get_shape()[-1] 33 | w = tf.get_variable('w', shape=[in_dim, out_dim], initializer=initializer) 34 | b = tf.get_variable('b', shape=[out_dim], initializer=tf.constant_initializer(0.0)) 35 | l = tf.add(tf.matmul(x, w), b, name='add') 36 | return l 37 | 38 | def Deconv2D(x, filter_shape, output_shape, out_dim, strides, padding, name, reuse=False): 39 | # x: input tensor (float32) [n, w, h, c] 40 | # filter_shape: conv2d filter (int)[w, h] 41 | # out_dim: output channels (int) 42 | # strides: conv2d stride size (int) 43 | # padding: padding type (str) 44 | 
# name: variable scope (str) 45 | 46 | with tf.variable_scope(name, reuse=reuse) as scope: 47 | in_dim = x.get_shape()[-1] 48 | w = tf.get_variable('w', shape=filter_shape + [out_dim, in_dim], initializer=tf.contrib.layers.xavier_initializer_conv2d(uniform=True)) 49 | b = tf.get_variable('b', shape=[out_dim], initializer=tf.constant_initializer(0.0)) 50 | l = tf.nn.conv2d_transpose(x, w, output_shape=output_shape, strides=[1, strides, strides, 1], padding=padding, name='deconv2d') 51 | l = tf.nn.bias_add(l, b, name='bias_add') 52 | return l 53 | 54 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/misc/check.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | import argparse 6 | import sys, os 7 | import logging 8 | 9 | from model import ActionConditionalVideoPredictionModel 10 | from dataset import Dataset 11 | 12 | def get_config(args): 13 | config = tf.ConfigProto() 14 | config.gpu_options.allow_growth = True 15 | return config 16 | 17 | def main(args): 18 | with tf.Graph().as_default() as graph: 19 | # Create dataset 20 | logging.info('Create data flow from %s' % args.train) 21 | train_data = Dataset(directory=args.train, mean_path=args.mean, batch_size=args.batch_size, num_threads=2, capacity=10000) 22 | 23 | # Create initializer 24 | init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 25 | 26 | # Config session 27 | config = get_config(args) 28 | 29 | # Setup summary 30 | check_summary_writer = tf.summary.FileWriter(os.path.join(args.log, 'check'), graph) 31 | 32 | check_op = tf.cast(train_data()['x_t_1'] * 255.0 + train_data()['mean'], tf.uint8) 33 | 34 | tf.summary.image('x_t_1_batch_restore', check_op, collections=['check']) 35 | check_summary_op = tf.summary.merge_all('check') 36 | 37 | # Start session 38 | with tf.Session(config=config) as sess: 39 | coord = tf.train.Coordinator() 40 | sess.run(init) 41 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 42 | for i in range(10): 43 | x_t_1_batch, summary = sess.run([check_op, check_summary_op]) 44 | check_summary_writer.add_summary(summary, i) 45 | coord.request_stop() 46 | coord.join(threads) 47 | 48 | 49 | if __name__ == '__main__': 50 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('--log', help='summary directory', type=str, default='example/log') 53 | parser.add_argument('--train', help='training data directory', type=str, default='example/train') 54 | parser.add_argument('--test', help='testing data directory', type=str, default='example/test') 55 | parser.add_argument('--mean', help='image mean path', type=str, default='example/mean.npy') 56 | parser.add_argument('--lr', help='learning rate', type=float, default=1e-4) 57 | parser.add_argument('--epoch', help='epoch', type=int, default=15000000) 58 | parser.add_argument('--show_per_epoch', help='epoch', type=int, default=1000) 59 | parser.add_argument('--test_per_epoch', help='epoch', type=int, default=2000) 60 | parser.add_argument('--batch_size', help='batch size', type=int, default=32) 61 | parser.add_argument('--test_batch_size', help='batch size', type=int, default=64) 62 | args = parser.parse_args() 63 | 64 | main(args) 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- 
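Note (added for exposition): a minimal usage sketch of the layer helpers defined in `tfacvp/tf_ops.py` above. The layer sizes and names are illustrative assumptions, not the architecture actually built in `tfacvp/model.py`, and a TensorFlow 1.x graph is assumed:
```
import tensorflow as tf
from tfacvp.tf_ops import Conv2D, Deconv2D, FC, ReLu

# Illustrative composition only; the real network lives in tfacvp/model.py.
x = tf.placeholder(tf.float32, [None, 84, 84, 12], name='x')

# 8x8 conv, stride 2, VALID padding: [N, 84, 84, 12] -> [N, 39, 39, 64]
h = ReLu(Conv2D(x, [8, 8], out_dim=64, strides=2, padding='VALID', name='conv_demo'),
         name='relu_demo')

# Fully connected layer on the flattened feature map.
z = FC(h, out_dim=256, name='fc_demo')

# Transposed convolution back to a single frame; output_shape keeps the batch size dynamic.
y = Deconv2D(h, [8, 8], output_shape=tf.stack([tf.shape(x)[0], 84, 84, 3]),
             out_dim=3, strides=2, padding='VALID', name='deconv_demo')
```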
/baselines/deepq/prediction/misc/test_caffe.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import caffe 3 | import six 4 | import numpy as np 5 | from collections import OrderedDict 6 | from caffe.proto import caffe_pb2 as PB 7 | 8 | import net as N 9 | import cv2 10 | import os 11 | 12 | from dataset import CaffeDataset 13 | 14 | import argparse 15 | import logging 16 | 17 | def post_process(data, mean, scale): 18 | t = data.copy().squeeze() 19 | t /= scale 20 | t += mean 21 | t = t.clip(0, 255) 22 | return t.astype('uint8').squeeze().transpose([1, 0, 2]).transpose([0, 2, 1]) 23 | 24 | class CaffeActionConditionalVideoPredictionModel(object): 25 | def __init__(self, mean, weight, K, num_act, num_step=1, data_path='test'): 26 | self.K = K 27 | self.num_act = num_act 28 | self.num_step = num_step 29 | 30 | caffe.set_mode_gpu() 31 | caffe.set_device(0) 32 | 33 | test_net_file, net_proto = N.create_netfile(1, data_path, mean, K, K, 34 | 1, num_act, num_step=self.num_step, mode='test') 35 | 36 | self.test_net = caffe.Net(test_net_file, caffe.TEST) 37 | self.test_net.copy_from(weight) 38 | 39 | def predict(self, s, a, layer='x_hat-05'): 40 | # s: state (1, 4, 84, 84, 3) 41 | # a: action (1, 1, num_act) 42 | 43 | ''' 44 | Load data to test_net 45 | data = [1, K, 84, 84, 3] 46 | ''' 47 | self.test_net.blobs['data'].data[:] = s 48 | self.test_net.blobs['act'].data[:] = a 49 | self.test_net.forward() 50 | 51 | pred_data = self.test_net.blobs[args.layer].data[:] 52 | 53 | return pred_data 54 | 55 | def main(args): 56 | data = CaffeDataset(dir=args.data, num_act=args.num_act, mean_path=args.mean, mode='caffe') 57 | model = CaffeActionConditionalVideoPredictionModel(mean=args.mean, weight=args.weight, K=4, num_act=args.num_act) 58 | 59 | i = 0 60 | w = model.test_net.params['conv1'][0].data[:] 61 | np.save('conv1_w.npy', w) 62 | for s, a in data(5): 63 | pred_data = model.predict(s, a) 64 | print pred_data.shape 65 | np.save('caffe-%03d.npy' % i, pred_data) 66 | #pred_img = post_process(pred_data, data.mean, 1./255) 67 | #cv2.imwrite('%03d-caffe.png' % i, pred_img) 68 | i += 1 69 | 70 | if __name__ == '__main__': 71 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument('--data', help='testing data directory', type=str, required=True) 74 | parser.add_argument('--mean', help='image mean path', type=str, required=True) 75 | parser.add_argument('--weight', help='caffe model', type=str, required=True) 76 | parser.add_argument('--num_act', help='num acts', type=int, required=True) 77 | parser.add_argument('--layer', help='output layer', type=str, required=True) 78 | args = parser.parse_args() 79 | 80 | main(args) 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def 
test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | import argparse 5 | import sys, os 6 | import logging 7 | 8 | def get_config(args): 9 | config = tf.ConfigProto() 10 | config.gpu_options.allow_growth = True 11 | return config 12 | 13 | def get_cv_image(img, mean, scale): 14 | return img 15 | 16 | def main(args): 17 | from tfacvp.model import ActionConditionalVideoPredictionModel 18 | from tfacvp.util import post_process_gray, pre_process_state_gray 19 | 20 | with tf.Graph().as_default() as graph: 21 | # Define tensorflow computation graph 22 | # In this example, I hardcode the arguments num_channel and num_frame for grayscale atari settings 23 | logging.info('Create model [num_act = %d, num_channel = %d, num_frame = %d] for testing' % (args.num_act, 1, 4)) 24 | model = ActionConditionalVideoPredictionModel(num_act=args.num_act, 25 | num_channel=1, num_frame=4, 26 | is_train=False) 27 
| 28 | # Get tensorflow session configuration 29 | config = get_config(args) 30 | 31 | # Load testing state for predicting next frame 32 | scale = 255.0 33 | s = np.load(args.data) 34 | mean = np.load(args.mean) 35 | 36 | with tf.Session(config=config) as sess: 37 | # Restore the model from checkpoint 38 | # If you combine this with your own model, pay attention to variable scopes; otherwise restoring the weights may fail 39 | logging.info('Loading weights from %s' % (args.load)) 40 | model.restore(sess, args.load) 41 | 42 | # Predict next frame conditioned on the specified action 43 | logging.info('Predict next frame conditioned on action %d' % (args.act)) 44 | 45 | # To one hot vector 46 | a = np.identity(args.num_act)[args.act] 47 | 48 | # Predict next frame 49 | s = pre_process_state_gray(s, mean, (1.0 / scale), 4) 50 | print(np.max(s), np.min(s)) 51 | x_t_1_pred_batch = model.predict(sess, s[np.newaxis, :], a[np.newaxis, :])[0] 52 | 53 | # Post process predicted frame for visualization 54 | img = x_t_1_pred_batch[0] 55 | img = post_process_gray(img, mean, scale) 56 | cv2.imwrite('pred.png', img) 57 | 58 | 59 | if __name__ == '__main__': 60 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('--data', help='testing data (.npy), ndarray(shape = [84,84,4])', type=str, required=True) 63 | parser.add_argument('--mean', help='image mean path (should be shipped with pre-trained model)', type=str, required=True) 64 | parser.add_argument('--load', help='model weight path (tensorflow checkpoint)', type=str, required=True) 65 | parser.add_argument('--num_act', help='number of actions in the game\'s action space', type=int, required=True) 66 | parser.add_argument('--act', help='which action you want to take', type=int, required=True) 67 | args = parser.parse_args() 68 | main(args) 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/util.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | def _pre_process(s, mean, scale, num_frame): 6 | # s: [h, w, c*num_frame] 7 | # mean: [h, w, c] 8 | # scale: float32 9 | #print s.shape, mean.shape, np.tile(mean, [1, 1, 4]).shape 10 | #s -= np.tile(mean, [1, 1, num_frame]) 11 | s = s - mean 12 | s = s * scale 13 | return s 14 | 15 | def pre_process_state_rgb(s, mean, scale, num_frame): 16 | return _pre_process(s, mean, scale, num_frame) 17 | 18 | def pre_process_state_gray(s, mean, scale, num_frame): 19 | s = _transform_state_color_space_np(s) 20 | mean = _transform_frame_color_space_np(mean) 21 | return _pre_process(s, mean, scale, num_frame) 22 | 23 | def _post_process(x, mean, scale=255.0): 24 | x *= scale 25 | x += mean 26 | x = np.clip(x, 0, scale) 27 | x = x.astype(np.uint8) 28 | return x 29 | 30 | def post_process_rgb(x, mean, scale): 31 | return _post_process(x, mean, scale) 32 | 33 | def post_process_gray(x, mean, scale): 34 | # x: [h, w, 1] (assume gray) 35 | # mean: [h, w, c*num_frame] 36 | # scale: float32 37 | mean = _transform_frame_color_space_np(mean) 38 | return _post_process(x, mean, scale) 39 | 40 | def _np_one_hot(x, n): 41 | y = np.zeros([len(x), n]) 42 | y[np.arange(len(x)), x] = 1 43 | return y 44 | 45 | def _read_and_decode(directory, s_t_shape, num_act, x_t_1_shape): 46 | filenames = tf.train.match_filenames_once('./%s/*.tfrecords' % (directory))
47 | filename_queue = tf.train.string_input_producer(filenames) 48 | 49 | reader = tf.TFRecordReader() 50 | 51 | _, serialized_example = reader.read(filename_queue) 52 | features = tf.parse_single_example(serialized_example, 53 | features={ 54 | 'a_t': tf.FixedLenFeature([], tf.int64), 55 | 's_t' : tf.FixedLenFeature([], tf.string), 56 | 'x_t_1' : tf.FixedLenFeature([], tf.string), 57 | }) 58 | 59 | s_t = tf.decode_raw(features['s_t'], tf.uint8) 60 | x_t_1 = tf.decode_raw(features['x_t_1'], tf.uint8) 61 | 62 | s_t = tf.reshape(s_t, s_t_shape) 63 | x_t_1 = tf.reshape(x_t_1, x_t_1_shape) 64 | 65 | s_t = tf.cast(s_t, tf.float32) 66 | x_t_1 = tf.cast(x_t_1, tf.float32) 67 | 68 | a_t = tf.cast(features['a_t'], tf.int32) 69 | a_t = tf.one_hot(a_t, num_act) 70 | 71 | return s_t, a_t, x_t_1 72 | 73 | def _transform_frame_color_space(x): 74 | # x: [h, w, c] 75 | return tf.image.rgb_to_grayscale(x) 76 | 77 | def _transform_state_color_space(s): 78 | # s: [h, w, c*num_frame] 79 | num_splits = int(s.shape[-1] / 3) 80 | return tf.concat([_transform_frame_color_space(x) for x in tf.split(s, num_splits, axis=2)], axis=2) 81 | 82 | def _transform_frame_color_space_np(x): 83 | return cv2.cvtColor(x, cv2.COLOR_RGB2GRAY)[:, :, np.newaxis] 84 | 85 | def _transform_state_color_space_np(s): 86 | # s: [h, w, c*num_frame] 87 | num_splits = int(s.shape[-1] / 3) 88 | return np.concatenate([cv2.cvtColor(x, cv2.COLOR_RGB2GRAY)[:,:,np.newaxis] for x in np.split(s, num_splits, axis=2)], axis=2) 89 | 90 | -------------------------------------------------------------------------------- /baselines/deepq/models.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.layers as layers 3 | 4 | 5 | def _mlp(hiddens, inpt, num_actions, scope, reuse=False): 6 | with tf.variable_scope(scope, reuse=reuse): 7 | out = inpt 8 | for hidden in hiddens: 9 | out = layers.fully_connected(out, num_outputs=hidden, activation_fn=tf.nn.relu) 10 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 11 | return out 12 | 13 | 14 | def mlp(hiddens=[]): 15 | """This model takes as input an observation and returns values of all actions. 16 | 17 | Parameters 18 | ---------- 19 | hiddens: [int] 20 | list of sizes of hidden layers 21 | 22 | Returns 23 | ------- 24 | q_func: function 25 | q_function for DQN algorithm. 
26 | """ 27 | return lambda *args, **kwargs: _mlp(hiddens, *args, **kwargs) 28 | 29 | 30 | def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False): 31 | with tf.variable_scope(scope, reuse=reuse): 32 | out = inpt 33 | with tf.variable_scope("convnet"): 34 | for num_outputs, kernel_size, stride in convs: 35 | out = layers.convolution2d(out, 36 | num_outputs=num_outputs, 37 | kernel_size=kernel_size, 38 | stride=stride, 39 | activation_fn=tf.nn.relu) 40 | out = layers.flatten(out) 41 | with tf.variable_scope("action_value"): 42 | action_out = out 43 | for hidden in hiddens: 44 | action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=tf.nn.relu) 45 | action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None) 46 | 47 | if dueling: 48 | with tf.variable_scope("state_value"): 49 | state_out = out 50 | for hidden in hiddens: 51 | state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=tf.nn.relu) 52 | state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None) 53 | action_scores_mean = tf.reduce_mean(action_scores, 1) 54 | action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1) 55 | return state_score + action_scores_centered 56 | else: 57 | return action_scores 58 | return out 59 | 60 | 61 | def cnn_to_mlp(convs, hiddens, dueling=False): 62 | """This model takes as input an observation and returns values of all actions. 63 | 64 | Parameters 65 | ---------- 66 | convs: [(int, int int)] 67 | list of convolutional layers in form of 68 | (num_outputs, kernel_size, stride) 69 | hiddens: [int] 70 | list of sizes of hidden layers 71 | dueling: bool 72 | if true double the output MLP to compute a baseline 73 | for action scores 74 | 75 | Returns 76 | ------- 77 | q_func: function 78 | q_function for DQN algorithm. 79 | """ 80 | 81 | return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, *args, **kwargs) 82 | 83 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/wang2015_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | import os 5 | 6 | import baselines.common.tf_util as U 7 | 8 | from baselines import deepq 9 | from baselines.common.misc_util import get_wrapper_by_name, SimpleMonitor, boolean_flag, set_global_seeds 10 | from baselines.common.atari_wrappers_deprecated import wrap_dqn 11 | from baselines.deepq.experiments.atari.model import model, dueling_model 12 | 13 | 14 | def make_env(game_name): 15 | env = gym.make(game_name + "NoFrameskip-v4") 16 | env_monitored = SimpleMonitor(env) 17 | env = wrap_dqn(env_monitored) 18 | return env_monitored, env 19 | 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser("Evaluate an already learned DQN model.") 23 | # Environment 24 | parser.add_argument("--env", type=str, required=True, help="name of the game") 25 | parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. 
") 26 | boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value") 27 | boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model") 28 | 29 | return parser.parse_args() 30 | 31 | 32 | def wang2015_eval(game_name, act, stochastic): 33 | print("==================== wang2015 evaluation ====================") 34 | episode_rewards = [] 35 | 36 | for num_noops in range(1, 31): 37 | env_monitored, eval_env = make_env(game_name) 38 | eval_env.unwrapped.seed(1) 39 | 40 | get_wrapper_by_name(eval_env, "NoopResetEnv").override_num_noops = num_noops 41 | 42 | eval_episode_steps = 0 43 | done = True 44 | while True: 45 | if done: 46 | obs = eval_env.reset() 47 | eval_episode_steps += 1 48 | action = act(np.array(obs)[None], stochastic=stochastic)[0] 49 | 50 | obs, reward, done, info = eval_env.step(action) 51 | if done: 52 | obs = eval_env.reset() 53 | if len(info["rewards"]) > 0: 54 | episode_rewards.append(info["rewards"][0]) 55 | break 56 | if info["steps"] > 108000: # 5 minutes of gameplay 57 | episode_rewards.append(env_monitored._current_reward) 58 | break 59 | print("Num steps in episode {} was {} yielding {} reward".format( 60 | num_noops, eval_episode_steps, episode_rewards[-1]), flush=True) 61 | print("Evaluation results: " + str(np.mean(episode_rewards))) 62 | print("=============================================================") 63 | return np.mean(episode_rewards) 64 | 65 | 66 | def main(): 67 | set_global_seeds(1) 68 | args = parse_args() 69 | with U.make_session(4) as sess: # noqa 70 | _, env = make_env(args.env) 71 | act = deepq.build_act( 72 | make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), 73 | q_func=dueling_model if args.dueling else model, 74 | num_actions=env.action_space.n) 75 | 76 | U.load_state(os.path.join(args.model_dir, "saved")) 77 | wang2015_eval(args.env, act, stochastic=args.stochastic) 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Detecting Adversarial Attacks on Neural Network Policies with Visual Foresight 2 | 3 | ![](https://user-images.githubusercontent.com/7057863/30933455-9e86ba96-a398-11e7-87fa-d6339ad60c51.gif) 4 | 5 | **DISCLAIMER**: This repository is a modified version of [openai/baselines](https://github.com/openai/gym). 6 | 7 | ### Publication 8 | 9 | Paper: https://drive.google.com/file/d/0B50cbskLVq-ed2F3eUw4SWQxbUU/view 10 | 11 | ``` 12 | @article{Lin2017RLAttackDetection, 13 | title={Detecting Adversarial Attacks on Neural Network Policies with Visual Foresight}, 14 | author={Lin, Yen-Chen and Liu, Ming-Yu and Sun, Min and Huang, Jia-Bin}, 15 | journal={arXiv preprint arXiv:1710.00814}, 16 | year={2017} 17 | } 18 | ``` 19 | 20 | 21 | ### Dependencies 22 | - Python 3 23 | - cleverhans v2.0.0 24 | 25 | ``` 26 | pip install -e git+http://github.com/tensorflow/cleverhans.git#egg=cleverhans 27 | ``` 28 | 29 | - others (e.g., gym, baselines, ...) 30 | 31 | ``` 32 | git clone https://github.com/yenchenlin/rl-attack-detection.git 33 | cd rl-attack-detection 34 | pip install -e . 35 | ``` 36 | 37 | 38 | ### Example 39 | Here I'll use Atari game Freeway as an example to demonstrate how to run the code. 40 | 41 | Let's start by switch to the home directory: 42 | 43 | ``` 44 | cd rl-attack-detection 45 | ``` 46 | 47 | **1. 
Download pre-trained agent** 48 | 49 | Download [this repository](https://drive.google.com/open?id=0B50cbskLVq-eRzBtNktCVE1SSms) which contains pre-trained DQN agents for Freeway to `./atari-pre-trained-agents/`. 50 | 51 | **2. Run pre-trained agent** 52 | 53 | Test the performance of the pre-trained agent: 54 | 55 | ``` 56 | python -m baselines.deepq.experiments.atari.enjoy --model-dir ./atari-pre-trained-agents/Freeway --env Freeway 57 | ``` 58 | 59 | For game Freeway, you should see output similar to follows: 60 | 61 | ``` 62 | 29.0 63 | 27.0 64 | 28.0 65 | ... 66 | ``` 67 | This means that our agent is now a master of the game! 68 | 69 | **3. Perform adversarial attack** 70 | 71 | Use adversarial example crafted by FGSM to attack deep RL agent: 72 | 73 | ``` 74 | python -m baselines.deepq.experiments.atari.enjoy --model-dir ./atari-pre-trained-agents/Freeway --env Freeway --attack fgsm 75 | ``` 76 | 77 | **Other attacks:** argument passed to `--attack` can be `fgsm`, `iterative`, `cwl2`. 78 | 79 | 80 | You should see output similar to follows: 81 | 82 | ``` 83 | 0.0 84 | 0.0 85 | 0.0 86 | ... 87 | ``` 88 | 89 | which means that the agent is fooled by adversary and went crazy! 90 | 91 | **4. Use visual foresight as defense** 92 | 93 | To protect the agent, first download [this repository](https://drive.google.com/drive/folders/0B50cbskLVq-eTGxqNWtkSGJsRzQ) which contains pre-trained visual foresight module for Freeway to `./atari-visual-foresight/`. 94 | 95 | Then, we can use visual foresight to protect deep RL agent: 96 | 97 | ``` 98 | python -m baselines.deepq.experiments.atari.enjoy --model-dir ./atari-pre-trained-agents/Freeway --env Freeway --attack fgsm --defense foresight 99 | ``` 100 | 101 | Now, you should see similar outputs to **step. 2**, which means that our agents work well again. 102 | 103 | ### Add More Attacks 104 | To use new attack methods, you can add the attack code [here](https://github.com/yenchenlin/rl-attack-detection/blob/master/baselines/deepq/build_graph.py#L156). 105 | Generally, attack methods that follow the interface of [cleverhans](https://github.com/tensorflow/cleverhans) can be added within few lines. 
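As a rough sketch (not code that ships with this repo), an attack that follows the cleverhans `Attack` interface could be wired in along these lines. The helper name `build_craft_adv_obs` is made up for illustration, the surrounding names (`observations_ph`, `q_func`, `num_actions`) stand in for whatever `build_graph.py` exposes at that point, and the attack parameters are placeholders:

```
# Hypothetical sketch: plugging another cleverhans v2 attack into the
# adversarial-example graph. Any Attack subclass with a `generate` method
# can be substituted for FastGradientMethod.
import tensorflow as tf
from cleverhans.model import CallableModelWrapper
from cleverhans.attacks import FastGradientMethod


def build_craft_adv_obs(observations_ph, q_func, num_actions):
    # cleverhans treats the Q-network's action values as logits.
    wrapped_q = CallableModelWrapper(
        lambda obs: q_func(obs, num_actions, scope="q_func", reuse=True), "logits")
    attack = FastGradientMethod(wrapped_q, sess=tf.get_default_session())
    # Placeholder parameters: tune eps and the clip range to your input scale.
    return attack.generate(observations_ph, eps=0.01, clip_min=0.0, clip_max=1.0)
```

The crafted tensor is then returned next to `act` (see how `enjoy.py` unpacks `act, craft_adv_obs = deepq.build_act(...)`), so the rest of the evaluation loop needs no changes.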
106 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/custom_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import itertools 3 | import numpy as np 4 | import tensorflow as tf 5 | import tensorflow.contrib.layers as layers 6 | 7 | import baselines.common.tf_util as U 8 | 9 | from baselines import logger 10 | from baselines import deepq 11 | from baselines.deepq.replay_buffer import ReplayBuffer 12 | from baselines.common.schedules import LinearSchedule 13 | 14 | 15 | def model(inpt, num_actions, scope, reuse=False): 16 | """This model takes as input an observation and returns values of all actions.""" 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = inpt 19 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) 20 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 21 | return out 22 | 23 | 24 | if __name__ == '__main__': 25 | with U.make_session(8): 26 | # Create the environment 27 | env = gym.make("CartPole-v0") 28 | # Create all the functions necessary to train the model 29 | act, train, update_target, debug = deepq.build_train( 30 | make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), 31 | q_func=model, 32 | num_actions=env.action_space.n, 33 | optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), 34 | ) 35 | # Create the replay buffer 36 | replay_buffer = ReplayBuffer(50000) 37 | # Create the schedule for exploration starting from 1 (every action is random) down to 38 | # 0.02 (98% of actions are selected according to values predicted by the model). 39 | exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) 40 | 41 | # Initialize the parameters and copy them to the target network. 42 | U.initialize() 43 | update_target() 44 | 45 | episode_rewards = [0.0] 46 | obs = env.reset() 47 | for t in itertools.count(): 48 | # Take action and update exploration to the newest value 49 | action = act(obs[None], update_eps=exploration.value(t))[0] 50 | new_obs, rew, done, _ = env.step(action) 51 | # Store transition in the replay buffer. 52 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 53 | obs = new_obs 54 | 55 | episode_rewards[-1] += rew 56 | if done: 57 | obs = env.reset() 58 | episode_rewards.append(0) 59 | 60 | is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 61 | if is_solved: 62 | # Show off the result 63 | env.render() 64 | else: 65 | # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 66 | if t > 1000: 67 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) 68 | train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) 69 | # Update target network periodically. 
70 | if t % 1000 == 0: 71 | update_target() 72 | 73 | if done and len(episode_rewards) % 10 == 0: 74 | logger.record_tabular("steps", t) 75 | logger.record_tabular("episodes", len(episode_rewards)) 76 | logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) 77 | logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) 78 | logger.dump_tabular() 79 | -------------------------------------------------------------------------------- /baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 
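            # (t is below the first endpoint or at/after the last one); fall back to
            # outside_value, or fail loudly if none was configured.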
72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | import argparse 6 | import sys, os 7 | import logging 8 | 9 | from tfacvp.model import ActionConditionalVideoPredictionModel 10 | from tfacvp.dataset import Dataset 11 | 12 | MODEL_NUM_CHANNELS_DEFS = {'rgb': 3, 'gray': 1} 13 | MODEL_NUM_FRAMES = 4 14 | DATASET_NUM_CHANNELS = 1 15 | DATASET_NUM_FRAMES = 4 16 | S_SHAPE = (84, 84, DATASET_NUM_CHANNELS * DATASET_NUM_FRAMES) 17 | X_SHAPE = (84, 84, DATASET_NUM_CHANNELS) 18 | 19 | def get_config(args): 20 | config = tf.ConfigProto() 21 | config.gpu_options.allow_growth = True 22 | return config 23 | 24 | def main(args): 25 | with tf.Graph().as_default() as graph: 26 | # Create dataset 27 | logging.info('Create data flow from %s [colorspace = %s]' % (args.train, args.color)) 28 | train_data = Dataset(directory=args.train, 29 | num_act=args.num_act, 30 | mean_path=args.mean, 31 | batch_size=args.batch_size, 32 | s_t_shape=S_SHAPE, 33 | x_t_1_shape=X_SHAPE, 34 | num_threads=4, capacity=10000) 35 | 36 | # Create model 37 | logging.info('Create model for training [lr = %f, epochs = %d, batch_size = %d]' % (args.lr, args.epoch, args.batch_size) ) 38 | model = ActionConditionalVideoPredictionModel(inputs=train_data(), 39 | num_act=args.num_act, 40 | num_channel=MODEL_NUM_CHANNELS_DEFS[args.color], 41 | num_frame=MODEL_NUM_FRAMES, 42 | optimizer_args={'lr': args.lr}) 43 | 44 | # Create prediction summary 45 | ground_truth_image = tf.cast(model.inputs['x_t_1'] * 255.0 + train_data.mean_const, tf.uint8) 46 | pred_image = tf.cast(model.output * 255.0 + train_data.mean_const, tf.uint8) 47 | tf.summary.image('ground', ground_truth_image, collections=['train']) 48 | tf.summary.image('pred', pred_image, collections=['train']) 49 | 50 | # Create initializer 51 | init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 52 | 53 | # Get optimizer operation and loss opearation from model 54 | train_op = model.train 55 | loss_op = model.loss 56 | global_step_var = model.global_step 57 | 58 | # Config session 59 | config = get_config(args) 60 | 61 | # Setup summary 62 | train_summary_op = tf.summary.merge_all('train') 63 | 64 | # Setup supervisor 65 | sv = tf.train.Supervisor(logdir=os.path.join(args.log, 'train'), 66 | init_op=init, 67 | graph=graph, 68 | summary_op=train_summary_op, 69 | global_step=global_step_var, 70 | saver=tf.train.Saver(max_to_keep=None), 71 | save_model_secs=1200) 72 | 
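        # The Supervisor runs init_op on first launch, saves a checkpoint every
        # save_model_secs seconds, and owns the summary writer used by
        # sv.summary_computed() below.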
73 | # Start session 74 | with sv.managed_session(config=config) as sess: 75 | sv.start_queue_runners(sess) 76 | for epoch in range(args.epoch): 77 | if (epoch) % args.show_per_epoch == 0: 78 | _, train_loss, train_summary, global_step = sess.run([train_op, loss_op, train_summary_op, global_step_var]) 79 | logging.info('Epoch %d: Training L2 loss = %f' % (global_step, train_loss)) 80 | sv.summary_computed(sess, train_summary) 81 | else: 82 | sess.run([train_op]) 83 | sv.request_stop() 84 | 85 | 86 | if __name__ == '__main__': 87 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('--log', help='summary directory', type=str, default='example/log') 90 | parser.add_argument('--train', help='training data directory', type=str, required=True) 91 | parser.add_argument('--test', help='testing data directory', type=str, required=True) 92 | parser.add_argument('--mean', help='image mean path', type=str, required=True) 93 | parser.add_argument('--num_act', help='num acts', type=int, required=True) 94 | parser.add_argument('--color', help='colorspace', type=str, choices=['rgb', 'gray'], required=True) 95 | parser.add_argument('--lr', help='learning rate', type=float, default=1e-4) 96 | parser.add_argument('--epoch', help='epoch', type=int, default=15000000) 97 | parser.add_argument('--show_per_epoch', help='epoch', type=int, default=1000) 98 | parser.add_argument('--test_per_epoch', help='epoch', type=int, default=2000) 99 | parser.add_argument('--batch_size', help='batch size', type=int, default=32) 100 | parser.add_argument('--test_batch_size', help='batch size', type=int, default=64) 101 | args = parser.parse_args() 102 | 103 | main(args) 104 | -------------------------------------------------------------------------------- /baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as regular array, but with two 11 | important differences: 12 | 13 | a) setting item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 15 | b) user has access to an efficient `reduce` 16 | operation which reduces `operation` over 17 | a contiguous subsequence of items in the 18 | array. 19 | 20 | Paramters 21 | --------- 22 | capacity: int 23 | Total size of the array - must be a power of two. 24 | operation: lambda obj, obj -> obj 25 | and operation for combining elements (eg. sum, max) 26 | must for a mathematical group together with the set of 27 | possible values for array elements. 28 | neutral_element: obj 29 | neutral element for the operation above. eg. float('-inf') 30 | for max and 0 for sum. 31 | """ 32 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 
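        # (capacity & (capacity - 1)) == 0 is the standard bit trick for detecting
        # a power of two; zero itself is excluded by the capacity > 0 check.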
33 | self._capacity = capacity 34 | self._value = [neutral_element for _ in range(2 * capacity)] 35 | self._operation = operation 36 | 37 | def _reduce_helper(self, start, end, node, node_start, node_end): 38 | if start == node_start and end == node_end: 39 | return self._value[node] 40 | mid = (node_start + node_end) // 2 41 | if end <= mid: 42 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 43 | else: 44 | if mid + 1 <= start: 45 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 46 | else: 47 | return self._operation( 48 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 49 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 50 | ) 51 | 52 | def reduce(self, start=0, end=None): 53 | """Returns result of applying `self.operation` 54 | to a contiguous subsequence of the array. 55 | 56 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 57 | 58 | Parameters 59 | ---------- 60 | start: int 61 | beginning of the subsequence 62 | end: int 63 | end of the subsequences 64 | 65 | Returns 66 | ------- 67 | reduced: obj 68 | result of reducing self.operation over the specified range of array elements. 69 | """ 70 | if end is None: 71 | end = self._capacity 72 | if end < 0: 73 | end += self._capacity 74 | end -= 1 75 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 76 | 77 | def __setitem__(self, idx, val): 78 | # index of the leaf 79 | idx += self._capacity 80 | self._value[idx] = val 81 | idx //= 2 82 | while idx >= 1: 83 | self._value[idx] = self._operation( 84 | self._value[2 * idx], 85 | self._value[2 * idx + 1] 86 | ) 87 | idx //= 2 88 | 89 | def __getitem__(self, idx): 90 | assert 0 <= idx < self._capacity 91 | return self._value[self._capacity + idx] 92 | 93 | 94 | class SumSegmentTree(SegmentTree): 95 | def __init__(self, capacity): 96 | super(SumSegmentTree, self).__init__( 97 | capacity=capacity, 98 | operation=operator.add, 99 | neutral_element=0.0 100 | ) 101 | 102 | def sum(self, start=0, end=None): 103 | """Returns arr[start] + ... + arr[end]""" 104 | return super(SumSegmentTree, self).reduce(start, end) 105 | 106 | def find_prefixsum_idx(self, prefixsum): 107 | """Find the highest index `i` in the array such that 108 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 109 | 110 | if array values are probabilities, this function 111 | allows to sample indexes according to the discrete 112 | probability efficiently. 
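        For example, if the array holds [0.3, 0.1, 0.4, 0.2], then
        find_prefixsum_idx(0.35) returns 1, since 0.3 <= 0.35 but 0.3 + 0.1 > 0.35.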
113 | 114 | Parameters 115 | ---------- 116 | perfixsum: float 117 | upperbound on the sum of array prefix 118 | 119 | Returns 120 | ------- 121 | idx: int 122 | highest index satisfying the prefixsum constraint 123 | """ 124 | assert 0 <= prefixsum <= self.sum() + 1e-5 125 | idx = 1 126 | while idx < self._capacity: # while non-leaf 127 | if self._value[2 * idx] > prefixsum: 128 | idx = 2 * idx 129 | else: 130 | prefixsum -= self._value[2 * idx] 131 | idx = 2 * idx + 1 132 | return idx - self._capacity 133 | 134 | 135 | class MinSegmentTree(SegmentTree): 136 | def __init__(self, capacity): 137 | super(MinSegmentTree, self).__init__( 138 | capacity=capacity, 139 | operation=min, 140 | neutral_element=float('inf') 141 | ) 142 | 143 | def min(self, start=0, end=None): 144 | """Returns min(arr[start], ..., arr[end])""" 145 | 146 | return super(MinSegmentTree, self).reduce(start, end) 147 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/enjoy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import os 4 | import numpy as np 5 | 6 | from gym.monitoring import VideoRecorder 7 | 8 | import baselines.common.tf_util as U 9 | 10 | from baselines import deepq 11 | from baselines.common.misc_util import ( 12 | boolean_flag, 13 | SimpleMonitor, 14 | ) 15 | from baselines.common.atari_wrappers_deprecated import wrap_dqn 16 | from baselines.deepq.experiments.atari.model import model, dueling_model 17 | import tensorflow as tf 18 | import cv2 19 | from collections import deque 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser("Run an already learned DQN model.") 24 | # Environment 25 | parser.add_argument("--env", type=str, required=True, help="name of the game") 26 | parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. 
") 27 | parser.add_argument("--video", type=str, default=None, help="Path to mp4 file where the video of first episode will be recorded.") 28 | boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value") 29 | boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model") 30 | parser.add_argument("--attack", type=str, default=None, help="Method to attack the model.") 31 | parser.add_argument("--defense", type=str, default=None, help="Method to defend the attack.") 32 | 33 | return parser.parse_args() 34 | 35 | 36 | def make_env(game_name): 37 | env = gym.make(game_name + "NoFrameskip-v4") 38 | env = SimpleMonitor(env) 39 | env = wrap_dqn(env) 40 | return env 41 | 42 | 43 | def load_visual_foresight(game_name): 44 | sess = U.get_session() 45 | from baselines.deepq.prediction.tfacvp.model import ActionConditionalVideoPredictionModel 46 | gen_dir = './atari-visual-foresight/' 47 | model_path = os.path.join(gen_dir, '{}/model.ckpt'.format(game_name)) 48 | mean_path = os.path.join(gen_dir, '{}/mean.npy'.format(game_name)) 49 | game_screen_mean = np.load(mean_path) 50 | with tf.variable_scope('G'): 51 | foresight = ActionConditionalVideoPredictionModel(num_act=env.action_space.n, num_channel=1, is_train=False) 52 | foresight.restore(sess, model_path, 'G') 53 | return foresight, game_screen_mean 54 | 55 | 56 | def foresee(sess, obs, act, gt, mean, model, n_actions, step): 57 | onehot_act = np.zeros((1, n_actions)) 58 | onehot_act[0, act] = 1 59 | obs = obs - mean[None] 60 | obs = obs * 1/255.0 61 | pred_frame = model.predict(sess, obs, onehot_act)[0] 62 | pred_frame = pred_frame* 255.0 63 | pred_frame = pred_frame + mean[None] 64 | #print(gt[:, :, -1].shape, pred_frame.shape) 65 | #print(np.sum(gt[:, :, -1][:, :, np.newaxis] - pred_frame[0, :, :, :])) 66 | #cv2.imwrite('./tmp/gt_{}.png'.format(step), gt[:, :, -1][:, :, np.newaxis]) 67 | #cv2.imwrite('./tmp/pred_{}.png'.format(step), pred_frame[0, :, :, :]) 68 | return pred_frame[0, :, :, 0] 69 | 70 | 71 | def play(env, act, craft_adv_obs, stochastic, video_path, game_name, attack, defense): 72 | if defense == 'foresight': 73 | vf, game_screen_mean = load_visual_foresight(game_name) 74 | pred_obs = deque(maxlen=4) 75 | 76 | num_episodes = 0 77 | video_recorder = None 78 | video_recorder = VideoRecorder( 79 | env, video_path, enabled=video_path is not None) 80 | 81 | t = 0 82 | obs = env.reset() 83 | while True: 84 | #env.unwrapped.render() 85 | video_recorder.capture_frame() 86 | 87 | # Attack 88 | if craft_adv_obs != None: 89 | # Craft adv. 
examples 90 | adv_obs = craft_adv_obs(np.array(obs)[None], stochastic=stochastic)[0] 91 | action = act(np.array(adv_obs)[None], stochastic=stochastic)[0] 92 | else: 93 | # Normal 94 | action = act(np.array(obs)[None], stochastic=stochastic)[0] 95 | 96 | # Defense 97 | if t > 4 and defense == 'foresight': 98 | pred_obs.append( 99 | foresee(U.get_session(), old_obs, old_action, np.array(obs), game_screen_mean, vf, 100 | env.action_space.n, t) 101 | ) 102 | if len(pred_obs) == 4: 103 | action = act(np.stack(pred_obs, axis=2)[None], stochastic=stochastic)[0] 104 | 105 | old_obs = obs 106 | old_action = action 107 | 108 | # RL loop 109 | obs, rew, done, info = env.step(action) 110 | t += 1 111 | if done: 112 | t = 0 113 | obs = env.reset() 114 | if len(info["rewards"]) > num_episodes: 115 | if len(info["rewards"]) == 1 and video_recorder.enabled: 116 | # save video of first episode 117 | print("Saved video.") 118 | video_recorder.close() 119 | video_recorder.enabled = False 120 | print(info["rewards"][-1]) 121 | num_episodes = len(info["rewards"]) 122 | 123 | 124 | if __name__ == '__main__': 125 | with U.make_session(4) as sess: 126 | args = parse_args() 127 | env = make_env(args.env) 128 | # Build graph and load agents 129 | act, craft_adv_obs = deepq.build_act( 130 | make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), 131 | q_func=dueling_model if args.dueling else model, 132 | num_actions=env.action_space.n, 133 | attack=args.attack, 134 | model_path=os.path.join(args.model_dir, "saved") 135 | ) 136 | play(env, act, craft_adv_obs, args.stochastic, args.video, args.env, args.attack, args.defense) 137 | -------------------------------------------------------------------------------- /baselines/common/azure_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import zipfile 4 | 5 | from azure.common import AzureMissingResourceHttpError 6 | from azure.storage.blob import BlobService 7 | from shutil import unpack_archive 8 | from threading import Event 9 | 10 | """TODOS: 11 | - use Azure snapshots instead of hacky backups 12 | """ 13 | 14 | 15 | def fixed_list_blobs(service, *args, **kwargs): 16 | """By defualt list_containers only returns a subset of results. 17 | 18 | This function attempts to fix this. 
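    It does so by calling list_blobs repeatedly and following next_marker until
    the service returns an empty marker.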
19 | """ 20 | res = [] 21 | next_marker = None 22 | while next_marker is None or len(next_marker) > 0: 23 | kwargs['marker'] = next_marker 24 | gen = service.list_blobs(*args, **kwargs) 25 | for b in gen: 26 | res.append(b.name) 27 | next_marker = gen.next_marker 28 | return res 29 | 30 | 31 | def make_archive(source_path, dest_path): 32 | if source_path.endswith(os.path.sep): 33 | source_path = source_path.rstrip(os.path.sep) 34 | prefix_path = os.path.dirname(source_path) 35 | with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_STORED) as zf: 36 | if os.path.isdir(source_path): 37 | for dirname, subdirs, files in os.walk(source_path): 38 | zf.write(dirname, os.path.relpath(dirname, prefix_path)) 39 | for filename in files: 40 | filepath = os.path.join(dirname, filename) 41 | zf.write(filepath, os.path.relpath(filepath, prefix_path)) 42 | else: 43 | zf.write(source_path, os.path.relpath(source_path, prefix_path)) 44 | 45 | 46 | class Container(object): 47 | services = {} 48 | 49 | def __init__(self, account_name, account_key, container_name, maybe_create=False): 50 | self._account_name = account_name 51 | self._container_name = container_name 52 | if account_name not in Container.services: 53 | Container.services[account_name] = BlobService(account_name, account_key) 54 | self._service = Container.services[account_name] 55 | if maybe_create: 56 | self._service.create_container(self._container_name, fail_on_exist=False) 57 | 58 | def put(self, source_path, blob_name, callback=None): 59 | """Upload a file or directory from `source_path` to azure blob `blob_name`. 60 | 61 | Upload progress can be traced by an optional callback. 62 | """ 63 | upload_done = Event() 64 | 65 | def progress_callback(current, total): 66 | if callback: 67 | callback(current, total) 68 | if current >= total: 69 | upload_done.set() 70 | 71 | # Attempt to make backup if an existing version is already available 72 | try: 73 | x_ms_copy_source = "https://{}.blob.core.windows.net/{}/{}".format( 74 | self._account_name, 75 | self._container_name, 76 | blob_name 77 | ) 78 | self._service.copy_blob( 79 | container_name=self._container_name, 80 | blob_name=blob_name + ".backup", 81 | x_ms_copy_source=x_ms_copy_source 82 | ) 83 | except AzureMissingResourceHttpError: 84 | pass 85 | 86 | with tempfile.TemporaryDirectory() as td: 87 | arcpath = os.path.join(td, "archive.zip") 88 | make_archive(source_path, arcpath) 89 | self._service.put_block_blob_from_path( 90 | container_name=self._container_name, 91 | blob_name=blob_name, 92 | file_path=arcpath, 93 | max_connections=4, 94 | progress_callback=progress_callback, 95 | max_retries=10) 96 | upload_done.wait() 97 | 98 | def get(self, dest_path, blob_name, callback=None): 99 | """Download a file or directory to `dest_path` to azure blob `blob_name`. 100 | 101 | Warning! If directory is downloaded the `dest_path` is the parent directory. 102 | 103 | Upload progress can be traced by an optional callback. 
104 | """ 105 | download_done = Event() 106 | 107 | def progress_callback(current, total): 108 | if callback: 109 | callback(current, total) 110 | if current >= total: 111 | download_done.set() 112 | 113 | with tempfile.TemporaryDirectory() as td: 114 | arcpath = os.path.join(td, "archive.zip") 115 | for backup_blob_name in [blob_name, blob_name + '.backup']: 116 | try: 117 | blob_size = self._service.get_blob_properties( 118 | blob_name=backup_blob_name, 119 | container_name=self._container_name 120 | )['content-length'] 121 | if int(blob_size) > 0: 122 | self._service.get_blob_to_path( 123 | container_name=self._container_name, 124 | blob_name=backup_blob_name, 125 | file_path=arcpath, 126 | max_connections=4, 127 | progress_callback=progress_callback, 128 | max_retries=10) 129 | unpack_archive(arcpath, dest_path) 130 | download_done.wait() 131 | return True 132 | except AzureMissingResourceHttpError: 133 | pass 134 | return False 135 | 136 | def list(self, prefix=None): 137 | """List all blobs in the container.""" 138 | return fixed_list_blobs(self._service, self._container_name, prefix=prefix) 139 | 140 | def exists(self, blob_name): 141 | """Returns true if `blob_name` exists in container.""" 142 | try: 143 | self._service.get_blob_properties( 144 | blob_name=blob_name, 145 | container_name=self._container_name 146 | ) 147 | return True 148 | except AzureMissingResourceHttpError: 149 | return False 150 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/dataset.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import logging 4 | import os, glob, cv2, re 5 | 6 | from tool.episode_reader import EpisodeReader 7 | from tfacvp.util import _read_and_decode 8 | from tfacvp.util import * 9 | 10 | class Dataset(object): 11 | def __init__(self, directory, num_act, mean_path, num_threads=1, capacity=1e5, batch_size=32, 12 | scale=(1.0/255.0), s_t_shape=[84, 84, 4], x_t_1_shape=[84, 84, 1], colorspace='gray'): 13 | self.scale = scale 14 | self.s_t_shape = s_t_shape 15 | self.x_t_1_shape = x_t_1_shape 16 | 17 | # Load image mean 18 | mean = np.load(os.path.join(mean_path)) 19 | 20 | # Prepare data flow 21 | s_t, a_t, x_t_1 = _read_and_decode(directory, 22 | s_t_shape=s_t_shape, 23 | num_act=num_act, 24 | x_t_1_shape=x_t_1_shape) 25 | self.mean = mean 26 | self.s_t_batch, self.a_t_batch, self.x_t_1_batch = tf.train.shuffle_batch([s_t, a_t, x_t_1], 27 | batch_size=batch_size, capacity=capacity, 28 | min_after_dequeue=int(capacity*0.25), 29 | num_threads=num_threads) 30 | 31 | # Subtract image mean (according to J Oh design) 32 | self.mean_const = tf.constant(mean, dtype=tf.float32) 33 | print(self.mean_const.get_shape()) 34 | self.s_t_batch = (self.s_t_batch - tf.tile(self.mean_const, [1, 1, 4])) * scale 35 | self.x_t_1_batch = (self.x_t_1_batch - self.mean_const) * scale 36 | 37 | def __call__(self): 38 | return {'s_t': self.s_t_batch, 39 | 'a_t': self.a_t_batch, 40 | 'x_t_1': self.x_t_1_batch} 41 | 42 | class CaffeDataset(object): 43 | ''' 44 | Used to load data with directory structure in original paper 45 | ''' 46 | def __init__(self, dir, num_act, mean_path, mode='tf', scale=(1./255.), img_shape=[84, 84], num_frame=4, num_channel=3): 47 | # dir: image data directory, each image should be named as %05d.png 48 | # num_act: number of action in action space (only support discrete action) 49 | # mean_path: mean image file path (NOTE: you must 
convert mean.binaryproto to npy file) 50 | # mode: tf or caffe (differ in s, a format) 51 | # num_frame: initial frame 52 | # num_channel: number of channel per frame 53 | self.num_act = num_act 54 | self.dir = dir 55 | self.mode = mode 56 | self.scale = scale 57 | self.img_shaep = img_shape 58 | self.num_frame = num_frame 59 | self.num_channel = num_channel 60 | 61 | pat = re.compile('.*npy') 62 | if pat.match(mean_path): 63 | logging.info('Load mean with npy') 64 | self.mean = np.load(mean_path) 65 | else: 66 | import caffe 67 | logging.info('Load mean with caffe') 68 | with open(mean_path, 'rb') as mean_file: 69 | mean_blob = caffe.proto.caffe_pb2.BlobProto() 70 | mean_bin = mean_file.read() 71 | mean_blob.ParseFromString(mean_bin) 72 | self.mean = caffe.io.blobproto_to_array(mean_blob).squeeze() 73 | 74 | if self.mode == 'tf': 75 | self.mean = np.transpose(self.mean, [1, 2, 0]) 76 | 77 | def _process_frame(self, s, img): 78 | # s: state np array 79 | # img: frame input 80 | img = img.astype(np.float32) 81 | if self.mode == 'caffe': 82 | img = np.transpose(img, [2, 0, 1]) 83 | img -= self.mean 84 | img *= self.scale 85 | if self.mode == 'tf': 86 | s[:, :, :-self.num_channel] = s[:, :, self.num_channel:] 87 | s[:, :, -self.num_channel:] = img 88 | else: 89 | s[:-1, :, :, :] = s[1:, :, :, :] 90 | s[-1, :, :, :] = img 91 | return s 92 | 93 | def _process_act(self, a, act): 94 | if self.mode == 'tf': 95 | a[:-1] = a[1:] 96 | a[-1] = act 97 | else: 98 | a[:, :-1] = a[:, 1:] 99 | a[:, -1] = act 100 | return a 101 | 102 | def __call__(self, max_iter=None): 103 | with open(os.path.join(self.dir, 'act.log')) as act_log: 104 | cnt_frame = 0 105 | lim = self.num_frame 106 | if self.mode == 'tf': 107 | s = np.zeros(self.img_shape + [self.num_frame * self.num_channel], dtype=np.float32) 108 | a = np.zeros([self.num_frame, 1], dtype=np.int32) 109 | else: 110 | s = np.zeros([self.num_frame, self.num_channel] + self.img_shape, dtype=np.float32) 111 | a = np.zeros([self.num_frame, 1], dtype=np.int32) 112 | 113 | for filename in sorted(glob.glob(os.path.join(self.dir, '*.png')))[:max_iter]: 114 | logging.info('%s' % filename) 115 | img = cv2.imread(filename) 116 | 117 | s = self._process_frame(s, img) 118 | a = self._process_act(a, int(act_log.readline()[:-1])) 119 | 120 | if cnt_frame < lim: 121 | cnt_frame += 1 122 | else: 123 | yield s, _np_one_hot(a[-1], self.num_act) 124 | 125 | class NumpyDataset(object): 126 | def __init__(self, path, mean_path, num_act, scale=(1./255.), s_shape=[84,84,12]): 127 | # path: tfrecords path 128 | # num_act: number of action in action space 129 | # mean_path: mean file path (must be a npy file, with [h, w, c]) 130 | # scale: image scale 131 | # s_shape: state shape [batch_size, h, w, c * num_frame] 132 | self.path = path 133 | self.mean = np.load(mean_path) 134 | self.num_act = num_act 135 | self.scale = scale 136 | self.s_shape = s_shape 137 | 138 | def _preprocess(self, s, a, x_t_1): 139 | s -= np.tile(self.mean, [4]) 140 | s *= self.scale 141 | x_t_1 -= self.mean 142 | x_t_1 *= self.scale 143 | a = _np_one_hot([a], self.num_act) 144 | return s, a, x_t_1 145 | 146 | def __call__(self, max_iter=None): 147 | reader = EpisodeReader(self.path, self.s_shape[0], self.s_shape[1]) 148 | i = 0 149 | for s, a, x_t_1 in reader.read(): 150 | yield self._preprocess(s, a, x_t_1) 151 | if max_iter and i >= max_iter: 152 | break 153 | i += 1 154 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/old_model.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import re 4 | 5 | from .tf_ops import ReLu, Conv2D, FC, Deconv2D 6 | 7 | NUM_CHANNELS = 3 8 | NUM_FRAMES = 4 9 | 10 | class ActionConditionalVideoPredictionModel(object): 11 | def __init__(self, num_act, inputs=None, 12 | is_train=True, 13 | with_summary=True, 14 | loss_args=None, 15 | optimizer_args=None): 16 | # num_act: number of action in action space (only discrete) 17 | # inputs: used to create model inputs (dict) 18 | # is_train: is training phase 19 | # loss_args: loss function arguments (e.g. lamb) 20 | # optimizer_args: optimizer arguments (e.g. optimizer type, learning rate, ...) (dict) 21 | self.is_train = is_train 22 | self.num_act = num_act 23 | self.optimizer_args = optimizer_args 24 | self.loss_args = loss_args 25 | self._create_input(inputs) 26 | self._create_model() 27 | self._create_output() 28 | self._create_loss() 29 | 30 | if self.is_train: 31 | self._create_optimizer() 32 | if with_summary: 33 | self._create_summary() 34 | 35 | def _create_input(self, inputs): 36 | # inputs: if None, use tf.placeholder as input 37 | # if not None, expected inputs is a dict 38 | if inputs == None: 39 | self.inputs = {'s_t': tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, (NUM_CHANNELS * NUM_FRAMES)]), 40 | 'a_t': tf.placeholder(dtype=tf.int32, shape=[None, self.num_act]), 41 | 'x_t_1': tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, (NUM_CHANNELS)])} 42 | else: 43 | assert type(inputs) is dict 44 | self.inputs = inputs 45 | 46 | def _create_model(self): 47 | self.encode = self._create_encoder(self.inputs['s_t']) 48 | self.act_embed = self._create_action_embedding(self.inputs['a_t']) 49 | self.decode = self._create_decoder(self.encode, self.act_embed) 50 | 51 | def _create_output(self): 52 | self.output = self.decode 53 | 54 | def _create_loss(self): 55 | lamb = self.loss_args['lamb'] if self.loss_args else 0.0 56 | with tf.variable_scope('loss', reuse=not self.is_train) as scope: 57 | t = self.inputs['x_t_1'] 58 | penalty = tf.reduce_sum(lamb * tf.stack([tf.nn.l2_loss(var) for var in tf.trainable_variables()]), name='regularization') 59 | self.loss = tf.reduce_mean(tf.nn.l2_loss(self.output - t, name='l2') + penalty) 60 | 61 | def _create_optimizer(self): 62 | lr = self.optimizer_args['lr'] if self.optimizer_args else 1e-4 63 | with tf.variable_scope('optimize', reuse=not self.is_train) as scope: 64 | # Setup global_step, optimizer 65 | self.global_step = tf.get_variable('global_step', shape=(), initializer=tf.constant_initializer(0.0), trainable=False) 66 | 67 | self.learning_rate = tf.train.exponential_decay(lr, self.global_step, 1e5, 0.9, staircase=True) 68 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, name='optimizer') 69 | 70 | # According to original paper code, learning rate of bias is 2x of base learning rate 71 | grads_vars = self.optimizer.compute_gradients(self.loss) 72 | bias_pattern = re.compile('.*/b') 73 | grads_vars_mult = [] 74 | for grad, var in grads_vars: 75 | if bias_pattern.match(var.op.name): 76 | grads_vars_mult.append((grad * 2.0, var)) 77 | else: 78 | grads_vars_mult.append((grad, var)) 79 | 80 | # According to original paper, gradient should be clipped with [-0.1, 0.1] 81 | grads_clip = [(tf.clip_by_value(grad, -0.1, 0.1), var) for grad, var in grads_vars_mult] 82 | self.train = self.optimizer.apply_gradients(grads_clip, global_step=self.global_step) 83 | 84 | def 
_create_encoder(self, x): 85 | # x: input image (tensor([batch_size, 84, 84, 12])) 86 | l = Conv2D(x, [6, 6], 64, 2, 'VALID', 'conv1') 87 | l = ReLu(l, 'relu1') 88 | l = Conv2D(l, [6, 6], 64, 2, 'SAME', 'conv2') 89 | l = ReLu(l, 'relu2') 90 | l = Conv2D(l, [6, 6], 64, 2, 'SAME', 'conv3') 91 | l = ReLu(l, 'relu3') 92 | l = FC(l, 1024, 'ip1') 93 | l = ReLu(l, 'relu4') 94 | l = FC(l, 2048, 'enc-factor', initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)) 95 | return l 96 | 97 | def _create_action_embedding(self, act): 98 | # act: action input (tensor([batch_size, num_act])) (one-hot vector) 99 | act = tf.cast(act, tf.float32) 100 | l = FC(act, 2048, 'act-embed', initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1)) 101 | return l 102 | 103 | def _create_decoder(self, encode, act_embed): 104 | # encode: encode layer 105 | # act_embed: action embedding layer 106 | batch_size = tf.shape(encode)[0] 107 | l = tf.multiply(encode, act_embed, name='merge') 108 | l = FC(l, 1024, 'dec') 109 | l = FC(l, 64 * 10 * 10, 'ip4') 110 | l = ReLu(l, 'relu1') 111 | l = tf.reshape(l, [-1, 10, 10, 64], name='dec-reshape') 112 | l = Deconv2D(l, [6, 6], [batch_size, 20, 20, 64], 64, 2, 'SAME', 'deconv3') 113 | l = ReLu(l, 'relu2') 114 | l = Deconv2D(l, [6, 6], [batch_size, 40, 40, 64], 64, 2, 'SAME', 'deconv2') 115 | l = ReLu(l, 'relu3') 116 | l = Deconv2D(l, [6, 6], [batch_size, 84, 84, NUM_CHANNELS], 3, 2, 'VALID', 'x_hat-05') 117 | return l 118 | 119 | def _create_summary(self): 120 | if self.is_train: 121 | tf.summary.scalar("learning_rate", self.learning_rate, collections=['train']) 122 | tf.summary.scalar("loss", self.loss, collections=['train']) 123 | tf.summary.image('x_pred_t_1', tf.cast(self.decode * 255.0, tf.uint8), collections=['train']) 124 | tf.summary.image('x_t_1', tf.cast(self.inputs['x_t_1'] * 255.0, tf.uint8), collections=['train']) 125 | 126 | def restore(self, sess, ckpt, var_scope=None): 127 | # sess: tf session 128 | # ckpt: ckpt path (str) 129 | if var_scope != None: 130 | all_vars = tf.all_variables() 131 | g_vars = [k for k in all_vars if k.name.startswith(var_scope)] 132 | 133 | saver = tf.train.Saver({v.op.name[2:]: v for v in g_vars}) 134 | saver.restore(sess, ckpt) 135 | 136 | def predict(self, sess, s, a): 137 | # sess: tf session 138 | # s: state at t [batch_size, 84, 84, NUM_CHANNELS * NUM_FRAMES] 139 | # a: action at t [batch_size, num_act] 140 | assert s.shape[1:] == (84, 84, NUM_CHANNELS * NUM_FRAMES) 141 | assert len(a.shape) == 2 142 | assert a.shape[1] == self.num_act 143 | 144 | return sess.run([self.output], feed_dict={self.inputs['s_t']: s, 145 | self.inputs['a_t']: a}) 146 | 147 | -------------------------------------------------------------------------------- /baselines/deepq/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 5 | 6 | 7 | class ReplayBuffer(object): 8 | def __init__(self, size): 9 | """Create Prioritized Replay buffer. 10 | 11 | Parameters 12 | ---------- 13 | size: int 14 | Max number of transitions to store in the buffer. When the buffer 15 | overflows the old memories are dropped. 
16 | """ 17 | self._storage = [] 18 | self._maxsize = size 19 | self._next_idx = 0 20 | 21 | def __len__(self): 22 | return len(self._storage) 23 | 24 | def add(self, obs_t, action, reward, obs_tp1, done): 25 | data = (obs_t, action, reward, obs_tp1, done) 26 | 27 | if self._next_idx >= len(self._storage): 28 | self._storage.append(data) 29 | else: 30 | self._storage[self._next_idx] = data 31 | self._next_idx = (self._next_idx + 1) % self._maxsize 32 | 33 | def _encode_sample(self, idxes): 34 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 35 | for i in idxes: 36 | data = self._storage[i] 37 | obs_t, action, reward, obs_tp1, done = data 38 | obses_t.append(np.array(obs_t, copy=False)) 39 | actions.append(np.array(action, copy=False)) 40 | rewards.append(reward) 41 | obses_tp1.append(np.array(obs_tp1, copy=False)) 42 | dones.append(done) 43 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 44 | 45 | def sample(self, batch_size): 46 | """Sample a batch of experiences. 47 | 48 | Parameters 49 | ---------- 50 | batch_size: int 51 | How many transitions to sample. 52 | 53 | Returns 54 | ------- 55 | obs_batch: np.array 56 | batch of observations 57 | act_batch: np.array 58 | batch of actions executed given obs_batch 59 | rew_batch: np.array 60 | rewards received as results of executing act_batch 61 | next_obs_batch: np.array 62 | next set of observations seen after executing act_batch 63 | done_mask: np.array 64 | done_mask[i] = 1 if executing act_batch[i] resulted in 65 | the end of an episode and 0 otherwise. 66 | """ 67 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 68 | return self._encode_sample(idxes) 69 | 70 | 71 | class PrioritizedReplayBuffer(ReplayBuffer): 72 | def __init__(self, size, alpha): 73 | """Create Prioritized Replay buffer. 74 | 75 | Parameters 76 | ---------- 77 | size: int 78 | Max number of transitions to store in the buffer. When the buffer 79 | overflows the old memories are dropped. 80 | alpha: float 81 | how much prioritization is used 82 | (0 - no prioritization, 1 - full prioritization) 83 | 84 | See Also 85 | -------- 86 | ReplayBuffer.__init__ 87 | """ 88 | super(PrioritizedReplayBuffer, self).__init__(size) 89 | assert alpha > 0 90 | self._alpha = alpha 91 | 92 | it_capacity = 1 93 | while it_capacity < size: 94 | it_capacity *= 2 95 | 96 | self._it_sum = SumSegmentTree(it_capacity) 97 | self._it_min = MinSegmentTree(it_capacity) 98 | self._max_priority = 1.0 99 | 100 | def add(self, *args, **kwargs): 101 | """See ReplayBuffer.store_effect""" 102 | idx = self._next_idx 103 | super().add(*args, **kwargs) 104 | self._it_sum[idx] = self._max_priority ** self._alpha 105 | self._it_min[idx] = self._max_priority ** self._alpha 106 | 107 | def _sample_proportional(self, batch_size): 108 | res = [] 109 | for _ in range(batch_size): 110 | # TODO(szymon): should we ensure no repeats? 111 | mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1) 112 | idx = self._it_sum.find_prefixsum_idx(mass) 113 | res.append(idx) 114 | return res 115 | 116 | def sample(self, batch_size, beta): 117 | """Sample a batch of experiences. 118 | 119 | compared to ReplayBuffer.sample 120 | it also returns importance weights and idxes 121 | of sampled experiences. 122 | 123 | 124 | Parameters 125 | ---------- 126 | batch_size: int 127 | How many transitions to sample. 
128 | beta: float 129 | To what degree to use importance weights 130 | (0 - no corrections, 1 - full correction) 131 | 132 | Returns 133 | ------- 134 | obs_batch: np.array 135 | batch of observations 136 | act_batch: np.array 137 | batch of actions executed given obs_batch 138 | rew_batch: np.array 139 | rewards received as results of executing act_batch 140 | next_obs_batch: np.array 141 | next set of observations seen after executing act_batch 142 | done_mask: np.array 143 | done_mask[i] = 1 if executing act_batch[i] resulted in 144 | the end of an episode and 0 otherwise. 145 | weights: np.array 146 | Array of shape (batch_size,) and dtype np.float32 147 | denoting importance weight of each sampled transition 148 | idxes: np.array 149 | Array of shape (batch_size,) and dtype np.int32 150 | idexes in buffer of sampled experiences 151 | """ 152 | assert beta > 0 153 | 154 | idxes = self._sample_proportional(batch_size) 155 | 156 | weights = [] 157 | p_min = self._it_min.min() / self._it_sum.sum() 158 | max_weight = (p_min * len(self._storage)) ** (-beta) 159 | 160 | for idx in idxes: 161 | p_sample = self._it_sum[idx] / self._it_sum.sum() 162 | weight = (p_sample * len(self._storage)) ** (-beta) 163 | weights.append(weight / max_weight) 164 | weights = np.array(weights) 165 | encoded_sample = self._encode_sample(idxes) 166 | return tuple(list(encoded_sample) + [weights, idxes]) 167 | 168 | def update_priorities(self, idxes, priorities): 169 | """Update priorities of sampled transitions. 170 | 171 | sets priority of transition at index idxes[i] in buffer 172 | to priorities[i]. 173 | 174 | Parameters 175 | ---------- 176 | idxes: [int] 177 | List of idxes of sampled transitions 178 | priorities: [float] 179 | List of updated priorities corresponding to 180 | transitions at the sampled idxes denoted by 181 | variable `idxes`. 182 | """ 183 | assert len(idxes) == len(priorities) 184 | for idx, priority in zip(idxes, priorities): 185 | assert priority > 0 186 | assert 0 <= idx < len(self._storage) 187 | self._it_sum[idx] = priority ** self._alpha 188 | self._it_min[idx] = priority ** self._alpha 189 | 190 | self._max_priority = max(self._max_priority, priority) 191 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import re 4 | 5 | from .tf_ops import ReLu, Conv2D, FC, Deconv2D 6 | 7 | class ActionConditionalVideoPredictionModel(object): 8 | def __init__(self, num_act, num_channel=3, num_frame=4, inputs=None, 9 | is_train=True, 10 | with_summary=True, 11 | loss_args=None, 12 | optimizer_args=None): 13 | # num_act: number of action in action space (only discrete) 14 | # num_channel: number of channel in one frame 15 | # num_frame: number of frame in one state 16 | # inputs: used to create model inputs (dict) 17 | # is_train: is training phase 18 | # loss_args: loss function arguments (e.g. lamb) 19 | # optimizer_args: optimizer arguments (e.g. optimizer type, learning rate, ...) 
(dict) 20 | self.is_train = is_train 21 | self.num_act = num_act 22 | self.num_channel = num_channel 23 | self.num_frame = num_frame 24 | self.optimizer_args = optimizer_args 25 | self.loss_args = loss_args 26 | self._create_input(inputs) 27 | self._create_model() 28 | self._create_output() 29 | self._create_loss() 30 | 31 | if self.is_train: 32 | self._create_optimizer() 33 | if with_summary: 34 | self._create_summary() 35 | 36 | def _create_input(self, inputs): 37 | # inputs: if None, use tf.placeholder as input 38 | # if not None, expected inputs is a dict 39 | if inputs == None: 40 | self.inputs = {'s_t': tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, (self.num_channel * self.num_frame)]), 41 | 'a_t': tf.placeholder(dtype=tf.int32, shape=[None, self.num_act]), 42 | 'x_t_1': tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, (self.num_channel)])} 43 | else: 44 | assert type(inputs) is dict 45 | self.inputs = inputs 46 | 47 | def _create_model(self): 48 | self.encode = self._create_encoder(self.inputs['s_t']) 49 | self.act_embed = self._create_action_embedding(self.inputs['a_t']) 50 | self.decode = self._create_decoder(self.encode, self.act_embed) 51 | 52 | def _create_output(self): 53 | self.output = self.decode 54 | 55 | def _create_loss(self): 56 | lamb = self.loss_args['lamb'] if self.loss_args else 0.0 57 | with tf.variable_scope('loss', reuse=not self.is_train) as scope: 58 | t = self.inputs['x_t_1'] 59 | penalty = tf.reduce_sum(lamb * tf.stack([tf.nn.l2_loss(var) for var in tf.trainable_variables()]), name='regularization') 60 | self.loss = tf.reduce_mean(tf.nn.l2_loss(self.output - t, name='l2') + penalty) 61 | 62 | def _create_optimizer(self): 63 | lr = self.optimizer_args['lr'] if self.optimizer_args else 1e-4 64 | with tf.variable_scope('optimize', reuse=not self.is_train) as scope: 65 | # Setup global_step, optimizer 66 | self.global_step = tf.get_variable('global_step', shape=(), initializer=tf.constant_initializer(0.0), trainable=False) 67 | 68 | self.learning_rate = tf.train.exponential_decay(lr, self.global_step, 1e5, 0.9, staircase=True) 69 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, name='optimizer') 70 | 71 | # According to original paper code, learning rate of bias is 2x of base learning rate 72 | grads_vars = self.optimizer.compute_gradients(self.loss) 73 | bias_pattern = re.compile('.*/b') 74 | grads_vars_mult = [] 75 | for grad, var in grads_vars: 76 | if bias_pattern.match(var.op.name): 77 | grads_vars_mult.append((grad * 2.0, var)) 78 | else: 79 | grads_vars_mult.append((grad, var)) 80 | 81 | # According to original paper, gradient should be clipped with [-0.1, 0.1] 82 | grads_clip = [(tf.clip_by_value(grad, -0.1, 0.1), var) for grad, var in grads_vars_mult] 83 | self.train = self.optimizer.apply_gradients(grads_clip, global_step=self.global_step) 84 | 85 | def _create_encoder(self, x): 86 | # x: input image (tensor([batch_size, 84, 84, 12])) 87 | l = Conv2D(x, [6, 6], 64, 2, 'VALID', 'conv1') 88 | l = ReLu(l, 'relu1') 89 | l = Conv2D(l, [6, 6], 64, 2, 'SAME', 'conv2') 90 | l = ReLu(l, 'relu2') 91 | l = Conv2D(l, [6, 6], 64, 2, 'SAME', 'conv3') 92 | l = ReLu(l, 'relu3') 93 | l = FC(l, 1024, 'ip1') 94 | l = ReLu(l, 'relu4') 95 | l = FC(l, 2048, 'enc-factor', initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)) 96 | return l 97 | 98 | def _create_action_embedding(self, act): 99 | # act: action input (tensor([batch_size, num_act])) (one-hot vector) 100 | act = tf.cast(act, tf.float32) 101 | l = FC(act, 2048, 
'act-embed', initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1)) 102 | return l 103 | 104 | def _create_decoder(self, encode, act_embed): 105 | # encode: encode layer 106 | # act_embed: action embedding layer 107 | batch_size = tf.shape(encode)[0] 108 | l = tf.multiply(encode, act_embed, name='merge') 109 | l = FC(l, 1024, 'dec') 110 | l = FC(l, 64 * 10 * 10, 'ip4') 111 | l = ReLu(l, 'relu1') 112 | l = tf.reshape(l, [-1, 10, 10, 64], name='dec-reshape') 113 | l = Deconv2D(l, [6, 6], [batch_size, 20, 20, 64], 64, 2, 'SAME', 'deconv3') 114 | l = ReLu(l, 'relu2') 115 | l = Deconv2D(l, [6, 6], [batch_size, 40, 40, 64], 64, 2, 'SAME', 'deconv2') 116 | l = ReLu(l, 'relu3') 117 | l = Deconv2D(l, [6, 6], [batch_size, 84, 84, self.num_channel], self.num_channel, 2, 'VALID', 'x_hat-05') 118 | return l 119 | 120 | def _create_summary(self): 121 | if self.is_train: 122 | tf.summary.scalar("learning_rate", self.learning_rate, collections=['train']) 123 | tf.summary.scalar("loss", self.loss, collections=['train']) 124 | tf.summary.image('x_pred_t_1', tf.cast(self.decode * 255.0, tf.uint8), collections=['train']) 125 | tf.summary.image('x_t_1', tf.cast(self.inputs['x_t_1'] * 255.0, tf.uint8), collections=['train']) 126 | 127 | 128 | def restore(self, sess, ckpt, var_scope=None): 129 | # sess: tf session 130 | # ckpt: ckpt path (str) 131 | if var_scope != None: 132 | all_vars = tf.all_variables() 133 | g_vars = [k for k in all_vars if k.name.startswith(var_scope)] 134 | saver = tf.train.Saver({v.op.name[2:]: v for v in g_vars}) 135 | else: 136 | saver = tf.train.Saver() 137 | 138 | saver.restore(sess, ckpt) 139 | 140 | 141 | def predict(self, sess, s, a): 142 | # sess: tf session 143 | # s: state at t [batch_size, 84, 84, self.num_channel * self.num_frame] 144 | # a: action at t [batch_size, num_act] 145 | assert s.shape[1:] == (84, 84, self.num_channel * self.num_frame) 146 | assert len(a.shape) == 2 147 | assert a.shape[1] == self.num_act 148 | 149 | return sess.run([self.output], feed_dict={self.inputs['s_t']: s, 150 | self.inputs['a_t']: a}) 151 | 152 | -------------------------------------------------------------------------------- /baselines/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | See README.md for a description of the logging API. 4 | 5 | OFF state corresponds to having Logger.CURRENT == Logger.DEFAULT 6 | ON state is otherwise 7 | 8 | """ 9 | 10 | from collections import OrderedDict 11 | import os 12 | import sys 13 | import shutil 14 | import os.path as osp 15 | import json 16 | 17 | LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json'] 18 | 19 | DEBUG = 10 20 | INFO = 20 21 | WARN = 30 22 | ERROR = 40 23 | 24 | DISABLED = 50 25 | 26 | 27 | class OutputFormat(object): 28 | def writekvs(self, kvs): 29 | """ 30 | Write key-value pairs 31 | """ 32 | raise NotImplementedError 33 | 34 | def writeseq(self, args): 35 | """ 36 | Write a sequence of other data (e.g. 
a logging message) 37 | """ 38 | pass 39 | 40 | def close(self): 41 | return 42 | 43 | 44 | class HumanOutputFormat(OutputFormat): 45 | def __init__(self, file): 46 | self.file = file 47 | 48 | def writekvs(self, kvs): 49 | # Create strings for printing 50 | key2str = OrderedDict() 51 | for (key, val) in kvs.items(): 52 | valstr = '%-8.3g' % (val,) if hasattr(val, '__float__') else val 53 | key2str[self._truncate(key)] = self._truncate(valstr) 54 | 55 | # Find max widths 56 | keywidth = max(map(len, key2str.keys())) 57 | valwidth = max(map(len, key2str.values())) 58 | 59 | # Write out the data 60 | dashes = '-' * (keywidth + valwidth + 7) 61 | lines = [dashes] 62 | for (key, val) in key2str.items(): 63 | lines.append('| %s%s | %s%s |' % ( 64 | key, 65 | ' ' * (keywidth - len(key)), 66 | val, 67 | ' ' * (valwidth - len(val)), 68 | )) 69 | lines.append(dashes) 70 | self.file.write('\n'.join(lines) + '\n') 71 | 72 | # Flush the output to the file 73 | self.file.flush() 74 | 75 | def _truncate(self, s): 76 | return s[:20] + '...' if len(s) > 23 else s 77 | 78 | def writeseq(self, args): 79 | for arg in args: 80 | self.file.write(arg) 81 | self.file.write('\n') 82 | self.file.flush() 83 | 84 | 85 | class JSONOutputFormat(OutputFormat): 86 | def __init__(self, file): 87 | self.file = file 88 | 89 | def writekvs(self, kvs): 90 | for k, v in kvs.items(): 91 | if hasattr(v, 'dtype'): 92 | v = v.tolist() 93 | kvs[k] = float(v) 94 | self.file.write(json.dumps(kvs) + '\n') 95 | self.file.flush() 96 | 97 | 98 | def make_output_format(format, ev_dir): 99 | os.makedirs(ev_dir, exist_ok=True) 100 | if format == 'stdout': 101 | return HumanOutputFormat(sys.stdout) 102 | elif format == 'log': 103 | log_file = open(osp.join(ev_dir, 'log.txt'), 'wt') 104 | return HumanOutputFormat(log_file) 105 | elif format == 'json': 106 | json_file = open(osp.join(ev_dir, 'progress.json'), 'wt') 107 | return JSONOutputFormat(json_file) 108 | else: 109 | raise ValueError('Unknown format specified: %s' % (format,)) 110 | 111 | # ================================================================ 112 | # API 113 | # ================================================================ 114 | 115 | 116 | def logkv(key, val): 117 | """ 118 | Log a value of some diagnostic 119 | Call this once for each diagnostic quantity, each iteration 120 | """ 121 | Logger.CURRENT.logkv(key, val) 122 | 123 | 124 | def dumpkvs(): 125 | """ 126 | Write all of the diagnostics from the current iteration 127 | 128 | level: int. (see logger.py docs) If the global logger level is higher than 129 | the level argument here, don't print to stdout. 130 | """ 131 | Logger.CURRENT.dumpkvs() 132 | 133 | 134 | # for backwards compatibility 135 | record_tabular = logkv 136 | dump_tabular = dumpkvs 137 | 138 | 139 | def log(*args, level=INFO): 140 | """ 141 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 142 | """ 143 | Logger.CURRENT.log(*args, level=level) 144 | 145 | 146 | def debug(*args): 147 | log(*args, level=DEBUG) 148 | 149 | 150 | def info(*args): 151 | log(*args, level=INFO) 152 | 153 | 154 | def warn(*args): 155 | log(*args, level=WARN) 156 | 157 | 158 | def error(*args): 159 | log(*args, level=ERROR) 160 | 161 | 162 | def set_level(level): 163 | """ 164 | Set logging threshold on current logger. 165 | """ 166 | Logger.CURRENT.set_level(level) 167 | 168 | 169 | def get_dir(): 170 | """ 171 | Get directory that log files are being written to. 
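# --- Editor's note: illustrative sketch, not part of the original logger.py ---
# The human-readable format above boils down to "pad keys and values to a
# common width and draw a box". A standalone equivalent of
# HumanOutputFormat.writekvs (render_kvs is a made-up name) could look like:
def render_kvs(kvs):
    rows = [(str(k), '%-8.3g' % v if hasattr(v, '__float__') else str(v))
            for k, v in kvs.items()]
    keywidth = max(len(k) for k, _ in rows)
    valwidth = max(len(v) for _, v in rows)
    dashes = '-' * (keywidth + valwidth + 7)
    lines = [dashes]
    for k, v in rows:
        lines.append('| %s%s | %s%s |' % (k, ' ' * (keywidth - len(k)),
                                          v, ' ' * (valwidth - len(v))))
    lines.append(dashes)
    return '\n'.join(lines)

# print(render_kvs({'steps': 1000, 'mean reward': 17.5})) draws the same kind of
# boxed table that dumpkvs() sends to stdout.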
172 | will be None if there is no output directory (i.e., if you didn't call start) 173 | """ 174 | return Logger.CURRENT.get_dir() 175 | 176 | 177 | def get_expt_dir(): 178 | sys.stderr.write("get_expt_dir() is Deprecated. Switch to get_dir() [%s]\n" % (get_dir(),)) 179 | return get_dir() 180 | 181 | 182 | # ================================================================ 183 | # Backend 184 | # ================================================================ 185 | 186 | 187 | class Logger(object): 188 | DEFAULT = None # A logger with no output files. (See right below class definition) 189 | # So that you can still log to the terminal without setting up any output files 190 | CURRENT = None # Current logger being used by the free functions above 191 | 192 | def __init__(self, dir, output_formats): 193 | self.name2val = OrderedDict() # values this iteration 194 | self.level = INFO 195 | self.dir = dir 196 | self.output_formats = output_formats 197 | 198 | # Logging API, forwarded 199 | # ---------------------------------------- 200 | def logkv(self, key, val): 201 | self.name2val[key] = val 202 | 203 | def dumpkvs(self): 204 | for fmt in self.output_formats: 205 | fmt.writekvs(self.name2val) 206 | self.name2val.clear() 207 | 208 | def log(self, *args, level=INFO): 209 | if self.level <= level: 210 | self._do_log(args) 211 | 212 | # Configuration 213 | # ---------------------------------------- 214 | def set_level(self, level): 215 | self.level = level 216 | 217 | def get_dir(self): 218 | return self.dir 219 | 220 | def close(self): 221 | for fmt in self.output_formats: 222 | fmt.close() 223 | 224 | # Misc 225 | # ---------------------------------------- 226 | def _do_log(self, args): 227 | for fmt in self.output_formats: 228 | fmt.writeseq(args) 229 | 230 | 231 | # ================================================================ 232 | 233 | Logger.DEFAULT = Logger(output_formats=[HumanOutputFormat(sys.stdout)], dir=None) 234 | Logger.CURRENT = Logger.DEFAULT 235 | 236 | 237 | class session(object): 238 | """ 239 | Context manager that sets up the loggers for an experiment. 
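# --- Editor's note: illustrative sketch, not part of the original logger.py ---
# Each Logger.dumpkvs() call above hands the buffered dict to every configured
# format. For JSONOutputFormat that means one JSON object per line in
# progress.json, with numpy scalars converted to plain floats first:
import json
import numpy as np

row = {"steps": np.int64(1000), "mean_reward": np.float32(17.5)}
for k, v in list(row.items()):
    if hasattr(v, "dtype"):            # numpy scalar -> python float, as writekvs does
        row[k] = float(v.tolist())
line = json.dumps(row)                 # '{"steps": 1000.0, "mean_reward": 17.5}'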
240 | """ 241 | 242 | CURRENT = None # Set to a LoggerContext object using enter/exit or context manager 243 | 244 | def __init__(self, dir, format_strs=None): 245 | self.dir = dir 246 | if format_strs is None: 247 | format_strs = LOG_OUTPUT_FORMATS 248 | output_formats = [make_output_format(f, dir) for f in format_strs] 249 | Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) 250 | 251 | def __enter__(self): 252 | os.makedirs(self.evaluation_dir(), exist_ok=True) 253 | output_formats = [make_output_format(f, self.evaluation_dir()) for f in LOG_OUTPUT_FORMATS] 254 | Logger.CURRENT = Logger(dir=self.dir, output_formats=output_formats) 255 | 256 | def __exit__(self, *args): 257 | Logger.CURRENT.close() 258 | Logger.CURRENT = Logger.DEFAULT 259 | 260 | def evaluation_dir(self): 261 | return self.dir 262 | 263 | 264 | # ================================================================ 265 | 266 | 267 | def _demo(): 268 | info("hi") 269 | debug("shouldn't appear") 270 | set_level(DEBUG) 271 | debug("should appear") 272 | dir = "/tmp/testlogging" 273 | if os.path.exists(dir): 274 | shutil.rmtree(dir) 275 | with session(dir=dir): 276 | record_tabular("a", 3) 277 | record_tabular("b", 2.5) 278 | dump_tabular() 279 | record_tabular("b", -2.5) 280 | record_tabular("a", 5.5) 281 | dump_tabular() 282 | info("^^^ should see a = 5.5") 283 | 284 | record_tabular("b", -2.5) 285 | dump_tabular() 286 | 287 | record_tabular("a", "longasslongasslongasslongasslongasslongassvalue") 288 | dump_tabular() 289 | 290 | 291 | if __name__ == "__main__": 292 | _demo() 293 | -------------------------------------------------------------------------------- /baselines/common/atari_wrappers_deprecated.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gym 3 | import numpy as np 4 | 5 | from collections import deque 6 | from gym import spaces 7 | 8 | 9 | class NoopResetEnv(gym.Wrapper): 10 | def __init__(self, env=None, noop_max=30): 11 | """Sample initial states by taking random number of no-ops on reset. 12 | No-op is assumed to be action 0. 13 | """ 14 | super(NoopResetEnv, self).__init__(env) 15 | self.noop_max = noop_max 16 | self.override_num_noops = None 17 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 18 | 19 | def _reset(self): 20 | """ Do no-op action for a number of steps in [1, noop_max].""" 21 | self.env.reset() 22 | if self.override_num_noops is not None: 23 | noops = self.override_num_noops 24 | else: 25 | noops = np.random.randint(1, self.noop_max + 1) 26 | assert noops > 0 27 | obs = None 28 | for _ in range(noops): 29 | obs, _, done, _ = self.env.step(0) 30 | if done: 31 | obs = self.env.reset() 32 | return obs 33 | 34 | 35 | class FireResetEnv(gym.Wrapper): 36 | def __init__(self, env=None): 37 | """For environments where the user need to press FIRE for the game to start.""" 38 | super(FireResetEnv, self).__init__(env) 39 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 40 | assert len(env.unwrapped.get_action_meanings()) >= 3 41 | 42 | def _reset(self): 43 | self.env.reset() 44 | obs, _, done, _ = self.env.step(1) 45 | if done: 46 | self.env.reset() 47 | obs, _, done, _ = self.env.step(2) 48 | if done: 49 | self.env.reset() 50 | return obs 51 | 52 | 53 | class EpisodicLifeEnv(gym.Wrapper): 54 | def __init__(self, env=None): 55 | """Make end-of-life == end-of-episode, but only reset on true game over. 56 | Done by DeepMind for the DQN and co. since it helps value estimation. 
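# --- Editor's note: illustrative sketch, not part of the original file ---
# The wrappers in this module use the old gym API (reset() returns obs,
# step() returns a 4-tuple) and override _reset/_step, so this snippet
# assumes a pre-0.26 gym with the Atari ROMs installed.
import gym
from baselines.common.atari_wrappers_deprecated import NoopResetEnv, FireResetEnv

def make_breakout_with_reset_wrappers():
    env = gym.make("BreakoutNoFrameskip-v4")
    env = NoopResetEnv(env, noop_max=30)   # start each episode after 1..30 no-ops
    env = FireResetEnv(env)                # Breakout needs FIRE to launch the ball
    return env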
57 | """ 58 | super(EpisodicLifeEnv, self).__init__(env) 59 | self.lives = 0 60 | self.was_real_done = True 61 | self.was_real_reset = False 62 | 63 | def _step(self, action): 64 | obs, reward, done, info = self.env.step(action) 65 | self.was_real_done = done 66 | # check current lives, make loss of life terminal, 67 | # then update lives to handle bonus lives 68 | lives = self.env.unwrapped.ale.lives() 69 | if lives < self.lives and lives > 0: 70 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 71 | # so its important to keep lives > 0, so that we only reset once 72 | # the environment advertises done. 73 | done = True 74 | self.lives = lives 75 | return obs, reward, done, info 76 | 77 | def _reset(self): 78 | """Reset only when lives are exhausted. 79 | This way all states are still reachable even though lives are episodic, 80 | and the learner need not know about any of this behind-the-scenes. 81 | """ 82 | if self.was_real_done: 83 | obs = self.env.reset() 84 | self.was_real_reset = True 85 | else: 86 | # no-op step to advance from terminal/lost life state 87 | obs, _, _, _ = self.env.step(0) 88 | self.was_real_reset = False 89 | self.lives = self.env.unwrapped.ale.lives() 90 | return obs 91 | 92 | 93 | class MaxAndSkipEnv(gym.Wrapper): 94 | def __init__(self, env=None, skip=4): 95 | """Return only every `skip`-th frame""" 96 | super(MaxAndSkipEnv, self).__init__(env) 97 | # most recent raw observations (for max pooling across time steps) 98 | self._obs_buffer = deque(maxlen=2) 99 | self._skip = skip 100 | 101 | def _step(self, action): 102 | total_reward = 0.0 103 | done = None 104 | for _ in range(self._skip): 105 | obs, reward, done, info = self.env.step(action) 106 | self._obs_buffer.append(obs) 107 | total_reward += reward 108 | if done: 109 | break 110 | 111 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 112 | 113 | return max_frame, total_reward, done, info 114 | 115 | def _reset(self): 116 | """Clear past frame buffer and init. to first obs. from inner env.""" 117 | self._obs_buffer.clear() 118 | obs = self.env.reset() 119 | self._obs_buffer.append(obs) 120 | return obs 121 | 122 | 123 | class ProcessFrame84(gym.ObservationWrapper): 124 | def __init__(self, env=None): 125 | super(ProcessFrame84, self).__init__(env) 126 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 127 | 128 | def _observation(self, obs): 129 | return ProcessFrame84.process(obs) 130 | 131 | @staticmethod 132 | def process(frame): 133 | resized_screen = None 134 | if frame.size == 210 * 160 * 3: 135 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 136 | elif frame.size == 250 * 160 * 3: 137 | img = np.reshape(frame, [250, 160, 3]).astype(np.float32) 138 | else: 139 | assert False, "Unknown resolution." 140 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 141 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) 142 | x_t = resized_screen[18:102, :] 143 | x_t = np.reshape(x_t, [84, 84, 1]) 144 | return x_t.astype(np.uint8) 145 | 146 | 147 | class ClippedRewardsWrapper(gym.RewardWrapper): 148 | def _reward(self, reward): 149 | """Change all the positive rewards to 1, negative to -1 and keep zero.""" 150 | return np.sign(reward) 151 | 152 | 153 | class LazyFrames(object): 154 | def __init__(self, frames): 155 | """This object ensures that common frames between the observations are only stored once. 
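# --- Editor's note: illustrative sketch, not part of the original file ---
# ProcessFrame84.process() above is a plain static method, so the 210x160 RGB
# to 84x84 grayscale preprocessing can be exercised on a synthetic frame
# without creating an environment:
import numpy as np
from baselines.common.atari_wrappers_deprecated import ProcessFrame84

frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)
obs = ProcessFrame84.process(frame)        # luminance -> resize to 84x110 -> crop
assert obs.shape == (84, 84, 1) and obs.dtype == np.uint8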
156 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 157 | buffers. 158 | 159 | This object should only be converted to numpy array before being passed to the model. 160 | 161 | You'd not belive how complex the previous solution was.""" 162 | self._frames = frames 163 | 164 | def __array__(self, dtype=None): 165 | out = np.concatenate(self._frames, axis=2) 166 | if dtype is not None: 167 | out = out.astype(dtype) 168 | return out 169 | 170 | 171 | class FrameStack(gym.Wrapper): 172 | def __init__(self, env, k): 173 | """Stack k last frames. 174 | 175 | Returns lazy array, which is much more memory efficient. 176 | 177 | See Also 178 | -------- 179 | baselines.common.atari_wrappers.LazyFrames 180 | """ 181 | gym.Wrapper.__init__(self, env) 182 | self.k = k 183 | self.frames = deque([], maxlen=k) 184 | shp = env.observation_space.shape 185 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k)) 186 | 187 | def _reset(self): 188 | ob = self.env.reset() 189 | for _ in range(self.k): 190 | self.frames.append(ob) 191 | return self._get_ob() 192 | 193 | def _step(self, action): 194 | ob, reward, done, info = self.env.step(action) 195 | self.frames.append(ob) 196 | return self._get_ob(), reward, done, info 197 | 198 | def _get_ob(self): 199 | assert len(self.frames) == self.k 200 | return LazyFrames(list(self.frames)) 201 | 202 | 203 | class ScaledFloatFrame(gym.ObservationWrapper): 204 | def _observation(self, obs): 205 | # careful! This undoes the memory optimization, use 206 | # with smaller replay buffers only. 207 | return np.array(obs).astype(np.float32) / 255.0 208 | 209 | 210 | def wrap_dqn(env): 211 | """Apply a common set of wrappers for Atari games.""" 212 | assert 'NoFrameskip' in env.spec.id 213 | env = EpisodicLifeEnv(env) 214 | env = NoopResetEnv(env, noop_max=30) 215 | env = MaxAndSkipEnv(env, skip=4) 216 | if 'FIRE' in env.unwrapped.get_action_meanings(): 217 | env = FireResetEnv(env) 218 | env = ProcessFrame84(env) 219 | env = FrameStack(env, 4) 220 | env = ClippedRewardsWrapper(env) 221 | return env 222 | 223 | 224 | class A2cProcessFrame(gym.Wrapper): 225 | def __init__(self, env): 226 | gym.Wrapper.__init__(self, env) 227 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 228 | 229 | def _step(self, action): 230 | ob, reward, done, info = self.env.step(action) 231 | return A2cProcessFrame.process(ob), reward, done, info 232 | 233 | def _reset(self): 234 | return A2cProcessFrame.process(self.env.reset()) 235 | 236 | @staticmethod 237 | def process(frame): 238 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 239 | frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA) 240 | return frame.reshape(84, 84, 1) 241 | -------------------------------------------------------------------------------- /baselines/common/misc_util.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import os 4 | import pickle 5 | import random 6 | import tempfile 7 | import time 8 | import zipfile 9 | 10 | 11 | def zipsame(*seqs): 12 | L = len(seqs[0]) 13 | assert all(len(seq) == L for seq in seqs[1:]) 14 | return zip(*seqs) 15 | 16 | 17 | def unpack(seq, sizes): 18 | """ 19 | Unpack 'seq' into a sequence of lists, with lengths specified by 'sizes'. 
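# --- Editor's note: illustrative sketch, not part of the original file ---
# Tying together the wrappers from atari_wrappers_deprecated.py above: after
# wrap_dqn() the observations are LazyFrames holding four stacked 84x84x1
# frames, so converting one to an array yields shape (84, 84, 4). Assumes a
# pre-0.26 gym with Atari support.
import gym
import numpy as np
from baselines.common.atari_wrappers_deprecated import wrap_dqn

env = wrap_dqn(gym.make("PongNoFrameskip-v4"))
obs = env.reset()
assert np.array(obs).shape == (84, 84, 4)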
20 | None = just one bare element, not a list 21 | 22 | Example: 23 | unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6]) 24 | """ 25 | seq = list(seq) 26 | it = iter(seq) 27 | assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes) 28 | for size in sizes: 29 | if size is None: 30 | yield it.__next__() 31 | else: 32 | li = [] 33 | for _ in range(size): 34 | li.append(it.__next__()) 35 | yield li 36 | 37 | 38 | class EzPickle(object): 39 | """Objects that are pickled and unpickled via their constructor 40 | arguments. 41 | 42 | Example usage: 43 | 44 | class Dog(Animal, EzPickle): 45 | def __init__(self, furcolor, tailkind="bushy"): 46 | Animal.__init__() 47 | EzPickle.__init__(furcolor, tailkind) 48 | ... 49 | 50 | When this object is unpickled, a new Dog will be constructed by passing the provided 51 | furcolor and tailkind into the constructor. However, philosophers are still not sure 52 | whether it is still the same dog. 53 | 54 | This is generally needed only for environments which wrap C/C++ code, such as MuJoCo 55 | and Atari. 56 | """ 57 | 58 | def __init__(self, *args, **kwargs): 59 | self._ezpickle_args = args 60 | self._ezpickle_kwargs = kwargs 61 | 62 | def __getstate__(self): 63 | return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} 64 | 65 | def __setstate__(self, d): 66 | out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) 67 | self.__dict__.update(out.__dict__) 68 | 69 | 70 | def set_global_seeds(i): 71 | try: 72 | import tensorflow as tf 73 | except ImportError: 74 | pass 75 | else: 76 | tf.set_random_seed(i) 77 | np.random.seed(i) 78 | random.seed(i) 79 | 80 | 81 | def pretty_eta(seconds_left): 82 | """Print the number of seconds in human readable format. 83 | 84 | Examples: 85 | 2 days 86 | 2 hours and 37 minutes 87 | less than a minute 88 | 89 | Paramters 90 | --------- 91 | seconds_left: int 92 | Number of seconds to be converted to the ETA 93 | Returns 94 | ------- 95 | eta: str 96 | String representing the pretty ETA. 97 | """ 98 | minutes_left = seconds_left // 60 99 | seconds_left %= 60 100 | hours_left = minutes_left // 60 101 | minutes_left %= 60 102 | days_left = hours_left // 24 103 | hours_left %= 24 104 | 105 | def helper(cnt, name): 106 | return "{} {}{}".format(str(cnt), name, ('s' if cnt > 1 else '')) 107 | 108 | if days_left > 0: 109 | msg = helper(days_left, 'day') 110 | if hours_left > 0: 111 | msg += ' and ' + helper(hours_left, 'hour') 112 | return msg 113 | if hours_left > 0: 114 | msg = helper(hours_left, 'hour') 115 | if minutes_left > 0: 116 | msg += ' and ' + helper(minutes_left, 'minute') 117 | return msg 118 | if minutes_left > 0: 119 | return helper(minutes_left, 'minute') 120 | return 'less than a minute' 121 | 122 | 123 | class RunningAvg(object): 124 | def __init__(self, gamma, init_value=None): 125 | """Keep a running estimate of a quantity. This is a bit like mean 126 | but more sensitive to recent changes. 127 | 128 | Parameters 129 | ---------- 130 | gamma: float 131 | Must be between 0 and 1, where 0 is the most sensitive to recent 132 | changes. 133 | init_value: float or None 134 | Initial value of the estimate. If None, it will be set on the first update. 135 | """ 136 | self._value = init_value 137 | self._gamma = gamma 138 | 139 | def update(self, new_val): 140 | """Update the estimate. 141 | 142 | Parameters 143 | ---------- 144 | new_val: float 145 | new observated value of estimated quantity. 
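# --- Editor's note: illustrative sketch, not part of the original file ---
# RunningAvg is an exponential moving average: with gamma=0.9 each update
# keeps 90% of the old estimate and mixes in 10% of the new value.
from baselines.common.misc_util import RunningAvg

avg = RunningAvg(gamma=0.9, init_value=0.0)
avg.update(10.0)                      # 0.9 * 0.0 + 0.1 * 10.0
assert abs(float(avg) - 1.0) < 1e-8
avg.update(10.0)                      # 0.9 * 1.0 + 0.1 * 10.0
assert abs(float(avg) - 1.9) < 1e-8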
146 | """ 147 | if self._value is None: 148 | self._value = new_val 149 | else: 150 | self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val 151 | 152 | def __float__(self): 153 | """Get the current estimate""" 154 | return self._value 155 | 156 | 157 | class SimpleMonitor(gym.Wrapper): 158 | def __init__(self, env=None): 159 | """Adds two qunatities to info returned by every step: 160 | 161 | num_steps: int 162 | Number of steps takes so far 163 | rewards: [float] 164 | All the cumulative rewards for the episodes completed so far. 165 | """ 166 | super().__init__(env) 167 | # current episode state 168 | self._current_reward = None 169 | self._num_steps = None 170 | # temporary monitor state that we do not save 171 | self._time_offset = None 172 | self._total_steps = None 173 | # monitor state 174 | self._episode_rewards = [] 175 | self._episode_lengths = [] 176 | self._episode_end_times = [] 177 | 178 | def _reset(self): 179 | obs = self.env.reset() 180 | # recompute temporary state if needed 181 | if self._time_offset is None: 182 | self._time_offset = time.time() 183 | if len(self._episode_end_times) > 0: 184 | self._time_offset -= self._episode_end_times[-1] 185 | if self._total_steps is None: 186 | self._total_steps = sum(self._episode_lengths) 187 | # update monitor state 188 | if self._current_reward is not None: 189 | self._episode_rewards.append(self._current_reward) 190 | self._episode_lengths.append(self._num_steps) 191 | self._episode_end_times.append(time.time() - self._time_offset) 192 | # reset episode state 193 | self._current_reward = 0 194 | self._num_steps = 0 195 | 196 | return obs 197 | 198 | def _step(self, action): 199 | obs, rew, done, info = self.env.step(action) 200 | self._current_reward += rew 201 | self._num_steps += 1 202 | self._total_steps += 1 203 | info['steps'] = self._total_steps 204 | info['rewards'] = self._episode_rewards 205 | return (obs, rew, done, info) 206 | 207 | def get_state(self): 208 | return { 209 | 'env_id': self.env.unwrapped.spec.id, 210 | 'episode_data': { 211 | 'episode_rewards': self._episode_rewards, 212 | 'episode_lengths': self._episode_lengths, 213 | 'episode_end_times': self._episode_end_times, 214 | 'initial_reset_time': 0, 215 | } 216 | } 217 | 218 | def set_state(self, state): 219 | assert state['env_id'] == self.env.unwrapped.spec.id 220 | ed = state['episode_data'] 221 | self._episode_rewards = ed['episode_rewards'] 222 | self._episode_lengths = ed['episode_lengths'] 223 | self._episode_end_times = ed['episode_end_times'] 224 | 225 | 226 | def boolean_flag(parser, name, default=False, help=None): 227 | """Add a boolean flag to argparse parser. 
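# --- Editor's note: illustrative sketch, not part of the original file ---
# boolean_flag() below registers a --name / --no-name pair on an argparse
# parser; the last flag given on the command line wins.
import argparse
from baselines.common.misc_util import boolean_flag

parser = argparse.ArgumentParser()
boolean_flag(parser, "dueling", default=False, help="use the dueling head")
assert parser.parse_args([]).dueling is False
assert parser.parse_args(["--dueling"]).dueling is True
assert parser.parse_args(["--no-dueling"]).dueling is False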
228 | 229 | Parameters 230 | ---------- 231 | parser: argparse.Parser 232 | parser to add the flag to 233 | name: str 234 | -- will enable the flag, while --no- will disable it 235 | default: bool or None 236 | default value of the flag 237 | help: str 238 | help string for the flag 239 | """ 240 | parser.add_argument("--" + name, action="store_true", default=default, help=help) 241 | parser.add_argument("--no-" + name, action="store_false", dest=name) 242 | 243 | 244 | def get_wrapper_by_name(env, classname): 245 | """Given an a gym environment possibly wrapped multiple times, returns a wrapper 246 | of class named classname or raises ValueError if no such wrapper was applied 247 | 248 | Parameters 249 | ---------- 250 | env: gym.Env of gym.Wrapper 251 | gym environment 252 | classname: str 253 | name of the wrapper 254 | 255 | Returns 256 | ------- 257 | wrapper: gym.Wrapper 258 | wrapper named classname 259 | """ 260 | currentenv = env 261 | while True: 262 | if classname == currentenv.class_name(): 263 | return currentenv 264 | elif isinstance(currentenv, gym.Wrapper): 265 | currentenv = currentenv.env 266 | else: 267 | raise ValueError("Couldn't find wrapper named %s" % classname) 268 | 269 | 270 | def relatively_safe_pickle_dump(obj, path, compression=False): 271 | """This is just like regular pickle dump, except from the fact that failure cases are 272 | different: 273 | 274 | - It's never possible that we end up with a pickle in corrupted state. 275 | - If a there was a different file at the path, that file will remain unchanged in the 276 | even of failure (provided that filesystem rename is atomic). 277 | - it is sometimes possible that we end up with useless temp file which needs to be 278 | deleted manually (it will be removed automatically on the next function call) 279 | 280 | The indended use case is periodic checkpoints of experiment state, such that we never 281 | corrupt previous checkpoints if the current one fails. 282 | 283 | Parameters 284 | ---------- 285 | obj: object 286 | object to pickle 287 | path: str 288 | path to the output file 289 | compression: bool 290 | if true pickle will be compressed 291 | """ 292 | temp_storage = path + ".relatively_safe" 293 | if compression: 294 | # Using gzip here would be simpler, but the size is limited to 2GB 295 | with tempfile.NamedTemporaryFile() as uncompressed_file: 296 | pickle.dump(obj, uncompressed_file) 297 | with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip: 298 | myzip.write(uncompressed_file.name, "data") 299 | else: 300 | with open(temp_storage, "wb") as f: 301 | pickle.dump(obj, f) 302 | os.rename(temp_storage, path) 303 | 304 | 305 | def pickle_load(path, compression=False): 306 | """Unpickle a possible compressed pickle. 307 | 308 | Parameters 309 | ---------- 310 | path: str 311 | path to the output file 312 | compression: bool 313 | if true assumes that pickle was compressed when created and attempts decompression. 
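# --- Editor's note: illustrative sketch, not part of the original file ---
# Round-tripping a checkpoint through the helpers above: the dump writes to a
# temporary file first and then os.rename()s it into place, so a crash never
# leaves a half-written pickle at `path`. pickle_load (just below) reads it
# back.
import os
import tempfile
from baselines.common.misc_util import relatively_safe_pickle_dump, pickle_load

state = {"num_iters": 1000, "episode_rewards": [1.0, 2.0, 3.0]}
path = os.path.join(tempfile.mkdtemp(), "training_state.pkl")
relatively_safe_pickle_dump(state, path)
assert pickle_load(path) == state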
314 | 315 | Returns 316 | ------- 317 | obj: object 318 | the unpickled object 319 | """ 320 | 321 | if compression: 322 | with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: 323 | with myzip.open("data") as f: 324 | return pickle.load(f) 325 | else: 326 | with open(path, "rb") as f: 327 | return pickle.load(f) 328 | -------------------------------------------------------------------------------- /baselines/deepq/simple.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import dill 4 | import tempfile 5 | import tensorflow as tf 6 | import zipfile 7 | 8 | import baselines.common.tf_util as U 9 | 10 | from baselines import logger 11 | from baselines.common.schedules import LinearSchedule 12 | from baselines import deepq 13 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer 14 | 15 | 16 | class ActWrapper(object): 17 | def __init__(self, act, act_params): 18 | self._act = act 19 | self._act_params = act_params 20 | 21 | @staticmethod 22 | def load(path, num_cpu=16): 23 | with open(path, "rb") as f: 24 | model_data, act_params = dill.load(f) 25 | act = deepq.build_act(**act_params) 26 | sess = U.make_session(num_cpu=num_cpu) 27 | sess.__enter__() 28 | with tempfile.TemporaryDirectory() as td: 29 | arc_path = os.path.join(td, "packed.zip") 30 | with open(arc_path, "wb") as f: 31 | f.write(model_data) 32 | 33 | zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) 34 | U.load_state(os.path.join(td, "model")) 35 | 36 | return ActWrapper(act, act_params) 37 | 38 | def __call__(self, *args, **kwargs): 39 | return self._act(*args, **kwargs) 40 | 41 | def save(self, path): 42 | """Save model to a pickle located at `path`""" 43 | with tempfile.TemporaryDirectory() as td: 44 | U.save_state(os.path.join(td, "model")) 45 | arc_name = os.path.join(td, "packed.zip") 46 | with zipfile.ZipFile(arc_name, 'w') as zipf: 47 | for root, dirs, files in os.walk(td): 48 | for fname in files: 49 | file_path = os.path.join(root, fname) 50 | if file_path != arc_name: 51 | zipf.write(file_path, os.path.relpath(file_path, td)) 52 | with open(arc_name, "rb") as f: 53 | model_data = f.read() 54 | with open(path, "wb") as f: 55 | dill.dump((model_data, self._act_params), f) 56 | 57 | 58 | def load(path, num_cpu=16): 59 | """Load act function that was returned by learn function. 60 | 61 | Parameters 62 | ---------- 63 | path: str 64 | path to the act function pickle 65 | num_cpu: int 66 | number of cpus to use for executing the policy 67 | 68 | Returns 69 | ------- 70 | act: ActWrapper 71 | function that takes a batch of observations 72 | and returns actions. 73 | """ 74 | return ActWrapper.load(path, num_cpu=num_cpu) 75 | 76 | 77 | def learn(env, 78 | q_func, 79 | lr=5e-4, 80 | max_timesteps=100000, 81 | buffer_size=50000, 82 | exploration_fraction=0.1, 83 | exploration_final_eps=0.02, 84 | train_freq=1, 85 | batch_size=32, 86 | print_freq=1, 87 | checkpoint_freq=10000, 88 | learning_starts=1000, 89 | gamma=1.0, 90 | target_network_update_freq=500, 91 | prioritized_replay=False, 92 | prioritized_replay_alpha=0.6, 93 | prioritized_replay_beta0=0.4, 94 | prioritized_replay_beta_iters=None, 95 | prioritized_replay_eps=1e-6, 96 | num_cpu=16, 97 | callback=None): 98 | """Train a deepq model. 
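# --- Editor's note: illustrative sketch, not part of the original file ---
# The replay buffers imported above: PrioritizedReplayBuffer.sample() returns
# the usual transition batch plus importance weights and the sampled indices,
# and update_priorities() feeds |td_error| + eps back in, which is the cycle
# learn() runs below. The transitions here are dummy 4-dimensional states.
import numpy as np
from baselines.deepq.replay_buffer import PrioritizedReplayBuffer

buffer = PrioritizedReplayBuffer(1000, alpha=0.6)
for i in range(100):
    s = np.full(4, float(i), dtype=np.float32)
    buffer.add(s, 0, float(i % 2), s + 1.0, 0.0)

obs_t, actions, rewards, obs_tp1, dones, weights, idxes = buffer.sample(32, beta=0.4)
td_errors = np.random.randn(32)
buffer.update_priorities(idxes, np.abs(td_errors) + 1e-6)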
99 | 100 | Parameters 101 | ------- 102 | env : gym.Env 103 | environment to train on 104 | q_func: (tf.Variable, int, str, bool) -> tf.Variable 105 | the model that takes the following inputs: 106 | observation_in: object 107 | the output of observation placeholder 108 | num_actions: int 109 | number of actions 110 | scope: str 111 | reuse: bool 112 | should be passed to outer variable scope 113 | and returns a tensor of shape (batch_size, num_actions) with values of every action. 114 | lr: float 115 | learning rate for adam optimizer 116 | max_timesteps: int 117 | number of env steps to optimizer for 118 | buffer_size: int 119 | size of the replay buffer 120 | exploration_fraction: float 121 | fraction of entire training period over which the exploration rate is annealed 122 | exploration_final_eps: float 123 | final value of random action probability 124 | train_freq: int 125 | update the model every `train_freq` steps. 126 | batch_size: int 127 | size of a batched sampled from replay buffer for training 128 | print_freq: int 129 | how often to print out training progress 130 | set to None to disable printing 131 | checkpoint_freq: int 132 | how often to save the model. This is so that the best version is restored 133 | at the end of the training. If you do not wish to restore the best version at 134 | the end of the training set this variable to None. 135 | learning_starts: int 136 | how many steps of the model to collect transitions for before learning starts 137 | gamma: float 138 | discount factor 139 | target_network_update_freq: int 140 | update the target network every `target_network_update_freq` steps. 141 | prioritized_replay: True 142 | if True prioritized replay buffer will be used. 143 | prioritized_replay_alpha: float 144 | alpha parameter for prioritized replay buffer 145 | prioritized_replay_beta0: float 146 | initial value of beta for prioritized replay buffer 147 | prioritized_replay_beta_iters: int 148 | number of iterations over which beta will be annealed from initial value 149 | to 1.0. If set to None equals to max_timesteps. 150 | prioritized_replay_eps: float 151 | epsilon to add to the TD errors when updating priorities. 152 | num_cpu: int 153 | number of cpus to use for training 154 | callback: (locals, globals) -> None 155 | function called at every steps with state of the algorithm. 156 | If callback returns true training stops. 157 | 158 | Returns 159 | ------- 160 | act: ActWrapper 161 | Wrapper over act function. Adds ability to save it and load it. 162 | See header of baselines/deepq/categorical.py for details on the act function. 
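# --- Editor's note: illustrative sketch, not part of the original file ---
# The two annealing schedules learn() builds from these arguments: epsilon is
# annealed over the first exploration_fraction of training and then held at
# exploration_final_eps, while beta for prioritized replay is annealed to 1.0
# over the whole run.
from baselines.common.schedules import LinearSchedule

max_timesteps = 100000
exploration = LinearSchedule(schedule_timesteps=int(0.1 * max_timesteps),
                             initial_p=1.0, final_p=0.02)
beta_schedule = LinearSchedule(max_timesteps, initial_p=0.4, final_p=1.0)

assert exploration.value(0) == 1.0
assert abs(exploration.value(5000) - 0.51) < 1e-9
assert abs(exploration.value(50000) - 0.02) < 1e-9   # clamped after the first 10% of steps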
163 | """ 164 | # Create all the functions necessary to train the model 165 | 166 | sess = U.make_session(num_cpu=num_cpu) 167 | sess.__enter__() 168 | 169 | def make_obs_ph(name): 170 | return U.BatchInput(env.observation_space.shape, name=name) 171 | 172 | act, train, update_target, debug = deepq.build_train( 173 | make_obs_ph=make_obs_ph, 174 | q_func=q_func, 175 | num_actions=env.action_space.n, 176 | optimizer=tf.train.AdamOptimizer(learning_rate=lr), 177 | gamma=gamma, 178 | grad_norm_clipping=10 179 | ) 180 | act_params = { 181 | 'make_obs_ph': make_obs_ph, 182 | 'q_func': q_func, 183 | 'num_actions': env.action_space.n, 184 | } 185 | # Create the replay buffer 186 | if prioritized_replay: 187 | replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) 188 | if prioritized_replay_beta_iters is None: 189 | prioritized_replay_beta_iters = max_timesteps 190 | beta_schedule = LinearSchedule(prioritized_replay_beta_iters, 191 | initial_p=prioritized_replay_beta0, 192 | final_p=1.0) 193 | else: 194 | replay_buffer = ReplayBuffer(buffer_size) 195 | beta_schedule = None 196 | # Create the schedule for exploration starting from 1. 197 | exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), 198 | initial_p=1.0, 199 | final_p=exploration_final_eps) 200 | 201 | # Initialize the parameters and copy them to the target network. 202 | U.initialize() 203 | update_target() 204 | 205 | episode_rewards = [0.0] 206 | saved_mean_reward = None 207 | obs = env.reset() 208 | with tempfile.TemporaryDirectory() as td: 209 | model_saved = False 210 | model_file = os.path.join(td, "model") 211 | for t in range(max_timesteps): 212 | if callback is not None: 213 | if callback(locals(), globals()): 214 | break 215 | # Take action and update exploration to the newest value 216 | action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] 217 | new_obs, rew, done, _ = env.step(action) 218 | # Store transition in the replay buffer. 219 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 220 | obs = new_obs 221 | 222 | episode_rewards[-1] += rew 223 | if done: 224 | obs = env.reset() 225 | episode_rewards.append(0.0) 226 | 227 | if t > learning_starts and t % train_freq == 0: 228 | # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 229 | if prioritized_replay: 230 | experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) 231 | (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience 232 | else: 233 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) 234 | weights, batch_idxes = np.ones_like(rewards), None 235 | td_errors = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) 236 | if prioritized_replay: 237 | new_priorities = np.abs(td_errors) + prioritized_replay_eps 238 | replay_buffer.update_priorities(batch_idxes, new_priorities) 239 | 240 | if t > learning_starts and t % target_network_update_freq == 0: 241 | # Update target network periodically. 
242 | update_target() 243 | 244 | mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) 245 | num_episodes = len(episode_rewards) 246 | if done and print_freq is not None and len(episode_rewards) % print_freq == 0: 247 | logger.record_tabular("steps", t) 248 | logger.record_tabular("episodes", num_episodes) 249 | logger.record_tabular("mean 100 episode reward", mean_100ep_reward) 250 | logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) 251 | logger.dump_tabular() 252 | 253 | if (checkpoint_freq is not None and t > learning_starts and 254 | num_episodes > 100 and t % checkpoint_freq == 0): 255 | if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: 256 | if print_freq is not None: 257 | logger.log("Saving model due to mean reward increase: {} -> {}".format( 258 | saved_mean_reward, mean_100ep_reward)) 259 | U.save_state(model_file) 260 | model_saved = True 261 | saved_mean_reward = mean_100ep_reward 262 | if model_saved: 263 | if print_freq is not None: 264 | logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) 265 | U.load_state(model_file) 266 | 267 | return ActWrapper(act, act_params) 268 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | import os 5 | import tensorflow as tf 6 | import tempfile 7 | import time 8 | 9 | import baselines.common.tf_util as U 10 | 11 | from baselines import logger 12 | from baselines import deepq 13 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer 14 | from baselines.common.misc_util import ( 15 | boolean_flag, 16 | pickle_load, 17 | pretty_eta, 18 | relatively_safe_pickle_dump, 19 | set_global_seeds, 20 | RunningAvg, 21 | SimpleMonitor 22 | ) 23 | from baselines.common.schedules import LinearSchedule, PiecewiseSchedule 24 | # when updating this to non-deperecated ones, it is important to 25 | # copy over LazyFrames 26 | from baselines.common.atari_wrappers_deprecated import wrap_dqn 27 | from baselines.common.azure_utils import Container 28 | from .model import model, dueling_model 29 | 30 | 31 | def parse_args(): 32 | parser = argparse.ArgumentParser("DQN experiments for Atari games") 33 | # Environment 34 | parser.add_argument("--env", type=str, default="Pong", help="name of the game") 35 | parser.add_argument("--seed", type=int, default=42, help="which seed to use") 36 | # Core DQN parameters 37 | parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size") 38 | parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer") 39 | parser.add_argument("--num-steps", type=int, default=int(2e8), help="total number of steps to run the environment for") 40 | parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time") 41 | parser.add_argument("--learning-freq", type=int, default=4, help="number of iterations between every optimization step") 42 | parser.add_argument("--target-update-freq", type=int, default=40000, help="number of iterations between every target network update") 43 | # Bells and whistles 44 | boolean_flag(parser, "double-q", default=True, help="whether or not to use double q learning") 45 | boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model") 46 | 
boolean_flag(parser, "prioritized", default=False, help="whether or not to use prioritized replay buffer") 47 | parser.add_argument("--prioritized-alpha", type=float, default=0.6, help="alpha parameter for prioritized replay buffer") 48 | parser.add_argument("--prioritized-beta0", type=float, default=0.4, help="initial value of beta parameters for prioritized replay") 49 | parser.add_argument("--prioritized-eps", type=float, default=1e-6, help="eps parameter for prioritized replay buffer") 50 | # Checkpointing 51 | parser.add_argument("--save-dir", type=str, default=None, help="directory in which training state and model should be saved.") 52 | parser.add_argument("--save-azure-container", type=str, default=None, 53 | help="It present data will saved/loaded from Azure. Should be in format ACCOUNT_NAME:ACCOUNT_KEY:CONTAINER") 54 | parser.add_argument("--save-freq", type=int, default=1e6, help="save model once every time this many iterations are completed") 55 | boolean_flag(parser, "load-on-start", default=True, help="if true and model was previously saved then training will be resumed") 56 | return parser.parse_args() 57 | 58 | 59 | def make_env(game_name): 60 | env = gym.make(game_name + "NoFrameskip-v4") 61 | monitored_env = SimpleMonitor(env) # puts rewards and number of steps in info, before environment is wrapped 62 | env = wrap_dqn(monitored_env) # applies a bunch of modification to simplify the observation space (downsample, make b/w) 63 | return env, monitored_env 64 | 65 | 66 | def maybe_save_model(savedir, container, state): 67 | """This function checkpoints the model and state of the training algorithm.""" 68 | if savedir is None: 69 | return 70 | start_time = time.time() 71 | model_dir = "model-{}".format(state["num_iters"]) 72 | U.save_state(os.path.join(savedir, model_dir, "saved")) 73 | if container is not None: 74 | container.put(os.path.join(savedir, model_dir), model_dir) 75 | relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True) 76 | if container is not None: 77 | container.put(os.path.join(savedir, 'training_state.pkl.zip'), 'training_state.pkl.zip') 78 | relatively_safe_pickle_dump(state["monitor_state"], os.path.join(savedir, 'monitor_state.pkl')) 79 | if container is not None: 80 | container.put(os.path.join(savedir, 'monitor_state.pkl'), 'monitor_state.pkl') 81 | logger.log("Saved model in {} seconds\n".format(time.time() - start_time)) 82 | 83 | 84 | def maybe_load_model(savedir, container): 85 | """Load model if present at the specified path.""" 86 | if savedir is None: 87 | return 88 | 89 | state_path = os.path.join(os.path.join(savedir, 'training_state.pkl.zip')) 90 | if container is not None: 91 | logger.log("Attempting to download model from Azure") 92 | found_model = container.get(savedir, 'training_state.pkl.zip') 93 | else: 94 | found_model = os.path.exists(state_path) 95 | if found_model: 96 | state = pickle_load(state_path, compression=True) 97 | model_dir = "model-{}".format(state["num_iters"]) 98 | if container is not None: 99 | container.get(savedir, model_dir) 100 | U.load_state(os.path.join(savedir, model_dir, "saved")) 101 | logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"])) 102 | return state 103 | 104 | 105 | if __name__ == '__main__': 106 | args = parse_args() 107 | # Parse savedir and azure container. 
108 | savedir = args.save_dir 109 | if args.save_azure_container is not None: 110 | account_name, account_key, container_name = args.save_azure_container.split(":") 111 | container = Container(account_name=account_name, 112 | account_key=account_key, 113 | container_name=container_name, 114 | maybe_create=True) 115 | if savedir is None: 116 | # Careful! This will not get cleaned up. Docker spoils the developers. 117 | savedir = tempfile.TemporaryDirectory().name 118 | else: 119 | container = None 120 | # Create and seed the env. 121 | env, monitored_env = make_env(args.env) 122 | if args.seed > 0: 123 | set_global_seeds(args.seed) 124 | env.unwrapped.seed(args.seed) 125 | 126 | with U.make_session(4) as sess: 127 | # Create training graph and replay buffer 128 | act, train, update_target, debug = deepq.build_train( 129 | make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), 130 | q_func=dueling_model if args.dueling else model, 131 | num_actions=env.action_space.n, 132 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4), 133 | gamma=0.99, 134 | grad_norm_clipping=10, 135 | double_q=args.double_q 136 | ) 137 | 138 | approximate_num_iters = args.num_steps / 4 139 | exploration = PiecewiseSchedule([ 140 | (0, 1.0), 141 | (approximate_num_iters / 50, 0.1), 142 | (approximate_num_iters / 5, 0.01) 143 | ], outside_value=0.01) 144 | 145 | if args.prioritized: 146 | replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size, args.prioritized_alpha) 147 | beta_schedule = LinearSchedule(approximate_num_iters, initial_p=args.prioritized_beta0, final_p=1.0) 148 | else: 149 | replay_buffer = ReplayBuffer(args.replay_buffer_size) 150 | 151 | U.initialize() 152 | update_target() 153 | num_iters = 0 154 | 155 | # Load the model 156 | state = maybe_load_model(savedir, container) 157 | if state is not None: 158 | num_iters, replay_buffer = state["num_iters"], state["replay_buffer"], 159 | monitored_env.set_state(state["monitor_state"]) 160 | 161 | start_time, start_steps = None, None 162 | steps_per_iter = RunningAvg(0.999) 163 | iteration_time_est = RunningAvg(0.999) 164 | obs = env.reset() 165 | 166 | # Main trianing loop 167 | while True: 168 | num_iters += 1 169 | # Take action and store transition in the replay buffer. 170 | action = act(np.array(obs)[None], update_eps=exploration.value(num_iters))[0] 171 | new_obs, rew, done, info = env.step(action) 172 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 173 | obs = new_obs 174 | if done: 175 | obs = env.reset() 176 | 177 | if (num_iters > max(5 * args.batch_size, args.replay_buffer_size // 20) and 178 | num_iters % args.learning_freq == 0): 179 | # Sample a bunch of transitions from replay buffer 180 | if args.prioritized: 181 | experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(num_iters)) 182 | (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience 183 | else: 184 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) 185 | weights = np.ones_like(rewards) 186 | # Minimize the error in Bellman's equation and compute TD-error 187 | td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) 188 | # Update the priorities in the replay buffer 189 | if args.prioritized: 190 | new_priorities = np.abs(td_errors) + args.prioritized_eps 191 | replay_buffer.update_priorities(batch_idxes, new_priorities) 192 | # Update target network. 
193 | if num_iters % args.target_update_freq == 0: 194 | update_target() 195 | 196 | if start_time is not None: 197 | steps_per_iter.update(info['steps'] - start_steps) 198 | iteration_time_est.update(time.time() - start_time) 199 | start_time, start_steps = time.time(), info["steps"] 200 | 201 | # Save the model and training state. 202 | if num_iters > 0 and (num_iters % args.save_freq == 0 or info["steps"] > args.num_steps): 203 | maybe_save_model(savedir, container, { 204 | 'replay_buffer': replay_buffer, 205 | 'num_iters': num_iters, 206 | 'monitor_state': monitored_env.get_state() 207 | }) 208 | 209 | if info["steps"] > args.num_steps: 210 | break 211 | 212 | if done: 213 | steps_left = args.num_steps - info["steps"] 214 | completion = np.round(info["steps"] / args.num_steps, 1) 215 | 216 | logger.record_tabular("% completion", completion) 217 | logger.record_tabular("steps", info["steps"]) 218 | logger.record_tabular("iters", num_iters) 219 | logger.record_tabular("episodes", len(info["rewards"])) 220 | logger.record_tabular("reward (100 epi mean)", np.mean(info["rewards"][-100:])) 221 | logger.record_tabular("exploration", exploration.value(num_iters)) 222 | if args.prioritized: 223 | logger.record_tabular("max priority", replay_buffer._max_priority) 224 | fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6) 225 | if steps_per_iter._value is not None else "calculating...") 226 | logger.dump_tabular() 227 | logger.log() 228 | logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate))) 229 | logger.log() 230 | -------------------------------------------------------------------------------- /baselines/deepq/build_graph.py: -------------------------------------------------------------------------------- 1 | """Deep Q learning graph 2 | 3 | The functions in this file can are used to create the following functions: 4 | 5 | ======= act ======== 6 | 7 | Function to chose an action given an observation 8 | 9 | Parameters 10 | ---------- 11 | observation: object 12 | Observation that can be feed into the output of make_obs_ph 13 | stochastic: bool 14 | if set to False all the actions are always deterministic (default False) 15 | update_eps_ph: float 16 | update epsilon a new value, if negative not update happens 17 | (default: no update) 18 | 19 | Returns 20 | ------- 21 | Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for 22 | every element of the batch. 23 | 24 | 25 | ======= train ======= 26 | 27 | Function that takes a transition (s,a,r,s') and optimizes Bellman equation's error: 28 | 29 | td_error = Q(s,a) - (r + gamma * max_a' Q(s', a')) 30 | loss = huber_loss[td_error] 31 | 32 | Parameters 33 | ---------- 34 | obs_t: object 35 | a batch of observations 36 | action: np.array 37 | actions that were selected upon seeing obs_t. 38 | dtype must be int32 and shape must be (batch_size,) 39 | reward: np.array 40 | immediate reward attained after executing those actions 41 | dtype must be float32 and shape must be (batch_size,) 42 | obs_tp1: object 43 | observations that followed obs_t 44 | done: np.array 45 | 1 if obs_t was the last observation in the episode and 0 otherwise 46 | obs_tp1 gets ignored, but must be of the valid shape. 
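# --- Editor's note: illustrative sketch, not part of the original file ---
# The quantity the train function optimizes, written out in numpy for a batch
# of two transitions; the done mask removes the bootstrap term exactly as the
# `done` argument described above.
import numpy as np

gamma = 0.99
q_t_selected = np.array([1.0, 2.0])     # Q(s, a) for the actions actually taken
q_tp1_best = np.array([3.0, 4.0])       # max_a' Q(s', a') from the target network
rew = np.array([0.0, 1.0])
done = np.array([0.0, 1.0])             # second transition ends its episode

target = rew + gamma * (1.0 - done) * q_tp1_best
td_error = q_t_selected - target        # [1.0 - 2.97, 2.0 - 1.0] = [-1.97, 1.0]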
47 | dtype must be float32 and shape must be (batch_size,) 48 | weight: np.array 49 | imporance weights for every element of the batch (gradient is multiplied 50 | by the importance weight) dtype must be float32 and shape must be (batch_size,) 51 | 52 | Returns 53 | ------- 54 | td_error: np.array 55 | a list of differences between Q(s,a) and the target in Bellman's equation. 56 | dtype is float32 and shape is (batch_size,) 57 | 58 | ======= update_target ======== 59 | 60 | copy the parameters from optimized Q function to the target Q function. 61 | In Q learning we actually optimize the following error: 62 | 63 | Q(s,a) - (r + gamma * max_a' Q'(s', a')) 64 | 65 | Where Q' is lagging behind Q to stablize the learning. For example for Atari 66 | 67 | Q' is set to Q once every 10000 updates training steps. 68 | 69 | """ 70 | import tensorflow as tf 71 | import baselines.common.tf_util as U 72 | from cleverhans.attacks import FastGradientMethod, BasicIterativeMethod, CarliniWagnerL2 73 | from cleverhans.model import CallableModelWrapper 74 | 75 | 76 | def build_act(make_obs_ph, q_func, num_actions, attack=None, scope="deepq", reuse=None, model_path=''): 77 | """Creates the act function: 78 | 79 | Parameters 80 | ---------- 81 | make_obs_ph: str -> tf.placeholder or TfInput 82 | a function that take a name and creates a placeholder of input with that name 83 | q_func: (tf.Variable, int, str, bool) -> tf.Variable 84 | the model that takes the following inputs: 85 | observation_in: object 86 | the output of observation placeholder 87 | num_actions: int 88 | number of actions 89 | scope: str 90 | reuse: bool 91 | should be passed to outer variable scope 92 | and returns a tensor of shape (batch_size, num_actions) with values of every action. 93 | num_actions: int 94 | number of actions. 95 | scope: str or VariableScope 96 | optional scope for variable_scope. 97 | reuse: bool or None 98 | whether or not the variables should be reused. To be able to reuse the scope must be given. 99 | 100 | Returns 101 | ------- 102 | act: (tf.Variable, bool, float) -> tf.Variable 103 | function to select and action given observation. 104 | ` See the top of the file for details. 
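# --- Editor's note: illustrative sketch, not part of the original file ---
# A hypothetical evaluation step using the (act, craft_adv_obs) pair that
# build_act returns when attack='fgsm' (or 'iterative' / 'cwl2'). It assumes
# build_act was already called with a valid model_path inside an active
# U.make_session() session; env, obs, act and craft_adv_obs are supplied by
# the caller, and the exact wiring in this repo's evaluation scripts may
# differ.
import numpy as np

def attacked_step(env, obs, act, craft_adv_obs):
    adv_obs = craft_adv_obs(np.array(obs)[None])[0]   # perturbed frame stack
    action = act(np.array(adv_obs)[None])[0]          # act on the attacked input
    return env.step(action)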
105 | """ 106 | with tf.variable_scope(scope, reuse=reuse): 107 | observations_ph = U.ensure_tf_input(make_obs_ph("observation")) 108 | stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") 109 | update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") 110 | 111 | eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) 112 | 113 | q_values = q_func(observations_ph.get(), num_actions, scope="q_func", concat_softmax=True) 114 | deterministic_actions = tf.argmax(q_values, axis=1) 115 | batch_size = tf.shape(observations_ph.get())[0] 116 | random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) 117 | chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps 118 | stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) 119 | 120 | output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) 121 | update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) 122 | 123 | act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], 124 | outputs=output_actions, 125 | givens={update_eps_ph: -1.0, stochastic_ph: True}, 126 | updates=[update_eps_expr]) 127 | 128 | # Load model before attacks graph construction so that TF won't 129 | # complain can't load parameters for attack 130 | U.load_state(model_path) 131 | 132 | if attack != None: 133 | if attack == 'fgsm': 134 | def wrapper(x): 135 | return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True) 136 | adversary = FastGradientMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session()) 137 | adv_observations = adversary.generate(observations_ph.get(), eps=1.0/255.0, 138 | clip_min=0, clip_max=1.0) * 255.0 139 | elif attack == 'iterative': 140 | def wrapper(x): 141 | return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True) 142 | adversary = BasicIterativeMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session()) 143 | adv_observations = adversary.generate(observations_ph.get(), eps=1.0/255.0, 144 | clip_min=0, clip_max=1.0) * 255.0 145 | elif attack == 'cwl2': 146 | def wrapper(x): 147 | return q_func(x, num_actions, scope="q_func", reuse=True) 148 | adversary = CarliniWagnerL2(CallableModelWrapper(wrapper, 'logits'), sess=U.get_session()) 149 | cw_params = {'binary_search_steps': 1, 150 | 'max_iterations': 100, 151 | 'learning_rate': 0.1, 152 | 'initial_const': 10, 153 | 'clip_min': 0, 154 | 'clip_max': 1.0} 155 | adv_observations = adversary.generate(observations_ph.get(), **cw_params) * 255.0 156 | 157 | craft_adv_obs = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], 158 | outputs=adv_observations, 159 | givens={update_eps_ph: -1.0, stochastic_ph: True}, 160 | updates=[update_eps_expr]) 161 | 162 | if attack == None: 163 | craft_adv_obs = None 164 | 165 | return act, craft_adv_obs 166 | 167 | 168 | def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None): 169 | """Creates the train function: 170 | 171 | Parameters 172 | ---------- 173 | make_obs_ph: str -> tf.placeholder or TfInput 174 | a function that takes a name and creates a placeholder of input with that name 175 | q_func: (tf.Variable, int, str, bool) -> tf.Variable 176 | the model that takes the following inputs: 177 | observation_in: object 178 | the output of observation placeholder 179 | 
num_actions: int 180 | number of actions 181 | scope: str 182 | reuse: bool 183 | should be passed to outer variable scope 184 | and returns a tensor of shape (batch_size, num_actions) with values of every action. 185 | num_actions: int 186 | number of actions 187 | reuse: bool 188 | whether or not to reuse the graph variables 189 | optimizer: tf.train.Optimizer 190 | optimizer to use for the Q-learning objective. 191 | grad_norm_clipping: float or None 192 | clip gradient norms to this value. If None no clipping is performed. 193 | gamma: float 194 | discount rate. 195 | double_q: bool 196 | if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). 197 | In general it is a good idea to keep it enabled. 198 | scope: str or VariableScope 199 | optional scope for variable_scope. 200 | reuse: bool or None 201 | whether or not the variables should be reused. To be able to reuse the scope must be given. 202 | 203 | Returns 204 | ------- 205 | act: (tf.Variable, bool, float) -> tf.Variable 206 | function to select and action given observation. 207 | ` See the top of the file for details. 208 | train: (object, np.array, np.array, object, np.array, np.array) -> np.array 209 | optimize the error in Bellman's equation. 210 | ` See the top of the file for details. 211 | update_target: () -> () 212 | copy the parameters from optimized Q function to the target Q function. 213 | ` See the top of the file for details. 214 | debug: {str: function} 215 | a bunch of functions to print debug data like q_values. 216 | """ 217 | act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) 218 | 219 | with tf.variable_scope(scope, reuse=reuse): 220 | # set up placeholders 221 | obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) 222 | act_t_ph = tf.placeholder(tf.int32, [None], name="action") 223 | rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") 224 | obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) 225 | done_mask_ph = tf.placeholder(tf.float32, [None], name="done") 226 | importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") 227 | 228 | # q network evaluation 229 | q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act 230 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) 231 | 232 | # target q network evalution 233 | q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") 234 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) 235 | 236 | # q scores for actions which we know were selected in the given state. 
237 | q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) 238 | 239 | # compute estimate of best possible value starting from state at t + 1 240 | if double_q: 241 | q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) 242 | q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) 243 | q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) 244 | else: 245 | q_tp1_best = tf.reduce_max(q_tp1, 1) 246 | q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best 247 | 248 | # compute RHS of bellman equation 249 | q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked 250 | 251 | # compute the error (potentially clipped) 252 | td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) 253 | errors = U.huber_loss(td_error) 254 | weighted_error = tf.reduce_mean(importance_weights_ph * errors) 255 | # compute optimization op (potentially with gradient clipping) 256 | if grad_norm_clipping is not None: 257 | optimize_expr = U.minimize_and_clip(optimizer, 258 | weighted_error, 259 | var_list=q_func_vars, 260 | clip_val=grad_norm_clipping) 261 | else: 262 | optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) 263 | 264 | # update_target_fn will be called periodically to copy Q network to target Q network 265 | update_target_expr = [] 266 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 267 | sorted(target_q_func_vars, key=lambda v: v.name)): 268 | update_target_expr.append(var_target.assign(var)) 269 | update_target_expr = tf.group(*update_target_expr) 270 | 271 | # Create callable functions 272 | train = U.function( 273 | inputs=[ 274 | obs_t_input, 275 | act_t_ph, 276 | rew_t_ph, 277 | obs_tp1_input, 278 | done_mask_ph, 279 | importance_weights_ph 280 | ], 281 | outputs=td_error, 282 | updates=[optimize_expr] 283 | ) 284 | update_target = U.function([], [], updates=[update_target_expr]) 285 | 286 | q_values = U.function([obs_t_input], q_t) 287 | 288 | return act_f, train, update_target, {'q_values': q_values} 289 | -------------------------------------------------------------------------------- /baselines/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | import builtins 4 | import functools 5 | import copy 6 | import os 7 | import collections 8 | 9 | 10 | # ================================================================ 11 | # Make consistent with numpy 12 | # ================================================================ 13 | 14 | clip = tf.clip_by_value 15 | 16 | 17 | def sum(x, axis=None, keepdims=False): 18 | axis = None if axis is None else [axis] 19 | return tf.reduce_sum(x, axis=axis, keep_dims=keepdims) 20 | 21 | 22 | def mean(x, axis=None, keepdims=False): 23 | axis = None if axis is None else [axis] 24 | return tf.reduce_mean(x, axis=axis, keep_dims=keepdims) 25 | 26 | 27 | def var(x, axis=None, keepdims=False): 28 | meanx = mean(x, axis=axis, keepdims=keepdims) 29 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 30 | 31 | 32 | def std(x, axis=None, keepdims=False): 33 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 34 | 35 | 36 | def max(x, axis=None, keepdims=False): 37 | axis = None if axis is None else [axis] 38 | return tf.reduce_max(x, axis=axis, keep_dims=keepdims) 39 | 40 | 41 | def min(x, axis=None, keepdims=False): 42 | axis = None if axis is None else [axis] 43 | return 
tf.reduce_min(x, axis=axis, keep_dims=keepdims) 44 | 45 | 46 | def concatenate(arrs, axis=0): 47 | return tf.concat(axis=axis, values=arrs) 48 | 49 | 50 | def argmax(x, axis=None): 51 | return tf.argmax(x, axis=axis) 52 | 53 | 54 | def switch(condition, then_expression, else_expression): 55 | """Switches between two operations depending on a scalar value (int or bool). 56 | Note that both `then_expression` and `else_expression` 57 | should be symbolic tensors of the *same shape*. 58 | 59 | # Arguments 60 | condition: scalar tensor. 61 | then_expression: TensorFlow operation. 62 | else_expression: TensorFlow operation. 63 | """ 64 | x_shape = copy.copy(then_expression.get_shape()) 65 | x = tf.cond(tf.cast(condition, 'bool'), 66 | lambda: then_expression, 67 | lambda: else_expression) 68 | x.set_shape(x_shape) 69 | return x 70 | 71 | # ================================================================ 72 | # Extras 73 | # ================================================================ 74 | 75 | 76 | def l2loss(params): 77 | if len(params) == 0: 78 | return tf.constant(0.0) 79 | else: 80 | return tf.add_n([sum(tf.square(p)) for p in params]) 81 | 82 | 83 | def lrelu(x, leak=0.2): 84 | f1 = 0.5 * (1 + leak) 85 | f2 = 0.5 * (1 - leak) 86 | return f1 * x + f2 * abs(x) 87 | 88 | 89 | def categorical_sample_logits(X): 90 | # https://github.com/tensorflow/tensorflow/issues/456 91 | U = tf.random_uniform(tf.shape(X)) 92 | return argmax(X - tf.log(-tf.log(U)), axis=1) 93 | 94 | 95 | # ================================================================ 96 | # Inputs 97 | # ================================================================ 98 | 99 | 100 | def is_placeholder(x): 101 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 102 | 103 | 104 | class TfInput(object): 105 | def __init__(self, name="(unnamed)"): 106 | """Generalized Tensorflow placeholder. The main differences are: 107 | - possibly uses multiple placeholders internally and returns multiple values 108 | - can apply light postprocessing to the value feed to placeholder. 109 | """ 110 | self.name = name 111 | 112 | def get(self): 113 | """Return the tf variable(s) representing the possibly postprocessed value 114 | of placeholder(s). 115 | """ 116 | raise NotImplemented() 117 | 118 | def make_feed_dict(data): 119 | """Given data input it to the placeholder(s).""" 120 | raise NotImplemented() 121 | 122 | 123 | class PlacholderTfInput(TfInput): 124 | def __init__(self, placeholder): 125 | """Wrapper for regular tensorflow placeholder.""" 126 | super().__init__(placeholder.name) 127 | self._placeholder = placeholder 128 | 129 | def get(self): 130 | return self._placeholder 131 | 132 | def make_feed_dict(self, data): 133 | return {self._placeholder: data} 134 | 135 | 136 | class BatchInput(PlacholderTfInput): 137 | def __init__(self, shape, dtype=tf.float32, name=None): 138 | """Creates a placeholder for a batch of tensors of a given shape and dtype 139 | 140 | Parameters 141 | ---------- 142 | shape: [int] 143 | shape of a single elemenet of the batch 144 | dtype: tf.dtype 145 | number representation used for tensor contents 146 | name: str 147 | name of the underlying placeholder 148 | """ 149 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 150 | 151 | 152 | class Uint8Input(PlacholderTfInput): 153 | def __init__(self, shape, name=None): 154 | """Takes input in uint8 format which is cast to float32 and divided by 255 155 | before passing it to the model. 156 | 157 | On GPU this ensures lower data transfer times. 
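Example (illustrative; assumes an Atari-style frame stack):
    obs_ph = Uint8Input([84, 84, 4], name="observation")
    # feed raw uint8 arrays; obs_ph.get() yields float32 values scaled to [0, 1]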
158 | 159 | Parameters 160 | ---------- 161 | shape: [int] 162 | shape of the tensor. 163 | name: str 164 | name of the underlying placeholder 165 | """ 166 | 167 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 168 | self._shape = shape 169 | self._output = tf.cast(super().get(), tf.float32) / 255.0 170 | 171 | def get(self): 172 | return self._output 173 | 174 | 175 | def ensure_tf_input(thing): 176 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" 177 | if isinstance(thing, TfInput): 178 | return thing 179 | elif is_placeholder(thing): 180 | return PlacholderTfInput(thing) 181 | else: 182 | raise ValueError("Must be a placeholder or TfInput") 183 | 184 | # ================================================================ 185 | # Mathematical utils 186 | # ================================================================ 187 | 188 | 189 | def huber_loss(x, delta=1.0): 190 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 191 | return tf.where( 192 | tf.abs(x) < delta, 193 | tf.square(x) * 0.5, 194 | delta * (tf.abs(x) - 0.5 * delta) 195 | ) 196 | 197 | # ================================================================ 198 | # Optimizer utils 199 | # ================================================================ 200 | 201 | 202 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 203 | """Minimized `objective` using `optimizer` w.r.t. variables in 204 | `var_list` while ensure the norm of the gradients for each 205 | variable is clipped to `clip_val` 206 | """ 207 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 208 | for i, (grad, var) in enumerate(gradients): 209 | if grad is not None: 210 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 211 | return optimizer.apply_gradients(gradients) 212 | 213 | 214 | # ================================================================ 215 | # Global session 216 | # ================================================================ 217 | 218 | def get_session(): 219 | """Returns recently made Tensorflow session""" 220 | return tf.get_default_session() 221 | 222 | 223 | def make_session(num_cpu): 224 | """Returns a session that will use CPU's only""" 225 | tf_config = tf.ConfigProto( 226 | inter_op_parallelism_threads=num_cpu, 227 | intra_op_parallelism_threads=num_cpu) 228 | return tf.Session(config=tf_config) 229 | 230 | 231 | def single_threaded_session(): 232 | """Returns a session which will only use a single CPU""" 233 | return make_session(1) 234 | 235 | 236 | ALREADY_INITIALIZED = set() 237 | 238 | 239 | def initialize(): 240 | """Initialize all the uninitialized variables in the global scope.""" 241 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 242 | get_session().run(tf.variables_initializer(new_variables)) 243 | ALREADY_INITIALIZED.update(new_variables) 244 | 245 | 246 | def eval(expr, feed_dict=None): 247 | if feed_dict is None: 248 | feed_dict = {} 249 | return get_session().run(expr, feed_dict=feed_dict) 250 | 251 | 252 | VALUE_SETTERS = collections.OrderedDict() 253 | 254 | 255 | def set_value(v, val): 256 | global VALUE_SETTERS 257 | if v in VALUE_SETTERS: 258 | set_op, set_endpoint = VALUE_SETTERS[v] 259 | else: 260 | set_endpoint = tf.placeholder(v.dtype) 261 | set_op = v.assign(set_endpoint) 262 | VALUE_SETTERS[v] = (set_op, set_endpoint) 263 | get_session().run(set_op, feed_dict={set_endpoint: val}) 264 | 265 | 266 | # ================================================================ 267 | # Saving 
variables 268 | # ================================================================ 269 | 270 | 271 | def load_state(fname): 272 | saver = tf.train.Saver() 273 | saver.restore(get_session(), fname) 274 | 275 | 276 | def save_state(fname): 277 | os.makedirs(os.path.dirname(fname), exist_ok=True) 278 | saver = tf.train.Saver() 279 | saver.save(get_session(), fname) 280 | 281 | # ================================================================ 282 | # Model components 283 | # ================================================================ 284 | 285 | 286 | def normc_initializer(std=1.0): 287 | def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 288 | out = np.random.randn(*shape).astype(np.float32) 289 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 290 | return tf.constant(out) 291 | return _initializer 292 | 293 | 294 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 295 | summary_tag=None): 296 | with tf.variable_scope(name): 297 | stride_shape = [1, stride[0], stride[1], 1] 298 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 299 | 300 | # there are "num input feature maps * filter height * filter width" 301 | # inputs to each hidden unit 302 | fan_in = intprod(filter_shape[:3]) 303 | # each unit in the lower layer receives a gradient from: 304 | # "num output feature maps * filter height * filter width" / 305 | # pooling size 306 | fan_out = intprod(filter_shape[:2]) * num_filters 307 | # initialize weights with random weights 308 | w_bound = np.sqrt(6. / (fan_in + fan_out)) 309 | 310 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 311 | collections=collections) 312 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), 313 | collections=collections) 314 | 315 | if summary_tag is not None: 316 | tf.summary.image(summary_tag, 317 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 318 | [2, 0, 1, 3]), 319 | max_images=10) 320 | 321 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 322 | 323 | 324 | def dense(x, size, name, weight_init=None, bias=True): 325 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 326 | ret = tf.matmul(x, w) 327 | if bias: 328 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer()) 329 | return ret + b 330 | else: 331 | return ret 332 | 333 | 334 | def wndense(x, size, name, init_scale=1.0): 335 | v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], 336 | initializer=tf.random_normal_initializer(0, 0.05)) 337 | g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale)) 338 | b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0)) 339 | 340 | # use weight normalization (Salimans & Kingma, 2016) 341 | x = tf.matmul(x, v) 342 | scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) 343 | return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) 344 | 345 | 346 | def densenobias(x, size, name, weight_init=None): 347 | return dense(x, size, name, weight_init=weight_init, bias=False) 348 | 349 | 350 | def dropout(x, pkeep, phase=None, mask=None): 351 | mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask 352 | if phase is None: 353 | return mask * x 354 | else: 355 | return switch(phase, mask * x, pkeep * x) 356 | 357 | 358 | # 
================================================================ 359 | # Theano-like Function 360 | # ================================================================ 361 | 362 | 363 | 364 | def function(inputs, outputs, updates=None, givens=None): 365 | """Just like Theano function. Take a bunch of tensorflow placeholders and expressions 366 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 367 | values to be fed to the input's placeholders and produces the values of the expressions 368 | in outputs. 369 | 370 | Input values can be passed in the same order as inputs or can be provided as kwargs based 371 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 372 | 373 | Example: 374 | x = tf.placeholder(tf.int32, (), name="x") 375 | y = tf.placeholder(tf.int32, (), name="y") 376 | z = 3 * x + 2 * y 377 | lin = function([x, y], z, givens={y: 0}) 378 | 379 | with single_threaded_session(): 380 | initialize() 381 | 382 | assert lin(2) == 6 383 | assert lin(x=3) == 9 384 | assert lin(2, 2) == 10 385 | assert lin(x=2, y=3) == 12 386 | 387 | Parameters 388 | ---------- 389 | inputs: [tf.placeholder or TfInput] 390 | list of input arguments 391 | outputs: [tf.Variable] or tf.Variable 392 | list of outputs or a single output to be returned from function. Returned 393 | value will also have the same shape. 394 | """ 395 | if isinstance(outputs, list): 396 | return _Function(inputs, outputs, updates, givens=givens) 397 | elif isinstance(outputs, (dict, collections.OrderedDict)): 398 | f = _Function(inputs, outputs.values(), updates, givens=givens) 399 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 400 | else: 401 | f = _Function(inputs, [outputs], updates, givens=givens) 402 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 403 | 404 | 405 | class _Function(object): 406 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 407 | for inpt in inputs: 408 | if not issubclass(type(inpt), TfInput): 409 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of baselines.common.TfInput" 410 | self.inputs = inputs 411 | updates = updates or [] 412 | self.update_group = tf.group(*updates) 413 | self.outputs_update = list(outputs) + [self.update_group] 414 | self.givens = {} if givens is None else givens 415 | self.check_nan = check_nan 416 | 417 | def _feed_input(self, feed_dict, inpt, value): 418 | if issubclass(type(inpt), TfInput): 419 | feed_dict.update(inpt.make_feed_dict(value)) 420 | elif is_placeholder(inpt): 421 | feed_dict[inpt] = value 422 | 423 | def __call__(self, *args, **kwargs): 424 | assert len(args) <= len(self.inputs), "Too many arguments provided" 425 | feed_dict = {} 426 | # Update the args 427 | for inpt, value in zip(self.inputs, args): 428 | self._feed_input(feed_dict, inpt, value) 429 | # Update the kwargs 430 | kwargs_passed_inpt_names = set() 431 | for inpt in self.inputs[len(args):]: 432 | inpt_name = inpt.name.split(':')[0] 433 | inpt_name = inpt_name.split('/')[-1] 434 | assert inpt_name not in kwargs_passed_inpt_names, \ 435 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 436 | if inpt_name in kwargs: 437 | kwargs_passed_inpt_names.add(inpt_name) 438 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 439 | else: 440 | assert inpt in self.givens, "Missing argument " + inpt_name 441 | assert len(kwargs) == 0, "Function got extra arguments " + 
str(list(kwargs.keys())) 442 | # Update feed dict with givens. 443 | for inpt in self.givens: 444 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 445 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 446 | if self.check_nan: 447 | if any(np.isnan(r).any() for r in results): 448 | raise RuntimeError("Nan detected") 449 | return results 450 | 451 | 452 | def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): 453 | if isinstance(outputs, list): 454 | return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) 455 | else: 456 | f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) 457 | return lambda *inputs: f(*inputs)[0] 458 | 459 | 460 | class _MemFriendlyFunction(object): 461 | def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): 462 | self.nondata_inputs = nondata_inputs 463 | self.data_inputs = data_inputs 464 | self.outputs = list(outputs) 465 | self.batch_size = batch_size 466 | 467 | def __call__(self, *inputvals): 468 | assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs) 469 | nondata_vals = inputvals[0:len(self.nondata_inputs)] 470 | data_vals = inputvals[len(self.nondata_inputs):] 471 | feed_dict = dict(zip(self.nondata_inputs, nondata_vals)) 472 | n = data_vals[0].shape[0] 473 | for v in data_vals[1:]: 474 | assert v.shape[0] == n 475 | for i_start in range(0, n, self.batch_size): 476 | slice_vals = [v[i_start:builtins.min(i_start + self.batch_size, n)] for v in data_vals] 477 | for (var, val) in zip(self.data_inputs, slice_vals): 478 | feed_dict[var] = val 479 | results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict) 480 | if i_start == 0: 481 | sum_results = results 482 | else: 483 | for i in range(len(results)): 484 | sum_results[i] = sum_results[i] + results[i] 485 | for i in range(len(results)): 486 | sum_results[i] = sum_results[i] / n 487 | return sum_results 488 | 489 | # ================================================================ 490 | # Modules 491 | # ================================================================ 492 | 493 | 494 | class Module(object): 495 | def __init__(self, name): 496 | self.name = name 497 | self.first_time = True 498 | self.scope = None 499 | self.cache = {} 500 | 501 | def __call__(self, *args): 502 | if args in self.cache: 503 | print("(%s) retrieving value from cache" % (self.name,)) 504 | return self.cache[args] 505 | with tf.variable_scope(self.name, reuse=not self.first_time): 506 | scope = tf.get_variable_scope().name 507 | if self.first_time: 508 | self.scope = scope 509 | print("(%s) running function for the first time" % (self.name,)) 510 | else: 511 | assert self.scope == scope, "Tried calling function with a different scope" 512 | print("(%s) running function on new inputs" % (self.name,)) 513 | self.first_time = False 514 | out = self._call(*args) 515 | self.cache[args] = out 516 | return out 517 | 518 | def _call(self, *args): 519 | raise NotImplementedError 520 | 521 | @property 522 | def trainable_variables(self): 523 | assert self.scope is not None, "need to call module once before getting variables" 524 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 525 | 526 | @property 527 | def variables(self): 528 | assert self.scope is not None, "need to call module once before getting variables" 529 | return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope) 530 | 531 | 532 | def module(name): 533 | @functools.wraps 534 | def wrapper(f): 535 | class 
WrapperModule(Module): 536 | def _call(self, *args): 537 | return f(*args) 538 | return WrapperModule(name) 539 | return wrapper 540 | 541 | # ================================================================ 542 | # Graph traversal 543 | # ================================================================ 544 | 545 | 546 | VARIABLES = {} 547 | 548 | 549 | def get_parents(node): 550 | return node.op.inputs 551 | 552 | 553 | def topsorted(outputs): 554 | """ 555 | Topological sort via non-recursive depth-first search 556 | """ 557 | assert isinstance(outputs, (list, tuple)) 558 | marks = {} 559 | out = [] 560 | stack = [] # pylint: disable=W0621 561 | # i: node 562 | # jidx = number of children visited so far from that node 563 | # marks: state of each node, which is one of 564 | # 0: haven't visited 565 | # 1: have visited, but not done visiting children 566 | # 2: done visiting children 567 | for x in outputs: 568 | stack.append((x, 0)) 569 | while stack: 570 | (i, jidx) = stack.pop() 571 | if jidx == 0: 572 | m = marks.get(i, 0) 573 | if m == 0: 574 | marks[i] = 1 575 | elif m == 1: 576 | raise ValueError("not a dag") 577 | else: 578 | continue 579 | ps = get_parents(i) 580 | if jidx == len(ps): 581 | marks[i] = 2 582 | out.append(i) 583 | else: 584 | stack.append((i, jidx + 1)) 585 | j = ps[jidx] 586 | stack.append((j, 0)) 587 | return out 588 | 589 | 590 | # ================================================================ 591 | # Flat vectors 592 | # ================================================================ 593 | 594 | def var_shape(x): 595 | out = x.get_shape().as_list() 596 | assert all(isinstance(a, int) for a in out), \ 597 | "shape function assumes that shape is fully known" 598 | return out 599 | 600 | 601 | def numel(x): 602 | return intprod(var_shape(x)) 603 | 604 | 605 | def intprod(x): 606 | return int(np.prod(x)) 607 | 608 | 609 | def flatgrad(loss, var_list): 610 | grads = tf.gradients(loss, var_list) 611 | return tf.concat(axis=0, values=[ 612 | tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)]) 613 | for (v, grad) in zip(var_list, grads) 614 | ]) 615 | 616 | 617 | class SetFromFlat(object): 618 | def __init__(self, var_list, dtype=tf.float32): 619 | assigns = [] 620 | shapes = list(map(var_shape, var_list)) 621 | total_size = np.sum([intprod(shape) for shape in shapes]) 622 | 623 | self.theta = theta = tf.placeholder(dtype, [total_size]) 624 | start = 0 625 | assigns = [] 626 | for (shape, v) in zip(shapes, var_list): 627 | size = intprod(shape) 628 | assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape))) 629 | start += size 630 | self.op = tf.group(*assigns) 631 | 632 | def __call__(self, theta): 633 | get_session().run(self.op, feed_dict={self.theta: theta}) 634 | 635 | 636 | class GetFlat(object): 637 | def __init__(self, var_list): 638 | self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) 639 | 640 | def __call__(self): 641 | return get_session().run(self.op) 642 | 643 | # ================================================================ 644 | # Misc 645 | # ================================================================ 646 | 647 | 648 | def fancy_slice_2d(X, inds0, inds1): 649 | """ 650 | like numpy X[inds0, inds1] 651 | XXX this implementation is bad 652 | """ 653 | inds0 = tf.cast(inds0, tf.int64) 654 | inds1 = tf.cast(inds1, tf.int64) 655 | shape = tf.cast(tf.shape(X), tf.int64) 656 | ncols = shape[1] 657 | Xflat = tf.reshape(X, [-1]) 658 | return tf.gather(Xflat, inds0 * ncols + inds1) 659 
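# Example (illustrative only; assumes a default session exists and initialize() has run):
#   X = tf.constant([[1., 2., 3.], [4., 5., 6.]])
#   eval(fancy_slice_2d(X, [0, 1], [2, 0]))  # -> array([3., 4.], dtype=float32),
#   i.e. the same result as numpy's X[[0, 1], [2, 0]]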
| 660 | 661 | # ================================================================ 662 | # Scopes 663 | # ================================================================ 664 | 665 | 666 | def scope_vars(scope, trainable_only=False): 667 | """ 668 | Get variables inside a scope 669 | The scope can be specified as a string 670 | 671 | Parameters 672 | ---------- 673 | scope: str or VariableScope 674 | scope in which the variables reside. 675 | trainable_only: bool 676 | whether or not to return only the variables that were marked as trainable. 677 | 678 | Returns 679 | ------- 680 | vars: [tf.Variable] 681 | list of variables in `scope`. 682 | """ 683 | return tf.get_collection( 684 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, 685 | scope=scope if isinstance(scope, str) else scope.name 686 | ) 687 | 688 | 689 | def scope_name(): 690 | """Returns the name of current scope as a string, e.g. deepq/q_func""" 691 | return tf.get_variable_scope().name 692 | 693 | 694 | def absolute_scope_name(relative_scope_name): 695 | """Appends parent scope name to `relative_scope_name`""" 696 | return scope_name() + "/" + relative_scope_name 697 | 698 | 699 | def lengths_to_mask(lengths_b, max_length): 700 | """ 701 | Turns a vector of lengths into a boolean mask 702 | 703 | Args: 704 | lengths_b: an integer vector of lengths 705 | max_length: maximum length to fill the mask 706 | 707 | Returns: 708 | a boolean array of shape (batch_size, max_length) 709 | row[i] consists of True repeated lengths_b[i] times, followed by False 710 | """ 711 | lengths_b = tf.convert_to_tensor(lengths_b) 712 | assert lengths_b.get_shape().ndims == 1 713 | mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) 714 | return mask_bt 715 | 716 | 717 | def in_session(f): 718 | @functools.wraps(f) 719 | def newfunc(*args, **kwargs): 720 | with tf.Session(): 721 | f(*args, **kwargs) 722 | return newfunc 723 | 724 | 725 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 726 | 727 | 728 | def get_placeholder(name, dtype, shape): 729 | if name in _PLACEHOLDER_CACHE: 730 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 731 | assert dtype1 == dtype and shape1 == shape 732 | return out 733 | else: 734 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 735 | _PLACEHOLDER_CACHE[name] = (out, dtype, shape) 736 | return out 737 | 738 | 739 | def get_placeholder_cached(name): 740 | return _PLACEHOLDER_CACHE[name][0] 741 | 742 | 743 | def flattenallbut0(x): 744 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 745 | 746 | 747 | def reset(): 748 | global _PLACEHOLDER_CACHE 749 | global VARIABLES 750 | _PLACEHOLDER_CACHE = {} 751 | VARIABLES = {} 752 | tf.reset_default_graph() 753 | --------------------------------------------------------------------------------
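A minimal usage sketch for the tf_util helpers above (illustrative only; it assumes TensorFlow 1.x and that the baselines package is importable, e.g. after running prediction/setup.sh):

    import numpy as np
    import tensorflow as tf
    from baselines.common import tf_util as U

    # Wrap a graph that computes a mean Huber loss into a Theano-like callable.
    td_error = tf.placeholder(tf.float32, [None], name="td_error")
    loss = U.mean(U.huber_loss(td_error))
    compute_loss = U.function(inputs=[td_error], outputs=loss)

    with U.single_threaded_session():
        U.initialize()
        # huber_loss is 0.5 * x**2 for |x| < 1 and |x| - 0.5 otherwise,
        # so the mean over [0.5, 2.0] is (0.125 + 1.5) / 2 = 0.8125.
        print(compute_loss(np.array([0.5, 2.0], dtype=np.float32)))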