├── baselines ├── __init__.py ├── deepq │ ├── prediction │ │ ├── __init__.py │ │ ├── tool │ │ │ ├── __init__.py │ │ │ ├── visualize.py │ │ │ ├── visualize_all.py │ │ │ ├── dump_tfrecords.py │ │ │ ├── compute_mean.py │ │ │ ├── summary.py │ │ │ ├── episode_reader.py │ │ │ └── episode_collector.py │ │ ├── tfacvp │ │ │ ├── __init__.py │ │ │ ├── tf_ops.py │ │ │ ├── util.py │ │ │ ├── dataset.py │ │ │ ├── old_model.py │ │ │ └── model.py │ │ ├── setup.sh │ │ ├── example │ │ │ ├── pred-00.png │ │ │ ├── pred-01.png │ │ │ ├── pred-02.png │ │ │ ├── pred-03.png │ │ │ ├── pred-04.png │ │ │ ├── pred-05.png │ │ │ ├── pred-06.png │ │ │ ├── pred-07.png │ │ │ ├── pred-08.png │ │ │ ├── atari-gray │ │ │ │ ├── a_t.npy │ │ │ │ ├── s_t.npy │ │ │ │ ├── pred.png │ │ │ │ ├── x_t_1.npy │ │ │ │ ├── README.md │ │ │ │ └── example.py │ │ │ └── atari-rgb │ │ │ │ ├── mean.npy │ │ │ │ ├── example.npy │ │ │ │ ├── example.png │ │ │ │ ├── README.md │ │ │ │ └── example.py │ │ ├── misc │ │ │ ├── check_caffe_weight.py │ │ │ ├── test_conv1.py │ │ │ ├── check.py │ │ │ └── test_caffe.py │ │ ├── old_train.sh │ │ ├── README.md │ │ ├── train.sh │ │ └── train.py │ ├── experiments │ │ ├── __init__.py │ │ ├── atari │ │ │ ├── __init__.py │ │ │ ├── download_model.py │ │ │ ├── model.py │ │ │ ├── wang2015_eval.py │ │ │ ├── enjoy.py │ │ │ └── train.py │ │ ├── enjoy_cartpole.py │ │ ├── enjoy_pong.py │ │ ├── train_cartpole.py │ │ ├── train_pong.py │ │ └── custom_cartpole.py │ ├── __init__.py │ ├── models.py │ ├── replay_buffer.py │ ├── simple.py │ └── build_graph.py ├── common │ ├── __init__.py │ ├── tests │ │ ├── test_schedules.py │ │ ├── test_tf_util.py │ │ └── test_segment_tree.py │ ├── schedules.py │ ├── segment_tree.py │ ├── azure_utils.py │ ├── atari_wrappers_deprecated.py │ ├── misc_util.py │ └── tf_util.py └── logger.py ├── atari-pre-trained-agents └── README.md ├── atari-visual-foresight └── README.md ├── data ├── logo.jpg └── cartpole.gif ├── .gitignore ├── setup.py ├── LICENSE └── README.md /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /atari-pre-trained-agents/README.md: -------------------------------------------------------------------------------- 1 | # Save pre-trained atari agents here -------------------------------------------------------------------------------- /atari-visual-foresight/README.md: 
-------------------------------------------------------------------------------- 1 | # Save pre-trained visual foresight model here -------------------------------------------------------------------------------- /baselines/deepq/prediction/setup.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=$(pwd):$PYTHONPATH 2 | -------------------------------------------------------------------------------- /baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from baselines.common.misc_util import * 5 | -------------------------------------------------------------------------------- /data/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/data/logo.jpg -------------------------------------------------------------------------------- /data/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/data/cartpole.gif -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-00.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-01.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-02.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-03.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-04.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-05.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-06.png -------------------------------------------------------------------------------- 
/baselines/deepq/prediction/example/pred-07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-07.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/pred-08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/pred-08.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/a_t.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-gray/a_t.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/s_t.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-gray/s_t.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/mean.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-rgb/mean.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/pred.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-gray/pred.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/x_t_1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-gray/x_t_1.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/example.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-rgb/example.npy -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yenchenlin/rl-attack-detection/HEAD/baselines/deepq/prediction/example/atari-rgb/example.png -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/visualize.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import sys 4 | 5 | img = np.load(sys.argv[1]).astype(np.uint8) 6 | cv2.imshow('img', img) 7 | cv2.waitKey(0) 8 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/misc/check_caffe_weight.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import sys 4 | 5 | data = pickle.load(open(sys.argv[1], "rb")) 6 | for key in data: 7 | print(key, data[key].shape) 8 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/visualize_all.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import sys 4 | 5 | ss = np.load(sys.argv[1]).astype(np.uint8) 6 | for s in ss: 7 | for i in range(0, 12, 3): 8 | cv2.imshow('img%d' % i, s[:,:,i:i+3]) 9 | cv2.waitKey(0) 10 | -------------------------------------------------------------------------------- /baselines/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.deepq import models # noqa 2 | from baselines.deepq.build_graph import build_act, build_train # noqa 3 | 4 | from baselines.deepq.simple import learn, load # noqa 5 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa 6 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/old_train.sh: -------------------------------------------------------------------------------- 1 | GAME=$1 2 | NUM_ACT=$2 3 | COLOR=$3 4 | TRAIN="${GAME}/train" 5 | TEST="${GAME}/test" 6 | MEAN="${GAME}/mean.npy" 7 | LOG="models/${GAME}-${COLOR}-model" 8 | 9 | python train.py --train ${TRAIN} --test ${TEST} --mean ${MEAN} --num_act ${NUM_ACT} --color ${COLOR} --log ${LOG} 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.py~ 4 | .DS_Store 5 | .idea 6 | 7 | # Setuptools distribution and build folders. 8 | /dist/ 9 | /build 10 | 11 | # Virtualenv 12 | /env 13 | 14 | # Python egg metadata, regenerated from source files by setuptools. 15 | /*.egg-info 16 | 17 | *.sublime-project 18 | *.sublime-workspace 19 | 20 | .idea 21 | 22 | logs/ 23 | 24 | .ipynb_checkpoints 25 | ghostdriver.log 26 | 27 | htmlcov 28 | 29 | junk -------------------------------------------------------------------------------- /baselines/deepq/prediction/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | ``` 3 | cd tensorflow-action-conditional-video-prediction 4 | source setup.sh 5 | ``` 6 | 7 | # Train 8 | ## Atari 9 | ``` 10 | ./train_atari.sh ${game name} ${num_act} ${colorspace [rgb|gray]} {gpu id} 11 | e.g.
./train_atari.sh MsPacman-v0 9 gray 0 12 | ``` 13 | 14 | # Test 15 | 16 | # Model zoo 17 | Since the pre-trained models are too large to include in this repository, please download them from [here](https://drive.google.com/drive/u/0/folders/0B5wysG7CaEswVnNJdUkyZ29DR2s) 18 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/dump_tfrecords.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from episode_reader import EpisodeReader 3 | 4 | import sys, os 5 | 6 | if __name__ == '__main__': 7 | reader = EpisodeReader(path=sys.argv[1], height=84, width=84) 8 | dir = sys.argv[2] 9 | i = 0 10 | for s, a, x_t_1 in reader.read(): 11 | np.save(os.path.join(dir, '%05d-s') % i, s) 12 | np.save(os.path.join(dir, '%05d-x_t_1') % i, x_t_1) 13 | np.save(os.path.join(dir, '%05d-a' % i), np.asarray([a])) 14 | i += 1 15 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("CartPole-v0") 8 | act = deepq.load("cartpole_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -p long 3 | #SBATCH --gres=gpu:k80:2 4 | #SBATCH -J Pong 5 | #SBATCH -o Pong.log 6 | 7 | GAME=$1 8 | NUM_ACT=$2 9 | COLOR=$3 10 | DATA_DIR="${GAME}_episodes" 11 | TRAIN="${DATA_DIR}/train" 12 | TEST="${DATA_DIR}/test" 13 | MEAN="${DATA_DIR}/mean.npy" 14 | LOG="models/${GAME}" 15 | 16 | hostname 17 | echo $CUDA_VISIBLE_DEVICES 18 | source activate tf 19 | export PYTHONHOME="/home/yclin/miniconda2/envs/tf" 20 | srun python train.py --train ${TRAIN} --test ${TEST} --mean ${MEAN} --num_act ${NUM_ACT} --color ${COLOR} --log ${LOG} 21 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This example uses MsPacman-v0 to demonstrate how to infer the next frame from the current four frames and an action. 3 | ```example.npy``` is a [84, 84, 12] numpy array (DQN settings).
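For reference, a minimal sketch (not part of the original example) of how a state with this layout could be assembled from four consecutive preprocessed RGB frames; the placeholder arrays below stand in for real emulator frames:
```
import numpy as np

# Four consecutive preprocessed RGB frames, each of shape [84, 84, 3].
# Zero placeholders are used here; in practice these come from the Atari emulator.
frames = [np.zeros((84, 84, 3), dtype=np.uint8) for _ in range(4)]

# Stack along the channel axis to obtain the [84, 84, 12] state the model expects.
# The temporal ordering of the frames must match the convention used at training time.
s = np.concatenate(frames, axis=2)
np.save('example.npy', s)
```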
 4 | 5 | # Usage 6 | ``` 7 | python example.py --load {tensorflow model checkpoint} 8 | ``` 9 | 10 | # Integrate with your code 11 | ``` 12 | from tfacvp.model import ActionConditionalVideoPredictionModel 13 | from tfacvp.util import post_process_rgb 14 | 15 | model = ActionConditionalVideoPredictionModel(num_act=num_act, is_train=False) 16 | sess = tf.Session() 17 | model.restore(sess, args.load) 18 | model.predict(sess, s, a) 19 | ``` 20 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame 5 | 6 | 7 | def main(): 8 | env = gym.make("PongNoFrameskip-v4") 9 | env = ScaledFloatFrame(wrap_dqn(env)) 10 | act = deepq.load("pong_model.pkl") 11 | 12 | while True: 13 | obs, done = env.reset(), False 14 | episode_rew = 0 15 | while not done: 16 | env.render() 17 | obs, rew, done, _ = env.step(act(obs[None])[0]) 18 | episode_rew += rew 19 | print("Episode reward", episode_rew) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/compute_mean.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os, sys, cv2 3 | import glob 4 | from tqdm import * 5 | 6 | from episode_reader import EpisodeReader 7 | 8 | if __name__ == '__main__': 9 | path = sys.argv[1] 10 | mean_path = sys.argv[2] 11 | 12 | mean = np.zeros([84, 84, 1], dtype=np.float64) 13 | n = 0 14 | for path in tqdm(glob.glob(os.path.join(path, '*.tfrecords'))): 15 | try: 16 | reader = EpisodeReader(path) 17 | for s, a, x in reader.read(): 18 | mean += s[:,:,-1:] 19 | n += 1 20 | except Exception: 21 | print('Failed to load %s' % path) 22 | mean /= n 23 | np.save(mean_path, mean) 24 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def callback(lcl, glb): 7 | # stop training if the average reward over the last 100 episodes reaches 199 8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 9 | return is_solved 10 | 11 | 12 | def main(): 13 | env = gym.make("CartPole-v0") 14 | model = deepq.models.mlp([64]) 15 | act = deepq.learn( 16 | env, 17 | q_func=model, 18 | lr=1e-3, 19 | max_timesteps=100000, 20 | buffer_size=50000, 21 | exploration_fraction=0.1, 22 | exploration_final_eps=0.02, 23 | print_freq=10, 24 | callback=callback 25 | ) 26 | print("Saving model to cartpole_model.pkl") 27 | act.save("cartpole_model.pkl") 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9),
80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | if sys.version_info.major != 3: 5 | print("This Python is only compatible with Python 3, but you are running " 6 | "Python {}. The installation will likely fail.".format(sys.version_info.major)) 7 | 8 | setup(name='baselines', 9 | packages=[package for package in find_packages() 10 | if package.startswith('baselines')], 11 | install_requires=[ 12 | 'gym>=0.9.1', 13 | 'scipy', 14 | 'tqdm', 15 | 'joblib', 16 | 'zmq', 17 | 'dill', 18 | 'tensorflow >= 1.0.0', 19 | 'azure==1.0.3', 20 | 'progressbar2', 21 | ], 22 | description="OpenAI baselines: high quality implementations of reinforcement learning algorithms", 23 | author="OpenAI", 24 | url='https://github.com/openai/baselines', 25 | author_email="gym@openai.com", 26 | version="0.1.3") 27 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame 5 | 6 | 7 | def main(): 8 | env = gym.make("PongNoFrameskip-v4") 9 | env = ScaledFloatFrame(wrap_dqn(env)) 10 | model = deepq.models.cnn_to_mlp( 11 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 12 | hiddens=[256], 13 | dueling=True 14 | ) 15 | act = deepq.learn( 16 | env, 17 | q_func=model, 18 | lr=1e-4, 19 | max_timesteps=2000000, 20 | buffer_size=10000, 21 | exploration_fraction=0.1, 22 | exploration_final_eps=0.01, 23 | train_freq=4, 24 | learning_starts=10000, 25 | target_network_update_freq=1000, 26 | gamma=0.99, 27 | prioritized_replay=True 28 | ) 29 | act.save("pong_model.pkl") 30 | env.close() 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This example demonstrates how to use ActionConditionalVideoPredictionModel to predict the next frame conditioned on the current state and action. 3 | ```s_t.npy``` is a [84, 84, 12] numpy array (DQN settings). 4 | 5 | # Usage 6 | ``` 7 | python example.py --load {tensorflow model checkpoint} --data {state npy file(e.g.
s_t.npy)} --mean {image mean} --num_act {number of actions in the action space} --act {which action you want to take, 0 <= act < num_act} 8 | ``` 9 | 10 | # Integrate with your code 11 | ``` 12 | from tfacvp.model import ActionConditionalVideoPredictionModel 13 | from tfacvp.util import post_process_gray, pre_process_state_gray 14 | 15 | mean = np.load(meanfile_path) 16 | 17 | sess = tf.Session() 18 | 19 | model = ActionConditionalVideoPredictionModel(num_act=num_act, is_train=False) 20 | model.restore(sess, checkpoint_path) 21 | 22 | s = pre_process_state_gray(s, mean, 1.0 / scale, 4) 23 | model.predict(sess, s, a) 24 | ``` 25 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/summary.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | import argparse 6 | import sys, os 7 | import logging 8 | 9 | from model import ActionConditionalVideoPredictionModel 10 | from dataset import Dataset 11 | 12 | def get_config(args): 13 | config = tf.ConfigProto() 14 | config.gpu_options.allow_growth = True 15 | return config 16 | 17 | def main(args): 18 | with tf.Graph().as_default() as graph: 19 | model = ActionConditionalVideoPredictionModel(num_act=args.num_act) 20 | for var in tf.trainable_variables(): 21 | print(var) 22 | with tf.variable_scope('', reuse=True) as scope: 23 | print(tf.get_variable('conv1/w')) 24 | 25 | 26 | if __name__ == '__main__': 27 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--num_act', help='num acts', type=int, required=True) 30 | args = parser.parse_args() 31 | 32 | main(args) 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.
22 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/episode_reader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import os, sys, cv2 4 | 5 | class EpisodeReader(object): 6 | def __init__(self, path, height=84, width=84): 7 | self.reader = tf.python_io.tf_record_iterator(path=path) 8 | self.height = height 9 | self.width = width 10 | 11 | def read(self): 12 | for string_record in self.reader: 13 | example = tf.train.Example() 14 | example.ParseFromString(string_record) 15 | 16 | a_t = int(example.features.feature['a_t'] 17 | .int64_list 18 | .value[0]) 19 | 20 | s_t_string = (example.features.feature['s_t'] 21 | .bytes_list 22 | .value[0]) 23 | 24 | x_t_1_string = (example.features.feature['x_t_1'] 25 | .bytes_list 26 | .value[0]) 27 | 28 | s_t_raw = np.fromstring(s_t_string, dtype=np.uint8) 29 | s_t = s_t_raw.reshape((self.height, self.width, -1)) 30 | 31 | x_t_1_raw = np.fromstring(x_t_1_string, dtype=np.uint8) 32 | x_t_1 = x_t_1_raw.reshape((self.height, self.width, -1)) 33 | 34 | s_t = s_t.astype(np.float32) 35 | x_t_1 = x_t_1.astype(np.float32) 36 | 37 | yield s_t, a_t, x_t_1 38 | 39 | def __call__(self): 40 | yield self.read() 41 | 42 | 43 | -------------------------------------------------------------------------------- /baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | set_value, 7 | single_threaded_session 8 | ) 9 | 10 | 11 | def test_set_value(): 12 | a = tf.Variable(42.) 13 | with single_threaded_session(): 14 | set_value(a, 5) 15 | assert a.eval() == 5 16 | g = tf.get_default_graph() 17 | g.finalize() 18 | set_value(a, 6) 19 | assert a.eval() == 6 20 | 21 | # test the test 22 | try: 23 | assert a.eval() == 7 24 | except AssertionError: 25 | pass 26 | else: 27 | assert False, "assertion should have failed" 28 | 29 | 30 | def test_function(): 31 | tf.reset_default_graph() 32 | x = tf.placeholder(tf.int32, (), name="x") 33 | y = tf.placeholder(tf.int32, (), name="y") 34 | z = 3 * x + 2 * y 35 | lin = function([x, y], z, givens={y: 0}) 36 | 37 | with single_threaded_session(): 38 | initialize() 39 | 40 | assert lin(2) == 6 41 | assert lin(x=3) == 9 42 | assert lin(2, 2) == 10 43 | assert lin(x=2, y=3) == 12 44 | 45 | 46 | def test_multikwargs(): 47 | tf.reset_default_graph() 48 | x = tf.placeholder(tf.int32, (), name="x") 49 | with tf.variable_scope("other"): 50 | x2 = tf.placeholder(tf.int32, (), name="x") 51 | z = 3 * x + 2 * x2 52 | 53 | lin = function([x, x2], z, givens={x2: 0}) 54 | with single_threaded_session(): 55 | initialize() 56 | assert lin(2) == 6 57 | assert lin(2, 2) == 10 58 | expt_caught = False 59 | try: 60 | lin(x=2) 61 | except AssertionError: 62 | expt_caught = True 63 | assert expt_caught 64 | 65 | 66 | if __name__ == '__main__': 67 | test_set_value() 68 | test_function() 69 | test_multikwargs() 70 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tool/episode_collector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import os, sys, cv2 4 | 5 | class EpisodeCollector(object): 6 | ''' 7 | Episode logger, this class is designed to collect state, action for 
ActionConditionalVideoPrediction training data 8 | ''' 9 | def __init__(self, path, preprocess_func, skip=4): 10 | # path: Where to save the .tfrecords file. (str) 11 | # preprocess_func: Frame preprocess function. (function) 12 | # skip: Drop the first $skip frames, since common RL algorithms stack 4 frames into one state. 13 | # The first 3 frames of an episode are black, and we don't want to record states that include them. (int) 14 | self.timestep = 0 15 | self.preprocess_func = preprocess_func 16 | self.writer = tf.python_io.TFRecordWriter(path) 17 | self.skip = skip 18 | 19 | def _int64_feature(self, value): 20 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 21 | 22 | def _bytes_feature(self, value): 23 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 24 | 25 | def save(self, s, a, x_next): 26 | # s: RL state, usually 4 stacked frames (e.g. ndarray, shape=[84, 84, 12]) 27 | # a: action (int) 28 | # x_next: next frame. (e.g. ndarray, shape=[84, 84, 3], [210, 160, 3]) 29 | 30 | self.timestep += 1 31 | if self.timestep > self.skip: 32 | s_raw = s.tostring() 33 | 34 | x_next = self.preprocess_func(x_next) 35 | x_next_raw = x_next.tostring() 36 | 37 | example = tf.train.Example(features=tf.train.Features(feature={ 38 | 'a_t': self._int64_feature(a), 39 | 's_t': self._bytes_feature(s_raw), 40 | 'x_t_1': self._bytes_feature(x_next_raw)})) 41 | self.writer.write(example.SerializeToString()) 42 | 43 | def close(self): 44 | self.writer.close() 45 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/download_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import progressbar 3 | 4 | from baselines.common.azure_utils import Container 5 | 6 | 7 | def parse_args(): 8 | parser = argparse.ArgumentParser("Download a pretrained model from Azure.") 9 | # Environment 10 | parser.add_argument("--model-dir", type=str, default=None, 11 | help="save model in this directory.
") 12 | parser.add_argument("--account-name", type=str, default="openaisciszymon", 13 | help="account name for Azure Blob Storage") 14 | parser.add_argument("--account-key", type=str, default=None, 15 | help="account key for Azure Blob Storage") 16 | parser.add_argument("--container", type=str, default="dqn-blogpost", 17 | help="container name and blob name separated by colon serparated by colon") 18 | parser.add_argument("--blob", type=str, default=None, help="blob with the model") 19 | return parser.parse_args() 20 | 21 | 22 | def main(): 23 | args = parse_args() 24 | c = Container(account_name=args.account_name, 25 | account_key=args.account_key, 26 | container_name=args.container) 27 | 28 | if args.blob is None: 29 | print("Listing available models:") 30 | print() 31 | for blob in sorted(c.list(prefix="model-")): 32 | print(blob) 33 | else: 34 | print("Downloading {} to {}...".format(args.blob, args.model_dir)) 35 | bar = None 36 | 37 | def callback(current, total): 38 | nonlocal bar 39 | if bar is None: 40 | bar = progressbar.ProgressBar(max_value=total) 41 | bar.update(current) 42 | 43 | assert c.exists(args.blob), "model {} does not exist".format(args.blob) 44 | 45 | assert args.model_dir is not None 46 | 47 | c.get(args.model_dir, args.blob, callback=callback) 48 | 49 | 50 | if __name__ == '__main__': 51 | main() 52 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-rgb/example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | import argparse 5 | import sys, os 6 | import logging 7 | 8 | def get_config(args): 9 | config = tf.ConfigProto() 10 | config.gpu_options.allow_growth = True 11 | return config 12 | 13 | def get_cv_image(img, mean, scale): 14 | return img 15 | 16 | def main(args): 17 | from tfacvp.model import ActionConditionalVideoPredictionModel 18 | from tfacvp.util import post_process_rgb 19 | 20 | with tf.Graph().as_default() as graph: 21 | logging.info('Create model [num_act = %d] for testing' % (args.num_act)) 22 | model = ActionConditionalVideoPredictionModel(num_act=args.num_act, is_train=False) 23 | 24 | config = get_config(args) 25 | s = np.load(args.data) 26 | mean = np.load(args.mean) 27 | scale = 255.0 28 | 29 | with tf.Session(config=config) as sess: 30 | logging.info('Loading weights from %s' % (args.load)) 31 | model.restore(sess, args.load) 32 | 33 | for i in range(args.num_act): 34 | logging.info('Predict next frame condition on action %d' % (i)) 35 | a = np.identity(args.num_act)[i] 36 | x_t_1_pred_batch = model.predict(sess, s[np.newaxis, :], a[np.newaxis, :])[0] 37 | 38 | img = x_t_1_pred_batch[0] 39 | img = post_process(img, mean, scale) 40 | img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) 41 | cv2.imwrite('pred-%02d.png' % i, img) 42 | 43 | 44 | if __name__ == '__main__': 45 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('--data', help='testing data npy', type=str, default='example.npy') 48 | parser.add_argument('--mean', help='image mean path', type=str, default='mean.npy') 49 | parser.add_argument('--load', help='model weight path', type=str, required=True) 50 | parser.add_argument('--num_act', help='num acts', type=int, default=9) 51 | args = parser.parse_args() 52 | main(args) 53 | 54 | 55 | 56 | 
-------------------------------------------------------------------------------- /baselines/deepq/prediction/misc/test_conv1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | import argparse 6 | import sys, os 7 | import logging 8 | 9 | import cPickle as pickle 10 | 11 | from model import ActionConditionalVideoPredictionModel 12 | from dataset import Dataset, CaffeDataset 13 | from util import post_process 14 | 15 | def get_config(args): 16 | config = tf.ConfigProto() 17 | config.gpu_options.allow_growth = True 18 | return config 19 | 20 | def load_caffe_model(x, path): 21 | with open(path, 'rb') as f: 22 | data = pickle.load(f) 23 | w = tf.Variable(data['conv1/w'], dtype=tf.float32) 24 | b = tf.Variable(data['conv1/b'], dtype=tf.float32) 25 | l = tf.nn.conv2d(x, w, strides=[1, 2, 2, 1], padding='VALID', name='conv2d') 26 | l = tf.nn.bias_add(l, b, name='bias_add') 27 | return l 28 | 29 | def main(args): 30 | with tf.Graph().as_default() as graph: 31 | # Create dataset 32 | logging.info('Create data flow from %s' % args.data) 33 | caffe_dataset = CaffeDataset(dir=args.data, num_act=args.num_act, mean_path=args.mean) 34 | 35 | # Config session 36 | config = get_config(args) 37 | 38 | x = tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, 12]) 39 | op = load_caffe_model(x, args.load) 40 | 41 | init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 42 | 43 | # Start session 44 | with tf.Session(config=config) as sess: 45 | sess.run(init) 46 | i = 0 47 | for s, a in caffe_dataset(5): 48 | pred_data = sess.run([op], feed_dict={x: [s]})[0] 49 | print pred_data.shape 50 | np.save('tf-%03d.npy' % i, pred_data) 51 | i += 1 52 | 53 | if __name__ == '__main__': 54 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--log', help='summary directory', type=str, default='caffe-test') 57 | parser.add_argument('--data', help='testing data directory', type=str, required=True) 58 | parser.add_argument('--mean', help='image mean path', type=str, required=True) 59 | parser.add_argument('--load', help='caffe-dumped model path', type=str, required=True) 60 | parser.add_argument('--num_act', help='num acts', type=int, required=True) 61 | args = parser.parse_args() 62 | 63 | main(args) 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.layers as layers 3 | 4 | 5 | def model(img_in, num_actions, scope, reuse=False, concat_softmax=False): 6 | """As described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf""" 7 | with tf.variable_scope(scope, reuse=reuse): 8 | out = img_in 9 | with tf.variable_scope("convnet"): 10 | # original architecture 11 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 12 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 13 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 14 | out = layers.flatten(out) 15 | 16 | with tf.variable_scope("action_value"): 17 | out = layers.fully_connected(out, num_outputs=512, 
activation_fn=tf.nn.relu) 18 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 19 | if concat_softmax: 20 | out = tf.nn.softmax(out) 21 | 22 | return out 23 | 24 | 25 | def dueling_model(img_in, num_actions, scope, reuse=False): 26 | """As described in https://arxiv.org/abs/1511.06581""" 27 | with tf.variable_scope(scope, reuse=reuse): 28 | out = img_in 29 | with tf.variable_scope("convnet"): 30 | # original architecture 31 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 32 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 33 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 34 | out = layers.flatten(out) 35 | 36 | with tf.variable_scope("state_value"): 37 | state_hidden = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 38 | state_score = layers.fully_connected(state_hidden, num_outputs=1, activation_fn=None) 39 | with tf.variable_scope("action_value"): 40 | actions_hidden = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 41 | action_scores = layers.fully_connected(actions_hidden, num_outputs=num_actions, activation_fn=None) 42 | action_scores_mean = tf.reduce_mean(action_scores, 1) 43 | action_scores = action_scores - tf.expand_dims(action_scores_mean, 1) 44 | 45 | return state_score + action_scores 46 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/tf_ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def ReLu(x, name, reuse=False): 5 | with tf.variable_scope(name, reuse=reuse) as scope: 6 | l = tf.nn.relu(x) 7 | return l 8 | 9 | def Conv2D(x, filter_shape, out_dim, strides, padding, name, reuse=False): 10 | # x: input tensor (float32)[n, w, h, c] 11 | # filter_shape: conv2d filter (int)[w, h] 12 | # out_dim: output channels (int) 13 | # strides: conv2d stride size (int) 14 | # padding: padding type (str) 15 | # name: variable scope (str) 16 | 17 | with tf.variable_scope(name, reuse=reuse) as scope: 18 | in_dim = x.get_shape()[-1] 19 | w = tf.get_variable('w', shape=filter_shape + [in_dim, out_dim], initializer=tf.contrib.layers.xavier_initializer_conv2d(uniform=True)) 20 | b = tf.get_variable('b', shape=[out_dim], initializer=tf.constant_initializer(0.0)) 21 | l = tf.nn.conv2d(x, w, strides=[1, strides, strides, 1], padding=padding, name='conv2d') 22 | l = tf.nn.bias_add(l, b, name='bias_add') 23 | return l 24 | 25 | def FC(x, out_dim, name, initializer=tf.contrib.layers.xavier_initializer(uniform=True), reuse=False): 26 | # x: input tensor (float32)[n, in_dim] 27 | # out_dim: output channels (int) 28 | # name: variable scope (str) 29 | 30 | x = tf.contrib.layers.flatten(x) 31 | with tf.variable_scope(name, reuse=reuse) as scope: 32 | in_dim = x.get_shape()[-1] 33 | w = tf.get_variable('w', shape=[in_dim, out_dim], initializer=initializer) 34 | b = tf.get_variable('b', shape=[out_dim], initializer=tf.constant_initializer(0.0)) 35 | l = tf.add(tf.matmul(x, w), b, name='add') 36 | return l 37 | 38 | def Deconv2D(x, filter_shape, output_shape, out_dim, strides, padding, name, reuse=False): 39 | # x: input tensor (float32) [n, w, h, c] 40 | # filter_shape: conv2d filter (int)[w, h] 41 | # out_dim: output channels (int) 42 | # strides: conv2d stride size (int) 43 | # padding: padding type (str) 44 | 
# name: variable scope (str) 45 | 46 | with tf.variable_scope(name, reuse=reuse) as scope: 47 | in_dim = x.get_shape()[-1] 48 | w = tf.get_variable('w', shape=filter_shape + [out_dim, in_dim], initializer=tf.contrib.layers.xavier_initializer_conv2d(uniform=True)) 49 | b = tf.get_variable('b', shape=[out_dim], initializer=tf.constant_initializer(0.0)) 50 | l = tf.nn.conv2d_transpose(x, w, output_shape=output_shape, strides=[1, strides, strides, 1], padding=padding, name='deconv2d') 51 | l = tf.nn.bias_add(l, b, name='bias_add') 52 | return l 53 | 54 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/misc/check.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | import argparse 6 | import sys, os 7 | import logging 8 | 9 | from model import ActionConditionalVideoPredictionModel 10 | from dataset import Dataset 11 | 12 | def get_config(args): 13 | config = tf.ConfigProto() 14 | config.gpu_options.allow_growth = True 15 | return config 16 | 17 | def main(args): 18 | with tf.Graph().as_default() as graph: 19 | # Create dataset 20 | logging.info('Create data flow from %s' % args.train) 21 | train_data = Dataset(directory=args.train, mean_path=args.mean, batch_size=args.batch_size, num_threads=2, capacity=10000) 22 | 23 | # Create initializer 24 | init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 25 | 26 | # Config session 27 | config = get_config(args) 28 | 29 | # Setup summary 30 | check_summary_writer = tf.summary.FileWriter(os.path.join(args.log, 'check'), graph) 31 | 32 | check_op = tf.cast(train_data()['x_t_1'] * 255.0 + train_data()['mean'], tf.uint8) 33 | 34 | tf.summary.image('x_t_1_batch_restore', check_op, collections=['check']) 35 | check_summary_op = tf.summary.merge_all('check') 36 | 37 | # Start session 38 | with tf.Session(config=config) as sess: 39 | coord = tf.train.Coordinator() 40 | sess.run(init) 41 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 42 | for i in range(10): 43 | x_t_1_batch, summary = sess.run([check_op, check_summary_op]) 44 | check_summary_writer.add_summary(summary, i) 45 | coord.request_stop() 46 | coord.join(threads) 47 | 48 | 49 | if __name__ == '__main__': 50 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('--log', help='summary directory', type=str, default='example/log') 53 | parser.add_argument('--train', help='training data directory', type=str, default='example/train') 54 | parser.add_argument('--test', help='testing data directory', type=str, default='example/test') 55 | parser.add_argument('--mean', help='image mean path', type=str, default='example/mean.npy') 56 | parser.add_argument('--lr', help='learning rate', type=float, default=1e-4) 57 | parser.add_argument('--epoch', help='epoch', type=int, default=15000000) 58 | parser.add_argument('--show_per_epoch', help='epoch', type=int, default=1000) 59 | parser.add_argument('--test_per_epoch', help='epoch', type=int, default=2000) 60 | parser.add_argument('--batch_size', help='batch size', type=int, default=32) 61 | parser.add_argument('--test_batch_size', help='batch size', type=int, default=64) 62 | args = parser.parse_args() 63 | 64 | main(args) 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- 
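Note (added for exposition): a minimal usage sketch of the layer helpers defined in `tfacvp/tf_ops.py` above. The layer sizes and names are illustrative assumptions, not the architecture actually built in `tfacvp/model.py`, and a TensorFlow 1.x graph is assumed:
```
import tensorflow as tf
from tfacvp.tf_ops import Conv2D, Deconv2D, FC, ReLu

# Illustrative composition only; the real network lives in tfacvp/model.py.
x = tf.placeholder(tf.float32, [None, 84, 84, 12], name='x')

# 8x8 conv, stride 2, VALID padding: [N, 84, 84, 12] -> [N, 39, 39, 64]
h = ReLu(Conv2D(x, [8, 8], out_dim=64, strides=2, padding='VALID', name='conv_demo'),
         name='relu_demo')

# Fully connected layer on the flattened feature map.
z = FC(h, out_dim=256, name='fc_demo')

# Transposed convolution back to a single frame; output_shape keeps the batch size dynamic.
y = Deconv2D(h, [8, 8], output_shape=tf.stack([tf.shape(x)[0], 84, 84, 3]),
             out_dim=3, strides=2, padding='VALID', name='deconv_demo')
```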
/baselines/deepq/prediction/misc/test_caffe.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import caffe 3 | import six 4 | import numpy as np 5 | from collections import OrderedDict 6 | from caffe.proto import caffe_pb2 as PB 7 | 8 | import net as N 9 | import cv2 10 | import os 11 | 12 | from dataset import CaffeDataset 13 | 14 | import argparse 15 | import logging 16 | 17 | def post_process(data, mean, scale): 18 | t = data.copy().squeeze() 19 | t /= scale 20 | t += mean 21 | t = t.clip(0, 255) 22 | return t.astype('uint8').squeeze().transpose([1, 0, 2]).transpose([0, 2, 1]) 23 | 24 | class CaffeActionConditionalVideoPredictionModel(object): 25 | def __init__(self, mean, weight, K, num_act, num_step=1, data_path='test'): 26 | self.K = K 27 | self.num_act = num_act 28 | self.num_step = num_step 29 | 30 | caffe.set_mode_gpu() 31 | caffe.set_device(0) 32 | 33 | test_net_file, net_proto = N.create_netfile(1, data_path, mean, K, K, 34 | 1, num_act, num_step=self.num_step, mode='test') 35 | 36 | self.test_net = caffe.Net(test_net_file, caffe.TEST) 37 | self.test_net.copy_from(weight) 38 | 39 | def predict(self, s, a, layer='x_hat-05'): 40 | # s: state (1, 4, 84, 84, 3) 41 | # a: action (1, 1, num_act) 42 | 43 | ''' 44 | Load data to test_net 45 | data = [1, K, 84, 84, 3] 46 | ''' 47 | self.test_net.blobs['data'].data[:] = s 48 | self.test_net.blobs['act'].data[:] = a 49 | self.test_net.forward() 50 | 51 | pred_data = self.test_net.blobs[args.layer].data[:] 52 | 53 | return pred_data 54 | 55 | def main(args): 56 | data = CaffeDataset(dir=args.data, num_act=args.num_act, mean_path=args.mean, mode='caffe') 57 | model = CaffeActionConditionalVideoPredictionModel(mean=args.mean, weight=args.weight, K=4, num_act=args.num_act) 58 | 59 | i = 0 60 | w = model.test_net.params['conv1'][0].data[:] 61 | np.save('conv1_w.npy', w) 62 | for s, a in data(5): 63 | pred_data = model.predict(s, a) 64 | print pred_data.shape 65 | np.save('caffe-%03d.npy' % i, pred_data) 66 | #pred_img = post_process(pred_data, data.mean, 1./255) 67 | #cv2.imwrite('%03d-caffe.png' % i, pred_img) 68 | i += 1 69 | 70 | if __name__ == '__main__': 71 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 72 | parser = argparse.ArgumentParser() 73 | parser.add_argument('--data', help='testing data directory', type=str, required=True) 74 | parser.add_argument('--mean', help='image mean path', type=str, required=True) 75 | parser.add_argument('--weight', help='caffe model', type=str, required=True) 76 | parser.add_argument('--num_act', help='num acts', type=int, required=True) 77 | parser.add_argument('--layer', help='output layer', type=str, required=True) 78 | args = parser.parse_args() 79 | 80 | main(args) 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def 
test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/example/atari-gray/example.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | import argparse 5 | import sys, os 6 | import logging 7 | 8 | def get_config(args): 9 | config = tf.ConfigProto() 10 | config.gpu_options.allow_growth = True 11 | return config 12 | 13 | def get_cv_image(img, mean, scale): 14 | return img 15 | 16 | def main(args): 17 | from tfacvp.model import ActionConditionalVideoPredictionModel 18 | from tfacvp.util import post_process_gray, pre_process_state_gray 19 | 20 | with tf.Graph().as_default() as graph: 21 | # Define tensorflow computation graph 22 | # In this example, I hardcode the arguments num_channel and num_frame for grayscale atari settings 23 | logging.info('Create model [num_act = %d, num_channel = %d, num_frame = %d] for testing' % (args.num_act, 1, 4)) 24 | model = ActionConditionalVideoPredictionModel(num_act=args.num_act, 25 | num_channel=1, num_frame=4, 26 | is_train=False) 27 
| 28 | # Get tensorflow session configuration 29 | config = get_config(args) 30 | 31 | # Load testing state for predicting next frame 32 | scale = 255.0 33 | s = np.load(args.data) 34 | mean = np.load(args.mean) 35 | 36 | with tf.Session(config=config) as sess: 37 | # Restore the model from checkpoint 38 | # If you combine this with your own model, pay attention to variable scopes; otherwise restoring the weights may fail 39 | logging.info('Loading weights from %s' % (args.load)) 40 | model.restore(sess, args.load) 41 | 42 | # Predict next frame conditioned on the specified action 43 | logging.info('Predict next frame conditioned on action %d' % (args.act)) 44 | 45 | # To one hot vector 46 | a = np.identity(args.num_act)[args.act] 47 | 48 | # Predict next frame 49 | s = pre_process_state_gray(s, mean, (1.0 / scale), 4) 50 | print(np.max(s), np.min(s)) 51 | x_t_1_pred_batch = model.predict(sess, s[np.newaxis, :], a[np.newaxis, :])[0] 52 | 53 | # Post process predicted frame for visualization 54 | img = x_t_1_pred_batch[0] 55 | img = post_process_gray(img, mean, scale) 56 | cv2.imwrite('pred.png', img) 57 | 58 | 59 | if __name__ == '__main__': 60 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('--data', help='testing data (.npy), ndarray(shape = [84,84,4])', type=str, required=True) 63 | parser.add_argument('--mean', help='image mean path (should be shipped with pre-trained model)', type=str, required=True) 64 | parser.add_argument('--load', help='model weight path (tensorflow checkpoint)', type=str, required=True) 65 | parser.add_argument('--num_act', help='number of actions in the game\'s action space', type=int, required=True) 66 | parser.add_argument('--act', help='which action you want to take', type=int, required=True) 67 | args = parser.parse_args() 68 | main(args) 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/util.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | def _pre_process(s, mean, scale, num_frame): 6 | # s: [h, w, c*num_frame] 7 | # mean: [h, w, c] 8 | # scale: float32 9 | #print s.shape, mean.shape, np.tile(mean, [1, 1, 4]).shape 10 | #s -= np.tile(mean, [1, 1, num_frame]) 11 | s = s - mean 12 | s = s * scale 13 | return s 14 | 15 | def pre_process_state_rgb(s, mean, scale, num_frame): 16 | return _pre_process(s, mean, scale, num_frame) 17 | 18 | def pre_process_state_gray(s, mean, scale, num_frame): 19 | s = _transform_state_color_space_np(s) 20 | mean = _transform_frame_color_space_np(mean) 21 | return _pre_process(s, mean, scale, num_frame) 22 | 23 | def _post_process(x, mean, scale=255.0): 24 | x *= scale 25 | x += mean 26 | x = np.clip(x, 0, scale) 27 | x = x.astype(np.uint8) 28 | return x 29 | 30 | def post_process_rgb(x, mean, scale): 31 | return _post_process(x, mean, scale) 32 | 33 | def post_process_gray(x, mean, scale): 34 | # x: [h, w, 1] (assume gray) 35 | # mean: [h, w, c*num_frame] 36 | # scale: float32 37 | mean = _transform_frame_color_space_np(mean) 38 | return _post_process(x, mean, scale) 39 | 40 | def _np_one_hot(x, n): 41 | y = np.zeros([len(x), n]) 42 | y[np.arange(len(x)), x] = 1 43 | return y 44 | 45 | def _read_and_decode(directory, s_t_shape, num_act, x_t_1_shape): 46 | filenames = tf.train.match_filenames_once('./%s/*.tfrecords' % (directory))
47 | filename_queue = tf.train.string_input_producer(filenames) 48 | 49 | reader = tf.TFRecordReader() 50 | 51 | _, serialized_example = reader.read(filename_queue) 52 | features = tf.parse_single_example(serialized_example, 53 | features={ 54 | 'a_t': tf.FixedLenFeature([], tf.int64), 55 | 's_t' : tf.FixedLenFeature([], tf.string), 56 | 'x_t_1' : tf.FixedLenFeature([], tf.string), 57 | }) 58 | 59 | s_t = tf.decode_raw(features['s_t'], tf.uint8) 60 | x_t_1 = tf.decode_raw(features['x_t_1'], tf.uint8) 61 | 62 | s_t = tf.reshape(s_t, s_t_shape) 63 | x_t_1 = tf.reshape(x_t_1, x_t_1_shape) 64 | 65 | s_t = tf.cast(s_t, tf.float32) 66 | x_t_1 = tf.cast(x_t_1, tf.float32) 67 | 68 | a_t = tf.cast(features['a_t'], tf.int32) 69 | a_t = tf.one_hot(a_t, num_act) 70 | 71 | return s_t, a_t, x_t_1 72 | 73 | def _transform_frame_color_space(x): 74 | # x: [h, w, c] 75 | return tf.image.rgb_to_grayscale(x) 76 | 77 | def _transform_state_color_space(s): 78 | # s: [h, w, c*num_frame] 79 | num_splits = int(s.shape[-1] / 3) 80 | return tf.concat([_transform_frame_color_space(x) for x in tf.split(s, num_splits, axis=2)], axis=2) 81 | 82 | def _transform_frame_color_space_np(x): 83 | return cv2.cvtColor(x, cv2.COLOR_RGB2GRAY)[:, :, np.newaxis] 84 | 85 | def _transform_state_color_space_np(s): 86 | # s: [h, w, c*num_frame] 87 | num_splits = int(s.shape[-1] / 3) 88 | return np.concatenate([cv2.cvtColor(x, cv2.COLOR_RGB2GRAY)[:,:,np.newaxis] for x in np.split(s, num_splits, axis=2)], axis=2) 89 | 90 | -------------------------------------------------------------------------------- /baselines/deepq/models.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.layers as layers 3 | 4 | 5 | def _mlp(hiddens, inpt, num_actions, scope, reuse=False): 6 | with tf.variable_scope(scope, reuse=reuse): 7 | out = inpt 8 | for hidden in hiddens: 9 | out = layers.fully_connected(out, num_outputs=hidden, activation_fn=tf.nn.relu) 10 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 11 | return out 12 | 13 | 14 | def mlp(hiddens=[]): 15 | """This model takes as input an observation and returns values of all actions. 16 | 17 | Parameters 18 | ---------- 19 | hiddens: [int] 20 | list of sizes of hidden layers 21 | 22 | Returns 23 | ------- 24 | q_func: function 25 | q_function for DQN algorithm. 
26 | """ 27 | return lambda *args, **kwargs: _mlp(hiddens, *args, **kwargs) 28 | 29 | 30 | def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False): 31 | with tf.variable_scope(scope, reuse=reuse): 32 | out = inpt 33 | with tf.variable_scope("convnet"): 34 | for num_outputs, kernel_size, stride in convs: 35 | out = layers.convolution2d(out, 36 | num_outputs=num_outputs, 37 | kernel_size=kernel_size, 38 | stride=stride, 39 | activation_fn=tf.nn.relu) 40 | out = layers.flatten(out) 41 | with tf.variable_scope("action_value"): 42 | action_out = out 43 | for hidden in hiddens: 44 | action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=tf.nn.relu) 45 | action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None) 46 | 47 | if dueling: 48 | with tf.variable_scope("state_value"): 49 | state_out = out 50 | for hidden in hiddens: 51 | state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=tf.nn.relu) 52 | state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None) 53 | action_scores_mean = tf.reduce_mean(action_scores, 1) 54 | action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1) 55 | return state_score + action_scores_centered 56 | else: 57 | return action_scores 58 | return out 59 | 60 | 61 | def cnn_to_mlp(convs, hiddens, dueling=False): 62 | """This model takes as input an observation and returns values of all actions. 63 | 64 | Parameters 65 | ---------- 66 | convs: [(int, int int)] 67 | list of convolutional layers in form of 68 | (num_outputs, kernel_size, stride) 69 | hiddens: [int] 70 | list of sizes of hidden layers 71 | dueling: bool 72 | if true double the output MLP to compute a baseline 73 | for action scores 74 | 75 | Returns 76 | ------- 77 | q_func: function 78 | q_function for DQN algorithm. 79 | """ 80 | 81 | return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, *args, **kwargs) 82 | 83 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/wang2015_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | import os 5 | 6 | import baselines.common.tf_util as U 7 | 8 | from baselines import deepq 9 | from baselines.common.misc_util import get_wrapper_by_name, SimpleMonitor, boolean_flag, set_global_seeds 10 | from baselines.common.atari_wrappers_deprecated import wrap_dqn 11 | from baselines.deepq.experiments.atari.model import model, dueling_model 12 | 13 | 14 | def make_env(game_name): 15 | env = gym.make(game_name + "NoFrameskip-v4") 16 | env_monitored = SimpleMonitor(env) 17 | env = wrap_dqn(env_monitored) 18 | return env_monitored, env 19 | 20 | 21 | def parse_args(): 22 | parser = argparse.ArgumentParser("Evaluate an already learned DQN model.") 23 | # Environment 24 | parser.add_argument("--env", type=str, required=True, help="name of the game") 25 | parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. 
") 26 | boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value") 27 | boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model") 28 | 29 | return parser.parse_args() 30 | 31 | 32 | def wang2015_eval(game_name, act, stochastic): 33 | print("==================== wang2015 evaluation ====================") 34 | episode_rewards = [] 35 | 36 | for num_noops in range(1, 31): 37 | env_monitored, eval_env = make_env(game_name) 38 | eval_env.unwrapped.seed(1) 39 | 40 | get_wrapper_by_name(eval_env, "NoopResetEnv").override_num_noops = num_noops 41 | 42 | eval_episode_steps = 0 43 | done = True 44 | while True: 45 | if done: 46 | obs = eval_env.reset() 47 | eval_episode_steps += 1 48 | action = act(np.array(obs)[None], stochastic=stochastic)[0] 49 | 50 | obs, reward, done, info = eval_env.step(action) 51 | if done: 52 | obs = eval_env.reset() 53 | if len(info["rewards"]) > 0: 54 | episode_rewards.append(info["rewards"][0]) 55 | break 56 | if info["steps"] > 108000: # 5 minutes of gameplay 57 | episode_rewards.append(env_monitored._current_reward) 58 | break 59 | print("Num steps in episode {} was {} yielding {} reward".format( 60 | num_noops, eval_episode_steps, episode_rewards[-1]), flush=True) 61 | print("Evaluation results: " + str(np.mean(episode_rewards))) 62 | print("=============================================================") 63 | return np.mean(episode_rewards) 64 | 65 | 66 | def main(): 67 | set_global_seeds(1) 68 | args = parse_args() 69 | with U.make_session(4) as sess: # noqa 70 | _, env = make_env(args.env) 71 | act = deepq.build_act( 72 | make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), 73 | q_func=dueling_model if args.dueling else model, 74 | num_actions=env.action_space.n) 75 | 76 | U.load_state(os.path.join(args.model_dir, "saved")) 77 | wang2015_eval(args.env, act, stochastic=args.stochastic) 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Detecting Adversarial Attacks on Neural Network Policies with Visual Foresight 2 | 3 | ![](https://user-images.githubusercontent.com/7057863/30933455-9e86ba96-a398-11e7-87fa-d6339ad60c51.gif) 4 | 5 | **DISCLAIMER**: This repository is a modified version of [openai/baselines](https://github.com/openai/gym). 6 | 7 | ### Publication 8 | 9 | Paper: https://drive.google.com/file/d/0B50cbskLVq-ed2F3eUw4SWQxbUU/view 10 | 11 | ``` 12 | @article{Lin2017RLAttackDetection, 13 | title={Detecting Adversarial Attacks on Neural Network Policies with Visual Foresight}, 14 | author={Lin, Yen-Chen and Liu, Ming-Yu and Sun, Min and Huang, Jia-Bin}, 15 | journal={arXiv preprint arXiv:1710.00814}, 16 | year={2017} 17 | } 18 | ``` 19 | 20 | 21 | ### Dependencies 22 | - Python 3 23 | - cleverhans v2.0.0 24 | 25 | ``` 26 | pip install -e git+http://github.com/tensorflow/cleverhans.git#egg=cleverhans 27 | ``` 28 | 29 | - others (e.g., gym, baselines, ...) 30 | 31 | ``` 32 | git clone https://github.com/yenchenlin/rl-attack-detection.git 33 | cd rl-attack-detection 34 | pip install -e . 35 | ``` 36 | 37 | 38 | ### Example 39 | Here I'll use Atari game Freeway as an example to demonstrate how to run the code. 40 | 41 | Let's start by switch to the home directory: 42 | 43 | ``` 44 | cd rl-attack-detection 45 | ``` 46 | 47 | **1. 
Download pre-trained agent** 48 | 49 | Download [this repository](https://drive.google.com/open?id=0B50cbskLVq-eRzBtNktCVE1SSms) which contains pre-trained DQN agents for Freeway to `./atari-pre-trained-agents/`. 50 | 51 | **2. Run pre-trained agent** 52 | 53 | Test the performance of the pre-trained agent: 54 | 55 | ``` 56 | python -m baselines.deepq.experiments.atari.enjoy --model-dir ./atari-pre-trained-agents/Freeway --env Freeway 57 | ``` 58 | 59 | For game Freeway, you should see output similar to follows: 60 | 61 | ``` 62 | 29.0 63 | 27.0 64 | 28.0 65 | ... 66 | ``` 67 | This means that our agent is now a master of the game! 68 | 69 | **3. Perform adversarial attack** 70 | 71 | Use adversarial example crafted by FGSM to attack deep RL agent: 72 | 73 | ``` 74 | python -m baselines.deepq.experiments.atari.enjoy --model-dir ./atari-pre-trained-agents/Freeway --env Freeway --attack fgsm 75 | ``` 76 | 77 | **Other attacks:** argument passed to `--attack` can be `fgsm`, `iterative`, `cwl2`. 78 | 79 | 80 | You should see output similar to follows: 81 | 82 | ``` 83 | 0.0 84 | 0.0 85 | 0.0 86 | ... 87 | ``` 88 | 89 | which means that the agent is fooled by adversary and went crazy! 90 | 91 | **4. Use visual foresight as defense** 92 | 93 | To protect the agent, first download [this repository](https://drive.google.com/drive/folders/0B50cbskLVq-eTGxqNWtkSGJsRzQ) which contains pre-trained visual foresight module for Freeway to `./atari-visual-foresight/`. 94 | 95 | Then, we can use visual foresight to protect deep RL agent: 96 | 97 | ``` 98 | python -m baselines.deepq.experiments.atari.enjoy --model-dir ./atari-pre-trained-agents/Freeway --env Freeway --attack fgsm --defense foresight 99 | ``` 100 | 101 | Now, you should see similar outputs to **step. 2**, which means that our agents work well again. 102 | 103 | ### Add More Attacks 104 | To use new attack methods, you can add the attack code [here](https://github.com/yenchenlin/rl-attack-detection/blob/master/baselines/deepq/build_graph.py#L156). 105 | Generally, attack methods that follow the interface of [cleverhans](https://github.com/tensorflow/cleverhans) can be added within few lines. 
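As a rough sketch (not code that ships with this repo), an attack that follows the cleverhans `Attack` interface could be wired in along these lines. The helper name `build_craft_adv_obs` is made up for illustration, the surrounding names (`observations_ph`, `q_func`, `num_actions`) stand in for whatever `build_graph.py` exposes at that point, and the attack parameters are placeholders:

```
# Hypothetical sketch: plugging another cleverhans v2 attack into the
# adversarial-example graph. Any Attack subclass with a `generate` method
# can be substituted for FastGradientMethod.
import tensorflow as tf
from cleverhans.model import CallableModelWrapper
from cleverhans.attacks import FastGradientMethod


def build_craft_adv_obs(observations_ph, q_func, num_actions):
    # cleverhans treats the Q-network's action values as logits.
    wrapped_q = CallableModelWrapper(
        lambda obs: q_func(obs, num_actions, scope="q_func", reuse=True), "logits")
    attack = FastGradientMethod(wrapped_q, sess=tf.get_default_session())
    # Placeholder parameters: tune eps and the clip range to your input scale.
    return attack.generate(observations_ph, eps=0.01, clip_min=0.0, clip_max=1.0)
```

The crafted tensor is then returned next to `act` (see how `enjoy.py` unpacks `act, craft_adv_obs = deepq.build_act(...)`), so the rest of the evaluation loop needs no changes.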
106 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/custom_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import itertools 3 | import numpy as np 4 | import tensorflow as tf 5 | import tensorflow.contrib.layers as layers 6 | 7 | import baselines.common.tf_util as U 8 | 9 | from baselines import logger 10 | from baselines import deepq 11 | from baselines.deepq.replay_buffer import ReplayBuffer 12 | from baselines.common.schedules import LinearSchedule 13 | 14 | 15 | def model(inpt, num_actions, scope, reuse=False): 16 | """This model takes as input an observation and returns values of all actions.""" 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = inpt 19 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) 20 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 21 | return out 22 | 23 | 24 | if __name__ == '__main__': 25 | with U.make_session(8): 26 | # Create the environment 27 | env = gym.make("CartPole-v0") 28 | # Create all the functions necessary to train the model 29 | act, train, update_target, debug = deepq.build_train( 30 | make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name), 31 | q_func=model, 32 | num_actions=env.action_space.n, 33 | optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), 34 | ) 35 | # Create the replay buffer 36 | replay_buffer = ReplayBuffer(50000) 37 | # Create the schedule for exploration starting from 1 (every action is random) down to 38 | # 0.02 (98% of actions are selected according to values predicted by the model). 39 | exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) 40 | 41 | # Initialize the parameters and copy them to the target network. 42 | U.initialize() 43 | update_target() 44 | 45 | episode_rewards = [0.0] 46 | obs = env.reset() 47 | for t in itertools.count(): 48 | # Take action and update exploration to the newest value 49 | action = act(obs[None], update_eps=exploration.value(t))[0] 50 | new_obs, rew, done, _ = env.step(action) 51 | # Store transition in the replay buffer. 52 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 53 | obs = new_obs 54 | 55 | episode_rewards[-1] += rew 56 | if done: 57 | obs = env.reset() 58 | episode_rewards.append(0) 59 | 60 | is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 61 | if is_solved: 62 | # Show off the result 63 | env.render() 64 | else: 65 | # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 66 | if t > 1000: 67 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) 68 | train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) 69 | # Update target network periodically. 
70 | if t % 1000 == 0: 71 | update_target() 72 | 73 | if done and len(episode_rewards) % 10 == 0: 74 | logger.record_tabular("steps", t) 75 | logger.record_tabular("episodes", len(episode_rewards)) 76 | logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) 77 | logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) 78 | logger.dump_tabular() 79 | -------------------------------------------------------------------------------- /baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 
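            # (t is below the first endpoint or at/after the last one); fall back to
            # outside_value, or fail loudly if none was configured.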
72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import cv2 4 | 5 | import argparse 6 | import sys, os 7 | import logging 8 | 9 | from tfacvp.model import ActionConditionalVideoPredictionModel 10 | from tfacvp.dataset import Dataset 11 | 12 | MODEL_NUM_CHANNELS_DEFS = {'rgb': 3, 'gray': 1} 13 | MODEL_NUM_FRAMES = 4 14 | DATASET_NUM_CHANNELS = 1 15 | DATASET_NUM_FRAMES = 4 16 | S_SHAPE = (84, 84, DATASET_NUM_CHANNELS * DATASET_NUM_FRAMES) 17 | X_SHAPE = (84, 84, DATASET_NUM_CHANNELS) 18 | 19 | def get_config(args): 20 | config = tf.ConfigProto() 21 | config.gpu_options.allow_growth = True 22 | return config 23 | 24 | def main(args): 25 | with tf.Graph().as_default() as graph: 26 | # Create dataset 27 | logging.info('Create data flow from %s [colorspace = %s]' % (args.train, args.color)) 28 | train_data = Dataset(directory=args.train, 29 | num_act=args.num_act, 30 | mean_path=args.mean, 31 | batch_size=args.batch_size, 32 | s_t_shape=S_SHAPE, 33 | x_t_1_shape=X_SHAPE, 34 | num_threads=4, capacity=10000) 35 | 36 | # Create model 37 | logging.info('Create model for training [lr = %f, epochs = %d, batch_size = %d]' % (args.lr, args.epoch, args.batch_size) ) 38 | model = ActionConditionalVideoPredictionModel(inputs=train_data(), 39 | num_act=args.num_act, 40 | num_channel=MODEL_NUM_CHANNELS_DEFS[args.color], 41 | num_frame=MODEL_NUM_FRAMES, 42 | optimizer_args={'lr': args.lr}) 43 | 44 | # Create prediction summary 45 | ground_truth_image = tf.cast(model.inputs['x_t_1'] * 255.0 + train_data.mean_const, tf.uint8) 46 | pred_image = tf.cast(model.output * 255.0 + train_data.mean_const, tf.uint8) 47 | tf.summary.image('ground', ground_truth_image, collections=['train']) 48 | tf.summary.image('pred', pred_image, collections=['train']) 49 | 50 | # Create initializer 51 | init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 52 | 53 | # Get optimizer operation and loss opearation from model 54 | train_op = model.train 55 | loss_op = model.loss 56 | global_step_var = model.global_step 57 | 58 | # Config session 59 | config = get_config(args) 60 | 61 | # Setup summary 62 | train_summary_op = tf.summary.merge_all('train') 63 | 64 | # Setup supervisor 65 | sv = tf.train.Supervisor(logdir=os.path.join(args.log, 'train'), 66 | init_op=init, 67 | graph=graph, 68 | summary_op=train_summary_op, 69 | global_step=global_step_var, 70 | saver=tf.train.Saver(max_to_keep=None), 71 | save_model_secs=1200) 72 | 
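        # The Supervisor runs init_op on first launch, saves a checkpoint every
        # save_model_secs seconds, and owns the summary writer used by
        # sv.summary_computed() below.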
73 | # Start session 74 | with sv.managed_session(config=config) as sess: 75 | sv.start_queue_runners(sess) 76 | for epoch in range(args.epoch): 77 | if (epoch) % args.show_per_epoch == 0: 78 | _, train_loss, train_summary, global_step = sess.run([train_op, loss_op, train_summary_op, global_step_var]) 79 | logging.info('Epoch %d: Training L2 loss = %f' % (global_step, train_loss)) 80 | sv.summary_computed(sess, train_summary) 81 | else: 82 | sess.run([train_op]) 83 | sv.request_stop() 84 | 85 | 86 | if __name__ == '__main__': 87 | logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('--log', help='summary directory', type=str, default='example/log') 90 | parser.add_argument('--train', help='training data directory', type=str, required=True) 91 | parser.add_argument('--test', help='testing data directory', type=str, required=True) 92 | parser.add_argument('--mean', help='image mean path', type=str, required=True) 93 | parser.add_argument('--num_act', help='num acts', type=int, required=True) 94 | parser.add_argument('--color', help='colorspace', type=str, choices=['rgb', 'gray'], required=True) 95 | parser.add_argument('--lr', help='learning rate', type=float, default=1e-4) 96 | parser.add_argument('--epoch', help='epoch', type=int, default=15000000) 97 | parser.add_argument('--show_per_epoch', help='epoch', type=int, default=1000) 98 | parser.add_argument('--test_per_epoch', help='epoch', type=int, default=2000) 99 | parser.add_argument('--batch_size', help='batch size', type=int, default=32) 100 | parser.add_argument('--test_batch_size', help='batch size', type=int, default=64) 101 | args = parser.parse_args() 102 | 103 | main(args) 104 | -------------------------------------------------------------------------------- /baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as regular array, but with two 11 | important differences: 12 | 13 | a) setting item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 15 | b) user has access to an efficient `reduce` 16 | operation which reduces `operation` over 17 | a contiguous subsequence of items in the 18 | array. 19 | 20 | Paramters 21 | --------- 22 | capacity: int 23 | Total size of the array - must be a power of two. 24 | operation: lambda obj, obj -> obj 25 | and operation for combining elements (eg. sum, max) 26 | must for a mathematical group together with the set of 27 | possible values for array elements. 28 | neutral_element: obj 29 | neutral element for the operation above. eg. float('-inf') 30 | for max and 0 for sum. 31 | """ 32 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 
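        # (capacity & (capacity - 1)) == 0 is the standard bit trick for detecting
        # a power of two; zero itself is excluded by the capacity > 0 check.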
33 | self._capacity = capacity 34 | self._value = [neutral_element for _ in range(2 * capacity)] 35 | self._operation = operation 36 | 37 | def _reduce_helper(self, start, end, node, node_start, node_end): 38 | if start == node_start and end == node_end: 39 | return self._value[node] 40 | mid = (node_start + node_end) // 2 41 | if end <= mid: 42 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 43 | else: 44 | if mid + 1 <= start: 45 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 46 | else: 47 | return self._operation( 48 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 49 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 50 | ) 51 | 52 | def reduce(self, start=0, end=None): 53 | """Returns result of applying `self.operation` 54 | to a contiguous subsequence of the array. 55 | 56 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 57 | 58 | Parameters 59 | ---------- 60 | start: int 61 | beginning of the subsequence 62 | end: int 63 | end of the subsequences 64 | 65 | Returns 66 | ------- 67 | reduced: obj 68 | result of reducing self.operation over the specified range of array elements. 69 | """ 70 | if end is None: 71 | end = self._capacity 72 | if end < 0: 73 | end += self._capacity 74 | end -= 1 75 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 76 | 77 | def __setitem__(self, idx, val): 78 | # index of the leaf 79 | idx += self._capacity 80 | self._value[idx] = val 81 | idx //= 2 82 | while idx >= 1: 83 | self._value[idx] = self._operation( 84 | self._value[2 * idx], 85 | self._value[2 * idx + 1] 86 | ) 87 | idx //= 2 88 | 89 | def __getitem__(self, idx): 90 | assert 0 <= idx < self._capacity 91 | return self._value[self._capacity + idx] 92 | 93 | 94 | class SumSegmentTree(SegmentTree): 95 | def __init__(self, capacity): 96 | super(SumSegmentTree, self).__init__( 97 | capacity=capacity, 98 | operation=operator.add, 99 | neutral_element=0.0 100 | ) 101 | 102 | def sum(self, start=0, end=None): 103 | """Returns arr[start] + ... + arr[end]""" 104 | return super(SumSegmentTree, self).reduce(start, end) 105 | 106 | def find_prefixsum_idx(self, prefixsum): 107 | """Find the highest index `i` in the array such that 108 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 109 | 110 | if array values are probabilities, this function 111 | allows to sample indexes according to the discrete 112 | probability efficiently. 
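        For example, if the array holds [0.3, 0.1, 0.4, 0.2], then
        find_prefixsum_idx(0.35) returns 1, since 0.3 <= 0.35 but 0.3 + 0.1 > 0.35.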
113 | 114 | Parameters 115 | ---------- 116 | perfixsum: float 117 | upperbound on the sum of array prefix 118 | 119 | Returns 120 | ------- 121 | idx: int 122 | highest index satisfying the prefixsum constraint 123 | """ 124 | assert 0 <= prefixsum <= self.sum() + 1e-5 125 | idx = 1 126 | while idx < self._capacity: # while non-leaf 127 | if self._value[2 * idx] > prefixsum: 128 | idx = 2 * idx 129 | else: 130 | prefixsum -= self._value[2 * idx] 131 | idx = 2 * idx + 1 132 | return idx - self._capacity 133 | 134 | 135 | class MinSegmentTree(SegmentTree): 136 | def __init__(self, capacity): 137 | super(MinSegmentTree, self).__init__( 138 | capacity=capacity, 139 | operation=min, 140 | neutral_element=float('inf') 141 | ) 142 | 143 | def min(self, start=0, end=None): 144 | """Returns min(arr[start], ..., arr[end])""" 145 | 146 | return super(MinSegmentTree, self).reduce(start, end) 147 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/enjoy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import os 4 | import numpy as np 5 | 6 | from gym.monitoring import VideoRecorder 7 | 8 | import baselines.common.tf_util as U 9 | 10 | from baselines import deepq 11 | from baselines.common.misc_util import ( 12 | boolean_flag, 13 | SimpleMonitor, 14 | ) 15 | from baselines.common.atari_wrappers_deprecated import wrap_dqn 16 | from baselines.deepq.experiments.atari.model import model, dueling_model 17 | import tensorflow as tf 18 | import cv2 19 | from collections import deque 20 | 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser("Run an already learned DQN model.") 24 | # Environment 25 | parser.add_argument("--env", type=str, required=True, help="name of the game") 26 | parser.add_argument("--model-dir", type=str, default=None, help="load model from this directory. 
") 27 | parser.add_argument("--video", type=str, default=None, help="Path to mp4 file where the video of first episode will be recorded.") 28 | boolean_flag(parser, "stochastic", default=True, help="whether or not to use stochastic actions according to models eps value") 29 | boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model") 30 | parser.add_argument("--attack", type=str, default=None, help="Method to attack the model.") 31 | parser.add_argument("--defense", type=str, default=None, help="Method to defend the attack.") 32 | 33 | return parser.parse_args() 34 | 35 | 36 | def make_env(game_name): 37 | env = gym.make(game_name + "NoFrameskip-v4") 38 | env = SimpleMonitor(env) 39 | env = wrap_dqn(env) 40 | return env 41 | 42 | 43 | def load_visual_foresight(game_name): 44 | sess = U.get_session() 45 | from baselines.deepq.prediction.tfacvp.model import ActionConditionalVideoPredictionModel 46 | gen_dir = './atari-visual-foresight/' 47 | model_path = os.path.join(gen_dir, '{}/model.ckpt'.format(game_name)) 48 | mean_path = os.path.join(gen_dir, '{}/mean.npy'.format(game_name)) 49 | game_screen_mean = np.load(mean_path) 50 | with tf.variable_scope('G'): 51 | foresight = ActionConditionalVideoPredictionModel(num_act=env.action_space.n, num_channel=1, is_train=False) 52 | foresight.restore(sess, model_path, 'G') 53 | return foresight, game_screen_mean 54 | 55 | 56 | def foresee(sess, obs, act, gt, mean, model, n_actions, step): 57 | onehot_act = np.zeros((1, n_actions)) 58 | onehot_act[0, act] = 1 59 | obs = obs - mean[None] 60 | obs = obs * 1/255.0 61 | pred_frame = model.predict(sess, obs, onehot_act)[0] 62 | pred_frame = pred_frame* 255.0 63 | pred_frame = pred_frame + mean[None] 64 | #print(gt[:, :, -1].shape, pred_frame.shape) 65 | #print(np.sum(gt[:, :, -1][:, :, np.newaxis] - pred_frame[0, :, :, :])) 66 | #cv2.imwrite('./tmp/gt_{}.png'.format(step), gt[:, :, -1][:, :, np.newaxis]) 67 | #cv2.imwrite('./tmp/pred_{}.png'.format(step), pred_frame[0, :, :, :]) 68 | return pred_frame[0, :, :, 0] 69 | 70 | 71 | def play(env, act, craft_adv_obs, stochastic, video_path, game_name, attack, defense): 72 | if defense == 'foresight': 73 | vf, game_screen_mean = load_visual_foresight(game_name) 74 | pred_obs = deque(maxlen=4) 75 | 76 | num_episodes = 0 77 | video_recorder = None 78 | video_recorder = VideoRecorder( 79 | env, video_path, enabled=video_path is not None) 80 | 81 | t = 0 82 | obs = env.reset() 83 | while True: 84 | #env.unwrapped.render() 85 | video_recorder.capture_frame() 86 | 87 | # Attack 88 | if craft_adv_obs != None: 89 | # Craft adv. 
examples 90 | adv_obs = craft_adv_obs(np.array(obs)[None], stochastic=stochastic)[0] 91 | action = act(np.array(adv_obs)[None], stochastic=stochastic)[0] 92 | else: 93 | # Normal 94 | action = act(np.array(obs)[None], stochastic=stochastic)[0] 95 | 96 | # Defense 97 | if t > 4 and defense == 'foresight': 98 | pred_obs.append( 99 | foresee(U.get_session(), old_obs, old_action, np.array(obs), game_screen_mean, vf, 100 | env.action_space.n, t) 101 | ) 102 | if len(pred_obs) == 4: 103 | action = act(np.stack(pred_obs, axis=2)[None], stochastic=stochastic)[0] 104 | 105 | old_obs = obs 106 | old_action = action 107 | 108 | # RL loop 109 | obs, rew, done, info = env.step(action) 110 | t += 1 111 | if done: 112 | t = 0 113 | obs = env.reset() 114 | if len(info["rewards"]) > num_episodes: 115 | if len(info["rewards"]) == 1 and video_recorder.enabled: 116 | # save video of first episode 117 | print("Saved video.") 118 | video_recorder.close() 119 | video_recorder.enabled = False 120 | print(info["rewards"][-1]) 121 | num_episodes = len(info["rewards"]) 122 | 123 | 124 | if __name__ == '__main__': 125 | with U.make_session(4) as sess: 126 | args = parse_args() 127 | env = make_env(args.env) 128 | # Build graph and load agents 129 | act, craft_adv_obs = deepq.build_act( 130 | make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), 131 | q_func=dueling_model if args.dueling else model, 132 | num_actions=env.action_space.n, 133 | attack=args.attack, 134 | model_path=os.path.join(args.model_dir, "saved") 135 | ) 136 | play(env, act, craft_adv_obs, args.stochastic, args.video, args.env, args.attack, args.defense) 137 | -------------------------------------------------------------------------------- /baselines/common/azure_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import zipfile 4 | 5 | from azure.common import AzureMissingResourceHttpError 6 | from azure.storage.blob import BlobService 7 | from shutil import unpack_archive 8 | from threading import Event 9 | 10 | """TODOS: 11 | - use Azure snapshots instead of hacky backups 12 | """ 13 | 14 | 15 | def fixed_list_blobs(service, *args, **kwargs): 16 | """By defualt list_containers only returns a subset of results. 17 | 18 | This function attempts to fix this. 
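    It does so by calling list_blobs repeatedly and following next_marker until
    the service returns an empty marker.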
19 | """ 20 | res = [] 21 | next_marker = None 22 | while next_marker is None or len(next_marker) > 0: 23 | kwargs['marker'] = next_marker 24 | gen = service.list_blobs(*args, **kwargs) 25 | for b in gen: 26 | res.append(b.name) 27 | next_marker = gen.next_marker 28 | return res 29 | 30 | 31 | def make_archive(source_path, dest_path): 32 | if source_path.endswith(os.path.sep): 33 | source_path = source_path.rstrip(os.path.sep) 34 | prefix_path = os.path.dirname(source_path) 35 | with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_STORED) as zf: 36 | if os.path.isdir(source_path): 37 | for dirname, subdirs, files in os.walk(source_path): 38 | zf.write(dirname, os.path.relpath(dirname, prefix_path)) 39 | for filename in files: 40 | filepath = os.path.join(dirname, filename) 41 | zf.write(filepath, os.path.relpath(filepath, prefix_path)) 42 | else: 43 | zf.write(source_path, os.path.relpath(source_path, prefix_path)) 44 | 45 | 46 | class Container(object): 47 | services = {} 48 | 49 | def __init__(self, account_name, account_key, container_name, maybe_create=False): 50 | self._account_name = account_name 51 | self._container_name = container_name 52 | if account_name not in Container.services: 53 | Container.services[account_name] = BlobService(account_name, account_key) 54 | self._service = Container.services[account_name] 55 | if maybe_create: 56 | self._service.create_container(self._container_name, fail_on_exist=False) 57 | 58 | def put(self, source_path, blob_name, callback=None): 59 | """Upload a file or directory from `source_path` to azure blob `blob_name`. 60 | 61 | Upload progress can be traced by an optional callback. 62 | """ 63 | upload_done = Event() 64 | 65 | def progress_callback(current, total): 66 | if callback: 67 | callback(current, total) 68 | if current >= total: 69 | upload_done.set() 70 | 71 | # Attempt to make backup if an existing version is already available 72 | try: 73 | x_ms_copy_source = "https://{}.blob.core.windows.net/{}/{}".format( 74 | self._account_name, 75 | self._container_name, 76 | blob_name 77 | ) 78 | self._service.copy_blob( 79 | container_name=self._container_name, 80 | blob_name=blob_name + ".backup", 81 | x_ms_copy_source=x_ms_copy_source 82 | ) 83 | except AzureMissingResourceHttpError: 84 | pass 85 | 86 | with tempfile.TemporaryDirectory() as td: 87 | arcpath = os.path.join(td, "archive.zip") 88 | make_archive(source_path, arcpath) 89 | self._service.put_block_blob_from_path( 90 | container_name=self._container_name, 91 | blob_name=blob_name, 92 | file_path=arcpath, 93 | max_connections=4, 94 | progress_callback=progress_callback, 95 | max_retries=10) 96 | upload_done.wait() 97 | 98 | def get(self, dest_path, blob_name, callback=None): 99 | """Download a file or directory to `dest_path` to azure blob `blob_name`. 100 | 101 | Warning! If directory is downloaded the `dest_path` is the parent directory. 102 | 103 | Upload progress can be traced by an optional callback. 
104 | """ 105 | download_done = Event() 106 | 107 | def progress_callback(current, total): 108 | if callback: 109 | callback(current, total) 110 | if current >= total: 111 | download_done.set() 112 | 113 | with tempfile.TemporaryDirectory() as td: 114 | arcpath = os.path.join(td, "archive.zip") 115 | for backup_blob_name in [blob_name, blob_name + '.backup']: 116 | try: 117 | blob_size = self._service.get_blob_properties( 118 | blob_name=backup_blob_name, 119 | container_name=self._container_name 120 | )['content-length'] 121 | if int(blob_size) > 0: 122 | self._service.get_blob_to_path( 123 | container_name=self._container_name, 124 | blob_name=backup_blob_name, 125 | file_path=arcpath, 126 | max_connections=4, 127 | progress_callback=progress_callback, 128 | max_retries=10) 129 | unpack_archive(arcpath, dest_path) 130 | download_done.wait() 131 | return True 132 | except AzureMissingResourceHttpError: 133 | pass 134 | return False 135 | 136 | def list(self, prefix=None): 137 | """List all blobs in the container.""" 138 | return fixed_list_blobs(self._service, self._container_name, prefix=prefix) 139 | 140 | def exists(self, blob_name): 141 | """Returns true if `blob_name` exists in container.""" 142 | try: 143 | self._service.get_blob_properties( 144 | blob_name=blob_name, 145 | container_name=self._container_name 146 | ) 147 | return True 148 | except AzureMissingResourceHttpError: 149 | return False 150 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/dataset.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import logging 4 | import os, glob, cv2, re 5 | 6 | from tool.episode_reader import EpisodeReader 7 | from tfacvp.util import _read_and_decode 8 | from tfacvp.util import * 9 | 10 | class Dataset(object): 11 | def __init__(self, directory, num_act, mean_path, num_threads=1, capacity=1e5, batch_size=32, 12 | scale=(1.0/255.0), s_t_shape=[84, 84, 4], x_t_1_shape=[84, 84, 1], colorspace='gray'): 13 | self.scale = scale 14 | self.s_t_shape = s_t_shape 15 | self.x_t_1_shape = x_t_1_shape 16 | 17 | # Load image mean 18 | mean = np.load(os.path.join(mean_path)) 19 | 20 | # Prepare data flow 21 | s_t, a_t, x_t_1 = _read_and_decode(directory, 22 | s_t_shape=s_t_shape, 23 | num_act=num_act, 24 | x_t_1_shape=x_t_1_shape) 25 | self.mean = mean 26 | self.s_t_batch, self.a_t_batch, self.x_t_1_batch = tf.train.shuffle_batch([s_t, a_t, x_t_1], 27 | batch_size=batch_size, capacity=capacity, 28 | min_after_dequeue=int(capacity*0.25), 29 | num_threads=num_threads) 30 | 31 | # Subtract image mean (according to J Oh design) 32 | self.mean_const = tf.constant(mean, dtype=tf.float32) 33 | print(self.mean_const.get_shape()) 34 | self.s_t_batch = (self.s_t_batch - tf.tile(self.mean_const, [1, 1, 4])) * scale 35 | self.x_t_1_batch = (self.x_t_1_batch - self.mean_const) * scale 36 | 37 | def __call__(self): 38 | return {'s_t': self.s_t_batch, 39 | 'a_t': self.a_t_batch, 40 | 'x_t_1': self.x_t_1_batch} 41 | 42 | class CaffeDataset(object): 43 | ''' 44 | Used to load data with directory structure in original paper 45 | ''' 46 | def __init__(self, dir, num_act, mean_path, mode='tf', scale=(1./255.), img_shape=[84, 84], num_frame=4, num_channel=3): 47 | # dir: image data directory, each image should be named as %05d.png 48 | # num_act: number of action in action space (only support discrete action) 49 | # mean_path: mean image file path (NOTE: you must 
convert mean.binaryproto to npy file) 50 | # mode: tf or caffe (differ in s, a format) 51 | # num_frame: initial frame 52 | # num_channel: number of channel per frame 53 | self.num_act = num_act 54 | self.dir = dir 55 | self.mode = mode 56 | self.scale = scale 57 | self.img_shaep = img_shape 58 | self.num_frame = num_frame 59 | self.num_channel = num_channel 60 | 61 | pat = re.compile('.*npy') 62 | if pat.match(mean_path): 63 | logging.info('Load mean with npy') 64 | self.mean = np.load(mean_path) 65 | else: 66 | import caffe 67 | logging.info('Load mean with caffe') 68 | with open(mean_path, 'rb') as mean_file: 69 | mean_blob = caffe.proto.caffe_pb2.BlobProto() 70 | mean_bin = mean_file.read() 71 | mean_blob.ParseFromString(mean_bin) 72 | self.mean = caffe.io.blobproto_to_array(mean_blob).squeeze() 73 | 74 | if self.mode == 'tf': 75 | self.mean = np.transpose(self.mean, [1, 2, 0]) 76 | 77 | def _process_frame(self, s, img): 78 | # s: state np array 79 | # img: frame input 80 | img = img.astype(np.float32) 81 | if self.mode == 'caffe': 82 | img = np.transpose(img, [2, 0, 1]) 83 | img -= self.mean 84 | img *= self.scale 85 | if self.mode == 'tf': 86 | s[:, :, :-self.num_channel] = s[:, :, self.num_channel:] 87 | s[:, :, -self.num_channel:] = img 88 | else: 89 | s[:-1, :, :, :] = s[1:, :, :, :] 90 | s[-1, :, :, :] = img 91 | return s 92 | 93 | def _process_act(self, a, act): 94 | if self.mode == 'tf': 95 | a[:-1] = a[1:] 96 | a[-1] = act 97 | else: 98 | a[:, :-1] = a[:, 1:] 99 | a[:, -1] = act 100 | return a 101 | 102 | def __call__(self, max_iter=None): 103 | with open(os.path.join(self.dir, 'act.log')) as act_log: 104 | cnt_frame = 0 105 | lim = self.num_frame 106 | if self.mode == 'tf': 107 | s = np.zeros(self.img_shape + [self.num_frame * self.num_channel], dtype=np.float32) 108 | a = np.zeros([self.num_frame, 1], dtype=np.int32) 109 | else: 110 | s = np.zeros([self.num_frame, self.num_channel] + self.img_shape, dtype=np.float32) 111 | a = np.zeros([self.num_frame, 1], dtype=np.int32) 112 | 113 | for filename in sorted(glob.glob(os.path.join(self.dir, '*.png')))[:max_iter]: 114 | logging.info('%s' % filename) 115 | img = cv2.imread(filename) 116 | 117 | s = self._process_frame(s, img) 118 | a = self._process_act(a, int(act_log.readline()[:-1])) 119 | 120 | if cnt_frame < lim: 121 | cnt_frame += 1 122 | else: 123 | yield s, _np_one_hot(a[-1], self.num_act) 124 | 125 | class NumpyDataset(object): 126 | def __init__(self, path, mean_path, num_act, scale=(1./255.), s_shape=[84,84,12]): 127 | # path: tfrecords path 128 | # num_act: number of action in action space 129 | # mean_path: mean file path (must be a npy file, with [h, w, c]) 130 | # scale: image scale 131 | # s_shape: state shape [batch_size, h, w, c * num_frame] 132 | self.path = path 133 | self.mean = np.load(mean_path) 134 | self.num_act = num_act 135 | self.scale = scale 136 | self.s_shape = s_shape 137 | 138 | def _preprocess(self, s, a, x_t_1): 139 | s -= np.tile(self.mean, [4]) 140 | s *= self.scale 141 | x_t_1 -= self.mean 142 | x_t_1 *= self.scale 143 | a = _np_one_hot([a], self.num_act) 144 | return s, a, x_t_1 145 | 146 | def __call__(self, max_iter=None): 147 | reader = EpisodeReader(self.path, self.s_shape[0], self.s_shape[1]) 148 | i = 0 149 | for s, a, x_t_1 in reader.read(): 150 | yield self._preprocess(s, a, x_t_1) 151 | if max_iter and i >= max_iter: 152 | break 153 | i += 1 154 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/old_model.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import re 4 | 5 | from .tf_ops import ReLu, Conv2D, FC, Deconv2D 6 | 7 | NUM_CHANNELS = 3 8 | NUM_FRAMES = 4 9 | 10 | class ActionConditionalVideoPredictionModel(object): 11 | def __init__(self, num_act, inputs=None, 12 | is_train=True, 13 | with_summary=True, 14 | loss_args=None, 15 | optimizer_args=None): 16 | # num_act: number of action in action space (only discrete) 17 | # inputs: used to create model inputs (dict) 18 | # is_train: is training phase 19 | # loss_args: loss function arguments (e.g. lamb) 20 | # optimizer_args: optimizer arguments (e.g. optimizer type, learning rate, ...) (dict) 21 | self.is_train = is_train 22 | self.num_act = num_act 23 | self.optimizer_args = optimizer_args 24 | self.loss_args = loss_args 25 | self._create_input(inputs) 26 | self._create_model() 27 | self._create_output() 28 | self._create_loss() 29 | 30 | if self.is_train: 31 | self._create_optimizer() 32 | if with_summary: 33 | self._create_summary() 34 | 35 | def _create_input(self, inputs): 36 | # inputs: if None, use tf.placeholder as input 37 | # if not None, expected inputs is a dict 38 | if inputs == None: 39 | self.inputs = {'s_t': tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, (NUM_CHANNELS * NUM_FRAMES)]), 40 | 'a_t': tf.placeholder(dtype=tf.int32, shape=[None, self.num_act]), 41 | 'x_t_1': tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, (NUM_CHANNELS)])} 42 | else: 43 | assert type(inputs) is dict 44 | self.inputs = inputs 45 | 46 | def _create_model(self): 47 | self.encode = self._create_encoder(self.inputs['s_t']) 48 | self.act_embed = self._create_action_embedding(self.inputs['a_t']) 49 | self.decode = self._create_decoder(self.encode, self.act_embed) 50 | 51 | def _create_output(self): 52 | self.output = self.decode 53 | 54 | def _create_loss(self): 55 | lamb = self.loss_args['lamb'] if self.loss_args else 0.0 56 | with tf.variable_scope('loss', reuse=not self.is_train) as scope: 57 | t = self.inputs['x_t_1'] 58 | penalty = tf.reduce_sum(lamb * tf.stack([tf.nn.l2_loss(var) for var in tf.trainable_variables()]), name='regularization') 59 | self.loss = tf.reduce_mean(tf.nn.l2_loss(self.output - t, name='l2') + penalty) 60 | 61 | def _create_optimizer(self): 62 | lr = self.optimizer_args['lr'] if self.optimizer_args else 1e-4 63 | with tf.variable_scope('optimize', reuse=not self.is_train) as scope: 64 | # Setup global_step, optimizer 65 | self.global_step = tf.get_variable('global_step', shape=(), initializer=tf.constant_initializer(0.0), trainable=False) 66 | 67 | self.learning_rate = tf.train.exponential_decay(lr, self.global_step, 1e5, 0.9, staircase=True) 68 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, name='optimizer') 69 | 70 | # According to original paper code, learning rate of bias is 2x of base learning rate 71 | grads_vars = self.optimizer.compute_gradients(self.loss) 72 | bias_pattern = re.compile('.*/b') 73 | grads_vars_mult = [] 74 | for grad, var in grads_vars: 75 | if bias_pattern.match(var.op.name): 76 | grads_vars_mult.append((grad * 2.0, var)) 77 | else: 78 | grads_vars_mult.append((grad, var)) 79 | 80 | # According to original paper, gradient should be clipped with [-0.1, 0.1] 81 | grads_clip = [(tf.clip_by_value(grad, -0.1, 0.1), var) for grad, var in grads_vars_mult] 82 | self.train = self.optimizer.apply_gradients(grads_clip, global_step=self.global_step) 83 | 84 | def 
_create_encoder(self, x): 85 | # x: input image (tensor([batch_size, 84, 84, 12])) 86 | l = Conv2D(x, [6, 6], 64, 2, 'VALID', 'conv1') 87 | l = ReLu(l, 'relu1') 88 | l = Conv2D(l, [6, 6], 64, 2, 'SAME', 'conv2') 89 | l = ReLu(l, 'relu2') 90 | l = Conv2D(l, [6, 6], 64, 2, 'SAME', 'conv3') 91 | l = ReLu(l, 'relu3') 92 | l = FC(l, 1024, 'ip1') 93 | l = ReLu(l, 'relu4') 94 | l = FC(l, 2048, 'enc-factor', initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)) 95 | return l 96 | 97 | def _create_action_embedding(self, act): 98 | # act: action input (tensor([batch_size, num_act])) (one-hot vector) 99 | act = tf.cast(act, tf.float32) 100 | l = FC(act, 2048, 'act-embed', initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1)) 101 | return l 102 | 103 | def _create_decoder(self, encode, act_embed): 104 | # encode: encode layer 105 | # act_embed: action embedding layer 106 | batch_size = tf.shape(encode)[0] 107 | l = tf.multiply(encode, act_embed, name='merge') 108 | l = FC(l, 1024, 'dec') 109 | l = FC(l, 64 * 10 * 10, 'ip4') 110 | l = ReLu(l, 'relu1') 111 | l = tf.reshape(l, [-1, 10, 10, 64], name='dec-reshape') 112 | l = Deconv2D(l, [6, 6], [batch_size, 20, 20, 64], 64, 2, 'SAME', 'deconv3') 113 | l = ReLu(l, 'relu2') 114 | l = Deconv2D(l, [6, 6], [batch_size, 40, 40, 64], 64, 2, 'SAME', 'deconv2') 115 | l = ReLu(l, 'relu3') 116 | l = Deconv2D(l, [6, 6], [batch_size, 84, 84, NUM_CHANNELS], 3, 2, 'VALID', 'x_hat-05') 117 | return l 118 | 119 | def _create_summary(self): 120 | if self.is_train: 121 | tf.summary.scalar("learning_rate", self.learning_rate, collections=['train']) 122 | tf.summary.scalar("loss", self.loss, collections=['train']) 123 | tf.summary.image('x_pred_t_1', tf.cast(self.decode * 255.0, tf.uint8), collections=['train']) 124 | tf.summary.image('x_t_1', tf.cast(self.inputs['x_t_1'] * 255.0, tf.uint8), collections=['train']) 125 | 126 | def restore(self, sess, ckpt, var_scope=None): 127 | # sess: tf session 128 | # ckpt: ckpt path (str) 129 | if var_scope != None: 130 | all_vars = tf.all_variables() 131 | g_vars = [k for k in all_vars if k.name.startswith(var_scope)] 132 | 133 | saver = tf.train.Saver({v.op.name[2:]: v for v in g_vars}) 134 | saver.restore(sess, ckpt) 135 | 136 | def predict(self, sess, s, a): 137 | # sess: tf session 138 | # s: state at t [batch_size, 84, 84, NUM_CHANNELS * NUM_FRAMES] 139 | # a: action at t [batch_size, num_act] 140 | assert s.shape[1:] == (84, 84, NUM_CHANNELS * NUM_FRAMES) 141 | assert len(a.shape) == 2 142 | assert a.shape[1] == self.num_act 143 | 144 | return sess.run([self.output], feed_dict={self.inputs['s_t']: s, 145 | self.inputs['a_t']: a}) 146 | 147 | -------------------------------------------------------------------------------- /baselines/deepq/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 5 | 6 | 7 | class ReplayBuffer(object): 8 | def __init__(self, size): 9 | """Create Prioritized Replay buffer. 10 | 11 | Parameters 12 | ---------- 13 | size: int 14 | Max number of transitions to store in the buffer. When the buffer 15 | overflows the old memories are dropped. 
16 | """ 17 | self._storage = [] 18 | self._maxsize = size 19 | self._next_idx = 0 20 | 21 | def __len__(self): 22 | return len(self._storage) 23 | 24 | def add(self, obs_t, action, reward, obs_tp1, done): 25 | data = (obs_t, action, reward, obs_tp1, done) 26 | 27 | if self._next_idx >= len(self._storage): 28 | self._storage.append(data) 29 | else: 30 | self._storage[self._next_idx] = data 31 | self._next_idx = (self._next_idx + 1) % self._maxsize 32 | 33 | def _encode_sample(self, idxes): 34 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 35 | for i in idxes: 36 | data = self._storage[i] 37 | obs_t, action, reward, obs_tp1, done = data 38 | obses_t.append(np.array(obs_t, copy=False)) 39 | actions.append(np.array(action, copy=False)) 40 | rewards.append(reward) 41 | obses_tp1.append(np.array(obs_tp1, copy=False)) 42 | dones.append(done) 43 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 44 | 45 | def sample(self, batch_size): 46 | """Sample a batch of experiences. 47 | 48 | Parameters 49 | ---------- 50 | batch_size: int 51 | How many transitions to sample. 52 | 53 | Returns 54 | ------- 55 | obs_batch: np.array 56 | batch of observations 57 | act_batch: np.array 58 | batch of actions executed given obs_batch 59 | rew_batch: np.array 60 | rewards received as results of executing act_batch 61 | next_obs_batch: np.array 62 | next set of observations seen after executing act_batch 63 | done_mask: np.array 64 | done_mask[i] = 1 if executing act_batch[i] resulted in 65 | the end of an episode and 0 otherwise. 66 | """ 67 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 68 | return self._encode_sample(idxes) 69 | 70 | 71 | class PrioritizedReplayBuffer(ReplayBuffer): 72 | def __init__(self, size, alpha): 73 | """Create Prioritized Replay buffer. 74 | 75 | Parameters 76 | ---------- 77 | size: int 78 | Max number of transitions to store in the buffer. When the buffer 79 | overflows the old memories are dropped. 80 | alpha: float 81 | how much prioritization is used 82 | (0 - no prioritization, 1 - full prioritization) 83 | 84 | See Also 85 | -------- 86 | ReplayBuffer.__init__ 87 | """ 88 | super(PrioritizedReplayBuffer, self).__init__(size) 89 | assert alpha > 0 90 | self._alpha = alpha 91 | 92 | it_capacity = 1 93 | while it_capacity < size: 94 | it_capacity *= 2 95 | 96 | self._it_sum = SumSegmentTree(it_capacity) 97 | self._it_min = MinSegmentTree(it_capacity) 98 | self._max_priority = 1.0 99 | 100 | def add(self, *args, **kwargs): 101 | """See ReplayBuffer.store_effect""" 102 | idx = self._next_idx 103 | super().add(*args, **kwargs) 104 | self._it_sum[idx] = self._max_priority ** self._alpha 105 | self._it_min[idx] = self._max_priority ** self._alpha 106 | 107 | def _sample_proportional(self, batch_size): 108 | res = [] 109 | for _ in range(batch_size): 110 | # TODO(szymon): should we ensure no repeats? 111 | mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1) 112 | idx = self._it_sum.find_prefixsum_idx(mass) 113 | res.append(idx) 114 | return res 115 | 116 | def sample(self, batch_size, beta): 117 | """Sample a batch of experiences. 118 | 119 | compared to ReplayBuffer.sample 120 | it also returns importance weights and idxes 121 | of sampled experiences. 122 | 123 | 124 | Parameters 125 | ---------- 126 | batch_size: int 127 | How many transitions to sample. 
128 | beta: float 129 | To what degree to use importance weights 130 | (0 - no corrections, 1 - full correction) 131 | 132 | Returns 133 | ------- 134 | obs_batch: np.array 135 | batch of observations 136 | act_batch: np.array 137 | batch of actions executed given obs_batch 138 | rew_batch: np.array 139 | rewards received as results of executing act_batch 140 | next_obs_batch: np.array 141 | next set of observations seen after executing act_batch 142 | done_mask: np.array 143 | done_mask[i] = 1 if executing act_batch[i] resulted in 144 | the end of an episode and 0 otherwise. 145 | weights: np.array 146 | Array of shape (batch_size,) and dtype np.float32 147 | denoting importance weight of each sampled transition 148 | idxes: np.array 149 | Array of shape (batch_size,) and dtype np.int32 150 | idexes in buffer of sampled experiences 151 | """ 152 | assert beta > 0 153 | 154 | idxes = self._sample_proportional(batch_size) 155 | 156 | weights = [] 157 | p_min = self._it_min.min() / self._it_sum.sum() 158 | max_weight = (p_min * len(self._storage)) ** (-beta) 159 | 160 | for idx in idxes: 161 | p_sample = self._it_sum[idx] / self._it_sum.sum() 162 | weight = (p_sample * len(self._storage)) ** (-beta) 163 | weights.append(weight / max_weight) 164 | weights = np.array(weights) 165 | encoded_sample = self._encode_sample(idxes) 166 | return tuple(list(encoded_sample) + [weights, idxes]) 167 | 168 | def update_priorities(self, idxes, priorities): 169 | """Update priorities of sampled transitions. 170 | 171 | sets priority of transition at index idxes[i] in buffer 172 | to priorities[i]. 173 | 174 | Parameters 175 | ---------- 176 | idxes: [int] 177 | List of idxes of sampled transitions 178 | priorities: [float] 179 | List of updated priorities corresponding to 180 | transitions at the sampled idxes denoted by 181 | variable `idxes`. 182 | """ 183 | assert len(idxes) == len(priorities) 184 | for idx, priority in zip(idxes, priorities): 185 | assert priority > 0 186 | assert 0 <= idx < len(self._storage) 187 | self._it_sum[idx] = priority ** self._alpha 188 | self._it_min[idx] = priority ** self._alpha 189 | 190 | self._max_priority = max(self._max_priority, priority) 191 | -------------------------------------------------------------------------------- /baselines/deepq/prediction/tfacvp/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import re 4 | 5 | from .tf_ops import ReLu, Conv2D, FC, Deconv2D 6 | 7 | class ActionConditionalVideoPredictionModel(object): 8 | def __init__(self, num_act, num_channel=3, num_frame=4, inputs=None, 9 | is_train=True, 10 | with_summary=True, 11 | loss_args=None, 12 | optimizer_args=None): 13 | # num_act: number of action in action space (only discrete) 14 | # num_channel: number of channel in one frame 15 | # num_frame: number of frame in one state 16 | # inputs: used to create model inputs (dict) 17 | # is_train: is training phase 18 | # loss_args: loss function arguments (e.g. lamb) 19 | # optimizer_args: optimizer arguments (e.g. optimizer type, learning rate, ...) 
(dict) 20 | self.is_train = is_train 21 | self.num_act = num_act 22 | self.num_channel = num_channel 23 | self.num_frame = num_frame 24 | self.optimizer_args = optimizer_args 25 | self.loss_args = loss_args 26 | self._create_input(inputs) 27 | self._create_model() 28 | self._create_output() 29 | self._create_loss() 30 | 31 | if self.is_train: 32 | self._create_optimizer() 33 | if with_summary: 34 | self._create_summary() 35 | 36 | def _create_input(self, inputs): 37 | # inputs: if None, use tf.placeholder as input 38 | # if not None, expected inputs is a dict 39 | if inputs == None: 40 | self.inputs = {'s_t': tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, (self.num_channel * self.num_frame)]), 41 | 'a_t': tf.placeholder(dtype=tf.int32, shape=[None, self.num_act]), 42 | 'x_t_1': tf.placeholder(dtype=tf.float32, shape=[None, 84, 84, (self.num_channel)])} 43 | else: 44 | assert type(inputs) is dict 45 | self.inputs = inputs 46 | 47 | def _create_model(self): 48 | self.encode = self._create_encoder(self.inputs['s_t']) 49 | self.act_embed = self._create_action_embedding(self.inputs['a_t']) 50 | self.decode = self._create_decoder(self.encode, self.act_embed) 51 | 52 | def _create_output(self): 53 | self.output = self.decode 54 | 55 | def _create_loss(self): 56 | lamb = self.loss_args['lamb'] if self.loss_args else 0.0 57 | with tf.variable_scope('loss', reuse=not self.is_train) as scope: 58 | t = self.inputs['x_t_1'] 59 | penalty = tf.reduce_sum(lamb * tf.stack([tf.nn.l2_loss(var) for var in tf.trainable_variables()]), name='regularization') 60 | self.loss = tf.reduce_mean(tf.nn.l2_loss(self.output - t, name='l2') + penalty) 61 | 62 | def _create_optimizer(self): 63 | lr = self.optimizer_args['lr'] if self.optimizer_args else 1e-4 64 | with tf.variable_scope('optimize', reuse=not self.is_train) as scope: 65 | # Setup global_step, optimizer 66 | self.global_step = tf.get_variable('global_step', shape=(), initializer=tf.constant_initializer(0.0), trainable=False) 67 | 68 | self.learning_rate = tf.train.exponential_decay(lr, self.global_step, 1e5, 0.9, staircase=True) 69 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, name='optimizer') 70 | 71 | # According to original paper code, learning rate of bias is 2x of base learning rate 72 | grads_vars = self.optimizer.compute_gradients(self.loss) 73 | bias_pattern = re.compile('.*/b') 74 | grads_vars_mult = [] 75 | for grad, var in grads_vars: 76 | if bias_pattern.match(var.op.name): 77 | grads_vars_mult.append((grad * 2.0, var)) 78 | else: 79 | grads_vars_mult.append((grad, var)) 80 | 81 | # According to original paper, gradient should be clipped with [-0.1, 0.1] 82 | grads_clip = [(tf.clip_by_value(grad, -0.1, 0.1), var) for grad, var in grads_vars_mult] 83 | self.train = self.optimizer.apply_gradients(grads_clip, global_step=self.global_step) 84 | 85 | def _create_encoder(self, x): 86 | # x: input image (tensor([batch_size, 84, 84, 12])) 87 | l = Conv2D(x, [6, 6], 64, 2, 'VALID', 'conv1') 88 | l = ReLu(l, 'relu1') 89 | l = Conv2D(l, [6, 6], 64, 2, 'SAME', 'conv2') 90 | l = ReLu(l, 'relu2') 91 | l = Conv2D(l, [6, 6], 64, 2, 'SAME', 'conv3') 92 | l = ReLu(l, 'relu3') 93 | l = FC(l, 1024, 'ip1') 94 | l = ReLu(l, 'relu4') 95 | l = FC(l, 2048, 'enc-factor', initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)) 96 | return l 97 | 98 | def _create_action_embedding(self, act): 99 | # act: action input (tensor([batch_size, num_act])) (one-hot vector) 100 | act = tf.cast(act, tf.float32) 101 | l = FC(act, 2048, 
'act-embed', initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1)) 102 | return l 103 | 104 | def _create_decoder(self, encode, act_embed): 105 | # encode: encode layer 106 | # act_embed: action embedding layer 107 | batch_size = tf.shape(encode)[0] 108 | l = tf.multiply(encode, act_embed, name='merge') 109 | l = FC(l, 1024, 'dec') 110 | l = FC(l, 64 * 10 * 10, 'ip4') 111 | l = ReLu(l, 'relu1') 112 | l = tf.reshape(l, [-1, 10, 10, 64], name='dec-reshape') 113 | l = Deconv2D(l, [6, 6], [batch_size, 20, 20, 64], 64, 2, 'SAME', 'deconv3') 114 | l = ReLu(l, 'relu2') 115 | l = Deconv2D(l, [6, 6], [batch_size, 40, 40, 64], 64, 2, 'SAME', 'deconv2') 116 | l = ReLu(l, 'relu3') 117 | l = Deconv2D(l, [6, 6], [batch_size, 84, 84, self.num_channel], self.num_channel, 2, 'VALID', 'x_hat-05') 118 | return l 119 | 120 | def _create_summary(self): 121 | if self.is_train: 122 | tf.summary.scalar("learning_rate", self.learning_rate, collections=['train']) 123 | tf.summary.scalar("loss", self.loss, collections=['train']) 124 | tf.summary.image('x_pred_t_1', tf.cast(self.decode * 255.0, tf.uint8), collections=['train']) 125 | tf.summary.image('x_t_1', tf.cast(self.inputs['x_t_1'] * 255.0, tf.uint8), collections=['train']) 126 | 127 | 128 | def restore(self, sess, ckpt, var_scope=None): 129 | # sess: tf session 130 | # ckpt: ckpt path (str) 131 | if var_scope != None: 132 | all_vars = tf.all_variables() 133 | g_vars = [k for k in all_vars if k.name.startswith(var_scope)] 134 | saver = tf.train.Saver({v.op.name[2:]: v for v in g_vars}) 135 | else: 136 | saver = tf.train.Saver() 137 | 138 | saver.restore(sess, ckpt) 139 | 140 | 141 | def predict(self, sess, s, a): 142 | # sess: tf session 143 | # s: state at t [batch_size, 84, 84, self.num_channel * self.num_frame] 144 | # a: action at t [batch_size, num_act] 145 | assert s.shape[1:] == (84, 84, self.num_channel * self.num_frame) 146 | assert len(a.shape) == 2 147 | assert a.shape[1] == self.num_act 148 | 149 | return sess.run([self.output], feed_dict={self.inputs['s_t']: s, 150 | self.inputs['a_t']: a}) 151 | 152 | -------------------------------------------------------------------------------- /baselines/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | See README.md for a description of the logging API. 4 | 5 | OFF state corresponds to having Logger.CURRENT == Logger.DEFAULT 6 | ON state is otherwise 7 | 8 | """ 9 | 10 | from collections import OrderedDict 11 | import os 12 | import sys 13 | import shutil 14 | import os.path as osp 15 | import json 16 | 17 | LOG_OUTPUT_FORMATS = ['stdout', 'log', 'json'] 18 | 19 | DEBUG = 10 20 | INFO = 20 21 | WARN = 30 22 | ERROR = 40 23 | 24 | DISABLED = 50 25 | 26 | 27 | class OutputFormat(object): 28 | def writekvs(self, kvs): 29 | """ 30 | Write key-value pairs 31 | """ 32 | raise NotImplementedError 33 | 34 | def writeseq(self, args): 35 | """ 36 | Write a sequence of other data (e.g. 
a logging message) 37 | """ 38 | pass 39 | 40 | def close(self): 41 | return 42 | 43 | 44 | class HumanOutputFormat(OutputFormat): 45 | def __init__(self, file): 46 | self.file = file 47 | 48 | def writekvs(self, kvs): 49 | # Create strings for printing 50 | key2str = OrderedDict() 51 | for (key, val) in kvs.items(): 52 | valstr = '%-8.3g' % (val,) if hasattr(val, '__float__') else val 53 | key2str[self._truncate(key)] = self._truncate(valstr) 54 | 55 | # Find max widths 56 | keywidth = max(map(len, key2str.keys())) 57 | valwidth = max(map(len, key2str.values())) 58 | 59 | # Write out the data 60 | dashes = '-' * (keywidth + valwidth + 7) 61 | lines = [dashes] 62 | for (key, val) in key2str.items(): 63 | lines.append('| %s%s | %s%s |' % ( 64 | key, 65 | ' ' * (keywidth - len(key)), 66 | val, 67 | ' ' * (valwidth - len(val)), 68 | )) 69 | lines.append(dashes) 70 | self.file.write('\n'.join(lines) + '\n') 71 | 72 | # Flush the output to the file 73 | self.file.flush() 74 | 75 | def _truncate(self, s): 76 | return s[:20] + '...' if len(s) > 23 else s 77 | 78 | def writeseq(self, args): 79 | for arg in args: 80 | self.file.write(arg) 81 | self.file.write('\n') 82 | self.file.flush() 83 | 84 | 85 | class JSONOutputFormat(OutputFormat): 86 | def __init__(self, file): 87 | self.file = file 88 | 89 | def writekvs(self, kvs): 90 | for k, v in kvs.items(): 91 | if hasattr(v, 'dtype'): 92 | v = v.tolist() 93 | kvs[k] = float(v) 94 | self.file.write(json.dumps(kvs) + '\n') 95 | self.file.flush() 96 | 97 | 98 | def make_output_format(format, ev_dir): 99 | os.makedirs(ev_dir, exist_ok=True) 100 | if format == 'stdout': 101 | return HumanOutputFormat(sys.stdout) 102 | elif format == 'log': 103 | log_file = open(osp.join(ev_dir, 'log.txt'), 'wt') 104 | return HumanOutputFormat(log_file) 105 | elif format == 'json': 106 | json_file = open(osp.join(ev_dir, 'progress.json'), 'wt') 107 | return JSONOutputFormat(json_file) 108 | else: 109 | raise ValueError('Unknown format specified: %s' % (format,)) 110 | 111 | # ================================================================ 112 | # API 113 | # ================================================================ 114 | 115 | 116 | def logkv(key, val): 117 | """ 118 | Log a value of some diagnostic 119 | Call this once for each diagnostic quantity, each iteration 120 | """ 121 | Logger.CURRENT.logkv(key, val) 122 | 123 | 124 | def dumpkvs(): 125 | """ 126 | Write all of the diagnostics from the current iteration 127 | 128 | level: int. (see logger.py docs) If the global logger level is higher than 129 | the level argument here, don't print to stdout. 130 | """ 131 | Logger.CURRENT.dumpkvs() 132 | 133 | 134 | # for backwards compatibility 135 | record_tabular = logkv 136 | dump_tabular = dumpkvs 137 | 138 | 139 | def log(*args, level=INFO): 140 | """ 141 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 142 | """ 143 | Logger.CURRENT.log(*args, level=level) 144 | 145 | 146 | def debug(*args): 147 | log(*args, level=DEBUG) 148 | 149 | 150 | def info(*args): 151 | log(*args, level=INFO) 152 | 153 | 154 | def warn(*args): 155 | log(*args, level=WARN) 156 | 157 | 158 | def error(*args): 159 | log(*args, level=ERROR) 160 | 161 | 162 | def set_level(level): 163 | """ 164 | Set logging threshold on current logger. 165 | """ 166 | Logger.CURRENT.set_level(level) 167 | 168 | 169 | def get_dir(): 170 | """ 171 | Get directory that log files are being written to. 
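# --- Editor's note: illustrative sketch, not part of the original logger.py ---
# The human-readable format above boils down to "pad keys and values to a
# common width and draw a box". A standalone equivalent of
# HumanOutputFormat.writekvs (render_kvs is a made-up name) could look like:
def render_kvs(kvs):
    rows = [(str(k), '%-8.3g' % v if hasattr(v, '__float__') else str(v))
            for k, v in kvs.items()]
    keywidth = max(len(k) for k, _ in rows)
    valwidth = max(len(v) for _, v in rows)
    dashes = '-' * (keywidth + valwidth + 7)
    lines = [dashes]
    for k, v in rows:
        lines.append('| %s%s | %s%s |' % (k, ' ' * (keywidth - len(k)),
                                          v, ' ' * (valwidth - len(v))))
    lines.append(dashes)
    return '\n'.join(lines)

# print(render_kvs({'steps': 1000, 'mean reward': 17.5})) draws the same kind of
# boxed table that dumpkvs() sends to stdout.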
172 | will be None if there is no output directory (i.e., if you didn't call start) 173 | """ 174 | return Logger.CURRENT.get_dir() 175 | 176 | 177 | def get_expt_dir(): 178 | sys.stderr.write("get_expt_dir() is Deprecated. Switch to get_dir() [%s]\n" % (get_dir(),)) 179 | return get_dir() 180 | 181 | 182 | # ================================================================ 183 | # Backend 184 | # ================================================================ 185 | 186 | 187 | class Logger(object): 188 | DEFAULT = None # A logger with no output files. (See right below class definition) 189 | # So that you can still log to the terminal without setting up any output files 190 | CURRENT = None # Current logger being used by the free functions above 191 | 192 | def __init__(self, dir, output_formats): 193 | self.name2val = OrderedDict() # values this iteration 194 | self.level = INFO 195 | self.dir = dir 196 | self.output_formats = output_formats 197 | 198 | # Logging API, forwarded 199 | # ---------------------------------------- 200 | def logkv(self, key, val): 201 | self.name2val[key] = val 202 | 203 | def dumpkvs(self): 204 | for fmt in self.output_formats: 205 | fmt.writekvs(self.name2val) 206 | self.name2val.clear() 207 | 208 | def log(self, *args, level=INFO): 209 | if self.level <= level: 210 | self._do_log(args) 211 | 212 | # Configuration 213 | # ---------------------------------------- 214 | def set_level(self, level): 215 | self.level = level 216 | 217 | def get_dir(self): 218 | return self.dir 219 | 220 | def close(self): 221 | for fmt in self.output_formats: 222 | fmt.close() 223 | 224 | # Misc 225 | # ---------------------------------------- 226 | def _do_log(self, args): 227 | for fmt in self.output_formats: 228 | fmt.writeseq(args) 229 | 230 | 231 | # ================================================================ 232 | 233 | Logger.DEFAULT = Logger(output_formats=[HumanOutputFormat(sys.stdout)], dir=None) 234 | Logger.CURRENT = Logger.DEFAULT 235 | 236 | 237 | class session(object): 238 | """ 239 | Context manager that sets up the loggers for an experiment. 
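# --- Editor's note: illustrative sketch, not part of the original logger.py ---
# Each Logger.dumpkvs() call above hands the buffered dict to every configured
# format. For JSONOutputFormat that means one JSON object per line in
# progress.json, with numpy scalars converted to plain floats first:
import json
import numpy as np

row = {"steps": np.int64(1000), "mean_reward": np.float32(17.5)}
for k, v in list(row.items()):
    if hasattr(v, "dtype"):            # numpy scalar -> python float, as writekvs does
        row[k] = float(v.tolist())
line = json.dumps(row)                 # '{"steps": 1000.0, "mean_reward": 17.5}'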
240 | """ 241 | 242 | CURRENT = None # Set to a LoggerContext object using enter/exit or context manager 243 | 244 | def __init__(self, dir, format_strs=None): 245 | self.dir = dir 246 | if format_strs is None: 247 | format_strs = LOG_OUTPUT_FORMATS 248 | output_formats = [make_output_format(f, dir) for f in format_strs] 249 | Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) 250 | 251 | def __enter__(self): 252 | os.makedirs(self.evaluation_dir(), exist_ok=True) 253 | output_formats = [make_output_format(f, self.evaluation_dir()) for f in LOG_OUTPUT_FORMATS] 254 | Logger.CURRENT = Logger(dir=self.dir, output_formats=output_formats) 255 | 256 | def __exit__(self, *args): 257 | Logger.CURRENT.close() 258 | Logger.CURRENT = Logger.DEFAULT 259 | 260 | def evaluation_dir(self): 261 | return self.dir 262 | 263 | 264 | # ================================================================ 265 | 266 | 267 | def _demo(): 268 | info("hi") 269 | debug("shouldn't appear") 270 | set_level(DEBUG) 271 | debug("should appear") 272 | dir = "/tmp/testlogging" 273 | if os.path.exists(dir): 274 | shutil.rmtree(dir) 275 | with session(dir=dir): 276 | record_tabular("a", 3) 277 | record_tabular("b", 2.5) 278 | dump_tabular() 279 | record_tabular("b", -2.5) 280 | record_tabular("a", 5.5) 281 | dump_tabular() 282 | info("^^^ should see a = 5.5") 283 | 284 | record_tabular("b", -2.5) 285 | dump_tabular() 286 | 287 | record_tabular("a", "longasslongasslongasslongasslongasslongassvalue") 288 | dump_tabular() 289 | 290 | 291 | if __name__ == "__main__": 292 | _demo() 293 | -------------------------------------------------------------------------------- /baselines/common/atari_wrappers_deprecated.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gym 3 | import numpy as np 4 | 5 | from collections import deque 6 | from gym import spaces 7 | 8 | 9 | class NoopResetEnv(gym.Wrapper): 10 | def __init__(self, env=None, noop_max=30): 11 | """Sample initial states by taking random number of no-ops on reset. 12 | No-op is assumed to be action 0. 13 | """ 14 | super(NoopResetEnv, self).__init__(env) 15 | self.noop_max = noop_max 16 | self.override_num_noops = None 17 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 18 | 19 | def _reset(self): 20 | """ Do no-op action for a number of steps in [1, noop_max].""" 21 | self.env.reset() 22 | if self.override_num_noops is not None: 23 | noops = self.override_num_noops 24 | else: 25 | noops = np.random.randint(1, self.noop_max + 1) 26 | assert noops > 0 27 | obs = None 28 | for _ in range(noops): 29 | obs, _, done, _ = self.env.step(0) 30 | if done: 31 | obs = self.env.reset() 32 | return obs 33 | 34 | 35 | class FireResetEnv(gym.Wrapper): 36 | def __init__(self, env=None): 37 | """For environments where the user need to press FIRE for the game to start.""" 38 | super(FireResetEnv, self).__init__(env) 39 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 40 | assert len(env.unwrapped.get_action_meanings()) >= 3 41 | 42 | def _reset(self): 43 | self.env.reset() 44 | obs, _, done, _ = self.env.step(1) 45 | if done: 46 | self.env.reset() 47 | obs, _, done, _ = self.env.step(2) 48 | if done: 49 | self.env.reset() 50 | return obs 51 | 52 | 53 | class EpisodicLifeEnv(gym.Wrapper): 54 | def __init__(self, env=None): 55 | """Make end-of-life == end-of-episode, but only reset on true game over. 56 | Done by DeepMind for the DQN and co. since it helps value estimation. 
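# --- Editor's note: illustrative sketch, not part of the original file ---
# The wrappers in this module use the old gym API (reset() returns obs,
# step() returns a 4-tuple) and override _reset/_step, so this snippet
# assumes a pre-0.26 gym with the Atari ROMs installed.
import gym
from baselines.common.atari_wrappers_deprecated import NoopResetEnv, FireResetEnv

def make_breakout_with_reset_wrappers():
    env = gym.make("BreakoutNoFrameskip-v4")
    env = NoopResetEnv(env, noop_max=30)   # start each episode after 1..30 no-ops
    env = FireResetEnv(env)                # Breakout needs FIRE to launch the ball
    return env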
57 | """ 58 | super(EpisodicLifeEnv, self).__init__(env) 59 | self.lives = 0 60 | self.was_real_done = True 61 | self.was_real_reset = False 62 | 63 | def _step(self, action): 64 | obs, reward, done, info = self.env.step(action) 65 | self.was_real_done = done 66 | # check current lives, make loss of life terminal, 67 | # then update lives to handle bonus lives 68 | lives = self.env.unwrapped.ale.lives() 69 | if lives < self.lives and lives > 0: 70 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 71 | # so its important to keep lives > 0, so that we only reset once 72 | # the environment advertises done. 73 | done = True 74 | self.lives = lives 75 | return obs, reward, done, info 76 | 77 | def _reset(self): 78 | """Reset only when lives are exhausted. 79 | This way all states are still reachable even though lives are episodic, 80 | and the learner need not know about any of this behind-the-scenes. 81 | """ 82 | if self.was_real_done: 83 | obs = self.env.reset() 84 | self.was_real_reset = True 85 | else: 86 | # no-op step to advance from terminal/lost life state 87 | obs, _, _, _ = self.env.step(0) 88 | self.was_real_reset = False 89 | self.lives = self.env.unwrapped.ale.lives() 90 | return obs 91 | 92 | 93 | class MaxAndSkipEnv(gym.Wrapper): 94 | def __init__(self, env=None, skip=4): 95 | """Return only every `skip`-th frame""" 96 | super(MaxAndSkipEnv, self).__init__(env) 97 | # most recent raw observations (for max pooling across time steps) 98 | self._obs_buffer = deque(maxlen=2) 99 | self._skip = skip 100 | 101 | def _step(self, action): 102 | total_reward = 0.0 103 | done = None 104 | for _ in range(self._skip): 105 | obs, reward, done, info = self.env.step(action) 106 | self._obs_buffer.append(obs) 107 | total_reward += reward 108 | if done: 109 | break 110 | 111 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 112 | 113 | return max_frame, total_reward, done, info 114 | 115 | def _reset(self): 116 | """Clear past frame buffer and init. to first obs. from inner env.""" 117 | self._obs_buffer.clear() 118 | obs = self.env.reset() 119 | self._obs_buffer.append(obs) 120 | return obs 121 | 122 | 123 | class ProcessFrame84(gym.ObservationWrapper): 124 | def __init__(self, env=None): 125 | super(ProcessFrame84, self).__init__(env) 126 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 127 | 128 | def _observation(self, obs): 129 | return ProcessFrame84.process(obs) 130 | 131 | @staticmethod 132 | def process(frame): 133 | resized_screen = None 134 | if frame.size == 210 * 160 * 3: 135 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 136 | elif frame.size == 250 * 160 * 3: 137 | img = np.reshape(frame, [250, 160, 3]).astype(np.float32) 138 | else: 139 | assert False, "Unknown resolution." 140 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 141 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) 142 | x_t = resized_screen[18:102, :] 143 | x_t = np.reshape(x_t, [84, 84, 1]) 144 | return x_t.astype(np.uint8) 145 | 146 | 147 | class ClippedRewardsWrapper(gym.RewardWrapper): 148 | def _reward(self, reward): 149 | """Change all the positive rewards to 1, negative to -1 and keep zero.""" 150 | return np.sign(reward) 151 | 152 | 153 | class LazyFrames(object): 154 | def __init__(self, frames): 155 | """This object ensures that common frames between the observations are only stored once. 
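# --- Editor's note: illustrative sketch, not part of the original file ---
# ProcessFrame84.process() above is a plain static method, so the 210x160 RGB
# to 84x84 grayscale preprocessing can be exercised on a synthetic frame
# without creating an environment:
import numpy as np
from baselines.common.atari_wrappers_deprecated import ProcessFrame84

frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)
obs = ProcessFrame84.process(frame)        # luminance -> resize to 84x110 -> crop
assert obs.shape == (84, 84, 1) and obs.dtype == np.uint8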
156 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 157 | buffers. 158 | 159 | This object should only be converted to numpy array before being passed to the model. 160 | 161 | You'd not belive how complex the previous solution was.""" 162 | self._frames = frames 163 | 164 | def __array__(self, dtype=None): 165 | out = np.concatenate(self._frames, axis=2) 166 | if dtype is not None: 167 | out = out.astype(dtype) 168 | return out 169 | 170 | 171 | class FrameStack(gym.Wrapper): 172 | def __init__(self, env, k): 173 | """Stack k last frames. 174 | 175 | Returns lazy array, which is much more memory efficient. 176 | 177 | See Also 178 | -------- 179 | baselines.common.atari_wrappers.LazyFrames 180 | """ 181 | gym.Wrapper.__init__(self, env) 182 | self.k = k 183 | self.frames = deque([], maxlen=k) 184 | shp = env.observation_space.shape 185 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k)) 186 | 187 | def _reset(self): 188 | ob = self.env.reset() 189 | for _ in range(self.k): 190 | self.frames.append(ob) 191 | return self._get_ob() 192 | 193 | def _step(self, action): 194 | ob, reward, done, info = self.env.step(action) 195 | self.frames.append(ob) 196 | return self._get_ob(), reward, done, info 197 | 198 | def _get_ob(self): 199 | assert len(self.frames) == self.k 200 | return LazyFrames(list(self.frames)) 201 | 202 | 203 | class ScaledFloatFrame(gym.ObservationWrapper): 204 | def _observation(self, obs): 205 | # careful! This undoes the memory optimization, use 206 | # with smaller replay buffers only. 207 | return np.array(obs).astype(np.float32) / 255.0 208 | 209 | 210 | def wrap_dqn(env): 211 | """Apply a common set of wrappers for Atari games.""" 212 | assert 'NoFrameskip' in env.spec.id 213 | env = EpisodicLifeEnv(env) 214 | env = NoopResetEnv(env, noop_max=30) 215 | env = MaxAndSkipEnv(env, skip=4) 216 | if 'FIRE' in env.unwrapped.get_action_meanings(): 217 | env = FireResetEnv(env) 218 | env = ProcessFrame84(env) 219 | env = FrameStack(env, 4) 220 | env = ClippedRewardsWrapper(env) 221 | return env 222 | 223 | 224 | class A2cProcessFrame(gym.Wrapper): 225 | def __init__(self, env): 226 | gym.Wrapper.__init__(self, env) 227 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 228 | 229 | def _step(self, action): 230 | ob, reward, done, info = self.env.step(action) 231 | return A2cProcessFrame.process(ob), reward, done, info 232 | 233 | def _reset(self): 234 | return A2cProcessFrame.process(self.env.reset()) 235 | 236 | @staticmethod 237 | def process(frame): 238 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 239 | frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA) 240 | return frame.reshape(84, 84, 1) 241 | -------------------------------------------------------------------------------- /baselines/common/misc_util.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import os 4 | import pickle 5 | import random 6 | import tempfile 7 | import time 8 | import zipfile 9 | 10 | 11 | def zipsame(*seqs): 12 | L = len(seqs[0]) 13 | assert all(len(seq) == L for seq in seqs[1:]) 14 | return zip(*seqs) 15 | 16 | 17 | def unpack(seq, sizes): 18 | """ 19 | Unpack 'seq' into a sequence of lists, with lengths specified by 'sizes'. 
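# --- Editor's note: illustrative sketch, not part of the original file ---
# Tying together the wrappers from atari_wrappers_deprecated.py above: after
# wrap_dqn() the observations are LazyFrames holding four stacked 84x84x1
# frames, so converting one to an array yields shape (84, 84, 4). Assumes a
# pre-0.26 gym with Atari support.
import gym
import numpy as np
from baselines.common.atari_wrappers_deprecated import wrap_dqn

env = wrap_dqn(gym.make("PongNoFrameskip-v4"))
obs = env.reset()
assert np.array(obs).shape == (84, 84, 4)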
20 | None = just one bare element, not a list 21 | 22 | Example: 23 | unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6]) 24 | """ 25 | seq = list(seq) 26 | it = iter(seq) 27 | assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes) 28 | for size in sizes: 29 | if size is None: 30 | yield it.__next__() 31 | else: 32 | li = [] 33 | for _ in range(size): 34 | li.append(it.__next__()) 35 | yield li 36 | 37 | 38 | class EzPickle(object): 39 | """Objects that are pickled and unpickled via their constructor 40 | arguments. 41 | 42 | Example usage: 43 | 44 | class Dog(Animal, EzPickle): 45 | def __init__(self, furcolor, tailkind="bushy"): 46 | Animal.__init__() 47 | EzPickle.__init__(furcolor, tailkind) 48 | ... 49 | 50 | When this object is unpickled, a new Dog will be constructed by passing the provided 51 | furcolor and tailkind into the constructor. However, philosophers are still not sure 52 | whether it is still the same dog. 53 | 54 | This is generally needed only for environments which wrap C/C++ code, such as MuJoCo 55 | and Atari. 56 | """ 57 | 58 | def __init__(self, *args, **kwargs): 59 | self._ezpickle_args = args 60 | self._ezpickle_kwargs = kwargs 61 | 62 | def __getstate__(self): 63 | return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} 64 | 65 | def __setstate__(self, d): 66 | out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) 67 | self.__dict__.update(out.__dict__) 68 | 69 | 70 | def set_global_seeds(i): 71 | try: 72 | import tensorflow as tf 73 | except ImportError: 74 | pass 75 | else: 76 | tf.set_random_seed(i) 77 | np.random.seed(i) 78 | random.seed(i) 79 | 80 | 81 | def pretty_eta(seconds_left): 82 | """Print the number of seconds in human readable format. 83 | 84 | Examples: 85 | 2 days 86 | 2 hours and 37 minutes 87 | less than a minute 88 | 89 | Paramters 90 | --------- 91 | seconds_left: int 92 | Number of seconds to be converted to the ETA 93 | Returns 94 | ------- 95 | eta: str 96 | String representing the pretty ETA. 97 | """ 98 | minutes_left = seconds_left // 60 99 | seconds_left %= 60 100 | hours_left = minutes_left // 60 101 | minutes_left %= 60 102 | days_left = hours_left // 24 103 | hours_left %= 24 104 | 105 | def helper(cnt, name): 106 | return "{} {}{}".format(str(cnt), name, ('s' if cnt > 1 else '')) 107 | 108 | if days_left > 0: 109 | msg = helper(days_left, 'day') 110 | if hours_left > 0: 111 | msg += ' and ' + helper(hours_left, 'hour') 112 | return msg 113 | if hours_left > 0: 114 | msg = helper(hours_left, 'hour') 115 | if minutes_left > 0: 116 | msg += ' and ' + helper(minutes_left, 'minute') 117 | return msg 118 | if minutes_left > 0: 119 | return helper(minutes_left, 'minute') 120 | return 'less than a minute' 121 | 122 | 123 | class RunningAvg(object): 124 | def __init__(self, gamma, init_value=None): 125 | """Keep a running estimate of a quantity. This is a bit like mean 126 | but more sensitive to recent changes. 127 | 128 | Parameters 129 | ---------- 130 | gamma: float 131 | Must be between 0 and 1, where 0 is the most sensitive to recent 132 | changes. 133 | init_value: float or None 134 | Initial value of the estimate. If None, it will be set on the first update. 135 | """ 136 | self._value = init_value 137 | self._gamma = gamma 138 | 139 | def update(self, new_val): 140 | """Update the estimate. 141 | 142 | Parameters 143 | ---------- 144 | new_val: float 145 | new observated value of estimated quantity. 
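# --- Editor's note: illustrative sketch, not part of the original file ---
# RunningAvg is an exponential moving average: with gamma=0.9 each update
# keeps 90% of the old estimate and mixes in 10% of the new value.
from baselines.common.misc_util import RunningAvg

avg = RunningAvg(gamma=0.9, init_value=0.0)
avg.update(10.0)                      # 0.9 * 0.0 + 0.1 * 10.0
assert abs(float(avg) - 1.0) < 1e-8
avg.update(10.0)                      # 0.9 * 1.0 + 0.1 * 10.0
assert abs(float(avg) - 1.9) < 1e-8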
146 | """ 147 | if self._value is None: 148 | self._value = new_val 149 | else: 150 | self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val 151 | 152 | def __float__(self): 153 | """Get the current estimate""" 154 | return self._value 155 | 156 | 157 | class SimpleMonitor(gym.Wrapper): 158 | def __init__(self, env=None): 159 | """Adds two qunatities to info returned by every step: 160 | 161 | num_steps: int 162 | Number of steps takes so far 163 | rewards: [float] 164 | All the cumulative rewards for the episodes completed so far. 165 | """ 166 | super().__init__(env) 167 | # current episode state 168 | self._current_reward = None 169 | self._num_steps = None 170 | # temporary monitor state that we do not save 171 | self._time_offset = None 172 | self._total_steps = None 173 | # monitor state 174 | self._episode_rewards = [] 175 | self._episode_lengths = [] 176 | self._episode_end_times = [] 177 | 178 | def _reset(self): 179 | obs = self.env.reset() 180 | # recompute temporary state if needed 181 | if self._time_offset is None: 182 | self._time_offset = time.time() 183 | if len(self._episode_end_times) > 0: 184 | self._time_offset -= self._episode_end_times[-1] 185 | if self._total_steps is None: 186 | self._total_steps = sum(self._episode_lengths) 187 | # update monitor state 188 | if self._current_reward is not None: 189 | self._episode_rewards.append(self._current_reward) 190 | self._episode_lengths.append(self._num_steps) 191 | self._episode_end_times.append(time.time() - self._time_offset) 192 | # reset episode state 193 | self._current_reward = 0 194 | self._num_steps = 0 195 | 196 | return obs 197 | 198 | def _step(self, action): 199 | obs, rew, done, info = self.env.step(action) 200 | self._current_reward += rew 201 | self._num_steps += 1 202 | self._total_steps += 1 203 | info['steps'] = self._total_steps 204 | info['rewards'] = self._episode_rewards 205 | return (obs, rew, done, info) 206 | 207 | def get_state(self): 208 | return { 209 | 'env_id': self.env.unwrapped.spec.id, 210 | 'episode_data': { 211 | 'episode_rewards': self._episode_rewards, 212 | 'episode_lengths': self._episode_lengths, 213 | 'episode_end_times': self._episode_end_times, 214 | 'initial_reset_time': 0, 215 | } 216 | } 217 | 218 | def set_state(self, state): 219 | assert state['env_id'] == self.env.unwrapped.spec.id 220 | ed = state['episode_data'] 221 | self._episode_rewards = ed['episode_rewards'] 222 | self._episode_lengths = ed['episode_lengths'] 223 | self._episode_end_times = ed['episode_end_times'] 224 | 225 | 226 | def boolean_flag(parser, name, default=False, help=None): 227 | """Add a boolean flag to argparse parser. 
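# --- Editor's note: illustrative sketch, not part of the original file ---
# boolean_flag() below registers a --name / --no-name pair on an argparse
# parser; the last flag given on the command line wins.
import argparse
from baselines.common.misc_util import boolean_flag

parser = argparse.ArgumentParser()
boolean_flag(parser, "dueling", default=False, help="use the dueling head")
assert parser.parse_args([]).dueling is False
assert parser.parse_args(["--dueling"]).dueling is True
assert parser.parse_args(["--no-dueling"]).dueling is False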
228 | 229 | Parameters 230 | ---------- 231 | parser: argparse.Parser 232 | parser to add the flag to 233 | name: str 234 | -- will enable the flag, while --no- will disable it 235 | default: bool or None 236 | default value of the flag 237 | help: str 238 | help string for the flag 239 | """ 240 | parser.add_argument("--" + name, action="store_true", default=default, help=help) 241 | parser.add_argument("--no-" + name, action="store_false", dest=name) 242 | 243 | 244 | def get_wrapper_by_name(env, classname): 245 | """Given an a gym environment possibly wrapped multiple times, returns a wrapper 246 | of class named classname or raises ValueError if no such wrapper was applied 247 | 248 | Parameters 249 | ---------- 250 | env: gym.Env of gym.Wrapper 251 | gym environment 252 | classname: str 253 | name of the wrapper 254 | 255 | Returns 256 | ------- 257 | wrapper: gym.Wrapper 258 | wrapper named classname 259 | """ 260 | currentenv = env 261 | while True: 262 | if classname == currentenv.class_name(): 263 | return currentenv 264 | elif isinstance(currentenv, gym.Wrapper): 265 | currentenv = currentenv.env 266 | else: 267 | raise ValueError("Couldn't find wrapper named %s" % classname) 268 | 269 | 270 | def relatively_safe_pickle_dump(obj, path, compression=False): 271 | """This is just like regular pickle dump, except from the fact that failure cases are 272 | different: 273 | 274 | - It's never possible that we end up with a pickle in corrupted state. 275 | - If a there was a different file at the path, that file will remain unchanged in the 276 | even of failure (provided that filesystem rename is atomic). 277 | - it is sometimes possible that we end up with useless temp file which needs to be 278 | deleted manually (it will be removed automatically on the next function call) 279 | 280 | The indended use case is periodic checkpoints of experiment state, such that we never 281 | corrupt previous checkpoints if the current one fails. 282 | 283 | Parameters 284 | ---------- 285 | obj: object 286 | object to pickle 287 | path: str 288 | path to the output file 289 | compression: bool 290 | if true pickle will be compressed 291 | """ 292 | temp_storage = path + ".relatively_safe" 293 | if compression: 294 | # Using gzip here would be simpler, but the size is limited to 2GB 295 | with tempfile.NamedTemporaryFile() as uncompressed_file: 296 | pickle.dump(obj, uncompressed_file) 297 | with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip: 298 | myzip.write(uncompressed_file.name, "data") 299 | else: 300 | with open(temp_storage, "wb") as f: 301 | pickle.dump(obj, f) 302 | os.rename(temp_storage, path) 303 | 304 | 305 | def pickle_load(path, compression=False): 306 | """Unpickle a possible compressed pickle. 307 | 308 | Parameters 309 | ---------- 310 | path: str 311 | path to the output file 312 | compression: bool 313 | if true assumes that pickle was compressed when created and attempts decompression. 
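# --- Editor's note: illustrative sketch, not part of the original file ---
# Round-tripping a checkpoint through the helpers above: the dump writes to a
# temporary file first and then os.rename()s it into place, so a crash never
# leaves a half-written pickle at `path`. pickle_load (just below) reads it
# back.
import os
import tempfile
from baselines.common.misc_util import relatively_safe_pickle_dump, pickle_load

state = {"num_iters": 1000, "episode_rewards": [1.0, 2.0, 3.0]}
path = os.path.join(tempfile.mkdtemp(), "training_state.pkl")
relatively_safe_pickle_dump(state, path)
assert pickle_load(path) == state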
314 | 315 | Returns 316 | ------- 317 | obj: object 318 | the unpickled object 319 | """ 320 | 321 | if compression: 322 | with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: 323 | with myzip.open("data") as f: 324 | return pickle.load(f) 325 | else: 326 | with open(path, "rb") as f: 327 | return pickle.load(f) 328 | -------------------------------------------------------------------------------- /baselines/deepq/simple.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import dill 4 | import tempfile 5 | import tensorflow as tf 6 | import zipfile 7 | 8 | import baselines.common.tf_util as U 9 | 10 | from baselines import logger 11 | from baselines.common.schedules import LinearSchedule 12 | from baselines import deepq 13 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer 14 | 15 | 16 | class ActWrapper(object): 17 | def __init__(self, act, act_params): 18 | self._act = act 19 | self._act_params = act_params 20 | 21 | @staticmethod 22 | def load(path, num_cpu=16): 23 | with open(path, "rb") as f: 24 | model_data, act_params = dill.load(f) 25 | act = deepq.build_act(**act_params) 26 | sess = U.make_session(num_cpu=num_cpu) 27 | sess.__enter__() 28 | with tempfile.TemporaryDirectory() as td: 29 | arc_path = os.path.join(td, "packed.zip") 30 | with open(arc_path, "wb") as f: 31 | f.write(model_data) 32 | 33 | zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) 34 | U.load_state(os.path.join(td, "model")) 35 | 36 | return ActWrapper(act, act_params) 37 | 38 | def __call__(self, *args, **kwargs): 39 | return self._act(*args, **kwargs) 40 | 41 | def save(self, path): 42 | """Save model to a pickle located at `path`""" 43 | with tempfile.TemporaryDirectory() as td: 44 | U.save_state(os.path.join(td, "model")) 45 | arc_name = os.path.join(td, "packed.zip") 46 | with zipfile.ZipFile(arc_name, 'w') as zipf: 47 | for root, dirs, files in os.walk(td): 48 | for fname in files: 49 | file_path = os.path.join(root, fname) 50 | if file_path != arc_name: 51 | zipf.write(file_path, os.path.relpath(file_path, td)) 52 | with open(arc_name, "rb") as f: 53 | model_data = f.read() 54 | with open(path, "wb") as f: 55 | dill.dump((model_data, self._act_params), f) 56 | 57 | 58 | def load(path, num_cpu=16): 59 | """Load act function that was returned by learn function. 60 | 61 | Parameters 62 | ---------- 63 | path: str 64 | path to the act function pickle 65 | num_cpu: int 66 | number of cpus to use for executing the policy 67 | 68 | Returns 69 | ------- 70 | act: ActWrapper 71 | function that takes a batch of observations 72 | and returns actions. 73 | """ 74 | return ActWrapper.load(path, num_cpu=num_cpu) 75 | 76 | 77 | def learn(env, 78 | q_func, 79 | lr=5e-4, 80 | max_timesteps=100000, 81 | buffer_size=50000, 82 | exploration_fraction=0.1, 83 | exploration_final_eps=0.02, 84 | train_freq=1, 85 | batch_size=32, 86 | print_freq=1, 87 | checkpoint_freq=10000, 88 | learning_starts=1000, 89 | gamma=1.0, 90 | target_network_update_freq=500, 91 | prioritized_replay=False, 92 | prioritized_replay_alpha=0.6, 93 | prioritized_replay_beta0=0.4, 94 | prioritized_replay_beta_iters=None, 95 | prioritized_replay_eps=1e-6, 96 | num_cpu=16, 97 | callback=None): 98 | """Train a deepq model. 
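# --- Editor's note: illustrative sketch, not part of the original file ---
# The replay buffers imported above: PrioritizedReplayBuffer.sample() returns
# the usual transition batch plus importance weights and the sampled indices,
# and update_priorities() feeds |td_error| + eps back in, which is the cycle
# learn() runs below. The transitions here are dummy 4-dimensional states.
import numpy as np
from baselines.deepq.replay_buffer import PrioritizedReplayBuffer

buffer = PrioritizedReplayBuffer(1000, alpha=0.6)
for i in range(100):
    s = np.full(4, float(i), dtype=np.float32)
    buffer.add(s, 0, float(i % 2), s + 1.0, 0.0)

obs_t, actions, rewards, obs_tp1, dones, weights, idxes = buffer.sample(32, beta=0.4)
td_errors = np.random.randn(32)
buffer.update_priorities(idxes, np.abs(td_errors) + 1e-6)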
99 | 100 | Parameters 101 | ------- 102 | env : gym.Env 103 | environment to train on 104 | q_func: (tf.Variable, int, str, bool) -> tf.Variable 105 | the model that takes the following inputs: 106 | observation_in: object 107 | the output of observation placeholder 108 | num_actions: int 109 | number of actions 110 | scope: str 111 | reuse: bool 112 | should be passed to outer variable scope 113 | and returns a tensor of shape (batch_size, num_actions) with values of every action. 114 | lr: float 115 | learning rate for adam optimizer 116 | max_timesteps: int 117 | number of env steps to optimizer for 118 | buffer_size: int 119 | size of the replay buffer 120 | exploration_fraction: float 121 | fraction of entire training period over which the exploration rate is annealed 122 | exploration_final_eps: float 123 | final value of random action probability 124 | train_freq: int 125 | update the model every `train_freq` steps. 126 | batch_size: int 127 | size of a batched sampled from replay buffer for training 128 | print_freq: int 129 | how often to print out training progress 130 | set to None to disable printing 131 | checkpoint_freq: int 132 | how often to save the model. This is so that the best version is restored 133 | at the end of the training. If you do not wish to restore the best version at 134 | the end of the training set this variable to None. 135 | learning_starts: int 136 | how many steps of the model to collect transitions for before learning starts 137 | gamma: float 138 | discount factor 139 | target_network_update_freq: int 140 | update the target network every `target_network_update_freq` steps. 141 | prioritized_replay: True 142 | if True prioritized replay buffer will be used. 143 | prioritized_replay_alpha: float 144 | alpha parameter for prioritized replay buffer 145 | prioritized_replay_beta0: float 146 | initial value of beta for prioritized replay buffer 147 | prioritized_replay_beta_iters: int 148 | number of iterations over which beta will be annealed from initial value 149 | to 1.0. If set to None equals to max_timesteps. 150 | prioritized_replay_eps: float 151 | epsilon to add to the TD errors when updating priorities. 152 | num_cpu: int 153 | number of cpus to use for training 154 | callback: (locals, globals) -> None 155 | function called at every steps with state of the algorithm. 156 | If callback returns true training stops. 157 | 158 | Returns 159 | ------- 160 | act: ActWrapper 161 | Wrapper over act function. Adds ability to save it and load it. 162 | See header of baselines/deepq/categorical.py for details on the act function. 
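# --- Editor's note: illustrative sketch, not part of the original file ---
# The two annealing schedules learn() builds from these arguments: epsilon is
# annealed over the first exploration_fraction of training and then held at
# exploration_final_eps, while beta for prioritized replay is annealed to 1.0
# over the whole run.
from baselines.common.schedules import LinearSchedule

max_timesteps = 100000
exploration = LinearSchedule(schedule_timesteps=int(0.1 * max_timesteps),
                             initial_p=1.0, final_p=0.02)
beta_schedule = LinearSchedule(max_timesteps, initial_p=0.4, final_p=1.0)

assert exploration.value(0) == 1.0
assert abs(exploration.value(5000) - 0.51) < 1e-9
assert abs(exploration.value(50000) - 0.02) < 1e-9   # clamped after the first 10% of steps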
163 | """ 164 | # Create all the functions necessary to train the model 165 | 166 | sess = U.make_session(num_cpu=num_cpu) 167 | sess.__enter__() 168 | 169 | def make_obs_ph(name): 170 | return U.BatchInput(env.observation_space.shape, name=name) 171 | 172 | act, train, update_target, debug = deepq.build_train( 173 | make_obs_ph=make_obs_ph, 174 | q_func=q_func, 175 | num_actions=env.action_space.n, 176 | optimizer=tf.train.AdamOptimizer(learning_rate=lr), 177 | gamma=gamma, 178 | grad_norm_clipping=10 179 | ) 180 | act_params = { 181 | 'make_obs_ph': make_obs_ph, 182 | 'q_func': q_func, 183 | 'num_actions': env.action_space.n, 184 | } 185 | # Create the replay buffer 186 | if prioritized_replay: 187 | replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) 188 | if prioritized_replay_beta_iters is None: 189 | prioritized_replay_beta_iters = max_timesteps 190 | beta_schedule = LinearSchedule(prioritized_replay_beta_iters, 191 | initial_p=prioritized_replay_beta0, 192 | final_p=1.0) 193 | else: 194 | replay_buffer = ReplayBuffer(buffer_size) 195 | beta_schedule = None 196 | # Create the schedule for exploration starting from 1. 197 | exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), 198 | initial_p=1.0, 199 | final_p=exploration_final_eps) 200 | 201 | # Initialize the parameters and copy them to the target network. 202 | U.initialize() 203 | update_target() 204 | 205 | episode_rewards = [0.0] 206 | saved_mean_reward = None 207 | obs = env.reset() 208 | with tempfile.TemporaryDirectory() as td: 209 | model_saved = False 210 | model_file = os.path.join(td, "model") 211 | for t in range(max_timesteps): 212 | if callback is not None: 213 | if callback(locals(), globals()): 214 | break 215 | # Take action and update exploration to the newest value 216 | action = act(np.array(obs)[None], update_eps=exploration.value(t))[0] 217 | new_obs, rew, done, _ = env.step(action) 218 | # Store transition in the replay buffer. 219 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 220 | obs = new_obs 221 | 222 | episode_rewards[-1] += rew 223 | if done: 224 | obs = env.reset() 225 | episode_rewards.append(0.0) 226 | 227 | if t > learning_starts and t % train_freq == 0: 228 | # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 229 | if prioritized_replay: 230 | experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) 231 | (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience 232 | else: 233 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) 234 | weights, batch_idxes = np.ones_like(rewards), None 235 | td_errors = train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) 236 | if prioritized_replay: 237 | new_priorities = np.abs(td_errors) + prioritized_replay_eps 238 | replay_buffer.update_priorities(batch_idxes, new_priorities) 239 | 240 | if t > learning_starts and t % target_network_update_freq == 0: 241 | # Update target network periodically. 
242 | update_target() 243 | 244 | mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) 245 | num_episodes = len(episode_rewards) 246 | if done and print_freq is not None and len(episode_rewards) % print_freq == 0: 247 | logger.record_tabular("steps", t) 248 | logger.record_tabular("episodes", num_episodes) 249 | logger.record_tabular("mean 100 episode reward", mean_100ep_reward) 250 | logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) 251 | logger.dump_tabular() 252 | 253 | if (checkpoint_freq is not None and t > learning_starts and 254 | num_episodes > 100 and t % checkpoint_freq == 0): 255 | if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: 256 | if print_freq is not None: 257 | logger.log("Saving model due to mean reward increase: {} -> {}".format( 258 | saved_mean_reward, mean_100ep_reward)) 259 | U.save_state(model_file) 260 | model_saved = True 261 | saved_mean_reward = mean_100ep_reward 262 | if model_saved: 263 | if print_freq is not None: 264 | logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) 265 | U.load_state(model_file) 266 | 267 | return ActWrapper(act, act_params) 268 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | import os 5 | import tensorflow as tf 6 | import tempfile 7 | import time 8 | 9 | import baselines.common.tf_util as U 10 | 11 | from baselines import logger 12 | from baselines import deepq 13 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer 14 | from baselines.common.misc_util import ( 15 | boolean_flag, 16 | pickle_load, 17 | pretty_eta, 18 | relatively_safe_pickle_dump, 19 | set_global_seeds, 20 | RunningAvg, 21 | SimpleMonitor 22 | ) 23 | from baselines.common.schedules import LinearSchedule, PiecewiseSchedule 24 | # when updating this to non-deperecated ones, it is important to 25 | # copy over LazyFrames 26 | from baselines.common.atari_wrappers_deprecated import wrap_dqn 27 | from baselines.common.azure_utils import Container 28 | from .model import model, dueling_model 29 | 30 | 31 | def parse_args(): 32 | parser = argparse.ArgumentParser("DQN experiments for Atari games") 33 | # Environment 34 | parser.add_argument("--env", type=str, default="Pong", help="name of the game") 35 | parser.add_argument("--seed", type=int, default=42, help="which seed to use") 36 | # Core DQN parameters 37 | parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size") 38 | parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer") 39 | parser.add_argument("--num-steps", type=int, default=int(2e8), help="total number of steps to run the environment for") 40 | parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time") 41 | parser.add_argument("--learning-freq", type=int, default=4, help="number of iterations between every optimization step") 42 | parser.add_argument("--target-update-freq", type=int, default=40000, help="number of iterations between every target network update") 43 | # Bells and whistles 44 | boolean_flag(parser, "double-q", default=True, help="whether or not to use double q learning") 45 | boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model") 46 | 
boolean_flag(parser, "prioritized", default=False, help="whether or not to use prioritized replay buffer") 47 | parser.add_argument("--prioritized-alpha", type=float, default=0.6, help="alpha parameter for prioritized replay buffer") 48 | parser.add_argument("--prioritized-beta0", type=float, default=0.4, help="initial value of beta parameters for prioritized replay") 49 | parser.add_argument("--prioritized-eps", type=float, default=1e-6, help="eps parameter for prioritized replay buffer") 50 | # Checkpointing 51 | parser.add_argument("--save-dir", type=str, default=None, help="directory in which training state and model should be saved.") 52 | parser.add_argument("--save-azure-container", type=str, default=None, 53 | help="It present data will saved/loaded from Azure. Should be in format ACCOUNT_NAME:ACCOUNT_KEY:CONTAINER") 54 | parser.add_argument("--save-freq", type=int, default=1e6, help="save model once every time this many iterations are completed") 55 | boolean_flag(parser, "load-on-start", default=True, help="if true and model was previously saved then training will be resumed") 56 | return parser.parse_args() 57 | 58 | 59 | def make_env(game_name): 60 | env = gym.make(game_name + "NoFrameskip-v4") 61 | monitored_env = SimpleMonitor(env) # puts rewards and number of steps in info, before environment is wrapped 62 | env = wrap_dqn(monitored_env) # applies a bunch of modification to simplify the observation space (downsample, make b/w) 63 | return env, monitored_env 64 | 65 | 66 | def maybe_save_model(savedir, container, state): 67 | """This function checkpoints the model and state of the training algorithm.""" 68 | if savedir is None: 69 | return 70 | start_time = time.time() 71 | model_dir = "model-{}".format(state["num_iters"]) 72 | U.save_state(os.path.join(savedir, model_dir, "saved")) 73 | if container is not None: 74 | container.put(os.path.join(savedir, model_dir), model_dir) 75 | relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True) 76 | if container is not None: 77 | container.put(os.path.join(savedir, 'training_state.pkl.zip'), 'training_state.pkl.zip') 78 | relatively_safe_pickle_dump(state["monitor_state"], os.path.join(savedir, 'monitor_state.pkl')) 79 | if container is not None: 80 | container.put(os.path.join(savedir, 'monitor_state.pkl'), 'monitor_state.pkl') 81 | logger.log("Saved model in {} seconds\n".format(time.time() - start_time)) 82 | 83 | 84 | def maybe_load_model(savedir, container): 85 | """Load model if present at the specified path.""" 86 | if savedir is None: 87 | return 88 | 89 | state_path = os.path.join(os.path.join(savedir, 'training_state.pkl.zip')) 90 | if container is not None: 91 | logger.log("Attempting to download model from Azure") 92 | found_model = container.get(savedir, 'training_state.pkl.zip') 93 | else: 94 | found_model = os.path.exists(state_path) 95 | if found_model: 96 | state = pickle_load(state_path, compression=True) 97 | model_dir = "model-{}".format(state["num_iters"]) 98 | if container is not None: 99 | container.get(savedir, model_dir) 100 | U.load_state(os.path.join(savedir, model_dir, "saved")) 101 | logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"])) 102 | return state 103 | 104 | 105 | if __name__ == '__main__': 106 | args = parse_args() 107 | # Parse savedir and azure container. 
108 | savedir = args.save_dir 109 | if args.save_azure_container is not None: 110 | account_name, account_key, container_name = args.save_azure_container.split(":") 111 | container = Container(account_name=account_name, 112 | account_key=account_key, 113 | container_name=container_name, 114 | maybe_create=True) 115 | if savedir is None: 116 | # Careful! This will not get cleaned up. Docker spoils the developers. 117 | savedir = tempfile.TemporaryDirectory().name 118 | else: 119 | container = None 120 | # Create and seed the env. 121 | env, monitored_env = make_env(args.env) 122 | if args.seed > 0: 123 | set_global_seeds(args.seed) 124 | env.unwrapped.seed(args.seed) 125 | 126 | with U.make_session(4) as sess: 127 | # Create training graph and replay buffer 128 | act, train, update_target, debug = deepq.build_train( 129 | make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name), 130 | q_func=dueling_model if args.dueling else model, 131 | num_actions=env.action_space.n, 132 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4), 133 | gamma=0.99, 134 | grad_norm_clipping=10, 135 | double_q=args.double_q 136 | ) 137 | 138 | approximate_num_iters = args.num_steps / 4 139 | exploration = PiecewiseSchedule([ 140 | (0, 1.0), 141 | (approximate_num_iters / 50, 0.1), 142 | (approximate_num_iters / 5, 0.01) 143 | ], outside_value=0.01) 144 | 145 | if args.prioritized: 146 | replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size, args.prioritized_alpha) 147 | beta_schedule = LinearSchedule(approximate_num_iters, initial_p=args.prioritized_beta0, final_p=1.0) 148 | else: 149 | replay_buffer = ReplayBuffer(args.replay_buffer_size) 150 | 151 | U.initialize() 152 | update_target() 153 | num_iters = 0 154 | 155 | # Load the model 156 | state = maybe_load_model(savedir, container) 157 | if state is not None: 158 | num_iters, replay_buffer = state["num_iters"], state["replay_buffer"], 159 | monitored_env.set_state(state["monitor_state"]) 160 | 161 | start_time, start_steps = None, None 162 | steps_per_iter = RunningAvg(0.999) 163 | iteration_time_est = RunningAvg(0.999) 164 | obs = env.reset() 165 | 166 | # Main trianing loop 167 | while True: 168 | num_iters += 1 169 | # Take action and store transition in the replay buffer. 170 | action = act(np.array(obs)[None], update_eps=exploration.value(num_iters))[0] 171 | new_obs, rew, done, info = env.step(action) 172 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 173 | obs = new_obs 174 | if done: 175 | obs = env.reset() 176 | 177 | if (num_iters > max(5 * args.batch_size, args.replay_buffer_size // 20) and 178 | num_iters % args.learning_freq == 0): 179 | # Sample a bunch of transitions from replay buffer 180 | if args.prioritized: 181 | experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(num_iters)) 182 | (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience 183 | else: 184 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) 185 | weights = np.ones_like(rewards) 186 | # Minimize the error in Bellman's equation and compute TD-error 187 | td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) 188 | # Update the priorities in the replay buffer 189 | if args.prioritized: 190 | new_priorities = np.abs(td_errors) + args.prioritized_eps 191 | replay_buffer.update_priorities(batch_idxes, new_priorities) 192 | # Update target network. 
193 | if num_iters % args.target_update_freq == 0: 194 | update_target() 195 | 196 | if start_time is not None: 197 | steps_per_iter.update(info['steps'] - start_steps) 198 | iteration_time_est.update(time.time() - start_time) 199 | start_time, start_steps = time.time(), info["steps"] 200 | 201 | # Save the model and training state. 202 | if num_iters > 0 and (num_iters % args.save_freq == 0 or info["steps"] > args.num_steps): 203 | maybe_save_model(savedir, container, { 204 | 'replay_buffer': replay_buffer, 205 | 'num_iters': num_iters, 206 | 'monitor_state': monitored_env.get_state() 207 | }) 208 | 209 | if info["steps"] > args.num_steps: 210 | break 211 | 212 | if done: 213 | steps_left = args.num_steps - info["steps"] 214 | completion = np.round(info["steps"] / args.num_steps, 1) 215 | 216 | logger.record_tabular("% completion", completion) 217 | logger.record_tabular("steps", info["steps"]) 218 | logger.record_tabular("iters", num_iters) 219 | logger.record_tabular("episodes", len(info["rewards"])) 220 | logger.record_tabular("reward (100 epi mean)", np.mean(info["rewards"][-100:])) 221 | logger.record_tabular("exploration", exploration.value(num_iters)) 222 | if args.prioritized: 223 | logger.record_tabular("max priority", replay_buffer._max_priority) 224 | fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6) 225 | if steps_per_iter._value is not None else "calculating...") 226 | logger.dump_tabular() 227 | logger.log() 228 | logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate))) 229 | logger.log() 230 | -------------------------------------------------------------------------------- /baselines/deepq/build_graph.py: -------------------------------------------------------------------------------- 1 | """Deep Q learning graph 2 | 3 | The functions in this file can are used to create the following functions: 4 | 5 | ======= act ======== 6 | 7 | Function to chose an action given an observation 8 | 9 | Parameters 10 | ---------- 11 | observation: object 12 | Observation that can be feed into the output of make_obs_ph 13 | stochastic: bool 14 | if set to False all the actions are always deterministic (default False) 15 | update_eps_ph: float 16 | update epsilon a new value, if negative not update happens 17 | (default: no update) 18 | 19 | Returns 20 | ------- 21 | Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for 22 | every element of the batch. 23 | 24 | 25 | ======= train ======= 26 | 27 | Function that takes a transition (s,a,r,s') and optimizes Bellman equation's error: 28 | 29 | td_error = Q(s,a) - (r + gamma * max_a' Q(s', a')) 30 | loss = huber_loss[td_error] 31 | 32 | Parameters 33 | ---------- 34 | obs_t: object 35 | a batch of observations 36 | action: np.array 37 | actions that were selected upon seeing obs_t. 38 | dtype must be int32 and shape must be (batch_size,) 39 | reward: np.array 40 | immediate reward attained after executing those actions 41 | dtype must be float32 and shape must be (batch_size,) 42 | obs_tp1: object 43 | observations that followed obs_t 44 | done: np.array 45 | 1 if obs_t was the last observation in the episode and 0 otherwise 46 | obs_tp1 gets ignored, but must be of the valid shape. 
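# --- Editor's note: illustrative sketch, not part of the original file ---
# The quantity the train function optimizes, written out in numpy for a batch
# of two transitions; the done mask removes the bootstrap term exactly as the
# `done` argument described above.
import numpy as np

gamma = 0.99
q_t_selected = np.array([1.0, 2.0])     # Q(s, a) for the actions actually taken
q_tp1_best = np.array([3.0, 4.0])       # max_a' Q(s', a') from the target network
rew = np.array([0.0, 1.0])
done = np.array([0.0, 1.0])             # second transition ends its episode

target = rew + gamma * (1.0 - done) * q_tp1_best
td_error = q_t_selected - target        # [1.0 - 2.97, 2.0 - 1.0] = [-1.97, 1.0]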
47 | dtype must be float32 and shape must be (batch_size,) 48 | weight: np.array 49 | imporance weights for every element of the batch (gradient is multiplied 50 | by the importance weight) dtype must be float32 and shape must be (batch_size,) 51 | 52 | Returns 53 | ------- 54 | td_error: np.array 55 | a list of differences between Q(s,a) and the target in Bellman's equation. 56 | dtype is float32 and shape is (batch_size,) 57 | 58 | ======= update_target ======== 59 | 60 | copy the parameters from optimized Q function to the target Q function. 61 | In Q learning we actually optimize the following error: 62 | 63 | Q(s,a) - (r + gamma * max_a' Q'(s', a')) 64 | 65 | Where Q' is lagging behind Q to stablize the learning. For example for Atari 66 | 67 | Q' is set to Q once every 10000 updates training steps. 68 | 69 | """ 70 | import tensorflow as tf 71 | import baselines.common.tf_util as U 72 | from cleverhans.attacks import FastGradientMethod, BasicIterativeMethod, CarliniWagnerL2 73 | from cleverhans.model import CallableModelWrapper 74 | 75 | 76 | def build_act(make_obs_ph, q_func, num_actions, attack=None, scope="deepq", reuse=None, model_path=''): 77 | """Creates the act function: 78 | 79 | Parameters 80 | ---------- 81 | make_obs_ph: str -> tf.placeholder or TfInput 82 | a function that take a name and creates a placeholder of input with that name 83 | q_func: (tf.Variable, int, str, bool) -> tf.Variable 84 | the model that takes the following inputs: 85 | observation_in: object 86 | the output of observation placeholder 87 | num_actions: int 88 | number of actions 89 | scope: str 90 | reuse: bool 91 | should be passed to outer variable scope 92 | and returns a tensor of shape (batch_size, num_actions) with values of every action. 93 | num_actions: int 94 | number of actions. 95 | scope: str or VariableScope 96 | optional scope for variable_scope. 97 | reuse: bool or None 98 | whether or not the variables should be reused. To be able to reuse the scope must be given. 99 | 100 | Returns 101 | ------- 102 | act: (tf.Variable, bool, float) -> tf.Variable 103 | function to select and action given observation. 104 | ` See the top of the file for details. 
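# --- Editor's note: illustrative sketch, not part of the original file ---
# A hypothetical evaluation step using the (act, craft_adv_obs) pair that
# build_act returns when attack='fgsm' (or 'iterative' / 'cwl2'). It assumes
# build_act was already called with a valid model_path inside an active
# U.make_session() session; env, obs, act and craft_adv_obs are supplied by
# the caller, and the exact wiring in this repo's evaluation scripts may
# differ.
import numpy as np

def attacked_step(env, obs, act, craft_adv_obs):
    adv_obs = craft_adv_obs(np.array(obs)[None])[0]   # perturbed frame stack
    action = act(np.array(adv_obs)[None])[0]          # act on the attacked input
    return env.step(action)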
105 | """ 106 | with tf.variable_scope(scope, reuse=reuse): 107 | observations_ph = U.ensure_tf_input(make_obs_ph("observation")) 108 | stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") 109 | update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") 110 | 111 | eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) 112 | 113 | q_values = q_func(observations_ph.get(), num_actions, scope="q_func", concat_softmax=True) 114 | deterministic_actions = tf.argmax(q_values, axis=1) 115 | batch_size = tf.shape(observations_ph.get())[0] 116 | random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) 117 | chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps 118 | stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) 119 | 120 | output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) 121 | update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) 122 | 123 | act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], 124 | outputs=output_actions, 125 | givens={update_eps_ph: -1.0, stochastic_ph: True}, 126 | updates=[update_eps_expr]) 127 | 128 | # Load model before attacks graph construction so that TF won't 129 | # complain can't load parameters for attack 130 | U.load_state(model_path) 131 | 132 | if attack != None: 133 | if attack == 'fgsm': 134 | def wrapper(x): 135 | return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True) 136 | adversary = FastGradientMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session()) 137 | adv_observations = adversary.generate(observations_ph.get(), eps=1.0/255.0, 138 | clip_min=0, clip_max=1.0) * 255.0 139 | elif attack == 'iterative': 140 | def wrapper(x): 141 | return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True) 142 | adversary = BasicIterativeMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session()) 143 | adv_observations = adversary.generate(observations_ph.get(), eps=1.0/255.0, 144 | clip_min=0, clip_max=1.0) * 255.0 145 | elif attack == 'cwl2': 146 | def wrapper(x): 147 | return q_func(x, num_actions, scope="q_func", reuse=True) 148 | adversary = CarliniWagnerL2(CallableModelWrapper(wrapper, 'logits'), sess=U.get_session()) 149 | cw_params = {'binary_search_steps': 1, 150 | 'max_iterations': 100, 151 | 'learning_rate': 0.1, 152 | 'initial_const': 10, 153 | 'clip_min': 0, 154 | 'clip_max': 1.0} 155 | adv_observations = adversary.generate(observations_ph.get(), **cw_params) * 255.0 156 | 157 | craft_adv_obs = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], 158 | outputs=adv_observations, 159 | givens={update_eps_ph: -1.0, stochastic_ph: True}, 160 | updates=[update_eps_expr]) 161 | 162 | if attack == None: 163 | craft_adv_obs = None 164 | 165 | return act, craft_adv_obs 166 | 167 | 168 | def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None): 169 | """Creates the train function: 170 | 171 | Parameters 172 | ---------- 173 | make_obs_ph: str -> tf.placeholder or TfInput 174 | a function that takes a name and creates a placeholder of input with that name 175 | q_func: (tf.Variable, int, str, bool) -> tf.Variable 176 | the model that takes the following inputs: 177 | observation_in: object 178 | the output of observation placeholder 179 | 
num_actions: int 180 | number of actions 181 | scope: str 182 | reuse: bool 183 | should be passed to outer variable scope 184 | and returns a tensor of shape (batch_size, num_actions) with values of every action. 185 | num_actions: int 186 | number of actions 187 | reuse: bool 188 | whether or not to reuse the graph variables 189 | optimizer: tf.train.Optimizer 190 | optimizer to use for the Q-learning objective. 191 | grad_norm_clipping: float or None 192 | clip gradient norms to this value. If None no clipping is performed. 193 | gamma: float 194 | discount rate. 195 | double_q: bool 196 | if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). 197 | In general it is a good idea to keep it enabled. 198 | scope: str or VariableScope 199 | optional scope for variable_scope. 200 | reuse: bool or None 201 | whether or not the variables should be reused. To be able to reuse the scope must be given. 202 | 203 | Returns 204 | ------- 205 | act: (tf.Variable, bool, float) -> tf.Variable 206 | function to select and action given observation. 207 | ` See the top of the file for details. 208 | train: (object, np.array, np.array, object, np.array, np.array) -> np.array 209 | optimize the error in Bellman's equation. 210 | ` See the top of the file for details. 211 | update_target: () -> () 212 | copy the parameters from optimized Q function to the target Q function. 213 | ` See the top of the file for details. 214 | debug: {str: function} 215 | a bunch of functions to print debug data like q_values. 216 | """ 217 | act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) 218 | 219 | with tf.variable_scope(scope, reuse=reuse): 220 | # set up placeholders 221 | obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) 222 | act_t_ph = tf.placeholder(tf.int32, [None], name="action") 223 | rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") 224 | obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) 225 | done_mask_ph = tf.placeholder(tf.float32, [None], name="done") 226 | importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") 227 | 228 | # q network evaluation 229 | q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act 230 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) 231 | 232 | # target q network evalution 233 | q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") 234 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) 235 | 236 | # q scores for actions which we know were selected in the given state. 
237 | q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) 238 | 239 | # compute estimate of best possible value starting from state at t + 1 240 | if double_q: 241 | q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) 242 | q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) 243 | q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) 244 | else: 245 | q_tp1_best = tf.reduce_max(q_tp1, 1) 246 | q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best 247 | 248 | # compute RHS of bellman equation 249 | q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked 250 | 251 | # compute the error (potentially clipped) 252 | td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) 253 | errors = U.huber_loss(td_error) 254 | weighted_error = tf.reduce_mean(importance_weights_ph * errors) 255 | # compute optimization op (potentially with gradient clipping) 256 | if grad_norm_clipping is not None: 257 | optimize_expr = U.minimize_and_clip(optimizer, 258 | weighted_error, 259 | var_list=q_func_vars, 260 | clip_val=grad_norm_clipping) 261 | else: 262 | optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) 263 | 264 | # update_target_fn will be called periodically to copy Q network to target Q network 265 | update_target_expr = [] 266 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 267 | sorted(target_q_func_vars, key=lambda v: v.name)): 268 | update_target_expr.append(var_target.assign(var)) 269 | update_target_expr = tf.group(*update_target_expr) 270 | 271 | # Create callable functions 272 | train = U.function( 273 | inputs=[ 274 | obs_t_input, 275 | act_t_ph, 276 | rew_t_ph, 277 | obs_tp1_input, 278 | done_mask_ph, 279 | importance_weights_ph 280 | ], 281 | outputs=td_error, 282 | updates=[optimize_expr] 283 | ) 284 | update_target = U.function([], [], updates=[update_target_expr]) 285 | 286 | q_values = U.function([obs_t_input], q_t) 287 | 288 | return act_f, train, update_target, {'q_values': q_values} 289 | -------------------------------------------------------------------------------- /baselines/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | import builtins 4 | import functools 5 | import copy 6 | import os 7 | import collections 8 | 9 | 10 | # ================================================================ 11 | # Make consistent with numpy 12 | # ================================================================ 13 | 14 | clip = tf.clip_by_value 15 | 16 | 17 | def sum(x, axis=None, keepdims=False): 18 | axis = None if axis is None else [axis] 19 | return tf.reduce_sum(x, axis=axis, keep_dims=keepdims) 20 | 21 | 22 | def mean(x, axis=None, keepdims=False): 23 | axis = None if axis is None else [axis] 24 | return tf.reduce_mean(x, axis=axis, keep_dims=keepdims) 25 | 26 | 27 | def var(x, axis=None, keepdims=False): 28 | meanx = mean(x, axis=axis, keepdims=keepdims) 29 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 30 | 31 | 32 | def std(x, axis=None, keepdims=False): 33 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 34 | 35 | 36 | def max(x, axis=None, keepdims=False): 37 | axis = None if axis is None else [axis] 38 | return tf.reduce_max(x, axis=axis, keep_dims=keepdims) 39 | 40 | 41 | def min(x, axis=None, keepdims=False): 42 | axis = None if axis is None else [axis] 43 | return 
tf.reduce_min(x, axis=axis, keep_dims=keepdims) 44 | 45 | 46 | def concatenate(arrs, axis=0): 47 | return tf.concat(axis=axis, values=arrs) 48 | 49 | 50 | def argmax(x, axis=None): 51 | return tf.argmax(x, axis=axis) 52 | 53 | 54 | def switch(condition, then_expression, else_expression): 55 | """Switches between two operations depending on a scalar value (int or bool). 56 | Note that both `then_expression` and `else_expression` 57 | should be symbolic tensors of the *same shape*. 58 | 59 | # Arguments 60 | condition: scalar tensor. 61 | then_expression: TensorFlow operation. 62 | else_expression: TensorFlow operation. 63 | """ 64 | x_shape = copy.copy(then_expression.get_shape()) 65 | x = tf.cond(tf.cast(condition, 'bool'), 66 | lambda: then_expression, 67 | lambda: else_expression) 68 | x.set_shape(x_shape) 69 | return x 70 | 71 | # ================================================================ 72 | # Extras 73 | # ================================================================ 74 | 75 | 76 | def l2loss(params): 77 | if len(params) == 0: 78 | return tf.constant(0.0) 79 | else: 80 | return tf.add_n([sum(tf.square(p)) for p in params]) 81 | 82 | 83 | def lrelu(x, leak=0.2): 84 | f1 = 0.5 * (1 + leak) 85 | f2 = 0.5 * (1 - leak) 86 | return f1 * x + f2 * abs(x) 87 | 88 | 89 | def categorical_sample_logits(X): 90 | # https://github.com/tensorflow/tensorflow/issues/456 91 | U = tf.random_uniform(tf.shape(X)) 92 | return argmax(X - tf.log(-tf.log(U)), axis=1) 93 | 94 | 95 | # ================================================================ 96 | # Inputs 97 | # ================================================================ 98 | 99 | 100 | def is_placeholder(x): 101 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 102 | 103 | 104 | class TfInput(object): 105 | def __init__(self, name="(unnamed)"): 106 | """Generalized Tensorflow placeholder. The main differences are: 107 | - possibly uses multiple placeholders internally and returns multiple values 108 | - can apply light postprocessing to the value feed to placeholder. 109 | """ 110 | self.name = name 111 | 112 | def get(self): 113 | """Return the tf variable(s) representing the possibly postprocessed value 114 | of placeholder(s). 115 | """ 116 | raise NotImplemented() 117 | 118 | def make_feed_dict(data): 119 | """Given data input it to the placeholder(s).""" 120 | raise NotImplemented() 121 | 122 | 123 | class PlacholderTfInput(TfInput): 124 | def __init__(self, placeholder): 125 | """Wrapper for regular tensorflow placeholder.""" 126 | super().__init__(placeholder.name) 127 | self._placeholder = placeholder 128 | 129 | def get(self): 130 | return self._placeholder 131 | 132 | def make_feed_dict(self, data): 133 | return {self._placeholder: data} 134 | 135 | 136 | class BatchInput(PlacholderTfInput): 137 | def __init__(self, shape, dtype=tf.float32, name=None): 138 | """Creates a placeholder for a batch of tensors of a given shape and dtype 139 | 140 | Parameters 141 | ---------- 142 | shape: [int] 143 | shape of a single elemenet of the batch 144 | dtype: tf.dtype 145 | number representation used for tensor contents 146 | name: str 147 | name of the underlying placeholder 148 | """ 149 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 150 | 151 | 152 | class Uint8Input(PlacholderTfInput): 153 | def __init__(self, shape, name=None): 154 | """Takes input in uint8 format which is cast to float32 and divided by 255 155 | before passing it to the model. 156 | 157 | On GPU this ensures lower data transfer times. 
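Example (illustrative; assumes an Atari-style frame stack):
    obs_ph = Uint8Input([84, 84, 4], name="observation")
    # feed raw uint8 arrays; obs_ph.get() yields float32 values scaled to [0, 1]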
158 | 159 | Parameters 160 | ---------- 161 | shape: [int] 162 | shape of the tensor. 163 | name: str 164 | name of the underlying placeholder 165 | """ 166 | 167 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 168 | self._shape = shape 169 | self._output = tf.cast(super().get(), tf.float32) / 255.0 170 | 171 | def get(self): 172 | return self._output 173 | 174 | 175 | def ensure_tf_input(thing): 176 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" 177 | if isinstance(thing, TfInput): 178 | return thing 179 | elif is_placeholder(thing): 180 | return PlacholderTfInput(thing) 181 | else: 182 | raise ValueError("Must be a placeholder or TfInput") 183 | 184 | # ================================================================ 185 | # Mathematical utils 186 | # ================================================================ 187 | 188 | 189 | def huber_loss(x, delta=1.0): 190 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 191 | return tf.where( 192 | tf.abs(x) < delta, 193 | tf.square(x) * 0.5, 194 | delta * (tf.abs(x) - 0.5 * delta) 195 | ) 196 | 197 | # ================================================================ 198 | # Optimizer utils 199 | # ================================================================ 200 | 201 | 202 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 203 | """Minimized `objective` using `optimizer` w.r.t. variables in 204 | `var_list` while ensure the norm of the gradients for each 205 | variable is clipped to `clip_val` 206 | """ 207 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 208 | for i, (grad, var) in enumerate(gradients): 209 | if grad is not None: 210 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 211 | return optimizer.apply_gradients(gradients) 212 | 213 | 214 | # ================================================================ 215 | # Global session 216 | # ================================================================ 217 | 218 | def get_session(): 219 | """Returns recently made Tensorflow session""" 220 | return tf.get_default_session() 221 | 222 | 223 | def make_session(num_cpu): 224 | """Returns a session that will use CPU's only""" 225 | tf_config = tf.ConfigProto( 226 | inter_op_parallelism_threads=num_cpu, 227 | intra_op_parallelism_threads=num_cpu) 228 | return tf.Session(config=tf_config) 229 | 230 | 231 | def single_threaded_session(): 232 | """Returns a session which will only use a single CPU""" 233 | return make_session(1) 234 | 235 | 236 | ALREADY_INITIALIZED = set() 237 | 238 | 239 | def initialize(): 240 | """Initialize all the uninitialized variables in the global scope.""" 241 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 242 | get_session().run(tf.variables_initializer(new_variables)) 243 | ALREADY_INITIALIZED.update(new_variables) 244 | 245 | 246 | def eval(expr, feed_dict=None): 247 | if feed_dict is None: 248 | feed_dict = {} 249 | return get_session().run(expr, feed_dict=feed_dict) 250 | 251 | 252 | VALUE_SETTERS = collections.OrderedDict() 253 | 254 | 255 | def set_value(v, val): 256 | global VALUE_SETTERS 257 | if v in VALUE_SETTERS: 258 | set_op, set_endpoint = VALUE_SETTERS[v] 259 | else: 260 | set_endpoint = tf.placeholder(v.dtype) 261 | set_op = v.assign(set_endpoint) 262 | VALUE_SETTERS[v] = (set_op, set_endpoint) 263 | get_session().run(set_op, feed_dict={set_endpoint: val}) 264 | 265 | 266 | # ================================================================ 267 | # Saving 
variables 268 | # ================================================================ 269 | 270 | 271 | def load_state(fname): 272 | saver = tf.train.Saver() 273 | saver.restore(get_session(), fname) 274 | 275 | 276 | def save_state(fname): 277 | os.makedirs(os.path.dirname(fname), exist_ok=True) 278 | saver = tf.train.Saver() 279 | saver.save(get_session(), fname) 280 | 281 | # ================================================================ 282 | # Model components 283 | # ================================================================ 284 | 285 | 286 | def normc_initializer(std=1.0): 287 | def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 288 | out = np.random.randn(*shape).astype(np.float32) 289 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 290 | return tf.constant(out) 291 | return _initializer 292 | 293 | 294 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 295 | summary_tag=None): 296 | with tf.variable_scope(name): 297 | stride_shape = [1, stride[0], stride[1], 1] 298 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 299 | 300 | # there are "num input feature maps * filter height * filter width" 301 | # inputs to each hidden unit 302 | fan_in = intprod(filter_shape[:3]) 303 | # each unit in the lower layer receives a gradient from: 304 | # "num output feature maps * filter height * filter width" / 305 | # pooling size 306 | fan_out = intprod(filter_shape[:2]) * num_filters 307 | # initialize weights with random weights 308 | w_bound = np.sqrt(6. / (fan_in + fan_out)) 309 | 310 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 311 | collections=collections) 312 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), 313 | collections=collections) 314 | 315 | if summary_tag is not None: 316 | tf.summary.image(summary_tag, 317 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 318 | [2, 0, 1, 3]), 319 | max_images=10) 320 | 321 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 322 | 323 | 324 | def dense(x, size, name, weight_init=None, bias=True): 325 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 326 | ret = tf.matmul(x, w) 327 | if bias: 328 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer()) 329 | return ret + b 330 | else: 331 | return ret 332 | 333 | 334 | def wndense(x, size, name, init_scale=1.0): 335 | v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], 336 | initializer=tf.random_normal_initializer(0, 0.05)) 337 | g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale)) 338 | b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0)) 339 | 340 | # use weight normalization (Salimans & Kingma, 2016) 341 | x = tf.matmul(x, v) 342 | scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) 343 | return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) 344 | 345 | 346 | def densenobias(x, size, name, weight_init=None): 347 | return dense(x, size, name, weight_init=weight_init, bias=False) 348 | 349 | 350 | def dropout(x, pkeep, phase=None, mask=None): 351 | mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask 352 | if phase is None: 353 | return mask * x 354 | else: 355 | return switch(phase, mask * x, pkeep * x) 356 | 357 | 358 | # 
================================================================ 359 | # Theano-like Function 360 | # ================================================================ 361 | 362 | 363 | 364 | def function(inputs, outputs, updates=None, givens=None): 365 | """Just like Theano function. Take a bunch of tensorflow placeholders and expressions 366 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 367 | values to be fed to the input's placeholders and produces the values of the expressions 368 | in outputs. 369 | 370 | Input values can be passed in the same order as inputs or can be provided as kwargs based 371 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 372 | 373 | Example: 374 | x = tf.placeholder(tf.int32, (), name="x") 375 | y = tf.placeholder(tf.int32, (), name="y") 376 | z = 3 * x + 2 * y 377 | lin = function([x, y], z, givens={y: 0}) 378 | 379 | with single_threaded_session(): 380 | initialize() 381 | 382 | assert lin(2) == 6 383 | assert lin(x=3) == 9 384 | assert lin(2, 2) == 10 385 | assert lin(x=2, y=3) == 12 386 | 387 | Parameters 388 | ---------- 389 | inputs: [tf.placeholder or TfInput] 390 | list of input arguments 391 | outputs: [tf.Variable] or tf.Variable 392 | list of outputs or a single output to be returned from function. Returned 393 | value will also have the same shape. 394 | """ 395 | if isinstance(outputs, list): 396 | return _Function(inputs, outputs, updates, givens=givens) 397 | elif isinstance(outputs, (dict, collections.OrderedDict)): 398 | f = _Function(inputs, outputs.values(), updates, givens=givens) 399 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 400 | else: 401 | f = _Function(inputs, [outputs], updates, givens=givens) 402 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 403 | 404 | 405 | class _Function(object): 406 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 407 | for inpt in inputs: 408 | if not issubclass(type(inpt), TfInput): 409 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of baselines.common.TfInput" 410 | self.inputs = inputs 411 | updates = updates or [] 412 | self.update_group = tf.group(*updates) 413 | self.outputs_update = list(outputs) + [self.update_group] 414 | self.givens = {} if givens is None else givens 415 | self.check_nan = check_nan 416 | 417 | def _feed_input(self, feed_dict, inpt, value): 418 | if issubclass(type(inpt), TfInput): 419 | feed_dict.update(inpt.make_feed_dict(value)) 420 | elif is_placeholder(inpt): 421 | feed_dict[inpt] = value 422 | 423 | def __call__(self, *args, **kwargs): 424 | assert len(args) <= len(self.inputs), "Too many arguments provided" 425 | feed_dict = {} 426 | # Update the args 427 | for inpt, value in zip(self.inputs, args): 428 | self._feed_input(feed_dict, inpt, value) 429 | # Update the kwargs 430 | kwargs_passed_inpt_names = set() 431 | for inpt in self.inputs[len(args):]: 432 | inpt_name = inpt.name.split(':')[0] 433 | inpt_name = inpt_name.split('/')[-1] 434 | assert inpt_name not in kwargs_passed_inpt_names, \ 435 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 436 | if inpt_name in kwargs: 437 | kwargs_passed_inpt_names.add(inpt_name) 438 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 439 | else: 440 | assert inpt in self.givens, "Missing argument " + inpt_name 441 | assert len(kwargs) == 0, "Function got extra arguments " + 
str(list(kwargs.keys())) 442 | # Update feed dict with givens. 443 | for inpt in self.givens: 444 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 445 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 446 | if self.check_nan: 447 | if any(np.isnan(r).any() for r in results): 448 | raise RuntimeError("Nan detected") 449 | return results 450 | 451 | 452 | def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): 453 | if isinstance(outputs, list): 454 | return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) 455 | else: 456 | f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) 457 | return lambda *inputs: f(*inputs)[0] 458 | 459 | 460 | class _MemFriendlyFunction(object): 461 | def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): 462 | self.nondata_inputs = nondata_inputs 463 | self.data_inputs = data_inputs 464 | self.outputs = list(outputs) 465 | self.batch_size = batch_size 466 | 467 | def __call__(self, *inputvals): 468 | assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs) 469 | nondata_vals = inputvals[0:len(self.nondata_inputs)] 470 | data_vals = inputvals[len(self.nondata_inputs):] 471 | feed_dict = dict(zip(self.nondata_inputs, nondata_vals)) 472 | n = data_vals[0].shape[0] 473 | for v in data_vals[1:]: 474 | assert v.shape[0] == n 475 | for i_start in range(0, n, self.batch_size): 476 | slice_vals = [v[i_start:builtins.min(i_start + self.batch_size, n)] for v in data_vals] 477 | for (var, val) in zip(self.data_inputs, slice_vals): 478 | feed_dict[var] = val 479 | results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict) 480 | if i_start == 0: 481 | sum_results = results 482 | else: 483 | for i in range(len(results)): 484 | sum_results[i] = sum_results[i] + results[i] 485 | for i in range(len(results)): 486 | sum_results[i] = sum_results[i] / n 487 | return sum_results 488 | 489 | # ================================================================ 490 | # Modules 491 | # ================================================================ 492 | 493 | 494 | class Module(object): 495 | def __init__(self, name): 496 | self.name = name 497 | self.first_time = True 498 | self.scope = None 499 | self.cache = {} 500 | 501 | def __call__(self, *args): 502 | if args in self.cache: 503 | print("(%s) retrieving value from cache" % (self.name,)) 504 | return self.cache[args] 505 | with tf.variable_scope(self.name, reuse=not self.first_time): 506 | scope = tf.get_variable_scope().name 507 | if self.first_time: 508 | self.scope = scope 509 | print("(%s) running function for the first time" % (self.name,)) 510 | else: 511 | assert self.scope == scope, "Tried calling function with a different scope" 512 | print("(%s) running function on new inputs" % (self.name,)) 513 | self.first_time = False 514 | out = self._call(*args) 515 | self.cache[args] = out 516 | return out 517 | 518 | def _call(self, *args): 519 | raise NotImplementedError 520 | 521 | @property 522 | def trainable_variables(self): 523 | assert self.scope is not None, "need to call module once before getting variables" 524 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 525 | 526 | @property 527 | def variables(self): 528 | assert self.scope is not None, "need to call module once before getting variables" 529 | return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope) 530 | 531 | 532 | def module(name): 533 | @functools.wraps 534 | def wrapper(f): 535 | class 
WrapperModule(Module): 536 | def _call(self, *args): 537 | return f(*args) 538 | return WrapperModule(name) 539 | return wrapper 540 | 541 | # ================================================================ 542 | # Graph traversal 543 | # ================================================================ 544 | 545 | 546 | VARIABLES = {} 547 | 548 | 549 | def get_parents(node): 550 | return node.op.inputs 551 | 552 | 553 | def topsorted(outputs): 554 | """ 555 | Topological sort via non-recursive depth-first search 556 | """ 557 | assert isinstance(outputs, (list, tuple)) 558 | marks = {} 559 | out = [] 560 | stack = [] # pylint: disable=W0621 561 | # i: node 562 | # jidx = number of children visited so far from that node 563 | # marks: state of each node, which is one of 564 | # 0: haven't visited 565 | # 1: have visited, but not done visiting children 566 | # 2: done visiting children 567 | for x in outputs: 568 | stack.append((x, 0)) 569 | while stack: 570 | (i, jidx) = stack.pop() 571 | if jidx == 0: 572 | m = marks.get(i, 0) 573 | if m == 0: 574 | marks[i] = 1 575 | elif m == 1: 576 | raise ValueError("not a dag") 577 | else: 578 | continue 579 | ps = get_parents(i) 580 | if jidx == len(ps): 581 | marks[i] = 2 582 | out.append(i) 583 | else: 584 | stack.append((i, jidx + 1)) 585 | j = ps[jidx] 586 | stack.append((j, 0)) 587 | return out 588 | 589 | 590 | # ================================================================ 591 | # Flat vectors 592 | # ================================================================ 593 | 594 | def var_shape(x): 595 | out = x.get_shape().as_list() 596 | assert all(isinstance(a, int) for a in out), \ 597 | "shape function assumes that shape is fully known" 598 | return out 599 | 600 | 601 | def numel(x): 602 | return intprod(var_shape(x)) 603 | 604 | 605 | def intprod(x): 606 | return int(np.prod(x)) 607 | 608 | 609 | def flatgrad(loss, var_list): 610 | grads = tf.gradients(loss, var_list) 611 | return tf.concat(axis=0, values=[ 612 | tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)]) 613 | for (v, grad) in zip(var_list, grads) 614 | ]) 615 | 616 | 617 | class SetFromFlat(object): 618 | def __init__(self, var_list, dtype=tf.float32): 619 | assigns = [] 620 | shapes = list(map(var_shape, var_list)) 621 | total_size = np.sum([intprod(shape) for shape in shapes]) 622 | 623 | self.theta = theta = tf.placeholder(dtype, [total_size]) 624 | start = 0 625 | assigns = [] 626 | for (shape, v) in zip(shapes, var_list): 627 | size = intprod(shape) 628 | assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape))) 629 | start += size 630 | self.op = tf.group(*assigns) 631 | 632 | def __call__(self, theta): 633 | get_session().run(self.op, feed_dict={self.theta: theta}) 634 | 635 | 636 | class GetFlat(object): 637 | def __init__(self, var_list): 638 | self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) 639 | 640 | def __call__(self): 641 | return get_session().run(self.op) 642 | 643 | # ================================================================ 644 | # Misc 645 | # ================================================================ 646 | 647 | 648 | def fancy_slice_2d(X, inds0, inds1): 649 | """ 650 | like numpy X[inds0, inds1] 651 | XXX this implementation is bad 652 | """ 653 | inds0 = tf.cast(inds0, tf.int64) 654 | inds1 = tf.cast(inds1, tf.int64) 655 | shape = tf.cast(tf.shape(X), tf.int64) 656 | ncols = shape[1] 657 | Xflat = tf.reshape(X, [-1]) 658 | return tf.gather(Xflat, inds0 * ncols + inds1) 659 
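# Example (illustrative only; assumes a default session exists and initialize() has run):
#   X = tf.constant([[1., 2., 3.], [4., 5., 6.]])
#   eval(fancy_slice_2d(X, [0, 1], [2, 0]))  # -> array([3., 4.], dtype=float32),
#   i.e. the same result as numpy's X[[0, 1], [2, 0]]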
| 660 | 661 | # ================================================================ 662 | # Scopes 663 | # ================================================================ 664 | 665 | 666 | def scope_vars(scope, trainable_only=False): 667 | """ 668 | Get variables inside a scope 669 | The scope can be specified as a string 670 | 671 | Parameters 672 | ---------- 673 | scope: str or VariableScope 674 | scope in which the variables reside. 675 | trainable_only: bool 676 | whether or not to return only the variables that were marked as trainable. 677 | 678 | Returns 679 | ------- 680 | vars: [tf.Variable] 681 | list of variables in `scope`. 682 | """ 683 | return tf.get_collection( 684 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, 685 | scope=scope if isinstance(scope, str) else scope.name 686 | ) 687 | 688 | 689 | def scope_name(): 690 | """Returns the name of current scope as a string, e.g. deepq/q_func""" 691 | return tf.get_variable_scope().name 692 | 693 | 694 | def absolute_scope_name(relative_scope_name): 695 | """Appends parent scope name to `relative_scope_name`""" 696 | return scope_name() + "/" + relative_scope_name 697 | 698 | 699 | def lengths_to_mask(lengths_b, max_length): 700 | """ 701 | Turns a vector of lengths into a boolean mask 702 | 703 | Args: 704 | lengths_b: an integer vector of lengths 705 | max_length: maximum length to fill the mask 706 | 707 | Returns: 708 | a boolean array of shape (batch_size, max_length) 709 | row[i] consists of True repeated lengths_b[i] times, followed by False 710 | """ 711 | lengths_b = tf.convert_to_tensor(lengths_b) 712 | assert lengths_b.get_shape().ndims == 1 713 | mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) 714 | return mask_bt 715 | 716 | 717 | def in_session(f): 718 | @functools.wraps(f) 719 | def newfunc(*args, **kwargs): 720 | with tf.Session(): 721 | f(*args, **kwargs) 722 | return newfunc 723 | 724 | 725 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 726 | 727 | 728 | def get_placeholder(name, dtype, shape): 729 | if name in _PLACEHOLDER_CACHE: 730 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 731 | assert dtype1 == dtype and shape1 == shape 732 | return out 733 | else: 734 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 735 | _PLACEHOLDER_CACHE[name] = (out, dtype, shape) 736 | return out 737 | 738 | 739 | def get_placeholder_cached(name): 740 | return _PLACEHOLDER_CACHE[name][0] 741 | 742 | 743 | def flattenallbut0(x): 744 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 745 | 746 | 747 | def reset(): 748 | global _PLACEHOLDER_CACHE 749 | global VARIABLES 750 | _PLACEHOLDER_CACHE = {} 751 | VARIABLES = {} 752 | tf.reset_default_graph() 753 | --------------------------------------------------------------------------------
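A minimal usage sketch for the tf_util helpers above (illustrative only; it assumes TensorFlow 1.x and that the baselines package is importable, e.g. after running prediction/setup.sh):

    import numpy as np
    import tensorflow as tf
    from baselines.common import tf_util as U

    # Wrap a graph that computes a mean Huber loss into a Theano-like callable.
    td_error = tf.placeholder(tf.float32, [None], name="td_error")
    loss = U.mean(U.huber_loss(td_error))
    compute_loss = U.function(inputs=[td_error], outputs=loss)

    with U.single_threaded_session():
        U.initialize()
        # huber_loss is 0.5 * x**2 for |x| < 1 and |x| - 0.5 otherwise,
        # so the mean over [0.5, 2.0] is (0.125 + 1.5) / 2 = 0.8125.
        print(compute_loss(np.array([0.5, 2.0], dtype=np.float32)))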