├── images
│   ├── mario1.gif
│   ├── mario2.gif
│   └── vizdoom.gif
├── .gitignore
├── src
│   ├── requirements.txt
│   ├── env_wrapper.py
│   └── demo.py
├── models
│   └── download_models.sh
├── LICENSE
└── README.md

/images/mario1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/noreward-rl/master/images/mario1.gif
--------------------------------------------------------------------------------
/images/mario2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/noreward-rl/master/images/mario2.gif
--------------------------------------------------------------------------------
/images/vizdoom.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/noreward-rl/master/images/vizdoom.gif
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__
2 | **pyc
3 | **npy
4 | tmp/
5 | curiosity
6 | src/vizdoom.ini
7 | models/*.tar.gz
8 | models/output
9 | models/doom
10 | models/mario
11 | 
--------------------------------------------------------------------------------
/src/requirements.txt:
--------------------------------------------------------------------------------
1 | appdirs==1.4.3
2 | doom-py==0.0.15
3 | funcsigs==1.0.2
4 | -e git+https://github.com/openai/gym.git@c16f6c8a233324422d4faccb391d32363ef6f36e#egg=gym
5 | -e git+https://github.com/ppaquette/gym-pull.git@5b2797fd081ba5be26544983d1eba764e6d9f73b#egg=gym_pull
6 | mock==2.0.0
7 | numpy==1.12.1
8 | olefile==0.44
9 | packaging==16.8
10 | pbr==3.0.1
11 | Pillow==4.1.1
12 | pkg-resources==0.0.0
13 | ppaquette-gym-doom==0.0.6
14 | protobuf==3.1.0
15 | pyglet==1.2.4
16 | pyparsing==2.2.0
17 | requests==2.14.2
18 | six==1.10.0
19 | tensorflow==0.12.0rc1
20 | 
--------------------------------------------------------------------------------
/models/download_models.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/" && pwd )"
4 | cd $DIR
5 | 
6 | FILE=models.tar.gz
7 | URL=https://people.eecs.berkeley.edu/~pathak/noreward-rl/resources/$FILE
8 | CHECKSUM=fbf57709e4949f93fb857c945e5a2c70
9 | 
10 | if [ ! -f $FILE ]; then
11 |   echo "Downloading the curiosity-driven RL trained models (6MB)..."
12 |   wget $URL -O $FILE
13 |   echo "Unzipping..."
14 |   tar zxvf $FILE
15 |   mv models/* .
16 |   rm -rf models
17 |   echo "Downloading Done."
18 | else
19 |   echo "File already exists. Checking md5..."
20 | fi
21 | 
22 | os=`uname -s`
23 | if [ "$os" = "Linux" ]; then
24 |   checksum=`md5sum $FILE | awk '{ print $1 }'`
25 | elif [ "$os" = "Darwin" ]; then
26 |   checksum=`cat $FILE | md5`
27 | elif [ "$os" = "SunOS" ]; then
28 |   checksum=`digest -a md5 -v $FILE | awk '{ print $4 }'`
29 | fi
30 | if [ "$checksum" = "$CHECKSUM" ]; then
31 |   echo "Checksum is correct. File was correctly downloaded."
32 |   exit 0
33 | else
34 |   echo "Checksum is incorrect. DELETE and download again."
35 | fi
36 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Deepak Pathak
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | 
7 | * Redistributions of source code must retain the above copyright notice, this
8 |   list of conditions and the following disclaimer.
9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Curiosity-driven Exploration by Self-supervised Prediction ##
2 | #### In ICML 2017 [[Project Website]](http://pathak22.github.io/noreward-rl/) [[Demo Video]](http://pathak22.github.io/noreward-rl/index.html#demoVideo)
3 | 
4 | [Deepak Pathak](https://people.eecs.berkeley.edu/~pathak/), [Pulkit Agrawal](https://people.eecs.berkeley.edu/~pulkitag/), [Alexei A. Efros](https://people.eecs.berkeley.edu/~efros/), [Trevor Darrell](https://people.eecs.berkeley.edu/~trevor/)
5 | University of California, Berkeley
6 | 
7 | 
8 | 
9 | This is the code for our [ICML 2017 paper on curiosity-driven exploration for reinforcement learning](http://pathak22.github.io/noreward-rl/). The idea is to train the agent with an intrinsic, curiosity-based reward produced by an Intrinsic Curiosity Module (ICM) when external rewards from the environment are sparse. Surprisingly, ICM can be used even when no rewards are available from the environment at all, in which case the agent learns to explore purely out of curiosity: 'RL without rewards'. If you find this work useful in your research, please cite:
10 | 
11 |     @inproceedings{pathakICMl17curiosity,
12 |         Author = {Pathak, Deepak and Agrawal, Pulkit and
13 |                   Efros, Alexei A. and Darrell, Trevor},
14 |         Title = {Curiosity-driven Exploration by Self-supervised Prediction},
15 |         Booktitle = {International Conference on Machine Learning ({ICML})},
16 |         Year = {2017}
17 |     }
18 | 
19 | ### 1) Running demo
20 | 
21 | 1. Install dependencies in a virtual environment and download the trained models:
22 | ```Shell
23 | git clone -b master --single-branch https://github.com/pathak22/noreward-rl.git
24 | cd noreward-rl/
25 | virtualenv curiosity
26 | source $PWD/curiosity/bin/activate
27 | pip install -r src/requirements.txt
28 | bash ./models/download_models.sh
29 | ```
30 | 
31 | 2. Run the demo:
32 | ```Shell
33 | cd noreward-rl/src/
34 | python demo.py --ckpt ../models/doom/doom_ICM
35 | # By default, this runs the Doom "dense-reward" setup. To run the "sparse" or "very-sparse" reward setups, replace the relevant Doom scenario files as follows:
36 | # mv noreward-rl/curiosity/local/lib/python2.7/site-packages/doom_py/scenarios/my_way_home.wad noreward-rl/curiosity/local/lib/python2.7/site-packages/doom_py/scenarios/my_way_home_orig.wad
37 | # cp noreward-rl/models/doom/wads/my_way_home_fixed13.wad noreward-rl/curiosity/local/lib/python2.7/site-packages/doom_py/scenarios/my_way_home.wad
38 | ```
39 | 
40 | ### 2) Training code
41 | 
42 | [To be released soon. Stay tuned!]
43 | 
--------------------------------------------------------------------------------
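The intrinsic reward described in the README is, in the paper, the prediction error of a learned forward model in feature space: an encoder maps the observation to features phi(s), a forward model predicts phi(s_t+1) from phi(s_t) and the action a_t, and the scaled squared prediction error is used as the curiosity reward. Below is only a minimal NumPy sketch of that formula, with a toy linear encoder and forward model and arbitrary sizes; it is illustrative and is not code from this repository or from the yet-to-be-released training code.

```Python
import numpy as np

rng = np.random.RandomState(0)
feat_dim, num_actions, eta = 32, 4, 0.01   # arbitrary sizes/scale for illustration

# Toy stand-ins for the learned feature encoder phi and the forward model.
W_enc = rng.randn(42 * 42, feat_dim) * 0.01                  # phi: 42x42 frame -> feature vector
W_fwd = rng.randn(feat_dim + num_actions, feat_dim) * 0.01   # (phi(s_t), a_t) -> predicted phi(s_t+1)

def phi(obs):
    """Encode a 42x42 observation into a feature vector."""
    return obs.reshape(-1).dot(W_enc)

def intrinsic_reward(obs_t, action_t, obs_tp1):
    """Curiosity reward: scaled squared error of the forward model's feature prediction."""
    a_onehot = np.eye(num_actions)[action_t]
    phi_pred = np.concatenate([phi(obs_t), a_onehot]).dot(W_fwd)
    return eta / 2.0 * np.sum((phi_pred - phi(obs_tp1)) ** 2)

# Example: two random frames and action index 2.
print(intrinsic_reward(rng.rand(42, 42), 2, rng.rand(42, 42)))
```

In the actual method the encoder, the forward model, and an inverse dynamics model are trained jointly with the policy; the random weights here only demonstrate the shapes and the reward formula.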
/src/env_wrapper.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is derived from https://github.com/shelhamer/ourl/envs.py
3 | Originally written by Evan Shelhamer and modified by Deepak Pathak
4 | """
5 | from __future__ import print_function
6 | import numpy as np
7 | from collections import deque
8 | from PIL import Image
9 | from gym.spaces.box import Box
10 | import gym
11 | 
12 | class BufferedObsEnv(gym.ObservationWrapper):
13 |     """Buffer observations and stack e.g. for frame skipping.
14 | 
15 |     n is the length of the buffer, and number of observations stacked.
16 |     skip is the number of steps between buffered observations (min=1).
17 | 
18 |     n.b. first obs is the oldest, last obs is the newest.
19 |          the buffer is zeroed out on reset.
20 |          *must* call reset() for init!
21 |     """
22 |     def __init__(self, env=None, n=4, skip=4, shape=(84, 84),
23 |                  channel_last=True):
24 |         super(BufferedObsEnv, self).__init__(env)
25 |         self.obs_shape = shape
26 |         # most recent raw observations (for max pooling across time steps)
27 |         self.obs_buffer = deque(maxlen=2)
28 |         self.n = n
29 |         self.skip = skip
30 |         self.buffer = deque(maxlen=self.n)
31 |         self.counter = 0  # init and reset should agree on this
32 |         shape = shape + (n,) if channel_last else (n,) + shape
33 |         self.observation_space = Box(0.0, 255.0, shape)
34 |         self.ch_axis = -1 if channel_last else 0
35 |         self.scale = 1.0 / 255
36 |         self.observation_space.high[...] = 1.0
37 | 
38 |     def _step(self, action):
39 |         obs, reward, done, info = self.env.step(action)
40 |         return self._observation(obs), reward, done, info
41 | 
42 |     def _observation(self, obs):
43 |         obs = self._convert(obs)
44 |         self.counter += 1
45 |         if self.counter % self.skip == 0:
46 |             self.buffer.append(obs)
47 |         obsNew = np.stack(self.buffer, axis=self.ch_axis)
48 |         return obsNew.astype(np.float32) * self.scale
49 | 
50 |     def _reset(self):
51 |         """Clear buffer and re-fill by duplicating the first observation."""
52 |         self.obs_buffer.clear()
53 |         obs = self._convert(self.env.reset())
54 |         self.buffer.clear()
55 |         self.counter = 0
56 |         for _ in range(self.n - 1):
57 |             self.buffer.append(np.zeros_like(obs))
58 |         self.buffer.append(obs)
59 |         obsNew = np.stack(self.buffer, axis=self.ch_axis)
60 |         return obsNew.astype(np.float32) * self.scale
61 | 
62 |     def _convert(self, obs):
63 |         self.obs_buffer.append(obs)
64 |         max_frame = np.max(np.stack(self.obs_buffer), axis=0)
65 |         intensity_frame = self._rgb2y(max_frame).astype(np.uint8)
66 |         small_frame = np.array(Image.fromarray(intensity_frame).resize(
67 |             self.obs_shape, resample=Image.BILINEAR), dtype=np.uint8)
68 |         return small_frame
69 | 
70 |     def _rgb2y(self, im):
71 |         """Converts an RGB image to a Y image (as in YUV).
72 | 
73 |         These coefficients are taken from the torch/image library.
74 |         Beware: these are more critical than you might think, as the
75 |         monochromatic contrast can be surprisingly low.
76 |         """
77 |         if len(im.shape) < 3:
78 |             return im
79 |         return np.sum(im * [0.299, 0.587, 0.114], axis=2)
80 | 
81 | 
82 | class NoNegativeRewardEnv(gym.RewardWrapper):
83 |     """Clip reward in negative direction."""
84 |     def __init__(self, env=None, neg_clip=0.0):
85 |         super(NoNegativeRewardEnv, self).__init__(env)
86 |         self.neg_clip = neg_clip
87 | 
88 |     def _reward(self, reward):
89 |         new_reward = self.neg_clip if reward < self.neg_clip else reward
90 |         return new_reward
91 | 
--------------------------------------------------------------------------------
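To make the stacking behaviour of `BufferedObsEnv` concrete: with `n=4`, `skip=1` and `shape=(42, 42)` (the settings `demo.py` uses below), each observation comes out as a float32 array of shape `(42, 42, 4)` scaled to [0, 1], and `reset()` pads the stack with zero frames. Here is a short usage sketch, not part of the repository, with a dummy random-frame environment standing in for Doom; it assumes the 2017 gym API pinned in requirements.txt (environments implement `_reset`/`_step`) and is meant to be run from `src/` so that `env_wrapper` is importable.

```Python
import gym
import numpy as np
from gym.spaces.box import Box

import env_wrapper

class RandomFrameEnv(gym.Env):
    """Dummy environment emitting random 120x160 RGB frames (a stand-in for Doom)."""
    def __init__(self):
        self.observation_space = Box(0, 255, (120, 160, 3))
        self.action_space = gym.spaces.Discrete(3)

    def _reset(self):
        return np.random.randint(0, 256, (120, 160, 3)).astype(np.uint8)

    def _step(self, action):
        return self._reset(), 1.0, False, {}

env = env_wrapper.NoNegativeRewardEnv(RandomFrameEnv())    # clip rewards below 0.0
env = env_wrapper.BufferedObsEnv(env, n=4, skip=1, shape=(42, 42))
obs = env.reset()                                          # *must* be called first (see docstring)
print(obs.shape, obs.dtype, obs.min(), obs.max())          # (42, 42, 4) float32, values in [0, 1]
obs, reward, done, info = env.step(0)
```

With `skip=1`, as in `demo.py`, every frame enters the stack; larger `skip` values subsample frames, while `obs_buffer` still max-pools consecutive raw frames inside `_convert`.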
/src/demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | import tensorflow as tf
4 | import gym
5 | import numpy as np
6 | import argparse
7 | import logging
8 | logger = logging.getLogger(__name__)
9 | logger.setLevel(logging.INFO)
10 | 
11 | def create_doom(record=False, outdir=None):
12 |     from ppaquette_gym_doom import wrappers
13 |     import env_wrapper
14 |     env = gym.make('ppaquette/DoomMyWayHome-v0')
15 |     modewrapper = wrappers.SetPlayingMode('algo')
16 |     obwrapper = wrappers.SetResolution('160x120')
17 |     acwrapper = wrappers.ToDiscrete('minimal')
18 |     env = modewrapper(obwrapper(acwrapper(env)))
19 | 
20 |     if record:
21 |         env = gym.wrappers.Monitor(env, outdir, force=True)
22 |     fshape = (42, 42)
23 | 
24 |     env.seed(None)
25 |     env = env_wrapper.NoNegativeRewardEnv(env)
26 |     env = env_wrapper.BufferedObsEnv(env, skip=1, shape=fshape)
27 |     return env
28 | 
29 | 
30 | def inference(args):
31 |     """
32 |     Restore policy weights and run inference.
33 |     """
34 |     # define environment
35 |     env = create_doom(args.record, outdir=args.outdir)
36 |     numaction = env.action_space.n
37 | 
38 |     with tf.device("/cpu:0"):
39 |         config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
40 |         with tf.Session(config=config) as sess:
41 |             logger.info("Restoring trainable global parameters.")
42 |             saver = tf.train.import_meta_graph(args.ckpt + '.meta')
43 |             saver.restore(sess, args.ckpt)
44 | 
45 |             probs = tf.get_collection("probs")[0]
46 |             sample = tf.get_collection("sample")[0]
47 |             vf = tf.get_collection("vf")[0]
48 |             state_out_0 = tf.get_collection("state_out_0")[0]
49 |             state_out_1 = tf.get_collection("state_out_1")[0]
50 | 
51 |             last_state = env.reset()
52 |             if args.render or args.record:
53 |                 env.render()
54 |             last_features = np.zeros((1, 256), np.float32); last_features = [last_features, last_features]
55 |             length = 0
56 |             rewards = 0
57 |             mario_distances = np.zeros((args.num_episodes,))
58 |             for i in range(args.num_episodes):
59 |                 print("Starting episode %d" % (i + 1))
60 | 
61 |                 if args.random:
62 |                     print('I am a random policy!')
63 |                 else:
64 |                     if args.greedy:
65 |                         print('I am a greedy policy!')
66 |                     else:
67 |                         print('I am a sampled policy!')
68 |                 while True:
69 |                     # run policy
70 |                     fetched = sess.run([probs, sample, vf, state_out_0, state_out_1],
71 |                                        {"global/x:0": [last_state], "global/c_in:0": last_features[0], "global/h_in:0": last_features[1]})
72 |                     prob_action, action, value_, features = fetched[0], fetched[1], fetched[2], fetched[3:]
73 | 
74 |                     # run environment
75 |                     if args.random:
76 |                         stepAct = np.random.randint(0, numaction)  # random policy
77 |                     else:
78 |                         if args.greedy:
79 |                             stepAct = prob_action.argmax()  # greedy policy
80 |                         else:
81 |                             stepAct = action.argmax()
82 |                     state, reward, terminal, info = env.step(stepAct)
83 | 
84 |                     # update stats
85 |                     length += 1
86 |                     rewards += reward
87 |                     last_state = state
88 |                     last_features = features
89 |                     if args.render or args.record:
90 |                         env.render()
91 | 
92 |                     timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
93 |                     if timestep_limit is None: timestep_limit = env.spec.timestep_limit
94 |                     if terminal or length >= timestep_limit:
95 |                         if length >= timestep_limit or not env.metadata.get('semantics.autoreset'):
96 |                             last_state = env.reset()
97 |                         last_features = np.zeros((1, 256), np.float32); last_features = [last_features, last_features]
98 |                         print("Episode finished. Sum of rewards: %.2f. Length: %d." % (rewards, length))
99 |                         length = 0
100 |                         rewards = 0
101 |                         if args.render or args.record:
102 |                             env.render()
103 |                         break
104 | 
105 |             logger.info('Finished %d true episodes.', args.num_episodes)
106 |             env.close()
107 | 
108 | 
109 | def main(_):
110 |     parser = argparse.ArgumentParser(description=None)
111 |     parser.add_argument('--ckpt', default="../models/doom/doom_ICM", help='checkpoint name')
112 |     parser.add_argument('--outdir', default="../models/output", help='Output log directory')
113 |     parser.add_argument('--env-id', default="doom", help='Environment id')
114 |     parser.add_argument('--record', action='store_true', help="Record the policy running video")
115 |     parser.add_argument('--render', action='store_true',
116 |                         help="Render the gym environment video online")
117 |     parser.add_argument('--num-episodes', type=int, default=2, help="Number of episodes to run")
118 |     parser.add_argument('--greedy', action='store_true',
119 |                         help="By default the policy is sampled; this option takes the argmax (greedy) action instead.")
120 |     parser.add_argument('--random', action='store_true',
121 |                         help="By default the policy is sampled; this option takes uniformly random actions instead.")
122 |     args = parser.parse_args()
123 |     inference(args)
124 | 
125 | if __name__ == "__main__":
126 |     tf.app.run()
127 | 
--------------------------------------------------------------------------------
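The checkpoint-loading pattern used by `inference()` above reduces to a few lines. The sketch below is a stripped-down illustration, not an additional script in the repository; it assumes TensorFlow 0.12 from requirements.txt, that the downloaded `doom_ICM` checkpoint is present at the default path, and that observations have the `(1, 42, 42, 4)` shape produced by `BufferedObsEnv` with the demo settings.

```Python
import numpy as np
import tensorflow as tf

ckpt = "../models/doom/doom_ICM"   # same default path as demo.py
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    saver = tf.train.import_meta_graph(ckpt + '.meta')   # rebuild the graph from the .meta file
    saver.restore(sess, ckpt)                            # load the trained weights
    sample = tf.get_collection("sample")[0]              # op whose output demo.py argmaxes into an action
    feeds = {"global/x:0": np.zeros((1, 42, 42, 4), np.float32),   # one stacked observation
             "global/c_in:0": np.zeros((1, 256), np.float32),      # LSTM cell state
             "global/h_in:0": np.zeros((1, 256), np.float32)}      # LSTM hidden state
    action = sess.run(sample, feeds)
    print(action.argmax())
```

For normal use, the same functionality is driven from the command line through the `--ckpt`, `--render`, `--record`/`--outdir`, `--greedy`, `--random`, and `--num-episodes` flags parsed in `main()` above.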