├── README.md ├── Statistical-Arbitrage-on-the-SP500.pdf ├── atari_wrappers.py ├── dqn.py ├── dqn_utils.py ├── gym_env ├── __init__.py └── trading │ ├── __init__.py │ ├── policy_gradient.py │ ├── test_policy_gradient.py │ ├── test_trading_env.py │ └── trading.py ├── prediction-using-RL.pdf ├── run_dqn_atari.py └── run_dqn_ram.py /README.md: -------------------------------------------------------------------------------- 1 | # Stock Trading with Deep Reinforcement Learning 2 | 3 | -------------------------------------------------------------------------------- /Statistical-Arbitrage-on-the-SP500.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songyunfan/DeepRLTrading/8e8b9fae27e58aa574b51f33b1588d9f9cbc4ba7/Statistical-Arbitrage-on-the-SP500.pdf -------------------------------------------------------------------------------- /atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 
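A minimal usage sketch (illustrative; the Breakout id is just one ALE environment registered with a NoFrameskip variant):

    env = EpisodicLifeEnv(gym.make('BreakoutNoFrameskip-v4'))
    obs = env.reset()                    # full ALE reset
    obs, rew, done, info = env.step(0)   # done turns True whenever a life is lost
    obs = env.reset()                    # cheap no-op step unless was_real_done is True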
66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | import os 11 | 12 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 13 | 14 | def learn(env, 15 | q_func, 16 | optimizer_spec, 17 | session, 18 | exploration=LinearSchedule(1000000, 0.1), 19 | stopping_criterion=None, 20 | 
replay_buffer_size=1000000, 21 | batch_size=32, 22 | gamma=0.99, 23 | learning_starts=50000, 24 | learning_freq=4, 25 | frame_history_len=4, 26 | target_update_freq=10000, 27 | grad_norm_clipping=10): 28 | """Run Deep Q-learning algorithm. 29 | 30 | You can specify your own convnet using q_func. 31 | 32 | All schedules are w.r.t. total number of steps taken in the environment. 33 | 34 | Parameters 35 | ---------- 36 | env: gym.Env 37 | gym environment to train on. 38 | q_func: function 39 | Model to use for computing the q function. It should accept the 40 | following named arguments: 41 | img_in: tf.Tensor 42 | tensorflow tensor representing the input image 43 | num_actions: int 44 | number of actions 45 | scope: str 46 | scope in which all the model related variables 47 | should be created 48 | reuse: bool 49 | whether previously created variables should be reused. 50 | optimizer_spec: OptimizerSpec 51 | Specifying the constructor and kwargs, as well as learning rate schedule 52 | for the optimizer 53 | session: tf.Session 54 | tensorflow session to use. 55 | exploration: rl_algs.deepq.utils.schedules.Schedule 56 | schedule for probability of chosing random action. 57 | stopping_criterion: (env, t) -> bool 58 | should return true when it's ok for the RL algorithm to stop. 59 | takes in env and the number of steps executed so far. 60 | replay_buffer_size: int 61 | How many memories to store in the replay buffer. 62 | batch_size: int 63 | How many transitions to sample each time experience is replayed. 64 | gamma: float 65 | Discount Factor 66 | learning_starts: int 67 | After how many environment steps to start replaying experiences 68 | learning_freq: int 69 | How many steps of environment to take between every experience replay 70 | frame_history_len: int 71 | How many past frames to include as input to the model. 72 | target_update_freq: int 73 | How many experience replay rounds (not steps!) to perform between 74 | each update to the target Q network 75 | grad_norm_clipping: float or None 76 | If not None gradients' norms are clipped to this value. 77 | """ 78 | assert type(env.observation_space) == gym.spaces.Box 79 | assert type(env.action_space) == gym.spaces.Discrete 80 | 81 | ############### 82 | # BUILD MODEL # 83 | ############### 84 | 85 | if len(env.observation_space.shape) == 1: 86 | # This means we are running on low-dimensional observations (e.g. RAM) 87 | input_shape = env.observation_space.shape 88 | else: 89 | img_h, img_w, img_c = env.observation_space.shape 90 | input_shape = (img_h, img_w, frame_history_len * img_c) 91 | num_actions = env.action_space.n 92 | 93 | # set up placeholders 94 | # placeholder for current observation (or state) 95 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 96 | # placeholder for current action 97 | act_t_ph = tf.placeholder(tf.int32, [None]) 98 | # placeholder for current reward 99 | rew_t_ph = tf.placeholder(tf.float32, [None]) 100 | # placeholder for next observation (or state) 101 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 102 | # placeholder for end of episode mask 103 | # this value is 1 if the next state corresponds to the end of an episode, 104 | # in which case there is no Q-value at the next state; at the end of an 105 | # episode, only the current state reward contributes to the target, not the 106 | # next state Q-value (i.e. 
target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 107 | done_mask_ph = tf.placeholder(tf.float32, [None]) 108 | 109 | # casting to float on GPU ensures lower data transfer times. 110 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 111 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 112 | 113 | # Here, you should fill in your own code to compute the Bellman error. This requires 114 | # evaluating the current and next Q-values and constructing the corresponding error. 115 | # TensorFlow will differentiate this error for you, you just need to pass it to the 116 | # optimizer. See assignment text for details. 117 | # Your code should produce one scalar-valued tensor: total_error 118 | # This will be passed to the optimizer in the provided code below. 119 | # Your code should also produce two collections of variables: 120 | # q_func_vars 121 | # target_q_func_vars 122 | # These should hold all of the variables of the Q-function network and target network, 123 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 124 | # For example, you can create your Q-function network with the scope "q_func" like this: 125 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 126 | # And then you can obtain the variables like this: 127 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 128 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 129 | ###### 130 | 131 | # YOUR CODE HERE 132 | # output of Q function from the model 133 | Q_st_ph = q_func( obs_t_float, num_actions, scope="q_func", reuse=False) 134 | indx = tf.one_hot(act_t_ph, num_actions) 135 | Q_stat_ph = tf.reduce_sum(Q_st_ph*indx,axis = 1) 136 | 137 | # compute the action for epsilon greedy exploration 138 | #argmax_Q = tf.argmax(Q_st_ph, axis=1) 139 | #args = tf.one_hot(argmax_Q, num_actions) 140 | #expl = exploration.value(np.random.rand(1)) 141 | #sy_actions = args*(1.-expl)+(1.-args)*expl/(num_actions-1.) 
142 | # choose based on probability 143 | #sy_actions = tf.multinomial(sy_actions, 1) 144 | #sy_actions = tf.reshape(sy_actions, [-1]) 145 | # variables 146 | q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 147 | 148 | # target function using the rewards 149 | Q_st_php = q_func( obs_tp1_float, num_actions, scope="target_q_func", reuse=False) 150 | y_ = rew_t_ph + tf.multiply( 1.-done_mask_ph, gamma*tf.reduce_max(Q_st_php, axis=1)) 151 | # variables 152 | target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func') 153 | 154 | total_error = tf.nn.l2_loss(Q_stat_ph-y_) 155 | 156 | 157 | ###### 158 | 159 | # construct optimization op (with gradient clipping) 160 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 161 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 162 | train_fn = minimize_and_clip(optimizer, total_error, 163 | var_list=q_func_vars, clip_val=grad_norm_clipping) 164 | 165 | # update_target_fn will be called periodically to copy Q network to target Q network 166 | update_target_fn = [] 167 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 168 | sorted(target_q_func_vars, key=lambda v: v.name)): 169 | update_target_fn.append(var_target.assign(var)) 170 | update_target_fn = tf.group(*update_target_fn) 171 | 172 | # construct the replay buffer 173 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 174 | 175 | ############### 176 | # RUN ENV # 177 | ############### 178 | model_initialized = False 179 | num_param_updates = 0 180 | mean_episode_reward = -float('nan') 181 | best_mean_episode_reward = -float('inf') 182 | last_obs = env.reset() 183 | LOG_EVERY_N_STEPS = 10000 184 | 185 | mr = 'mr.txt' 186 | bmr = 'bmr.txt' 187 | tmr = 't.txt' 188 | if os.path.isfile(mr): 189 | os.remove(mr) 190 | else: 191 | f = open(mr,"w") 192 | f.close() 193 | if os.path.isfile(bmr): 194 | os.remove(bmr) 195 | else: 196 | f = open(bmr,"w") 197 | f.close() 198 | if os.path.isfile(tmr): 199 | os.remove(tmr) 200 | else: 201 | f = open(tmr,"w") 202 | f.close() 203 | 204 | for t in itertools.count(): 205 | ### 1. Check stopping criterion 206 | if stopping_criterion is not None and stopping_criterion(env, t): 207 | break 208 | ### 2. Step the env and store the transition 209 | # At this point, "last_obs" contains the latest observation that was 210 | # recorded from the simulator. Here, your code needs to store this 211 | # observation and its outcome (reward, next observation, etc.) into 212 | # the replay buffer while stepping the simulator forward one step. 213 | # At the end of this block of code, the simulator should have been 214 | # advanced one step, and the replay buffer should contain one more 215 | # transition. 216 | # Specifically, last_obs must point to the new latest observation. 217 | # Useful functions you'll need to call: 218 | # obs, reward, done, info = env.step(action) 219 | # this steps the environment forward one step 220 | # obs = env.reset() 221 | # this resets the environment if you reached an episode boundary. 222 | # Don't forget to call env.reset() to get a new observation if done 223 | # is true!! 224 | # Note that you cannot use "last_obs" directly as input 225 | # into your network, since it needs to be processed to include context 226 | # from previous frames. You should check out the replay buffer 227 | # implementation in dqn_utils.py to see what functionality the replay 228 | # buffer exposes. 
The replay buffer has a function called 229 | # encode_recent_observation that will take the latest observation 230 | # that you pushed into the buffer and compute the corresponding 231 | # input that should be given to a Q network by appending some 232 | # previous frames. 233 | # Don't forget to include epsilon greedy exploration! 234 | # And remember that the first time you enter this loop, the model 235 | # may not yet have been initialized (but of course, the first step 236 | # might as well be random, since you haven't trained your net...) 237 | 238 | ##### 239 | 240 | # YOUR CODE HERE 241 | # add the last_obs 242 | idx = replay_buffer.store_frame(last_obs) 243 | # extract obs from the replay_buffer 244 | obs_ = [replay_buffer.encode_recent_observation()] 245 | 246 | # action from the model 247 | if not model_initialized: 248 | action = np.random.randint(num_actions) 249 | else: 250 | # epsilon greedy exploration 251 | Q_val = session.run(Q_st_ph, feed_dict={ obs_t_ph: obs_}) 252 | 253 | e = exploration.value(t) 254 | 255 | if np.random.rand(1) <= e: 256 | action = np.random.randint(num_actions) 257 | else: 258 | action = np.argmax(Q_val) 259 | 260 | # step the environment, store the outcome and advance last_obs 261 | obs, reward, done, info = env.step(action) 262 | replay_buffer.store_effect(idx, action, reward, done) 263 | if done: 264 | obs = env.reset() 265 | last_obs = obs 266 | 267 | ##### 268 | 269 | # at this point, the environment has been advanced one step (and 270 | # reset if done was true), and last_obs points to the new latest 271 | # observation 272 | 273 | ### 3. Perform experience replay and train the network. 274 | # note that this is only done if the replay buffer contains enough samples 275 | # for us to learn something useful -- until then, the model will not be 276 | # initialized and random actions should be taken 277 | if (t > learning_starts and 278 | t % learning_freq == 0 and 279 | replay_buffer.can_sample(batch_size)): 280 | # Here, you should perform training. Training consists of four steps: 281 | # 3.a: use the replay buffer to sample a batch of transitions (see the 282 | # replay buffer code for function definition, each batch that you sample 283 | # should consist of current observations, current actions, rewards, 284 | # next observations, and done indicator). 285 | # 3.b: initialize the model if it has not been initialized yet; to do 286 | # that, call 287 | # initialize_interdependent_variables(session, tf.global_variables(), { 288 | # obs_t_ph: obs_t_batch, 289 | # obs_tp1_ph: obs_tp1_batch, 290 | # }) 291 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 292 | # the current and next time step. The boolean variable model_initialized 293 | # indicates whether or not the model has been initialized. 294 | # Remember that you have to update the target network too (see 3.d)! 295 | # 3.c: train the model. To do this, you'll need to use the train_fn and 296 | # total_error ops that were created earlier: total_error is what you 297 | # created to compute the total Bellman error in a batch, and train_fn 298 | # will actually perform a gradient step and update the network parameters 299 | # to reduce total_error.
When calling session.run on these you'll need to 300 | # populate the following placeholders: 301 | # obs_t_ph 302 | # act_t_ph 303 | # rew_t_ph 304 | # obs_tp1_ph 305 | # done_mask_ph 306 | # (this is needed for computing total_error) 307 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 308 | # (this is needed by the optimizer to choose the learning rate) 309 | # 3.d: periodically update the target network by calling 310 | # session.run(update_target_fn) 311 | # you should update every target_update_freq steps, and you may find the 312 | # variable num_param_updates useful for this (it was initialized to 0) 313 | ##### 314 | 315 | # YOUR CODE HERE 316 | 317 | # 3.a sample batches 318 | obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = replay_buffer.sample(batch_size) 319 | 320 | # 3.b initialize the model 321 | if not model_initialized: 322 | initialize_interdependent_variables(session, tf.global_variables(), { 323 | obs_t_ph: obs_t_batch, 324 | obs_tp1_ph: obs_tp1_batch, 325 | }) 326 | model_initialized = True 327 | 328 | #3.c train the model 329 | feed_dict = {obs_t_ph: obs_t_batch, act_t_ph: act_batch, rew_t_ph: rew_batch, 330 | obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask, 331 | learning_rate : optimizer_spec.lr_schedule.value(t)} 332 | 333 | session.run(train_fn, feed_dict = feed_dict) 334 | 335 | # update the network 336 | if t%target_update_freq == 0: 337 | session.run(update_target_fn) 338 | ##### 339 | ### 4. Log progress 340 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 341 | if len(episode_rewards) > 0: 342 | mean_episode_reward = np.mean(episode_rewards[-100:]) 343 | if len(episode_rewards) > 100: 344 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 345 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 346 | print("Timestep %d" % (t,)) 347 | print("mean reward (100 episodes) %f" % mean_episode_reward) 348 | print("best mean reward %f" % best_mean_episode_reward) 349 | print("episodes %d" % len(episode_rewards)) 350 | print("exploration %f" % exploration.value(t)) 351 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 352 | sys.stdout.flush() 353 | 354 | f = open(mr,"a") 355 | f.write(str(mean_episode_reward)+'\n') 356 | f.close() 357 | f = open(bmr,"a") 358 | f.write(str(best_mean_episode_reward)+'\n') 359 | f.close() 360 | f = open(tmr,"a") 361 | f.write(str(t)+'\n') 362 | f.close() 363 | 364 | -------------------------------------------------------------------------------- /dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 
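For example (illustrative), drawing three distinct integers:

    sample_n_unique(lambda: random.randint(0, 9), 3)   # e.g. [4, 1, 7]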
19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 
117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of all the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimize `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensuring the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happen if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or external precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(currentenv, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 177 | 178 | The specific memory optimizations used here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (it is actually most time-efficient 182 | to cast them back to float32 on the GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the typical Atari Deep RL use case of a buffer with 1M frames, the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning a frame of zeros at the beginning 190 | of the episode, when there are fewer frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of past frames to be retrieved for each observation.
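A minimal usage sketch (sizes, shapes and variable names are illustrative):

    buffer = ReplayBuffer(size=100000, frame_history_len=4)
    idx = buffer.store_frame(frame)                # frame: (84, 84, 1) uint8
    buffer.store_effect(idx, action, reward, done)
    if buffer.can_sample(32):
        obs_t, act, rew, obs_tp1, done_mask = buffer.sample(32)   # obs_t: (32, 84, 84, 4)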
200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | 226 | def sample(self, batch_size): 227 | """Sample `batch_size` different transitions. 228 | 229 | i-th sample transition is the following: 230 | 231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 232 | after which reward `rew_batch[i]` was received and subsequent 233 | observation next_obs_batch[i] was observed, unless the epsiode 234 | was done which is represented by `done_mask[i]` which is equal 235 | to 1 if episode has ended as a result of that action. 236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample. 241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 
if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundary of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has the potential to save about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of the action taken after observing the frame stored 332 | at index idx. The reason `store_frame` and `store_effect` are broken 333 | up into two functions is so that one can call `encode_recent_observation` 334 | in between. 335 | 336 | Parameters 337 | ---------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the action was performed. 344 | done: bool 345 | True if the episode was finished after performing that action.
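The intended call order within a single environment step is roughly the following (illustrative; `choose_action` stands in for whatever epsilon-greedy policy the caller uses):

    idx = buffer.store_frame(last_obs)
    q_input = buffer.encode_recent_observation()   # frame-stacked input for the Q-network
    action = choose_action(q_input)
    obs, reward, done, _ = env.step(action)
    buffer.store_effect(idx, action, reward, done)
    last_obs = env.reset() if done else obs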
346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /gym_env/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import registry, register, make, spec 2 | 3 | # Added for CS 294-112 4 | # ---------------------------------------- 5 | register( 6 | id='trading-v0', 7 | entry_point='gym.envs.trading:TradingEnv', 8 | timestep_limit=1000, 9 | ) 10 | 11 | # Algorithmic 12 | # ---------------------------------------- 13 | 14 | register( 15 | id='Copy-v0', 16 | entry_point='gym.envs.algorithmic:CopyEnv', 17 | max_episode_steps=200, 18 | reward_threshold=25.0, 19 | ) 20 | 21 | register( 22 | id='RepeatCopy-v0', 23 | entry_point='gym.envs.algorithmic:RepeatCopyEnv', 24 | max_episode_steps=200, 25 | reward_threshold=75.0, 26 | ) 27 | 28 | register( 29 | id='ReversedAddition-v0', 30 | entry_point='gym.envs.algorithmic:ReversedAdditionEnv', 31 | kwargs={'rows' : 2}, 32 | max_episode_steps=200, 33 | reward_threshold=25.0, 34 | ) 35 | 36 | register( 37 | id='ReversedAddition3-v0', 38 | entry_point='gym.envs.algorithmic:ReversedAdditionEnv', 39 | kwargs={'rows' : 3}, 40 | max_episode_steps=200, 41 | reward_threshold=25.0, 42 | ) 43 | 44 | register( 45 | id='DuplicatedInput-v0', 46 | entry_point='gym.envs.algorithmic:DuplicatedInputEnv', 47 | max_episode_steps=200, 48 | reward_threshold=9.0, 49 | ) 50 | 51 | register( 52 | id='Reverse-v0', 53 | entry_point='gym.envs.algorithmic:ReverseEnv', 54 | max_episode_steps=200, 55 | reward_threshold=25.0, 56 | ) 57 | 58 | # Classic 59 | # ---------------------------------------- 60 | 61 | register( 62 | id='CartPole-v0', 63 | entry_point='gym.envs.classic_control:CartPoleEnv', 64 | max_episode_steps=200, 65 | reward_threshold=195.0, 66 | ) 67 | 68 | register( 69 | id='CartPole-v1', 70 | entry_point='gym.envs.classic_control:CartPoleEnv', 71 | max_episode_steps=500, 72 | reward_threshold=475.0, 73 | ) 74 | 75 | register( 76 | id='MountainCar-v0', 77 | entry_point='gym.envs.classic_control:MountainCarEnv', 78 | max_episode_steps=200, 79 | reward_threshold=-110.0, 80 | ) 81 | 82 | register( 83 | id='MountainCarContinuous-v0', 84 | entry_point='gym.envs.classic_control:Continuous_MountainCarEnv', 85 | max_episode_steps=999, 86 | reward_threshold=90.0, 87 | ) 88 | 89 | register( 90 | id='Pendulum-v0', 91 | entry_point='gym.envs.classic_control:PendulumEnv', 92 | max_episode_steps=200, 93 | ) 94 | 95 | register( 96 | id='Acrobot-v1', 97 | entry_point='gym.envs.classic_control:AcrobotEnv', 98 | max_episode_steps=500, 99 | ) 100 | 101 | # Box2d 102 | # ---------------------------------------- 103 | 104 | register( 105 | id='LunarLander-v2', 106 | entry_point='gym.envs.box2d:LunarLander', 107 | max_episode_steps=1000, 108 | reward_threshold=200, 109 | ) 110 | 111 | register( 112 | id='LunarLanderContinuous-v2', 113 | entry_point='gym.envs.box2d:LunarLanderContinuous', 114 | max_episode_steps=1000, 115 | reward_threshold=200, 116 | ) 117 | 118 | register( 119 | id='BipedalWalker-v2', 120 | entry_point='gym.envs.box2d:BipedalWalker', 121 | max_episode_steps=1600, 122 | reward_threshold=300, 123 | ) 124 | 125 | register( 126 | id='BipedalWalkerHardcore-v2', 127 | entry_point='gym.envs.box2d:BipedalWalkerHardcore', 128 | max_episode_steps=2000, 129 | reward_threshold=300, 130 | ) 131 | 132 | register( 133 | id='CarRacing-v0', 134 | 
entry_point='gym.envs.box2d:CarRacing', 135 | max_episode_steps=1000, 136 | reward_threshold=900, 137 | ) 138 | 139 | # Toy Text 140 | # ---------------------------------------- 141 | 142 | register( 143 | id='Blackjack-v0', 144 | entry_point='gym.envs.toy_text:BlackjackEnv', 145 | ) 146 | 147 | register( 148 | id='KellyCoinflip-v0', 149 | entry_point='gym.envs.toy_text:KellyCoinflipEnv', 150 | reward_threshold=246.61, 151 | ) 152 | register( 153 | id='KellyCoinflipGeneralized-v0', 154 | entry_point='gym.envs.toy_text:KellyCoinflipGeneralizedEnv', 155 | ) 156 | 157 | register( 158 | id='FrozenLake-v0', 159 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 160 | kwargs={'map_name' : '4x4'}, 161 | max_episode_steps=100, 162 | reward_threshold=0.78, # optimum = .8196 163 | ) 164 | 165 | register( 166 | id='FrozenLake8x8-v0', 167 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 168 | kwargs={'map_name' : '8x8'}, 169 | max_episode_steps=200, 170 | reward_threshold=0.99, # optimum = 1 171 | ) 172 | 173 | register( 174 | id='CliffWalking-v0', 175 | entry_point='gym.envs.toy_text:CliffWalkingEnv', 176 | ) 177 | 178 | register( 179 | id='NChain-v0', 180 | entry_point='gym.envs.toy_text:NChainEnv', 181 | max_episode_steps=1000, 182 | ) 183 | 184 | register( 185 | id='Roulette-v0', 186 | entry_point='gym.envs.toy_text:RouletteEnv', 187 | max_episode_steps=100, 188 | ) 189 | 190 | register( 191 | id='Taxi-v2', 192 | entry_point='gym.envs.toy_text.taxi:TaxiEnv', 193 | reward_threshold=8, # optimum = 8.46 194 | max_episode_steps=200, 195 | ) 196 | 197 | register( 198 | id='GuessingGame-v0', 199 | entry_point='gym.envs.toy_text.guessing_game:GuessingGame', 200 | max_episode_steps=200, 201 | ) 202 | 203 | register( 204 | id='HotterColder-v0', 205 | entry_point='gym.envs.toy_text.hotter_colder:HotterColder', 206 | max_episode_steps=200, 207 | ) 208 | 209 | # Mujoco 210 | # ---------------------------------------- 211 | 212 | # 2D 213 | 214 | register( 215 | id='Reacher-v1', 216 | entry_point='gym.envs.mujoco:ReacherEnv', 217 | max_episode_steps=50, 218 | reward_threshold=-3.75, 219 | ) 220 | 221 | register( 222 | id='Pusher-v0', 223 | entry_point='gym.envs.mujoco:PusherEnv', 224 | max_episode_steps=100, 225 | reward_threshold=0.0, 226 | ) 227 | 228 | register( 229 | id='Thrower-v0', 230 | entry_point='gym.envs.mujoco:ThrowerEnv', 231 | max_episode_steps=100, 232 | reward_threshold=0.0, 233 | ) 234 | 235 | register( 236 | id='Striker-v0', 237 | entry_point='gym.envs.mujoco:StrikerEnv', 238 | max_episode_steps=100, 239 | reward_threshold=0.0, 240 | ) 241 | 242 | register( 243 | id='InvertedPendulum-v1', 244 | entry_point='gym.envs.mujoco:InvertedPendulumEnv', 245 | max_episode_steps=1000, 246 | reward_threshold=950.0, 247 | ) 248 | 249 | register( 250 | id='InvertedDoublePendulum-v1', 251 | entry_point='gym.envs.mujoco:InvertedDoublePendulumEnv', 252 | max_episode_steps=1000, 253 | reward_threshold=9100.0, 254 | ) 255 | 256 | register( 257 | id='HalfCheetah-v1', 258 | entry_point='gym.envs.mujoco:HalfCheetahEnv', 259 | max_episode_steps=1000, 260 | reward_threshold=4800.0, 261 | ) 262 | 263 | register( 264 | id='Hopper-v1', 265 | entry_point='gym.envs.mujoco:HopperEnv', 266 | max_episode_steps=1000, 267 | reward_threshold=3800.0, 268 | ) 269 | 270 | register( 271 | id='Swimmer-v1', 272 | entry_point='gym.envs.mujoco:SwimmerEnv', 273 | max_episode_steps=1000, 274 | reward_threshold=360.0, 275 | ) 276 | 277 | register( 278 | id='Walker2d-v1', 279 | max_episode_steps=1000, 280 | 
entry_point='gym.envs.mujoco:Walker2dEnv', 281 | ) 282 | 283 | register( 284 | id='Ant-v1', 285 | entry_point='gym.envs.mujoco:AntEnv', 286 | max_episode_steps=1000, 287 | reward_threshold=6000.0, 288 | ) 289 | 290 | register( 291 | id='Humanoid-v1', 292 | entry_point='gym.envs.mujoco:HumanoidEnv', 293 | max_episode_steps=1000, 294 | ) 295 | 296 | register( 297 | id='HumanoidStandup-v1', 298 | entry_point='gym.envs.mujoco:HumanoidStandupEnv', 299 | max_episode_steps=1000, 300 | ) 301 | 302 | # Atari 303 | # ---------------------------------------- 304 | 305 | # # print ', '.join(["'{}'".format(name.split('.')[0]) for name in atari_py.list_games()]) 306 | for game in ['air_raid', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis', 307 | 'bank_heist', 'battle_zone', 'beam_rider', 'berzerk', 'bowling', 'boxing', 'breakout', 'carnival', 308 | 'centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk', 309 | 'elevator_action', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar', 310 | 'hero', 'ice_hockey', 'jamesbond', 'journey_escape', 'kangaroo', 'krull', 'kung_fu_master', 311 | 'montezuma_revenge', 'ms_pacman', 'name_this_game', 'phoenix', 'pitfall', 'pong', 'pooyan', 312 | 'private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing', 313 | 'solaris', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down', 314 | 'venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge', 'zaxxon']: 315 | for obs_type in ['image', 'ram']: 316 | # space_invaders should yield SpaceInvaders-v0 and SpaceInvaders-ram-v0 317 | name = ''.join([g.capitalize() for g in game.split('_')]) 318 | if obs_type == 'ram': 319 | name = '{}-ram'.format(name) 320 | 321 | nondeterministic = False 322 | if game == 'elevator_action' and obs_type == 'ram': 323 | # ElevatorAction-ram-v0 seems to yield slightly 324 | # non-deterministic observations about 10% of the time. We 325 | # should track this down eventually, but for now we just 326 | # mark it as nondeterministic. 327 | nondeterministic = True 328 | 329 | register( 330 | id='{}-v0'.format(name), 331 | entry_point='gym.envs.atari:AtariEnv', 332 | kwargs={'game': game, 'obs_type': obs_type, 'repeat_action_probability': 0.25}, 333 | max_episode_steps=10000, 334 | nondeterministic=nondeterministic, 335 | ) 336 | 337 | register( 338 | id='{}-v4'.format(name), 339 | entry_point='gym.envs.atari:AtariEnv', 340 | kwargs={'game': game, 'obs_type': obs_type}, 341 | max_episode_steps=100000, 342 | nondeterministic=nondeterministic, 343 | ) 344 | 345 | # Standard Deterministic (as in the original DeepMind paper) 346 | if game == 'space_invaders': 347 | frameskip = 3 348 | else: 349 | frameskip = 4 350 | 351 | # Use a deterministic frame skip. 
352 | register( 353 | id='{}Deterministic-v0'.format(name), 354 | entry_point='gym.envs.atari:AtariEnv', 355 | kwargs={'game': game, 'obs_type': obs_type, 'frameskip': frameskip, 'repeat_action_probability': 0.25}, 356 | max_episode_steps=100000, 357 | nondeterministic=nondeterministic, 358 | ) 359 | 360 | register( 361 | id='{}Deterministic-v4'.format(name), 362 | entry_point='gym.envs.atari:AtariEnv', 363 | kwargs={'game': game, 'obs_type': obs_type, 'frameskip': frameskip}, 364 | max_episode_steps=100000, 365 | nondeterministic=nondeterministic, 366 | ) 367 | 368 | register( 369 | id='{}NoFrameskip-v0'.format(name), 370 | entry_point='gym.envs.atari:AtariEnv', 371 | kwargs={'game': game, 'obs_type': obs_type, 'frameskip': 1, 'repeat_action_probability': 0.25}, # A frameskip of 1 means we get every frame 372 | max_episode_steps=frameskip * 100000, 373 | nondeterministic=nondeterministic, 374 | ) 375 | 376 | # No frameskip. (Atari has no entropy source, so these are 377 | # deterministic environments.) 378 | register( 379 | id='{}NoFrameskip-v4'.format(name), 380 | entry_point='gym.envs.atari:AtariEnv', 381 | kwargs={'game': game, 'obs_type': obs_type, 'frameskip': 1}, # A frameskip of 1 means we get every frame 382 | max_episode_steps=frameskip * 100000, 383 | nondeterministic=nondeterministic, 384 | ) 385 | 386 | # Board games 387 | # ---------------------------------------- 388 | 389 | register( 390 | id='Go9x9-v0', 391 | entry_point='gym.envs.board_game:GoEnv', 392 | kwargs={ 393 | 'player_color': 'black', 394 | 'opponent': 'pachi:uct:_2400', 395 | 'observation_type': 'image3c', 396 | 'illegal_move_mode': 'lose', 397 | 'board_size': 9, 398 | }, 399 | # The pachi player seems not to be determistic given a fixed seed. 400 | # (Reproduce by running 'import gym; h = gym.make('Go9x9-v0'); h.seed(1); h.reset(); h.step(15); h.step(16); h.step(17)' a few times.) 401 | # 402 | # This is probably due to a computation time limit. 
403 | nondeterministic=True, 404 | ) 405 | 406 | register( 407 | id='Go19x19-v0', 408 | entry_point='gym.envs.board_game:GoEnv', 409 | kwargs={ 410 | 'player_color': 'black', 411 | 'opponent': 'pachi:uct:_2400', 412 | 'observation_type': 'image3c', 413 | 'illegal_move_mode': 'lose', 414 | 'board_size': 19, 415 | }, 416 | nondeterministic=True, 417 | ) 418 | 419 | register( 420 | id='Hex9x9-v0', 421 | entry_point='gym.envs.board_game:HexEnv', 422 | kwargs={ 423 | 'player_color': 'black', 424 | 'opponent': 'random', 425 | 'observation_type': 'numpy3c', 426 | 'illegal_move_mode': 'lose', 427 | 'board_size': 9, 428 | }, 429 | ) 430 | 431 | # Debugging 432 | # ---------------------------------------- 433 | 434 | register( 435 | id='OneRoundDeterministicReward-v0', 436 | entry_point='gym.envs.debugging:OneRoundDeterministicRewardEnv', 437 | local_only=True 438 | ) 439 | 440 | register( 441 | id='TwoRoundDeterministicReward-v0', 442 | entry_point='gym.envs.debugging:TwoRoundDeterministicRewardEnv', 443 | local_only=True 444 | ) 445 | 446 | register( 447 | id='OneRoundNondeterministicReward-v0', 448 | entry_point='gym.envs.debugging:OneRoundNondeterministicRewardEnv', 449 | local_only=True 450 | ) 451 | 452 | register( 453 | id='TwoRoundNondeterministicReward-v0', 454 | entry_point='gym.envs.debugging:TwoRoundNondeterministicRewardEnv', 455 | local_only=True, 456 | ) 457 | 458 | # Parameter tuning 459 | # ---------------------------------------- 460 | register( 461 | id='ConvergenceControl-v0', 462 | entry_point='gym.envs.parameter_tuning:ConvergenceControl', 463 | ) 464 | 465 | register( 466 | id='CNNClassifierTraining-v0', 467 | entry_point='gym.envs.parameter_tuning:CNNClassifierTraining', 468 | ) 469 | 470 | # Safety 471 | # ---------------------------------------- 472 | 473 | # interpretability envs 474 | register( 475 | id='PredictActionsCartpole-v0', 476 | entry_point='gym.envs.safety:PredictActionsCartpoleEnv', 477 | max_episode_steps=200, 478 | ) 479 | 480 | register( 481 | id='PredictObsCartpole-v0', 482 | entry_point='gym.envs.safety:PredictObsCartpoleEnv', 483 | max_episode_steps=200, 484 | ) 485 | 486 | # semi_supervised envs 487 | # probably the easiest: 488 | register( 489 | id='SemisuperPendulumNoise-v0', 490 | entry_point='gym.envs.safety:SemisuperPendulumNoiseEnv', 491 | max_episode_steps=200, 492 | ) 493 | # somewhat harder because of higher variance: 494 | register( 495 | id='SemisuperPendulumRandom-v0', 496 | entry_point='gym.envs.safety:SemisuperPendulumRandomEnv', 497 | max_episode_steps=200, 498 | ) 499 | # probably the hardest because you only get a constant number of rewards in total: 500 | register( 501 | id='SemisuperPendulumDecay-v0', 502 | entry_point='gym.envs.safety:SemisuperPendulumDecayEnv', 503 | max_episode_steps=200, 504 | ) 505 | 506 | # off_switch envs 507 | register( 508 | id='OffSwitchCartpole-v0', 509 | entry_point='gym.envs.safety:OffSwitchCartpoleEnv', 510 | max_episode_steps=200, 511 | ) 512 | 513 | register( 514 | id='OffSwitchCartpoleProb-v0', 515 | entry_point='gym.envs.safety:OffSwitchCartpoleProbEnv', 516 | max_episode_steps=200, 517 | ) 518 | -------------------------------------------------------------------------------- /gym_env/trading/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.trading import TradingEnv 2 | -------------------------------------------------------------------------------- /gym_env/trading/policy_gradient.py: 
-------------------------------------------------------------------------------- 1 | ''' Policy Gradient implementation customized a bit for 2 | solving the trading problem''' 3 | # stolen shamelessly and adapted December 2016 by Tito Ingargiola 4 | # was originally: 5 | 6 | '''Solution to the Cartpole problem using Policy Gradients in Tensorflow.''' 7 | # written October 2016 by Sam Greydanus 8 | # inspired by gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5 9 | 10 | import numpy as np 11 | import gym 12 | import tensorflow as tf 13 | import pdb 14 | import logging 15 | import os.path 16 | import pandas as pd 17 | 18 | import trading_env as te 19 | 20 | log = logging.getLogger(__name__) 21 | log.setLevel(logging.INFO) 22 | log.info('%s logger started.',__name__) 23 | 24 | class PolicyGradient(object) : 25 | """ Policy Gradient implementation in tensor flow. 26 | """ 27 | 28 | def __init__(self, 29 | sess, # tensorflow session 30 | obs_dim, # observation shape 31 | num_actions, # number of possible actions 32 | neurons_per_dim=32, # hidden layer will have obs_dim * neurons_per_dim neurons 33 | learning_rate=1e-2, # learning rate 34 | gamma = 0.9, # reward discounting 35 | decay = 0.9 # gradient decay rate 36 | ): 37 | 38 | self._sess = sess 39 | self._gamma = gamma 40 | self._tf_model = {} 41 | self._num_actions = num_actions 42 | hidden_neurons = obs_dim * neurons_per_dim 43 | with tf.variable_scope('layer_one',reuse=False): 44 | L1 = tf.truncated_normal_initializer(mean=0, 45 | stddev=1./np.sqrt(obs_dim), 46 | dtype=tf.float32) 47 | self._tf_model['W1'] = tf.get_variable("W1", 48 | [obs_dim, hidden_neurons], 49 | initializer=L1) 50 | with tf.variable_scope('layer_two',reuse=False): 51 | L2 = tf.truncated_normal_initializer(mean=0, 52 | stddev=1./np.sqrt(hidden_neurons), 53 | dtype=tf.float32) 54 | self._tf_model['W2'] = tf.get_variable("W2", 55 | [hidden_neurons,num_actions], 56 | initializer=L2) 57 | 58 | # tf placeholders 59 | self._tf_x = tf.placeholder(dtype=tf.float32, shape=[None, obs_dim],name="tf_x") 60 | self._tf_y = tf.placeholder(dtype=tf.float32, shape=[None, num_actions],name="tf_y") 61 | self._tf_epr = tf.placeholder(dtype=tf.float32, shape=[None,1], name="tf_epr") 62 | 63 | # tf reward processing (need tf_discounted_epr for policy gradient wizardry) 64 | self._tf_discounted_epr = self.tf_discount_rewards(self._tf_epr) 65 | self._tf_mean, self._tf_variance= tf.nn.moments(self._tf_discounted_epr, [0], 66 | shift=None, name="reward_moments") 67 | self._tf_discounted_epr -= self._tf_mean 68 | self._tf_discounted_epr /= tf.sqrt(self._tf_variance + 1e-6) 69 | 70 | self._saver = tf.train.Saver() 71 | 72 | # tf optimizer op 73 | self._tf_aprob = self.tf_policy_forward(self._tf_x) 74 | loss = tf.nn.l2_loss(self._tf_y - self._tf_aprob) # this gradient encourages the actions taken 75 | optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay) 76 | tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), 77 | grad_loss=self._tf_discounted_epr) 78 | self._train_op = optimizer.apply_gradients(tf_grads) 79 | 80 | def tf_discount_rewards(self, tf_r): #tf_r ~ [game_steps,1] 81 | discount_f = lambda a, v: a*self._gamma + v; 82 | tf_r_reverse = tf.scan(discount_f, tf.reverse(tf_r,[True, False])) 83 | tf_discounted_r = tf.reverse(tf_r_reverse,[True, False]) 84 | return tf_discounted_r 85 | 86 | def tf_policy_forward(self, x): #x ~ [1,D] 87 | h = tf.matmul(x, self._tf_model['W1']) 88 | h = tf.nn.relu(h) 89 | logp = tf.matmul(h, self._tf_model['W2']) 90 | 
p = tf.nn.softmax(logp) 91 | return p 92 | 93 | def train_model(self, env, episodes=100, 94 | load_model = False, # load model from checkpoint if available:? 95 | model_dir = '/tmp/pgmodel/', log_freq=10 ) : 96 | 97 | # initialize variables and load model 98 | init_op = tf.global_variables_initializer() 99 | self._sess.run(init_op) 100 | if load_model: 101 | ckpt = tf.train.get_checkpoint_state(model_dir) 102 | print tf.train.latest_checkpoint(model_dir) 103 | if ckpt and ckpt.model_checkpoint_path: 104 | savr = tf.train.import_meta_graph(ckpt.model_checkpoint_path+'.meta') 105 | out = savr.restore(self._sess, ckpt.model_checkpoint_path) 106 | print("Model restored from ",ckpt.model_checkpoint_path) 107 | else: 108 | print('No checkpoint found at: ',model_dir) 109 | if not os.path.exists(model_dir): 110 | os.makedirs(model_dir) 111 | 112 | episode = 0 113 | observation = env.reset() 114 | xs,rs,ys = [],[],[] # environment info 115 | running_reward = 0 116 | reward_sum = 0 117 | # training loop 118 | day = 0 119 | simrors = np.zeros(episodes) 120 | mktrors = np.zeros(episodes) 121 | alldf = None 122 | victory = False 123 | while episode < episodes and not victory: 124 | # stochastically sample a policy from the network 125 | x = observation 126 | feed = {self._tf_x: np.reshape(x, (1,-1))} 127 | aprob = self._sess.run(self._tf_aprob,feed) 128 | aprob = aprob[0,:] # we live in a batched world :/ 129 | 130 | action = np.random.choice(self._num_actions, p=aprob) 131 | label = np.zeros_like(aprob) ; label[action] = 1 # make a training 'label' 132 | 133 | # step the environment and get new measurements 134 | observation, reward, done, info = env.step(action) 135 | #print observation, reward, done, info 136 | reward_sum += reward 137 | 138 | # record game history 139 | xs.append(x) 140 | ys.append(label) 141 | rs.append(reward) 142 | day += 1 143 | if done: 144 | running_reward = running_reward * 0.99 + reward_sum * 0.01 145 | epx = np.vstack(xs) 146 | epr = np.vstack(rs) 147 | epy = np.vstack(ys) 148 | xs,rs,ys = [],[],[] # reset game history 149 | df = env.sim.to_df() 150 | #pdb.set_trace() 151 | simrors[episode]=df.bod_nav.values[-1]-1 # compound returns 152 | mktrors[episode]=df.mkt_nav.values[-1]-1 153 | 154 | alldf = df if alldf is None else pd.concat([alldf,df], axis=0) 155 | 156 | feed = {self._tf_x: epx, self._tf_epr: epr, self._tf_y: epy} 157 | _ = self._sess.run(self._train_op,feed) # parameter update 158 | 159 | if episode % log_freq == 0: 160 | log.info('year #%6d, mean reward: %8.4f, sim ret: %8.4f, mkt ret: %8.4f, net: %8.4f', episode, 161 | running_reward, simrors[episode],mktrors[episode], simrors[episode]-mktrors[episode]) 162 | save_path = self._saver.save(self._sess, model_dir+'model.ckpt', 163 | global_step=episode+1) 164 | if episode > 100: 165 | vict = pd.DataFrame( { 'sim': simrors[episode-100:episode], 166 | 'mkt': mktrors[episode-100:episode] } ) 167 | vict['net'] = vict.sim - vict.mkt 168 | if vict.net.mean() > 0.0: 169 | victory = True 170 | log.info('Congratulations, Warren Buffet! 
You won the trading game.') 171 | #print("Model saved in file: {}".format(save_path)) 172 | 173 | 174 | 175 | episode += 1 176 | observation = env.reset() 177 | reward_sum = 0 178 | day = 0 179 | 180 | return alldf, pd.DataFrame({'simror':simrors,'mktror':mktrors}) 181 | -------------------------------------------------------------------------------- /gym_env/trading/test_policy_gradient.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import gym 4 | import tensorflow as tf 5 | import matplotlib as mpl 6 | import matplotlib.pyplot as plt 7 | from matplotlib import interactive 8 | interactive(True) 9 | import pdb 10 | import logging 11 | 12 | log = logging.getLogger() 13 | #log.addHandler(logging.StreamHandler()) 14 | import policy_gradient 15 | # create gym 16 | env = gym.make('trading-v0') 17 | 18 | sess = tf.InteractiveSession() 19 | 20 | # create policygradient 21 | pg = policy_gradient.PolicyGradient(sess, obs_dim=5, num_actions=3, learning_rate=1e-2 ) 22 | 23 | # train model, loading if possible 24 | alldf,summrzed = pg.train_model( env,episodes=1001, log_freq=100)#, load_model=True) 25 | #print df 26 | #pd.DataFrame(sharpes).expanding().mean().plot() 27 | 28 | -------------------------------------------------------------------------------- /gym_env/trading/test_trading_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | #import gym_trading 4 | import pandas as pd 5 | import numpy as np 6 | import trading_env as te 7 | 8 | pd.set_option('display.width',500) 9 | 10 | env = gym.make('trading-v0') 11 | 12 | #env.time_cost_bps = 0 13 | 14 | Episodes=1 15 | 16 | obs = [] 17 | 18 | for _ in range(Episodes): 19 | observation = env.reset() 20 | done = False 21 | count = 0 22 | while not done: 23 | action = env.action_space.sample() # random 24 | observation, reward, done, info = env.step(action) 25 | obs = obs + [observation] 26 | #print observation,reward,done,info 27 | count += 1 28 | if done: 29 | print reward 30 | print count 31 | 32 | df = env.sim.to_df() 33 | 34 | df.head() 35 | df.tail() 36 | 37 | buyhold = lambda x,y : 2 38 | df = env.run_strat( buyhold ) 39 | 40 | df10 = env.run_strats( buyhold, episodes ) 41 | -------------------------------------------------------------------------------- /gym_env/trading/trading.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import error, spaces, utils 3 | from gym.utils import seeding 4 | from collections import Counter 5 | 6 | import quandl 7 | import numpy as np 8 | from numpy import random 9 | import pandas as pd 10 | import logging 11 | import pdb 12 | 13 | import tempfile 14 | 15 | log = logging.getLogger(__name__) 16 | log.info('%s logger started.',__name__) 17 | 18 | 19 | def _sharpe(Returns, freq=252) : 20 | """Given a set of returns, calculates naive (rfr=0) sharpe """ 21 | return (np.sqrt(freq) * np.mean(Returns))/np.std(Returns) 22 | 23 | def _prices2returns(prices): 24 | px = pd.DataFrame(prices) 25 | nl = px.shift().fillna(0) 26 | R = ((px - nl)/nl).fillna(0).replace([np.inf, -np.inf], np.nan).dropna() 27 | R = np.append( R[0].values, 0) 28 | return R 29 | 30 | class QuandlEnvSrc(object): 31 | ''' 32 | Quandl-based implementation of a TradingEnv's data source. 33 | 34 | Pulls data from Quandl, preps for use by TradingEnv and then 35 | acts as data provider for each new episode. 
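Example (illustrative; requires network access to Quandl):

    src = QuandlEnvSrc(days=252, name='GOOG/NYSE_SPY')
    src.reset()
    obs, done = src._step()   # one day's [Close, Volume, Return, ClosePctl, VolumePctl]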
36 | ''' 37 | 38 | MinPercentileDays = 100 39 | QuandlAuthToken = "" # not necessary, but can be used if desired 40 | Name = "GOOG/NYSE_SPY" #"GOOG/NYSE_IBM" 41 | 42 | def __init__(self, days=252, name=Name, auth=QuandlAuthToken, scale=True ): 43 | self.name = name 44 | self.auth = auth 45 | self.days = days+1 46 | log.info('getting data for %s from quandl...',QuandlEnvSrc.Name) 47 | df = quandl.get(self.name) if self.auth=='' else quandl.get(self.name, authtoken=self.auth) 48 | log.info('got data for %s from quandl...',QuandlEnvSrc.Name) 49 | 50 | df = df[ ~np.isnan(df.Volume)][['Close','Volume']] 51 | # we calculate returns and percentiles, then kill nans 52 | df = df[['Close','Volume']] 53 | df.Volume.replace(0,1,inplace=True) # days shouldn't have zero volume.. 54 | df['Return'] = (df.Close-df.Close.shift())/df.Close.shift() 55 | pctrank = lambda x: pd.Series(x).rank(pct=True).iloc[-1] 56 | df['ClosePctl'] = df.Close.expanding(self.MinPercentileDays).apply(pctrank) 57 | df['VolumePctl'] = df.Volume.expanding(self.MinPercentileDays).apply(pctrank) 58 | df.dropna(axis=0,inplace=True) 59 | R = df.Return 60 | if scale: 61 | mean_values = df.mean(axis=0) 62 | std_values = df.std(axis=0) 63 | df = (df - np.array(mean_values))/ np.array(std_values) 64 | df['Return'] = R # we don't want our returns scaled 65 | self.min_values = df.min(axis=0) 66 | self.max_values = df.max(axis=0) 67 | self.data = df 68 | self.step = 0 69 | 70 | def reset(self): 71 | # we want contiguous data 72 | self.idx = np.random.randint( low = 0, high=len(self.data.index)-self.days ) 73 | self.step = 0 74 | 75 | def _step(self): 76 | obs = self.data.iloc[self.idx].as_matrix() 77 | self.idx += 1 78 | self.step += 1 79 | done = self.step >= self.days 80 | return obs,done 81 | 82 | class TradingSim(object) : 83 | """ Implements core trading simulator for single-instrument univ """ 84 | 85 | def __init__(self, steps, trading_cost_bps = 1e-3, time_cost_bps = 1e-4): 86 | # invariant for object life 87 | self.trading_cost_bps = trading_cost_bps 88 | self.time_cost_bps = time_cost_bps 89 | self.steps = steps 90 | # change every step 91 | self.step = 0 92 | self.actions = np.zeros(self.steps) 93 | self.navs = np.ones(self.steps) 94 | self.mkt_nav = np.ones(self.steps) 95 | self.strat_retrns = np.ones(self.steps) 96 | self.posns = np.zeros(self.steps) 97 | self.costs = np.zeros(self.steps) 98 | self.trades = np.zeros(self.steps) 99 | self.mkt_retrns = np.zeros(self.steps) 100 | 101 | def reset(self): 102 | self.step = 0 103 | self.actions.fill(0) 104 | self.navs.fill(1) 105 | self.mkt_nav.fill(1) 106 | self.strat_retrns.fill(0) 107 | self.posns.fill(0) 108 | self.costs.fill(0) 109 | self.trades.fill(0) 110 | self.mkt_retrns.fill(0) 111 | 112 | def _step(self, action, retrn ): 113 | """ Given an action and return for prior period, calculates costs, navs, 114 | etc and returns the reward and a summary of the day's activity. 
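        Worked example (hypothetical numbers): with the default costs, moving from
        flat to long (action=2, so the new position is +1 and the trade is +1) on a
        day whose prior-period return is +1% gives
        costs = 1*trading_cost_bps + time_cost_bps = 0.0010 + 0.0001 = 0.0011 and
        reward = bod_posn*retrn - costs = 0*0.01 - 0.0011 = -0.0011; the new
        position only starts earning the market return on the following step.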
""" 115 | 116 | bod_posn = 0.0 if self.step == 0 else self.posns[self.step-1] 117 | bod_nav = 1.0 if self.step == 0 else self.navs[self.step-1] 118 | mkt_nav = 1.0 if self.step == 0 else self.mkt_nav[self.step-1] 119 | 120 | self.mkt_retrns[self.step] = retrn 121 | self.actions[self.step] = action 122 | 123 | self.posns[self.step] = action - 1 124 | self.trades[self.step] = self.posns[self.step] - bod_posn 125 | 126 | trade_costs_pct = abs(self.trades[self.step]) * self.trading_cost_bps 127 | self.costs[self.step] = trade_costs_pct + self.time_cost_bps 128 | reward = ( (bod_posn * retrn) - self.costs[self.step] ) 129 | self.strat_retrns[self.step] = reward 130 | 131 | if self.step != 0 : 132 | self.navs[self.step] = bod_nav * (1 + self.strat_retrns[self.step-1]) 133 | self.mkt_nav[self.step] = mkt_nav * (1 + self.mkt_retrns[self.step-1]) 134 | 135 | info = { 'reward': reward, 'nav':self.navs[self.step], 'costs':self.costs[self.step] } 136 | 137 | self.step += 1 138 | return reward, info 139 | 140 | def to_df(self): 141 | """returns internal state in new dataframe """ 142 | cols = ['action', 'bod_nav', 'mkt_nav','mkt_return','sim_return', 143 | 'position','costs', 'trade' ] 144 | rets = _prices2returns(self.navs) 145 | #pdb.set_trace() 146 | df = pd.DataFrame( {'action': self.actions, # today's action (from agent) 147 | 'bod_nav': self.navs, # BOD Net Asset Value (NAV) 148 | 'mkt_nav': self.mkt_nav, 149 | 'mkt_return': self.mkt_retrns, 150 | 'sim_return': self.strat_retrns, 151 | 'position': self.posns, # EOD position 152 | 'costs': self.costs, # eod costs 153 | 'trade': self.trades },# eod trade 154 | columns=cols) 155 | return df 156 | 157 | class TradingEnv(gym.Env): 158 | """This gym implements a simple trading environment for reinforcement learning. 159 | 160 | The gym provides daily observations based on real market data pulled 161 | from Quandl on, by default, the SPY etf. An episode is defined as 252 162 | contiguous days sampled from the overall dataset. Each day is one 163 | 'step' within the gym and for each step, the algo has a choice: 164 | 165 | SHORT (0) 166 | FLAT (1) 167 | LONG (2) 168 | 169 | If you trade, you will be charged, by default, 10 BPS of the size of 170 | your trade. Thus, going from short to long costs twice as much as 171 | going from short to/from flat. Not trading also has a default cost of 172 | 1 BPS per step. Nobody said it would be easy! 173 | 174 | At the beginning of your episode, you are allocated 1 unit of 175 | cash. This is your starting Net Asset Value (NAV). If your NAV drops 176 | to 0, your episode is over and you lose. If your NAV hits 2.0, then 177 | you win. 178 | 179 | The trading env will track a buy-and-hold strategy which will act as 180 | the benchmark for the game. 
181 | 182 | """ 183 | metadata = {'render.modes': ['human']} 184 | 185 | def __init__(self): 186 | self.days = 252 187 | self.src = QuandlEnvSrc(days=self.days) 188 | self.sim = TradingSim(steps=self.days, trading_cost_bps=1e-3, 189 | time_cost_bps=1e-4) 190 | self.action_space = spaces.Discrete( 3 ) 191 | self.observation_space= spaces.Box( self.src.min_values, 192 | self.src.max_values) 193 | self.reset() 194 | 195 | def _configure(self, display=None): 196 | self.display = display 197 | 198 | def _seed(self, seed=None): 199 | self.np_random, seed = seeding.np_random(seed) 200 | return [seed] 201 | 202 | def _step(self, action): 203 | assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action)) 204 | observation, done = self.src._step() 205 | # Close Volume Return ClosePctl VolumePctl 206 | yret = observation[2] 207 | 208 | reward, info = self.sim._step( action, yret ) 209 | 210 | #info = { 'pnl': daypnl, 'nav':self.nav, 'costs':costs } 211 | 212 | return observation, reward, done, info 213 | 214 | def _reset(self): 215 | self.src.reset() 216 | self.sim.reset() 217 | return self.src._step()[0] 218 | 219 | def _render(self, mode='human', close=False): 220 | #... TODO 221 | pass 222 | 223 | # some convenience functions: 224 | 225 | def run_strat(self, strategy, return_df=True): 226 | """run provided strategy, returns dataframe with all steps""" 227 | observation = self.reset() 228 | done = False 229 | while not done: 230 | action = strategy( observation, self ) # call strategy 231 | observation, reward, done, info = self.step(action) 232 | 233 | return self.sim.to_df() if return_df else None 234 | 235 | def run_strats( self, strategy, episodes=1, write_log=True, return_df=True): 236 | """ run provided strategy the specified # of times, possibly 237 | writing a log and possibly returning a dataframe summarizing activity. 238 | 239 | Note that writing the log is expensive and returning the df is moreso. 240 | For training purposes, you might not want to set both. 
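        Example (mirroring test_trading_env.py): a buy-and-hold strategy is just a
        callable that ignores its inputs and always goes LONG, so

            buyhold = lambda obs, env: 2
            summary_df = env.run_strats(buyhold, episodes=10)

        runs ten episodes and (with return_df=True) concatenates their step logs.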
241 | """ 242 | logfile = None 243 | if write_log: 244 | logfile = tempfile.NamedTemporaryFile(delete=False) 245 | log.info('writing log to %s',logfile.name) 246 | need_df = write_log or return_df 247 | 248 | alldf = None 249 | 250 | for i in range(episodes): 251 | df = self.run_strat(strategy, return_df=need_df) 252 | if write_log: 253 | df.to_csv(logfile, mode='a') 254 | if return_df: 255 | alldf = df if alldf is None else pd.concat([alldf,df], axis=0) 256 | 257 | return alldf 258 | -------------------------------------------------------------------------------- /prediction-using-RL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songyunfan/DeepRLTrading/8e8b9fae27e58aa574b51f33b1588d9f9cbc4ba7/prediction-using-RL.pdf -------------------------------------------------------------------------------- /run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | 
env.close() 80 | 81 | def get_available_gpus(): 82 | from tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 
0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | --------------------------------------------------------------------------------