├── README.md ├── Statistical-Arbitrage-on-the-SP500.pdf ├── atari_wrappers.py ├── dqn.py ├── dqn_utils.py ├── gym_env ├── __init__.py └── trading │ ├── __init__.py │ ├── policy_gradient.py │ ├── test_policy_gradient.py │ ├── test_trading_env.py │ └── trading.py ├── prediction-using-RL.pdf ├── run_dqn_atari.py └── run_dqn_ram.py /README.md: -------------------------------------------------------------------------------- 1 | # Stock Trading with Deep Reinforcement Learning 2 | 3 | -------------------------------------------------------------------------------- /Statistical-Arbitrage-on-the-SP500.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songyunfan/DeepRLTrading/8e8b9fae27e58aa574b51f33b1588d9f9cbc4ba7/Statistical-Arbitrage-on-the-SP500.pdf -------------------------------------------------------------------------------- /atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 
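A minimal usage sketch (illustrative; the Breakout id is just one ALE environment registered with a NoFrameskip variant):

    env = EpisodicLifeEnv(gym.make('BreakoutNoFrameskip-v4'))
    obs = env.reset()                    # full ALE reset
    obs, rew, done, info = env.step(0)   # done turns True whenever a life is lost
    obs = env.reset()                    # cheap no-op step unless was_real_done is True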
66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | import os 11 | 12 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 13 | 14 | def learn(env, 15 | q_func, 16 | optimizer_spec, 17 | session, 18 | exploration=LinearSchedule(1000000, 0.1), 19 | stopping_criterion=None, 20 | 
replay_buffer_size=1000000, 21 | batch_size=32, 22 | gamma=0.99, 23 | learning_starts=50000, 24 | learning_freq=4, 25 | frame_history_len=4, 26 | target_update_freq=10000, 27 | grad_norm_clipping=10): 28 | """Run Deep Q-learning algorithm. 29 | 30 | You can specify your own convnet using q_func. 31 | 32 | All schedules are w.r.t. total number of steps taken in the environment. 33 | 34 | Parameters 35 | ---------- 36 | env: gym.Env 37 | gym environment to train on. 38 | q_func: function 39 | Model to use for computing the q function. It should accept the 40 | following named arguments: 41 | img_in: tf.Tensor 42 | tensorflow tensor representing the input image 43 | num_actions: int 44 | number of actions 45 | scope: str 46 | scope in which all the model related variables 47 | should be created 48 | reuse: bool 49 | whether previously created variables should be reused. 50 | optimizer_spec: OptimizerSpec 51 | Specifying the constructor and kwargs, as well as learning rate schedule 52 | for the optimizer 53 | session: tf.Session 54 | tensorflow session to use. 55 | exploration: rl_algs.deepq.utils.schedules.Schedule 56 | schedule for probability of chosing random action. 57 | stopping_criterion: (env, t) -> bool 58 | should return true when it's ok for the RL algorithm to stop. 59 | takes in env and the number of steps executed so far. 60 | replay_buffer_size: int 61 | How many memories to store in the replay buffer. 62 | batch_size: int 63 | How many transitions to sample each time experience is replayed. 64 | gamma: float 65 | Discount Factor 66 | learning_starts: int 67 | After how many environment steps to start replaying experiences 68 | learning_freq: int 69 | How many steps of environment to take between every experience replay 70 | frame_history_len: int 71 | How many past frames to include as input to the model. 72 | target_update_freq: int 73 | How many experience replay rounds (not steps!) to perform between 74 | each update to the target Q network 75 | grad_norm_clipping: float or None 76 | If not None gradients' norms are clipped to this value. 77 | """ 78 | assert type(env.observation_space) == gym.spaces.Box 79 | assert type(env.action_space) == gym.spaces.Discrete 80 | 81 | ############### 82 | # BUILD MODEL # 83 | ############### 84 | 85 | if len(env.observation_space.shape) == 1: 86 | # This means we are running on low-dimensional observations (e.g. RAM) 87 | input_shape = env.observation_space.shape 88 | else: 89 | img_h, img_w, img_c = env.observation_space.shape 90 | input_shape = (img_h, img_w, frame_history_len * img_c) 91 | num_actions = env.action_space.n 92 | 93 | # set up placeholders 94 | # placeholder for current observation (or state) 95 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 96 | # placeholder for current action 97 | act_t_ph = tf.placeholder(tf.int32, [None]) 98 | # placeholder for current reward 99 | rew_t_ph = tf.placeholder(tf.float32, [None]) 100 | # placeholder for next observation (or state) 101 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 102 | # placeholder for end of episode mask 103 | # this value is 1 if the next state corresponds to the end of an episode, 104 | # in which case there is no Q-value at the next state; at the end of an 105 | # episode, only the current state reward contributes to the target, not the 106 | # next state Q-value (i.e. 
target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 107 | done_mask_ph = tf.placeholder(tf.float32, [None]) 108 | 109 | # casting to float on GPU ensures lower data transfer times. 110 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 111 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 112 | 113 | # Here, you should fill in your own code to compute the Bellman error. This requires 114 | # evaluating the current and next Q-values and constructing the corresponding error. 115 | # TensorFlow will differentiate this error for you, you just need to pass it to the 116 | # optimizer. See assignment text for details. 117 | # Your code should produce one scalar-valued tensor: total_error 118 | # This will be passed to the optimizer in the provided code below. 119 | # Your code should also produce two collections of variables: 120 | # q_func_vars 121 | # target_q_func_vars 122 | # These should hold all of the variables of the Q-function network and target network, 123 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 124 | # For example, you can create your Q-function network with the scope "q_func" like this: 125 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 126 | # And then you can obtain the variables like this: 127 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 128 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 129 | ###### 130 | 131 | # YOUR CODE HERE 132 | # output of Q function from the model 133 | Q_st_ph = q_func( obs_t_float, num_actions, scope="q_func", reuse=False) 134 | indx = tf.one_hot(act_t_ph, num_actions) 135 | Q_stat_ph = tf.reduce_sum(Q_st_ph*indx,axis = 1) 136 | 137 | # compute the action for epsilon greedy exploration 138 | #argmax_Q = tf.argmax(Q_st_ph, axis=1) 139 | #args = tf.one_hot(argmax_Q, num_actions) 140 | #expl = exploration.value(np.random.rand(1)) 141 | #sy_actions = args*(1.-expl)+(1.-args)*expl/(num_actions-1.) 
142 | # choose based on probability 143 | #sy_actions = tf.multinomial(sy_actions, 1) 144 | #sy_actions = tf.reshape(sy_actions, [-1]) 145 | # variables 146 | q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 147 | 148 | # target function using the rewards 149 | Q_st_php = q_func( obs_tp1_float, num_actions, scope="target_q_func", reuse=False) 150 | y_ = rew_t_ph + tf.multiply( 1.-done_mask_ph, gamma*tf.reduce_max(Q_st_php, axis=1)) 151 | # variables 152 | target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func') 153 | 154 | total_error = tf.nn.l2_loss(Q_stat_ph-y_) 155 | 156 | 157 | ###### 158 | 159 | # construct optimization op (with gradient clipping) 160 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 161 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 162 | train_fn = minimize_and_clip(optimizer, total_error, 163 | var_list=q_func_vars, clip_val=grad_norm_clipping) 164 | 165 | # update_target_fn will be called periodically to copy Q network to target Q network 166 | update_target_fn = [] 167 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 168 | sorted(target_q_func_vars, key=lambda v: v.name)): 169 | update_target_fn.append(var_target.assign(var)) 170 | update_target_fn = tf.group(*update_target_fn) 171 | 172 | # construct the replay buffer 173 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 174 | 175 | ############### 176 | # RUN ENV # 177 | ############### 178 | model_initialized = False 179 | num_param_updates = 0 180 | mean_episode_reward = -float('nan') 181 | best_mean_episode_reward = -float('inf') 182 | last_obs = env.reset() 183 | LOG_EVERY_N_STEPS = 10000 184 | 185 | mr = 'mr.txt' 186 | bmr = 'bmr.txt' 187 | tmr = 't.txt' 188 | if os.path.isfile(mr): 189 | os.remove(mr) 190 | else: 191 | f = open(mr,"w") 192 | f.close() 193 | if os.path.isfile(bmr): 194 | os.remove(bmr) 195 | else: 196 | f = open(bmr,"w") 197 | f.close() 198 | if os.path.isfile(tmr): 199 | os.remove(tmr) 200 | else: 201 | f = open(tmr,"w") 202 | f.close() 203 | 204 | for t in itertools.count(): 205 | ### 1. Check stopping criterion 206 | if stopping_criterion is not None and stopping_criterion(env, t): 207 | break 208 | ### 2. Step the env and store the transition 209 | # At this point, "last_obs" contains the latest observation that was 210 | # recorded from the simulator. Here, your code needs to store this 211 | # observation and its outcome (reward, next observation, etc.) into 212 | # the replay buffer while stepping the simulator forward one step. 213 | # At the end of this block of code, the simulator should have been 214 | # advanced one step, and the replay buffer should contain one more 215 | # transition. 216 | # Specifically, last_obs must point to the new latest observation. 217 | # Useful functions you'll need to call: 218 | # obs, reward, done, info = env.step(action) 219 | # this steps the environment forward one step 220 | # obs = env.reset() 221 | # this resets the environment if you reached an episode boundary. 222 | # Don't forget to call env.reset() to get a new observation if done 223 | # is true!! 224 | # Note that you cannot use "last_obs" directly as input 225 | # into your network, since it needs to be processed to include context 226 | # from previous frames. You should check out the replay buffer 227 | # implementation in dqn_utils.py to see what functionality the replay 228 | # buffer exposes. 
The replay buffer has a function called 229 | # encode_recent_observation that will take the latest observation 230 | # that you pushed into the buffer and compute the corresponding 231 | # input that should be given to a Q network by appending some 232 | # previous frames. 233 | # Don't forget to include epsilon greedy exploration! 234 | # And remember that the first time you enter this loop, the model 235 | # may not yet have been initialized (but of course, the first step 236 | # might as well be random, since you haven't trained your net...) 237 | 238 | ##### 239 | 240 | # YOUR CODE HERE 241 | # add the last_obs 242 | idx = replay_buffer.store_frame(last_obs) 243 | # extract obs from the replay_buffer 244 | obs_ = [replay_buffer.encode_recent_observation()] 245 | 246 | # action from the model 247 | if not model_initialized: 248 | action = np.random.randint(num_actions) 249 | else: 250 | # epsilon greedy exploration 251 | Q_val = session.run(Q_st_ph, feed_dict={ obs_t_ph: obs_}) 252 | 253 | e = exploration.value(t) 254 | 255 | if np.random.rand(1) <= e: 256 | action = np.random.randint(num_actions) 257 | else: 258 | action = np.argmax(Q_val) 259 | 260 | # step the environment, store the outcome and advance last_obs 261 | obs, reward, done, info = env.step(action) 262 | replay_buffer.store_effect(idx, action, reward, done) 263 | if done: 264 | obs = env.reset() 265 | last_obs = obs 266 | 267 | ##### 268 | 269 | # at this point, the environment has been advanced one step (and 270 | # reset if done was true), and last_obs points to the new latest 271 | # observation 272 | 273 | ### 3. Perform experience replay and train the network. 274 | # note that this is only done if the replay buffer contains enough samples 275 | # for us to learn something useful -- until then, the model will not be 276 | # initialized and random actions should be taken 277 | if (t > learning_starts and 278 | t % learning_freq == 0 and 279 | replay_buffer.can_sample(batch_size)): 280 | # Here, you should perform training. Training consists of four steps: 281 | # 3.a: use the replay buffer to sample a batch of transitions (see the 282 | # replay buffer code for function definition, each batch that you sample 283 | # should consist of current observations, current actions, rewards, 284 | # next observations, and done indicator). 285 | # 3.b: initialize the model if it has not been initialized yet; to do 286 | # that, call 287 | # initialize_interdependent_variables(session, tf.global_variables(), { 288 | # obs_t_ph: obs_t_batch, 289 | # obs_tp1_ph: obs_tp1_batch, 290 | # }) 291 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 292 | # the current and next time step. The boolean variable model_initialized 293 | # indicates whether or not the model has been initialized. 294 | # Remember that you have to update the target network too (see 3.d)! 295 | # 3.c: train the model. To do this, you'll need to use the train_fn and 296 | # total_error ops that were created earlier: total_error is what you 297 | # created to compute the total Bellman error in a batch, and train_fn 298 | # will actually perform a gradient step and update the network parameters 299 | # to reduce total_error.
When calling session.run on these you'll need to 300 | # populate the following placeholders: 301 | # obs_t_ph 302 | # act_t_ph 303 | # rew_t_ph 304 | # obs_tp1_ph 305 | # done_mask_ph 306 | # (this is needed for computing total_error) 307 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 308 | # (this is needed by the optimizer to choose the learning rate) 309 | # 3.d: periodically update the target network by calling 310 | # session.run(update_target_fn) 311 | # you should update every target_update_freq steps, and you may find the 312 | # variable num_param_updates useful for this (it was initialized to 0) 313 | ##### 314 | 315 | # YOUR CODE HERE 316 | 317 | # 3.a sample batches 318 | obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = replay_buffer.sample(batch_size) 319 | 320 | # 3.b initialize the model 321 | if not model_initialized: 322 | initialize_interdependent_variables(session, tf.global_variables(), { 323 | obs_t_ph: obs_t_batch, 324 | obs_tp1_ph: obs_tp1_batch, 325 | }) 326 | model_initialized = True 327 | 328 | #3.c train the model 329 | feed_dict = {obs_t_ph: obs_t_batch, act_t_ph: act_batch, rew_t_ph: rew_batch, 330 | obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask, 331 | learning_rate : optimizer_spec.lr_schedule.value(t)} 332 | 333 | session.run(train_fn, feed_dict = feed_dict) 334 | 335 | # update the network 336 | if t%target_update_freq == 0: 337 | session.run(update_target_fn) 338 | ##### 339 | ### 4. Log progress 340 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 341 | if len(episode_rewards) > 0: 342 | mean_episode_reward = np.mean(episode_rewards[-100:]) 343 | if len(episode_rewards) > 100: 344 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 345 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 346 | print("Timestep %d" % (t,)) 347 | print("mean reward (100 episodes) %f" % mean_episode_reward) 348 | print("best mean reward %f" % best_mean_episode_reward) 349 | print("episodes %d" % len(episode_rewards)) 350 | print("exploration %f" % exploration.value(t)) 351 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 352 | sys.stdout.flush() 353 | 354 | f = open(mr,"a") 355 | f.write(str(mean_episode_reward)+'\n') 356 | f.close() 357 | f = open(bmr,"a") 358 | f.write(str(best_mean_episode_reward)+'\n') 359 | f.close() 360 | f = open(tmr,"a") 361 | f.write(str(t)+'\n') 362 | f.close() 363 | 364 | -------------------------------------------------------------------------------- /dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 
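For example (illustrative), drawing three distinct integers:

    sample_n_unique(lambda: random.randint(0, 9), 3)   # e.g. [4, 1, 7]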
19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 
117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of all the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimize `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensuring the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happen if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or external precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(currentenv, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 177 | 178 | The specific memory optimizations used here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (it is actually most time-efficient 182 | to cast them back to float32 on the GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the typical Atari Deep RL use case of a buffer with 1M frames, the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning a frame of zeros at the beginning 190 | of the episode, when there are fewer frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of past frames to be retrieved for each observation.
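A minimal usage sketch (sizes, shapes and variable names are illustrative):

    buffer = ReplayBuffer(size=100000, frame_history_len=4)
    idx = buffer.store_frame(frame)                # frame: (84, 84, 1) uint8
    buffer.store_effect(idx, action, reward, done)
    if buffer.can_sample(32):
        obs_t, act, rew, obs_tp1, done_mask = buffer.sample(32)   # obs_t: (32, 84, 84, 4)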
200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | 226 | def sample(self, batch_size): 227 | """Sample `batch_size` different transitions. 228 | 229 | i-th sample transition is the following: 230 | 231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 232 | after which reward `rew_batch[i]` was received and subsequent 233 | observation next_obs_batch[i] was observed, unless the epsiode 234 | was done which is represented by `done_mask[i]` which is equal 235 | to 1 if episode has ended as a result of that action. 236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample. 241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 
if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundary of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has the potential to save about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of the action taken after observing the frame stored 332 | at index idx. The reason `store_frame` and `store_effect` are broken 333 | up into two functions is so that one can call `encode_recent_observation` 334 | in between. 335 | 336 | Parameters 337 | ---------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the action was performed. 344 | done: bool 345 | True if the episode was finished after performing that action.
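The intended call order within a single environment step is roughly the following (illustrative; `choose_action` stands in for whatever epsilon-greedy policy the caller uses):

    idx = buffer.store_frame(last_obs)
    q_input = buffer.encode_recent_observation()   # frame-stacked input for the Q-network
    action = choose_action(q_input)
    obs, reward, done, _ = env.step(action)
    buffer.store_effect(idx, action, reward, done)
    last_obs = env.reset() if done else obs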
346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /gym_env/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import registry, register, make, spec 2 | 3 | # Added for CS 294-112 4 | # ---------------------------------------- 5 | register( 6 | id='trading-v0', 7 | entry_point='gym.envs.trading:TradingEnv', 8 | timestep_limit=1000, 9 | ) 10 | 11 | # Algorithmic 12 | # ---------------------------------------- 13 | 14 | register( 15 | id='Copy-v0', 16 | entry_point='gym.envs.algorithmic:CopyEnv', 17 | max_episode_steps=200, 18 | reward_threshold=25.0, 19 | ) 20 | 21 | register( 22 | id='RepeatCopy-v0', 23 | entry_point='gym.envs.algorithmic:RepeatCopyEnv', 24 | max_episode_steps=200, 25 | reward_threshold=75.0, 26 | ) 27 | 28 | register( 29 | id='ReversedAddition-v0', 30 | entry_point='gym.envs.algorithmic:ReversedAdditionEnv', 31 | kwargs={'rows' : 2}, 32 | max_episode_steps=200, 33 | reward_threshold=25.0, 34 | ) 35 | 36 | register( 37 | id='ReversedAddition3-v0', 38 | entry_point='gym.envs.algorithmic:ReversedAdditionEnv', 39 | kwargs={'rows' : 3}, 40 | max_episode_steps=200, 41 | reward_threshold=25.0, 42 | ) 43 | 44 | register( 45 | id='DuplicatedInput-v0', 46 | entry_point='gym.envs.algorithmic:DuplicatedInputEnv', 47 | max_episode_steps=200, 48 | reward_threshold=9.0, 49 | ) 50 | 51 | register( 52 | id='Reverse-v0', 53 | entry_point='gym.envs.algorithmic:ReverseEnv', 54 | max_episode_steps=200, 55 | reward_threshold=25.0, 56 | ) 57 | 58 | # Classic 59 | # ---------------------------------------- 60 | 61 | register( 62 | id='CartPole-v0', 63 | entry_point='gym.envs.classic_control:CartPoleEnv', 64 | max_episode_steps=200, 65 | reward_threshold=195.0, 66 | ) 67 | 68 | register( 69 | id='CartPole-v1', 70 | entry_point='gym.envs.classic_control:CartPoleEnv', 71 | max_episode_steps=500, 72 | reward_threshold=475.0, 73 | ) 74 | 75 | register( 76 | id='MountainCar-v0', 77 | entry_point='gym.envs.classic_control:MountainCarEnv', 78 | max_episode_steps=200, 79 | reward_threshold=-110.0, 80 | ) 81 | 82 | register( 83 | id='MountainCarContinuous-v0', 84 | entry_point='gym.envs.classic_control:Continuous_MountainCarEnv', 85 | max_episode_steps=999, 86 | reward_threshold=90.0, 87 | ) 88 | 89 | register( 90 | id='Pendulum-v0', 91 | entry_point='gym.envs.classic_control:PendulumEnv', 92 | max_episode_steps=200, 93 | ) 94 | 95 | register( 96 | id='Acrobot-v1', 97 | entry_point='gym.envs.classic_control:AcrobotEnv', 98 | max_episode_steps=500, 99 | ) 100 | 101 | # Box2d 102 | # ---------------------------------------- 103 | 104 | register( 105 | id='LunarLander-v2', 106 | entry_point='gym.envs.box2d:LunarLander', 107 | max_episode_steps=1000, 108 | reward_threshold=200, 109 | ) 110 | 111 | register( 112 | id='LunarLanderContinuous-v2', 113 | entry_point='gym.envs.box2d:LunarLanderContinuous', 114 | max_episode_steps=1000, 115 | reward_threshold=200, 116 | ) 117 | 118 | register( 119 | id='BipedalWalker-v2', 120 | entry_point='gym.envs.box2d:BipedalWalker', 121 | max_episode_steps=1600, 122 | reward_threshold=300, 123 | ) 124 | 125 | register( 126 | id='BipedalWalkerHardcore-v2', 127 | entry_point='gym.envs.box2d:BipedalWalkerHardcore', 128 | max_episode_steps=2000, 129 | reward_threshold=300, 130 | ) 131 | 132 | register( 133 | id='CarRacing-v0', 134 | 
entry_point='gym.envs.box2d:CarRacing', 135 | max_episode_steps=1000, 136 | reward_threshold=900, 137 | ) 138 | 139 | # Toy Text 140 | # ---------------------------------------- 141 | 142 | register( 143 | id='Blackjack-v0', 144 | entry_point='gym.envs.toy_text:BlackjackEnv', 145 | ) 146 | 147 | register( 148 | id='KellyCoinflip-v0', 149 | entry_point='gym.envs.toy_text:KellyCoinflipEnv', 150 | reward_threshold=246.61, 151 | ) 152 | register( 153 | id='KellyCoinflipGeneralized-v0', 154 | entry_point='gym.envs.toy_text:KellyCoinflipGeneralizedEnv', 155 | ) 156 | 157 | register( 158 | id='FrozenLake-v0', 159 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 160 | kwargs={'map_name' : '4x4'}, 161 | max_episode_steps=100, 162 | reward_threshold=0.78, # optimum = .8196 163 | ) 164 | 165 | register( 166 | id='FrozenLake8x8-v0', 167 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 168 | kwargs={'map_name' : '8x8'}, 169 | max_episode_steps=200, 170 | reward_threshold=0.99, # optimum = 1 171 | ) 172 | 173 | register( 174 | id='CliffWalking-v0', 175 | entry_point='gym.envs.toy_text:CliffWalkingEnv', 176 | ) 177 | 178 | register( 179 | id='NChain-v0', 180 | entry_point='gym.envs.toy_text:NChainEnv', 181 | max_episode_steps=1000, 182 | ) 183 | 184 | register( 185 | id='Roulette-v0', 186 | entry_point='gym.envs.toy_text:RouletteEnv', 187 | max_episode_steps=100, 188 | ) 189 | 190 | register( 191 | id='Taxi-v2', 192 | entry_point='gym.envs.toy_text.taxi:TaxiEnv', 193 | reward_threshold=8, # optimum = 8.46 194 | max_episode_steps=200, 195 | ) 196 | 197 | register( 198 | id='GuessingGame-v0', 199 | entry_point='gym.envs.toy_text.guessing_game:GuessingGame', 200 | max_episode_steps=200, 201 | ) 202 | 203 | register( 204 | id='HotterColder-v0', 205 | entry_point='gym.envs.toy_text.hotter_colder:HotterColder', 206 | max_episode_steps=200, 207 | ) 208 | 209 | # Mujoco 210 | # ---------------------------------------- 211 | 212 | # 2D 213 | 214 | register( 215 | id='Reacher-v1', 216 | entry_point='gym.envs.mujoco:ReacherEnv', 217 | max_episode_steps=50, 218 | reward_threshold=-3.75, 219 | ) 220 | 221 | register( 222 | id='Pusher-v0', 223 | entry_point='gym.envs.mujoco:PusherEnv', 224 | max_episode_steps=100, 225 | reward_threshold=0.0, 226 | ) 227 | 228 | register( 229 | id='Thrower-v0', 230 | entry_point='gym.envs.mujoco:ThrowerEnv', 231 | max_episode_steps=100, 232 | reward_threshold=0.0, 233 | ) 234 | 235 | register( 236 | id='Striker-v0', 237 | entry_point='gym.envs.mujoco:StrikerEnv', 238 | max_episode_steps=100, 239 | reward_threshold=0.0, 240 | ) 241 | 242 | register( 243 | id='InvertedPendulum-v1', 244 | entry_point='gym.envs.mujoco:InvertedPendulumEnv', 245 | max_episode_steps=1000, 246 | reward_threshold=950.0, 247 | ) 248 | 249 | register( 250 | id='InvertedDoublePendulum-v1', 251 | entry_point='gym.envs.mujoco:InvertedDoublePendulumEnv', 252 | max_episode_steps=1000, 253 | reward_threshold=9100.0, 254 | ) 255 | 256 | register( 257 | id='HalfCheetah-v1', 258 | entry_point='gym.envs.mujoco:HalfCheetahEnv', 259 | max_episode_steps=1000, 260 | reward_threshold=4800.0, 261 | ) 262 | 263 | register( 264 | id='Hopper-v1', 265 | entry_point='gym.envs.mujoco:HopperEnv', 266 | max_episode_steps=1000, 267 | reward_threshold=3800.0, 268 | ) 269 | 270 | register( 271 | id='Swimmer-v1', 272 | entry_point='gym.envs.mujoco:SwimmerEnv', 273 | max_episode_steps=1000, 274 | reward_threshold=360.0, 275 | ) 276 | 277 | register( 278 | id='Walker2d-v1', 279 | max_episode_steps=1000, 280 | 
entry_point='gym.envs.mujoco:Walker2dEnv', 281 | ) 282 | 283 | register( 284 | id='Ant-v1', 285 | entry_point='gym.envs.mujoco:AntEnv', 286 | max_episode_steps=1000, 287 | reward_threshold=6000.0, 288 | ) 289 | 290 | register( 291 | id='Humanoid-v1', 292 | entry_point='gym.envs.mujoco:HumanoidEnv', 293 | max_episode_steps=1000, 294 | ) 295 | 296 | register( 297 | id='HumanoidStandup-v1', 298 | entry_point='gym.envs.mujoco:HumanoidStandupEnv', 299 | max_episode_steps=1000, 300 | ) 301 | 302 | # Atari 303 | # ---------------------------------------- 304 | 305 | # # print ', '.join(["'{}'".format(name.split('.')[0]) for name in atari_py.list_games()]) 306 | for game in ['air_raid', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis', 307 | 'bank_heist', 'battle_zone', 'beam_rider', 'berzerk', 'bowling', 'boxing', 'breakout', 'carnival', 308 | 'centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk', 309 | 'elevator_action', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar', 310 | 'hero', 'ice_hockey', 'jamesbond', 'journey_escape', 'kangaroo', 'krull', 'kung_fu_master', 311 | 'montezuma_revenge', 'ms_pacman', 'name_this_game', 'phoenix', 'pitfall', 'pong', 'pooyan', 312 | 'private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing', 313 | 'solaris', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down', 314 | 'venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge', 'zaxxon']: 315 | for obs_type in ['image', 'ram']: 316 | # space_invaders should yield SpaceInvaders-v0 and SpaceInvaders-ram-v0 317 | name = ''.join([g.capitalize() for g in game.split('_')]) 318 | if obs_type == 'ram': 319 | name = '{}-ram'.format(name) 320 | 321 | nondeterministic = False 322 | if game == 'elevator_action' and obs_type == 'ram': 323 | # ElevatorAction-ram-v0 seems to yield slightly 324 | # non-deterministic observations about 10% of the time. We 325 | # should track this down eventually, but for now we just 326 | # mark it as nondeterministic. 327 | nondeterministic = True 328 | 329 | register( 330 | id='{}-v0'.format(name), 331 | entry_point='gym.envs.atari:AtariEnv', 332 | kwargs={'game': game, 'obs_type': obs_type, 'repeat_action_probability': 0.25}, 333 | max_episode_steps=10000, 334 | nondeterministic=nondeterministic, 335 | ) 336 | 337 | register( 338 | id='{}-v4'.format(name), 339 | entry_point='gym.envs.atari:AtariEnv', 340 | kwargs={'game': game, 'obs_type': obs_type}, 341 | max_episode_steps=100000, 342 | nondeterministic=nondeterministic, 343 | ) 344 | 345 | # Standard Deterministic (as in the original DeepMind paper) 346 | if game == 'space_invaders': 347 | frameskip = 3 348 | else: 349 | frameskip = 4 350 | 351 | # Use a deterministic frame skip. 
352 | register( 353 | id='{}Deterministic-v0'.format(name), 354 | entry_point='gym.envs.atari:AtariEnv', 355 | kwargs={'game': game, 'obs_type': obs_type, 'frameskip': frameskip, 'repeat_action_probability': 0.25}, 356 | max_episode_steps=100000, 357 | nondeterministic=nondeterministic, 358 | ) 359 | 360 | register( 361 | id='{}Deterministic-v4'.format(name), 362 | entry_point='gym.envs.atari:AtariEnv', 363 | kwargs={'game': game, 'obs_type': obs_type, 'frameskip': frameskip}, 364 | max_episode_steps=100000, 365 | nondeterministic=nondeterministic, 366 | ) 367 | 368 | register( 369 | id='{}NoFrameskip-v0'.format(name), 370 | entry_point='gym.envs.atari:AtariEnv', 371 | kwargs={'game': game, 'obs_type': obs_type, 'frameskip': 1, 'repeat_action_probability': 0.25}, # A frameskip of 1 means we get every frame 372 | max_episode_steps=frameskip * 100000, 373 | nondeterministic=nondeterministic, 374 | ) 375 | 376 | # No frameskip. (Atari has no entropy source, so these are 377 | # deterministic environments.) 378 | register( 379 | id='{}NoFrameskip-v4'.format(name), 380 | entry_point='gym.envs.atari:AtariEnv', 381 | kwargs={'game': game, 'obs_type': obs_type, 'frameskip': 1}, # A frameskip of 1 means we get every frame 382 | max_episode_steps=frameskip * 100000, 383 | nondeterministic=nondeterministic, 384 | ) 385 | 386 | # Board games 387 | # ---------------------------------------- 388 | 389 | register( 390 | id='Go9x9-v0', 391 | entry_point='gym.envs.board_game:GoEnv', 392 | kwargs={ 393 | 'player_color': 'black', 394 | 'opponent': 'pachi:uct:_2400', 395 | 'observation_type': 'image3c', 396 | 'illegal_move_mode': 'lose', 397 | 'board_size': 9, 398 | }, 399 | # The pachi player seems not to be determistic given a fixed seed. 400 | # (Reproduce by running 'import gym; h = gym.make('Go9x9-v0'); h.seed(1); h.reset(); h.step(15); h.step(16); h.step(17)' a few times.) 401 | # 402 | # This is probably due to a computation time limit. 
403 | nondeterministic=True, 404 | ) 405 | 406 | register( 407 | id='Go19x19-v0', 408 | entry_point='gym.envs.board_game:GoEnv', 409 | kwargs={ 410 | 'player_color': 'black', 411 | 'opponent': 'pachi:uct:_2400', 412 | 'observation_type': 'image3c', 413 | 'illegal_move_mode': 'lose', 414 | 'board_size': 19, 415 | }, 416 | nondeterministic=True, 417 | ) 418 | 419 | register( 420 | id='Hex9x9-v0', 421 | entry_point='gym.envs.board_game:HexEnv', 422 | kwargs={ 423 | 'player_color': 'black', 424 | 'opponent': 'random', 425 | 'observation_type': 'numpy3c', 426 | 'illegal_move_mode': 'lose', 427 | 'board_size': 9, 428 | }, 429 | ) 430 | 431 | # Debugging 432 | # ---------------------------------------- 433 | 434 | register( 435 | id='OneRoundDeterministicReward-v0', 436 | entry_point='gym.envs.debugging:OneRoundDeterministicRewardEnv', 437 | local_only=True 438 | ) 439 | 440 | register( 441 | id='TwoRoundDeterministicReward-v0', 442 | entry_point='gym.envs.debugging:TwoRoundDeterministicRewardEnv', 443 | local_only=True 444 | ) 445 | 446 | register( 447 | id='OneRoundNondeterministicReward-v0', 448 | entry_point='gym.envs.debugging:OneRoundNondeterministicRewardEnv', 449 | local_only=True 450 | ) 451 | 452 | register( 453 | id='TwoRoundNondeterministicReward-v0', 454 | entry_point='gym.envs.debugging:TwoRoundNondeterministicRewardEnv', 455 | local_only=True, 456 | ) 457 | 458 | # Parameter tuning 459 | # ---------------------------------------- 460 | register( 461 | id='ConvergenceControl-v0', 462 | entry_point='gym.envs.parameter_tuning:ConvergenceControl', 463 | ) 464 | 465 | register( 466 | id='CNNClassifierTraining-v0', 467 | entry_point='gym.envs.parameter_tuning:CNNClassifierTraining', 468 | ) 469 | 470 | # Safety 471 | # ---------------------------------------- 472 | 473 | # interpretability envs 474 | register( 475 | id='PredictActionsCartpole-v0', 476 | entry_point='gym.envs.safety:PredictActionsCartpoleEnv', 477 | max_episode_steps=200, 478 | ) 479 | 480 | register( 481 | id='PredictObsCartpole-v0', 482 | entry_point='gym.envs.safety:PredictObsCartpoleEnv', 483 | max_episode_steps=200, 484 | ) 485 | 486 | # semi_supervised envs 487 | # probably the easiest: 488 | register( 489 | id='SemisuperPendulumNoise-v0', 490 | entry_point='gym.envs.safety:SemisuperPendulumNoiseEnv', 491 | max_episode_steps=200, 492 | ) 493 | # somewhat harder because of higher variance: 494 | register( 495 | id='SemisuperPendulumRandom-v0', 496 | entry_point='gym.envs.safety:SemisuperPendulumRandomEnv', 497 | max_episode_steps=200, 498 | ) 499 | # probably the hardest because you only get a constant number of rewards in total: 500 | register( 501 | id='SemisuperPendulumDecay-v0', 502 | entry_point='gym.envs.safety:SemisuperPendulumDecayEnv', 503 | max_episode_steps=200, 504 | ) 505 | 506 | # off_switch envs 507 | register( 508 | id='OffSwitchCartpole-v0', 509 | entry_point='gym.envs.safety:OffSwitchCartpoleEnv', 510 | max_episode_steps=200, 511 | ) 512 | 513 | register( 514 | id='OffSwitchCartpoleProb-v0', 515 | entry_point='gym.envs.safety:OffSwitchCartpoleProbEnv', 516 | max_episode_steps=200, 517 | ) 518 | -------------------------------------------------------------------------------- /gym_env/trading/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.trading import TradingEnv 2 | -------------------------------------------------------------------------------- /gym_env/trading/policy_gradient.py: 
-------------------------------------------------------------------------------- 1 | ''' Policy Gradient implementation customized a bit for 2 | solving the trading problem''' 3 | # stolen shamelessly and adapted December 2016 by Tito Ingargiola 4 | # was originally: 5 | 6 | '''Solution to the Cartpole problem using Policy Gradients in Tensorflow.''' 7 | # written October 2016 by Sam Greydanus 8 | # inspired by gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5 9 | 10 | import numpy as np 11 | import gym 12 | import tensorflow as tf 13 | import pdb 14 | import logging 15 | import os.path 16 | import pandas as pd 17 | 18 | import trading_env as te 19 | 20 | log = logging.getLogger(__name__) 21 | log.setLevel(logging.INFO) 22 | log.info('%s logger started.',__name__) 23 | 24 | class PolicyGradient(object) : 25 | """ Policy Gradient implementation in tensor flow. 26 | """ 27 | 28 | def __init__(self, 29 | sess, # tensorflow session 30 | obs_dim, # observation shape 31 | num_actions, # number of possible actions 32 | neurons_per_dim=32, # hidden layer will have obs_dim * neurons_per_dim neurons 33 | learning_rate=1e-2, # learning rate 34 | gamma = 0.9, # reward discounting 35 | decay = 0.9 # gradient decay rate 36 | ): 37 | 38 | self._sess = sess 39 | self._gamma = gamma 40 | self._tf_model = {} 41 | self._num_actions = num_actions 42 | hidden_neurons = obs_dim * neurons_per_dim 43 | with tf.variable_scope('layer_one',reuse=False): 44 | L1 = tf.truncated_normal_initializer(mean=0, 45 | stddev=1./np.sqrt(obs_dim), 46 | dtype=tf.float32) 47 | self._tf_model['W1'] = tf.get_variable("W1", 48 | [obs_dim, hidden_neurons], 49 | initializer=L1) 50 | with tf.variable_scope('layer_two',reuse=False): 51 | L2 = tf.truncated_normal_initializer(mean=0, 52 | stddev=1./np.sqrt(hidden_neurons), 53 | dtype=tf.float32) 54 | self._tf_model['W2'] = tf.get_variable("W2", 55 | [hidden_neurons,num_actions], 56 | initializer=L2) 57 | 58 | # tf placeholders 59 | self._tf_x = tf.placeholder(dtype=tf.float32, shape=[None, obs_dim],name="tf_x") 60 | self._tf_y = tf.placeholder(dtype=tf.float32, shape=[None, num_actions],name="tf_y") 61 | self._tf_epr = tf.placeholder(dtype=tf.float32, shape=[None,1], name="tf_epr") 62 | 63 | # tf reward processing (need tf_discounted_epr for policy gradient wizardry) 64 | self._tf_discounted_epr = self.tf_discount_rewards(self._tf_epr) 65 | self._tf_mean, self._tf_variance= tf.nn.moments(self._tf_discounted_epr, [0], 66 | shift=None, name="reward_moments") 67 | self._tf_discounted_epr -= self._tf_mean 68 | self._tf_discounted_epr /= tf.sqrt(self._tf_variance + 1e-6) 69 | 70 | self._saver = tf.train.Saver() 71 | 72 | # tf optimizer op 73 | self._tf_aprob = self.tf_policy_forward(self._tf_x) 74 | loss = tf.nn.l2_loss(self._tf_y - self._tf_aprob) # this gradient encourages the actions taken 75 | optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay) 76 | tf_grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), 77 | grad_loss=self._tf_discounted_epr) 78 | self._train_op = optimizer.apply_gradients(tf_grads) 79 | 80 | def tf_discount_rewards(self, tf_r): #tf_r ~ [game_steps,1] 81 | discount_f = lambda a, v: a*self._gamma + v; 82 | tf_r_reverse = tf.scan(discount_f, tf.reverse(tf_r,[True, False])) 83 | tf_discounted_r = tf.reverse(tf_r_reverse,[True, False]) 84 | return tf_discounted_r 85 | 86 | def tf_policy_forward(self, x): #x ~ [1,D] 87 | h = tf.matmul(x, self._tf_model['W1']) 88 | h = tf.nn.relu(h) 89 | logp = tf.matmul(h, self._tf_model['W2']) 90 | 
p = tf.nn.softmax(logp) 91 | return p 92 | 93 | def train_model(self, env, episodes=100, 94 | load_model = False, # load model from checkpoint if available:? 95 | model_dir = '/tmp/pgmodel/', log_freq=10 ) : 96 | 97 | # initialize variables and load model 98 | init_op = tf.global_variables_initializer() 99 | self._sess.run(init_op) 100 | if load_model: 101 | ckpt = tf.train.get_checkpoint_state(model_dir) 102 | print tf.train.latest_checkpoint(model_dir) 103 | if ckpt and ckpt.model_checkpoint_path: 104 | savr = tf.train.import_meta_graph(ckpt.model_checkpoint_path+'.meta') 105 | out = savr.restore(self._sess, ckpt.model_checkpoint_path) 106 | print("Model restored from ",ckpt.model_checkpoint_path) 107 | else: 108 | print('No checkpoint found at: ',model_dir) 109 | if not os.path.exists(model_dir): 110 | os.makedirs(model_dir) 111 | 112 | episode = 0 113 | observation = env.reset() 114 | xs,rs,ys = [],[],[] # environment info 115 | running_reward = 0 116 | reward_sum = 0 117 | # training loop 118 | day = 0 119 | simrors = np.zeros(episodes) 120 | mktrors = np.zeros(episodes) 121 | alldf = None 122 | victory = False 123 | while episode < episodes and not victory: 124 | # stochastically sample a policy from the network 125 | x = observation 126 | feed = {self._tf_x: np.reshape(x, (1,-1))} 127 | aprob = self._sess.run(self._tf_aprob,feed) 128 | aprob = aprob[0,:] # we live in a batched world :/ 129 | 130 | action = np.random.choice(self._num_actions, p=aprob) 131 | label = np.zeros_like(aprob) ; label[action] = 1 # make a training 'label' 132 | 133 | # step the environment and get new measurements 134 | observation, reward, done, info = env.step(action) 135 | #print observation, reward, done, info 136 | reward_sum += reward 137 | 138 | # record game history 139 | xs.append(x) 140 | ys.append(label) 141 | rs.append(reward) 142 | day += 1 143 | if done: 144 | running_reward = running_reward * 0.99 + reward_sum * 0.01 145 | epx = np.vstack(xs) 146 | epr = np.vstack(rs) 147 | epy = np.vstack(ys) 148 | xs,rs,ys = [],[],[] # reset game history 149 | df = env.sim.to_df() 150 | #pdb.set_trace() 151 | simrors[episode]=df.bod_nav.values[-1]-1 # compound returns 152 | mktrors[episode]=df.mkt_nav.values[-1]-1 153 | 154 | alldf = df if alldf is None else pd.concat([alldf,df], axis=0) 155 | 156 | feed = {self._tf_x: epx, self._tf_epr: epr, self._tf_y: epy} 157 | _ = self._sess.run(self._train_op,feed) # parameter update 158 | 159 | if episode % log_freq == 0: 160 | log.info('year #%6d, mean reward: %8.4f, sim ret: %8.4f, mkt ret: %8.4f, net: %8.4f', episode, 161 | running_reward, simrors[episode],mktrors[episode], simrors[episode]-mktrors[episode]) 162 | save_path = self._saver.save(self._sess, model_dir+'model.ckpt', 163 | global_step=episode+1) 164 | if episode > 100: 165 | vict = pd.DataFrame( { 'sim': simrors[episode-100:episode], 166 | 'mkt': mktrors[episode-100:episode] } ) 167 | vict['net'] = vict.sim - vict.mkt 168 | if vict.net.mean() > 0.0: 169 | victory = True 170 | log.info('Congratulations, Warren Buffet! 
You won the trading game.') 171 | #print("Model saved in file: {}".format(save_path)) 172 | 173 | 174 | 175 | episode += 1 176 | observation = env.reset() 177 | reward_sum = 0 178 | day = 0 179 | 180 | return alldf, pd.DataFrame({'simror':simrors,'mktror':mktrors}) 181 | -------------------------------------------------------------------------------- /gym_env/trading/test_policy_gradient.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import gym 4 | import tensorflow as tf 5 | import matplotlib as mpl 6 | import matplotlib.pyplot as plt 7 | from matplotlib import interactive 8 | interactive(True) 9 | import pdb 10 | import logging 11 | 12 | log = logging.getLogger() 13 | #log.addHandler(logging.StreamHandler()) 14 | import policy_gradient 15 | # create gym 16 | env = gym.make('trading-v0') 17 | 18 | sess = tf.InteractiveSession() 19 | 20 | # create policygradient 21 | pg = policy_gradient.PolicyGradient(sess, obs_dim=5, num_actions=3, learning_rate=1e-2 ) 22 | 23 | # train model, loading if possible 24 | alldf,summrzed = pg.train_model( env,episodes=1001, log_freq=100)#, load_model=True) 25 | #print df 26 | #pd.DataFrame(sharpes).expanding().mean().plot() 27 | 28 | -------------------------------------------------------------------------------- /gym_env/trading/test_trading_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | #import gym_trading 4 | import pandas as pd 5 | import numpy as np 6 | import trading_env as te 7 | 8 | pd.set_option('display.width',500) 9 | 10 | env = gym.make('trading-v0') 11 | 12 | #env.time_cost_bps = 0 13 | 14 | Episodes=1 15 | 16 | obs = [] 17 | 18 | for _ in range(Episodes): 19 | observation = env.reset() 20 | done = False 21 | count = 0 22 | while not done: 23 | action = env.action_space.sample() # random 24 | observation, reward, done, info = env.step(action) 25 | obs = obs + [observation] 26 | #print observation,reward,done,info 27 | count += 1 28 | if done: 29 | print reward 30 | print count 31 | 32 | df = env.sim.to_df() 33 | 34 | df.head() 35 | df.tail() 36 | 37 | buyhold = lambda x,y : 2 38 | df = env.run_strat( buyhold ) 39 | 40 | df10 = env.run_strats( buyhold, episodes ) 41 | -------------------------------------------------------------------------------- /gym_env/trading/trading.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import error, spaces, utils 3 | from gym.utils import seeding 4 | from collections import Counter 5 | 6 | import quandl 7 | import numpy as np 8 | from numpy import random 9 | import pandas as pd 10 | import logging 11 | import pdb 12 | 13 | import tempfile 14 | 15 | log = logging.getLogger(__name__) 16 | log.info('%s logger started.',__name__) 17 | 18 | 19 | def _sharpe(Returns, freq=252) : 20 | """Given a set of returns, calculates naive (rfr=0) sharpe """ 21 | return (np.sqrt(freq) * np.mean(Returns))/np.std(Returns) 22 | 23 | def _prices2returns(prices): 24 | px = pd.DataFrame(prices) 25 | nl = px.shift().fillna(0) 26 | R = ((px - nl)/nl).fillna(0).replace([np.inf, -np.inf], np.nan).dropna() 27 | R = np.append( R[0].values, 0) 28 | return R 29 | 30 | class QuandlEnvSrc(object): 31 | ''' 32 | Quandl-based implementation of a TradingEnv's data source. 33 | 34 | Pulls data from Quandl, preps for use by TradingEnv and then 35 | acts as data provider for each new episode. 
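Example (illustrative; requires network access to Quandl):

    src = QuandlEnvSrc(days=252, name='GOOG/NYSE_SPY')
    src.reset()
    obs, done = src._step()   # one day's [Close, Volume, Return, ClosePctl, VolumePctl]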
36 | ''' 37 | 38 | MinPercentileDays = 100 39 | QuandlAuthToken = "" # not necessary, but can be used if desired 40 | Name = "GOOG/NYSE_SPY" #"GOOG/NYSE_IBM" 41 | 42 | def __init__(self, days=252, name=Name, auth=QuandlAuthToken, scale=True ): 43 | self.name = name 44 | self.auth = auth 45 | self.days = days+1 46 | log.info('getting data for %s from quandl...',QuandlEnvSrc.Name) 47 | df = quandl.get(self.name) if self.auth=='' else quandl.get(self.name, authtoken=self.auth) 48 | log.info('got data for %s from quandl...',QuandlEnvSrc.Name) 49 | 50 | df = df[ ~np.isnan(df.Volume)][['Close','Volume']] 51 | # we calculate returns and percentiles, then kill nans 52 | df = df[['Close','Volume']] 53 | df.Volume.replace(0,1,inplace=True) # days shouldn't have zero volume.. 54 | df['Return'] = (df.Close-df.Close.shift())/df.Close.shift() 55 | pctrank = lambda x: pd.Series(x).rank(pct=True).iloc[-1] 56 | df['ClosePctl'] = df.Close.expanding(self.MinPercentileDays).apply(pctrank) 57 | df['VolumePctl'] = df.Volume.expanding(self.MinPercentileDays).apply(pctrank) 58 | df.dropna(axis=0,inplace=True) 59 | R = df.Return 60 | if scale: 61 | mean_values = df.mean(axis=0) 62 | std_values = df.std(axis=0) 63 | df = (df - np.array(mean_values))/ np.array(std_values) 64 | df['Return'] = R # we don't want our returns scaled 65 | self.min_values = df.min(axis=0) 66 | self.max_values = df.max(axis=0) 67 | self.data = df 68 | self.step = 0 69 | 70 | def reset(self): 71 | # we want contiguous data 72 | self.idx = np.random.randint( low = 0, high=len(self.data.index)-self.days ) 73 | self.step = 0 74 | 75 | def _step(self): 76 | obs = self.data.iloc[self.idx].as_matrix() 77 | self.idx += 1 78 | self.step += 1 79 | done = self.step >= self.days 80 | return obs,done 81 | 82 | class TradingSim(object) : 83 | """ Implements core trading simulator for single-instrument univ """ 84 | 85 | def __init__(self, steps, trading_cost_bps = 1e-3, time_cost_bps = 1e-4): 86 | # invariant for object life 87 | self.trading_cost_bps = trading_cost_bps 88 | self.time_cost_bps = time_cost_bps 89 | self.steps = steps 90 | # change every step 91 | self.step = 0 92 | self.actions = np.zeros(self.steps) 93 | self.navs = np.ones(self.steps) 94 | self.mkt_nav = np.ones(self.steps) 95 | self.strat_retrns = np.ones(self.steps) 96 | self.posns = np.zeros(self.steps) 97 | self.costs = np.zeros(self.steps) 98 | self.trades = np.zeros(self.steps) 99 | self.mkt_retrns = np.zeros(self.steps) 100 | 101 | def reset(self): 102 | self.step = 0 103 | self.actions.fill(0) 104 | self.navs.fill(1) 105 | self.mkt_nav.fill(1) 106 | self.strat_retrns.fill(0) 107 | self.posns.fill(0) 108 | self.costs.fill(0) 109 | self.trades.fill(0) 110 | self.mkt_retrns.fill(0) 111 | 112 | def _step(self, action, retrn ): 113 | """ Given an action and return for prior period, calculates costs, navs, 114 | etc and returns the reward and a summary of the day's activity. 
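        Worked example (hypothetical numbers): with the default costs, moving from
        flat to long (action=2, so the new position is +1 and the trade is +1) on a
        day whose prior-period return is +1% gives
        costs = 1*trading_cost_bps + time_cost_bps = 0.0010 + 0.0001 = 0.0011 and
        reward = bod_posn*retrn - costs = 0*0.01 - 0.0011 = -0.0011; the new
        position only starts earning the market return on the following step.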
""" 115 | 116 | bod_posn = 0.0 if self.step == 0 else self.posns[self.step-1] 117 | bod_nav = 1.0 if self.step == 0 else self.navs[self.step-1] 118 | mkt_nav = 1.0 if self.step == 0 else self.mkt_nav[self.step-1] 119 | 120 | self.mkt_retrns[self.step] = retrn 121 | self.actions[self.step] = action 122 | 123 | self.posns[self.step] = action - 1 124 | self.trades[self.step] = self.posns[self.step] - bod_posn 125 | 126 | trade_costs_pct = abs(self.trades[self.step]) * self.trading_cost_bps 127 | self.costs[self.step] = trade_costs_pct + self.time_cost_bps 128 | reward = ( (bod_posn * retrn) - self.costs[self.step] ) 129 | self.strat_retrns[self.step] = reward 130 | 131 | if self.step != 0 : 132 | self.navs[self.step] = bod_nav * (1 + self.strat_retrns[self.step-1]) 133 | self.mkt_nav[self.step] = mkt_nav * (1 + self.mkt_retrns[self.step-1]) 134 | 135 | info = { 'reward': reward, 'nav':self.navs[self.step], 'costs':self.costs[self.step] } 136 | 137 | self.step += 1 138 | return reward, info 139 | 140 | def to_df(self): 141 | """returns internal state in new dataframe """ 142 | cols = ['action', 'bod_nav', 'mkt_nav','mkt_return','sim_return', 143 | 'position','costs', 'trade' ] 144 | rets = _prices2returns(self.navs) 145 | #pdb.set_trace() 146 | df = pd.DataFrame( {'action': self.actions, # today's action (from agent) 147 | 'bod_nav': self.navs, # BOD Net Asset Value (NAV) 148 | 'mkt_nav': self.mkt_nav, 149 | 'mkt_return': self.mkt_retrns, 150 | 'sim_return': self.strat_retrns, 151 | 'position': self.posns, # EOD position 152 | 'costs': self.costs, # eod costs 153 | 'trade': self.trades },# eod trade 154 | columns=cols) 155 | return df 156 | 157 | class TradingEnv(gym.Env): 158 | """This gym implements a simple trading environment for reinforcement learning. 159 | 160 | The gym provides daily observations based on real market data pulled 161 | from Quandl on, by default, the SPY etf. An episode is defined as 252 162 | contiguous days sampled from the overall dataset. Each day is one 163 | 'step' within the gym and for each step, the algo has a choice: 164 | 165 | SHORT (0) 166 | FLAT (1) 167 | LONG (2) 168 | 169 | If you trade, you will be charged, by default, 10 BPS of the size of 170 | your trade. Thus, going from short to long costs twice as much as 171 | going from short to/from flat. Not trading also has a default cost of 172 | 1 BPS per step. Nobody said it would be easy! 173 | 174 | At the beginning of your episode, you are allocated 1 unit of 175 | cash. This is your starting Net Asset Value (NAV). If your NAV drops 176 | to 0, your episode is over and you lose. If your NAV hits 2.0, then 177 | you win. 178 | 179 | The trading env will track a buy-and-hold strategy which will act as 180 | the benchmark for the game. 
181 | 182 | """ 183 | metadata = {'render.modes': ['human']} 184 | 185 | def __init__(self): 186 | self.days = 252 187 | self.src = QuandlEnvSrc(days=self.days) 188 | self.sim = TradingSim(steps=self.days, trading_cost_bps=1e-3, 189 | time_cost_bps=1e-4) 190 | self.action_space = spaces.Discrete( 3 ) 191 | self.observation_space= spaces.Box( self.src.min_values, 192 | self.src.max_values) 193 | self.reset() 194 | 195 | def _configure(self, display=None): 196 | self.display = display 197 | 198 | def _seed(self, seed=None): 199 | self.np_random, seed = seeding.np_random(seed) 200 | return [seed] 201 | 202 | def _step(self, action): 203 | assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action)) 204 | observation, done = self.src._step() 205 | # Close Volume Return ClosePctl VolumePctl 206 | yret = observation[2] 207 | 208 | reward, info = self.sim._step( action, yret ) 209 | 210 | #info = { 'pnl': daypnl, 'nav':self.nav, 'costs':costs } 211 | 212 | return observation, reward, done, info 213 | 214 | def _reset(self): 215 | self.src.reset() 216 | self.sim.reset() 217 | return self.src._step()[0] 218 | 219 | def _render(self, mode='human', close=False): 220 | #... TODO 221 | pass 222 | 223 | # some convenience functions: 224 | 225 | def run_strat(self, strategy, return_df=True): 226 | """run provided strategy, returns dataframe with all steps""" 227 | observation = self.reset() 228 | done = False 229 | while not done: 230 | action = strategy( observation, self ) # call strategy 231 | observation, reward, done, info = self.step(action) 232 | 233 | return self.sim.to_df() if return_df else None 234 | 235 | def run_strats( self, strategy, episodes=1, write_log=True, return_df=True): 236 | """ run provided strategy the specified # of times, possibly 237 | writing a log and possibly returning a dataframe summarizing activity. 238 | 239 | Note that writing the log is expensive and returning the df is moreso. 240 | For training purposes, you might not want to set both. 
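        Example (mirroring test_trading_env.py): a buy-and-hold strategy is just a
        callable that ignores its inputs and always goes LONG, so

            buyhold = lambda obs, env: 2
            summary_df = env.run_strats(buyhold, episodes=10)

        runs ten episodes and (with return_df=True) concatenates their step logs.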
241 | """ 242 | logfile = None 243 | if write_log: 244 | logfile = tempfile.NamedTemporaryFile(delete=False) 245 | log.info('writing log to %s',logfile.name) 246 | need_df = write_log or return_df 247 | 248 | alldf = None 249 | 250 | for i in range(episodes): 251 | df = self.run_strat(strategy, return_df=need_df) 252 | if write_log: 253 | df.to_csv(logfile, mode='a') 254 | if return_df: 255 | alldf = df if alldf is None else pd.concat([alldf,df], axis=0) 256 | 257 | return alldf 258 | -------------------------------------------------------------------------------- /prediction-using-RL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/songyunfan/DeepRLTrading/8e8b9fae27e58aa574b51f33b1588d9f9cbc4ba7/prediction-using-RL.pdf -------------------------------------------------------------------------------- /run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | 
env.close() 80 | 81 | def get_available_gpus(): 82 | from tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 
0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | --------------------------------------------------------------------------------