├── .gitignore
├── README.md
├── moody_DRL_trader.py
├── moody_ts_gen.py
└── simulated data moody2001.ipynb

/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints/*

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# automated-trading-RL

Some early code from work on a larger RL trading project.

Implements a Direct Reinforcement Learning (DRL) trader based on Moody et al. (2001).

* Generates a time series by a random walk with an autoregressive trend
* Optimises reward (profit) by batch-training an RNN policy
* Pure NumPy implementation

--------------------------------------------------------------------------------
/moody_DRL_trader.py:
--------------------------------------------------------------------------------
"""NumPy implementation of the Moody et al. (2001) DRL trader, by Matt Pearce."""
import numpy as np


def normalise(x):
    return (x - x.mean()) / x.std()


class Env():

    def __init__(self, r, mu, TC, T, m, z):
        self.quantity = mu            # trade quantity
        self.transaction_costs = TC
        self.F_prev = 0.0             # previous position F_{t-1}
        self.T = T                    # episode length
        self.m = m                    # length of the return window in each observation
        self.r = r                    # returns z_t - z_{t-1}
        self.z = z                    # prices

    def get_reward(self, F_t):
        # R_t = mu * (F_{t-1} * r_t - TC * |F_t - F_{t-1}|)
        r_t = self.z[self.t + self.m - 1] - self.z[self.t + self.m - 2]
        return self.quantity * (self.F_prev * r_t - self.transaction_costs * abs(F_t - self.F_prev))

    def get_observation(self):
        return normalise(self.r)[self.t : self.t + self.m]

    def step(self, action):
        self.t += 1

        reward = self.get_reward(action)
        self.F_prev = action

        observation = self.get_observation()
        done = self.t >= self.T

        return observation, reward, done

    def reset(self):
        self.t = 1
        self.F_prev = 0.0
        return self.get_observation()


class MoodyDRLAgent():

    def __init__(self, r, m, TC, mu):
        # params are [b, theta(m), u]: bias, weights on the return window, weight on F_{t-1}
        self.theta = (np.random.rand(m + 2) * 2 - 1.0) * np.sqrt(6. / (m + 2))
        self.rho = 0.04  # learning rate
        self.m = m
        self.TC = TC
        self.mu = mu
        self.r = r
        self.reset()

    def reset(self):
        self.F = [0.]  # positions F_t, with F_0 = 0
        self.I = [0.]  # feature vectors I_t (index 0 is an unused placeholder)
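
    # The trader is a single recurrent unit (following Moody et al. 2001):
    #   features   I_t = [1, normalised return window, F_{t-1}]
    #   position   F_t = tanh(theta . I_t), a value in [-1, 1]
    #   reward     R_t = mu * (F_{t-1} * r_t - TC * |F_t - F_{t-1}|)
    # fit() maximises the summed reward by gradient ascent, propagating the
    # gradient through the recurrent dependence of F_t on F_{t-1}.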

    def get_features(self, observation, F_prev):
        # I_t = [1, normalised return window, F_{t-1}]
        return np.concatenate([[1], observation, [F_prev]])

    def get_action(self, observation):
        It = self.get_features(observation, self.F[-1])
        self.I.append(It)
        Ft = np.tanh(np.dot(self.theta, It))
        self.F.append(Ft)
        return Ft

    def fit(self):
        """Batch gradient ascent on the total reward over the episode."""
        T = len(self.F)
        dF = np.zeros([self.m + 2, T])

        # recursive gradient of the position w.r.t. the parameters:
        # dF_t/dtheta = (1 - tanh^2(theta . I_t)) * (I_t + theta[-1] * dF_{t-1}/dtheta)
        for i in range(1, T):
            It = self.I[i]
            sech2 = 1 - np.power(np.tanh(np.dot(self.theta, It)), 2)
            dF[:, i] = sech2 * (It + self.theta[-1] * dF[:, i - 1])

        F = np.array(self.F)

        # dR_t/dF_t and dR_t/dF_{t-1} for R_t = mu * (F_{t-1} * r_t - TC * |F_t - F_{t-1}|)
        dRtdFt = -self.mu * self.TC * np.sign(F[1:] - F[0:-1])
        dRtdFt1 = self.mu * (self.r[self.m:self.m + T - 1] + self.TC * np.sign(F[1:] - F[0:-1]))

        dUt = dRtdFt * dF[:, 1:] + dRtdFt1 * dF[:, 0:-1]

        self.theta = self.theta + self.rho * np.sum(dUt, 1)


#
# training
#
def training(tick_data):
    T = 1000    # training size (steps per epoch)
    m = 50      # number of returns in the input feature window
    mu = 1.     # trade quantity
    TC = 0.002  # transaction costs

    z = tick_data[:T + m]  # training prices
    r = z[1:] - z[:-1]     # additive returns (divide by z[1:] for relative returns)

    env = Env(r=r, T=T, TC=TC, mu=mu, m=m, z=z)
    agent = MoodyDRLAgent(r, m, TC=TC, mu=mu)

    num_epochs = 100
    epoch = 1
    total_reward = []
    R = []
    observation = env.reset()
    while True:

        action = agent.get_action(observation)

        observation, reward, done = env.step(action)
        R.append(reward)

        if done:
            # print stats
            print(epoch, 'total reward =', np.sum(R))
            total_reward.append(np.sum(R))

            # train policy (gradient ascent)
            agent.fit()

            # start next epoch/episode
            epoch += 1
            if epoch > num_epochs:
                break
            observation = env.reset()
            R = []
            agent.reset()

    # output some stats
    print(agent.theta)
    print("Buy and Hold PnL", z[-1] - z[0], 'v', total_reward[-1])
    import matplotlib.pyplot as plt
    ax = plt.subplot(3, 1, 1)
    plt.plot(total_reward)
    ax.set_title("Cumulative reward per epoch")
    ax = plt.subplot(3, 1, 2)
    plt.plot(z[m:])
    ax.set_title("Time series (prices)")
    ax = plt.subplot(3, 1, 3)
    plt.plot(np.round(agent.F))
    ax.set_title("Trading signals")
    plt.show()

    return agent, env


if __name__ == '__main__':
    import moody_ts_gen
    np.random.seed(0)
    tick_data = moody_ts_gen.generate_timeseries(10000)
    training(tick_data)

--------------------------------------------------------------------------------
/moody_ts_gen.py:
--------------------------------------------------------------------------------
# Generates a time series by a random walk with an autoregressive trend, a la Moody et al. (2001)

import numpy as np


def generate_timeseries(T, alpha=0.9, k=3):
    eps = np.random.randn(T)   # price noise
    v = np.random.randn(T)     # trend noise
    p = np.zeros(T)            # random-walk process
    beta = np.zeros(T)         # autoregressive trend
    p[0] = 1

    # p_t = p_{t-1} + beta_{t-1} + k * eps_t
    # beta_t = alpha * beta_{t-1} + v_t
    for t in range(1, T):
        p[t] = p[t-1] + beta[t-1] + k * eps[t]
        beta[t] = alpha * beta[t-1] + v[t]

    # exponentiate, normalised by the range of p, to obtain a positive price series
    R = np.max(p) - np.min(p)
    z = np.exp(p / R)

    return z

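
# Example usage: a minimal sketch for a quick visual check of the generated series
# (assumes matplotlib is installed, as it already is for the plots in moody_DRL_trader.py).
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    np.random.seed(0)
    z = generate_timeseries(10000)

    plt.plot(z)
    plt.title("Simulated prices: random walk with autoregressive trend")
    plt.show()
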
--------------------------------------------------------------------------------