├── .gitignore
├── README.md
├── moody_DRL_trader.py
├── moody_ts_gen.py
└── simulated data moody2001.ipynb

/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints/*

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# automated-trading-RL

Some early code from work on a larger RL trading project.

Implements a Direct Reinforcement Learning (DRL) trader based on Moody et al. (2001).

* Generates a time series by a random walk with an autoregressive trend
* Optimises reward (profit) by batch-training an RNN policy
* Pure NumPy implementation

--------------------------------------------------------------------------------
/moody_DRL_trader.py:
--------------------------------------------------------------------------------
"""NumPy implementation of the Moody et al. (2001) DRL trader, by Matt Pearce."""
import numpy as np


def normalise(x):
    return (x - x.mean()) / x.std()


class Env():

    def __init__(self, r, mu, TC, T, m, z):
        self.quantity = mu            # trade quantity
        self.transaction_costs = TC
        self.F_prev = 0.0             # previous position F_{t-1}
        self.T = T                    # episode length
        self.m = m                    # length of the return window in each observation
        self.r = r                    # returns z_t - z_{t-1}
        self.z = z                    # prices

    def get_reward(self, F_t):
        # R_t = mu * (F_{t-1} * r_t - TC * |F_t - F_{t-1}|)
        r_t = self.z[self.t + self.m - 1] - self.z[self.t + self.m - 2]
        return self.quantity * (self.F_prev * r_t - self.transaction_costs * abs(F_t - self.F_prev))

    def get_observation(self):
        return normalise(self.r)[self.t : self.t + self.m]

    def step(self, action):
        self.t += 1

        reward = self.get_reward(action)
        self.F_prev = action

        observation = self.get_observation()
        done = self.t >= self.T

        return observation, reward, done

    def reset(self):
        self.t = 1
        self.F_prev = 0.0
        return self.get_observation()


class MoodyDRLAgent():

    def __init__(self, r, m, TC, mu):
        # params are [b, theta(m), u]: bias, weights on the return window, weight on F_{t-1}
        self.theta = (np.random.rand(m + 2) * 2 - 1.0) * np.sqrt(6. / (m + 2))
        self.rho = 0.04  # learning rate
        self.m = m
        self.TC = TC
        self.mu = mu
        self.r = r
        self.reset()

    def reset(self):
        self.F = [0.]  # positions F_t, with F_0 = 0
        self.I = [0.]  # feature vectors I_t (index 0 is an unused placeholder)
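
    # The trader is a single recurrent unit (following Moody et al. 2001):
    #   features   I_t = [1, normalised return window, F_{t-1}]
    #   position   F_t = tanh(theta . I_t), a value in [-1, 1]
    #   reward     R_t = mu * (F_{t-1} * r_t - TC * |F_t - F_{t-1}|)
    # fit() maximises the summed reward by gradient ascent, propagating the
    # gradient through the recurrent dependence of F_t on F_{t-1}.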

    def get_features(self, observation, F_prev):
        # I_t = [1, normalised return window, F_{t-1}]
        return np.concatenate([[1], observation, [F_prev]])

    def get_action(self, observation):
        It = self.get_features(observation, self.F[-1])
        self.I.append(It)
        Ft = np.tanh(np.dot(self.theta, It))
        self.F.append(Ft)
        return Ft

    def fit(self):
        """Batch gradient ascent on the total reward over the episode."""
        T = len(self.F)
        dF = np.zeros([self.m + 2, T])

        # recursive gradient of the position w.r.t. the parameters:
        # dF_t/dtheta = (1 - tanh^2(theta . I_t)) * (I_t + theta[-1] * dF_{t-1}/dtheta)
        for i in range(1, T):
            It = self.I[i]
            sech2 = 1 - np.power(np.tanh(np.dot(self.theta, It)), 2)
            dF[:, i] = sech2 * (It + self.theta[-1] * dF[:, i - 1])

        F = np.array(self.F)

        # dR_t/dF_t and dR_t/dF_{t-1} for R_t = mu * (F_{t-1} * r_t - TC * |F_t - F_{t-1}|)
        dRtdFt = -self.mu * self.TC * np.sign(F[1:] - F[0:-1])
        dRtdFt1 = self.mu * (self.r[self.m:self.m + T - 1] + self.TC * np.sign(F[1:] - F[0:-1]))

        dUt = dRtdFt * dF[:, 1:] + dRtdFt1 * dF[:, 0:-1]

        self.theta = self.theta + self.rho * np.sum(dUt, 1)


#
# training
#
def training(tick_data):
    T = 1000    # training size (steps per epoch)
    m = 50      # number of returns in the input feature window
    mu = 1.     # trade quantity
    TC = 0.002  # transaction costs

    z = tick_data[:T + m]  # training prices
    r = z[1:] - z[:-1]     # additive returns (divide by z[1:] for relative returns)

    env = Env(r=r, T=T, TC=TC, mu=mu, m=m, z=z)
    agent = MoodyDRLAgent(r, m, TC=TC, mu=mu)

    num_epochs = 100
    epoch = 1
    total_reward = []
    R = []
    observation = env.reset()
    while True:

        action = agent.get_action(observation)

        observation, reward, done = env.step(action)
        R.append(reward)

        if done:
            # print stats
            print(epoch, 'total reward =', np.sum(R))
            total_reward.append(np.sum(R))

            # train policy (gradient ascent)
            agent.fit()

            # start next epoch/episode
            epoch += 1
            if epoch > num_epochs:
                break
            observation = env.reset()
            R = []
            agent.reset()

    # output some stats
    print(agent.theta)
    print("Buy and Hold PnL", z[-1] - z[0], 'v', total_reward[-1])
    import matplotlib.pyplot as plt
    ax = plt.subplot(3, 1, 1)
    plt.plot(total_reward)
    ax.set_title("Cumulative reward per epoch")
    ax = plt.subplot(3, 1, 2)
    plt.plot(z[m:])
    ax.set_title("Time series (prices)")
    ax = plt.subplot(3, 1, 3)
    plt.plot(np.round(agent.F))
    ax.set_title("Trading signals")
    plt.show()

    return agent, env


if __name__ == '__main__':
    import moody_ts_gen
    np.random.seed(0)
    tick_data = moody_ts_gen.generate_timeseries(10000)
    training(tick_data)

--------------------------------------------------------------------------------
/moody_ts_gen.py:
--------------------------------------------------------------------------------
# Generates a time series by a random walk with an autoregressive trend, a la Moody et al. (2001)

import numpy as np


def generate_timeseries(T, alpha=0.9, k=3):
    eps = np.random.randn(T)   # price noise
    v = np.random.randn(T)     # trend noise
    p = np.zeros(T)            # random-walk process
    beta = np.zeros(T)         # autoregressive trend
    p[0] = 1

    # p_t = p_{t-1} + beta_{t-1} + k * eps_t
    # beta_t = alpha * beta_{t-1} + v_t
    for t in range(1, T):
        p[t] = p[t-1] + beta[t-1] + k * eps[t]
        beta[t] = alpha * beta[t-1] + v[t]

    # exponentiate, normalised by the range of p, to obtain a positive price series
    R = np.max(p) - np.min(p)
    z = np.exp(p / R)

    return z

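
# Example usage: a minimal sketch for a quick visual check of the generated series
# (assumes matplotlib is installed, as it already is for the plots in moody_DRL_trader.py).
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    np.random.seed(0)
    z = generate_timeseries(10000)

    plt.plot(z)
    plt.title("Simulated prices: random walk with autoregressive trend")
    plt.show()
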
--------------------------------------------------------------------------------