├── README.md
├── agent.py
├── data
│   ├── daily_IBM.csv
│   ├── daily_MSFT.csv
│   └── daily_QCOM.csv
├── envs.py
├── model.py
├── requirements.txt
├── run.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------

## Overview

This is the code for [this](https://youtu.be/rRssY6FrTvU) video on YouTube by Siraj Raval on Q Learning for Trading, part of the Move 37 course at [School of AI](https://www.theschool.ai). Credits for this code go to [ShuaiW](https://github.com/ShuaiW/teach-machine-to-trade).

Related post: [Teach Machine to Trade](https://shuaiw.github.io/2018/02/11/teach-machine-to-trade.html)

### Dependencies

Python 2.7. To install all the libraries, run `pip install -r requirements.txt`.

### Table of contents

* `agent.py`: a Deep Q-learning agent
* `envs.py`: a simple 3-stock trading environment
* `model.py`: a multi-layer perceptron used as the Q-function approximator
* `utils.py`: some utility functions
* `run.py`: train/test logic
* `requirements.txt`: all dependencies
* `data/`: 3 CSV files with IBM, MSFT, and QCOM stock prices from Jan 3rd, 2000 to Dec 27, 2017 (5629 days). The data was retrieved using the [Alpha Vantage API](https://www.alphavantage.co/)

### How to run

**To train a Deep Q agent**, run `python run.py --mode train`. There are other parameters as well; I encourage you to look at the `run.py` script. After training, the trained model weights and the portfolio value history at the end of each episode are saved to disk.

**To test the model performance**, run `python run.py --mode test --weights <trained_model>`, where `<trained_model>` points to the local model weights file. The test portfolio value history at the end of each episode is saved to disk.
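
The saved history is simply a pickled Python list of episode-end portfolio values. Below is a minimal sketch for loading it; the file name is a hypothetical example of the `<timestamp>-<mode>.p` pattern that `run.py` uses.

```python
import pickle

# hypothetical file name; use the timestamp of your own training run
with open('portfolio_val/201812010000-train.p', 'rb') as fp:
    portfolio_value = pickle.load(fp)

print(len(portfolio_value), min(portfolio_value), max(portfolio_value))
```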
--------------------------------------------------------------------------------
/agent.py:
--------------------------------------------------------------------------------
from collections import deque
import random
import numpy as np
from model import mlp


class DQNAgent(object):
    """ A simple Deep Q agent """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = mlp(state_size, action_size)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns the greedy action

    def replay(self, batch_size=32):
        """ Vectorized experience replay; ~30x speed-up compared with a for loop """
        minibatch = random.sample(self.memory, batch_size)

        states = np.array([tup[0][0] for tup in minibatch])
        actions = np.array([tup[1] for tup in minibatch])
        rewards = np.array([tup[2] for tup in minibatch])
        next_states = np.array([tup[3][0] for tup in minibatch])
        done = np.array([tup[4] for tup in minibatch])

        # Q-learning target: r + gamma * max_a' Q(s', a')
        target = rewards + self.gamma * np.amax(self.model.predict(next_states), axis=1)
        # for terminal states the target is the reward itself (no lookahead)
        target[done] = rewards[done]

        # Q(s, a): keep the model's current estimates and overwrite only the actions taken,
        # so the agent learns to approximately map each state to its discounted future reward
        target_f = self.model.predict(states)
        target_f[range(batch_size), actions] = target

        self.model.fit(states, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
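
# --- Editor's note: illustrative sketch, not part of the original file. ---
# A toy, pure-numpy walk-through of the vectorized Q-target computed in replay(),
# assuming a batch of 3 transitions, 2 actions, and made-up Q-values.
if __name__ == '__main__':
    gamma = 0.95
    rewards = np.array([1.0, -2.0, 0.5])
    done = np.array([False, False, True])
    q_next = np.array([[0.2, 0.6],   # stand-in for model.predict(next_states)
                       [1.0, 0.4],
                       [0.3, 0.3]])
    target = rewards + gamma * np.amax(q_next, axis=1)
    target[done] = rewards[done]     # terminal transition keeps the raw reward
    print(target)                    # -> approximately [1.57, -1.05, 0.5]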
--------------------------------------------------------------------------------
/envs.py:
--------------------------------------------------------------------------------
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import itertools


class TradingEnv(gym.Env):
    """
    A 3-stock (MSFT, IBM, QCOM) trading environment.

    State: [# of shares owned, current stock prices, cash in hand]
      - array of length n_stock * 2 + 1
      - prices are discretized (rounded to integers) to reduce the state space
      - the close price is used for each stock
      - cash in hand is re-evaluated at each step based on the action performed

    Action: sell (0), hold (1), and buy (2)
      - when selling, sell all the shares
      - when buying, buy as many shares as cash in hand allows
      - if buying multiple stocks, spread cash in hand across them by buying one
        share of each in turn until the balance runs out
    """

    def __init__(self, train_data, init_invest=20000):
        # data
        self.stock_price_history = np.around(train_data)  # round to integers to reduce state space
        self.n_stock, self.n_step = self.stock_price_history.shape

        # instance attributes
        self.init_invest = init_invest
        self.cur_step = None
        self.stock_owned = None
        self.stock_price = None
        self.cash_in_hand = None

        # action space
        self.action_space = spaces.Discrete(3 ** self.n_stock)

        # observation space: give estimates in order to sample and build the scaler
        stock_max_price = self.stock_price_history.max(axis=1)
        stock_range = [[0, init_invest * 2 // mx] for mx in stock_max_price]
        price_range = [[0, mx] for mx in stock_max_price]
        cash_in_hand_range = [[0, init_invest * 2]]
        self.observation_space = spaces.MultiDiscrete(stock_range + price_range + cash_in_hand_range)

        # seed and start
        self._seed()
        self._reset()

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _reset(self):
        self.cur_step = 0
        self.stock_owned = [0] * self.n_stock
        self.stock_price = self.stock_price_history[:, self.cur_step]
        self.cash_in_hand = self.init_invest
        return self._get_obs()

    def _step(self, action):
        assert self.action_space.contains(action)
        prev_val = self._get_val()
        self.cur_step += 1
        self.stock_price = self.stock_price_history[:, self.cur_step]  # update price
        self._trade(action)
        cur_val = self._get_val()
        reward = cur_val - prev_val
        done = self.cur_step == self.n_step - 1
        info = {'cur_val': cur_val}
        return self._get_obs(), reward, done, info

    def _get_obs(self):
        obs = []
        obs.extend(self.stock_owned)
        obs.extend(list(self.stock_price))
        obs.append(self.cash_in_hand)
        return obs

    def _get_val(self):
        return np.sum(self.stock_owned * self.stock_price) + self.cash_in_hand

    def _trade(self, action):
        # all combinations of sell (0), hold (1), or buy (2) for each stock
        action_combo = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))
        action_vec = action_combo[action]

        # one pass to collect sell/buy indices
        sell_index = []
        buy_index = []
        for i, a in enumerate(action_vec):
            if a == 0:
                sell_index.append(i)
            elif a == 2:
                buy_index.append(i)

        # two passes: sell first, then buy; might be naive in real-world settings
        if sell_index:
            for i in sell_index:
                self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
                self.stock_owned[i] = 0
        if buy_index:
            can_buy = True
            while can_buy:
                for i in buy_index:
                    if self.cash_in_hand > self.stock_price[i]:
                        self.stock_owned[i] += 1  # buy one share
                        self.cash_in_hand -= self.stock_price[i]
                    else:
                        can_buy = False
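
# --- Editor's note: illustrative sketch, not part of the original file. ---
# How a discrete action index decodes into a per-stock [sell=0, hold=1, buy=2]
# vector in the 3-stock setup (27 = 3 ** 3 actions in total).
if __name__ == '__main__':
    action_combo = list(map(list, itertools.product([0, 1, 2], repeat=3)))
    print(len(action_combo))  # 27
    print(action_combo[5])    # [0, 1, 2]: sell stock 0, hold stock 1, buy stock 2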
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


def mlp(n_obs, n_action, n_hidden_layer=1, n_neuron_per_layer=32,
        activation='relu', loss='mse'):
    """ A multi-layer perceptron """
    model = Sequential()
    model.add(Dense(n_neuron_per_layer, input_dim=n_obs, activation=activation))
    for _ in range(n_hidden_layer):
        model.add(Dense(n_neuron_per_layer, activation=activation))
    model.add(Dense(n_action, activation='linear'))
    model.compile(loss=loss, optimizer=Adam())
    model.summary()
    return model
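
# --- Editor's note: illustrative sketch, not part of the original file. ---
# With the 3-stock TradingEnv, the state has length 3 * 2 + 1 = 7 and there are
# 3 ** 3 = 27 discrete actions, so run.py effectively builds this network:
if __name__ == '__main__':
    model = mlp(n_obs=7, n_action=27)  # prints the Keras model summary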
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy
pandas
keras
tensorflow
scikit-learn
h5py
gym==0.9.4
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import pickle
import time
import numpy as np
import argparse
import re

from envs import TradingEnv
from agent import DQNAgent
from utils import get_data, get_scaler, maybe_make_dir


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--episode', type=int, default=2000,
                        help='number of episodes to run')
    parser.add_argument('-b', '--batch_size', type=int, default=32,
                        help='batch size for experience replay')
    parser.add_argument('-i', '--initial_invest', type=int, default=20000,
                        help='initial investment amount')
    parser.add_argument('-m', '--mode', type=str, required=True,
                        help='either "train" or "test"')
    parser.add_argument('-w', '--weights', type=str,
                        help='a trained model weights file (required for test mode)')
    args = parser.parse_args()

    maybe_make_dir('weights')
    maybe_make_dir('portfolio_val')

    timestamp = time.strftime('%Y%m%d%H%M')

    data = np.around(get_data())
    train_data = data[:, :3526]
    test_data = data[:, 3526:]

    env = TradingEnv(train_data, args.initial_invest)
    state_size = env.observation_space.shape
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)

    portfolio_value = []

    if args.mode == 'test':
        # remake the env with test data
        env = TradingEnv(test_data, args.initial_invest)
        # load trained weights
        agent.load(args.weights)
        # in test mode, reuse the timestamp from when the weights were trained
        timestamp = re.findall(r'\d{12}', args.weights)[0]

    for e in range(args.episode):
        state = env.reset()
        state = scaler.transform([state])
        for t in range(env.n_step):  # 't' avoids shadowing the imported 'time' module
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            next_state = scaler.transform([next_state])
            if args.mode == 'train':
                agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, episode end value: {}".format(
                    e + 1, args.episode, info['cur_val']))
                portfolio_value.append(info['cur_val'])  # append episode-end portfolio value
                break
            if args.mode == 'train' and len(agent.memory) > args.batch_size:
                agent.replay(args.batch_size)
        if args.mode == 'train' and (e + 1) % 10 == 0:  # checkpoint weights
            agent.save('weights/{}-dqn.h5'.format(timestamp))

    # save portfolio value history to disk
    with open('portfolio_val/{}-{}.p'.format(timestamp, args.mode), 'wb') as fp:
        pickle.dump(portfolio_value, fp)
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


def get_data(col='close'):
    """ Returns a 3 x n_step array of close prices """
    msft = pd.read_csv('data/daily_MSFT.csv', usecols=[col])
    ibm = pd.read_csv('data/daily_IBM.csv', usecols=[col])
    qcom = pd.read_csv('data/daily_QCOM.csv', usecols=[col])
    # the most recent prices are at the top; reverse to chronological order
    return np.array([msft[col].values[::-1],
                     ibm[col].values[::-1],
                     qcom[col].values[::-1]])


def get_scaler(env):
    """ Takes an env and returns a scaler for its observation space """
    low = [0] * (env.n_stock * 2 + 1)

    high = []
    max_price = env.stock_price_history.max(axis=1)
    min_price = env.stock_price_history.min(axis=1)
    max_cash = env.init_invest * 3  # 3 is a magic number...
    max_stock_owned = max_cash // min_price
    for i in max_stock_owned:
        high.append(i)
    for i in max_price:
        high.append(i)
    high.append(max_cash)

    scaler = StandardScaler()
    scaler.fit([low, high])
    return scaler


def maybe_make_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
--------------------------------------------------------------------------------