├── README.md
├── agent.py
├── data
│   ├── daily_IBM.csv
│   ├── daily_MSFT.csv
│   └── daily_QCOM.csv
├── envs.py
├── model.py
├── requirements.txt
├── run.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------

## Overview

This is the code for [this](https://youtu.be/rRssY6FrTvU) video on YouTube by Siraj Raval on Q Learning for Trading, part of the Move 37 course at [School of AI](https://www.theschool.ai). Credits for this code go to [ShuaiW](https://github.com/ShuaiW/teach-machine-to-trade).

Related post: [Teach Machine to Trade](https://shuaiw.github.io/2018/02/11/teach-machine-to-trade.html)

### Dependencies

Python 2.7. To install all the libraries, run `pip install -r requirements.txt`.

### Table of contents

* `agent.py`: a Deep Q-learning agent
* `envs.py`: a simple 3-stock trading environment
* `model.py`: a multi-layer perceptron used as the Q-function approximator
* `utils.py`: some utility functions
* `run.py`: train/test logic
* `requirements.txt`: all dependencies
* `data/`: 3 CSV files with IBM, MSFT, and QCOM stock prices from Jan 3rd, 2000 to Dec 27, 2017 (5629 days). The data was retrieved using the [Alpha Vantage API](https://www.alphavantage.co/)

### How to run

**To train a Deep Q agent**, run `python run.py --mode train`. There are other parameters as well; I encourage you to look at the `run.py` script. After training, the trained model weights and the portfolio value history at the end of each episode are saved to disk.

**To test the model performance**, run `python run.py --mode test --weights <trained_model>`, where `<trained_model>` points to the local model weights file. The test portfolio value history at the end of each episode is saved to disk.
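
The saved history is simply a pickled Python list of episode-end portfolio values. Below is a minimal sketch for loading it; the file name is a hypothetical example of the `<timestamp>-<mode>.p` pattern that `run.py` uses.

```python
import pickle

# hypothetical file name; use the timestamp of your own training run
with open('portfolio_val/201812010000-train.p', 'rb') as fp:
    portfolio_value = pickle.load(fp)

print(len(portfolio_value), min(portfolio_value), max(portfolio_value))
```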
--------------------------------------------------------------------------------
/agent.py:
--------------------------------------------------------------------------------
from collections import deque
import random
import numpy as np
from model import mlp


class DQNAgent(object):
    """ A simple Deep Q agent """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = mlp(state_size, action_size)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns the greedy action

    def replay(self, batch_size=32):
        """ Vectorized experience replay; ~30x speed-up compared with a for loop """
        minibatch = random.sample(self.memory, batch_size)

        states = np.array([tup[0][0] for tup in minibatch])
        actions = np.array([tup[1] for tup in minibatch])
        rewards = np.array([tup[2] for tup in minibatch])
        next_states = np.array([tup[3][0] for tup in minibatch])
        done = np.array([tup[4] for tup in minibatch])

        # Q-learning target: r + gamma * max_a' Q(s', a')
        target = rewards + self.gamma * np.amax(self.model.predict(next_states), axis=1)
        # for terminal states the target is the reward itself (no lookahead)
        target[done] = rewards[done]

        # Q(s, a): keep the model's current estimates and overwrite only the actions taken,
        # so the agent learns to approximately map each state to its discounted future reward
        target_f = self.model.predict(states)
        target_f[range(batch_size), actions] = target

        self.model.fit(states, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
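
# --- Editor's note: illustrative sketch, not part of the original file. ---
# A toy, pure-numpy walk-through of the vectorized Q-target computed in replay(),
# assuming a batch of 3 transitions, 2 actions, and made-up Q-values.
if __name__ == '__main__':
    gamma = 0.95
    rewards = np.array([1.0, -2.0, 0.5])
    done = np.array([False, False, True])
    q_next = np.array([[0.2, 0.6],   # stand-in for model.predict(next_states)
                       [1.0, 0.4],
                       [0.3, 0.3]])
    target = rewards + gamma * np.amax(q_next, axis=1)
    target[done] = rewards[done]     # terminal transition keeps the raw reward
    print(target)                    # -> approximately [1.57, -1.05, 0.5]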
--------------------------------------------------------------------------------
/envs.py:
--------------------------------------------------------------------------------
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import itertools


class TradingEnv(gym.Env):
    """
    A 3-stock (MSFT, IBM, QCOM) trading environment.

    State: [# of shares owned, current stock prices, cash in hand]
      - array of length n_stock * 2 + 1
      - prices are discretized (rounded to integers) to reduce the state space
      - the close price is used for each stock
      - cash in hand is re-evaluated at each step based on the action performed

    Action: sell (0), hold (1), and buy (2)
      - when selling, sell all the shares
      - when buying, buy as many shares as cash in hand allows
      - if buying multiple stocks, spread cash in hand across them by buying one
        share of each in turn until the balance runs out
    """

    def __init__(self, train_data, init_invest=20000):
        # data
        self.stock_price_history = np.around(train_data)  # round to integers to reduce state space
        self.n_stock, self.n_step = self.stock_price_history.shape

        # instance attributes
        self.init_invest = init_invest
        self.cur_step = None
        self.stock_owned = None
        self.stock_price = None
        self.cash_in_hand = None

        # action space
        self.action_space = spaces.Discrete(3 ** self.n_stock)

        # observation space: give estimates in order to sample and build the scaler
        stock_max_price = self.stock_price_history.max(axis=1)
        stock_range = [[0, init_invest * 2 // mx] for mx in stock_max_price]
        price_range = [[0, mx] for mx in stock_max_price]
        cash_in_hand_range = [[0, init_invest * 2]]
        self.observation_space = spaces.MultiDiscrete(stock_range + price_range + cash_in_hand_range)

        # seed and start
        self._seed()
        self._reset()

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _reset(self):
        self.cur_step = 0
        self.stock_owned = [0] * self.n_stock
        self.stock_price = self.stock_price_history[:, self.cur_step]
        self.cash_in_hand = self.init_invest
        return self._get_obs()

    def _step(self, action):
        assert self.action_space.contains(action)
        prev_val = self._get_val()
        self.cur_step += 1
        self.stock_price = self.stock_price_history[:, self.cur_step]  # update price
        self._trade(action)
        cur_val = self._get_val()
        reward = cur_val - prev_val
        done = self.cur_step == self.n_step - 1
        info = {'cur_val': cur_val}
        return self._get_obs(), reward, done, info

    def _get_obs(self):
        obs = []
        obs.extend(self.stock_owned)
        obs.extend(list(self.stock_price))
        obs.append(self.cash_in_hand)
        return obs

    def _get_val(self):
        return np.sum(self.stock_owned * self.stock_price) + self.cash_in_hand

    def _trade(self, action):
        # all combinations of sell (0), hold (1), or buy (2) for each stock
        action_combo = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))
        action_vec = action_combo[action]

        # one pass to collect sell/buy indices
        sell_index = []
        buy_index = []
        for i, a in enumerate(action_vec):
            if a == 0:
                sell_index.append(i)
            elif a == 2:
                buy_index.append(i)

        # two passes: sell first, then buy; might be naive in real-world settings
        if sell_index:
            for i in sell_index:
                self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
                self.stock_owned[i] = 0
        if buy_index:
            can_buy = True
            while can_buy:
                for i in buy_index:
                    if self.cash_in_hand > self.stock_price[i]:
                        self.stock_owned[i] += 1  # buy one share
                        self.cash_in_hand -= self.stock_price[i]
                    else:
                        can_buy = False
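
# --- Editor's note: illustrative sketch, not part of the original file. ---
# How a discrete action index decodes into a per-stock [sell=0, hold=1, buy=2]
# vector in the 3-stock setup (27 = 3 ** 3 actions in total).
if __name__ == '__main__':
    action_combo = list(map(list, itertools.product([0, 1, 2], repeat=3)))
    print(len(action_combo))  # 27
    print(action_combo[5])    # [0, 1, 2]: sell stock 0, hold stock 1, buy stock 2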
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


def mlp(n_obs, n_action, n_hidden_layer=1, n_neuron_per_layer=32,
        activation='relu', loss='mse'):
    """ A multi-layer perceptron """
    model = Sequential()
    model.add(Dense(n_neuron_per_layer, input_dim=n_obs, activation=activation))
    for _ in range(n_hidden_layer):
        model.add(Dense(n_neuron_per_layer, activation=activation))
    model.add(Dense(n_action, activation='linear'))
    model.compile(loss=loss, optimizer=Adam())
    model.summary()
    return model
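
# --- Editor's note: illustrative sketch, not part of the original file. ---
# With the 3-stock TradingEnv, the state has length 3 * 2 + 1 = 7 and there are
# 3 ** 3 = 27 discrete actions, so run.py effectively builds this network:
if __name__ == '__main__':
    model = mlp(n_obs=7, n_action=27)  # prints the Keras model summary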
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy
pandas
keras
tensorflow
scikit-learn
h5py
gym==0.9.4
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import pickle
import time
import numpy as np
import argparse
import re

from envs import TradingEnv
from agent import DQNAgent
from utils import get_data, get_scaler, maybe_make_dir


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--episode', type=int, default=2000,
                        help='number of episodes to run')
    parser.add_argument('-b', '--batch_size', type=int, default=32,
                        help='batch size for experience replay')
    parser.add_argument('-i', '--initial_invest', type=int, default=20000,
                        help='initial investment amount')
    parser.add_argument('-m', '--mode', type=str, required=True,
                        help='either "train" or "test"')
    parser.add_argument('-w', '--weights', type=str,
                        help='a trained model weights file (required for test mode)')
    args = parser.parse_args()

    maybe_make_dir('weights')
    maybe_make_dir('portfolio_val')

    timestamp = time.strftime('%Y%m%d%H%M')

    data = np.around(get_data())
    train_data = data[:, :3526]
    test_data = data[:, 3526:]

    env = TradingEnv(train_data, args.initial_invest)
    state_size = env.observation_space.shape
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)

    portfolio_value = []

    if args.mode == 'test':
        # remake the env with test data
        env = TradingEnv(test_data, args.initial_invest)
        # load trained weights
        agent.load(args.weights)
        # in test mode, reuse the timestamp from when the weights were trained
        timestamp = re.findall(r'\d{12}', args.weights)[0]

    for e in range(args.episode):
        state = env.reset()
        state = scaler.transform([state])
        for t in range(env.n_step):  # 't' avoids shadowing the imported 'time' module
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            next_state = scaler.transform([next_state])
            if args.mode == 'train':
                agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, episode end value: {}".format(
                    e + 1, args.episode, info['cur_val']))
                portfolio_value.append(info['cur_val'])  # append episode-end portfolio value
                break
            if args.mode == 'train' and len(agent.memory) > args.batch_size:
                agent.replay(args.batch_size)
        if args.mode == 'train' and (e + 1) % 10 == 0:  # checkpoint weights
            agent.save('weights/{}-dqn.h5'.format(timestamp))

    # save portfolio value history to disk
    with open('portfolio_val/{}-{}.p'.format(timestamp, args.mode), 'wb') as fp:
        pickle.dump(portfolio_value, fp)
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


def get_data(col='close'):
    """ Returns a 3 x n_step array of close prices """
    msft = pd.read_csv('data/daily_MSFT.csv', usecols=[col])
    ibm = pd.read_csv('data/daily_IBM.csv', usecols=[col])
    qcom = pd.read_csv('data/daily_QCOM.csv', usecols=[col])
    # the most recent prices are at the top; reverse to chronological order
    return np.array([msft[col].values[::-1],
                     ibm[col].values[::-1],
                     qcom[col].values[::-1]])


def get_scaler(env):
    """ Takes an env and returns a scaler for its observation space """
    low = [0] * (env.n_stock * 2 + 1)

    high = []
    max_price = env.stock_price_history.max(axis=1)
    min_price = env.stock_price_history.min(axis=1)
    max_cash = env.init_invest * 3  # 3 is a magic number...
    max_stock_owned = max_cash // min_price
    for i in max_stock_owned:
        high.append(i)
    for i in max_price:
        high.append(i)
    high.append(max_cash)

    scaler = StandardScaler()
    scaler.fit([low, high])
    return scaler


def maybe_make_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
--------------------------------------------------------------------------------