├── .gitignore ├── Bot_code_and_models ├── .gitignore ├── Agent.py ├── Environment.py ├── Readme.md ├── hourly_aggregated_dataset.csv ├── main.py ├── models.py ├── profit_reward_double_ddqn_model ├── profit_reward_double_dqn_model ├── profit_reward_dqn_model ├── sr_reward_double_ddqn_model ├── sr_reward_double_dqn_model ├── sr_reward_dqn_model ├── train_test.py └── utils.py ├── LICENSE ├── Readme.md └── report.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | venv/ 4 | -------------------------------------------------------------------------------- /Bot_code_and_models/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | venv/ 4 | -------------------------------------------------------------------------------- /Bot_code_and_models/Agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import torch.nn.functional as F 4 | from models import ConvDQN, ConvDuelingDQN 5 | from utils import ReplayMemory 6 | from utils import Transition 7 | import random 8 | from tqdm import tqdm 9 | import re 10 | import os 11 | 12 | 13 | class Agent: 14 | """Definition of the Agent that will interact with the environment. 15 | 16 | Attributes: 17 | REPLAY_MEM_SIZE (:obj:`int`): max capacity of Replay Memory 18 | 19 | BATCH_SIZE (:obj:`int`): Batch size. Default is 40 as specified in the paper. 20 | 21 | GAMMA (:obj:`float`): The discount, should be a constant between 0 and 1 22 | that ensures the sum converges. It also controls the importance of future 23 | expected reward. 24 | 25 | EPS_START(:obj:`float`): initial value for epsilon of the e-greedy action 26 | selection 27 | 28 | EPS_END(:obj:`float`): final value for epsilon of the e-greedy action 29 | selection 30 | 31 | LEARNING_RATE(:obj:`float`): learning rate of the optimizer 32 | (Adam) 33 | 34 | INPUT_DIM (:obj:`int`): input dimentionality withut considering batch size. 35 | 36 | HIDDEN_DIM (:obj:`int`): hidden layer dimentionality (for Linear models only) 37 | 38 | ACTION_NUMBER (:obj:`int`): dimentionality of output layer of the Q network 39 | 40 | TARGET_UPDATE (:obj:`int`): period of Q target network updates 41 | 42 | MODEL (:obj:`string`): type of the model. 43 | 44 | DOUBLE (:obj:`bool`): Type of Q function computation. 
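Example (illustrative sketch, mirroring the way the agents are built in main.py; all other hyper-parameters are left at the defaults listed above):

    agent = Agent(MODEL='dqn', DOUBLE=False)   # convolutional DQN
    agent = Agent(MODEL='dqn', DOUBLE=True)    # Double DQN
    agent = Agent(MODEL='ddqn', DOUBLE=True)   # Dueling Double DQN
    cumulative_reward = agent.train(env, path='', num_episodes=40)
    cum_return, reward_list = agent.test(env_test, model_name=None, path=None)
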
45 | """ 46 | 47 | def __init__(self, 48 | REPLAY_MEM_SIZE=10000, 49 | BATCH_SIZE=40, 50 | GAMMA=0.98, 51 | EPS_START=1, 52 | EPS_END=0.12, 53 | EPS_STEPS=300, 54 | LEARNING_RATE=0.001, 55 | INPUT_DIM=24, 56 | HIDDEN_DIM=120, 57 | ACTION_NUMBER=3, 58 | TARGET_UPDATE=10, 59 | MODEL='ddqn', 60 | DOUBLE=True): 61 | 62 | self.REPLAY_MEM_SIZE = REPLAY_MEM_SIZE 63 | self.BATCH_SIZE = BATCH_SIZE 64 | self.GAMMA = GAMMA 65 | self.EPS_START = EPS_START 66 | self.EPS_END = EPS_END 67 | self.EPS_STEPS = EPS_STEPS 68 | self.LEARNING_RATE = LEARNING_RATE 69 | self.INPUT_DIM = INPUT_DIM 70 | self.HIDDEN_DIM = HIDDEN_DIM 71 | self.ACTION_NUMBER = ACTION_NUMBER 72 | self.TARGET_UPDATE = TARGET_UPDATE 73 | self.MODEL = MODEL # deep q network (dqn) or Dueling deep q network (ddqn) 74 | self.DOUBLE = DOUBLE # to understand if use or do not use a 'Double' model (regularization) 75 | self.TRAINING = True # to do not pick random actions during testing 76 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 77 | print("Agent is using device:\t" + str(self.device)) 78 | '''elif self.MODEL == 'lin_ddqn': 79 | self.policy_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) 80 | self.target_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) 81 | elif self.MODEL == 'lin_dqn': 82 | self.policy_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) 83 | self.target_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device) 84 | ''' 85 | 86 | if self.MODEL == 'ddqn': 87 | self.policy_net = ConvDuelingDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) 88 | self.target_net = ConvDuelingDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) 89 | elif self.MODEL == 'dqn': 90 | self.policy_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) 91 | self.target_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) 92 | 93 | self.target_net.load_state_dict(self.policy_net.state_dict()) 94 | self.target_net.eval() 95 | 96 | self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.LEARNING_RATE) 97 | self.memory = ReplayMemory(self.REPLAY_MEM_SIZE) 98 | self.steps_done = 0 99 | self.training_cumulative_reward = [] 100 | 101 | def select_action(self, state): 102 | """ the epsilon-greedy action selection""" 103 | state = state.unsqueeze(0).unsqueeze(1) 104 | sample = random.random() 105 | if self.TRAINING: 106 | if self.steps_done > self.EPS_STEPS: 107 | eps_threshold = self.EPS_END 108 | else: 109 | eps_threshold = self.EPS_START 110 | else: 111 | eps_threshold = self.EPS_END 112 | 113 | self.steps_done += 1 114 | # [Exploitation] pick the best action according to current Q approx. 
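# NOTE: the schedule above is a step function rather than a gradual decay:
# the threshold stays at EPS_START until EPS_STEPS steps have been taken
# (steps_done is reset at the start of every training episode) and then drops
# to EPS_END; at test time it is fixed at EPS_END. A minimal standalone sketch
# of the same rule with the class defaults (illustration only; the helper name
# below is hypothetical and not used elsewhere in this file):
def _eps_threshold_sketch(steps_done, training, eps_start=1.0, eps_end=0.12, eps_steps=300):
    if not training:
        return eps_end
    return eps_start if steps_done <= eps_steps else eps_end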
115 | if sample > eps_threshold: 116 | with torch.no_grad(): 117 | # Return the number of the action with highest non normalized probability 118 | # TODO: decide if diverge from paper and normalize probabilities with 119 | # softmax or at least compare the architectures 120 | return torch.tensor([self.policy_net(state).argmax()], device=self.device, dtype=torch.long) 121 | 122 | # [Exploration] pick a random action from the action space 123 | else: 124 | return torch.tensor([random.randrange(self.ACTION_NUMBER)], device=self.device, dtype=torch.long) 125 | 126 | def optimize_model(self): 127 | if len(self.memory) < self.BATCH_SIZE: 128 | # it will return without doing nothing if we have not enough data to sample 129 | return 130 | transitions = self.memory.sample(self.BATCH_SIZE) 131 | # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for 132 | # detailed explanation). This converts batch-array of Transitions 133 | # to Transition of batch-arrays. 134 | # Transition is the named tuple defined above. 135 | batch = Transition(*zip(*transitions)) 136 | 137 | # Compute a mask of non-final states and concatenate the batch elements 138 | # (a final state would've been the one after which simulation ended) 139 | # 140 | # non_final_mask is a column vector telling wich state of the sampled is final 141 | # non_final_next_states contains all the non-final states sampled 142 | non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=self.device, 143 | dtype=torch.bool) 144 | nfns = [s for s in batch.next_state if s is not None] 145 | non_final_next_states = torch.cat(nfns).view(len(nfns), -1) 146 | non_final_next_states = non_final_next_states.unsqueeze(1) 147 | 148 | state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1) 149 | state_batch = state_batch.unsqueeze(1) 150 | action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1) 151 | reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1) 152 | 153 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 154 | # columns of actions taken. These are the actions which would've been taken 155 | # for each batch state according to policy_net 156 | state_action_values = self.policy_net(state_batch).gather(1, action_batch) 157 | 158 | # Compute V(s_{t+1}) for all next states. 159 | # Expected values of actions for non_final_next_states are computed based 160 | # on the "older" target_net; selecting their best reward with max(1)[0]. 161 | # This is merged based on the mask, such that we'll have either the expected 162 | # state value or 0 in case the state was final. 
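# In equation form, the code below regresses Q_policy(s_t, a_t) towards the
# standard DQN target
#     y_t = r_t + GAMMA * max_a Q_target(s_{t+1}, a)   if s_{t+1} is not final
#     y_t = r_t                                        otherwise
# using an MSE loss, with gradients clipped element-wise to [-1, 1].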
163 | # detach removes the tensor from the graph -> no gradient computation is 164 | # required 165 | next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device) 166 | next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach() 167 | next_state_values = next_state_values.view(self.BATCH_SIZE, -1) 168 | 169 | # Compute the expected Q values 170 | expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch 171 | # print("expected_state_action_values.shape:\t%s"%str(expected_state_action_values.shape)) 172 | 173 | # Compute MSE loss 174 | loss = F.mse_loss(state_action_values, 175 | expected_state_action_values) # expected_state_action_values.unsqueeze(1) 176 | 177 | # Optimize the model 178 | self.optimizer.zero_grad() 179 | loss.backward() 180 | for param in self.policy_net.parameters(): 181 | param.grad.data.clamp_(-1, 1) 182 | self.optimizer.step() 183 | 184 | def optimize_double_dqn_model(self): 185 | if len(self.memory) < self.BATCH_SIZE: 186 | # it will return without doing nothing if we have not enough data to sample 187 | return 188 | transitions = self.memory.sample(self.BATCH_SIZE) 189 | # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for 190 | # detailed explanation). This converts batch-array of Transitions 191 | # to Transition of batch-arrays. 192 | # Transition is the named tuple defined above. 193 | batch = Transition(*zip(*transitions)) 194 | 195 | # Compute a mask of non-final states and concatenate the batch elements 196 | # (a final state would've been the one after which simulation ended) 197 | # 198 | # non_final_mask is a column vector telling wich state of the sampled is final 199 | # non_final_next_states contains all the non-final states sampled 200 | non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=self.device, 201 | dtype=torch.bool) 202 | nfns = [s for s in batch.next_state if s is not None] 203 | non_final_next_states = torch.cat(nfns).view(len(nfns), -1) 204 | non_final_next_states = non_final_next_states.unsqueeze(1) 205 | 206 | state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1) 207 | state_batch = state_batch.unsqueeze(1) 208 | action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1) 209 | reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1) 210 | # print("state_batch shape: %s\nstate_batch[0]:%s\nactionbatch shape: %s\nreward_batch shape: %s"%(str(state_batch.view(40,-1).shape),str(state_batch.view(40,-1)[0]),str(action_batch.shape),str(reward_batch.shape))) 211 | 212 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 213 | # columns of actions taken. These are the actions which would've been taken 214 | # for each batch state according to policy_net 215 | state_action_values = self.policy_net(state_batch).gather(1, action_batch) 216 | 217 | # ---------- D-DQN Extra Line--------------- 218 | _, next_state_action = self.policy_net(state_batch).max(1, keepdim=True) 219 | 220 | # Compute V(s_{t+1}) for all next states. 221 | # Expected values of actions for non_final_next_states are computed based 222 | # on the actions given by policynet. 223 | # This is merged based on the mask, such that we'll have either the expected 224 | # state value or 0 in case the state was final. 
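# In equation form, the Double DQN target decouples action selection from
# action evaluation: the action index comes from policy_net and its value from
# target_net,
#     y_t = r_t + GAMMA * Q_target(s_{t+1}, argmax_a Q_policy(., a))
# Note that the argmax computed above uses Q_policy(state_batch), i.e. the
# current states; the original Double DQN formulation takes the argmax over
# the next states instead.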
225 | # detach removes the tensor from the graph -> no gradient computation is 226 | # required 227 | next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device).view(self.BATCH_SIZE, -1) 228 | 229 | out = self.target_net(non_final_next_states) 230 | next_state_values[non_final_mask] = out.gather(1, next_state_action[non_final_mask]) 231 | # next_state_values = next_state_values.view(self.BATCH_SIZE, -1) 232 | # Compute the expected Q values 233 | expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch 234 | 235 | # Compute MSE loss 236 | loss = F.mse_loss(state_action_values, expected_state_action_values) 237 | 238 | # Optimize the model 239 | self.optimizer.zero_grad() 240 | loss.backward() 241 | for param in self.policy_net.parameters(): 242 | param.grad.data.clamp_(-1, 1) 243 | self.optimizer.step() 244 | 245 | def train(self, env, path, num_episodes=40): 246 | self.TRAINING = True 247 | cumulative_reward = [0 for t in range(num_episodes)] 248 | print("Training:") 249 | for i_episode in tqdm(range(num_episodes)): 250 | # Initialize the environment and state 251 | env.reset() # reset the env st it is set at the beginning of the time serie 252 | self.steps_done = 0 253 | state = env.get_state() 254 | for t in range(len(env.data)): # while not env.done 255 | 256 | # Select and perform an action 257 | action = self.select_action(state) 258 | reward, done, _ = env.step(action) 259 | 260 | cumulative_reward[i_episode] += reward.item() 261 | 262 | # Observe new state: it will be None if env.done = True. It is the next 263 | # state since env.step() has been called two rows above. 264 | next_state = env.get_state() 265 | 266 | # Store the transition in memory 267 | self.memory.push(state, action, next_state, reward) 268 | 269 | # Move to the next state 270 | state = next_state 271 | 272 | # Perform one step of the optimization (on the policy network): note that 273 | # it will return without doing nothing if we have not enough data to sample 274 | 275 | if self.DOUBLE: 276 | self.optimize_double_dqn_model() 277 | else: 278 | self.optimize_model() 279 | 280 | if done: 281 | break 282 | 283 | # Update the target network, copying all weights and biases of policy_net 284 | if i_episode % self.TARGET_UPDATE == 0: 285 | self.target_net.load_state_dict(self.policy_net.state_dict()) 286 | 287 | # save the model 288 | if self.DOUBLE: 289 | model_name = env.reward_f + '_reward_double_' + self.MODEL + '_model' 290 | count = 0 291 | while os.path.exists(path + model_name): # avoid overrinding models 292 | count += 1 293 | model_name = model_name + "_" + str(count) 294 | 295 | else: 296 | model_name = env.reward_f + '_reward_' + self.MODEL + '_model' 297 | count = 0 298 | while os.path.exists(path + model_name): # avoid overrinding models 299 | count += 1 300 | model_name = model_name + "_" + str(count) 301 | 302 | torch.save(self.policy_net.state_dict(), path + model_name) 303 | 304 | return cumulative_reward 305 | 306 | def test(self, env_test, model_name=None, path=None): 307 | self.TRAINING = False 308 | cumulative_reward = [0 for t in range(len(env_test.data))] 309 | reward_list = [0 for t in range(len(env_test.data))] 310 | 311 | if model_name is None: 312 | pass 313 | elif path is not None: 314 | if re.match(".*_dqn_.*", model_name): 315 | self.policy_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) 316 | if str(self.device) == "cuda": 317 | self.policy_net.load_state_dict(torch.load(path + model_name)) 318 | else: 319 | 
self.policy_net.load_state_dict(torch.load(path + model_name, map_location=torch.device('cpu'))) 320 | elif re.match(".*_ddqn_.*", model_name): 321 | self.policy_net = ConvDuelingDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device) 322 | if str(self.device) == "cuda": 323 | self.policy_net.load_state_dict(torch.load(path + model_name)) 324 | else: 325 | self.policy_net.load_state_dict(torch.load(path + model_name, map_location=torch.device('cpu'))) 326 | else: 327 | raise RuntimeError("Please Provide a valid model name or valid path.") 328 | else: 329 | raise RuntimeError('Path can not be None if model Name is not None.') 330 | 331 | env_test.reset() # reset the env st it is set at the beginning of the time serie 332 | state = env_test.get_state() 333 | for t in tqdm(range(len(env_test.data))): # while not env.done 334 | 335 | # Select and perform an action 336 | action = self.select_action(state) 337 | 338 | reward, done, _ = env_test.step(action) 339 | 340 | cumulative_reward[t] += reward.item() + cumulative_reward[t - 1 if t - 1 > 0 else 0] 341 | reward_list[t] = reward 342 | 343 | # Observe new state: it will be None if env.done = True. It is the next 344 | # state since env.step() has been called two rows above. 345 | next_state = env_test.get_state() 346 | 347 | # Move to the next state 348 | state = next_state 349 | 350 | if done: 351 | break 352 | 353 | return cumulative_reward, reward_list 354 | -------------------------------------------------------------------------------- /Bot_code_and_models/Environment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | # TODO: modify the reward st. we can choose between sharpe ratio reward or profit 6 | # reward as shown in the paper. 7 | class Environment: 8 | """Definition of the trading environment for the DQN-Agent. 9 | 10 | Attributes: 11 | data (pandas.DataFrame): Time serie to be considered within the environment. 12 | 13 | t (:obj:`int`): Current time instant we are considering. 14 | 15 | profits (:obj:`float`): profit of the agent at time self.t 16 | 17 | agent_positions(:obj:`list` :obj:`float`): list of the positions 18 | currently owned by the agent. 19 | 20 | agent_position_value(:obj:`float`): current value of open positions 21 | (positions in self.agent_positions) 22 | 23 | cumulative_return(:obj:`list` :obj:`float`): econometric measure of profit 24 | during time 25 | 26 | init_price(:obj:`float`): the price of stocks at the beginning of trading 27 | period. 28 | """ 29 | 30 | def __init__(self, data, reward): 31 | """ 32 | Creates the environment. Note: Before using the environment you must call 33 | the Environment.reset() method. 34 | 35 | Args: 36 | data (:obj:`pd.DataFrane`): Time serie to be initialize the environment. 37 | reward (:obj:`str`): Type of reward function to use, either sharpe ratio 38 | "sr" or profit function "profit" 39 | """ 40 | self.data = data 41 | self.reward_f = reward if reward == "sr" else "profit" 42 | self.reset() 43 | 44 | def reset(self): 45 | """ 46 | Reset the environment or makes a further step of initialization if called 47 | on an environment never used before. It must always be called before .step() 48 | method to avoid errors. 
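Example (illustrative sketch, mirroring the usage in main.py):

    env = Environment(df[index:index + train_size], "profit")
    env.reset()
    state = env.get_state()        # tensor with the last 24 hourly Close prices
    reward, done, _ = env.step(1)  # act: 0 = stay, 1 = buy, 2 = sell
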
49 | """ 50 | self.t = 23 51 | self.done = False 52 | self.profits = [0 for e in range(len(self.data))] 53 | self.agent_positions = [] 54 | self.agent_open_position_value = 0 55 | 56 | self.cumulative_return = [0 for e in range(len(self.data))] 57 | self.init_price = self.data.iloc[0, :]['Close'] 58 | 59 | def get_state(self): 60 | """ 61 | Return the current state of the environment. NOTE: if called after 62 | Environment.step() it will return the next state. 63 | """ 64 | 65 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 66 | if not self.done: 67 | return torch.tensor([el for el in self.data.iloc[self.t - 23:self.t + 1, :]['Close']], device=device, 68 | dtype=torch.float) 69 | else: 70 | return None 71 | 72 | def step(self, act): 73 | """ 74 | Perform the action of the Agent on the environment, computes the reward 75 | and update some datastructures to keep track of some econometric indexes 76 | during time. 77 | 78 | Args: 79 | act (:obj:`int`): Action to be performed on the environment. 80 | 81 | Returns: 82 | reward (:obj:`torch.tensor` :dtype:`torch.float`): the reward of 83 | performing the action on the current env state. 84 | self.done (:obj:`bool`): A boolean flag telling if we are in a final 85 | state 86 | current_state (:obj:`torch.tensor` :dtype:`torch.float`): 87 | the state of the environment after the action execution. 88 | """ 89 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 90 | 91 | reward = 0 92 | # GET CURRENT STATE 93 | state = self.data.iloc[self.t, :]['Close'] 94 | 95 | # EXECUTE THE ACTION (act = 0: stay, 1: buy, 2: sell) 96 | if act == 0: # Do Nothing 97 | pass 98 | 99 | if act == 1: # Buy 100 | self.agent_positions.append(self.data.iloc[self.t, :]['Close']) 101 | 102 | sell_nothing = False 103 | if act == 2: # Sell 104 | profits = 0 105 | if len(self.agent_positions) < 1: 106 | sell_nothing = True 107 | for position in self.agent_positions: 108 | profits += (self.data.iloc[self.t, :][ 109 | 'Close'] - position) # profit = close - my_position for each my_position "p" 110 | 111 | self.profits[self.t] = profits 112 | self.agent_positions = [] 113 | # reward += profits 114 | 115 | self.agent_open_position_value = 0 116 | for position in self.agent_positions: 117 | self.agent_open_position_value += (self.data.iloc[self.t, :]['Close'] - position) 118 | # TO CHECK if the calculus is correct according to the definition 119 | self.cumulative_return[self.t] += (position - self.init_price) / self.init_price 120 | 121 | # COLLECT THE REWARD 122 | reward = 0 123 | if self.reward_f == "sr": 124 | sr = self.agent_open_position_value / np.std(np.array(self.data.iloc[0:self.t]['Close'])) if np.std( 125 | np.array(self.data.iloc[0:self.t]['Close'])) != 0 else 0 126 | # sr = self.profits[self.t] / np.std(np.array(self.profits)) 127 | if sr <= -4: 128 | reward = -10 129 | elif sr < -1: 130 | reward = -4 131 | elif sr < 0: 132 | reward = -1 133 | elif sr == 0: 134 | reward = 0 135 | elif sr <= 1: 136 | reward = 1 137 | elif sr < 4: 138 | reward = 4 139 | else: 140 | reward = 10 141 | 142 | if self.reward_f == "profit": 143 | p = self.profits[self.t] 144 | if p > 0: 145 | reward = 1 146 | elif p < 0: 147 | reward = -1 148 | elif p == 0: 149 | reward = 0 150 | 151 | if sell_nothing and (reward > -5): 152 | reward = -5 153 | 154 | # UPDATE THE STATE 155 | self.t += 1 156 | 157 | if (self.t == len(self.data) - 1): 158 | self.done = True 159 | 160 | return torch.tensor([reward], device=device, dtype=torch.float), self.done, 
torch.tensor([state], 161 | dtype=torch.float) # reward, done, current_state -------------------------------------------------------------------------------- /Bot_code_and_models/Readme.md: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | Dependecies: 3 | * prettytable (`pip install prettytable`) 4 | * PyTorch 5 | * tqdm 6 | * pandas 7 | * numpy 8 | * matplotlib 9 | 10 | To Run Pretrained Models: 11 | * `python main.py` 12 | 13 | To Run Train and Test: 14 | * `python train_test.py` 15 | 16 | -------------------------------------------------------------------------------- /Bot_code_and_models/main.py: -------------------------------------------------------------------------------- 1 | from prettytable import PrettyTable as PrettyTable 2 | from utils import load_data, print_stats, plot_multiple_conf_interval 3 | import random 4 | import os 5 | #from google.colab import drive 6 | #drive.mount('/content/drive') 7 | from Environment import Environment 8 | from Agent import Agent 9 | 10 | 11 | 12 | def main(): 13 | #----------------------------- LOAD DATA --------------------------------------------------------------------------- 14 | path = '' 15 | df = load_data(path) 16 | 17 | 18 | # ----------------------------- AGENTS COMPARISON -------------------------------- 19 | REPLAY_MEM_SIZE = 10000 20 | BATCH_SIZE = 40 21 | GAMMA = 0.98 22 | EPS_START = 1 23 | EPS_END = 0.12 24 | EPS_STEPS = 300 25 | LEARNING_RATE = 0.001 26 | INPUT_DIM = 24 27 | HIDDEN_DIM = 120 28 | ACTION_NUMBER = 3 29 | TARGET_UPDATE = 10 30 | N_TEST = 10 31 | TRADING_PERIOD = 4000 32 | index = random.randrange(len(df) - TRADING_PERIOD - 1) 33 | 34 | dqn_agent = Agent(REPLAY_MEM_SIZE, 35 | BATCH_SIZE, 36 | GAMMA, 37 | EPS_START, 38 | EPS_END, 39 | EPS_STEPS, 40 | LEARNING_RATE, 41 | INPUT_DIM, 42 | HIDDEN_DIM, 43 | ACTION_NUMBER, 44 | TARGET_UPDATE, 45 | MODEL='dqn', 46 | DOUBLE=False) 47 | 48 | double_dqn_agent = Agent(REPLAY_MEM_SIZE, 49 | BATCH_SIZE, 50 | GAMMA, 51 | EPS_START, 52 | EPS_END, 53 | EPS_STEPS, 54 | LEARNING_RATE, 55 | INPUT_DIM, 56 | HIDDEN_DIM, 57 | ACTION_NUMBER, 58 | TARGET_UPDATE, 59 | MODEL='dqn', 60 | DOUBLE=True) 61 | 62 | dueling_double_dqn_agent = Agent(REPLAY_MEM_SIZE, 63 | BATCH_SIZE, 64 | GAMMA, 65 | EPS_START, 66 | EPS_END, 67 | EPS_STEPS, 68 | LEARNING_RATE, 69 | INPUT_DIM, 70 | HIDDEN_DIM, 71 | ACTION_NUMBER, 72 | TARGET_UPDATE, 73 | MODEL='ddqn', 74 | DOUBLE=True) 75 | 76 | train_size = int(TRADING_PERIOD * 0.8) 77 | profit_dqn_return = [] 78 | sharpe_dqn_return = [] 79 | profit_ddqn_return = [] 80 | sharpe_ddqn_return = [] 81 | profit_dueling_ddqn_return = [] 82 | sharpe_dueling_ddqn_return = [] 83 | 84 | #profit_train_env = Environment(df[index:index + train_size], "profit") 85 | sharpe_train_env = Environment(df[index:index + train_size], "sr") 86 | 87 | # ProfitDQN 88 | #cr_profit_dqn = dqn_agent.train(profit_train_env, path) 89 | #profit_train_env.reset() 90 | 91 | # Profit Double DQN 92 | #cr_profit_ddqn = double_dqn_agent.train(profit_train_env, path) 93 | #profit_train_env.reset() 94 | 95 | # Profit Dueling Double DQN 96 | #cr_profit_dueling_ddqn = dueling_double_dqn_agent.train(profit_train_env, path) 97 | #profit_train_env.reset() 98 | 99 | i = 0 100 | while i < N_TEST: 101 | print("Test nr. 
%s" % str(i+1)) 102 | index = random.randrange(len(df) - TRADING_PERIOD - 1) 103 | 104 | profit_test_env = Environment(df[index + train_size:index + TRADING_PERIOD], "profit") 105 | 106 | # ProfitDQN 107 | cr_profit_dqn_test, _ = dqn_agent.test(profit_test_env, model_name="profit_reward_dqn_model" , path=path) 108 | profit_dqn_return.append(profit_test_env.cumulative_return) 109 | profit_test_env.reset() 110 | 111 | # Profit Double DQN 112 | cr_profit_ddqn_test, _ = double_dqn_agent.test(profit_test_env, model_name="profit_reward_double_dqn_model" , path=path) 113 | profit_ddqn_return.append(profit_test_env.cumulative_return) 114 | profit_test_env.reset() 115 | 116 | # Profit Dueling Double DQN 117 | cr_profit_dueling_ddqn_test, _ = dueling_double_dqn_agent.test(profit_test_env, model_name="profit_reward_double_ddqn_model" , path=path) 118 | profit_dueling_ddqn_return.append(profit_test_env.cumulative_return) 119 | profit_test_env.reset() 120 | 121 | i += 1 122 | 123 | dqn_agent = Agent(REPLAY_MEM_SIZE, 124 | BATCH_SIZE, 125 | GAMMA, 126 | EPS_START, 127 | EPS_END, 128 | EPS_STEPS, 129 | LEARNING_RATE, 130 | INPUT_DIM, 131 | HIDDEN_DIM, 132 | ACTION_NUMBER, 133 | TARGET_UPDATE, 134 | MODEL='dqn', 135 | DOUBLE=False) 136 | 137 | double_dqn_agent = Agent(REPLAY_MEM_SIZE, 138 | BATCH_SIZE, 139 | GAMMA, 140 | EPS_START, 141 | EPS_END, 142 | EPS_STEPS, 143 | LEARNING_RATE, 144 | INPUT_DIM, 145 | HIDDEN_DIM, 146 | ACTION_NUMBER, 147 | TARGET_UPDATE, 148 | MODEL='dqn', 149 | DOUBLE=True) 150 | 151 | dueling_double_dqn_agent = Agent(REPLAY_MEM_SIZE, 152 | BATCH_SIZE, 153 | GAMMA, 154 | EPS_START, 155 | EPS_END, 156 | EPS_STEPS, 157 | LEARNING_RATE, 158 | INPUT_DIM, 159 | HIDDEN_DIM, 160 | ACTION_NUMBER, 161 | TARGET_UPDATE, 162 | MODEL='ddqn', 163 | DOUBLE=True) 164 | 165 | # SharpeDQN 166 | #cr_sharpe_dqn = dqn_agent.train(sharpe_train_env, path) 167 | #sharpe_train_env.reset() 168 | 169 | # Sharpe Double DQN 170 | #cr_sharpe_ddqn = double_dqn_agent.train(sharpe_train_env, path) 171 | #sharpe_train_env.reset() 172 | 173 | # Sharpe Dueling Double DQN 174 | #cr_sharpe_dueling_ddqn = dueling_double_dqn_agent.train(sharpe_train_env, path) 175 | #sharpe_train_env.reset() 176 | 177 | i = 0 178 | while i < N_TEST: 179 | print("Test nr. %s"%str(i+1)) 180 | index = random.randrange(len(df) - TRADING_PERIOD - 1) 181 | 182 | sharpe_test_env = Environment(df[index + train_size:index + TRADING_PERIOD], "sr") 183 | 184 | # SharpeDQN 185 | cr_sharpe_dqn_test, _ = dqn_agent.test(sharpe_test_env, model_name="sr_reward_dqn_model", path=path) 186 | sharpe_dqn_return.append(sharpe_test_env.cumulative_return) 187 | sharpe_test_env.reset() 188 | 189 | # Sharpe Double DQN 190 | cr_sharpe_ddqn_test, _ = double_dqn_agent.test(sharpe_test_env, model_name="sr_reward_double_dqn_model" , path=path) 191 | sharpe_ddqn_return.append(sharpe_test_env.cumulative_return) 192 | sharpe_test_env.reset() 193 | 194 | # Sharpe Dueling Double DQN 195 | cr_sharpe_dueling_ddqn_test, _ = dueling_double_dqn_agent.test(sharpe_test_env, model_name="sr_reward_double_ddqn_model" , path=path) 196 | sharpe_dueling_ddqn_return.append(sharpe_test_env.cumulative_return) 197 | sharpe_test_env.reset() 198 | 199 | i += 1 200 | 201 | #--------------------------------------- Print Test Stats --------------------------------------------------------- 202 | t = PrettyTable(["Trading System", "Avg. Return (%)", "Max Return (%)", "Min Return (%)", "Std. 
Dev."]) 203 | print_stats("ProfitDQN", profit_dqn_return, t) 204 | print_stats("SharpeDQN", sharpe_dqn_return, t) 205 | print_stats("ProfitDDQN", profit_ddqn_return, t) 206 | print_stats("SharpeDDQN", sharpe_ddqn_return, t) 207 | print_stats("ProfitD-DDQN", profit_dueling_ddqn_return, t) 208 | print_stats("SharpeD-DDQN", sharpe_dueling_ddqn_return, t) 209 | 210 | print(t) 211 | plot_multiple_conf_interval(["ProfitDQN", "SharpeDQN", "ProfitDDQN","SharpeDDQN","ProfitD-DDQN","SharpeD-DDQN"], 212 | [profit_dqn_return,sharpe_dqn_return,profit_ddqn_return,sharpe_ddqn_return, 213 | profit_dueling_ddqn_return,sharpe_dueling_ddqn_return]) 214 | 215 | 216 | 217 | if __name__ == "__main__": 218 | main() -------------------------------------------------------------------------------- /Bot_code_and_models/models.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | # Definition of the netwroks 4 | class DQN(nn.Module): 5 | # Deep Q Network 6 | def __init__(self, obs_len, hidden_size, actions_n): 7 | super(DQN, self).__init__() 8 | # we might want Conv1d ? 9 | self.fc_val = nn.Sequential( 10 | nn.Linear(obs_len, hidden_size), 11 | nn.LeakyReLU(), 12 | nn.Linear(hidden_size, hidden_size), 13 | nn.LeakyReLU(), 14 | nn.Linear(hidden_size, actions_n) 15 | ) 16 | 17 | def forward(self, x): 18 | h = self.fc_val(x) 19 | return h 20 | 21 | 22 | 23 | class DuelingDQN(nn.Module): 24 | # Linear Dueling Deep Q Network 25 | def __init__(self, obs_len, hidden_size, actions_n): 26 | super(DuelingDQN, self).__init__() 27 | 28 | self.feauture_layer = nn.Sequential( 29 | nn.Linear(obs_len, hidden_size), 30 | nn.LeakyReLU(), 31 | nn.Linear(hidden_size, hidden_size), 32 | nn.LeakyReLU(), 33 | ) 34 | 35 | self.value_stream = nn.Sequential( 36 | nn.Linear(hidden_size, hidden_size), 37 | nn.LeakyReLU(), 38 | nn.Linear(hidden_size, 1), 39 | ) 40 | 41 | self.advantage_stream = nn.Sequential( 42 | nn.Linear(hidden_size, hidden_size), 43 | nn.LeakyReLU(), 44 | nn.Linear(hidden_size, actions_n) 45 | ) 46 | 47 | def forward(self, state): 48 | features = self.feauture_layer(state) 49 | values = self.value_stream(features) 50 | advantages = self.advantage_stream(features) 51 | qvals = values + (advantages - advantages.mean()) 52 | 53 | return qvals 54 | 55 | 56 | # Convolutional DQN 57 | class ConvDQN(nn.Module): 58 | def __init__(self, seq_len_in, actions_n, kernel_size=8): 59 | super(ConvDQN, self).__init__() 60 | n_filters = 64 61 | max_pool_kernel = 2 62 | self.conv1 = nn.Conv1d(1, n_filters, kernel_size) 63 | self.maxPool = nn.MaxPool1d(max_pool_kernel, stride=1) 64 | self.LRelu = nn.LeakyReLU() 65 | self.conv2 = nn.Conv1d(n_filters, n_filters, kernel_size // 2) 66 | 67 | self.hidden_dim = n_filters * (((( 68 | seq_len_in - kernel_size + 1) - max_pool_kernel + 1) - kernel_size // 2 + 1) - max_pool_kernel + 1) 69 | 70 | self.out_layer = nn.Linear(self.hidden_dim, actions_n) 71 | 72 | def forward(self, x): 73 | c1_out = self.conv1(x) 74 | max_pool_1 = self.maxPool(self.LRelu(c1_out)) 75 | c2_out = self.conv2(max_pool_1) 76 | max_pool_2 = self.maxPool(self.LRelu(c2_out)) 77 | # print("c1_out:\t%s"%str(c1_out.shape)) 78 | # print("max_pool_1:\t%s"%str(max_pool_1.shape)) 79 | # print("c2_out:\t%s"%str(c2_out.shape)) 80 | # print("max_pool_2:\t%s"%str(max_pool_2.shape)) 81 | 82 | max_pool_2 = max_pool_2.view(-1, self.hidden_dim) 83 | # print("max_pool_2_view:\t%s"%str(max_pool_2.shape)) 84 | 85 | return self.LRelu(self.out_layer(max_pool_2)) 86 | 87 | 88 | # Convolutional 
Dueling DQN 89 | class ConvDuelingDQN(nn.Module): 90 | def __init__(self, seq_len_in, actions_n, kernel_size=8): 91 | super(ConvDuelingDQN, self).__init__() 92 | n_filters = 64 93 | max_pool_kernel = 2 94 | self.conv1 = nn.Conv1d(1, n_filters, kernel_size) 95 | self.maxPool = nn.MaxPool1d(max_pool_kernel, stride=1) 96 | self.LRelu = nn.LeakyReLU() 97 | self.conv2 = nn.Conv1d(n_filters, n_filters, kernel_size // 2) 98 | self.hidden_dim = n_filters * (((( 99 | seq_len_in - kernel_size + 1) - max_pool_kernel + 1) - kernel_size // 2 + 1) - max_pool_kernel + 1) 100 | paper_hidden_dim = 120 101 | self.split_layer = nn.Linear(self.hidden_dim, paper_hidden_dim) 102 | 103 | self.value_stream = nn.Sequential( 104 | nn.Linear(paper_hidden_dim, paper_hidden_dim), 105 | nn.LeakyReLU(), 106 | nn.Linear(paper_hidden_dim, 1), 107 | ) 108 | 109 | self.advantage_stream = nn.Sequential( 110 | nn.Linear(paper_hidden_dim, paper_hidden_dim), 111 | nn.LeakyReLU(), 112 | nn.Linear(paper_hidden_dim, actions_n) 113 | ) 114 | 115 | def forward(self, x): 116 | c1_out = self.conv1(x) 117 | max_pool_1 = self.maxPool(self.LRelu(c1_out)) 118 | c2_out = self.conv2(max_pool_1) 119 | max_pool_2 = self.maxPool(self.LRelu(c2_out)) 120 | # DEBUG code: 121 | # print("c1_out:\t%s"%str(c1_out.shape)) 122 | # print("max_pool_1:\t%s"%str(max_pool_1.shape)) 123 | # print("c2_out:\t%s"%str(c2_out.shape)) 124 | # print("max_pool_2:\t%s"%str(max_pool_2.shape)) 125 | 126 | max_pool_2 = max_pool_2.view(-1, self.hidden_dim) 127 | # print("max_pool_2_view:\t%s"%str(max_pool_2.shape)) 128 | 129 | split = self.split_layer(max_pool_2) 130 | values = self.value_stream(split) 131 | advantages = self.advantage_stream(split) 132 | qvals = values + (advantages - advantages.mean()) 133 | return qvals 134 | 135 | -------------------------------------------------------------------------------- /Bot_code_and_models/profit_reward_double_ddqn_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nicoDs96/Trading-Bot---Deep-Reinforcement-Learning/818fafc5e836e85011889ee04c5b8b1b804c5b22/Bot_code_and_models/profit_reward_double_ddqn_model -------------------------------------------------------------------------------- /Bot_code_and_models/profit_reward_double_dqn_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nicoDs96/Trading-Bot---Deep-Reinforcement-Learning/818fafc5e836e85011889ee04c5b8b1b804c5b22/Bot_code_and_models/profit_reward_double_dqn_model -------------------------------------------------------------------------------- /Bot_code_and_models/profit_reward_dqn_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nicoDs96/Trading-Bot---Deep-Reinforcement-Learning/818fafc5e836e85011889ee04c5b8b1b804c5b22/Bot_code_and_models/profit_reward_dqn_model -------------------------------------------------------------------------------- /Bot_code_and_models/sr_reward_double_ddqn_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nicoDs96/Trading-Bot---Deep-Reinforcement-Learning/818fafc5e836e85011889ee04c5b8b1b804c5b22/Bot_code_and_models/sr_reward_double_ddqn_model -------------------------------------------------------------------------------- /Bot_code_and_models/sr_reward_double_dqn_model: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nicoDs96/Trading-Bot---Deep-Reinforcement-Learning/818fafc5e836e85011889ee04c5b8b1b804c5b22/Bot_code_and_models/sr_reward_double_dqn_model -------------------------------------------------------------------------------- /Bot_code_and_models/sr_reward_dqn_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nicoDs96/Trading-Bot---Deep-Reinforcement-Learning/818fafc5e836e85011889ee04c5b8b1b804c5b22/Bot_code_and_models/sr_reward_dqn_model -------------------------------------------------------------------------------- /Bot_code_and_models/train_test.py: -------------------------------------------------------------------------------- 1 | from prettytable import PrettyTable as PrettyTable 2 | from utils import load_data, print_stats, plot_multiple_conf_interval 3 | import random 4 | import warnings 5 | # from google.colab import drive 6 | # drive.mount('/content/drive') 7 | from Environment import Environment 8 | from Agent import Agent 9 | 10 | 11 | def main(): 12 | # ----------------------------- LOAD DATA --------------------------------------------------------------------------- 13 | path = '' 14 | df = load_data(path) 15 | 16 | # ----------------------------- AGENTS COMPARISON -------------------------------- 17 | REPLAY_MEM_SIZE = 10000 18 | BATCH_SIZE = 40 19 | GAMMA = 0.98 20 | EPS_START = 1 21 | EPS_END = 0.12 22 | EPS_STEPS = 300 23 | LEARNING_RATE = 0.001 24 | INPUT_DIM = 24 25 | HIDDEN_DIM = 120 26 | ACTION_NUMBER = 3 27 | TARGET_UPDATE = 10 28 | N_TEST = 10 29 | TRADING_PERIOD = 4000 30 | index = random.randrange(len(df) - TRADING_PERIOD - 1) 31 | 32 | dqn_agent = Agent(REPLAY_MEM_SIZE, 33 | BATCH_SIZE, 34 | GAMMA, 35 | EPS_START, 36 | EPS_END, 37 | EPS_STEPS, 38 | LEARNING_RATE, 39 | INPUT_DIM, 40 | HIDDEN_DIM, 41 | ACTION_NUMBER, 42 | TARGET_UPDATE, 43 | MODEL='dqn', 44 | DOUBLE=False) 45 | if str(dqn_agent.device) == "cpu": 46 | warnings.warn("Device is set to CPU. This will lead to a very slow training. Consider to run pretained models by" 47 | "executing main.py script instead of train_test.py!") 48 | 49 | 50 | double_dqn_agent = Agent(REPLAY_MEM_SIZE, 51 | BATCH_SIZE, 52 | GAMMA, 53 | EPS_START, 54 | EPS_END, 55 | EPS_STEPS, 56 | LEARNING_RATE, 57 | INPUT_DIM, 58 | HIDDEN_DIM, 59 | ACTION_NUMBER, 60 | TARGET_UPDATE, 61 | MODEL='dqn', 62 | DOUBLE=True) 63 | 64 | dueling_double_dqn_agent = Agent(REPLAY_MEM_SIZE, 65 | BATCH_SIZE, 66 | GAMMA, 67 | EPS_START, 68 | EPS_END, 69 | EPS_STEPS, 70 | LEARNING_RATE, 71 | INPUT_DIM, 72 | HIDDEN_DIM, 73 | ACTION_NUMBER, 74 | TARGET_UPDATE, 75 | MODEL='ddqn', 76 | DOUBLE=True) 77 | 78 | train_size = int(TRADING_PERIOD * 0.8) 79 | profit_dqn_return = [] 80 | sharpe_dqn_return = [] 81 | profit_ddqn_return = [] 82 | sharpe_ddqn_return = [] 83 | profit_dueling_ddqn_return = [] 84 | sharpe_dueling_ddqn_return = [] 85 | 86 | profit_train_env = Environment(df[index:index + train_size], "profit") 87 | sharpe_train_env = Environment(df[index:index + train_size], "sr") 88 | 89 | # ProfitDQN 90 | cr_profit_dqn = dqn_agent.train(profit_train_env, path) 91 | profit_train_env.reset() 92 | 93 | # Profit Double DQN 94 | cr_profit_ddqn = double_dqn_agent.train(profit_train_env, path) 95 | profit_train_env.reset() 96 | 97 | # Profit Dueling Double DQN 98 | cr_profit_dueling_ddqn = dueling_double_dqn_agent.train(profit_train_env, path) 99 | profit_train_env.reset() 100 | 101 | i = 0 102 | while i < N_TEST: 103 | print("Test nr. 
%s" % str(i + 1)) 104 | index = random.randrange(len(df) - TRADING_PERIOD - 1) 105 | 106 | profit_test_env = Environment(df[index + train_size:index + TRADING_PERIOD], "profit") 107 | 108 | # ProfitDQN 109 | cr_profit_dqn_test, _ = dqn_agent.test(profit_test_env) 110 | profit_dqn_return.append(profit_test_env.cumulative_return) 111 | profit_test_env.reset() 112 | 113 | # Profit Double DQN 114 | cr_profit_ddqn_test, _ = double_dqn_agent.test(profit_test_env) 115 | profit_ddqn_return.append(profit_test_env.cumulative_return) 116 | profit_test_env.reset() 117 | 118 | # Profit Dueling Double DQN 119 | cr_profit_dueling_ddqn_test, _ = dueling_double_dqn_agent.test(profit_test_env) 120 | profit_dueling_ddqn_return.append(profit_test_env.cumulative_return) 121 | profit_test_env.reset() 122 | 123 | i += 1 124 | 125 | dqn_agent = Agent(REPLAY_MEM_SIZE, 126 | BATCH_SIZE, 127 | GAMMA, 128 | EPS_START, 129 | EPS_END, 130 | EPS_STEPS, 131 | LEARNING_RATE, 132 | INPUT_DIM, 133 | HIDDEN_DIM, 134 | ACTION_NUMBER, 135 | TARGET_UPDATE, 136 | MODEL='dqn', 137 | DOUBLE=False) 138 | 139 | double_dqn_agent = Agent(REPLAY_MEM_SIZE, 140 | BATCH_SIZE, 141 | GAMMA, 142 | EPS_START, 143 | EPS_END, 144 | EPS_STEPS, 145 | LEARNING_RATE, 146 | INPUT_DIM, 147 | HIDDEN_DIM, 148 | ACTION_NUMBER, 149 | TARGET_UPDATE, 150 | MODEL='dqn', 151 | DOUBLE=True) 152 | 153 | dueling_double_dqn_agent = Agent(REPLAY_MEM_SIZE, 154 | BATCH_SIZE, 155 | GAMMA, 156 | EPS_START, 157 | EPS_END, 158 | EPS_STEPS, 159 | LEARNING_RATE, 160 | INPUT_DIM, 161 | HIDDEN_DIM, 162 | ACTION_NUMBER, 163 | TARGET_UPDATE, 164 | MODEL='ddqn', 165 | DOUBLE=True) 166 | 167 | # SharpeDQN 168 | cr_sharpe_dqn = dqn_agent.train(sharpe_train_env, path) 169 | sharpe_train_env.reset() 170 | 171 | # Sharpe Double DQN 172 | cr_sharpe_ddqn = double_dqn_agent.train(sharpe_train_env, path) 173 | sharpe_train_env.reset() 174 | 175 | # Sharpe Dueling Double DQN 176 | cr_sharpe_dueling_ddqn = dueling_double_dqn_agent.train(sharpe_train_env, path) 177 | sharpe_train_env.reset() 178 | 179 | i = 0 180 | while i < N_TEST: 181 | print("Test nr. %s" % str(i + 1)) 182 | index = random.randrange(len(df) - TRADING_PERIOD - 1) 183 | 184 | sharpe_test_env = Environment(df[index + train_size:index + TRADING_PERIOD], "sr") 185 | 186 | # SharpeDQN 187 | cr_sharpe_dqn_test, _ = dqn_agent.test(sharpe_test_env) 188 | sharpe_dqn_return.append(sharpe_test_env.cumulative_return) 189 | sharpe_test_env.reset() 190 | 191 | # Sharpe Double DQN 192 | cr_sharpe_ddqn_test, _ = double_dqn_agent.test(sharpe_test_env) 193 | sharpe_ddqn_return.append(sharpe_test_env.cumulative_return) 194 | sharpe_test_env.reset() 195 | 196 | # Sharpe Dueling Double DQN 197 | cr_sharpe_dueling_ddqn_test, _ = dueling_double_dqn_agent.test(sharpe_test_env) 198 | sharpe_dueling_ddqn_return.append(sharpe_test_env.cumulative_return) 199 | sharpe_test_env.reset() 200 | 201 | i += 1 202 | 203 | # --------------------------------------- Print Test Stats --------------------------------------------------------- 204 | t = PrettyTable(["Trading System", "Avg. Return (%)", "Max Return (%)", "Min Return (%)", "Std. 
Dev."]) 205 | print_stats("ProfitDQN", profit_dqn_return, t) 206 | print_stats("SharpeDQN", sharpe_dqn_return, t) 207 | print_stats("ProfitDDQN", profit_ddqn_return, t) 208 | print_stats("SharpeDDQN", sharpe_ddqn_return, t) 209 | print_stats("ProfitD-DDQN", profit_dueling_ddqn_return, t) 210 | print_stats("SharpeD-DDQN", sharpe_dueling_ddqn_return, t) 211 | 212 | print(t) 213 | plot_multiple_conf_interval(["ProfitDQN", "SharpeDQN", "ProfitDDQN", "SharpeDDQN", "ProfitD-DDQN", "SharpeD-DDQN"], 214 | [profit_dqn_return, sharpe_dqn_return, profit_ddqn_return, sharpe_ddqn_return, 215 | profit_dueling_ddqn_return, sharpe_dueling_ddqn_return]) 216 | 217 | 218 | if __name__ == "__main__": 219 | main() -------------------------------------------------------------------------------- /Bot_code_and_models/utils.py: -------------------------------------------------------------------------------- 1 | # It essentially maps (state, action) pairs to their (next_state, reward) result, 2 | # with the state being the current stock price 3 | from collections import namedtuple 4 | import random 5 | import pandas as pd 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | import os 9 | 10 | Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward') ) 11 | 12 | 13 | class ReplayMemory(object): 14 | 15 | def __init__(self, capacity): 16 | self.capacity = capacity 17 | self.memory = [] 18 | self.position = 0 19 | 20 | def push(self, *args): 21 | """Saves a transition.""" 22 | if len(self.memory) < self.capacity: 23 | self.memory.append(None) 24 | self.memory[self.position] = Transition(*args) 25 | self.position = (self.position + 1) % self.capacity 26 | 27 | def sample(self, batch_size): 28 | return random.sample(self.memory, batch_size) 29 | 30 | def __len__(self): 31 | return len(self.memory) 32 | 33 | 34 | 35 | def print_stats(model, c_return, t): 36 | c_return = np.array(c_return).flatten() 37 | t.add_row([str(model), "%.2f" % np.mean(c_return), "%.2f" % np.amax(c_return), "%.2f" % np.amin(c_return), 38 | "%.2f" % np.std(c_return)]) 39 | 40 | 41 | def plot_conf_interval(name, cum_returns ): 42 | """ NB. cum_returns must be 2-dim """ 43 | # Mean 44 | M = np.mean(np.array(cum_returns), axis=0) 45 | # std dev 46 | S = np.std(np.array(cum_returns), axis=0) 47 | # upper and lower limit of confidence intervals 48 | LL = M - 0.95 * S 49 | UL = M + 0.95 * S 50 | 51 | plt.figure(figsize=(20, 5)) 52 | plt.xlabel("Trading Instant (h)") 53 | plt.ylabel(name) 54 | plt.legend(['Cumulative Averadge Return (%)'], loc='upper left') 55 | plt.grid(True) 56 | plt.ylim(-5, 15) 57 | plt.plot(range(len(M)), M, linewidth=2) # mean curve. 58 | plt.fill_between(range(len(M)), LL, UL, color='b', alpha=.2) # std curves. 59 | plt.show() 60 | 61 | def plot_multiple_conf_interval(names, cum_returns_list ): 62 | """ NB. cum_returns[i] must be 2-dim """ 63 | i = 1 64 | 65 | for cr in cum_returns_list: 66 | plt.subplot(len(cum_returns_list), 2, i) 67 | # Mean 68 | M = np.mean(np.array(cr), axis=0) 69 | # std dev 70 | S = np.std(np.array(cr), axis=0) 71 | # upper and lower limit of confidence intervals 72 | LL = M - 0.95 * S 73 | UL = M + 0.95 * S 74 | 75 | plt.xlabel("Trading Instant (h)") 76 | plt.ylabel(names[i-1]) 77 | plt.title('Cumulative Averadge Return (%)') 78 | plt.grid(True) 79 | plt.plot(range(len(M)), M, linewidth=2) # mean curve. 80 | plt.fill_between(range(len(M)), LL, UL, color='b', alpha=.2) # std curves. 
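# NOTE: as in plot_conf_interval(), the shaded band is mean +/- 0.95 standard
# deviations across the test runs; it is a dispersion band rather than a formal
# 95% confidence interval, which would scale the standard deviation by
# 1.96 / sqrt(number of runs).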
81 | i += 1 82 | 83 | plt.show() 84 | 85 | 86 | 87 | 88 | def load_data(path): 89 | if os.path.isfile(path + 'hourly_aggregated_dataset.csv'): 90 | df = pd.read_csv(path + 'hourly_aggregated_dataset.csv') 91 | else: 92 | # Aggregate the dataset hourly by picking the value at first row for Open, 93 | # the max within an hour for High, the minimum for Low, the last value for Close 94 | 95 | df = pd.read_csv(path + 'coinbaseUSD_1-min_data_2014-12-01_to_2019-01-09.csv') 96 | df_hourly_aggregated = pd.DataFrame() 97 | 98 | for count in range(0, len(df) - 60, 60): 99 | hour_interval = pd.DataFrame(df.iloc[count:count + 60]) 100 | df_hourly_aggregated = df_hourly_aggregated.append(pd.DataFrame([[hour_interval['Open'].iloc[0], 101 | hour_interval['High'].max(), 102 | hour_interval['Low'].min(), 103 | hour_interval['Close'].iloc[ 104 | len(hour_interval) - 1]]])) 105 | 106 | df_hourly_aggregated.columns = ['Open', 'High', 'Low', 'Close'] 107 | df_hourly_aggregated.index = np.arange(1, len(df_hourly_aggregated) + 1) 108 | df_hourly_aggregated.interpolate(inplace=True) 109 | df_hourly_aggregated.fillna(method='bfill', axis=0, inplace=True) 110 | df_hourly_aggregated.to_csv(path + 'hourly_aggregated_dataset.csv', index=False) 111 | df = df_hourly_aggregated 112 | del df_hourly_aggregated 113 | return df 114 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # A Deep Reinforcement Learning Trading Bot 2 | 3 | This job is mine personal implementation of an existing paper. 4 | For a detailed description and references to the original work check the report.pdf file. For a short introduction check [here](https://nicods96.github.io/hi//designing-a-pytorch-deep-reinforcement-learning-trading-bot/). 5 | 6 | - Note 1: Since I am not an expert of financial markets, the environment definition is just a sketch. To train the model for a real use the environment must be redefined. 
I am open to any suggestions on how to implement a realistic simulation of a financial market. 7 | - Note 2: This project is just an implementation, with some modifications, of an existing work. The model's parameters are not tuned; they simply replicate the paper's setting. In my opinion, as stated and motivated in the [report.pdf](https://github.com/nicoDs96/Trading-Bot---Deep-Reinforcement-Learning/blob/master/report.pdf), those parameters are not valid for a well-trained Deep Reinforcement Learning agent. 8 | - Note 3: DRL techniques available in the model: Deep Q-Network (DQN), Double DQN, Dueling DQN, and Dueling Double DQN. 9 | 10 | ## To Do: 11 | - Add command line arguments to run custom training/test 12 | - Implement a real market simulation 13 | -------------------------------------------------------------------------------- /report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nicoDs96/Trading-Bot---Deep-Reinforcement-Learning/818fafc5e836e85011889ee04c5b8b1b804c5b22/report.pdf --------------------------------------------------------------------------------