├── requirements.txt
├── data
    └── 3_Dividends
    │   └── GAZP.ME.csv
├── 3_create_adj_ticks.py
├── 2_preprocess_ticks.py
├── README.md
├── QCTickDataStrategy.py
├── 1_get_tick_data.py
└── 4_dollar_bars_triple_barrier_indicators.py


/requirements.txt:
--------------------------------------------------------------------------------
 1 | #python==3.6.6
 2 | setuptools
 3 | numpy
 4 | python-dateutil
 5 | pytz
 6 | pandas
 7 | fastparquet
 8 | sklearn
 9 | matplotlib
10 | mlfinlab
11 | selenium
12 | jupyter


--------------------------------------------------------------------------------
/data/3_Dividends/GAZP.ME.csv:
--------------------------------------------------------------------------------
 1 | Date,Dividends
 2 | 2016-07-19,7.89
 3 | 2017-07-18,8.0397
 4 | 2017-07-19,8.0397
 5 | 2018-07-18,8.04
 6 | 2019-07-17,16.61
 7 | 2013-05-14,5.99
 8 | 2015-07-14,7.2
 9 | 2010-05-11,2.39
10 | 2012-05-08,8.97
11 | 2011-05-13,3.85
12 | 2012-05-11,8.97
13 | 2011-05-05,3.85
14 | 2014-07-16,7.2
15 | 


--------------------------------------------------------------------------------
/3_create_adj_ticks.py:
--------------------------------------------------------------------------------
 1 | # https://help.yahoo.com/kb/SLN28256.html
 2 | 
 3 | import os
 4 | import numpy as np
 5 | import pandas as pd
 6 | import datetime as dt
 7 | 
 8 | from multiprocessing import Pool
 9 | my_dir = os.getcwd()
10 | 
11 | ticks_folder = os.path.join(my_dir, "data/4_Ticks")
12 | dividends_folder = os.path.join(my_dir, "data/3_Dividends")
13 | 
14 | adj_ticks_folder = os.path.join(my_dir, "data/5_AdjTicks")
15 | 
16 | if os.path.basename(adj_ticks_folder) not in os.listdir(os.path.dirname(adj_ticks_folder)):
17 |     os.mkdir(adj_ticks_folder)
18 | 
19 | keys = [key[:4] for key in os.listdir(ticks_folder) if not key.startswith(".")]
20 | print(keys)
21 | 
22 | for key in keys:
23 |     div_keys = [key[:4] for key in os.listdir(dividends_folder) if not key.startswith(".")]
24 |     if key not in div_keys:
25 |         print(key, 'passed')
26 |         pass
27 |     else:
28 |         print('processing ', key)
29 |         dividends = pd.read_csv(os.path.join(dividends_folder, (key + ".ME.csv")), index_col=0, parse_dates=True)
30 |         dividends.sort_index(inplace=True)
31 | 
32 |         ticks_file = [f for f in os.listdir(ticks_folder) if key in f][0]
33 |         ticks = pd.read_parquet(os.path.join(ticks_folder, ticks_file), engine='fastparquet')
34 |         ticks.index = ticks.date_time
35 | 
36 |         dividends_due = dividends[((dividends.index < ticks.index[-1]) & (dividends.index > ticks.index[0]))]
37 |         ss = pd.Series(ticks.index).searchsorted(dividends_due.index)
38 |         for_backward_roll = ticks.iloc[(ss - 1)]
39 |         # print(for_backward_roll)
40 | 
41 |         dividends_due["Coef"] = 0.
42 |         for i in range(len(dividends_due)):
43 |             dividends_due.iloc[i, 1] = 1 - dividends_due.iloc[i, 0] / for_backward_roll.price.iloc[i]
44 | 
45 |         adj_ticks = ticks.copy()
46 |         for i in range(len(dividends_due)):
47 |             adj_ticks.price.loc[adj_ticks.index < dividends_due.index[i]] *= dividends_due.iloc[i, 1]
48 | 
49 |         t1 = dt.datetime.now()
50 | 
51 |         adj_ticks_file = os.path.join(adj_ticks_folder, ticks_file)
52 | 
53 |         print("saving parquet")
54 |         adj_ticks.reset_index(drop=True).to_parquet(adj_ticks_file, compression='GZIP', engine='fastparquet')
55 |         t2 = dt.datetime.now()
56 |         print(t2 - t1)
57 | 


--------------------------------------------------------------------------------
/2_preprocess_ticks.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import pandas as pd
 4 | import datetime as dt
 5 | 
 6 | from multiprocessing import Pool
 7 | 
 8 | my_dir = os.getcwd()
 9 | raw_tick_dir = os.path.join(my_dir, "data/1_RawTicks")
10 | target_folder = os.path.join(my_dir, "data/4_Ticks")
11 | 
12 | if os.path.basename(target_folder) not in os.listdir(os.path.dirname(target_folder)):
13 |     os.mkdir(target_folder)
14 | 
15 | 
def check_breaks(file_names):
    """Tell whether consecutive files cover consecutive date ranges.

    The start date is read from characters 5-10 of a name and the end date
    from the six characters that precede the 4-character extension, both in
    ``%y%m%d`` form. Each file is expected to start on the day after the
    previous file ended.

    :param file_names: ordered list of file names (must not be empty)
    :return: True when there is no gap; otherwise the offending
        ``[previous end, expected previous end]`` pairs are printed and
        False is returned.
    """
    date_format = "%y%m%d"
    one_day = dt.timedelta(days=1)

    gaps = []
    previous_end = dt.datetime.strptime(file_names[0][-10:-4], date_format)
    for name in file_names[1:]:
        day_before_start = dt.datetime.strptime(name[5:11], date_format) - one_day
        if previous_end != day_before_start:
            gaps.append([previous_end, day_before_start])
        previous_end = dt.datetime.strptime(name[-10:-4], date_format)

    if not gaps:
        return True

    print("Breaches: ", gaps)
    return False
29 | 
30 | 
def get_csv(file):
    """Load one raw tick file and return it as (date_time, price, volume).

    The file is looked up in the module-level ``source_folder``. It is first
    parsed with ``;`` as separator and, if no ``<DATE>`` column results,
    parsed again with ``,``.

    Several ticks can share the same second, so the row number (modulo one
    million) is added as microseconds to make timestamps within a file
    distinguishable.

    NOTE(review): ``source_folder`` is assigned in the ``__main__`` section;
    worker processes started with the "spawn" method re-import the module
    without running that section - confirm the start method in use.

    :param file: file name inside ``source_folder``
    :return: DataFrame with columns ``date_time``, ``price``, ``volume``
    :raises AssertionError: if the file has no ``<DATE>`` column
    """
    file_path = os.path.join(source_folder, file)

    df_temp = pd.read_csv(file_path, sep=';')

    if '<DATE>' not in df_temp.columns:
        df_temp = pd.read_csv(file_path, sep=',')

    assert '<DATE>' in df_temp.columns, "No DATE in columns in " + file

    date_part = df_temp['<DATE>'].astype(str)
    # A time such as 09:59:59 is read as the integer 95959; restore the
    # leading zero so that the value matches %H%M%S.
    time_part = df_temp['<TIME>'].astype(str).str.zfill(6)
    seconds = pd.to_datetime(date_part + time_part, format='%Y%m%d%H%M%S')

    # to deal with repeating datetime values
    micro = pd.to_timedelta(np.arange(len(df_temp)) % 10 ** 6, unit='us')

    df_temp_new = df_temp[['<LAST>', '<VOL>']].rename(
        columns={'<LAST>': 'price', '<VOL>': 'volume'})
    df_temp_new.insert(0, 'date_time', seconds + micro)

    return df_temp_new
53 | 
54 | 
if __name__ == '__main__':

    # Every visible entry of data/1_RawTicks is treated as one symbol folder.
    keys = [entry for entry in os.listdir(raw_tick_dir) if not entry.startswith(".")]
    print(keys)

    for key in keys:
        print(key)

        # get_csv() reads this module-level name to locate the files.
        source_folder = os.path.join(raw_tick_dir, key)

        # Keep "<key>_..." files and skip names ending in "part".
        prefix = key + '_'
        files = sorted(name for name in os.listdir(source_folder)
                       if name.startswith(prefix) and not name.endswith("part"))

        assert check_breaks(files), "List of files is not uninterrupted"

        t0 = dt.datetime.now()

        # Pool() without arguments uses as many workers as os.cpu_count()
        # reports; imap keeps the results in file order.
        with Pool() as pool:
            frames = list(pool.imap(get_csv, files))
        df_new = pd.concat(frames, sort=False)

        t1 = dt.datetime.now()
        print(t1 - t0)
        print("Amount of ticks: ", df_new.shape)

        # <key>_<start of first file>_<end of last file>.gzip
        first_start = files[0][5:11]
        last_end = files[-1][-10:-4]
        file_name = os.path.join(target_folder, f"{key}_{first_start}_{last_end}.gzip")

        print("saving parquet")
        df_new.to_parquet(file_name, compression='GZIP', engine='fastparquet')
        t2 = dt.datetime.now()
        print(t2 - t1)
87 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Tick Data Strategy
 2 | 
 3 | To proceed, one needs to download and preprocess the data that feeds the algorithm. Below are the file / data structure and instructions on how to run the scripts.
 4 | 
 5 | **folders/files structure:**
 6 | <pre>
 7 | data
 8 | |__ 1_RawTicks
 9 |     |__ ...
10 | |__ 2_MOEX
11 |     |__ ...
12 | |__ 3_Dividends
13 |     |__ ...
14 | |__ 4_Ticks
15 |     |__ ...
16 | |__ 5_AdjTicks
17 |     |__ ...
18 | |__ 6_DollarBars
19 |     |__ ...
20 | |__ 7_Indicators
21 |     |__ ...
22 | 1_get_tick_data.py
23 | 2_preprocess_ticks.py
24 | 3_create_adj_ticks.py
25 | 4_dollar_bars_triple_barrier_indicators.py
26 | QCTickDataStrategy.py
27 | </pre>
28 | 
29 | 
30 | To create the Python environment, use conda together with 'requirements.txt'. The environment should be built on Python version 3.6.
31 | 
32 | <code>conda create --name QC python=3.6.6</code>
33 | 
34 | <code>conda activate QC</code>
35 | 
36 | <code>pip install -r requirements.txt</code>
37 | 
38 | It will be used for running python scripts as well as for LEAN Engine.
39 | 
40 | 
41 | ## Data Download
42 | 
43 | This script was tested on Ubuntu 18.04 and MacOs (Mojave, Catalina) with FireFox browser.
44 | 
45 | To download the data, a Selenium WebDriver must be installed. Instructions for that can be found [here](https://selenium-python.readthedocs.io/installation.html#drivers)
46 | 
47 | Among blue chips and MOEX index constituents the following assets were chosen for the analysis:
48 | 'AFKS', 'ALRS', 'CHMF', 'GAZP', 'GMKN', 'LKOH', 'MGNT', 'MTSS', 'NVTK', 'ROSN', 'RTKM', 'SBER', 'SNGS', 'TATN', 'VRBR', 'YNDX'
49 | 
50 | To start downloading process one needs to run '1_get_tick_data.py' from command line with the arguments 'symbol', start date and end date 'YYYY-MM-DD', i.e.:
51 | 
52 | <code>python 1_get_tick_data.py GAZP 2009-01-01 2019-12-13</code>
53 | 
54 | 
55 | After the algorithm opens the Firefox window, the frequency of the data ('ticks') and the output format ('.csv') need to be selected manually. In addition, one needs to select 'save to file' and tick the checkbox 'repeat for the next occurrences'. All of this needs to be done only once.
56 | 
57 | The algorithm will download the data in chunks, as .csv files within a size limit of around 41.6Mb, into folders named by symbol inside the '1_RawTicks' folder.
58 | 
59 | This web-site _does not_ provide data to download every day from 7:00am to 3:00pm GMT.
60 | 
61 | Alternatively, the raw tick data for this research can be found in this [Box folder](https://app.box.com/s/fwau5uwsrvn4lgwfwpvkf9zwnxo24k82)
62 | 
63 | 
64 | ## Data Preprocess
65 | 
66 | These scripts preprocess the raw tick data to create dollar bars and indicators (features) for feeding the ML algorithm.
67 | 
68 | 1. Run 2_preprocess_ticks.py to save the data of each company as a single parquet file in the folder '4_Ticks'.
69 | 
70 | 2. Adjust tick data backward for dividends paid - see [formula](https://help.yahoo.com/kb/SLN28256.html). Dividends data is manually downloaded from 'finance.yahoo.com' and saved to the folder '3_Dividends'. Run 3_create_adj_ticks.py to adjust the data and save it into the folder '5_AdjTicks'.
71 | 
72 | 3. The last preprocessing script, 4_dollar_bars_triple_barrier_indicators.py, creates dollar bars and saves them into the folder '6_DollarBars', then creates the data series that feed the trading algorithm and saves them into the folder '7_Indicators'. The dollar bars and indicators are built from input parameters, which can be changed to model variations.
73 | 
74 | 
75 | ## Running Algos
76 | 
77 | Trading algorithm is developed to run within open source QuantConnect platform. Trading algorithm could be executed on the web [QuantConnect](https://www.quantconnect.com) service or locally on the underlying [LEAN Engine](https://github.com/QuantConnect/Lean/tree/master/Algorithm.Python#quantconnect-python-algorithm-project).
78 | 
79 | - To run it with the QuantConnect platform, one needs to log in, 'Create new Algorithm' within 'Algorithm Lab' (or 'Lab'), and replace the code with the code from 'QCTickDataStrategy.py'. This script contains links to Dropbox folders with already created 'Indicators' files, which can be replaced by the files you created at the 'Data Preprocess' step above. After starting 'Backtest', QuantConnect will generate statistics and reports. (These scripts were tested under a PRO account and may run slowly under a free account.)
80 | 
81 | - To run the algorithm locally, one needs Visual Studio and the Python environment that was created in the first step. Details on installing, compiling and running the algorithm are available [here](https://medium.com/hackernoon/setting-up-your-own-algorithmic-trading-server-4bbdf0766c17). In this step, the Dropbox links to the indicator files can be replaced with local file links.
82 | 
83 | 


--------------------------------------------------------------------------------
/QCTickDataStrategy.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import pandas as pd
  3 | from clr import AddReference
  4 | AddReference("System")
  5 | AddReference("QuantConnect.Algorithm")
  6 | AddReference("QuantConnect.Common")
  7 | AddReference("QuantConnect.Indicators")
  8 | 
  9 | from System import *
 10 | from QuantConnect import *
 11 | from QuantConnect.Algorithm import *
 12 | from QuantConnect.Algorithm.Framework import *
 13 | from QuantConnect.Algorithm.Framework.Risk import *
 14 | from QuantConnect.Algorithm.Framework.Alphas import *
 15 | from QuantConnect.Algorithm.Framework.Execution import *
 16 | from QuantConnect.Algorithm.Framework.Portfolio import *
 17 | from QuantConnect.Algorithm.Framework.Selection import *
 18 | 
 19 | from QuantConnect.Data import SubscriptionDataSource
 20 | from QuantConnect.Python import PythonData
 21 | 
 22 | from datetime import timedelta, datetime, date
 23 | from sklearn.ensemble import RandomForestClassifier
 24 | from sklearn.model_selection import GridSearchCV
 25 | 
 26 | 
class QCTickDataStrategy(QCAlgorithm):
    """Per-asset random-forest strategy fed by precomputed indicator bars.

    Each symbol in ``assets_keys`` is subscribed to the ``AdvancedBars``
    custom data. Incoming rows are accumulated per asset, a
    RandomForestClassifier is (re)fitted on them by ``TrainML`` and its
    prediction together with the row's ``Side`` value drives the orders
    placed in ``OnData``.
    """

    def Initialize(self):
        """Set the backtest window and cash, subscribe the data and create the per-asset state."""
        self.SetStartDate(2009, 5, 2)
        self.SetEndDate(2010, 12, 13)
        self.SetCash(100000)

        self.assets_keys = ['AFKS', 'ALRS', 'CHMF', 'GAZP',
                            'GMKN', 'LKOH', 'MGNT', 'MTSS',
                            'NVTK', 'ROSN', 'RTKM', 'SBER',
                            'SNGS', 'TATN', 'VRBR', 'YNDX']

        # One custom-data subscription per asset key.
        self.assets = {i: self.AddData(AdvancedBars, i) for i in self.assets_keys}
        # self.AddData(AdvancedBars, "GAZP")

        # TrainML is called at 13:10 on the first day of every month; it
        # decides by itself in which months it actually trains.
        self.Schedule.On(self.DateRules.MonthStart(), self.TimeRules.At(13, 10), Action(self.TrainML))

        # Columns stored per asset: the model inputs plus the target 'Bin'.
        # The order must match the list written in OnData.
        colsU = [  # 'Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Fastmavg', 'Slowmavg', 'Sideold',
            'Logret', 'Momone', 'Momtwo', 'Momthree', 'Momfour', 'Momfive',
            'Volatilityfifty', 'Volatilitythirtyone', 'Volatilityfifteen',
            'Autocorrone', 'Autocorrtwo', 'Autocorrthree', 'Autocorrfour', 'Autocorrfive',
            'Logtone', 'Logttwo', 'Logtthree', 'Logtfour', 'Logtfive',
            # 'Ret', 'Trgt',
            'Bin', 'Side']

        # Per-asset history of received rows, indexed by bar time.
        self.features_dict = {i: pd.DataFrame(columns=colsU) for i in self.assets_keys}
        # self.features = pd.DataFrame(columns=colsU)

        # Per-asset classifier; None until TrainML has fitted one.
        self.clf_dict = {i: None for i in self.assets_keys}
        # self.clf = None

        # NOTE(review): 'changed' and 'long' are not read anywhere in this class.
        self.changed = False
        self.long = True

        # A position is liquidated in OnData once its stop time has passed;
        # every new signal moves the stop time to now + lifetime.
        self.lifetime = timedelta(days=10)
        self.stop_time_dict = {i: self.Time for i in self.assets_keys}
        # self.stop_time = self.Time

        # self.AddData(BenchmarkMOEX, "MOEX", Resolution.Daily)
        # self.SetBenchmark(BenchmarkMOEX, "MOEX")

    def OnData(self, data):
        """OnData event is the primary entry point for your algorithm. Each new data point will be pumped in here.

        First liquidates every invested asset whose stop time has passed,
        then, for every asset present in ``data``, stores the new feature
        row and - if a classifier is available - trades on its prediction.

            Arguments:
                data: Slice object keyed by symbol containing the stock data
        """

        # Pass 1: time-based exit of expired positions.
        for k in self.assets_keys:
            cond1 = (self.Time > self.stop_time_dict[k])
            cond2 = self.Portfolio[k].Invested
            # self.Debug(f"cond1 {cond1}, cond2 {cond2}")
            if cond1 and cond2:
                self.Debug(f"{self.Time}, {k} position {self.Portfolio[k].Quantity}")
                self.Liquidate(k)
                self.Debug(f"{k} position liquidated: {self.Portfolio[k].Quantity}")

        # Pass 2: record features and act on the model signal.
        for k in self.assets_keys:
            if not data.ContainsKey(k):
                continue

            dat = data[k]
            time = dat.Time

            try:
                # self.features.loc[time] = [data["GAZP"].Fastmavg, data["GAZP"].Slowmavg, data["GAZP"].Close]
                # self.features.loc[time]
                # Same order as colsU in Initialize.
                self.features_dict[k].loc[time] = [dat.Logret, dat.Momone, dat.Momtwo, dat.Momthree, dat.Momfour,
                                                   dat.Momfive, dat.Volatilityfifty, dat.Volatilitythirtyone,
                                                   dat.Volatilityfifteen,
                                                   dat.Autocorrone, dat.Autocorrtwo, dat.Autocorrthree,
                                                   dat.Autocorrfour, dat.Autocorrfive,
                                                   dat.Logtone, dat.Logttwo, dat.Logtthree, dat.Logtfour, dat.Logtfive,
                                                   dat.Bin, dat.Side]
                # self.Debug("1")
            except AttributeError as e:
                # The bar lacks at least one field (AdvancedBars.Reader skips
                # empty cells), so the row is neither stored nor traded.
                continue

            if self.clf_dict[k] is not None:
                # Model input: the row just stored, without the target 'Bin'.
                X = self.features_dict[k].drop(["Bin"], axis=1).loc[time].values.reshape(1, -1)
                y_pred = self.clf_dict[k].predict(X)

                # NOTE(review): scikit-learn's predict() returns class labels,
                # not probabilities, so this looks like "predicted label > 0.8"
                # - confirm whether predict_proba was intended.
                if y_pred > .8:

                    if dat.Side == 1:
                        if not self.Portfolio[k].IsLong:
                            # Not long yet: close whatever is held, go long.
                            self.stop_time_dict[k] = self.Time + self.lifetime
                            if self.Portfolio[k].Invested:
                                self.Liquidate(k)
                            self.SetHoldings(k, .5)
                            # self.Debug(f" long {k}, {self.Portfolio[k].Quantity}, till {self.stop_time_dict[k]}")
                            # self.Debug(f" hol {self.Portfolio.TotalHoldingsValue}, cash {self.Portfolio.Cash}")

                        else:
                            # Already long: only extend the holding period.
                            self.stop_time_dict[k] = self.Time + self.lifetime
                            # self.Debug(f" long_ {k}, {self.Portfolio[k].Quantity}, till {self.stop_time_dict[k]}")

                    elif dat.Side == -1:
                        if self.Portfolio[k].IsLong:
                            # Long: close it first, then go short.
                            self.stop_time_dict[k] = self.Time + self.lifetime
                            self.Liquidate(k)
                            self.SetHoldings(k, -0.5)
                            # self.Debug(f" short {k}, {self.Portfolio[k].Quantity}, till {self.stop_time_dict[k]}")
                            # self.Debug(f" hol {self.Portfolio.TotalHoldingsValue}, cash {self.Portfolio.Cash}")
                        else:
                            # Flat or already short: SetHoldings is called
                            # again (unlike the long branch above, which only
                            # extends the stop time).
                            self.stop_time_dict[k] = self.Time + self.lifetime
                            # self.Liquidate(k)
                            self.SetHoldings(k, -0.5)
                            # self.Debug(f" short_ {k}, {self.Portfolio[k].Quantity}, till {self.stop_time_dict[k]}")
                            # self.Debug(f" hol {self.Portfolio.TotalHoldingsValue}, cash {self.Portfolio.Cash}")


    def TrainML(self):
        """Fit the per-asset classifiers on all rows collected so far.

        Scheduled for every month start, but returns immediately unless the
        current month number is divisible by three. Assets with 800 rows or
        fewer are skipped. The hyper-parameters are searched only the first
        time an asset is trained; later calls refit the same estimator.
        """

        # re-learn ML quarterly
        if self.Time.month % 3 != 0:
            return

        for k in self.assets_keys:
            a = self.features_dict[k].shape

            # self.Debug(f"{self.Time} asset: {k} shape: {a}")

            if a[0] > 800:

                # Inputs are all stored columns except 'Bin'; 'Bin' is the target.
                X = self.features_dict[k].drop(["Bin"], axis=1).values
                y = self.features_dict[k]["Bin"].values.ravel()

                if self.clf_dict[k] is None:
                    n_estimator, depth = self._GridSearchML(X, y)
                    self.clf_dict[k] = RandomForestClassifier(max_depth=depth,
                                                              n_estimators=n_estimator,
                                                              criterion='entropy', random_state=42)

                self.clf_dict[k].fit(X, y)
        # self.Debug(f"Training ML data shape {self.features.shape}, before {a}")

    def _GridSearchML(self, X, y):
        """Grid-search the forest size and depth and return the best pair.

        Uses 4-fold cross-validation scored by ROC AUC on 3 parallel jobs.

        :param X: feature matrix
        :param y: target labels
        :return: tuple ``(n_estimators, max_depth)`` of the best parameters
        """
        parameters = {'max_depth': [2, 3, 4, 5, 7],
                      'n_estimators': [1, 10, 25, 50, 100, 256, 512],
                      'random_state': [42]}

        rf = RandomForestClassifier(criterion='entropy')

        grid_search_cv = GridSearchCV(rf, parameters, cv=4, scoring='roc_auc', n_jobs=3)

        # self.clf = grid_search_cv.best_estimator_

        grid_search_cv.fit(X, y)

        # self.Log(f"Grid search mean test score {grid_search_cv.cv_results_['mean_test_score']}")

        return grid_search_cv.best_params_['n_estimators'], grid_search_cv.best_params_['max_depth']
179 | 
180 | 
class AdvancedBars(PythonData):
    """Custom advanced bars (DollarBars, VolumeBars, etc)

    Each symbol is read from a remote CSV file of precomputed indicators;
    every non-empty numeric cell of a row becomes a field of the bar.
    """

    def GetSource(self, config, date, isLiveMode):
        """Return the remote-file data source for the symbol in *config*.

        :raises KeyError: if the symbol has no entry in the URL table
        """

        data = dict(
            AFKS="https://www.dropbox.com/s/sltdv01h2qagsuz/AFKS_10_0.1_indicators.csv?dl=1",
            ALRS="https://www.dropbox.com/s/kvv41qbqkvae57z/ALRS_10_0.1_indicators.csv?dl=1",
            CHMF="https://www.dropbox.com/s/o3zllu8roi5mul0/CHMF_10_0.1_indicators.csv?dl=1",
            GAZP="https://www.dropbox.com/s/8ok7497uzojp719/GAZP_10_0.1_indicators.csv?dl=1",
            GMKN="https://www.dropbox.com/s/xt3pda9hjgoitu7/GMKN_10_0.1_indicators.csv?dl=1",
            LKOH="https://www.dropbox.com/s/v747cnwefjlwx5a/LKOH_10_0.1_indicators.csv?dl=1",
            MGNT="https://www.dropbox.com/s/ppgtj7jgme0nasb/MGNT_10_0.1_indicators.csv?dl=1",
            MTSS="https://www.dropbox.com/s/gnti48z0ar1isz4/MTSS_10_0.1_indicators.csv?dl=1",
            NVTK="https://www.dropbox.com/s/9veawbt7awy6avv/NVTK_10_0.1_indicators.csv?dl=1",
            ROSN="https://www.dropbox.com/s/hpzh2pm5cluuohn/ROSN_10_0.1_indicators.csv?dl=1",
            RTKM="https://www.dropbox.com/s/9q61fc2rx5psbor/RTKM_10_0.1_indicators.csv?dl=1",
            SBER="https://www.dropbox.com/s/x4qs21ocx8imq0k/SBER_10_0.1_indicators.csv?dl=1",
            SNGS="https://www.dropbox.com/s/114l8afw1t66pxo/SNGS_10_0.1_indicators.csv?dl=1",
            TATN="https://www.dropbox.com/s/y55z0ay09v6qrfh/TATN_10_0.1_indicators.csv?dl=1",
            VRBR="https://www.dropbox.com/s/erao89h62f8aal8/VTBR_10_0.1_indicators.csv?dl=1",
            YNDX="https://www.dropbox.com/s/qhr27lo71m0fk3c/YNDX_10_0.1_indicators.csv?dl=1"
        )

        path = data[config.Symbol.Value]

        # Alternative kept from the original for other file locations:
        # path = "...." + config.Symbol.Value + "_10_0.1_indicators.csv"

        return SubscriptionDataSource(path, SubscriptionTransportMedium.RemoteFile)

    def Reader(self, config, line, date, isLiveMode):
        """Parse one CSV line into a bar, or return None if it is unusable.

        Lines that are blank or do not start with a digit (e.g. the header)
        are skipped, as are lines whose timestamp or 'Close' value cannot be
        parsed. Rows with fewer cells than expected no longer raise an
        IndexError: the missing fields are simply not set on the bar.
        """
        if not (line.strip() and line[0].isdigit()):
            return None

        cols = ['Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Fastmavg', 'Slowmavg',
                'Sideold', 'Logret', 'Momone', 'Momtwo', 'Momthree', 'Momfour', 'Momfive',
                'Volatilityfifty', 'Volatilitythirtyone', 'Volatilityfifteen',
                'Autocorrone', 'Autocorrtwo', 'Autocorrthree', 'Autocorrfour', 'Autocorrfive',
                'Logtone', 'Logttwo', 'Logtthree', 'Logtfour', 'Logtfive',
                'Ret', 'Trgt', 'Bin', 'Side']

        data = line.split(',')

        bar = AdvancedBars()
        bar.Symbol = config.Symbol

        try:
            bar.Time = datetime.strptime(data[0], "%Y-%m-%d %H:%M:%S.%f")
            # The bar's value is the 'Close' column.
            bar.Value = float(data[4])
        except (ValueError, IndexError):
            return None

        # Column 0 is the timestamp handled above. zip() stops at the shorter
        # sequence, so a truncated row cannot index past its end.
        for name, raw in zip(cols[1:], data[1:]):
            if raw == '':
                continue
            try:
                bar[name] = float(raw)
            except ValueError:
                # Non-numeric cell: leave the field unset.
                pass

        return bar
244 | 
245 | 
class BenchmarkMOEX(PythonData):
    """Custom benchmark data

    Read from a remote ';'-separated file whose data lines start with "R".
    """

    def GetSource(self, config, date, isLiveMode):
        """Return the remote-file data source of the benchmark series."""

        path = "https://www.dropbox.com/s/fl9oe733ls677wv/RI.IMOEX_090101_191213.csv?dl=1"
        return SubscriptionDataSource(path, SubscriptionTransportMedium.RemoteFile)

    def Reader(self, config, line, date, isLiveMode):
        """Parse one line into a bar, or return None if it is unusable.

        The date is taken from field 2 (``%Y%m%d``) and open, high, low and
        close from fields 4 to 7; the bar's value is the close. Blank lines,
        lines that do not start with "R" and lines with missing or
        unparsable fields are skipped.
        """
        if not (line.strip() and line[0] == "R"):
            return None

        data = line.split(';')

        bar = BenchmarkMOEX()
        bar.Symbol = config.Symbol

        try:
            bar.Time = datetime.strptime(data[2], "%Y%m%d")
            bar.Value = float(data[7])
            bar.Open = float(data[4])
            bar.High = float(data[5])
            bar.Low = float(data[6])
            bar.Close = float(data[7])
        except (ValueError, IndexError):
            # IndexError: the line has fewer fields than expected.
            return None

        return bar
275 | 


--------------------------------------------------------------------------------
/1_get_tick_data.py:
--------------------------------------------------------------------------------
  1 | # bash python==3
  2 | # Project to download tick data (MOEX) from finam.ru
  3 | 
  4 | import os
  5 | import sys
  6 | import time
  7 | import math
  8 | import datetime as dt
  9 | 
 10 | # 'selenuim' package for manipulation with the website
 11 | from selenium import webdriver
 12 | from selenium.webdriver.common.keys import Keys
 13 | from selenium.webdriver.support.ui import WebDriverWait
 14 | from selenium.webdriver.common.by import By
 15 | from selenium.webdriver.support import expected_conditions as EC
 16 | from selenium.webdriver import ActionChains
 17 | 
 18 | url = 'https://www.finam.ru/profile/moex-akcii/gazprom-neft/export/?market=1&em=2&code=SIBN&apply=0&df=1&mf=0&' +\
 19 |       'yf=2017&from=01.01.2017&dt=31&mt=0&yt=2017&to=31.01.2017&p=1&f=SIBN_170101_170131&e=.csv&cn=SIBN&dtf=1&' +\
 20 |       'tmf=1&MSOR=1&mstime=on&mstimever=1&sep=1&sep2=1&datf=6&at=1'
 21 | 
 22 | 
 23 | class GetTickData(object):
 24 |     """A class to download data from finam.ru by chanks wrt restriction to downloaded file size ~41k
 25 |     """
 26 | 
 27 |     def __init__(self, name, folder=None, url=url):
 28 |         """initiation of the selenium and download process
 29 |         """
 30 | 
 31 |         self.name = name
 32 |         if folder is None:
 33 |             folder = os.getcwd()
 34 |         self.parent_folder = folder
 35 |         self.folder = self.set_folder(folder)
 36 |         self.driver = self._get_driver()
 37 |         self.driver.get(url)
 38 |         self.wait = WebDriverWait(self.driver, 10)
 39 |         self.last_time = dt.datetime.now()
 40 |         self.out = 'csv'
 41 |         self.previous_from = dt.datetime.now().date()
 42 |         self.previous_to = dt.datetime.now().date()
 43 |         self.file_size = 40 * 2 ** 20
 44 |         self.target_days = 15
 45 |         self.files = []
 46 |         self.os = os.name
 47 | 
 48 |     def set_folder(self, folder):
 49 |         """Creating a folder to store the data"""
 50 |         folder_target = os.path.join(folder, self.name)
 51 |         if not (self.name in os.listdir(folder)):
 52 |             os.mkdir(folder_target)
 53 |         print("Target folder:", folder_target)
 54 |         return folder_target
 55 | 
 56 |     def _get_driver(self):
 57 |         """Starting the web-driver for Firefox for further manipulations with the website
 58 |         """
 59 |         mime_csv = 'text/plain, application/csv, application/download,' + \
 60 |                    ' text/comma-separated-values, text/csv, text/anytext,' + \
 61 |                    ' application/csv, application/excel,' + \
 62 |                    ' application/vnd.msexce, application/vnd.ms-excel,' + \
 63 |                    ' attachment/csv, text/plain'
 64 | 
 65 |         fp = webdriver.FirefoxProfile()
 66 |         fp.set_preference('browser.download.folderList', 2)
 67 |         fp.set_preference('browser.download.manager.showWhenStarting', False)
 68 |         fp.set_preference('browser.download.dir', self.folder)
 69 |         fp.set_preference('browser.helperApps.neverAsk.saveToDisk', mime_csv)
 70 |         driver = webdriver.Firefox(firefox_profile=fp)
 71 |         return driver
 72 | 
 73 |     def set_company(self, name):
 74 |         """Selecting a company which data will be downloaded
 75 |         """
 76 |         company = '/html/body/div[3]/div[2]/div[1]/div/table/tbody/tr/td/div/div/div[2]/div[1]/div[2]/input'
 77 | 
 78 |         try:
 79 |             s3 = self.driver.find_element_by_xpath(company)
 80 |             s3.clear()
 81 |             s3.send_keys(name)
 82 |             s3.send_keys(Keys.ENTER)
 83 |         except Exception as e:
 84 |             print('Error setting a company ', name, ', ', e)
 85 |             return False
 86 |         time.sleep(40)
 87 |         return True
 88 | 
 89 |     @staticmethod
 90 |     def _get_date_position(date):
 91 |         """function to get year, month and day positions within drop-down
 92 |         calendar
 93 |         """
 94 | 
 95 |         day_col = date.weekday() + 1
 96 | 
 97 |         # get the No of week (week starts from monday)
 98 |         first_day = date.replace(day=1)
 99 |         day = date.day
100 |         adjusted_dom = day + first_day.weekday()
101 |         day_ind = int(math.ceil(adjusted_dom / 7.0))  # np.ceil(a).astype(int)[0]
102 | 
103 |         return day_col, day_ind
104 | 
105 |     def set_date(self, date_from, date_to, from_or_to='from'):
106 |         """Function to set specific date in the drop-down calendar
107 |         """
108 |         if from_or_to == 'from':
109 |             date = date_from
110 |             year_num = date.year + (40 - 2018)
111 |             month_num = date.month
112 |             date_sel = '#issuer-profile-export-from'
113 |             previous = self.previous_from
114 |         elif from_or_to == 'to':
115 |             date = date_to
116 |             year_num = date.year + (1 - date_from.year)
117 |             month_num = (date.month + 1 - date_from.month) if date_from.year == date.year else date.month
118 |             date_sel = '#issuer-profile-export-to'
119 |             previous = self.previous_to
120 | 
121 |         day_col, day_ind = self._get_date_position(date)
122 | 
123 |         year_sel = '.ui-datepicker-year'
124 |         month_sel = '.ui-datepicker-month'
125 | 
126 |         year = '/html/body/div[16]/div/div/select[2]/option[' + str(year_num) + ']'  # 40 - 2018
127 |         month = '/html/body/div[16]/div/div/select[1]/option[' + str(month_num) + ']'
128 |         day = '/html/body/div[16]/table/tbody/tr[' + str(day_ind) + ']/td[' + str(day_col) + ']/a'
129 | 
130 |         try:
131 |             select_calendar = self.wait.until(
132 |                 EC.visibility_of_element_located((By.CSS_SELECTOR, date_sel)))
133 |             select_calendar.click()
134 |         except Exception as e:
135 |             print('error date ', from_or_to, e)
136 |             return False
137 | 
138 |         if not previous.year == date.year:
139 |             try:
140 |                 select_year = self.wait.until(
141 |                     EC.visibility_of_element_located((By.CSS_SELECTOR, year_sel)))
142 |                 select_year.click()
143 | 
144 |                 select_year = self.wait.until(
145 |                     EC.element_to_be_clickable((By.XPATH, year)))
146 | 
147 |                 ActionChains(self.driver).move_to_element(select_year).perform()
148 | 
149 |                 self.driver.find_element_by_xpath(year).click()
150 | 
151 |             except Exception as e:
152 |                 print('error year ', from_or_to, e)
153 |                 return False
154 | 
155 |         if not previous.month == date.month:
156 |             try:
157 |                 select_month = self.wait.until(
158 |                     EC.visibility_of_element_located((By.CSS_SELECTOR, month_sel)))
159 |                 select_month.click()
160 | 
161 |                 select_month = self.wait.until(
162 |                     EC.element_to_be_clickable((By.XPATH, month)))
163 |                 ActionChains(self.driver).move_to_element(select_month).perform()
164 |                 self.driver.find_element_by_xpath(month).click()
165 |             except Exception as e:
166 |                 print('error month', from_or_to, e)
167 |                 return False
168 | 
169 |         try:
170 |             first_available_date = self.wait.until(
171 |                 EC.element_to_be_clickable((By.XPATH, day)))
172 |             ActionChains(self.driver).move_to_element(first_available_date).perform()
173 |             self.driver.find_element_by_xpath(day).click()
174 |         except Exception as e:
175 |             print('error day', from_or_to, e)
176 |             return False
177 | 
178 |         time.sleep(5)
179 |         return True
180 | 
181 |     def get_data(self, date_from, date_to, chronological=False):
182 |         """Function to download data between two dates. Remember the calendar
183 |         validation restriction with regard to current state (thus, one cannt
184 |         set date_to > date_from)
185 |         """
186 |         if not chronological:
187 |             self.set_date(date_from, date_to, from_or_to='from')
188 |             self.set_date(date_from, date_to, from_or_to='to')
189 |         else:
190 |             self.set_date(date_from, date_to, from_or_to='to')
191 |             self.set_date(date_from, date_to, from_or_to='from')
192 | 
193 |         dnlwdxp = '/html/body/div[3]/div[2]/div[1]/div/table/tbody/tr/td/div/div/div[2]/div[2]/div/div[2]/' +\
194 |                   'div[1]/form/div/button/span'
195 | 
196 |         try:
197 |             downld = self.wait.until(
198 |                 EC.visibility_of_element_located((By.XPATH, dnlwdxp)))
199 |             downld.click()
200 | 
201 |             # handle FireFox allert regarding sending sensitive date
202 |             self.driver.switch_to.alert.accept()
203 |         except Exception as e:
204 |             print('Error when downloading file', e)
205 |             return False
206 | 
207 |         print(f'Downloading data file from {date_from} to {date_to}')
208 | 
209 |         return True
210 | 
211 |     def get_file_size(self, date_from, date_to):
212 |         """Function to check that the size of the file is not beyond the boundary ~40k and that it has data
213 |         """
214 |         name_key = date_from.strftime('%y%m%d') + '_' + date_to.strftime('%y%m%d') + '.' + self.out
215 |         name = None
216 | 
217 |         i = 1
218 |         while i < 40:
219 |             print('Waiting for file to proceed...')
220 |             time.sleep(15)
221 |             list_files = os.listdir(self.folder)
222 |             for name in list_files:
223 |                 if name_key in name:
224 |                     i = 100
225 |                     break
226 |             i += 1
227 | 
228 |         if i >= 100 and (name is not None):
229 |             file_path = os.path.join(self.folder, name)
230 |             file_size = os.stat(file_path).st_size
231 |         else:
232 |             file_path = ''
233 |             file_size = 500000
234 | 
235 |         return file_path, file_size
236 | 
237 |     def get_downloaded_date_from(self):
238 |         """Get the earliest data of downloaded asset to continue download from then
239 |         """
240 |         date = dt.datetime.now().date()
241 |         for file in os.listdir(self.folder):
242 |             try:
243 |                 temp_date = dt.datetime.strptime(file[-17:-11], '%y%m%d').date()
244 |             except Exception as e:
245 |                 print("file without data in the name, ", e)
246 |                 temp_date = date
247 |             if temp_date < date:
248 |                 date = temp_date
249 | 
250 |         return date
251 | 
252 |     def get_bulk_data(self, date_from="2019-11-01", date_to="2019-11-30"):
253 |         """Method to continuously operate downloading
254 |         """
255 | 
256 |         date_from = dt.datetime.strptime(date_from, '%Y-%m-%d').date()
257 |         date_to = dt.datetime.strptime(date_to, '%Y-%m-%d').date()
258 | 
259 |         downloaded_date_from = self.get_downloaded_date_from()
260 | 
261 |         if downloaded_date_from <= date_from:
262 |             return
263 |         else:
264 |             if downloaded_date_from < date_to:
265 |                 date_to = downloaded_date_from - dt.timedelta(days=1)
266 | 
267 |         # set temp dates to go through daterange in reversed chronological order
268 |         temp_date_to = date_to
269 |         temp_date_from = temp_date_to - dt.timedelta(days=self.target_days)
270 |         if date_from > temp_date_from:
271 |             temp_date_from = date_from
272 | 
273 |         while date_from < temp_date_to:
274 |             self.files = os.listdir(self.folder)
275 |             self.get_data(temp_date_from, temp_date_to)
276 | 
277 |             expected_file_name = (self.name + "_" + dt.datetime.strftime(temp_date_from, "%y%m%d") + "_" +
278 |                                   dt.datetime.strftime(temp_date_to, "%y%m%d") + ".csv")
279 | 
280 |             # waiting for the downloading to complete
281 |             stop_time = dt.datetime.now() + dt.timedelta(minutes=30)
282 |             now = dt.datetime.now()
283 |             while now < stop_time:
284 |                 time.sleep(20)
285 |                 list_files = os.listdir(self.folder)
286 | 
287 |                 if (list_files != self.files) and (expected_file_name in list_files):
288 |                     now += dt.timedelta(minutes=30)
289 |                     for name in list_files:
290 |                         if ".part" in name:
291 |                             now = dt.datetime.now()
292 |                             print("Waiting for downloading to complete, ", (stop_time - now), " before cancel")
293 |                 else:
294 |                     now = dt.datetime.now()
295 |                     print("Waiting for downloading to begin, ", (stop_time - now), " before cancel")
296 | 
297 |             file_path, file_size = self.get_file_size(temp_date_from, temp_date_to)
298 | 
299 |             if file_size < 200:
300 |                 os.remove(file_path)
301 |                 raise Exception("No data provided")
302 | 
303 |             print("Finished at ", dt.datetime.now(), ", file size (b): ", file_size, )
304 | 
305 |             self.previous_to, self.previous_from = temp_date_to, temp_date_from
306 | 
307 |             # check if file_size close to the limit 40k
308 |             if file_size < (self.file_size * 0.9):
309 |                 # self-adjusting period of download within size-limits
310 |                 self.target_days = int((self.file_size / 2) * self.target_days / file_size)
311 |                 temp_date_to = temp_date_from - dt.timedelta(days=1)
312 |                 temp_date_from = temp_date_to - dt.timedelta(days=self.target_days)
313 |             else:
314 |                 file_name = os.path.basename(file_path)
315 |                 print(f"File {file_name} beyond the file size, retrying")
316 |                 os.remove(file_path)
317 |                 self.target_days = self.target_days // 2
318 |                 temp_date_from = temp_date_to - dt.timedelta(days=self.target_days)
319 | 
320 |         return
321 | 
322 |     def close(self):
323 |         """closing the session of seleniun
324 |         """
325 |         self.driver.close()
326 | 
327 | 
# Blue chips as of 2014 (taken from the Wikipedia history).
# Maps the finam.ru ticker abbreviation to the instrument name used on finam.ru;
# an empty string means the name has not been filled in.
BLUE_CHIPS_DICT = {
    'GAZP': 'ГАЗПРОМ ао',
    'SBER': 'Сбербанк',
    'LKOH': 'ЛУКОЙЛ',
    'MGNT': 'Магнит ао',
    'SNGS': 'Сургнфгз',
    'NVTK': 'Новатэк ао',
    'MTSS': 'МТС-ао',
    'ROSN': 'Роснефть',
    'GMKN': 'ГМКНорНик',
    'VTBR': 'ВТБ ао',
    'TATN': 'Татнфт 3ао',
    'AFKS': 'Система ао',
    'RTKM': 'Ростел -ао',
    'YNDX': '',
    'CHMF': '',
    'ALRS': '',
}
333 | 
334 | 
def main():
    """Download raw tick data for one ticker into ``data/1_RawTicks``.

    Command line: ``[TICKER] [DATE_FROM] [DATE_TO]`` with the defaults
    ``GAZP``, ``2009-01-01`` and ``2019-12-13``. The ticker must be a key of
    ``BLUE_CHIPS_DICT`` (a ``KeyError`` is raised otherwise).
    """
    my_dir = os.getcwd()
    folder = os.path.join(my_dir, "data/1_RawTicks")

    # Also creates the parent 'data' directory; the former listdir/mkdir pair
    # crashed when it did not exist yet.
    os.makedirs(folder, exist_ok=True)

    if len(sys.argv) > 1:
        key = sys.argv[1]
    else:
        key = 'GAZP'

    if len(sys.argv) > 2:
        date_from = sys.argv[2]
    else:
        date_from = "2009-01-01"

    if len(sys.argv) > 3:
        date_to = sys.argv[3]
    else:
        date_to = "2019-12-13"

    name = BLUE_CHIPS_DICT[key]

    asset = GetTickData(key, folder=folder)

    try:
        asset.set_company(name)
        asset.get_bulk_data(date_from, date_to)
    except Exception as e:
        print('company ', key, ' failed because of ', e)
    finally:
        # Release the browser even on KeyboardInterrupt / SystemExit.
        asset.close()
368 | 
369 | 
# Start the download only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
372 | 


--------------------------------------------------------------------------------
/4_dollar_bars_triple_barrier_indicators.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | 
  4 | import time
  5 | import sys
  6 | import os
  7 | import numpy as np
  8 | import pandas as pd
  9 | import datetime as dt
 10 | 
 11 | import multiprocessing as mp
 12 | 
 13 | import mlfinlab as ml
 14 | 
# Fix the seed of NumPy's global random generator so that runs are repeatable.
np.random.seed(42)
 16 | 
 17 | # module to substitute in 'mlfinlab' package
 18 | def new_batch_run(self, verbose=True, to_csv=False, output_path=None):
 19 |     """
 20 |     Reads a csv file in batches and then constructs the financial data structure in the form of a DataFrame.
 21 |     The csv file must have only 3 columns: date_time, price, & volume.
 22 |     :param verbose: (Boolean) Flag whether to print message on each processed batch or not
 23 |     :param to_csv: (Boolean) Flag for writing the results of bars generation to local csv file, or to in-memory DataFrame
 24 |     :param output_path: (Boolean) Path to results file, if to_csv = True
 25 | 
 26 |     :return: (DataFrame or None) Financial data structure
 27 |     """
 28 | 
 29 |     # for parquet
 30 |     if ".gzip" in self.file_path:
 31 |         parquet = pd.read_parquet(self.file_path, engine='fastparquet')
 32 |         n_batches = len(parquet) // self.batch_size
 33 |         iterations = np.array_split(parquet, n_batches)
 34 |     else:
 35 |         # Read in the first row & assert format
 36 |         first_row = pd.read_csv(self.file_path, nrows=1)
 37 |         self._assert_csv(first_row)
 38 |         iterations = pd.read_csv(self.file_path, chunksize=self.batch_size)
 39 | 
 40 |     if to_csv is True:
 41 |         header = True  # if to_csv is True, header should written on the first batch only
 42 |         open(output_path, 'w').close()  # clean output csv file
 43 | 
 44 |     if verbose:  # pragma: no cover
 45 |         print('Reading data in batches:')
 46 | 
 47 |     # Read csv in batches
 48 |     count = 0
 49 |     final_bars = []
 50 |     cols = ['date_time', 'open', 'high', 'low', 'close', 'volume']
 51 |     for batch in iterations:
 52 |         if verbose:  # pragma: no cover
 53 |             print('Batch number:', count)
 54 | 
 55 |         list_bars = self._extract_bars(data=batch)
 56 | 
 57 |         if to_csv is True:
 58 |             pd.DataFrame(list_bars, columns=cols).to_csv(output_path, header=header, index=False, mode='a')
 59 |             header = False
 60 |         else:
 61 |             # Append to bars list
 62 |             final_bars += list_bars
 63 |         count += 1
 64 | 
 65 |         # Set flag to True: notify function to use cache
 66 |         self.flag = True
 67 | 
 68 |     if verbose:  # pragma: no cover
 69 |         print('Returning bars \n')
 70 | 
 71 |     # Return a DataFrame
 72 |     if final_bars:
 73 |         bars_df = pd.DataFrame(final_bars, columns=cols)
 74 |         return bars_df
 75 | 
 76 |     # Processed DataFrame is stored in .csv file, return None
 77 |     return None
 78 | 
 79 | 
# Monkey-patch the imported package: replace BaseBars.batch_run with the version above
# so that the bar builders can also read 'parquet' input
ml.data_structures.base_bars.BaseBars.batch_run = new_batch_run
 82 | 
 83 | 
 84 | class TrippleBarrier(object):
 85 |     """This class is to create indicators (features) to feed ML trading algorithm.
 86 |     The content of this class was sourced from 'mlfinlab' package.
 87 |     Objectification was made in order to run this class within 'QuantConnect' platform.
 88 |     """
 89 | 
 90 |     def __init__(self):
 91 |         # tbd
 92 |         return
 93 | 
 94 |     def get_daily_vol(self, close, lookback=100):
 95 |         """
 96 |         Snippet 3.1, page 44, Daily Volatility Estimates
 97 |         Computes the daily volatility at intraday estimation points.
 98 |         In practice we want to set profit taking and stop-loss limits that are a function of the risks involved
 99 |         in a bet. Otherwise, sometimes we will be aiming too high (tao ≫ sigma_t_i,0), and sometimes too low
100 |         (tao ≪ sigma_t_i,0 ), considering the prevailing volatility. Snippet 3.1 computes the daily volatility
101 |         at intraday estimation points, applying a span of lookback days to an exponentially weighted moving
102 |         standard deviation.
103 |         See the pandas documentation for details on the pandas.Series.ewm function.
104 |         Note: This function is used to compute dynamic thresholds for profit taking and stop loss limits.
105 |         :param close: Closing prices
106 |         :param lookback: lookback period to compute volatility
107 |         :return: series of daily volatility value
108 |         """
109 |         # daily vol re-indexed to close
110 |         df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))
111 |         df0 = df0[df0 > 0]
112 |         df0 = (pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:]))
113 | 
114 |         df0 = close.loc[df0.index] / close.loc[df0.values].values - 1  # daily returns
115 |         df0 = df0.ewm(span=lookback).std()
116 |         return df0
117 | 
118 |     # Snippet 2.4, page 39, The Symmetric CUSUM Filter.
119 |     def cusum_filter(self, raw_time_series, threshold, time_stamps=True):
120 |         """
121 |         Snippet 2.4, page 39, The Symmetric Dynamic/Fixed CUSUM Filter.
122 |         The CUSUM filter is a quality-control method, designed to detect a shift in the
123 |         mean value of a measured quantity away from a target value. The filter is set up to
124 |         identify a sequence of upside or downside divergences from any reset level zero.
125 |         We sample a bar t if and only if S_t >= threshold, at which point S_t is reset to 0.
126 |         One practical aspect that makes CUSUM filters appealing is that multiple events are not
127 |         triggered by raw_time_series hovering around a threshold level, which is a flaw suffered by popular
128 |         market signals such as Bollinger Bands. It will require a full run of length threshold for
129 |         raw_time_series to trigger an event.
130 |         Once we have obtained this subset of event-driven bars, we will let the ML algorithm determine
131 |         whether the occurrence of such events constitutes actionable intelligence.
132 |         Below is an implementation of the Symmetric CUSUM filter.
133 |         Note: As per the book this filter is applied to closing prices but we extended it to also work on other
134 |         time series such as volatility.
135 |         :param raw_time_series: (series) of close prices (or other time series, e.g. volatility).
136 |         :param threshold: (float or pd.Series) when the abs(change) is larger than the threshold, the function captures
137 |         it as an event, can be dynamic if threshold is pd.Series
138 |         :param time_stamps: (bool) default is to return a DateTimeIndex, change to false to have it return a list.
139 |         :return: (datetime index vector) vector of datetimes when the events occurred. This is used later to sample.
140 |         """
141 | 
142 |         t_events = []
143 |         s_pos = 0
144 |         s_neg = 0
145 | 
146 |         # log returns
147 |         raw_time_series = pd.DataFrame(raw_time_series)  # Convert to DataFrame
148 |         raw_time_series.columns = ['price']
149 |         raw_time_series['log_ret'] = raw_time_series.price.apply(np.log).diff()
150 |         if isinstance(threshold, (float, int)):
151 |             raw_time_series['threshold'] = threshold
152 |         elif isinstance(threshold, pd.Series):
153 |             raw_time_series.loc[threshold.index, 'threshold'] = threshold
154 |         else:
155 |             raise ValueError('threshold is neither float nor pd.Series!')
156 | 
157 |         raw_time_series = raw_time_series.iloc[1:]  # Drop first na values
158 | 
159 |         # Get event time stamps for the entire series
160 |         for tup in raw_time_series.itertuples():
161 |             thresh = tup.threshold
162 |             pos = float(s_pos + tup.log_ret)
163 |             neg = float(s_neg + tup.log_ret)
164 |             s_pos = max(0.0, pos)
165 |             s_neg = min(0.0, neg)
166 | 
167 |             if s_neg < -thresh:
168 |                 s_neg = 0
169 |                 t_events.append(tup.Index)
170 | 
171 |             elif s_pos > thresh:
172 |                 s_pos = 0
173 |                 t_events.append(tup.Index)
174 | 
175 |         # Return DatetimeIndex or list
176 |         if time_stamps:
177 |             event_timestamps = pd.DatetimeIndex(t_events)
178 |             return event_timestamps
179 | 
180 |         return t_events
181 | 
182 |     # Snippet 3.4 page 49, Adding a Vertical Barrier
183 |     def add_vertical_barrier(self, t_events, close, num_days=0, num_hours=0, num_minutes=0, num_seconds=0):
184 |         """
185 |         Snippet 3.4 page 49, Adding a Vertical Barrier
186 |         For each index in t_events, it finds the timestamp of the next price bar at or immediately after
187 |         a number of days num_days. This vertical barrier can be passed as an optional argument t1 in get_events.
188 |         This function creates a series that has all the timestamps of when the vertical barrier would be reached.
189 |         :param t_events: (series) series of events (symmetric CUSUM filter)
190 |         :param close: (series) close prices
191 |         :param num_days: (int) number of days to add for vertical barrier
192 |         :param num_hours: (int) number of hours to add for vertical barrier
193 |         :param num_minutes: (int) number of minutes to add for vertical barrier
194 |         :param num_seconds: (int) number of seconds to add for vertical barrier
195 |         :return: (series) timestamps of vertical barriers
196 |         """
197 |         timedelta = pd.Timedelta(
198 |             '{} days, {} hours, {} minutes, {} seconds'.format(num_days, num_hours, num_minutes, num_seconds))
199 |         # Find index to closest to vertical barrier
200 |         nearest_index = close.index.searchsorted(t_events + timedelta)
201 | 
202 |         # Exclude indexes which are outside the range of close price index
203 |         nearest_index = nearest_index[nearest_index < close.shape[0]]
204 | 
205 |         # Find price index closest to vertical barrier time stamp
206 |         nearest_timestamp = close.index[nearest_index]
207 |         filtered_events = t_events[:nearest_index.shape[0]]
208 | 
209 |         vertical_barriers = pd.Series(data=nearest_timestamp, index=filtered_events)
210 |         return vertical_barriers
211 | 
212 |     # Snippet 20.5 (page 306), the lin_parts function
213 |     def lin_parts(self, num_atoms, num_threads):
214 |         """
215 |         Snippet 20.5 (page 306), the lin_parts function
216 |         The simplest way to form molecules is to partition a list of atoms in subsets of equal size,
217 |         where the number of subsets is the minimum between the number of processors and the number
218 |         of atoms. For N subsets we need to find the N+1 indices that enclose the partitions.
219 |         This logic is demonstrated in Snippet 20.5.
220 |         This function partitions a list of atoms in subsets (molecules) of equal size.
221 |         An atom is a set of indivisible set of tasks.
222 |         """
223 |         # Partition of atoms with a single loop
224 |         parts = np.linspace(0, num_atoms, min(num_threads, num_atoms) + 1)
225 |         parts = np.ceil(parts).astype(int)
226 |         return parts
227 | 
228 |     # Snippet 3.2, page 45, Triple Barrier Labeling Method
229 |     def apply_pt_sl_on_t1(self, close, events, pt_sl, molecule):  # pragma: no cover
230 |         """
231 |         Snippet 3.2, page 45, Triple Barrier Labeling Method
232 |         This function applies the triple-barrier labeling method. It works on a set of
233 |         datetime index values (molecule). This allows the program to parallelize the processing.
234 |         Mainly it returns a DataFrame of timestamps regarding the time when the first barriers were reached.
235 |         :param close: (series) close prices
236 |         :param events: (series) of indices that signify "events" (see cusum_filter function
237 |         for more details)
238 |         :param pt_sl: (array) element 0, indicates the profit taking level; element 1 is stop loss level
239 |         :param molecule: (an array) a set of datetime index values for processing
240 |         :return: DataFrame of timestamps of when first barrier was touched
241 |         """
242 |         # Apply stop loss/profit taking, if it takes place before t1 (end of event)
243 |         events_ = events.loc[molecule]
244 |         out = events_[['t1']].copy(deep=True)
245 | 
246 |         profit_taking_multiple = pt_sl[0]
247 |         stop_loss_multiple = pt_sl[1]
248 | 
249 |         # Profit taking active
250 |         if profit_taking_multiple > 0:
251 |             profit_taking = profit_taking_multiple * events_['trgt']
252 |         else:
253 |             profit_taking = pd.Series(index=events.index)  # NaNs
254 | 
255 |         # Stop loss active
256 |         if stop_loss_multiple > 0:
257 |             stop_loss = -stop_loss_multiple * events_['trgt']
258 |         else:
259 |             stop_loss = pd.Series(index=events.index)  # NaNs
260 | 
261 |         # Get events
262 |         for loc, vertical_barrier in events_['t1'].fillna(close.index[-1]).iteritems():
263 |             closing_prices = close[loc: vertical_barrier]  # Path prices for a given trade
264 |             cum_returns = (closing_prices / close[loc] - 1) * events_.at[loc, 'side']  # Path returns
265 |             out.loc[loc, 'sl'] = cum_returns[cum_returns < stop_loss[loc]].index.min()  # Earliest stop loss date
266 |             out.loc[loc, 'pt'] = cum_returns[
267 |                 cum_returns > profit_taking[loc]].index.min()  # Earliest profit taking date
268 | 
269 |         return out
270 | 
271 |     # Snippet 20.7 (page 310), The mpPandasObj, used at various points in the book
272 |     def mp_pandas_obj(self, func, pd_obj, num_threads=24, mp_batches=1, lin_mols=True, **kargs):
273 |         """
274 |         Snippet 20.7 (page 310), The mpPandasObj, used at various points in the book
275 |         Parallelize jobs, return a dataframe or series.
276 |         Example: df1=mp_pandas_obj(func,('molecule',df0.index),24,**kwds)
277 |         First, atoms are grouped into molecules, using linParts (equal number of atoms per molecule)
278 |         or nestedParts (atoms distributed in a lower-triangular structure). When mpBatches is greater
279 |         than 1, there will be more molecules than cores. Suppose that we divide a task into 10 molecules,
280 |         where molecule 1 takes twice as long as the rest. If we run this process in 10 cores, 9 of the
281 |         cores will be idle half of the runtime, waiting for the first core to process molecule 1.
282 |         Alternatively, we could set mpBatches=10 so as to divide that task in 100 molecules. In doing so,
283 |         every core will receive equal workload, even though the first 10 molecules take as much time as the
284 |         next 20 molecules. In this example, the run with mpBatches=10 will take half of the time consumed by
285 |         mpBatches=1.
286 |         Second, we form a list of jobs. A job is a dictionary containing all the information needed to process
287 |         a molecule, that is, the callback function, its keyword arguments, and the subset of atoms that form
288 |         the molecule.
289 |         Third, we will process the jobs sequentially if numThreads==1 (see Snippet 20.8), and in parallel
290 |         otherwise (see Section 20.5.2). The reason that we want the option to run jobs sequentially is for
291 |         debugging purposes. It is not easy to catch a bug when programs are run in multiple processors.
292 |         Once the code is debugged, we will want to use numThreads>1.
293 |         Fourth, we stitch together the output from every molecule into a single list, series, or dataframe.
294 |         :param func: A callback function, which will be executed in parallel
295 |         :param pd_obj: (tuple) Element 0: The name of the argument used to pass molecules to the callback function
296 |                         Element 1: A list of indivisible tasks (atoms), which will be grouped into molecules
297 |         :param num_threads: (int) The number of threads that will be used in parallel (one processor per thread)
298 |         :param mp_batches: (int) Number of parallel batches (jobs per core)
299 |         :param lin_mols: (bool) Tells if the method should use linear or nested partitioning
300 |         :param kargs: (var args) Keyword arguments needed by func
301 |         :return: (data frame) of results
302 |         """
303 | 
304 |         if lin_mols:
305 |             parts = self.lin_parts(len(pd_obj[1]), num_threads * mp_batches)
306 |         else:
307 |             print("nested parts... to fix")
308 |             # parts = nested_parts(len(pd_obj[1]), num_threads * mp_batches)
309 | 
310 |         jobs = []
311 |         for i in range(1, len(parts)):
312 |             job = {pd_obj[0]: pd_obj[1][parts[i - 1]:parts[i]], 'func': func}
313 |             job.update(kargs)
314 |             jobs.append(job)
315 | 
316 |         if num_threads == 1:
317 |             out = self.process_jobs_(jobs)
318 |         else:
319 |             out = self.process_jobs(jobs, num_threads=num_threads)
320 | 
321 |         if isinstance(out[0], pd.DataFrame):
322 |             df0 = pd.DataFrame()
323 |         elif isinstance(out[0], pd.Series):
324 |             df0 = pd.Series()
325 |         else:
326 |             return out
327 | 
328 |         for i in out:
329 |             df0 = df0.append(i)
330 | 
331 |         df0 = df0.sort_index()
332 |         return df0
333 | 
334 |     # Snippet 20.8, pg 311, Single thread execution, for debugging
335 |     def process_jobs_(self, jobs):
336 |         """
337 |         # Snippet 20.8, pg 311, Single thread execution, for debugging
338 |         Run jobs sequentially, for debugging
339 |         """
340 |         out = []
341 |         for job in jobs:
342 |             out_ = self.expand_call(job)
343 |             out.append(out_)
344 | 
345 |         return out
346 | 
347 |     # Snippet 20.9.2, pg 312, Example of Asynchronous call to pythons multiprocessing library
348 |     def process_jobs(self, jobs, task=None, num_threads=24):
349 |         """
350 |         Snippet 20.9.2, pg 312, Example of Asynchronous call to pythons multiprocessing library
351 |         Run in parallel. jobs must contain a 'func' callback, for expand_call
352 |         """
353 | 
354 |         if task is None:
355 |             task = jobs[0]['func'].__name__
356 | 
357 |         pool = mp.Pool(processes=num_threads)
358 |         outputs = pool.imap_unordered(self.expand_call, jobs)
359 |         out = []
360 |         time0 = time.time()
361 | 
362 |         # Process asynchronous output, report progress
363 |         for i, out_ in enumerate(outputs, 1):
364 |             out.append(out_)
365 |             self.report_progress(i, len(jobs), time0, task)
366 | 
367 |         pool.close()
368 |         pool.join()  # This is needed to prevent memory leaks
369 |         return out
370 | 
371 |     # Snippet 20.10 Passing the job (molecule) to the callback function
372 |     def expand_call(self, kargs):
373 |         """
374 |         Snippet 20.10 Passing the job (molecule) to the callback function
375 |         Expand the arguments of a callback function, kargs['func']
376 |         """
377 |         func = kargs['func']
378 |         del kargs['func']
379 |         out = func(**kargs)
380 |         return out
381 | 
382 |     # Snippet 20.9.1, pg 312, Example of Asynchronous call to pythons multiprocessing library
383 |     def report_progress(self, job_num, num_jobs, time0, task):
384 |         """
385 |         Snippet 20.9.1, pg 312, Example of Asynchronous call to pythons multiprocessing library
386 |         """
387 |         # Report progress as asynch jobs are completed
388 |         msg = [float(job_num) / num_jobs, (time.time() - time0) / 60.0]
389 |         msg.append(msg[1] * (1 / msg[0] - 1))
390 |         time_stamp = str(dt.datetime.fromtimestamp(time.time()))
391 | 
392 |         msg = time_stamp + ' ' + str(round(msg[0] * 100, 2)) + '% ' + task + ' done after ' + str(
393 |             round(msg[1], 2)) + ' minutes. Remaining ' + str(round(msg[2], 2)) + ' minutes.'
394 | 
395 |         if job_num < num_jobs:
396 |             sys.stderr.write(msg + '\r')
397 |         else:
398 |             sys.stderr.write(msg + '\n')
399 | 
400 |     # Snippet 3.3 -> 3.6 page 50, Getting the Time of the First Touch, with Meta Labels
401 |     def get_events(self, close, t_events, pt_sl, target, min_ret, num_threads, vertical_barrier_times=False,
402 |                    side_prediction=None):
403 |         """
404 |         Snippet 3.6 page 50, Getting the Time of the First Touch, with Meta Labels
405 |         This function is orchestrator to meta-label the data, in conjunction with the Triple Barrier Method.
406 |         :param close: (series) Close prices
407 |         :param t_events: (series) of t_events. These are timestamps that will seed every triple barrier.
408 |             These are the timestamps selected by the sampling procedures discussed in Chapter 2, Section 2.5.
409 |             Eg: CUSUM Filter
410 |         :param pt_sl: (2 element array) element 0, indicates the profit taking level; element 1 is stop loss level.
411 |             A non-negative float that sets the width of the two barriers. A 0 value means that the respective
412 |             horizontal barrier (profit taking and/or stop loss) will be disabled.
413 |         :param target: (series) of values that are used (in conjunction with pt_sl) to determine the width
414 |             of the barrier. In this program this is daily volatility series.
415 |         :param min_ret: (float) The minimum target return required for running a triple barrier search.
416 |         :param num_threads: (int) The number of threads concurrently used by the function.
417 |         :param vertical_barrier_times: (series) A pandas series with the timestamps of the vertical barriers.
418 |             We pass a False when we want to disable vertical barriers.
419 |         :param side_prediction: (series) Side of the bet (long/short) as decided by the primary model
420 |         :return: (data frame) of events
421 |                 -events.index is event's starttime
422 |                 -events['t1'] is event's endtime
423 |                 -events['trgt'] is event's target
424 |                 -events['side'] (optional) implies the algo's position side
425 |                 -events['pt'] Profit taking multiple
426 |                 -events['sl'] Stop loss multiple
427 |         """
428 | 
429 |         # 1) Get target
430 |         target = target.loc[t_events]
431 |         target = target[target > min_ret]  # min_ret
432 | 
433 |         # 2) Get vertical barrier (max holding period)
434 |         if vertical_barrier_times is False:
435 |             vertical_barrier_times = pd.Series(pd.NaT, index=t_events)
436 | 
437 |         # 3) Form events object, apply stop loss on vertical barrier
438 |         if side_prediction is None:
439 |             side_ = pd.Series(1.0, index=target.index)
440 |             pt_sl_ = [pt_sl[0], pt_sl[0]]
441 |         else:
442 |             side_ = side_prediction.loc[target.index]  # Subset side_prediction on target index.
443 |             pt_sl_ = pt_sl[:2]
444 | 
445 |         # Create a new df with [v_barrier, target, side] and drop rows that are NA in target
446 |         events = pd.concat({'t1': vertical_barrier_times, 'trgt': target, 'side': side_}, axis=1)
447 |         events = events.dropna(subset=['trgt'])
448 | 
449 |         # Apply Triple Barrier
450 |         first_touch_dates = self.mp_pandas_obj(func=self.apply_pt_sl_on_t1,
451 |                                                pd_obj=('molecule', events.index),
452 |                                                num_threads=num_threads,
453 |                                                close=close,
454 |                                                events=events,
455 |                                                pt_sl=pt_sl_)
456 | 
457 |         events['t1'] = first_touch_dates.dropna(how='all').min(axis=1)  # pd.min ignores nan
458 | 
459 |         if side_prediction is None:
460 |             events = events.drop('side', axis=1)
461 | 
462 |         # Add profit taking and stop loss multiples for vertical barrier calculations
463 |         events['pt'] = pt_sl[0]
464 |         events['sl'] = pt_sl[1]
465 | 
466 |         return events
467 | 
468 |     # Snippet 3.9, pg 55, Question 3.3
469 |     def barrier_touched(self, out_df, events):
470 |         """
471 |         Snippet 3.9, pg 55, Question 3.3
472 |         Adjust the getBins function (Snippet 3.7) to return a 0 whenever the vertical barrier is the one touched first.
473 |         Top horizontal barrier: 1
474 |         Bottom horizontal barrier: -1
475 |         Vertical barrier: 0
476 |         :param out_df: (DataFrame) containing the returns and target
477 |         :param events: (DataFrame) The original events data frame. Contains the pt sl multiples needed here.
478 |         :return: (DataFrame) containing returns, target, and labels
479 |         """
480 |         store = []
481 |         for date_time, values in out_df.iterrows():
482 |             ret = values['ret']
483 |             target = values['trgt']
484 | 
485 |             pt_level_reached = ret > target * events.loc[date_time, 'pt']
486 |             sl_level_reached = ret < -target * events.loc[date_time, 'sl']
487 | 
488 |             if ret > 0.0 and pt_level_reached:
489 |                 # Top barrier reached
490 |                 store.append(1)
491 |             elif ret < 0.0 and sl_level_reached:
492 |                 # Bottom barrier reached
493 |                 store.append(-1)
494 |             else:
495 |                 # Vertical barrier reached
496 |                 store.append(0)
497 | 
498 |         # Save to 'bin' column and return
499 |         out_df['bin'] = store
500 |         return out_df
501 | 
502 |     # Snippet 3.4 -> 3.7, page 51, Labeling for Side & Size with Meta Labels
503 |     def get_bins(self, triple_barrier_events, close):
504 |         """
505 |         Snippet 3.7, page 51, Labeling for Side & Size with Meta Labels
506 |         Compute event's outcome (including side information, if provided).
507 |         events is a DataFrame where:
508 |         Now the possible values for labels in out['bin'] are {0,1}, as opposed to whether to take the bet or pass,
509 |         a purely binary prediction. When the predicted label the previous feasible values {−1,0,1}.
510 |         The ML algorithm will be trained to decide is 1, we can use the probability of this secondary prediction
511 |         to derive the size of the bet, where the side (sign) of the position has been set by the primary model.
512 |         :param triple_barrier_events: (data frame)
513 |                     -events.index is event's starttime
514 |                     -events['t1'] is event's endtime
515 |                     -events['trgt'] is event's target
516 |                     -events['side'] (optional) implies the algo's position side
517 |                     Case 1: ('side' not in events): bin in (-1,1) <-label by price action
518 |                     Case 2: ('side' in events): bin in (0,1) <-label by pnl (meta-labeling)
519 |         :param close: (series) close prices
520 |         :return: (data frame) of meta-labeled events
521 |         """
522 | 
523 |         # 1) Align prices with their respective events
524 |         events_ = triple_barrier_events.dropna(subset=['t1'])
525 |         all_dates = events_.index.union(other=events_['t1'].values).drop_duplicates()
526 |         prices = close.reindex(all_dates, method='bfill')
527 | 
528 |         # 2) Create out DataFrame
529 |         out_df = pd.DataFrame(index=events_.index)
530 |         # Need to take the log returns, else your results will be skewed for short positions
531 |         out_df['ret'] = np.log(prices.loc[events_['t1'].values].values) - np.log(prices.loc[events_.index])
532 |         out_df['trgt'] = events_['trgt']
533 | 
534 |         # Meta labeling: Events that were correct will have pos returns
535 |         if 'side' in events_:
536 |             out_df['ret'] = out_df['ret'] * events_['side']  # meta-labeling
537 | 
538 |         # Added code: label 0 when vertical barrier reached
539 |         out_df = self.barrier_touched(out_df, triple_barrier_events)
540 | 
541 |         # Meta labeling: label incorrect events with a 0
542 |         if 'side' in events_:
543 |             out_df.loc[out_df['ret'] <= 0, 'bin'] = 0
544 | 
545 |         # Transform the log returns back to normal returns.
546 |         out_df['ret'] = np.exp(out_df['ret']) - 1
547 | 
548 |         # Add the side to the output. This is useful for when a meta label model must be fit
549 |         tb_cols = triple_barrier_events.columns
550 |         if 'side' in tb_cols:
551 |             out_df['side'] = triple_barrier_events['side']
552 | 
553 |         return out_df
554 | 
555 | 
def get_side(data, fast_window=None, slow_window=None):
    """
    Add a moving-average crossover signal to the bars.

    Adds the columns 'fast_mavg' and 'slow_mavg' (rolling means of 'close') and 'side'
    (1 where fast_mavg >= slow_mavg, -1 where fast_mavg < slow_mavg, NaN while either average is
    not yet available). The side is lagged by one bar to avoid look-ahead bias.

    :param data: (data frame) bars with a 'close' column; it is modified in place
    :param fast_window: (int) window of the fast moving average; defaults to the module-level `fast`
    :param slow_window: (int) window of the slow moving average; defaults to the module-level `slow`
    :return: (data frame) the same `data` object with the three columns added
    """
    # Fall back to the script-level settings only when the caller did not choose a window
    if fast_window is None:
        fast_window = fast
    if slow_window is None:
        slow_window = slow

    data['fast_mavg'] = data['close'].rolling(window=fast_window, min_periods=fast_window, center=False).mean()
    data['slow_mavg'] = data['close'].rolling(window=slow_window, min_periods=slow_window, center=False).mean()

    # Compute sides; comparisons against NaN are False, so warm-up rows keep a NaN side
    data['side'] = np.nan

    long_signals = data['fast_mavg'] >= data['slow_mavg']
    short_signals = data['fast_mavg'] < data['slow_mavg']
    data.loc[long_signals, 'side'] = 1
    data.loc[short_signals, 'side'] = -1

    # Remove look-ahead bias by lagging the signal
    data['side'] = data['side'].shift(1)

    return data
576 | 
577 | 
def get_indicators(raw_data):
    """
    Build the feature columns used alongside the labels.

    Adds to `raw_data` (in place): log returns, 1-5 bar momentum, rolling volatility of the log
    returns (50 / 31 / 15 bars), rolling 50-bar autocorrelation of the log returns at lags 1-5,
    the log returns lagged by 1-5 bars, and the moving-average crossover side recomputed from the
    existing 'fast_mavg' / 'slow_mavg' columns.

    :param raw_data: (data frame) bars with 'close', 'fast_mavg' and 'slow_mavg' columns
    :return: (data frame) a copy of the enriched frame with every column shifted by one row,
        which removes look-ahead bias
    """
    lags = range(1, 6)
    close = raw_data['close']

    # Log returns
    raw_data['log_ret'] = np.log(close).diff()
    log_ret = raw_data['log_ret']

    # Momentum
    for lag in lags:
        raw_data[f'mom{lag}'] = close.pct_change(periods=lag)

    # Volatility
    for span in (50, 31, 15):
        raw_data[f'volatility_{span}'] = log_ret.rolling(window=span, min_periods=span, center=False).std()

    # Serial correlation; this is the slow part (the original note estimates about 4 minutes)
    window_autocorr = 50
    for lag in lags:
        windows = log_ret.rolling(window=window_autocorr, min_periods=window_autocorr, center=False)
        raw_data[f'autocorr_{lag}'] = windows.apply(lambda x, lag=lag: x.autocorr(lag=lag), raw=False)

    # Lagged log returns
    for lag in lags:
        raw_data[f'log_t{lag}'] = log_ret.shift(lag)

    # Recompute sides from the moving averages; rows where either average is NaN stay NaN
    fast_above = raw_data['fast_mavg'] >= raw_data['slow_mavg']
    fast_below = raw_data['fast_mavg'] < raw_data['slow_mavg']
    raw_data['side'] = np.nan
    raw_data.loc[fast_above, 'side'] = 1
    raw_data.loc[fast_below, 'side'] = -1

    # Remove look ahead bias: every column is lagged by one bar
    return raw_data.shift(1)
630 | 
631 | 
# Source folder: the tick files read by this script (data/5_AdjTicks is the folder that
# 3_create_adj_ticks.py uses as its output location).
my_dir = os.getcwd()
ticks_folder = os.path.join(my_dir, "data/5_AdjTicks")

# Destination folder for the dollar-bar CSV files.
# makedirs(exist_ok=True) creates the folder (and missing parents) without a separate existence check.
dollar_bars_folder = os.path.join(my_dir, "data/6_DollarBars")
os.makedirs(dollar_bars_folder, exist_ok=True)

# Destination folder for the CSV files with indicators and labels.
indicators_folder = os.path.join(my_dir, "data/7_Indicators")
os.makedirs(indicators_folder, exist_ok=True)

# One key per non-hidden tick file: the first four characters of its name
# (presumably the ticker symbol - verify against the contents of data/5_AdjTicks).
keys = [name[:4] for name in os.listdir(ticks_folder) if not name.startswith(".")]
print(keys)

# Input parameters

est_ticks = 10  # target number of dollar bars per trading day

vertical_barrier_days = 5  # days

# the following parameters need to be adjusted for particular case
pt_sl = [1, 2]  # profit-taking and stop-loss multiples of the target
min_ret = 1 / 100  # triple_barrier_boundary

# sma windows (in bars) of the fast and slow moving averages used for the side signal
fast = 20
slow = 50


def get_dollar_bars_file_name(key, est_ticks):
    """Return the dollar-bar CSV file name for `key` at `est_ticks` bars per day."""
    return f"{key}_{est_ticks}_dollar_bars.csv"
665 | 
# Build the dollar bars for every key, skipping keys whose output file already exists.
for key in keys:

    # Keys are the leading characters of the non-hidden tick file names, so match on the prefix;
    # a substring match could pick up a hidden or unrelated file that merely contains the key.
    ticks_file = [f for f in os.listdir(ticks_folder) if f.startswith(key)][0]
    ticks_file_path = os.path.join(ticks_folder, ticks_file)

    dollar_bars_path = os.path.join(dollar_bars_folder, get_dollar_bars_file_name(key, est_ticks))

    if not os.path.exists(dollar_bars_path):
        # Select DollarBar size
        ticks = pd.read_parquet(ticks_file_path)

        # overall traded value: sum of price * volume over all ticks
        N = ticks[['price', 'volume']].prod(axis=1).sum()

        # number of distinct calendar days with at least one tick
        D = np.unique(ticks.date_time.values.astype('M8[D]')).shape[0]

        # bar threshold chosen so that an average day yields about est_ticks dollar bars
        threshold = np.round((N / D) / est_ticks)
        print('Creating Dollar Bars for ', key)
        print("total dollar volume: ", N, "trading days: ", D, "dollars of trade in dollar bar: ", threshold)

        # ## Create dollar bars
        # With to_csv=True the bars are requested to be written to output_path (mlfinlab API).
        dollar = ml.data_structures.get_dollar_bars(ticks_file_path,
                                                    threshold=threshold, batch_size=5000000,
                                                    verbose=True, to_csv=True,
                                                    output_path=dollar_bars_path)
697 | 
def get_indicators_name(key, vbd, minret):
    """Return the indicators CSV file name for `key`, the vertical-barrier days and min_ret in percent."""
    return f"{key}_{vbd}_{minret * 100}_indicators.csv"


# Label the dollar bars and compute the indicators for every key, skipping keys whose
# output file already exists.
for key in keys:
    dollar_bars_path = os.path.join(dollar_bars_folder, get_dollar_bars_file_name(key, est_ticks))
    indicators_path = os.path.join(indicators_folder,
                                   get_indicators_name(key, vertical_barrier_days, min_ret))

    if not os.path.exists(indicators_path):
        data = pd.read_csv(dollar_bars_path, index_col=0, parse_dates=True)
        print("data shape for ", key, " - ", data.shape)

        # data heads: ['open', 'high', 'low', 'close'] ?? cum_vol    cum_dollar    cum_ticks
        ############ get indicators:###########################################
        # Adds fast/slow moving averages and the lagged crossover side
        data = get_side(data)

        ################## build bins ###################################
        # Save the raw data (including the NaN warm-up rows) before they are dropped below;
        # the indicators are computed on this copy later.
        raw_data = data.copy()

        # Drop the NaN values from our data set
        data.dropna(axis=0, how='any', inplace=True)

        trplbr = TrippleBarrier()

        # Compute daily volatility
        daily_vol = trplbr.get_daily_vol(close=data['close'], lookback=50)

        # Apply Symmetric CUSUM Filter and get timestamps for events
        # Note: Only the CUSUM filter needs a point estimate for volatility
        cusum_events = trplbr.cusum_filter(data['close'], threshold=daily_vol.mean() * 0.5)

        # Compute vertical barrier
        vertical_barriers = trplbr.add_vertical_barrier(t_events=cusum_events, close=data['close'],
                                                        num_days=vertical_barrier_days)

        # pt_sl and min_ret come from the input parameters at the top of the script and
        # need to be adjusted for the particular case
        triple_barrier_events = trplbr.get_events(close=data['close'],
                                                  t_events=cusum_events,
                                                  pt_sl=pt_sl,
                                                  target=daily_vol,
                                                  min_ret=min_ret,
                                                  num_threads=3,
                                                  vertical_barrier_times=vertical_barriers,
                                                  side_prediction=data['side'])

        # labels = ml.labeling.get_bins(triple_barrier_events, data['close'])
        labels = trplbr.get_bins(triple_barrier_events, data['close'])

        print("shape of labels :", labels.shape)

        ###################### get other indicators ####################################

        raw_data = get_indicators(raw_data)

        #### Now get the data at the specified events
        # Column-wise join on the index: bars without an event get NaN in the label columns.
        df = pd.concat([raw_data, labels], axis=1, sort=False)

        # Keep only the rows for which the slow moving average is available
        df[~df.slow_mavg.isna()].to_csv(indicators_path)
759 | 
760 | 


--------------------------------------------------------------------------------