# https://help.yahoo.com/kb/SLN28256.html
"""Back-adjust tick prices for cash dividends.

For every ticker that has both a tick file in ``data/4_Ticks`` and a
dividends file in ``data/3_Dividends``, prices of all ticks that precede a
dividend date are multiplied by ``1 - dividend / last_price_before_date``.
The adjusted ticks are written to ``data/5_AdjTicks`` under the same file
name as the source tick file.
"""

import os
import numpy as np
import pandas as pd
import datetime as dt

from multiprocessing import Pool

my_dir = os.getcwd()

ticks_folder = os.path.join(my_dir, "data/4_Ticks")
dividends_folder = os.path.join(my_dir, "data/3_Dividends")

adj_ticks_folder = os.path.join(my_dir, "data/5_AdjTicks")


def adjust_prices_for_dividends(ticks, dividends):
    """Return a copy of *ticks* with dividend-adjusted ``price`` values.

    Parameters
    ----------
    ticks : pandas.DataFrame
        Tick data indexed by timestamp with a ``price`` column.
        NOTE(review): the index is assumed to be sorted ascending, because
        the first/last index values are used as the covered range and the
        lookup below relies on ``searchsorted`` -- confirm upstream.
    dividends : pandas.DataFrame
        Indexed by dividend date; the first column holds the dividend amount.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input *ticks* is left untouched.
    """
    adj_ticks = ticks.copy()
    if ticks.empty:
        return adj_ticks

    dividends = dividends.sort_index()

    # Only dividends strictly inside the period covered by the ticks apply.
    in_range = (dividends.index > ticks.index[0]) & (dividends.index < ticks.index[-1])
    dividends_due = dividends[in_range]

    # searchsorted gives the first tick at/after each dividend date, so
    # "- 1" is the last tick strictly before it (always >= 0 because every
    # remaining dividend date is later than the first tick).
    positions = pd.Series(ticks.index).searchsorted(dividends_due.index) - 1
    last_prices = ticks["price"].values[positions]
    coefs = 1 - dividends_due.iloc[:, 0].values / last_prices

    for ex_date, coef in zip(dividends_due.index, coefs):
        # A single .loc[rows, column] assignment writes into adj_ticks
        # itself; the chained form `adj_ticks.price.loc[...] *= ...` can
        # end up modifying a temporary copy and silently change nothing.
        adj_ticks.loc[adj_ticks.index < ex_date, "price"] *= coef

    return adj_ticks


def main():
    """Adjust every tick file that has a matching dividends file."""
    os.makedirs(adj_ticks_folder, exist_ok=True)

    keys = [key[:4] for key in os.listdir(ticks_folder) if not key.startswith(".")]
    print(keys)

    # The dividends listing does not change while we run: read it once.
    div_keys = {key[:4] for key in os.listdir(dividends_folder) if not key.startswith(".")}

    for key in keys:
        if key not in div_keys:
            print(key, 'passed')
            continue

        print('processing ', key)
        dividends = pd.read_csv(os.path.join(dividends_folder, (key + ".ME.csv")), index_col=0, parse_dates=True)

        ticks_file = [f for f in os.listdir(ticks_folder) if key in f][0]
        ticks = pd.read_parquet(os.path.join(ticks_folder, ticks_file), engine='fastparquet')
        ticks.index = ticks.date_time

        adj_ticks = adjust_prices_for_dividends(ticks, dividends)

        t1 = dt.datetime.now()

        adj_ticks_file = os.path.join(adj_ticks_folder, ticks_file)

        print("saving parquet")
        adj_ticks.reset_index(drop=True).to_parquet(adj_ticks_file, compression='GZIP', engine='fastparquet')
        t2 = dt.datetime.now()
        print(t2 - t1)


if __name__ == "__main__":
    main()
def check_breaks(file_names):
    """Check that consecutive tick files cover adjacent date ranges.

    Each name is expected to carry two ``%y%m%d`` dates: one at characters
    ``[5:11]`` and one at ``[-10:-4]`` (e.g. ``"GAZP_190101_190131.csv"`` --
    NOTE(review): example pattern inferred from the slices, confirm against
    the real files). For every file after the first, the date at ``[5:11]``
    minus one day must equal the ``[-10:-4]`` date of the previous file.

    Parameters
    ----------
    file_names : list of str
        File names in the order they should follow each other.

    Returns
    -------
    bool
        True when no gaps/overlaps were found (including for an empty
        list), False otherwise. Mismatches are printed as
        ``[previous_end, expected_previous_end]`` pairs.
    """
    if not file_names:
        # Nothing to compare; previously this raised IndexError.
        return True

    breaches = []
    end = dt.datetime.strptime(file_names[0][-10:-4], "%y%m%d")
    for name in file_names[1:]:
        start = dt.datetime.strptime(name[5:11], "%y%m%d")
        expected_previous_end = start - dt.timedelta(days=1)
        if end != expected_previous_end:
            breaches.append([end, expected_previous_end])
        end = dt.datetime.strptime(name[-10:-4], "%y%m%d")

    if breaches:
        print("Breaches: ", breaches)
        return False
    return True