├── finance_ml ├── __init__.py ├── breakout │ ├── __init__.py │ └── get_bsadf.py ├── constants.py ├── datasets │ ├── __init__.py │ └── generate.py ├── multiprocessing │ ├── __init__.py │ ├── partition.py │ ├── pandas.py │ └── utils.py ├── risk │ ├── __init__.py │ └── failure.py ├── stats │ ├── __init__.py │ ├── rolling.py │ └── vol.py ├── hierarchical_clustering │ ├── __init__.py │ ├── metrics.py │ ├── utils.py │ ├── quasi.py │ └── allocation.py ├── labeling │ ├── __init__.py │ ├── utils.py │ ├── sampling.py │ ├── trend.py │ ├── betsides.py │ ├── betsizes.py │ └── barriers.py ├── features │ ├── __init__.py │ ├── orth.py │ ├── fraction.py │ └── entropy.py ├── sampling │ ├── __init__.py │ ├── utils.py │ ├── decay.py │ ├── weight.py │ ├── bootstrap.py │ ├── co_events.py │ └── time_weight.py ├── model_selection │ ├── __init__.py │ ├── distribution.py │ ├── pipeline.py │ ├── hyper.py │ ├── score.py │ ├── utils.py │ └── kfold.py ├── utils.py ├── distance.py ├── denoising.py ├── experiments.py ├── clustering.py └── importance.py ├── setup.py ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ └── index.rst ├── .gitignore ├── datasets.py ├── LICENSE └── README.md /finance_ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finance_ml/breakout/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finance_ml/constants.py: -------------------------------------------------------------------------------- 1 | LONG = 1 2 | SHORT = -1 -------------------------------------------------------------------------------- /finance_ml/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .generate import get_cls_data -------------------------------------------------------------------------------- /finance_ml/multiprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .pandas import mp_pandas_obj -------------------------------------------------------------------------------- /finance_ml/risk/__init__.py: -------------------------------------------------------------------------------- 1 | from .failure import calc_prob_sr, prob_failure 2 | -------------------------------------------------------------------------------- /finance_ml/stats/__init__.py: -------------------------------------------------------------------------------- 1 | from .vol import get_vol, get_mean 2 | from .rolling import pandas_rolling -------------------------------------------------------------------------------- /finance_ml/hierarchical_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from .allocation import get_rec_bipart, get_hrp 2 | from .quasi import get_quasi_diag 3 | from .metrics import get_corr_dist -------------------------------------------------------------------------------- /finance_ml/labeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .barriers import get_events, get_t1, get_barrier_labels 2 | from .sampling import cusum_filter 3 | from .trend import get_bins_from_trend -------------------------------------------------------------------------------- /finance_ml/features/__init__.py: 
--------------------------------------------------------------------------------
1 | from .orth import get_evec, ortho_feats
2 | from .entropy import match_length, lempel_zib_lib, get_entropy_rate, plug_in, konto
3 | from .fraction import get_opt_d, frac_diff_FFD
--------------------------------------------------------------------------------
/finance_ml/sampling/__init__.py:
--------------------------------------------------------------------------------
1 | from .co_events import get_num_co_events
2 | from .bootstrap import seq_bootstrap
3 | from .time_weight import get_sample_weight, get_uniq_weight
4 | from .decay import get_time_decay
--------------------------------------------------------------------------------
/finance_ml/model_selection/__init__.py:
--------------------------------------------------------------------------------
1 | from .kfold import PurgedKFold, CPKFold, generate_signals
2 | from .score import cv_score
3 | from .pipeline import Pipeline
4 | from .hyper import clf_hyper_fit
5 | from .distribution import LogUniformGen, log_uniform
6 | from .utils import evaluate
--------------------------------------------------------------------------------
/finance_ml/hierarchical_clustering/metrics.py:
--------------------------------------------------------------------------------
1 | def get_corr_dist(corr):
2 |     """Calculate correlation distance
3 | 
4 |     Params
5 |     ------
6 |     corr: pd.DataFrame
7 | 
8 |     Returns
9 |     -------
10 |     pd.DataFrame
11 |     """
12 |     dist = ((1 - corr) / 2)**.5
13 |     return dist
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | 
4 | setup(
5 |     name="finance_ml",
6 |     version='0.1',
7 |     description='utility library for finance',
8 |     author='jjakimoto',
9 |     author_email='f.j.akimoto@gmail.com',
10 |     packages=find_packages(),
11 |     py_modules=["finance_ml"]
12 | )
--------------------------------------------------------------------------------
/finance_ml/stats/rolling.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | 
4 | def pandas_rolling(series, window, freq=1, method='mean'):
5 |     series_list = []
6 |     for i in range(freq):
7 |         _series = series.iloc[i::freq].rolling(window).agg(method)
8 |         series_list.append(_series)
9 |     return pd.concat(series_list, axis=0).sort_index()
--------------------------------------------------------------------------------
/finance_ml/model_selection/distribution.py:
--------------------------------------------------------------------------------
1 | from scipy.stats import rv_continuous
2 | import numpy as np
3 | 
4 | 
5 | class LogUniformGen(rv_continuous):
6 |     def _cdf(self, x):
7 |         return np.log(x / self.a) / np.log(self.b / self.a)
8 | 
9 | 
10 | def log_uniform(a=1, b=np.exp(1)):
11 |     return LogUniformGen(a=a, b=b, name='log_uniform')
--------------------------------------------------------------------------------
/finance_ml/model_selection/pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.pipeline import Pipeline as _Pipeline
2 | 
3 | 
4 | class Pipeline(_Pipeline):
5 |     def fit(self, X, y, sample_weight=None, **fit_params):
6 |         if sample_weight is not None:
7 |             fit_params[self.steps[-1][0] + '__sample_weight'] = sample_weight
8 |         return super(Pipeline, self).fit(X, y, **fit_params)
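A quick usage sketch for the `Pipeline` subclass above: it forwards a top-level `sample_weight` argument to the final step through scikit-learn's `<step_name>__sample_weight` fit-parameter convention. The scaler, classifier, and step names below are illustrative choices, not part of this repository; only `finance_ml.model_selection.Pipeline` comes from the package itself.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from finance_ml.model_selection import Pipeline

# Toy data; in practice the weights would come from finance_ml.sampling
# (e.g. uniqueness- or time-decay-based sample weights).
X = np.random.randn(100, 5)
y = (X[:, 0] > 0).astype(int)
w = np.random.rand(100)

pipe = Pipeline([('scaler', StandardScaler()),
                 ('clf', RandomForestClassifier(n_estimators=10))])
# fit() rewrites this into fit_params['clf__sample_weight'] = w and delegates
# to sklearn's Pipeline.fit, so only the last step receives the weights.
pipe.fit(X, y, sample_weight=w)
```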
--------------------------------------------------------------------------------
/finance_ml/utils.py:
--------------------------------------------------------------------------------
1 | import numbers
2 | from copy import deepcopy
3 | 
4 | import numpy as np
5 | 
6 | 
7 | def sign_log(x, scale=1):
8 |     const = 1
9 |     if isinstance(x, numbers.Number):
10 |         if x >= 0:
11 |             return np.log(const + scale * x)
12 |         else:
13 |             return -np.log(const + scale * np.abs(x))
14 |     x = deepcopy(x)
15 |     x[x >= 0] = np.log(const + scale * np.abs(x[x >= 0]))
16 |     x[x < 0] = -np.log(const + scale * np.abs(x[x < 0]))
17 |     return x
--------------------------------------------------------------------------------
/finance_ml/sampling/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | 
4 | def get_ind_matrix(bar_idx, t1):
5 |     ind_m = pd.DataFrame(0, index=bar_idx,
6 |                          columns=range(t1.shape[0]))
7 |     for i, (t0_, t1_) in enumerate(t1.iteritems()):
8 |         ind_m.loc[t0_:t1_, i] = 1
9 |     return ind_m
10 | 
11 | 
12 | def get_avg_uniq(ind_m, c=None):
13 |     if c is None:
14 |         c = ind_m.sum(axis=1)
15 |     ind_m = ind_m.loc[c > 0]
16 |     c = c.loc[c > 0]
17 |     u = ind_m.div(c, axis=0)
18 |     avg_u = u[u > 0].mean()
19 |     avg_u = avg_u.fillna(0)
20 |     return avg_u
21 | 
--------------------------------------------------------------------------------
/finance_ml/hierarchical_clustering/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import random
4 | 
5 | 
6 | def generateData(nObs, size0, size1, sigma1):
7 |     # Time series of correlated variables
8 |     # 1) generating some uncorrelated data
9 |     np.random.seed(seed=12345)
10 |     random.seed(12345)
11 |     x = np.random.normal(0, 1, size=(nObs, size0))  # each row is a variable
12 |     # 2) creating correlation between the variables
13 |     cols = [random.randint(0, size0 - 1) for i in range(size1)]
14 |     y = x[:, cols] + np.random.normal(0, sigma1, size=(nObs, len(cols)))
15 |     x = np.append(x, y, axis=1)
16 |     x = pd.DataFrame(x, columns=range(1, x.shape[1] + 1))
17 |     return x, cols
18 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /finance_ml/sampling/decay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get_time_decay(uniq_weight, last=1.): 6 | """Calculate time decay weight 7 | 8 | Params 9 | ------ 10 | uniq_weight: pd.Series 11 | Sampling weight calculated label uniqueness 12 | last: float, default 1, no decay 13 | Parameter to detemine the slope and constant 14 | 15 | Returns 16 | ------- 17 | pd.Series 18 | """ 19 | weight = uniq_weight.sort_index().cumsum() 20 | if last > 0: 21 | slope = (1 - last) / weight.iloc[-1] 22 | else: 23 | slope = 1 / ((1 + last) * weight.iloc[-1]) 24 | const = 1. - slope * weight.iloc[-1] 25 | weight = const + slope * weight 26 | weight[weight < 0] = 0 27 | return weight -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | *.pyc 10 | *.swp 11 | 12 | # Packages # 13 | ############ 14 | # it's better to unpack these files and commit the raw source 15 | # git has its own built in compression methods 16 | *.7z 17 | *.dmg 18 | *.gz 19 | *.iso 20 | *.jar 21 | *.rar 22 | *.tar 23 | *.zip 24 | *.meta 25 | *.index 26 | *.ckpt* 27 | # Logs and databases # 28 | ###################### 29 | .ipynb_checkpoints 30 | *.log 31 | *.sql 32 | *.sqlite 33 | /*.egg-info 34 | 35 | # OS generated files # 36 | ###################### 37 | .DS_Store 38 | .DS_Store? 39 | ._* 40 | .Spotlight-V100 41 | .Trashes 42 | ehthumbs.db 43 | Thumbs.db 44 | dist 45 | build 46 | build/* 47 | .idea 48 | params 49 | */data/* 50 | .vscode 51 | -------------------------------------------------------------------------------- /finance_ml/sampling/weight.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get_time_decay(tw, last_w=1., truncate=0, is_exp=False): 6 | cum_w = tw.sort_index().cumsum() 7 | init_w = 1. 8 | if is_exp: 9 | init_w = np.log(init_w) 10 | if last_w >= 0: 11 | if is_exp: 12 | last_w = np.log(last_w) 13 | slope = (init_w - last_w) / cum_w.iloc[-1] 14 | else: 15 | slope = init_w / ((last_w + 1) * cum_w.iloc[-1]) 16 | const = init_w - slope * cum_w.iloc[-1] 17 | weights = const + slope * cum_w 18 | if is_exp: 19 | weights =np.exp(weights) 20 | weights[weights < truncate] = 0 21 | return weights 22 | 23 | 24 | def get_sample_tw(t1, num_co_events, molecule): 25 | wght = pd.Series(index=molecule) 26 | for t_in, t_out in t1.loc[wght.index].iteritems(): 27 | wght.loc[t_in] = (1. / num_co_events.loc[t_in: t_out]).mean() 28 | return wght -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /finance_ml/labeling/utils.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | 3 | 4 | def drop_labels(events, min_pct=0.05): 5 | while True: 6 | df = events['bin'].value_counts(normalize=True) 7 | if df.min() > min_pct or df.shape[0] < 3: 8 | break 9 | print('dropped label', df.argmin(), df.min()) 10 | events = events[events['bin'] != df.argmin()] 11 | return events 12 | 13 | 14 | def get_partial_index(df, start=None, end=None): 15 | """Get partial time index according to start and end 16 | 17 | Args: 18 | df (pd.DatFrame or pd.Series) 19 | 20 | start (datetime.datetime, optional): e.g., datetime(2018, 1, 1) 21 | 22 | end (datetime.datetime, optional): e.g., dateteim(2018, 3, 1) 23 | 24 | Returns: 25 | pd.DatetimeIndex 26 | """ 27 | if start is not None: 28 | df = df.loc[df.index >= start] 29 | if end is not None: 30 | df = df.loc[df.index <= end] 31 | return df.index 32 | -------------------------------------------------------------------------------- /finance_ml/sampling/bootstrap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .utils import get_avg_uniq 3 | 4 | 5 | def seq_bootstrap(ind_m, s_length=None): 6 | if s_length is None: 7 | s_length = ind_m.shape[1] 8 | phi = [] 9 | while len(phi) < s_length: 10 | c = ind_m[phi].sum(axis=1) + 1 11 | avg_u = get_avg_uniq(ind_m, c) 12 | prob = (avg_u / avg_u.sum()).values 13 | phi += [np.random.choice(ind_m.columns, p=prob)] 14 | return phi 15 | 16 | 17 | def get_ind_matrix(timestamps, t1, num_threads=1): 18 | return mp_pandas_obj( 19 | mp_ind_matrix, ('molecule', t1.index), 20 | num_threads, 21 | timestamps=timestamps, 22 | t1=t1) 23 | 24 | 25 | def mp_ind_matrix(timestamps, t1, molecule): 26 | t1 = t1.loc[molecule] 27 | ind_matrix = pd.DataFrame(0, index=timestamps, columns=molecule) 28 | for i, (t0, t1) in enumerate(t1.iteritems()): 29 | ind_matrix.loc[t0:t1] = 1 30 | return ind_matrix -------------------------------------------------------------------------------- /finance_ml/stats/vol.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def _get_ret(close, span=100, days=None, seconds=None): 6 | """Estimate exponential average volatility""" 7 | if days is None: 8 | delta = pd.Timedelta(seconds=seconds) 9 | else: 10 | delta = pd.Timedelta(days=days) 11 | use_idx = close.index.searchsorted(close.index - delta) 12 | prev_idx = pd.Series(use_idx, index=close.index) 13 | prev_idx = prev_idx[prev_idx > 0] 14 | # Get rid of duplications in index 15 | prev_idx = prev_idx.drop_duplicates() 16 | ret = close[prev_idx.index] / close[prev_idx].values - 1 17 | vol = ret.ewm(span=span).std() 18 | return vol 19 | 20 | 21 | def get_vol(close, span=100, days=None, seconds=None): 22 | ret = _get_ret(close, span, days, seconds) 
23 | vol = ret.ewm(span=span).std() 24 | return vol 25 | 26 | 27 | def get_mean(close, span=100, days=None, seconds=None): 28 | ret = _get_ret(close, span, days, seconds) 29 | mean = ret.ewm(span=span).mean() 30 | return mean 31 | -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.datasets import make_classification 3 | 4 | 5 | def get_test_data(n_features=40, n_informative=10, n_redundant=10, n_samples=10000): 6 | X, cont = make_classification(n_samples=n_samples, n_features=n_features, 7 | n_informative=n_informative, n_redundant=n_redundant, 8 | random_state=0, shuffle=False) 9 | time_idx = pd.DatetimeIndex(periods=n_samples, freq=pd.tseries.offsets.BDay(), 10 | end=pd.datetime.today()) 11 | X = pd.DataFrame(X, index=time_idx) 12 | cont = pd.Series(cont, index=time_idx).to_frame('bin') 13 | # Create name of columns 14 | columns = ['I_' + str(i) for i in range(n_informative)] 15 | columns += ['R_' + str(i) for i in range(n_redundant)] 16 | columns += ['N_' + str(i) for i in range(n_features - len(columns))] 17 | X.columns = columns 18 | cont['w'] = 1. / cont.shape[0] 19 | cont['t1'] = pd.Series(cont.index, index=cont.index) 20 | return X, cont 21 | -------------------------------------------------------------------------------- /finance_ml/datasets/generate.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import make_classification 2 | import pandas as pd 3 | 4 | 5 | def get_cls_data(n_features=40, n_informative=10, n_redundant=10, n_samples=10000): 6 | X, cont = make_classification(n_samples=n_samples, n_features=n_features, 7 | n_informative=n_informative, n_redundant=n_redundant, 8 | random_state=0, shuffle=False) 9 | time_idx = pd.DatetimeIndex(periods=n_samples, freq=pd.tseries.offsets.BDay(), 10 | end=pd.datetime.today()) 11 | X = pd.DataFrame(X, index=time_idx) 12 | cont = pd.Series(cont, index=time_idx).to_frame('bin') 13 | # Create name of columns 14 | columns = ['I_' + str(i) for i in range(n_informative)] 15 | columns += ['R_' + str(i) for i in range(n_redundant)] 16 | columns += ['N_' + str(i) for i in range(n_features - len(columns))] 17 | X.columns = columns 18 | cont['w'] = 1. / cont.shape[0] 19 | cont['t1'] = pd.Series(cont.index, index=cont.index) 20 | return X, cont -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Tomoaki Fujii 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /finance_ml/hierarchical_clustering/quasi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get_quasi_diag(link): 6 | """Calculate quasi diagonalization 7 | 8 | Params 9 | ------ 10 | link: list 11 | Result from hierachical clustering of scipy 12 | 13 | Returns 14 | ------- 15 | pd.Series: sorted index 16 | """ 17 | # Make labels integers 18 | link = link.astype(int) 19 | sort_idx = pd.Series([link[-1, 0], link[-1, 1]]) 20 | num_items = link[-1, 3] 21 | # Iterate until all elements are assigned 22 | while sort_idx.max() >= num_items: 23 | # Gerante index for the first element of cluster 24 | sort_idx.index = range(0, sort_idx.shape[0] * 2, 2) 25 | # Get clustered value not single elements 26 | clusters = sort_idx[sort_idx >= num_items] 27 | idx = clusters.index 28 | # Add clusters 29 | cl_idx = clusters.values - num_items 30 | sort_idx[idx] = link[cl_idx, 0] 31 | df = pd.Series(link[cl_idx, 1], index=idx + 1) 32 | sort_idx = sort_idx.append(df) 33 | # Resort 34 | sort_idx = sort_idx.sort_index() 35 | sort_idx.index = range(sort_idx.shape[0]) 36 | return sort_idx.tolist() -------------------------------------------------------------------------------- /finance_ml/sampling/co_events.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from ..multiprocessing import mp_pandas_obj 5 | 6 | 7 | def mp_num_co_events(timestamps, t1, molecule): 8 | """Calculate the number of co events for multiprocessing""" 9 | # Find events that span the period defined by molecule 10 | t1 = t1.fillna(timestamps[-1]) 11 | t1 = t1[t1 >= molecule[0]] 12 | t1 = t1.loc[:t1[molecule].max()] 13 | # Count the events 14 | iloc = timestamps.searchsorted(np.array([t1.index[0], t1.max()])) 15 | count = pd.Series(0, index=timestamps[iloc[0]:iloc[1] + 1]) 16 | for t_in, t_out in t1.iteritems(): 17 | count.loc[t_in:t_out] += 1 18 | return count.loc[molecule[0]:t1[molecule].max()] 19 | 20 | 21 | def get_num_co_events(timestamps, t1, num_threads=1): 22 | """Calculate the number of co events 23 | 24 | Params 25 | ------ 26 | timestamps: DatetimeIndex 27 | The timesstamps defining the range of searching 28 | t1: pd.Series 29 | num_threads: int 30 | 31 | Returns 32 | pd.Series: each value corresponds to the number of co occurence 33 | """ 34 | return mp_pandas_obj( 35 | mp_num_co_events, ('molecule', t1.index), 36 | num_threads, 37 | timestamps=timestamps, 38 | t1=t1) 39 | -------------------------------------------------------------------------------- /finance_ml/labeling/sampling.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import pandas as pd 3 | 4 | 5 | def cusum_filter(close, h, k=0): 6 | """Sample points with CUSUM Filter 7 | 8 | Args: 9 | close (pd.Series): Price series. 
10 | 11 | h (float or pd.Series): Threasholds to sample points.\ 12 | If specified with float, translate to pd.Series(h, index=close.index) 13 | 14 | k (float, optional): Minimum speed parameter to hit threashold.\ 15 | Defaults to 0, which means inactive 16 | 17 | Returns: 18 | pd.DatetimeIndex: Sampled data points 19 | """ 20 | # asssum that E y_t = y_{t-1} 21 | s_pos, s_neg = 0, 0 22 | diff = close.diff().dropna() 23 | # time variant threshold 24 | if isinstance(h, numbers.Number): 25 | h = pd.Series(h, index=diff.index) 26 | h = h.reindex(diff.index, method='bfill') 27 | h = h.dropna() 28 | timestamps = [] 29 | th = h.loc[h.index[0]] 30 | for t in h.index: 31 | s_pos = max(0, s_pos + diff.loc[t] - k) 32 | s_neg = min(0, s_neg + diff.loc[t] + k) 33 | if s_pos > th: 34 | s_pos = 0 35 | timestamps.append(t) 36 | th = h.loc[t] 37 | elif s_neg < -th: 38 | s_neg = 0 39 | timestamps.append(t) 40 | th = h.loc[t] 41 | return pd.DatetimeIndex(timestamps) 42 | -------------------------------------------------------------------------------- /finance_ml/risk/failure.py: -------------------------------------------------------------------------------- 1 | def calc_prob_sr(pt, sl, freq, tgt_sr, rf=0.): 2 | """Calculate required probability wrt target SR 3 | 4 | Paramters 5 | --------- 6 | pt: float 7 | Profit Take 8 | sl: float 9 | Stop Loss 10 | freq: float 11 | Frequency of trading 12 | tgt_sr: float 13 | Target Sharpe Ratio 14 | rf: float, (default 0) 15 | Risk Free Rate 16 | 17 | Returns 18 | ------- 19 | float: Required probability 20 | """ 21 | diff = pt - sl 22 | a = (freq + tgt_sr ** 2) * diff ** 2 23 | b = diff * (2 * freq * (sl - rf) - tgt_sr ** 2 * diff) 24 | c = freq * (sl - rf) ** 2 25 | p = (-b + (b ** 2 - 4 * a * c) ** .5) / (2. * a) 26 | return p 27 | 28 | 29 | def prob_failure(ret, freq, tgt_sr): 30 | """ 31 | Calculate the probability to fail in achieving 32 | target Sharpe Ratio 33 | 34 | Parameters 35 | ---------- 36 | ret: array-like 37 | Returns of trading 38 | freq: float 39 | Frequency of trading 40 | tgt_sr: float 41 | Aiming Sharpe Ratio 42 | 43 | Returns 44 | ------- 45 | risk: float 46 | """ 47 | r_pos = ret[ret > 0].mean() 48 | r_neg = ret[ret <= 0].mean() 49 | p = ret[ret > 0].shape[0] / float(ret.shape[0]) 50 | th_p = calc_prob_sr(r_pos, r_neg, freq, tgt_sr) 51 | risk = ss.norm.cdf(th_p, p, p * (1 - p)) 52 | return risk -------------------------------------------------------------------------------- /finance_ml/multiprocessing/partition.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def linear_parts(num_atoms, num_threads): 5 | """Linear partitions 6 | 7 | Args: 8 | num_atoms (int): The number of data points 9 | 10 | num_threads (int): The number of partitions to split 11 | 12 | Returns: 13 | array-like: indices of start and end 14 | """ 15 | parts = np.linspace(0, num_atoms, min(num_threads, num_atoms) + 1) 16 | parts = np.ceil(parts).astype(int) 17 | return parts 18 | 19 | 20 | def nested_parts(num_atoms, num_threads, descend=False): 21 | """Nested partitions 22 | 23 | Args: 24 | num_atoms (int): The number of data points 25 | 26 | num_threads (int): The number of partitions to split 27 | 28 | descend (bool, optional): If True, the size of partitions are decreasing. 29 | Defaults to False. 
30 | 
31 |     Returns:
32 |         array-like: indices of start and end
33 |     """
34 |     parts = [0]
35 |     num_threads = min(num_threads, num_atoms)
36 |     for num in range(num_threads):
37 |         part = 1 + 4 * (parts[-1] ** 2 + parts[-1] + num_atoms * (num_atoms + 1.) / num_threads)
38 |         part = 0.5 * (-1 + np.sqrt(part))
39 |         parts.append(part)
40 |     if descend:
41 |         # Computational load decreases as index increases
42 |         parts = np.cumsum(np.diff(parts)[::-1])
43 |         parts = np.append(np.array([0]), parts)
44 |     parts = np.round(parts).astype(int)
45 |     return parts
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # finance_ml
2 | Python implementations of Machine Learning helper functions for Quantitative Finance, based on the books
3 | [Advances in Financial Machine Learning](https://www.amazon.co.jp/Advances-Financial-Machine-Learning-English-ebook/dp/B079KLDW21) and [Machine Learning for Asset Managers](https://www.amazon.com/Machine-Learning-Managers-Elements-Quantitative/dp/1108792898), written by `Marcos Lopez de Prado`.
4 | 
5 | # Installation
6 | Execute the following command:
7 | ```bash
8 | python setup.py install
9 | ```
10 | 
11 | or
12 | 
13 | simply add `your/path/to/finance_ml` to your PYTHONPATH.
14 | 
15 | # Implementation
16 | The following modules are implemented:
17 | * Labeling
18 | * Multiprocessing
19 | * Sampling
20 | * Feature Selection
21 | * Asset Allocation
22 | * Breakout Detection
23 | 
24 | # Examples
25 | Some example notebooks can be found under the folder `MLAssetManagers`.
26 | 
27 | ## multiprocessing
28 | Parallel computing using the `multiprocessing` library.
29 | Here is an example of applying a function to each element in parallel.
30 | ```python
31 | import pandas as pd
32 | import numpy as np
33 | 
34 | def apply_func(x):
35 |     return x ** 2
36 | 
37 | def func(df, timestamps, f):
38 |     df_ = df.loc[timestamps]
39 |     for idx, x in df_.items():
40 |         df_.loc[idx] = f(x)
41 |     return df_
42 | 
43 | df = pd.Series(np.random.randn(10000))
44 | from finance_ml.multiprocessing import mp_pandas_obj
45 | 
46 | results = mp_pandas_obj(func, pd_obj=('timestamps', df.index),
47 |                         num_threads=24, df=df, f=apply_func)
48 | print(results.head())
49 | ```
50 | Output:
51 | ```
52 | 0    0.449278
53 | 1    1.411846
54 | 2    0.157630
55 | 3    4.949410
56 | 4    0.601459
57 | ```
58 | 
59 | For more detail, please refer to the example notebooks!
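Beyond the `multiprocessing` example in the README, the labeling and stats modules combine in the same way as the snippet in `docs/source/index.rst`. The sketch below follows that example but swaps its `get_daily_vol` helper for `get_vol` from `finance_ml.stats`; the synthetic `close` series and the specific parameter values are illustrative assumptions, not outputs of this repository.

```python
import numpy as np
import pandas as pd

from finance_ml.labeling import get_barrier_labels, cusum_filter
from finance_ml.stats import get_vol

# Synthetic daily close prices as a stand-in for real market data
idx = pd.date_range('2018-01-01', periods=1000, freq='B')
close = pd.Series(100 * np.exp(np.random.randn(1000).cumsum() * 0.01), index=idx)

vol = get_vol(close, span=100, days=1)   # EWM volatility of 1-day returns
timestamps = cusum_filter(close, vol)    # event times picked by the CUSUM filter
labels = get_barrier_labels(close, timestamps, vol, sltp=[1, 1],
                            num_days=1, min_ret=0, num_threads=16)
print(labels.head())
```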
-------------------------------------------------------------------------------- /finance_ml/model_selection/hyper.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 2 | from sklearn.ensemble import BaggingClassifier 3 | 4 | from .kfold import PurgedKFold 5 | from .pipeline import Pipeline 6 | 7 | 8 | def clf_hyper_fit(feat, label, t1, pipe_clf, search_params, scoring=None, 9 | n_splits=3, bagging=[0, None, 1.], 10 | rnd_search_iter=0, n_jobs=-1, pct_embargo=0., **fit_params): 11 | # Set default value for scoring 12 | if scoring is None: 13 | if set(label.values) == {0, 1}: 14 | scoring = 'f1' 15 | else: 16 | scoring = 'neg_log_loss' 17 | # HP search on training data 18 | inner_cv = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo) 19 | if rnd_search_iter == 0: 20 | search = GridSearchCV(estimator=pipe_clf, param_grid=search_params, 21 | scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False) 22 | else: 23 | search = RandomizedSearchCV(estimator=pipe_clf, param_distributions=search_params, 24 | scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False) 25 | best_pipe = search.fit(feat, label, **fit_params).best_estimator_ 26 | # Fit validated model on the entirely of data 27 | if bagging[0] > 0: 28 | bag_est = BaggingClassifier(base_estimator=Pipeline(best_pipe.steps), 29 | n_estimators=int(bagging[0]), max_samples=float(bagging[1]), 30 | max_features=float(bagging[2]), n_jobs=n_jobs) 31 | bag_est = best_pipe.fit(feat, label, 32 | sample_weight=fit_params[bag_est.base_estimator.steps[-1][0] + '__sample_weight']) 33 | best_pipe = Pipeline([('bag', bag_est)]) 34 | return best_pipe -------------------------------------------------------------------------------- /finance_ml/features/orth.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get_evec(dot, var_th): 6 | """Calculate eigen values and vectors 7 | 8 | Params 9 | ------ 10 | dot: pd.DataFrame 11 | Z score product dataframe 12 | var_th: float 13 | Threshold for the explanation of variance 14 | 15 | Returns 16 | ------- 17 | e_val: pd.Series, eigen values 18 | e_vec: pd.DataFrame, eigen vectors 19 | """ 20 | # Compute and sort eigen vectors and values for dot product matrix 21 | e_val, e_vec = np.linalg.eigh(dot) 22 | idx = e_val.argsort()[::-1] 23 | e_val, e_vec = e_val[idx], e_vec[:, idx] 24 | # Labeling features 25 | e_val = pd.Series(e_val, index=['PC_' + str(i + 1) for i in range(e_val.shape[0])]) 26 | e_vec = pd.DataFrame(e_vec, index=dot.index, columns=e_val.index) 27 | e_vec = e_vec.loc[:, e_val.index] 28 | # Reduce dimension from threshold 29 | cum_var = e_val.cumsum() / e_val.sum() 30 | dim = cum_var.searchsorted(var_th)[0] 31 | e_val = e_val.iloc[:dim + 1] 32 | e_vec = e_vec.iloc[:, :dim + 1] 33 | return e_val, e_vec 34 | 35 | 36 | def ortho_feats(dfX, var_th=.95): 37 | """Compute orthgonal features with threshold 38 | 39 | Params 40 | ------ 41 | dfX: pd.DataFrame 42 | Feataures dataframe 43 | var_th: float 44 | Threshold for the explanation of variance 45 | 46 | Returns 47 | ------- 48 | pd.DataFrame: orthogonal feature 49 | """ 50 | Z = (dfX.values - dfX.mean().values) / dfX.std().values 51 | dot = pd.DataFrame(np.dot(Z.T, Z), index=dfX.columns, columns=dfX.columns) 52 | e_val, e_vec = get_evec(dot, var_th) 53 | dfP = pd.DataFrame(np.dot(Z, e_vec), index=dfX.index, 54 | columns=['PC_' + str(i + 1) for i in 
range(e_vec.shape[1])]) 55 | return dfP -------------------------------------------------------------------------------- /finance_ml/labeling/trend.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import multiprocessing as mp 4 | 5 | import statsmodels.api as sm 6 | 7 | from ..multiprocessing import mp_pandas_obj 8 | 9 | 10 | def t_val_linreg(close): 11 | x = np.ones((close.shape[0], 2)) 12 | x[:, 1] = np.arange(close.shape[0]) 13 | ols = sm.OLS(close, x).fit() 14 | return ols.tvalues[1] 15 | 16 | def _get_bins_from_trend(molecule, close, min_step, max_step, step): 17 | out = pd.DataFrame(index=molecule, columns=['t1', 't_val','bin']) 18 | hrzns = list(range(min_step, max_step + 1, step)) 19 | for dt0 in molecule: 20 | iloc0 = close.index.get_loc(dt0) 21 | if iloc0 + max(hrzns) > close.shape[0]: 22 | continue 23 | df0 = pd.Series() 24 | for hrzn in hrzns: 25 | dt1 = close.index[iloc0 + hrzn - 1] 26 | df1 = close.loc[dt0:dt1] 27 | df0.loc[dt1] = t_val_linreg(df1.values) 28 | # Get maximum tstats point 29 | dt1 = df0.replace([-np.inf, np.inf, np.nan], 0).abs().idxmax() 30 | out.loc[dt0, ['t1', 't_val', 'bin']] = df0.index[-1], df0[dt1], np.sign(df0[dt1]) 31 | out['t1'] = pd.to_datetime(out['t1']) 32 | out['bin'] = pd.to_numeric(out['bin'], downcast='signed') 33 | return out.dropna(subset=['bin']) 34 | 35 | 36 | def get_bins_from_trend(close, max_step, min_step=3, step=1, num_threads=None): 37 | if num_threads is None: 38 | num_threads = mp.cpu_count() 39 | output = mp_pandas_obj(func=_get_bins_from_trend, 40 | pd_obj=('molecule', close.index), 41 | num_threads=num_threads, 42 | close=close, 43 | max_step=max_step, 44 | min_step=min_step, 45 | step=step) 46 | return output 47 | 48 | 49 | -------------------------------------------------------------------------------- /finance_ml/multiprocessing/pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from .partition import linear_parts, nested_parts 4 | from .utils import process_jobs 5 | 6 | 7 | def mp_pandas_obj(func, pd_obj, num_threads=1, mp_batches=1, 8 | linear_mols=True, 9 | descend=False, **kwargs): 10 | """Return multiprocessed results 11 | 12 | Args: 13 | func (function object) 14 | 15 | pd_obj (list): 16 | pd_obj[0], The name of parameters to be parallelized 17 | pd_obj[1], List of parameters to be parallelized 18 | 19 | mp_batches (int): The number of batches processed for each thread. 
20 | 21 | linear_mols (bool): 22 | If True, use linear partition 23 | If False, use nested partition 24 | 25 | descend (bool): The parameter for nested partitions 26 | 27 | kwargs: optional parameters of `func` 28 | 29 | Returns: 30 | The same type as the output of func 31 | """ 32 | if linear_mols: 33 | parts = linear_parts(len(pd_obj[1]), num_threads * mp_batches) 34 | else: 35 | parts = nested_parts(len(pd_obj[1]), num_threads * mp_batches, descend) 36 | jobs = [] 37 | for i in range(1, len(parts)): 38 | job = {pd_obj[0]: pd_obj[1][parts[i - 1]: parts[i]], 'func': func} 39 | job.update(kwargs) 40 | jobs.append(job) 41 | outputs = [x[0] for x in process_jobs(jobs, num_threads=num_threads)] 42 | # You can use either of pd.Series or pd.DatFrame 43 | if isinstance(outputs[0], pd.Series): 44 | df = pd.Series() 45 | elif isinstance(outputs[0], pd.DataFrame): 46 | df = pd.DataFrame() 47 | else: 48 | return outputs 49 | # The case of multiple threads 50 | for output in outputs: 51 | df = df.append(output) 52 | df = df.sort_index() 53 | return df -------------------------------------------------------------------------------- /finance_ml/sampling/time_weight.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from ..multiprocessing import mp_pandas_obj 5 | 6 | 7 | def mp_sample_weight(series, t1, num_co_events, molecule): 8 | weight = pd.Series(index=molecule) 9 | for t_in, t_out in t1.loc[weight.index].iteritems(): 10 | weight.loc[t_in] = ( 11 | series.loc[t_in:t_out] / num_co_events.loc[t_in:t_out]).sum() 12 | return weight.abs() 13 | 14 | 15 | def get_sample_weight(series, t1, num_co_events, num_threads=1): 16 | """Calculate sampeling weight with considering some attributes 17 | 18 | Params 19 | ------ 20 | series: pd.Series 21 | Used for assigning weight. Larger value, larger weight e.g., log return 22 | t1: pd.Series 23 | num_co_events: pd.Series 24 | num_threads: int 25 | 26 | Return 27 | ------ 28 | pd.Series 29 | """ 30 | weight = mp_pandas_obj( 31 | mp_sample_weight, ('molecule', t1.index), 32 | num_threads, 33 | series=series, 34 | t1=t1, 35 | num_co_events=num_co_events) 36 | return weight * weight.shape[0] / weight.sum() 37 | 38 | 39 | def mp_uniq_weight(t1, num_co_events, molecule): 40 | """Calculate time sample weight utilizing occurence events information""" 41 | wght = pd.Series(index=molecule) 42 | for t_in, t_out in t1.loc[wght.index].iteritems(): 43 | wght.loc[t_in] = (1. / num_co_events.loc[t_in:t_out]).mean() 44 | return wght 45 | 46 | 47 | def get_uniq_weight(t1, num_co_events, num_threads=1): 48 | """Calculate time sample weight utilizing occurence events information 49 | 50 | Params 51 | ------ 52 | t1: pd.Series 53 | num_co_events: pd.Series 54 | The number of co-occurence events 55 | num_threads: int 56 | 57 | Returns 58 | pd.Series 59 | """ 60 | return mp_pandas_obj( 61 | mp_uniq_weight, ('molecule', t1.index), 62 | num_threads, 63 | t1=t1, 64 | num_co_events=num_co_events) 65 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../..')) 16 | sys.setrecursionlimit(1500) 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'finance_ml' 21 | copyright = '2019, jjakimoto' 22 | author = 'jjakimoto' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = ['sphinx.ext.todo', 'sphinx.ext.viewcode', 'sphinx.ext.autodoc'] 31 | 32 | # Add any paths that contain templates here, relative to this directory. 33 | templates_path = ['_templates'] 34 | 35 | # List of patterns, relative to source directory, that match files and 36 | # directories to ignore when looking for source files. 37 | # This pattern also affects html_static_path and html_extra_path. 38 | exclude_patterns = [] 39 | 40 | 41 | # -- Options for HTML output ------------------------------------------------- 42 | 43 | # The theme to use for HTML and HTML Help pages. See the documentation for 44 | # a list of builtin themes. 45 | # 46 | html_theme = 'alabaster' 47 | 48 | # Add any paths that contain custom static files (such as style sheets) here, 49 | # relative to this directory. They are copied after the builtin static files, 50 | # so a file named "default.css" will overwrite the builtin "default.css". 
51 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /finance_ml/breakout/get_bsadf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def get_bsadf(series, tau, constant, lags): 5 | y, x = get_yx(series, constant=constant, lags=lags) 6 | if not isinstance(lags, int): 7 | lags = np.max(lags) 8 | start_points = range(0, y.shape[0] - tau + 1) 9 | basdf = None 10 | all_adf = [] 11 | for start in start_points: 12 | y_ = y[start:] 13 | x_ = x[start:] 14 | b_mean, b_var = get_betas(y_, x_) 15 | b_mean = b_mean[0,] 16 | b_std = b_var[0, 0] ** 0.5 17 | all_adf.append(b_mean / b_std) 18 | all_adf = np.array(all_adf) 19 | bsadf = np.max(all_adf[np.isfinite(all_adf)]) 20 | out = {'Time': series.index[-1], 'bsadf': bsadf} 21 | return out 22 | 23 | 24 | def get_yx(series, constant, lags): 25 | diff = series.diff().dropna() 26 | lag_feat = get_lag_features(diff, lags).dropna() 27 | # Add non diff feature 28 | lag_feat[series.name] = series.shift(1) 29 | index = lag_feat.dropna().index & diff.dropna().index 30 | x = lag_feat.loc[index].values 31 | y = diff.loc[index].values 32 | # Set constant value 33 | if constant != 'nc': 34 | const = np.ones((x.shape[0], 1)) 35 | x = np.hstack((x, const)) 36 | if constant[:2] == 'ct': 37 | trend = np.arange(x.shape[0]).reshape(-1, 1) 38 | x = np.hstack((x, trend)) 39 | if constant == 'ctt': 40 | x = np.hstack((x, trend ** 2)) 41 | return y, x 42 | 43 | 44 | def get_lag_features(series, lags): 45 | lag_feat = pd.DataFrame() 46 | if isinstance(lags, int): 47 | lags = range(1, lags + 1) 48 | else: 49 | lags = [int(lag) for lag in lags] 50 | for lag in lags: 51 | lag_feat[f'{series.name}_{lag}'] = series.shift(lag).copy(deep=True) 52 | return lag_feat 53 | 54 | def get_betas(y, x, lam=0): 55 | xy = np.dot(x.T, y) 56 | xx = np.dot(x.T, x) 57 | xxinv = np.linalg.inv(xx + lam) 58 | beta_mean = np.dot(xxinv, xy) 59 | err = y - np.dot(x, beta_mean) 60 | beta_var = np.dot(err.T, err) / (x.shape[0] - x.shape[1]) * xxinv 61 | return beta_mean, beta_var -------------------------------------------------------------------------------- /finance_ml/labeling/betsides.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import pandas as pd 3 | import numpy as np 4 | import multiprocessing as mp 5 | 6 | from ..multiprocessing import mp_pandas_obj 7 | 8 | 9 | def _cusum_side(diff, h, k=0, molecule=None): 10 | side = [] 11 | s_pos, s_neg = 0, 0 12 | timestamps = [] 13 | th = None 14 | for t in molecule: 15 | if th is None: 16 | th = h.loc[t] 17 | s_pos = max(0, s_pos + diff.loc[t] - k) 18 | s_neg = min(0, s_neg + diff.loc[t] + k) 19 | if s_pos > th: 20 | s_pos = 0 21 | timestamps.append(t) 22 | th = h.loc[t] 23 | side.append(1) 24 | elif s_neg < -th: 25 | s_neg = 0 26 | timestamps.append(t) 27 | th = h.loc[t] 28 | side.append(-1) 29 | side = pd.Series(side, index=pd.DatetimeIndex(timestamps)) 30 | return side 31 | 32 | 33 | def cusum_side(close, h, k=0, use_log=True, num_threads=None): 34 | """Sample points with CUSUM Filter and use its direction as betting side 35 | 36 | Args: 37 | close (pd.Series): Price series 38 | 39 | h (float or pd.Series): Threasholds to sampmle points.\ 40 | If specified with float, translate to pd.Series(h, index=close.index) 41 | 42 | k (float, optional): Minimum speed parameter to hit threashold.\ 43 | Defaults to 0, which means inactive 44 | 45 | Returns: 
46 | pd.Series: Betting sides at sampled points 47 | """ 48 | if num_threads is None: 49 | num_threads = mp.cpu_count() 50 | # asssum that E y_t = y_{t-1} 51 | side = [] 52 | s_pos, s_neg = 0, 0 53 | if use_log: 54 | diff = np.log(close).diff().dropna() 55 | else: 56 | diff = close.diff().dropna() 57 | # time variant threshold 58 | if isinstance(h, numbers.Number): 59 | h = pd.Series(h, index=diff.index) 60 | h = h.reindex(diff.index, method='bfill') 61 | h = h.dropna() 62 | side = mp_pandas_obj(func=_cusum_side, 63 | pd_obj=('molecule', h.index), 64 | num_threads=num_threads, 65 | diff=diff, h=h, k=k) 66 | return side -------------------------------------------------------------------------------- /finance_ml/hierarchical_clustering/allocation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.cluster.hierarchy as sch 4 | 5 | from .metrics import get_corr_dist 6 | from .quasi import get_quasi_diag 7 | 8 | 9 | def get_rec_bipart(cov, sort_idx): 10 | """Compute portfolio weight by recursive bisection 11 | 12 | Params 13 | ------ 14 | cov: pd.DataFrame 15 | sort_idx: pd.Series 16 | Sorted index by quasi diagonalization 17 | 18 | Returns 19 | ------- 20 | pd.Series 21 | """ 22 | weight = pd.Series(1, index=sort_idx) 23 | # Initialize all in one cluster 24 | cl_items = [sort_idx] 25 | while len(cl_items) > 0: 26 | cl_items_ = [] 27 | for cl in cl_items: 28 | # Split into half for each cluter 29 | if len(cl) >= 2: 30 | cl_items_.append(cl[0:len(cl) // 2]) 31 | cl_items_.append(cl[len(cl) // 2:len(cl)]) 32 | # Update cluster 33 | cl_items = cl_items_ 34 | for i in range(0, len(cl_items), 2): 35 | cl0 = cl_items[i] 36 | cl1 = cl_items[i + 1] 37 | var0 = get_cluster_var(cov, cl0) 38 | var1 = get_cluster_var(cov, cl1) 39 | alpha = var1 / (var0 + var1) 40 | weight[cl0] *= alpha 41 | weight[cl1] *= 1 - alpha 42 | return weight 43 | 44 | 45 | def get_ivp(cov): 46 | """Compute inverse variance portfolio 47 | 48 | Params 49 | ------ 50 | cov: pd.DataFrame 51 | 52 | Returns 53 | ------- 54 | np.array 55 | """ 56 | ivp = 1. 
/ np.diag(cov) 57 | ivp /= ivp.sum() 58 | return ivp 59 | 60 | 61 | def get_cluster_var(cov, cl_items): 62 | """Compute variance per cluster 63 | 64 | Params 65 | ------ 66 | cov: pd.DataFrame 67 | cl_items: pd.Series 68 | 69 | Returns 70 | ------- 71 | float 72 | """ 73 | cov_cl = cov.loc[cl_items, cl_items] 74 | w = get_ivp(cov_cl).reshape(-1, 1) 75 | cl_var = np.dot(np.dot(w.T, cov_cl), w)[0, 0] 76 | return cl_var 77 | 78 | 79 | def get_hrp(cov, corr): 80 | """Construct a hierarchical portfolio 81 | 82 | Params 83 | ------ 84 | cov: pd.DataFrame 85 | corr: pd.DataFrame 86 | 87 | Returns 88 | ------- 89 | pd.Series 90 | """ 91 | dist = get_corr_dist(corr) 92 | link = sch.linkage(dist, 'single') 93 | sort_idx = get_quasi_diag(link) 94 | # Recover label 95 | sort_idx = corr.index[sort_idx].tolist() 96 | hrp = get_rec_bipart(cov, sort_idx) 97 | return hrp.sort_index() -------------------------------------------------------------------------------- /finance_ml/distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as ss 4 | from sklearn.metrics import mutual_info_score 5 | 6 | def _fix_corr(corr): 7 | corr[corr > 1] = 1 8 | corr[corr < -1] = -1 9 | return corr.fillna(0) 10 | 11 | def corr_metric(corr, use_abs=False): 12 | corr = _fix_corr(corr) 13 | if use_abs: 14 | return np.sqrt(1 - np.abs(corr)) 15 | else: 16 | return np.sqrt(0.5 * (1 - corr)) 17 | 18 | def corr_metric_xy(x, y, use_abs=False): 19 | corr = np.corrcoef(x, y)[0, 1] 20 | return corr_metric(corr, use_abs) 21 | 22 | def _get_zeta(N): 23 | return (8 + 324 * N + 12 * (36 * N + 729 * N ** 2) ** 0.5) ** (1./3) 24 | 25 | def _num_bins(n_obs, corr=None): 26 | if corr is None or 1. - corr ** 2 < 1e-8: 27 | zeta = _get_zeta(n_obs) 28 | b = round(zeta / 6. + 2. / (3 * zeta) + 1. / 3) 29 | else: 30 | b = round(2 ** -0.5 * (1 + (1 + 24 * n_obs / (1. 
- corr ** 2)) ** 0.5) ** 0.5) 31 | return max(int(b), 2) 32 | 33 | 34 | def entropy(x, bx=None, is_cont=False): 35 | if bx is None: 36 | bx = _num_bins(x.shape[0]) 37 | hx = ss.entropy(np.histogram(x, bx)[0]) 38 | if is_cont: 39 | delta = (x.max() - x.min()) / bx 40 | hx += np.log(delta) 41 | return hx 42 | 43 | def joint_entropy(x, y, bxy=None, is_cont=False): 44 | if bxy is None: 45 | bxy = _num_bins(x.shape[0], corr=np.corrcoef(x, y)[0, 1]) 46 | cxy = np.histogram2d(x, y, bxy)[0] 47 | hx = ss.entropy(np.histogram(x, bxy)[0]) 48 | hy = ss.entropy(np.histogram(y, bxy)[0]) 49 | ixy = mutual_info_score(None, None, contingency=cxy) 50 | hxy = hx + hy - ixy 51 | if is_cont: 52 | deltax = (x.max() - x.min()) / bxy 53 | deltay = (y.max() - y.min()) / bxy 54 | hxy += np.log(deltax) + np.log(deltay) 55 | return hxy 56 | 57 | def cond_entropy(x, y, bxy=None, is_cont=False): 58 | if bxy is None: 59 | bxy = _num_bins(x.shape[0], corr=np.corrcoef(x, y)[0, 1]) 60 | cxy = np.histogram2d(x, y, bxy)[0] 61 | hx = ss.entropy(np.histogram(x, bxy)[0]) 62 | hy = ss.entropy(np.histogram(y, bxy)[0]) 63 | ixy = mutual_info_score(None, None, contingency=cxy) 64 | hxy = hx + hy - ixy 65 | if is_cont: 66 | deltax = (x.max() - x.min()) / bxy 67 | deltay = (y.max() - y.min()) / bxy 68 | hxy += np.log(deltax) + np.log(deltay) 69 | hy += np.log(deltay) 70 | return hxy - hy 71 | 72 | def variation_info(x, y, normalize=False): 73 | bxy = _num_bins(x.shape[0], corr=np.corrcoef(x, y)[0, 1]) 74 | cxy = np.histogram2d(x, y, bxy)[0] 75 | hx = ss.entropy(np.histogram(x, bxy)[0]) 76 | hy = ss.entropy(np.histogram(y, bxy)[0]) 77 | ixy = mutual_info_score(None, None, contingency=cxy) 78 | varxy = hx + hy - 2 * ixy 79 | if normalize: 80 | hxy = hx + hy - ixy 81 | varxy /= hxy 82 | return varxy 83 | 84 | def mutual_info(x, y, normalize=False): 85 | bxy = _num_bins(x.shape[0], corr=np.corrcoef(x, y)[0, 1]) 86 | cxy = np.histogram2d(x, y, bxy)[0] 87 | ixy = mutual_info_score(None, None, contingency=cxy) 88 | if normalize: 89 | hx = ss.entropy(np.histogram(x, bxy)[0]) 90 | hy = ss.entropy(np.histogram(y, bxy)[0]) 91 | ixy /= min(hx, hy) 92 | return ixy -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. finance_ml documentation master file, created by 2 | sphinx-quickstart on Sat Dec 28 14:57:57 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to ``finance_ml``'s documentation! 7 | =============================================== 8 | Python implementations of Machine Learning helper functions for Quantiative Finance based on a book, 9 | `Advances in Financial Machine Learning`_, written by ``Marcos Lopez de Prado``. 10 | 11 | .. 
_Advances in Financial Machine Learning: https://www.amazon.co.jp/Advances-Financial-Machine-Learning-English-ebook/dp/B079KLDW21 12 | 13 | 14 | Installation 15 | -------------- 16 | Excute the following command :: 17 | 18 | python setup.py install 19 | 20 | Examples 21 | -------------- 22 | labeling 23 | ~~~~~~~~~ 24 | Triple Barriers Labeling and CUSUM sampling:: 25 | 26 | from finance_ml.labeling import get_barrier_labels, cusum_filter 27 | from finance_ml.stats import get_daily_vol 28 | 29 | vol = get_daily_vol(close) 30 | trgt = vol 31 | timestamps = cusum_filter(close, vol) 32 | labels = get_barrier_labels(close, timestamps, trgt, sltp=[1, 1], 33 | num_days=1, min_ret=0, num_threads=16) 34 | print(labels.show()) 35 | 36 | Return the following pandas.Series:: 37 | 38 | 2000-01-05 -1.0 39 | 2000-01-06 1.0 40 | 2000-01-10 -1.0 41 | 2000-01-11 1.0 42 | 2000-01-12 1.0 43 | 44 | multiprocessing 45 | ~~~~~~~~~~~~~~~~ 46 | Parallel computing using ``multiprocessing`` library. 47 | Here is the example of applying function to each element with parallelization.:: 48 | 49 | import pandas as pd 50 | import numpy as np 51 | 52 | def apply_func(x): 53 | return x ** 2 54 | 55 | def func(df, timestamps, f): 56 | df_ = df.loc[timestamps] 57 | for idx, x in df_.items(): 58 | df_.loc[idx] = f(x) 59 | return df_ 60 | 61 | df = pd.Series(np.random.randn(10000)) 62 | from finance_ml.multiprocessing import mp_pandas_obj 63 | 64 | results = mp_pandas_obj(func, pd_obj=('timestamps', df.index), 65 | num_threads=24, df=df, f=apply_func) 66 | print(results.head()) 67 | 68 | Output:: 69 | 70 | 0 0.449278 71 | 1 1.411846 72 | 2 0.157630 73 | 3 4.949410 74 | 4 0.601459 75 | 76 | 77 | Documentation for the Code 78 | ============================ 79 | .. toctree:: 80 | :maxdepth: 2 81 | :caption: Contents: 82 | 83 | Labeling 84 | --------- 85 | .. automodule:: finance_ml.labeling.barriers 86 | :members: 87 | 88 | .. automodule:: finance_ml.labeling.sampling 89 | :members: 90 | 91 | .. automodule:: finance_ml.labeling.sides 92 | :members: 93 | 94 | .. automodule:: finance_ml.labeling.sizes 95 | :members: 96 | 97 | .. automodule:: finance_ml.labeling.utils 98 | :members: 99 | 100 | Multiprocessing 101 | ------------------ 102 | .. automodule:: finance_ml.multiprocessing.pandas 103 | :members: 104 | 105 | .. automodule:: finance_ml.multiprocessing.partition 106 | :members: 107 | 108 | .. automodule:: finance_ml.multiprocessing.utils 109 | :members: 110 | 111 | 112 | Indices and tables 113 | ================== 114 | 115 | * :ref:`genindex` 116 | * :ref:`modindex` 117 | * :ref:`search` 118 | -------------------------------------------------------------------------------- /finance_ml/model_selection/score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .kfold import PurgedKFold, CPKFold 4 | from .utils import evaluate 5 | 6 | 7 | def cv_score(clf, 8 | X, 9 | y, 10 | sample_weight=None, 11 | scoring='neg_log_loss', 12 | n_splits=3, 13 | t1=None, 14 | cv_gen=None, 15 | pct_embargo=0., 16 | purging=True, 17 | return_combs=False, 18 | ret=None, 19 | num_threads=1, 20 | **kwargs): 21 | """Cross Validation with default purging and embargo 22 | 23 | Params 24 | ------ 25 | X: pd.DataFrame 26 | y: pd.Series, optional 27 | sample_weight: pd.Series, optional 28 | If specified, apply this to bot testing and training 29 | scoring: str, default 'neg_log_loss' 30 | The name of scoring methods. 
'precision', 'recall', 'f1', 'precision_recall', 31 | 'roc', 'accuracy' or 'neg_log_loss' 32 | n_splits: int 33 | The number of splits for cross validation 34 | t1: pd.Series 35 | Index and value correspond to the begining and end of information 36 | cv_gen: KFold instance 37 | If not specified, use PurgedKfold. If cv_gen == 'cp', use CPKFold 38 | pct_embargo: float, default 0 39 | The percentage of applying embargo 40 | purging: bool, default True 41 | If true, apply purging method 42 | return_combs: bool, default False 43 | If True and use CPKFold, return combinatorics location 44 | num_threads: int, default 1 45 | The number of threads for purging 46 | kwargs: Parameters for scoring function 47 | 48 | Returns 49 | ------- 50 | array: scores of cross validation 51 | """ 52 | if cv_gen is None: 53 | if t1 is not None: 54 | cv_gen = PurgedKFold( 55 | n_splits=n_splits, 56 | t1=t1, 57 | pct_embargo=pct_embargo, 58 | purging=purging, 59 | num_threads=num_threads) 60 | else: 61 | cv_gen = KFold(n_splits=n_splits) 62 | elif cv_gen == 'cp': 63 | cv_gen = CPKFold( 64 | n_splits=n_splits, 65 | t1=t1, 66 | pct_embargo=pct_embargo, 67 | purging=purging, 68 | num_threads=num_threads) 69 | scores = [] 70 | for train, test in cv_gen.split(X=X): 71 | train_params = dict() 72 | test_params = dict() 73 | # Sample weight is an optional parameter 74 | if sample_weight is not None: 75 | train_params['sample_weight'] = sample_weight.iloc[train].values 76 | test_params['sample_weight'] = sample_weight.iloc[test].values 77 | test_params.update(kwargs) 78 | clf_fit = clf.fit( 79 | X=X.iloc[train, :].values, y=y.iloc[train].values, **train_params) 80 | if hasattr(clf_fit, 'classes_'): 81 | test_params['labels'] = clf_fit.classes_ 82 | if ret is not None: 83 | test_params['ret'] = ret.iloc[test] 84 | # Scoring 85 | score_ = evaluate(clf_fit, X.iloc[test, :].values, y.iloc[test].values, 86 | scoring, **test_params) 87 | scores.append(score_) 88 | if scoring not in ['roc', 'precision_recall']: 89 | scores = np.array(scores) 90 | if return_combs: 91 | return scores, cv_gen.get_test_combs() 92 | else: 93 | return scores -------------------------------------------------------------------------------- /finance_ml/features/fraction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from statsmodels.tsa.stattools import adfuller 4 | from tqdm import tqdm, tqdm_notebook 5 | 6 | 7 | def get_weights_FFD(d, thres, max_size=10000): 8 | """Get coefficient for calculating fractional derivative 9 | 10 | Args: 11 | d (int) 12 | 13 | thres (float) 14 | 15 | max_size (int, optional) Defauts to 1e4.\ 16 | Set the maximum size for stability 17 | 18 | Returns: 19 | array-like 20 | """ 21 | w = [1.] 
22 | for k in range(1, max_size): 23 | w_ = -w[-1] / k * (d - k + 1) 24 | if abs(w_) <= thres: 25 | break 26 | w.append(w_) 27 | w = np.array(w) 28 | return w 29 | 30 | 31 | def frac_diff_FFD(series, d, lag=1, thres=1e-5, max_size=10000): 32 | """Compute Fractional Differentiation 33 | 34 | Args: 35 | series (pd.Series) 36 | 37 | d (float): the degree of differentiation 38 | 39 | lag (int, optional): Defaults to 1.\ 40 | The lag scale used when differencing, as in series.diff(lag) 41 | 42 | thres (float, optional): Defaults to 1e-5.\ 43 | Threshold to determine the fixed-length window 44 | max_size (int, optional): Defaults to 10000. Upper bound on the window length 45 | Returns: 46 | pd.Series 47 | """ 48 | max_size = int(max_size / lag) 49 | w = get_weights_FFD(d, thres, max_size) 50 | width = len(w) 51 | series_ = series.ffill().dropna() 52 | rolling_array = [] 53 | for i in range(width): 54 | rolling_array.append(series_.shift(i * lag).values) 55 | rolling_array = np.array(rolling_array) 56 | series_val = np.dot(rolling_array.T, w) 57 | series = pd.Series(index=series.index, dtype='float64') 58 | timestamps = series.index[-len(series_val):] 59 | series.loc[timestamps] = series_val 60 | return series 61 | 62 | 63 | def get_opt_d(series, ds=None, lag=1, thres=1e-5, max_size=10000, 64 | p_thres=1e-2, autolag=None, verbose=1, **kwargs): 65 | """Find the minimum differencing degree that makes the series stationary 66 | 67 | Args: 68 | series (pd.Series) 69 | 70 | ds (array-like, optional): Defaults to np.linspace(0, 1, 100)\ 71 | Search space of degree. 72 | 73 | lag (int, optional): Defaults to 1.\ 74 | The lag scale used when differencing, as in series.diff(lag) 75 | 76 | thres (float, optional): Defaults to 1e-5.\ 77 | Threshold to determine the fixed-length window 78 | 79 | p_thres (float, optional): Defaults to 1e-2.\ 80 | p-value threshold for the ADF stationarity test 81 | autolag (str, optional): passed to statsmodels' adfuller 82 | 83 | verbose (int, optional): Defaults to 1.\ 84 | If 1 or 2, show the progress bar; use
2 for notebook 85 | 86 | kwargs (optional): paramters for ADF 87 | 88 | Returns: 89 | int: optimal degree 90 | """ 91 | if ds is None: 92 | ds = np.linspace(0, 1, 100) 93 | # Sort to ascending order 94 | ds = np.array(ds) 95 | sort_idx = np.argsort(ds) 96 | ds = ds[sort_idx] 97 | if verbose == 2: 98 | iter_ds = tqdm_notebook(ds) 99 | elif verbose == 1: 100 | iter_ds = tqdm(ds) 101 | else: 102 | iter_ds = ds 103 | opt_d = ds[-1] 104 | # Compute pval for each d 105 | for d in iter_ds: 106 | diff = frac_diff_FFD(series, d=d, thres=thres, max_size=max_size) 107 | pval = adfuller(diff.dropna().values, autolag=autolag, **kwargs)[1] 108 | if pval < p_thres: 109 | opt_d = d 110 | break 111 | return opt_d -------------------------------------------------------------------------------- /finance_ml/denoising.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.neighbors.kde import KernelDensity 4 | from scipy.optimize import minimize 5 | 6 | 7 | def cov2corr(cov): 8 | std = np.sqrt(np.diag(cov)) 9 | corr = cov / np.outer(std, std) 10 | corr[corr < -1] = -1 11 | corr[corr > 1] = 1 12 | return corr 13 | 14 | def corr2cov(corr, std): 15 | return corr * np.outer(std, std) 16 | 17 | def mp_pdf(var, q, pts): 18 | # Marcenko-Pastur Distribution 19 | # q = T/N 20 | e_min = var * (1 - (1./q) ** 0.5) ** 2 21 | e_max = var * (1 + (1./q) ** 0.5) ** 2 22 | e_val = np.linspace(e_min, e_max, pts) 23 | pdf = q * ((e_max - e_val) * (e_val - e_min)) ** 0.5 / (2 * np.pi * var * e_val) 24 | return pd.Series(pdf, index=e_val) 25 | 26 | def getPCA(matrix): 27 | e_val, e_vec = np.linalg.eigh(matrix) 28 | indices = e_val.argsort()[::-1] 29 | e_val = e_val[indices] 30 | e_vec = e_vec[:, indices] 31 | e_val = np.diagflat(e_val) 32 | return e_val, e_vec 33 | 34 | def fitKDE(obs, bwidth=0.25, kernel='gaussian', x=None): 35 | if len(obs.shape) == 1: 36 | obs = obs.reshape(-1, 1) 37 | kde = KernelDensity(kernel=kernel, bandwidth=bwidth).fit(obs) 38 | if x is None: 39 | x = np.unique(obs).reshape(-1 , 1) 40 | if len(x.shape) == 1: 41 | x = x.reshape(-1, 1) 42 | log_prob = kde.score_samples(x) 43 | pdf = pd.Series(np.exp(log_prob), index=x.flatten()) 44 | return pdf 45 | 46 | def err_pdf(var, e_val, q, bwidth, pts=1000): 47 | pdf0 = mp_pdf(var[0], q, pts) 48 | pdf1 = fitKDE(e_val, bwidth, x=pdf0.index.values) 49 | sse = np.sum((pdf1 - pdf0) ** 2) 50 | return sse 51 | 52 | def find_max_eigen_val(e_val, q, bwidth, min_var=1e-5, max_var=1-1e-5): 53 | out = minimize(lambda *x: err_pdf(*x), .5, args=(e_val, q, bwidth), bounds=((min_var, max_var),)) 54 | if out["success"]: 55 | var = out['x'][0] 56 | else: 57 | var = 1 58 | e_max = var * (1 + (1./q) ** 0.5) ** 2 59 | return e_max, var 60 | 61 | 62 | def denoise_corr(e_val, e_vec, n_facts, shrinkage=False, alpha=0): 63 | if shrinkage: 64 | e_val_l, e_vec_l = e_val[:n_facts, :n_facts], e_vec[:, :n_facts] 65 | e_val_r, e_vec_r = e_val[n_facts:, n_facts:], e_vec[:, n_facts:] 66 | corr_l = np.dot(e_vec_l, e_val_l).dot(e_vec_l.T) 67 | corr_r = np.dot(e_vec_r, e_val_r).dot(e_vec_r.T) 68 | corr1 = corr_l + alpha * corr_r + (1 - alpha) * np.diag(np.diag(corr_r)) 69 | else: 70 | e_val_ = np.diag(e_val).copy() 71 | e_val_[n_facts:] = e_val_[n_facts:].sum() / float(e_val_.shape[0] - n_facts) 72 | e_val_ = np.diag(e_val_) 73 | corr1 = np.dot(e_vec, e_val_).dot(e_vec.T) 74 | # Renormalize to keep trace 1 75 | corr1 = cov2corr(corr1) 76 | return corr1 77 | 78 | 79 | def detone_corr(e_val, e_vec, n_facts, 
shrinkage=False, alpha=0): 80 | if shrinkage: 81 | e_val_r, e_vec_r = e_val[n_facts:, n_facts:], e_vec[:, n_facts:] 82 | corr_r = np.dot(e_vec_r, e_val_r).dot(e_vec_r.T) 83 | corr1 = alpha * corr_r + (1 - alpha) * np.diag(np.diag(corr_r)) 84 | # Renormalize to keep trace 1 85 | corr1 = cov2corr(corr1) 86 | else: 87 | e_val_ = np.diag(e_val).copy() 88 | e_val_[:n_facts] = 0 89 | e_val_ = np.diag(e_val_) 90 | corr1 = np.dot(e_vec, e_val_).dot(e_vec.T) 91 | # Renormalize to keep trace 1 92 | corr1 = cov2corr(corr1) 93 | return corr1 94 | 95 | 96 | def denoise_cov(cov, q, bwidth): 97 | corr0 = cov2corr(cov) 98 | e_val0, e_vec0 = getPCA(corr0) 99 | e_max0, var0 = find_max_eigen_val(np.diag(e_val0), q, bwidth) 100 | nfacts0 = e_val0.shape[0] - np.diag(e_val0)[::-1].searchsorted(e_max0) 101 | corr1 = denoise_corr(e_val0, e_vec0, nfacts0) 102 | cov1 = corr2cov(corr1, np.diag(cov) ** .5) 103 | return cov1 104 | 105 | def opt_portfolio(cov, mu=None): 106 | inv = np.linalg.inv(cov) 107 | ones = np.ones(shape=(inv.shape[0], 1)) 108 | if mu is None: 109 | mu = ones 110 | w = np.dot(inv, mu) 111 | w /= np.dot(ones.T, w) 112 | return w 113 | -------------------------------------------------------------------------------- /finance_ml/experiments.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.linalg import block_diag 4 | from sklearn.covariance import LedoitWolf 5 | from sklearn.utils import check_random_state 6 | from sklearn.datasets import make_classification 7 | 8 | from .denoising import corr2cov, cov2corr 9 | 10 | # Matrix 11 | ################################################ 12 | def form_block_matrix(n_blocks, bsize, bcorr): 13 | block = np.ones((bsize, bsize)) * bcorr 14 | for i in range(bsize): 15 | block[i, i] = 1 16 | corr = block_diag(*[block] * n_blocks) 17 | return corr 18 | 19 | def form_true_matrix(n_blocks, bsize, bcorr, is_shuffle=True): 20 | corr0 = form_block_matrix(n_blocks, bsize, bcorr) 21 | corr0 = pd.DataFrame(corr0) 22 | cols = corr0.columns.tolist() 23 | if is_shuffle: 24 | np.random.shuffle(cols) 25 | corr0 = corr0[cols].loc[cols].copy(deep=True) 26 | std0 = np.random.uniform(0.05, 0.2, corr0.shape[0]) 27 | cov0 = corr2cov(corr0, std0) 28 | mu0 = np.random.normal(std0, std0, cov0.shape[0]).reshape(-1, 1) 29 | return mu0, cov0 30 | 31 | def simulate_mu_cov(mu, cov, n_obs, shrink=False): 32 | x = np.random.multivariate_normal(mu.flatten(), cov, size=n_obs) 33 | mu1 = x.mean(axis=0).reshape(-1, 1) 34 | if shrink: 35 | cov1 = LedoitWolf().fit(x).covariance_ 36 | else: 37 | cov1 = np.cov(x, rowvar=0) 38 | return mu1, cov1 39 | 40 | def get_random_cov(n_cols, n_facts): 41 | w = np.random.normal(size=(n_cols, n_facts)) 42 | cov = np.dot(w, w.T) 43 | cov += np.diag(np.random.uniform(size=n_cols)) 44 | return cov 45 | 46 | def get_cov_sub(n_obs, n_cols, sigma, random_state=None): 47 | rng = check_random_state(random_state) 48 | if n_cols == 1: 49 | return np.ones((1, 1)) 50 | ar0 = rng.normal(size=(n_obs, 1)) 51 | ar0 = np.repeat(ar0, n_cols, axis=1) 52 | ar0 += rng.normal(scale=sigma, size=ar0.shape) 53 | ar0 = np.cov(ar0, rowvar=False) 54 | return ar0 55 | 56 | def get_random_block_cov(n_cols, n_blocks, min_block_size=2, sigma=1., random_state=None): 57 | rng = check_random_state(random_state) 58 | # Generate Size of each block 59 | parts = rng.choice(range(1, n_cols - (min_block_size - 1) * n_blocks), n_blocks-1, replace=False) 60 | parts.sort() 61 | parts = np.append(parts, n_cols - 
(min_block_size - 1) * n_blocks) 62 | parts = np.append(parts[0], np.diff(parts)) - 1 + min_block_size 63 | # Combine blocks as diagonal matrix 64 | cov = None 65 | for n_cols_ in parts: 66 | cov_ = get_cov_sub(int(max(n_cols_ * (n_cols_ + 1) / 2., 100)), n_cols_, sigma, random_state=rng) 67 | if cov is None: 68 | cov = cov_.copy() 69 | else: 70 | cov = block_diag(cov, cov_) 71 | return cov 72 | 73 | def get_random_block_corr(n_cols, n_blocks, random_state=None, min_block_size=2, sigma=1., is_shuffle=False): 74 | rng = check_random_state(random_state) 75 | cov0 = get_random_block_cov(n_cols, n_blocks, min_block_size=min_block_size, sigma=sigma * 0.5, random_state=rng) 76 | # Add noise 77 | cov1 = get_random_block_cov(n_cols, 1, min_block_size=min_block_size, sigma=sigma, random_state=rng) 78 | cov0 += cov1 79 | # Generate Correlation 80 | corr0 = cov2corr(cov0) 81 | corr0 = pd.DataFrame(corr0) 82 | if is_shuffle: 83 | orig_cols = corr0.columns.tolist() 84 | cols = corr0.columns.tolist() 85 | np.random.shuffle(cols) 86 | corr0 = pd.DataFrame(corr0[cols].loc[cols].values, index=orig_cols, columns=orig_cols) 87 | return corr0 88 | 89 | def get_classification_data(n_features=100, n_informative=25, n_reduntant=25, n_samples=10000, random_state=0, sigma=.0): 90 | np.random.seed(random_state) 91 | X, y = make_classification(n_samples=n_samples, n_features=n_features - n_reduntant, 92 | n_informative=n_informative, n_redundant=0, shuffle=False) 93 | cols = [f"I_{i}" for i in range(n_informative)] 94 | cols += [f"N_{i}" for i in range(n_features - n_reduntant - n_informative)] 95 | X = pd.DataFrame(X, columns=cols) 96 | y = pd.Series(y) 97 | rdt_choices = np.random.choice(range(n_informative), size=n_reduntant) 98 | for i, choice in enumerate(rdt_choices): 99 | X[f"R_{i}"] = X[f"I_{choice}"] + np.random.normal(size=X.shape[0]) * sigma 100 | return X, y 101 | -------------------------------------------------------------------------------- /finance_ml/clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.cluster import KMeans 4 | from sklearn.metrics import silhouette_samples, silhouette_score 5 | 6 | from .distance import corr_metric 7 | 8 | 9 | _eps = 1e-16 10 | 11 | def cluster_kmeans_base(corr0, max_num_clusters=10, min_num_clusters=4, n_init=10, debug=False): 12 | dist = corr_metric(corr0, False) 13 | silh = None 14 | kmeans = None 15 | q_val = None 16 | max_num_clusters = min(max_num_clusters, int(np.floor(dist.shape[0]/2))) 17 | min_num_clusters = max(2, min_num_clusters) 18 | for init in range(n_init): 19 | for n_clusters in range(min_num_clusters, max_num_clusters + 1): 20 | kmeans_ = KMeans(n_clusters=n_clusters, n_jobs=1, n_init=1) 21 | kmeans_ = kmeans_.fit(dist.values) 22 | silh_ = silhouette_samples(dist.values, kmeans_.labels_) 23 | q_val_ = silh_.mean() / max(silh_.std(), _eps) 24 | if q_val is None or q_val_ > q_val: 25 | silh = silh_ 26 | kmeans = kmeans_ 27 | q_val = q_val_ 28 | if debug: 29 | print(kmeans) 30 | print(q_val, silh) 31 | silhouette_avg = silhouette_score(dist.values, kmeans_.labels_) 32 | print(f"For n_clusters={n_clusters}, slih_std: {silh_.std()} The average silhouette_score is : {silhouette_avg}") 33 | print("********") 34 | new_idx = np.argsort(kmeans.labels_) 35 | corr1 = corr0.iloc[new_idx] 36 | corr1 = corr1.iloc[:, new_idx] 37 | clstrs = {i:corr0.columns[np.where(kmeans.labels_ == i)[0]].tolist() for i in np.unique(kmeans.labels_)} 38 | silh = 
pd.Series(silh, index=dist.index) 39 | return corr1, clstrs, silh 40 | 41 | def make_new_outputs(corr0, clstrs1, clstrs2): 42 | clstrs_new = dict() 43 | for i in clstrs1.keys(): 44 | clstrs_new[len(clstrs_new.keys())] = list(clstrs1[i]) 45 | for i in clstrs2.keys(): 46 | clstrs_new[len(clstrs_new.keys())] = list(clstrs2[i]) 47 | new_idx = [j for i in clstrs_new.keys() for j in clstrs_new[i]] 48 | corr_new = corr0.loc[new_idx, new_idx] 49 | dist = corr_metric(corr0, False) 50 | kmeans_labels = np.zeros(len(dist.columns)) 51 | for i in clstrs_new.keys(): 52 | idxs = [dist.index.get_loc(k) for k in clstrs_new[i]] 53 | kmeans_labels[idxs] = i 54 | silh_new = pd.Series(silhouette_samples(dist.values, kmeans_labels), index=dist.index) 55 | return corr_new, clstrs_new, silh_new 56 | 57 | def cluster_kmeans_top(corr0, max_num_clusters=None, min_num_clusters=4, n_init=10, debug=False): 58 | if max_num_clusters is None: 59 | max_num_clusters = corr0.shape[1] - 1 60 | max_num_clusters = min(max_num_clusters, corr0.shape[1] - 1) 61 | corr1, clstrs, silh = cluster_kmeans_base(corr0, 62 | max_num_clusters=max_num_clusters, 63 | min_num_clusters=min_num_clusters, 64 | n_init=n_init, debug=debug) 65 | clstrs_tstats = {i:np.mean(silh[clstrs[i]]) / max(np.std(silh[clstrs[i]]), _eps) for i in clstrs.keys()} 66 | tstats_mean = np.mean(list(clstrs_tstats.values())) 67 | redo_clstrs = [i for i in clstrs_tstats.keys() if clstrs_tstats[i] < tstats_mean] 68 | if len(redo_clstrs) <= 2: 69 | return corr1, clstrs, silh 70 | else: 71 | keys_redo = [j for i in redo_clstrs for j in clstrs[i]] 72 | corr_tmp = corr0.loc[keys_redo, keys_redo] 73 | corr2, clstrs2, silh2 = cluster_kmeans_base(corr_tmp, 74 | max_num_clusters=min(max_num_clusters, corr_tmp.shape[1] - 1), 75 | min_num_clusters=2, 76 | n_init=n_init, 77 | debug=debug) 78 | clstrs1 = {i: clstrs[i] for i in clstrs.keys() if i not in redo_clstrs} 79 | corr_new, clstrs_new, silh_new = make_new_outputs(corr0, clstrs1, clstrs2) 80 | new_clstrs_tstats = {i:np.mean(silh_new[i]) / max(np.std(silh_new[i]), _eps) for i in clstrs_new.keys()} 81 | tstats_mean = np.mean(list(clstrs_tstats.values())) 82 | new_tstats_mean = np.mean(list(new_clstrs_tstats.values())) 83 | if new_tstats_mean <= tstats_mean: 84 | return corr1, clstrs, silh 85 | else: 86 | return corr_new, clstrs_new, silh_new -------------------------------------------------------------------------------- /finance_ml/features/entropy.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | from ..multiprocessing import mp_pandas_obj 8 | 9 | 10 | def plug_in(data, window): 11 | """Plug in Entropy Estimator 12 | 13 | Args: 14 | data (list) 15 | 16 | window (int) 17 | 18 | Returns: 19 | float: Estimated entropy 20 | 21 | dict: Probability mass function 22 | """ 23 | pmf = calc_pmf(data, window) 24 | out = -sum([pmf[key] * np.log2(pmf[key]) for key in pmf.keys()]) 25 | return out, pmf 26 | 27 | 28 | def calc_pmf(data, window): 29 | """Calculate probability mass function 30 | 31 | Args: 32 | data (list) 33 | 34 | window (int) 35 | 36 | Returns: 37 | dict 38 | """ 39 | lib = {} 40 | for i in range(window, len(data)): 41 | x = '_'.join([str(data_i) for data_i in data[i - window:i]]) 42 | if x not in lib: 43 | lib[x] = [i - window] 44 | else: 45 | lib[x] += [ 46 | i - window, 47 | ] 48 | num_samples = float(len(data) - window) 49 | pmf = {key: len(lib[key]) / num_samples for 
key in lib} 50 | return pmf 51 | 52 | 53 | def lempel_zib_lib(data): 54 | """Calculate Lampel Ziv dictionary 55 | 56 | Args: 57 | data (list) 58 | 59 | Returns: 60 | dict 61 | """ 62 | i = 1 63 | lib = [str(data[0])] 64 | while i < len(data): 65 | for j in range(i, len(data)): 66 | x = '_'.join([str(data_i) for data_i in data[i:j + 1]]) 67 | if x not in lib: 68 | lib.append(x) 69 | break 70 | i = j + 1 71 | return lib 72 | 73 | 74 | def match_length(data, i, n): 75 | """Calculate math length 76 | 77 | Args: 78 | data (list) 79 | 80 | i (int): start point 81 | 82 | n (int): window size 83 | 84 | Returns: 85 | int: length of the longest matched substring + 1 86 | 87 | str: the longest mathed substring 88 | """ 89 | sub_str = '' 90 | for l in range(n): 91 | msg1 = '_'.join([str(data_i) for data_i in data[i:i + l + 1]]) 92 | for j in range(max(i - n, 0), i): 93 | msg0 = '_'.join([str(data_i) for data_i in data[j:j + l + 1]]) 94 | if msg1 == msg0: 95 | sub_str = msg1 96 | break 97 | return len(sub_str.split('_')) + 1, sub_str 98 | 99 | 100 | def konto(data, window=None, verbose=0): 101 | """Calculate Kontonyiasnnis' LZ entropy estimate 102 | 103 | Args: 104 | data (list) 105 | 106 | window (int, optional 107 | 108 | verbose (int, optional) Defaults to 0.\ 109 | If 1, show the progress bar 110 | """ 111 | out = {'num': 0, 'sum': 0, 'sub_str': []} 112 | if window is None: 113 | points = range(1, len(data) // 2 + 1) 114 | else: 115 | window = min(window, len(data) // 2) 116 | poitns = range(window, len(data) - window + 1) 117 | if verbose == 1: 118 | points = tqdm(points) 119 | for i in points: 120 | if window is None: 121 | l, msg = match_length(data, i, i) 122 | out['sum'] += np.log2(i + 1) / l 123 | else: 124 | l, msg = match_length(data, i, window) 125 | out['sum'] += np.log(i + 1) / l 126 | out['sub_str'].append(msg) 127 | out['num'] += 1 128 | out['h'] = out['sum'] / out['num'] 129 | out['r'] = 1 - out['h'] / np.log2(len(data)) 130 | return out 131 | 132 | 133 | def mp_get_entropy_rate(series, lag, molecule): 134 | delta = timedelta(seconds=lag) 135 | entropy = pd.Series(index=molecule) 136 | for t in molecule: 137 | series_ = series[t - delta:t] 138 | entropy_t = konto(series_.values, verbose=0) 139 | entropy.loc[t] = entropy_t['h'] 140 | return entropy 141 | 142 | 143 | def get_entropy_rate(series, lag, num_threads=1): 144 | """Calculate entropy rate for time series 145 | 146 | Args: 147 | series (pd.Series) 148 | 149 | lag (int): Time slide length (seconds) 150 | 151 | num_threads (int): Defaults to 1 152 | 153 | Returns: 154 | pd.Series 155 | """ 156 | start = series.index[0] + timedelta(seconds=lag) 157 | return mp_pandas_obj( 158 | mp_get_entropy_rate, ('molecule', series[start:].index), 159 | num_threads, 160 | series=series, 161 | lag=lag) 162 | -------------------------------------------------------------------------------- /finance_ml/multiprocessing/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime 3 | import sys 4 | from copy import deepcopy 5 | import multiprocessing as mp 6 | import multiprocessing.pool 7 | from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor 8 | from concurrent.futures import _base 9 | from concurrent.futures.process import _global_shutdown, BrokenProcessPool, _WorkItem 10 | 11 | 12 | class MyProcessPoolExecutor(ProcessPoolExecutor): 13 | def submit(*args, **kwargs): 14 | if len(args) >= 2: 15 | self, fn, *args = args 16 | elif not args: 17 | raise 
TypeError("descriptor 'submit' of 'ProcessPoolExecutor' object " 18 | "needs an argument") 19 | elif 'fn' in kwargs: 20 | fn = kwargs.pop('fn') 21 | self, *args = args 22 | else: 23 | raise TypeError('submit expected at least 1 positional argument, ' 24 | 'got %d' % (len(args)-1)) 25 | 26 | with self._shutdown_lock: 27 | if self._broken: 28 | print(f"Broken Parameters: {args}, {kwargs}") 29 | raise BrokenProcessPool(self._broken) 30 | if self._shutdown_thread: 31 | raise RuntimeError( 32 | 'cannot schedule new futures after shutdown') 33 | if _global_shutdown: 34 | raise RuntimeError('cannot schedule new futures after ' 35 | 'interpreter shutdown') 36 | 37 | f = _base.Future() 38 | w = _WorkItem(f, fn, args, kwargs) 39 | 40 | self._pending_work_items[self._queue_count] = w 41 | self._work_ids.put(self._queue_count) 42 | self._queue_count += 1 43 | # Wake up queue management thread 44 | self._queue_management_thread_wakeup.wakeup() 45 | 46 | self._start_queue_management_thread() 47 | return f 48 | 49 | 50 | def expand_call(kwargs): 51 | """Execute function from dictionary input""" 52 | func = kwargs['func'] 53 | del kwargs['func'] 54 | optional_argument = None 55 | if "optional_argument" in kwargs: 56 | optional_argument = kwargs["optional_argument"] 57 | del kwargs["optional_argument"] 58 | 59 | transform = None 60 | if 'transform' in kwargs: 61 | transform = kwargs['transform'] 62 | del kwargs['transform'] 63 | 64 | def wrapped_func(**input_kwargs): 65 | if transform is not None: 66 | input_kwargs = transform(input_kwargs) 67 | try: 68 | return func(**input_kwargs) 69 | except Exception as e: 70 | print(e) 71 | print(f"paramteres: {input_kwargs}") 72 | return e 73 | out = wrapped_func(**kwargs) 74 | if optional_argument is None: 75 | return (out, kwargs) 76 | else: 77 | return (out, kwargs, optional_argument) 78 | 79 | 80 | def report_progress(job_idx, num_jobs, time0, task): 81 | """Report progress to system output""" 82 | msg = [float(job_idx) / num_jobs, (time.time() - time0) / 60.] 83 | msg.append(msg[1] * (1 / msg[0] - 1)) 84 | time_stamp = str(datetime.fromtimestamp(time.time())) 85 | msg_ = time_stamp + ' ' + str( 86 | round(msg[0] * 100, 2)) + '% ' + task + ' done after ' + \ 87 | str(round(msg[1], 2)) + ' minutes. Remaining ' + str( 88 | round(msg[2], 2)) + ' minutes.' 89 | if job_idx < num_jobs: 90 | sys.stderr.write(msg_ + '\r') 91 | else: 92 | sys.stderr.write(msg_ + '\n') 93 | 94 | 95 | def process_jobs(jobs, task=None, num_threads=mp.cpu_count(), use_thread=False): 96 | """Execute parallelized jobs 97 | 98 | Parameters 99 | ---------- 100 | jobs: list(dict) 101 | Each element contains `function` and its parameters 102 | task: str, optional 103 | The name of task. 
If not specified, function name is used 104 | num_threads, (default max count) 105 | The number of threads for parallelization 106 | 107 | Returns 108 | ------- 109 | List: each element is results of each part 110 | """ 111 | if task is None: 112 | if hasattr(jobs[0]['func'], '__name__'): 113 | task = jobs[0]['func'].__name__ 114 | else: 115 | task = 'function' 116 | out = [] 117 | if num_threads > 1: 118 | if use_thread: 119 | executor = ThreadPoolExecutor(max_workers=num_threads) 120 | else: 121 | executor = MyProcessPoolExecutor(max_workers=num_threads) 122 | outputs = executor.map(expand_call, jobs, 123 | chunksize=1) 124 | time0 = time.time() 125 | # Execute programs here 126 | for i, out_ in enumerate(outputs, 1): 127 | out.append(out_) 128 | report_progress(i, len(jobs), time0, task) 129 | else: 130 | for job in jobs: 131 | job = deepcopy(job) 132 | out_ = expand_call(job) 133 | out.append(out_) 134 | return out 135 | -------------------------------------------------------------------------------- /finance_ml/model_selection/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.metrics import log_loss, accuracy_score, f1_score, recall_score, precision_score,\ 4 | precision_recall_curve, roc_curve 5 | 6 | from finance_ml.multiprocessing import mp_pandas_obj 7 | 8 | 9 | def mp_train_times(train_times, test_times, molecule): 10 | trn = train_times[molecule].copy(deep=True) 11 | for init, end in test_times.iteritems(): 12 | df0 = trn[(init <= trn.index) & (trn.index <= end)].index 13 | df1 = trn[(init <= trn) & (trn <= end)].index 14 | df2 = trn[(trn.index <= init) & (end <= trn)].index 15 | trn = trn.drop(df0 | df1 | df2) 16 | return trn 17 | 18 | 19 | def get_train_times(train_times, test_times, num_threads=1): 20 | """Sample train points without overlapping with test period 21 | 22 | Params 23 | ------ 24 | train_times: pd.Series 25 | Trainig points with index for initial and values for end time 26 | test_times: pd.Series 27 | Testing points with index for initial and values for end time 28 | num_threads: int, default 1 29 | The number of thrads for multiprocessing 30 | 31 | Returns 32 | ------- 33 | pd.Series 34 | """ 35 | return mp_pandas_obj( 36 | mp_train_times, ('molecule', train_times.index), 37 | num_threads, 38 | train_times=train_times, 39 | test_times=test_times) 40 | 41 | 42 | def get_embargo_times(times, pct_embargo): 43 | """Get embargo time index for each timestamp 44 | 45 | times: 46 | times: Timestamps 47 | Entire timestamps which you want to apply embargo 48 | pct_embargo: float ranged at [0, 1] 49 | The ratio to embargo with respect to the size of timestamps 50 | 51 | Returns: 52 | pd.Series: For each valud corresponds to a point which you should take 53 | out before from the other forward dataset 54 | """ 55 | step = int(times.shape[0] * pct_embargo) 56 | if step == 0: 57 | embg = pd.Series(times, index=times) 58 | else: 59 | embg = pd.Series(times[step:], index=times[:-step]) 60 | embg = embg.append(pd.Series(times[-1], index=times[-step:])) 61 | return embg 62 | 63 | 64 | def performance(ret, proba, step=0.01): 65 | if isinstance(ret, pd.Series): 66 | ret = ret.values 67 | n_step = int(.5 / step) + 1 68 | pnls = [] 69 | sharpes = [] 70 | won_ratios = [] 71 | ths = np.linspace(.5, 1, n_step) 72 | for th in ths: 73 | neg_idx = proba[:, 0] <= th 74 | pos_idx = proba[:, 1] >= th 75 | neg_ret = ret[neg_idx] 76 | pos_ret = ret[pos_idx] 77 | won_count = 
len(neg_ret[neg_ret < 0]) + len(pos_ret[pos_ret > 0]) 78 | total_count = len(neg_ret) + len(pos_ret) 79 | if total_count == 0: 80 | won_ratio = 0 81 | else: 82 | won_ratio = won_count / total_count 83 | won_ratios.append(won_ratio) 84 | idx = neg_idx | pos_idx 85 | ret_ = ret[idx] 86 | if len(ret_) == 0: 87 | pnl = 0 88 | sharpe = 0 89 | elif len(ret_) == 1: 90 | pnl = float(ret_) 91 | sharpe = 0 92 | else: 93 | pnl = np.sum(ret_) 94 | sharpe = np.mean(ret_) / np.std(ret_) 95 | pnls.append(pnl) 96 | sharpes.append(sharpe) 97 | return ths, np.array(pnls), np.array(sharpes), np.array(won_ratios) 98 | 99 | 100 | def meta_performance(ret, proba, step=0.01): 101 | if isinstance(ret, pd.Series): 102 | ret = ret.values 103 | n_step = int(1. / step) + 1 104 | pnls = [] 105 | sharpes = [] 106 | won_ratios = [] 107 | ths = np.linspace(0, 1, n_step) 108 | for th in ths: 109 | idx = proba[:, 1] >= th 110 | bet_ret = ret[idx] 111 | won_count = len(bet_ret[bet_ret > 0]) 112 | total_count = len(bet_ret) 113 | if total_count == 0: 114 | won_ratio = 0 115 | else: 116 | won_ratio = won_count / total_count 117 | won_ratios.append(won_ratio) 118 | if len(bet_ret) == 0: 119 | pnl = 0 120 | sharpe = 0 121 | elif len(bet_ret) == 1: 122 | pnl = float(bet_ret) 123 | sharpe = 0 124 | else: 125 | pnl = np.sum(bet_ret) 126 | sharpe = np.mean(bet_ret) / np.std(bet_ret) 127 | pnls.append(pnl) 128 | sharpes.append(sharpe) 129 | return ths, np.array(pnls), np.array(sharpes), np.array(won_ratios) 130 | 131 | 132 | def evaluate(model, 133 | X, 134 | y, 135 | method, 136 | sample_weight=None, 137 | pos_idx=1, 138 | pos_label=1, 139 | ret=None): 140 | """Calculate score 141 | 142 | Params 143 | ------ 144 | model: Trained classifier instance 145 | X: array-like, Input feature 146 | y: array-like, Label 147 | method: str 148 | The name of scoring methods. 
'precision', 'recall', 'f1', 'precision_recall', 149 | 'roc', 'accuracy' or 'neg_log_loss' 150 | sample_weight: pd.Series, optional 151 | If specified, apply this to bot testing and training 152 | labels: array-like, optional 153 | The name of labels 154 | 155 | Returns 156 | ------- 157 | list of scores 158 | """ 159 | if method == 'f1': 160 | labels = model.classes_ 161 | pred = model.predict(X) 162 | score = f1_score(y, pred, sample_weight=sample_weight, labels=labels) 163 | elif method == 'neg_log_loss': 164 | labels = model.classes_ 165 | prob = model.predict_proba(X) 166 | score = -log_loss(y, prob, sample_weight=sample_weight, labels=labels) 167 | elif method == 'precision': 168 | pred = model.predict(X) 169 | score = precision_score( 170 | y, pred, pos_label=pos_label, sample_weight=sample_weight) 171 | elif method == 'recall': 172 | pred = model.predict(X) 173 | score = recall_score( 174 | y, pred, pos_label=pos_label, sample_weight=sample_weight) 175 | elif method == 'precision_recall': 176 | prob = model.predict_proba(X)[:, pos_idx] 177 | score = precision_recall_curve( 178 | y, prob, pos_label=pos_label, sample_weight=sample_weight) 179 | elif method == 'roc': 180 | prob = model.predict_proba(X)[:, pos_idx] 181 | score = roc_curve( 182 | y, prob, pos_label=pos_label, sample_weight=sample_weight) 183 | elif method == 'accuracy': 184 | pred = model.predict(X) 185 | score = accuracy_score(y, pred, sample_weight=sample_weight) 186 | elif method == 'performance': 187 | prob = model.predict_proba(X) 188 | score = performance(ret, prob) 189 | elif method == 'meta_performance': 190 | prob = model.predict_proba(X) 191 | score = meta_performance(ret, prob) 192 | else: 193 | raise Exception(f'No Implementation method={method}') 194 | return score -------------------------------------------------------------------------------- /finance_ml/labeling/betsizes.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import numpy as np 3 | import pandas as pd 4 | from scipy.stats import norm, t 5 | 6 | from ..multiprocessing import mp_pandas_obj 7 | 8 | 9 | # Specific Betting Size Calculation 10 | ############################################################### 11 | def get_gaussian_betsize(probs, num_classes=2, eps=1e-4): 12 | """Translate probability to bettingsize 13 | 14 | Args: 15 | probs (array-like) 16 | num_classes (int, optional): Defaults to 2 17 | 18 | Returns: 19 | array-like: Signals after gaussian transform 20 | """ 21 | max_prob = 1 - eps 22 | min_prob = eps 23 | if isinstance(probs, numbers.Number): 24 | if probs >= min_prob and probs <= max_prob: 25 | signal = (probs - 1. / num_classes) / np.sqrt(probs * (1 - probs)) 26 | signal = 2 * norm.cdf(signal) - 1 27 | elif probs < min_prob: 28 | signal = -1 29 | elif probs > max_prob: 30 | signal = 1 31 | else: 32 | raise ValueError(f"Unkonwn probabilty: {probs}") 33 | else: 34 | signal = probs.copy() 35 | signal[probs >= max_prob] = 1 36 | signal[probs <= min_prob] = -1 37 | cond = (probs < max_prob) & (probs > min_prob) 38 | signal[cond] = (probs[cond] - 1. 
/ num_classes) / np.sqrt(probs[cond] * (1 - probs[cond])) 39 | signal[cond] = 2 * norm.cdf(signal[cond]) - 1 40 | return signal 41 | 42 | 43 | def get_tstats_betsize(probs, N, num_classes=2, eps=1e-4): 44 | """Translate probability to bettingsize 45 | 46 | Args: 47 | probs (array-like) 48 | N (int): The number of estimators used for generating probs 49 | num_classes (int, optional): Defaults to 2 50 | 51 | Returns: 52 | array-like: Signals after gaussian transform 53 | """ 54 | max_prob = 1 - eps 55 | min_prob = eps 56 | if isinstance(probs, numbers.Number): 57 | if probs >= min_prob and probs <= max_prob: 58 | signal = (probs - 1. / num_classes) / np.sqrt(probs * (1 - probs)) * np.sqrt(N) 59 | signal = 2 * t.cdf(signal, df=N-1) - 1 60 | elif probs < min_prob: 61 | signal = -1 62 | elif probs > max_prob: 63 | signal = 1 64 | else: 65 | raise ValueError(f"Unkonwn probabilty: {probs}") 66 | else: 67 | signal = probs.copy() 68 | signal[probs >= max_prob] = 1 69 | signal[probs <= min_prob] = -1 70 | cond = (probs < max_prob) & (probs > min_prob) 71 | signal[cond] = (probs[cond] - 1. / num_classes) / np.sqrt(probs[cond] * (1 - probs[cond])) * np.sqrt(N) 72 | signal[cond] = 2 * t.cdf(signal[cond], df=N-1) - 1 73 | return signal 74 | 75 | 76 | # Aggregate Signals 77 | ##################################################################### 78 | def discrete_signals(signals, step_size): 79 | """Discretize signals 80 | 81 | Args: 82 | signals (pd.Series or float): Signals for betting size ranged [-1, 1] 83 | 84 | step_size (float): Discrete size ranged [0, 1] 85 | 86 | Returns: 87 | pd.Series or float: Discretized signals. If signals is pd.Series,\ 88 | return value is pd.Series. If signals is float, return value\ 89 | is float 90 | """ 91 | if isinstance(signals, numbers.Number): 92 | signals = round(signals / step_size) * step_size 93 | signals = min(1, signals) 94 | signals = max(-1, signals) 95 | else: 96 | signals = (signals / step_size).round() * step_size 97 | signals[signals > 1] = 1 98 | signals[signals < -1] = -1 99 | return signals 100 | 101 | 102 | def avg_active_signals(signals, num_threads=1, timestamps=None): 103 | """Average active signals 104 | 105 | Args: 106 | signals (pd.DataFrame): With keys: 't1' and 'signal' 107 | - t1, signal effective time boundary. 108 | - signal, signal value 109 | 110 | num_threads (int, optional): The number of processor used for calculation.\ 111 | Defaults to 1. 112 | 113 | timestamps (list, optional): Timestamps used for output. When there is no active signal,\ 114 | value will be zero on that point. If not specified, use signals.index. 
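    Example (illustrative sketch with a tiny hand-made ``signals`` frame):

        signals = pd.DataFrame(
            {'t1': pd.to_datetime(['2020-01-03', '2020-01-04']),
             'signal': [1.0, -0.5]},
            index=pd.to_datetime(['2020-01-01', '2020-01-02']))
        avg = avg_active_signals(signals, num_threads=1)
        # Both signals are active on 2020-01-02, so the averaged value
        # there is (1.0 - 0.5) / 2 = 0.25; timestamps where no signal is
        # active are filled with 0.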
115 | 116 | Returns: 117 | pd.Series: Averaged signals 118 | """ 119 | if timestamps is None: 120 | timestamps = set(signals['t1'].dropna().values) 121 | timestamps = list(timestamps.union(set(signals.index.values))) 122 | timestamps.sort() 123 | out = mp_pandas_obj( 124 | mp_avg_active_signals, ('molecule', timestamps), 125 | num_threads, 126 | signals=signals) 127 | return out 128 | 129 | 130 | def mp_avg_active_signals(signals, molecule): 131 | """Function to calculate averaging with multiprocessing""" 132 | out = pd.Series() 133 | for loc in molecule: 134 | loc = pd.Timestamp(loc) 135 | cond = (signals.index <= loc) & ( 136 | (loc < signals['t1']) | pd.isnull(signals['t1'])) 137 | active_idx = signals[cond].index 138 | if len(active_idx) > 0: 139 | out[loc] = signals.loc[active_idx, 'signal'].mean() 140 | else: 141 | out[loc] = 0 142 | return out 143 | 144 | 145 | # Signal Translation 146 | ################################################################################# 147 | def get_betsize(probs, 148 | events=None, 149 | scale=1, 150 | step_size=None, 151 | signal_func=None, 152 | num_classes=2, 153 | num_threads=1, 154 | **kwargs): 155 | """Average and discretize signals from probability 156 | 157 | Args: 158 | events (pd.DataFrame): With the following keys 159 | - time, time of barrier 160 | - type, type of barrier - tp, sl, or t1 161 | - trgt, horizontal barrier width 162 | - side, position side 163 | 164 | probs (pd.Series): Probability signals 165 | 166 | scale (float): Betting size scale 167 | 168 | step_size (float, optional): If specified, discretize signals.\ 169 | The value is ranged [0, 1] 170 | 171 | num_classes (int, optional): The number of classes. Defaults to 2. 172 | 173 | num_threads (int, optional): The number of threads used for averaging bets.\ 174 | Defaults to 1. 
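    Example (illustrative sketch; the probabilities are made up):

        probs = pd.Series([0.55, 0.70, 0.95],
                          index=pd.date_range('2020-01-01', periods=3))
        bets = get_betsize(probs, step_size=0.1)
        # With the default Gaussian transform, probabilities near
        # 1 / num_classes map to bets near 0 and probabilities near 1 map
        # to bets near +1; step_size then rounds to multiples of 0.1.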
175 | 176 | Returns: 177 | pd.Series: bet size signals 178 | """ 179 | # Get Signals 180 | if probs.shape[0] == 0: 181 | return pd.Series() 182 | if signal_func is None: 183 | signal_func = get_gaussian_betsize 184 | signal = pd.Series(signal_func(probs, num_classes=num_classes, **kwargs), index=probs.index) 185 | if events and 'side' in events: 186 | signal = signal * events.loc[signal.index, 'side'] 187 | if step_size is not None: 188 | signal = discrete_signals(signal, step_size=step_size) 189 | signal = scale * signal 190 | return signal -------------------------------------------------------------------------------- /finance_ml/model_selection/kfold.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import combinations 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.model_selection._split import _BaseKFold 7 | 8 | from .utils import get_train_times 9 | 10 | 11 | class PurgedKFold(_BaseKFold): 12 | """Cross Validation with purging and embargo 13 | 14 | Params 15 | ------ 16 | n_splits: int 17 | The number of splits for cross validation 18 | t1: pd.Series 19 | Index and value correspond to the begining and end of information 20 | pct_embargo: float, default 0 21 | The percentage of applying embargo 22 | purging: bool, default True 23 | If true, apply purging method 24 | num_threads: int, default 1 25 | The number of threads for purging 26 | """ 27 | 28 | def __init__(self, 29 | n_splits=3, 30 | t1=None, 31 | pct_embargo=0., 32 | purging=True, 33 | num_threads=1): 34 | super(PurgedKFold, self).__init__( 35 | n_splits=n_splits, shuffle=False, random_state=None) 36 | if not isinstance(t1, pd.Series): 37 | raise ValueError('t1 must be pd.Series') 38 | self.t1 = t1 39 | self.pct_embargo = pct_embargo 40 | self.purging = purging 41 | self.num_threads = num_threads 42 | 43 | def split(self, X, y=None, groups=None): 44 | """Get train and test times stamps 45 | 46 | Params 47 | ------ 48 | X: pd.DataFrame 49 | y: pd.Series, optional 50 | 51 | Returns 52 | ------- 53 | train_indices, test_indices: np.array 54 | """ 55 | if (X.index == self.t1.index).sum() != len(self.t1): 56 | raise ValueError('X and t1 must have the same index') 57 | indices = np.arange(X.shape[0]) 58 | # Embargo width 59 | embg_size = int(X.shape[0] * self.pct_embargo) 60 | # Pandas is close set when using [t0:t1] 61 | test_ranges = [(i[0], i[-1] + 1) 62 | for i in np.array_split(indices, self.n_splits)] 63 | for st, end in test_ranges: 64 | test_indices = indices[st:end] 65 | t0 = self.t1.index[st] 66 | # Avoid look ahead leakage here 67 | train_indices = self.t1.index.searchsorted( 68 | self.t1[self.t1 <= t0].index) 69 | # Edge point of test set in the most recent side 70 | max_t1_idx = self.t1.index.searchsorted( 71 | self.t1[test_indices].max()) 72 | if max_t1_idx < X.shape[0]: 73 | # Adding indices after test set 74 | train_indices = np.concatenate( 75 | (train_indices, indices[max_t1_idx + embg_size:])) 76 | # Purging 77 | if self.purging: 78 | train_t1 = self.t1.iloc[train_indices] 79 | test_t1 = self.t1.iloc[test_indices] 80 | train_t1 = get_train_times( 81 | train_t1, test_t1, num_threads=self.num_threads) 82 | train_indices = self.t1.index.searchsorted(train_t1.index) 83 | yield train_indices, test_indices 84 | 85 | 86 | class CPKFold(object): 87 | """Cross Validation with purging and embargo 88 | 89 | Params 90 | ------ 91 | n_splits: tuple 92 | Combinatorial of (n_splits[0], n_splits[1]). 
n_splits[1] is the number of test. 93 | t1: pd.Series 94 | Index and value correspond to the begining and end of information 95 | pct_embargo: float, default 0 96 | The percentage of applying embargo 97 | purging: bool, default True 98 | If true, apply purging method 99 | num_threads: int, default 1 100 | The number of threads for purging 101 | """ 102 | 103 | def __init__(self, 104 | n_splits, 105 | t1=None, 106 | pct_embargo=0., 107 | purging=True, 108 | num_threads=1): 109 | if not isinstance(t1, pd.Series): 110 | raise ValueError('t1 must be pd.Series') 111 | self.n_splits = n_splits 112 | self.t1 = t1 113 | self.pct_embargo = pct_embargo 114 | self.purging = purging 115 | self.num_threads = num_threads 116 | 117 | def split(self, X, y=None, groups=None): 118 | """Get train and test times stamps 119 | 120 | Params 121 | ------ 122 | X: pd.DataFrame 123 | y: pd.Series, optional 124 | 125 | Returns 126 | ------- 127 | train_indices, test_indices: np.array 128 | """ 129 | if (X.index == self.t1.index).sum() != len(self.t1): 130 | raise ValueError('X and t1 must have the same index') 131 | indices = np.arange(X.shape[0]) 132 | # Embargo width 133 | embg_size = int(X.shape[0] * self.pct_embargo) 134 | # Generate Combinatorial Pairs for training 135 | split_indices = np.array_split(indices, self.n_splits[0]) 136 | self._split_locs = np.arange(self.n_splits[0]) 137 | self._test_loc = { 138 | i: X.index[idx] 139 | for i, idx in enumerate(split_indices) 140 | } 141 | self._test_combs = np.array( 142 | list(combinations(self._split_locs, self.n_splits[1]))) 143 | train_combs = [] 144 | for comb_idx in self._test_combs: 145 | train_comb = list(set(self._split_locs).difference(set(comb_idx))) 146 | train_combs.append(train_comb) 147 | 148 | train_indices_embg = [] 149 | train_indices = [] 150 | for comb_idx in train_combs: 151 | train_index_embg = [] 152 | train_index = [] 153 | for i in comb_idx: 154 | if i < self.n_splits[0] - 1: 155 | train_index_ = np.hstack( 156 | (split_indices[i], split_indices[i + 1][:embg_size])) 157 | train_index_embg.append(train_index_) 158 | train_index.append(split_indices[i]) 159 | else: 160 | train_index_embg.append(split_indices[i]) 161 | train_index.append(split_indices[i]) 162 | train_indices_embg.append( 163 | np.array(list(set(np.hstack(train_index_embg))))) 164 | train_indices.append(np.array(list(set(np.hstack(train_index))))) 165 | 166 | for train_index, train_index_embg in zip(train_indices, 167 | train_indices_embg): 168 | test_index = np.array( 169 | list(set(indices).difference(set(train_index)))) 170 | # Purging 171 | if self.purging: 172 | train_t1 = self.t1.iloc[train_index] 173 | test_t1 = self.t1.iloc[test_index] 174 | train_t1 = get_train_times( 175 | train_t1, test_t1, num_threads=self.num_threads) 176 | train_index = self.t1.index.searchsorted(train_t1.index) 177 | yield train_index, test_index 178 | 179 | def get_test_combs(self): 180 | return self._test_combs, self._test_loc 181 | 182 | 183 | def generate_signals(clf, 184 | X, 185 | y, 186 | sample_weight=None, 187 | n_splits=(4, 2), 188 | t1=None, 189 | pct_embargo=0., 190 | purging=True, 191 | num_threads=1, 192 | **kwargs): 193 | """Cross Validation with default purging and embargo 194 | 195 | Params 196 | ------ 197 | X: pd.DataFrame 198 | y: pd.Series, optional 199 | sample_weight: pd.Series, optional 200 | If specified, apply this to bot testing and training 201 | n_splits: tuple 202 | Combinatorial of (n_splits[0], n_splits[1]). n_splits[1] is the number of test. 
203 | t1: pd.Series 204 | Index and value correspond to the begining and end of information 205 | pct_embargo: float, default 0 206 | The percentage of applying embargo 207 | purging: bool, default True 208 | If true, apply purging method 209 | num_threads: int, default 1 210 | The number of threads for purging 211 | kwargs: Parameters for scoring function 212 | 213 | Returns 214 | ------- 215 | result: dict(list) 216 | Each element is signal generated from classifier 217 | test_times: timestamps 218 | """ 219 | cv_gen = CPKFold( 220 | n_splits=n_splits, 221 | t1=t1, 222 | pct_embargo=pct_embargo, 223 | purging=purging, 224 | num_threads=num_threads) 225 | signals = [] 226 | for train, test in cv_gen.split(X=X): 227 | train_params = dict() 228 | test_params = dict() 229 | # Sample weight is an optional parameter 230 | if sample_weight is not None: 231 | train_params['sample_weight'] = sample_weight.iloc[train].values 232 | test_params['sample_weight'] = sample_weight.iloc[test].values 233 | test_params.update(kwargs) 234 | clf_fit = clf.fit( 235 | X=X.iloc[train, :].values, y=y.iloc[train].values, **train_params) 236 | # Scoring 237 | signal = clf_fit.predict_proba(X.iloc[test, :].values) 238 | signal = pd.DataFrame(signal, index=X.iloc[test].index) 239 | signals.append(signal) 240 | 241 | combs = cv_gen.get_test_combs() 242 | result = defaultdict(list) 243 | test_times = combs[1] 244 | for signal, comb in zip(signals, combs[0]): 245 | for i in comb: 246 | result[i].append(signal.loc[test_times[i]]) 247 | return result, test_times -------------------------------------------------------------------------------- /finance_ml/labeling/barriers.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import multiprocessing as mp 6 | 7 | from ..multiprocessing import mp_pandas_obj 8 | from ..constants import LONG, SHORT 9 | 10 | 11 | def get_touch_idx(close, events, sltp, molecule=None): 12 | """Return timestamps of when data points touch the barriers 13 | 14 | Args: 15 | close (pd.Series): Close price series 16 | 17 | events (pd.DataFrame): With columns: 't1', 'trgt', and 'side' 18 | t1, time stamp of vertical barrier, could be np.nan 19 | trgt, unit of width of horizontal barriers 20 | side, Side label for metalabeling 21 | 22 | sltp (list): Coefficients of width of Stop Loss and Take Profit.\ 23 | sltp[0] and sltp[1] correspond to width of stop loss\ 24 | and take profit, respectively. If 0 or negative, the barrier\ 25 | is turned off. 
26 | 27 | molecule (list, optional): Subset of indices of events to be processed 28 | 29 | Returns: 30 | pd.DataFrame: each colum corresponds to the time to touch each barrier 31 | """ 32 | # Sample a subset with specific indices 33 | if molecule is not None: 34 | _events = events.loc[molecule] 35 | else: 36 | _events = events 37 | touch_idx = pd.DataFrame(index=_events.index) 38 | # Set Stop Loss and Take Profoit 39 | if sltp[0] > 0: 40 | sls = -sltp[0] * _events["trgt"] 41 | else: 42 | # Switch off stop loss 43 | sls = pd.Series(index=_events.index) 44 | if sltp[1] > 0: 45 | tps = sltp[1] * _events["trgt"] 46 | else: 47 | # Switch off profit taking 48 | tps = pd.Series(index=_events.index) 49 | # Replace undefined value with the last time index 50 | vertical_lines = _events["t1"].fillna(close.index[-1]) 51 | for loc, t1 in vertical_lines.iteritems(): 52 | df = close[loc:t1] 53 | # Change the direction depending on the side 54 | df = (df / close[loc] - 1) * _events.at[loc, 'side'] 55 | touch_idx.at[loc, 'sl'] = df[df < sls[loc]].index.min() 56 | touch_idx.at[loc, 'tp'] = df[df > tps[loc]].index.min() 57 | touch_idx['t1'] = _events['t1'].copy(deep=True) 58 | return touch_idx 59 | 60 | 61 | def get_events(close, timestamps, sltp=None, trgt=None, min_trgt=0, 62 | num_threads=1, t1=None, side=None): 63 | """Return DataFrame containing infomation defining barriers 64 | 65 | Args: 66 | close (pd.Series): Close price series 67 | 68 | timestamps (pd.DatetimeIndex): sampled points to analyze 69 | 70 | sltp (list or int, optional): Coefficients of width of Stop Loss and Take Profit.\ 71 | sltp[0] and sltp[1] correspond to width of stop loss\ 72 | and take profit, respectively. If 0 or negative, the barrier\ 73 | is turned off. If not specified, use only vertical line.\ 74 | 75 | trgt (pd.Series, optional): Time series of threashold.\ 76 | If not specified, we will switch off horizontal thresholds 77 | 78 | min_trgt (float, optional): Minimum value of threashold to label either of negative\ 79 | or positive. Defaults to 0. 80 | 81 | num_threads (int, optional): The number of threads to use.\ 82 | Defaults to 1. 83 | 84 | t1 (pd.Series, optional): Vertical lines\ 85 | 86 | side (pd.Series, optional): Side of trading positions 87 | 88 | Returns: 89 | pd.DataFrame: With the following keys: 90 | - t1, timestamp of labeled point 91 | - trgt, target threashold value 92 | - type, the type of labeled point, either of `t1`, `tp`, or `sl`. 
93 | - side, Only if you use metalabeling, this key is available 94 | """ 95 | if trgt is None: 96 | # Switch off horizontal barriers 97 | trgt = pd.Series(1 + min_trgt, index=timestamps) 98 | sltp = -1 99 | elif isinstance(trgt, numbers.Number): 100 | trgt = pd.Series(trgt, index=timestamps) 101 | # Get sampled target values 102 | trgt = trgt.loc[timestamps] 103 | trgt = trgt[trgt > min_trgt] 104 | if len(trgt) == 0: 105 | return pd.DataFrame(columns=['t1', 'trgt', 'side']) 106 | # Get time boundary t1 107 | if t1 is None: 108 | t1 = pd.Series(pd.NaT, index=timestamps) 109 | # slpt has to be either of integer, list or tuple 110 | if isinstance(sltp, list) or isinstance(sltp, tuple): 111 | _sltp = sltp[:2] 112 | else: 113 | _sltp = [sltp, sltp] 114 | # Define the side 115 | if side is None: 116 | # Default is LONG 117 | _side = pd.Series(LONG, index=trgt.index) 118 | else: 119 | _side = side.loc[trgt.index] 120 | events = pd.concat({'t1': t1, 'trgt': trgt, 'side': _side}, axis=1) 121 | events = events.dropna(subset=['trgt']) 122 | time_idx = mp_pandas_obj(func=get_touch_idx, 123 | pd_obj=('molecule', events.index), 124 | num_threads=num_threads, 125 | close=close, events=events, sltp=_sltp) 126 | # Skip when all of barrier are not touched 127 | time_idx = time_idx.dropna(how='all') 128 | events['type'] = time_idx.idxmin(axis=1) 129 | events['t1'] = time_idx.min(axis=1) 130 | if side is None: 131 | events = events.drop('side', axis=1) 132 | return events 133 | 134 | 135 | def get_t1(close, timestamps, seconds=None): 136 | """Return horizontal timestamps 137 | 138 | Note: 139 | Not include the case to hit the vertical line at the end of close.index 140 | 141 | Args: 142 | close (pd.Series) 143 | 144 | timestamps (pd.DatetimeIndex) 145 | 146 | seconds (int, optional): 147 | The number of forward dates or seconds for vertical barrier. 148 | 149 | Returns: 150 | pd.Series: Vertical barrier timestamps 151 | """ 152 | delta = pd.Timedelta(seconds=seconds) 153 | t1 = close.index.searchsorted(timestamps + delta) 154 | t1 = t1[t1 < close.shape[0]] 155 | t1 = pd.Series(close.index[t1], index=timestamps[:t1.shape[0]]) 156 | return t1 157 | 158 | 159 | def get_labels(close, events, min_ret=0, sign_label=True, zero_label=0): 160 | """Return label 161 | 162 | Args: 163 | close (pd.Series) 164 | 165 | events (pd.DataFrame): 166 | t1: time of barrier 167 | type: type of barrier - tp, sl, or t1 168 | trgt: horizontal barrier width 169 | side: position side 170 | 171 | min_ret (float): Minimum of absolute value for labeling non zero label. min_ret >=0 172 | 173 | sign_label (bool, opyionsl): If True, assign label for points touching vertical\ 174 | line accroing to return's sign. Defaults to True. 175 | 176 | zero_label (int, optional): 177 | If specified, use it for the label of zero value of return\ 178 | If not, get rid of samples. Defaults to 0. 179 | 180 | Returns: 181 | pd.DataFrame: With the following keys: 182 | - ret, return value for label 183 | - t1, timestamp of labeled point 184 | - label, label values 185 | - type, the type of labeled point, either of `t1`, `tp`, or `sl`. 
186 | - side, Only if you use metalabeling, this key is available 187 | """ 188 | # Prices algined with events 189 | events = events.dropna(subset=['t1']) 190 | # All used indices 191 | time_idx = events.index.union(events['t1'].values).drop_duplicates() 192 | close = close.reindex(time_idx, method='bfill') 193 | # Create out object 194 | out = pd.DataFrame(index=events.index) 195 | out['ret'] = close.loc[events['t1'].values].values / close.loc[ 196 | events.index] - 1. 197 | # Modify return according to the side 198 | if 'side' in events: 199 | out['ret'] *= events['side'] 200 | out['side'] = events['side'] 201 | # Assign labels 202 | out = out.dropna() 203 | out['label'] = np.sign(out['ret']) 204 | if 'side' in events: 205 | out.loc[out['ret'] <= min_ret, 'label'] = zero_label 206 | else: 207 | out.loc[(out['ret'] <= min_ret) & (out['ret'] >= -min_ret), 'label'] = zero_label 208 | if not sign_label: 209 | out['label'].loc[events['type'] == 't1'] = zero_label 210 | out['t1'] = events['t1'] 211 | out['type'] = events['type'] 212 | return out 213 | 214 | 215 | 216 | def get_barrier_labels(close, timestamps=None, trgt=None, sltp=[1, 1], 217 | seconds=None, min_trgt=0, min_ret=0, 218 | num_threads=None, side=None, sign_label=False, zero_label=0): 219 | """Return Labels for triple barrier 220 | 221 | Args: 222 | close (pd.Series) 223 | 224 | timestamps (pd.DatetimeIndex, optional): Sampled points to analyze.\ 225 | If not specified, use close.index 226 | 227 | trgt (pd.Series, optional): Time series of threshold.\ 228 | If not specified, it will switch off horizontal barriers 229 | 230 | sltp (list, optional): Coefficients of width of Stop Loss and Take Profit.\ 231 | sltp[0] and sltp[1] correspond to width of stop loss\ 232 | and take profit, respectively. If 0 or negative, the barrier\ 233 | is switched off. Defaults to [1, 1].\ 234 | 235 | seconds (float, optional): The length of vertical barrier. 236 | 237 | min_trgt (float, optional): Minimum value of threshold to label positive or negative.\ 238 | Deafults to 0. 239 | 240 | num_threads (int, optional): The number of threads to use. If not specified,\ 241 | use maximum number of threads. 242 | 243 | side (pd.Series, optional): Side of trading positions 244 | 245 | sign_label (bool, optional): If True, assign label for points touching vertical\ 246 | line according to return's sign. Defaults to True. 247 | 248 | zero_label (int, optional): The label for zero value of returns 249 | 250 | Returns: 251 | pd.DataFrame: With the following keys: 252 | - ret, return value for label 253 | - t1, timestamp of labeled point 254 | - label, label values 255 | - type, the type of labeled point, either of `t1`, `tp`, or `sl`. 
256 | - side, Only if you use metalabeling, this key is available 257 | """ 258 | if timestamps is None: 259 | if side is None: 260 | timestamps = close.index 261 | else: 262 | timestamps = side.index 263 | t1 = get_t1(close, timestamps, seconds=seconds) 264 | if num_threads is None: 265 | num_threads = mp.cpu_count() 266 | events = get_events(close, timestamps, 267 | sltp=sltp, 268 | trgt=trgt, 269 | min_trgt=min_trgt, 270 | num_threads=num_threads, 271 | t1=t1, side=side) 272 | labels = get_labels(close, events, min_ret=min_ret, sign_label=sign_label, zero_label=zero_label) 273 | return labels -------------------------------------------------------------------------------- /finance_ml/importance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sklearn.model_selection import KFold 5 | from sklearn.metrics import log_loss, mean_squared_error 6 | 7 | from .model_selection import PurgedKFold, cv_score, evaluate 8 | 9 | 10 | def mp_feat_imp_SFI(clf, X, y, feat_names, sample_weight=None, scoring='neg_log_loss', 11 | n_splits=3, t1=None, cv_gen=None, pct_embargo=0, purging=True): 12 | imp = pd.DataFrame(columns=['mean', 'std']) 13 | for feat_name in feat_names: 14 | scores = cv_score(clf, X=X[[feat_name]], y=y, 15 | sample_weight=sample_weight, 16 | scoring=scoring, 17 | cv_gen=cv_gen, 18 | n_splits=n_splits, 19 | t1=t1, 20 | pct_embargo=pct_embargo, 21 | purging=purging) 22 | imp.loc[feat_name, 'mean'] = scores.mean() 23 | imp.loc[feat_name, 'std'] = scores.std() * scores.shape[0] ** -0.5 24 | return imp 25 | 26 | 27 | def feat_imp_SFI(clf, X, y, sample_weight=None, scoring='neg_log_loss', 28 | n_splits=5, t1=None, cv_gen=None, pct_embargo=0, purging=True, num_threads=1): 29 | """Calculate Single Feature Importance 30 | 31 | Args: 32 | clf: Classifier instance 33 | X: pd.DataFrame, Input feature 34 | y: pd.Series, Label 35 | clstrs: dict[list] 36 | Clustering labels: key is the name of cluster and value is list of belonging columns 37 | sample_weight: pd.Series, optional 38 | If specified, apply this to testing and training 39 | scoring: str, default 'neg_log_loss' 40 | The name of scoring methods. 'f1', 'accuracy' or 'neg_log_loss' 41 | n_splits: int, default 3 42 | The number of splits for cross validation 43 | t1: pd.Series 44 | Index and value correspond to the begining and end of information. 
It is required for purging and embargo 45 | cv_gen: KFold instance 46 | If not specified, use PurgedKfold 47 | pct_embargo: float, default 0 48 | The percentage of applying embargo 49 | purging: bool, default True 50 | If true, apply purging method 51 | num_threads: int, default 1 52 | The number of threads for purging 53 | 54 | Returns: 55 | pd.DataFrame: Importance means and standard deviations 56 | - mean: Mean of importance 57 | - std: Standard deviation of importance 58 | """ 59 | imp = mp_pandas_obj(mp_feat_imp_SFI, ('feat_names', X.columns), 60 | num_threads, clf=clf, X=X, y=y, sample_weight=sample_weight, 61 | scoring=scoring, n_splits=n_splits, t1=t1, cv_gen=cv_gen, 62 | pct_embargo=pct_embargo, purging=purging) 63 | return imp 64 | 65 | 66 | def feat_imp_MDI(fit, feat_names): 67 | """Compute Mean Decrease Impurity 68 | 69 | Args: 70 | forest (Forest Classifier instance) 71 | feat_names (list(str)): List of names of features 72 | 73 | Returns: 74 | pd.DataFrame: Importance means and standard deviations 75 | - mean: Mean of importance 76 | - std: Standard deviation of importance 77 | """ 78 | df0 = {i: tree.feature_importances_ for i, tree in enumerate(fit.estimators_)} 79 | df0 = pd.DataFrame.from_dict(df0, orient='index') 80 | df0.columns = feat_names 81 | df0 = df0.replace(0, np.nan) 82 | imp = pd.concat({"mean": df0.mean(), "std": df0.std() * (df0.shape[0] ** -0.5)}, axis=1) 83 | imp /= imp["mean"].sum() 84 | return imp 85 | 86 | 87 | def feat_imp_MDA(clf, X, y, sample_weight=None, scoring='neg_log_loss', n_splits=5, t1=None, 88 | cv_gen=None, pct_embargo=0, purging=True, num_threads=1): 89 | """Calculate Mean Decrease Accuracy 90 | 91 | Note: 92 | You can use any classifier to estimate importance 93 | 94 | Args: 95 | clf: Classifier instance 96 | X: pd.DataFrame, Input feature 97 | y: pd.Series, Label 98 | sample_weight: pd.Series, optional 99 | If specified, apply this to testing and training 100 | scoring: str, default 'neg_log_loss' 101 | The name of scoring methods. 'f1', 'accuracy' or 'neg_log_loss' 102 | n_splits: int, default 3 103 | The number of splits for cross validation 104 | t1: pd.Series 105 | Index and value correspond to the begining and end of information. 
        cv_gen: KFold instance
            If not specified, use PurgedKFold
        pct_embargo: float, default 0
            The percentage of applying embargo
        purging: bool, default True
            If True, apply purging method
        num_threads: int, default 1
            The number of threads for purging

    Returns:
        pd.DataFrame: Importance means and standard deviations
            - mean: Mean of importance
            - std: Standard deviation of importance
    """

    if cv_gen is None:
        if t1 is not None:
            cv_gen = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo,
                                 purging=purging, num_threads=num_threads)
        else:
            cv_gen = KFold(n_splits=n_splits)
    index = np.arange(n_splits)
    scores = pd.Series(index=index, dtype=float)
    scores_perm = pd.DataFrame(index=index, columns=X.columns)
    for idx, (train, test) in zip(index, cv_gen.split(X=X)):
        X_train = X.iloc[train]
        y_train = y.iloc[train]
        if sample_weight is not None:
            w_train = sample_weight.iloc[train].values
        else:
            w_train = None
        X_test = X.iloc[test]
        y_test = y.iloc[test]
        if sample_weight is not None:
            w_test = sample_weight.iloc[test].values
        else:
            w_test = None
        clf_fit = clf.fit(X_train, y_train, sample_weight=w_train)
        scores.loc[idx] = evaluate(clf_fit, X_test, y_test, scoring,
                                   sample_weight=w_test)

        for col in X.columns:
            X_test_ = X_test.copy(deep=True)
            # Shuffle a single feature so it carries no information
            np.random.shuffle(X_test_[col].values)
            scores_perm.loc[idx, col] = evaluate(clf_fit, X_test_, y_test, scoring,
                                                 sample_weight=w_test)
    # (original score) - (permuted score)
    imprv = (-scores_perm).add(scores, axis=0)
    # Normalize by the maximum achievable improvement
    if scoring == 'neg_log_loss':
        max_imprv = -scores_perm
    else:
        max_imprv = 1. - scores_perm
    imp = imprv / max_imprv
    return pd.concat({"mean": imp.mean(), "std": imp.std() * (imp.shape[0] ** -0.5)}, axis=1)


def group_mean_std(df0, clstrs):
    # Aggregate per-tree importances over each cluster of columns
    out = pd.DataFrame(columns=['mean', 'std'])
    for key, elements in clstrs.items():
        df1 = df0[elements].sum(axis=1)
        out.loc[f"C_{key}", 'mean'] = df1.mean()
        out.loc[f"C_{key}", 'std'] = df1.std() * df1.shape[0] ** -.5
    return out


def feat_imp_MDI_clustered(fit, feat_names, clstrs):
    """Compute Clustered Mean Decrease Impurity

    Args:
        fit: Fitted tree-ensemble classifier instance (e.g. a random forest)
        feat_names (list(str)): List of names of features
        clstrs: dict[list]
            Clustering labels: key is the name of the cluster and value is the list of member columns

    Returns:
        pd.DataFrame: Importance means and standard deviations
            - mean: Mean of importance
            - std: Standard deviation of importance
    """
    df0 = {i: tree.feature_importances_ for i, tree in enumerate(fit.estimators_)}
    df0 = pd.DataFrame.from_dict(df0, orient='index')
    df0.columns = feat_names
    df0 = df0.replace(0, np.nan)  # because max_features=1
    imp = group_mean_std(df0, clstrs)
    imp /= imp['mean'].sum()
    return imp


def feat_imp_MDA_clustered(clf, X, y, clstrs,
                           sample_weight=None,
                           scoring='neg_log_loss',
                           n_splits=5, t1=None,
                           cv_gen=None, pct_embargo=0,
                           purging=True, num_threads=1):
    """Calculate Clustered Mean Decrease Accuracy

    Note:
        You can use any classifier to estimate importance

    Args:
        clf: Classifier instance
        X: pd.DataFrame, Input feature
        y: pd.Series, Label
        clstrs: dict[list]
            Clustering labels: key is the name of the cluster and value is the list of member columns
        sample_weight: pd.Series, optional
            If specified, apply this to testing and training
        scoring: str, default 'neg_log_loss'
            The name of the scoring method: 'f1', 'accuracy' or 'neg_log_loss'
        n_splits: int, default 5
            The number of splits for cross validation
        t1: pd.Series
            Index and value correspond to the beginning and end of information.
            It is required for purging and embargo
        cv_gen: KFold instance
            If not specified, use PurgedKFold
        pct_embargo: float, default 0
            The percentage of applying embargo
        purging: bool, default True
            If True, apply purging method
        num_threads: int, default 1
            The number of threads for purging

    Returns:
        pd.DataFrame: Importance means and standard deviations
            - mean: Mean of importance
            - std: Standard deviation of importance
    """

    if cv_gen is None:
        if t1 is not None:
            cv_gen = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo,
                                 purging=purging, num_threads=num_threads)
        else:
            cv_gen = KFold(n_splits=n_splits)
    index = np.arange(n_splits)
    scores = pd.Series(index=index, dtype=float)
    scores_perm = pd.DataFrame(index=index, columns=clstrs.keys())
    for idx, (train, test) in zip(index, cv_gen.split(X=X)):
        X_train = X.iloc[train]
        y_train = y.iloc[train]
        if sample_weight is not None:
            w_train = sample_weight.iloc[train].values
        else:
            w_train = None
        X_test = X.iloc[test]
        y_test = y.iloc[test]
        if sample_weight is not None:
            w_test = sample_weight.iloc[test].values
        else:
            w_test = None
        clf_fit = clf.fit(X_train, y_train, sample_weight=w_train)
        scores.loc[idx] = evaluate(clf_fit, X_test, y_test, scoring,
                                   sample_weight=w_test)

        for clstr_name in clstrs.keys():
            X_test_ = X_test.copy(deep=True)
            # Shuffle every column in the cluster so the whole cluster carries no information
            for k in clstrs[clstr_name]:
                np.random.shuffle(X_test_[k].values)
            scores_perm.loc[idx, clstr_name] = evaluate(clf_fit, X_test_, y_test,
                                                        scoring, sample_weight=w_test)
    # (original score) - (permuted score)
    imprv = (-scores_perm).add(scores, axis=0)
    # Normalize by the maximum achievable improvement
    if scoring == 'neg_log_loss':
        max_imprv = -scores_perm
    else:
        max_imprv = 1. - scores_perm
    imp = imprv / max_imprv
    imp = pd.concat({'mean': imp.mean(), 'std': imp.std() * imp.shape[0] ** -0.5}, axis=1)
    imp.index = [f"C_{i}" for i in imp.index]
    return imp
--------------------------------------------------------------------------------
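
Usage sketch (not part of the repository): a minimal, hedged example of driving the helpers in importance.py. It assumes scikit-learn is installed; the synthetic make_classification data, the RandomForestClassifier settings, the feat_i column names, and the arbitrary two-cluster split are all illustrative assumptions, not the library's prescribed workflow. The import path follows the repository layout shown above.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from finance_ml.importance import (feat_imp_MDI, feat_imp_MDA,
                                   feat_imp_MDI_clustered, feat_imp_MDA_clustered)

# Synthetic classification problem with named feature columns (illustrative only)
X_arr, y_arr = make_classification(n_samples=500, n_features=8,
                                   n_informative=4, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"feat_{i}" for i in range(X_arr.shape[1])])
y = pd.Series(y_arr)

# MDI works on an already-fitted tree ensemble; max_features=1 matches the
# assumption behind replacing zero importances with NaN
forest = RandomForestClassifier(n_estimators=100, max_features=1,
                                random_state=0).fit(X, y)
mdi = feat_imp_MDI(forest, feat_names=X.columns)
print(mdi.sort_values('mean', ascending=False))

# MDA refits the classifier inside cross validation; with t1=None it falls
# back to plain KFold, so no purging or embargo is applied in this toy case
mda = feat_imp_MDA(RandomForestClassifier(n_estimators=100, random_state=0),
                   X, y, scoring='accuracy', n_splits=5)
print(mda.sort_values('mean', ascending=False))

# Clustered variants take a dict mapping a cluster name to its member columns;
# this two-way split is arbitrary and purely for illustration
clstrs = {0: [f"feat_{i}" for i in range(4)],
          1: [f"feat_{i}" for i in range(4, 8)]}
print(feat_imp_MDI_clustered(forest, feat_names=X.columns, clstrs=clstrs))
print(feat_imp_MDA_clustered(RandomForestClassifier(n_estimators=100, random_state=0),
                             X, y, clstrs, scoring='accuracy', n_splits=5))

Note that purging and embargo only engage when t1 (event end times) is supplied, in which case both MDA helpers construct a PurgedKFold instead of scikit-learn's plain KFold.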