├── finance_ml ├── __init__.py ├── breakout │ ├── __init__.py │ └── get_bsadf.py ├── constants.py ├── datasets │ ├── __init__.py │ └── generate.py ├── multiprocessing │ ├── __init__.py │ ├── partition.py │ ├── pandas.py │ └── utils.py ├── risk │ ├── __init__.py │ └── failure.py ├── stats │ ├── __init__.py │ ├── rolling.py │ └── vol.py ├── hierarchical_clustering │ ├── __init__.py │ ├── metrics.py │ ├── utils.py │ ├── quasi.py │ └── allocation.py ├── labeling │ ├── __init__.py │ ├── utils.py │ ├── sampling.py │ ├── trend.py │ ├── betsides.py │ ├── betsizes.py │ └── barriers.py ├── features │ ├── __init__.py │ ├── orth.py │ ├── fraction.py │ └── entropy.py ├── sampling │ ├── __init__.py │ ├── utils.py │ ├── decay.py │ ├── weight.py │ ├── bootstrap.py │ ├── co_events.py │ └── time_weight.py ├── model_selection │ ├── __init__.py │ ├── distribution.py │ ├── pipeline.py │ ├── hyper.py │ ├── score.py │ ├── utils.py │ └── kfold.py ├── utils.py ├── distance.py ├── denoising.py ├── experiments.py ├── clustering.py └── importance.py ├── setup.py ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ └── index.rst ├── .gitignore ├── datasets.py ├── LICENSE └── README.md /finance_ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finance_ml/breakout/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finance_ml/constants.py: -------------------------------------------------------------------------------- 1 | LONG = 1 2 | SHORT = -1 -------------------------------------------------------------------------------- /finance_ml/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .generate import get_cls_data -------------------------------------------------------------------------------- /finance_ml/multiprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .pandas import mp_pandas_obj -------------------------------------------------------------------------------- /finance_ml/risk/__init__.py: -------------------------------------------------------------------------------- 1 | from .failure import calc_prob_sr, prob_failure 2 | -------------------------------------------------------------------------------- /finance_ml/stats/__init__.py: -------------------------------------------------------------------------------- 1 | from .vol import get_vol, get_mean 2 | from .rolling import pandas_rolling -------------------------------------------------------------------------------- /finance_ml/hierarchical_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from .allocation import get_rec_bipart, get_hrp 2 | from .quasi import get_quasi_diag 3 | from .metrics import get_corr_dist -------------------------------------------------------------------------------- /finance_ml/labeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .barriers import get_events, get_t1, get_barrier_labels 2 | from .sampling import cusum_filter 3 | from .trend import get_bins_from_trend -------------------------------------------------------------------------------- /finance_ml/features/__init__.py: 
--------------------------------------------------------------------------------
1 | from .orth import get_evec, ortho_feats
2 | from .entropy import match_length, lempel_zib_lib, get_entropy_rate, plug_in, konto
3 | from .fraction import get_opt_d, frac_diff_FFD
--------------------------------------------------------------------------------
/finance_ml/sampling/__init__.py:
--------------------------------------------------------------------------------
1 | from .co_events import get_num_co_events
2 | from .bootstrap import seq_bootstrap
3 | from .time_weight import get_sample_weight, get_uniq_weight
4 | from .decay import get_time_decay
--------------------------------------------------------------------------------
/finance_ml/model_selection/__init__.py:
--------------------------------------------------------------------------------
1 | from .kfold import PurgedKFold, CPKFold, generate_signals
2 | from .score import cv_score
3 | from .pipeline import Pipeline
4 | from .hyper import clf_hyper_fit
5 | from .distribution import LogUniformGen, log_uniform
6 | from .utils import evaluate
--------------------------------------------------------------------------------
/finance_ml/hierarchical_clustering/metrics.py:
--------------------------------------------------------------------------------
1 | def get_corr_dist(corr):
2 |     """Calculate correlation distance
3 | 
4 |     Params
5 |     ------
6 |     corr: pd.DataFrame
7 | 
8 |     Returns
9 |     -------
10 |     pd.DataFrame
11 |     """
12 |     dist = ((1 - corr) / 2)**.5
13 |     return dist
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | 
4 | setup(
5 |     name="finance_ml",
6 |     version='0.1',
7 |     description='utility library for finance',
8 |     author='jjakimoto',
9 |     author_email='f.j.akimoto@gmail.com',
10 |     packages=find_packages(),
11 |     py_modules=["finance_ml"]
12 | )
--------------------------------------------------------------------------------
/finance_ml/stats/rolling.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | 
4 | def pandas_rolling(series, window, freq=1, method='mean'):
5 |     series_list = []
6 |     for i in range(freq):
7 |         _series = series.iloc[i::freq].rolling(window).agg(method)
8 |         series_list.append(_series)
9 |     return pd.concat(series_list, axis=0).sort_index()
--------------------------------------------------------------------------------
/finance_ml/model_selection/distribution.py:
--------------------------------------------------------------------------------
1 | from scipy.stats import rv_continuous
2 | import numpy as np
3 | 
4 | 
5 | class LogUniformGen(rv_continuous):
6 |     def _cdf(self, x):
7 |         return np.log(x / self.a) / np.log(self.b / self.a)
8 | 
9 | 
10 | def log_uniform(a=1, b=np.exp(1)):
11 |     return LogUniformGen(a=a, b=b, name='log_uniform')
--------------------------------------------------------------------------------
/finance_ml/model_selection/pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.pipeline import Pipeline as _Pipeline
2 | 
3 | 
4 | class Pipeline(_Pipeline):
5 |     def fit(self, X, y, sample_weight=None, **fit_params):
6 |         if sample_weight is not None:
7 |             fit_params[self.steps[-1][0] + '__sample_weight'] = sample_weight
8 |         return super(Pipeline, self).fit(X, y, **fit_params)
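A quick usage sketch for the `Pipeline` subclass above: it forwards a top-level `sample_weight` argument to the final step through scikit-learn's `<step_name>__sample_weight` fit-parameter convention. The scaler, classifier, and step names below are illustrative choices, not part of this repository; only `finance_ml.model_selection.Pipeline` comes from the package itself.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from finance_ml.model_selection import Pipeline

# Toy data; in practice the weights would come from finance_ml.sampling
# (e.g. uniqueness- or time-decay-based sample weights).
X = np.random.randn(100, 5)
y = (X[:, 0] > 0).astype(int)
w = np.random.rand(100)

pipe = Pipeline([('scaler', StandardScaler()),
                 ('clf', RandomForestClassifier(n_estimators=10))])
# fit() rewrites this into fit_params['clf__sample_weight'] = w and delegates
# to sklearn's Pipeline.fit, so only the last step receives the weights.
pipe.fit(X, y, sample_weight=w)
```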
--------------------------------------------------------------------------------
/finance_ml/utils.py:
--------------------------------------------------------------------------------
1 | import numbers
2 | from copy import deepcopy
3 | 
4 | import numpy as np
5 | 
6 | 
7 | def sign_log(x, scale=1):
8 |     const = 1
9 |     if isinstance(x, numbers.Number):
10 |         if x >= 0:
11 |             return np.log(const + scale * x)
12 |         else:
13 |             return -np.log(const + scale * np.abs(x))
14 |     x = deepcopy(x)
15 |     x[x >= 0] = np.log(const + scale * np.abs(x[x >= 0]))
16 |     x[x < 0] = -np.log(const + scale * np.abs(x[x < 0]))
17 |     return x
--------------------------------------------------------------------------------
/finance_ml/sampling/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | 
4 | def get_ind_matrix(bar_idx, t1):
5 |     ind_m = pd.DataFrame(0, index=bar_idx,
6 |                          columns=range(t1.shape[0]))
7 |     for i, (t0_, t1_) in enumerate(t1.iteritems()):
8 |         ind_m.loc[t0_:t1_, i] = 1
9 |     return ind_m
10 | 
11 | 
12 | def get_avg_uniq(ind_m, c=None):
13 |     if c is None:
14 |         c = ind_m.sum(axis=1)
15 |     ind_m = ind_m.loc[c > 0]
16 |     c = c.loc[c > 0]
17 |     u = ind_m.div(c, axis=0)
18 |     avg_u = u[u > 0].mean()
19 |     avg_u = avg_u.fillna(0)
20 |     return avg_u
21 | 
--------------------------------------------------------------------------------
/finance_ml/hierarchical_clustering/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import random
4 | 
5 | 
6 | def generateData(nObs, size0, size1, sigma1):
7 |     # Time series of correlated variables
8 |     # 1) generating some uncorrelated data
9 |     np.random.seed(seed=12345)
10 |     random.seed(12345)
11 |     x = np.random.normal(0, 1, size=(nObs, size0))  # each row is a variable
12 |     # 2) creating correlation between the variables
13 |     cols = [random.randint(0, size0 - 1) for i in range(size1)]
14 |     y = x[:, cols] + np.random.normal(0, sigma1, size=(nObs, len(cols)))
15 |     x = np.append(x, y, axis=1)
16 |     x = pd.DataFrame(x, columns=range(1, x.shape[1] + 1))
17 |     return x, cols
18 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /finance_ml/sampling/decay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get_time_decay(uniq_weight, last=1.): 6 | """Calculate time decay weight 7 | 8 | Params 9 | ------ 10 | uniq_weight: pd.Series 11 | Sampling weight calculated label uniqueness 12 | last: float, default 1, no decay 13 | Parameter to detemine the slope and constant 14 | 15 | Returns 16 | ------- 17 | pd.Series 18 | """ 19 | weight = uniq_weight.sort_index().cumsum() 20 | if last > 0: 21 | slope = (1 - last) / weight.iloc[-1] 22 | else: 23 | slope = 1 / ((1 + last) * weight.iloc[-1]) 24 | const = 1. - slope * weight.iloc[-1] 25 | weight = const + slope * weight 26 | weight[weight < 0] = 0 27 | return weight -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | *.pyc 10 | *.swp 11 | 12 | # Packages # 13 | ############ 14 | # it's better to unpack these files and commit the raw source 15 | # git has its own built in compression methods 16 | *.7z 17 | *.dmg 18 | *.gz 19 | *.iso 20 | *.jar 21 | *.rar 22 | *.tar 23 | *.zip 24 | *.meta 25 | *.index 26 | *.ckpt* 27 | # Logs and databases # 28 | ###################### 29 | .ipynb_checkpoints 30 | *.log 31 | *.sql 32 | *.sqlite 33 | /*.egg-info 34 | 35 | # OS generated files # 36 | ###################### 37 | .DS_Store 38 | .DS_Store? 39 | ._* 40 | .Spotlight-V100 41 | .Trashes 42 | ehthumbs.db 43 | Thumbs.db 44 | dist 45 | build 46 | build/* 47 | .idea 48 | params 49 | */data/* 50 | .vscode 51 | -------------------------------------------------------------------------------- /finance_ml/sampling/weight.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get_time_decay(tw, last_w=1., truncate=0, is_exp=False): 6 | cum_w = tw.sort_index().cumsum() 7 | init_w = 1. 8 | if is_exp: 9 | init_w = np.log(init_w) 10 | if last_w >= 0: 11 | if is_exp: 12 | last_w = np.log(last_w) 13 | slope = (init_w - last_w) / cum_w.iloc[-1] 14 | else: 15 | slope = init_w / ((last_w + 1) * cum_w.iloc[-1]) 16 | const = init_w - slope * cum_w.iloc[-1] 17 | weights = const + slope * cum_w 18 | if is_exp: 19 | weights =np.exp(weights) 20 | weights[weights < truncate] = 0 21 | return weights 22 | 23 | 24 | def get_sample_tw(t1, num_co_events, molecule): 25 | wght = pd.Series(index=molecule) 26 | for t_in, t_out in t1.loc[wght.index].iteritems(): 27 | wght.loc[t_in] = (1. / num_co_events.loc[t_in: t_out]).mean() 28 | return wght -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /finance_ml/labeling/utils.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | 3 | 4 | def drop_labels(events, min_pct=0.05): 5 | while True: 6 | df = events['bin'].value_counts(normalize=True) 7 | if df.min() > min_pct or df.shape[0] < 3: 8 | break 9 | print('dropped label', df.argmin(), df.min()) 10 | events = events[events['bin'] != df.argmin()] 11 | return events 12 | 13 | 14 | def get_partial_index(df, start=None, end=None): 15 | """Get partial time index according to start and end 16 | 17 | Args: 18 | df (pd.DatFrame or pd.Series) 19 | 20 | start (datetime.datetime, optional): e.g., datetime(2018, 1, 1) 21 | 22 | end (datetime.datetime, optional): e.g., dateteim(2018, 3, 1) 23 | 24 | Returns: 25 | pd.DatetimeIndex 26 | """ 27 | if start is not None: 28 | df = df.loc[df.index >= start] 29 | if end is not None: 30 | df = df.loc[df.index <= end] 31 | return df.index 32 | -------------------------------------------------------------------------------- /finance_ml/sampling/bootstrap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .utils import get_avg_uniq 3 | 4 | 5 | def seq_bootstrap(ind_m, s_length=None): 6 | if s_length is None: 7 | s_length = ind_m.shape[1] 8 | phi = [] 9 | while len(phi) < s_length: 10 | c = ind_m[phi].sum(axis=1) + 1 11 | avg_u = get_avg_uniq(ind_m, c) 12 | prob = (avg_u / avg_u.sum()).values 13 | phi += [np.random.choice(ind_m.columns, p=prob)] 14 | return phi 15 | 16 | 17 | def get_ind_matrix(timestamps, t1, num_threads=1): 18 | return mp_pandas_obj( 19 | mp_ind_matrix, ('molecule', t1.index), 20 | num_threads, 21 | timestamps=timestamps, 22 | t1=t1) 23 | 24 | 25 | def mp_ind_matrix(timestamps, t1, molecule): 26 | t1 = t1.loc[molecule] 27 | ind_matrix = pd.DataFrame(0, index=timestamps, columns=molecule) 28 | for i, (t0, t1) in enumerate(t1.iteritems()): 29 | ind_matrix.loc[t0:t1] = 1 30 | return ind_matrix -------------------------------------------------------------------------------- /finance_ml/stats/vol.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def _get_ret(close, span=100, days=None, seconds=None): 6 | """Estimate exponential average volatility""" 7 | if days is None: 8 | delta = pd.Timedelta(seconds=seconds) 9 | else: 10 | delta = pd.Timedelta(days=days) 11 | use_idx = close.index.searchsorted(close.index - delta) 12 | prev_idx = pd.Series(use_idx, index=close.index) 13 | prev_idx = prev_idx[prev_idx > 0] 14 | # Get rid of duplications in index 15 | prev_idx = prev_idx.drop_duplicates() 16 | ret = close[prev_idx.index] / close[prev_idx].values - 1 17 | vol = ret.ewm(span=span).std() 18 | return vol 19 | 20 | 21 | def get_vol(close, span=100, days=None, seconds=None): 22 | ret = _get_ret(close, span, days, seconds) 
23 | vol = ret.ewm(span=span).std() 24 | return vol 25 | 26 | 27 | def get_mean(close, span=100, days=None, seconds=None): 28 | ret = _get_ret(close, span, days, seconds) 29 | mean = ret.ewm(span=span).mean() 30 | return mean 31 | -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.datasets import make_classification 3 | 4 | 5 | def get_test_data(n_features=40, n_informative=10, n_redundant=10, n_samples=10000): 6 | X, cont = make_classification(n_samples=n_samples, n_features=n_features, 7 | n_informative=n_informative, n_redundant=n_redundant, 8 | random_state=0, shuffle=False) 9 | time_idx = pd.DatetimeIndex(periods=n_samples, freq=pd.tseries.offsets.BDay(), 10 | end=pd.datetime.today()) 11 | X = pd.DataFrame(X, index=time_idx) 12 | cont = pd.Series(cont, index=time_idx).to_frame('bin') 13 | # Create name of columns 14 | columns = ['I_' + str(i) for i in range(n_informative)] 15 | columns += ['R_' + str(i) for i in range(n_redundant)] 16 | columns += ['N_' + str(i) for i in range(n_features - len(columns))] 17 | X.columns = columns 18 | cont['w'] = 1. / cont.shape[0] 19 | cont['t1'] = pd.Series(cont.index, index=cont.index) 20 | return X, cont 21 | -------------------------------------------------------------------------------- /finance_ml/datasets/generate.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import make_classification 2 | import pandas as pd 3 | 4 | 5 | def get_cls_data(n_features=40, n_informative=10, n_redundant=10, n_samples=10000): 6 | X, cont = make_classification(n_samples=n_samples, n_features=n_features, 7 | n_informative=n_informative, n_redundant=n_redundant, 8 | random_state=0, shuffle=False) 9 | time_idx = pd.DatetimeIndex(periods=n_samples, freq=pd.tseries.offsets.BDay(), 10 | end=pd.datetime.today()) 11 | X = pd.DataFrame(X, index=time_idx) 12 | cont = pd.Series(cont, index=time_idx).to_frame('bin') 13 | # Create name of columns 14 | columns = ['I_' + str(i) for i in range(n_informative)] 15 | columns += ['R_' + str(i) for i in range(n_redundant)] 16 | columns += ['N_' + str(i) for i in range(n_features - len(columns))] 17 | X.columns = columns 18 | cont['w'] = 1. / cont.shape[0] 19 | cont['t1'] = pd.Series(cont.index, index=cont.index) 20 | return X, cont -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Tomoaki Fujii 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /finance_ml/hierarchical_clustering/quasi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get_quasi_diag(link): 6 | """Calculate quasi diagonalization 7 | 8 | Params 9 | ------ 10 | link: list 11 | Result from hierachical clustering of scipy 12 | 13 | Returns 14 | ------- 15 | pd.Series: sorted index 16 | """ 17 | # Make labels integers 18 | link = link.astype(int) 19 | sort_idx = pd.Series([link[-1, 0], link[-1, 1]]) 20 | num_items = link[-1, 3] 21 | # Iterate until all elements are assigned 22 | while sort_idx.max() >= num_items: 23 | # Gerante index for the first element of cluster 24 | sort_idx.index = range(0, sort_idx.shape[0] * 2, 2) 25 | # Get clustered value not single elements 26 | clusters = sort_idx[sort_idx >= num_items] 27 | idx = clusters.index 28 | # Add clusters 29 | cl_idx = clusters.values - num_items 30 | sort_idx[idx] = link[cl_idx, 0] 31 | df = pd.Series(link[cl_idx, 1], index=idx + 1) 32 | sort_idx = sort_idx.append(df) 33 | # Resort 34 | sort_idx = sort_idx.sort_index() 35 | sort_idx.index = range(sort_idx.shape[0]) 36 | return sort_idx.tolist() -------------------------------------------------------------------------------- /finance_ml/sampling/co_events.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from ..multiprocessing import mp_pandas_obj 5 | 6 | 7 | def mp_num_co_events(timestamps, t1, molecule): 8 | """Calculate the number of co events for multiprocessing""" 9 | # Find events that span the period defined by molecule 10 | t1 = t1.fillna(timestamps[-1]) 11 | t1 = t1[t1 >= molecule[0]] 12 | t1 = t1.loc[:t1[molecule].max()] 13 | # Count the events 14 | iloc = timestamps.searchsorted(np.array([t1.index[0], t1.max()])) 15 | count = pd.Series(0, index=timestamps[iloc[0]:iloc[1] + 1]) 16 | for t_in, t_out in t1.iteritems(): 17 | count.loc[t_in:t_out] += 1 18 | return count.loc[molecule[0]:t1[molecule].max()] 19 | 20 | 21 | def get_num_co_events(timestamps, t1, num_threads=1): 22 | """Calculate the number of co events 23 | 24 | Params 25 | ------ 26 | timestamps: DatetimeIndex 27 | The timesstamps defining the range of searching 28 | t1: pd.Series 29 | num_threads: int 30 | 31 | Returns 32 | pd.Series: each value corresponds to the number of co occurence 33 | """ 34 | return mp_pandas_obj( 35 | mp_num_co_events, ('molecule', t1.index), 36 | num_threads, 37 | timestamps=timestamps, 38 | t1=t1) 39 | -------------------------------------------------------------------------------- /finance_ml/labeling/sampling.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import pandas as pd 3 | 4 | 5 | def cusum_filter(close, h, k=0): 6 | """Sample points with CUSUM Filter 7 | 8 | Args: 9 | close (pd.Series): Price series. 
10 | 11 | h (float or pd.Series): Threasholds to sample points.\ 12 | If specified with float, translate to pd.Series(h, index=close.index) 13 | 14 | k (float, optional): Minimum speed parameter to hit threashold.\ 15 | Defaults to 0, which means inactive 16 | 17 | Returns: 18 | pd.DatetimeIndex: Sampled data points 19 | """ 20 | # asssum that E y_t = y_{t-1} 21 | s_pos, s_neg = 0, 0 22 | diff = close.diff().dropna() 23 | # time variant threshold 24 | if isinstance(h, numbers.Number): 25 | h = pd.Series(h, index=diff.index) 26 | h = h.reindex(diff.index, method='bfill') 27 | h = h.dropna() 28 | timestamps = [] 29 | th = h.loc[h.index[0]] 30 | for t in h.index: 31 | s_pos = max(0, s_pos + diff.loc[t] - k) 32 | s_neg = min(0, s_neg + diff.loc[t] + k) 33 | if s_pos > th: 34 | s_pos = 0 35 | timestamps.append(t) 36 | th = h.loc[t] 37 | elif s_neg < -th: 38 | s_neg = 0 39 | timestamps.append(t) 40 | th = h.loc[t] 41 | return pd.DatetimeIndex(timestamps) 42 | -------------------------------------------------------------------------------- /finance_ml/risk/failure.py: -------------------------------------------------------------------------------- 1 | def calc_prob_sr(pt, sl, freq, tgt_sr, rf=0.): 2 | """Calculate required probability wrt target SR 3 | 4 | Paramters 5 | --------- 6 | pt: float 7 | Profit Take 8 | sl: float 9 | Stop Loss 10 | freq: float 11 | Frequency of trading 12 | tgt_sr: float 13 | Target Sharpe Ratio 14 | rf: float, (default 0) 15 | Risk Free Rate 16 | 17 | Returns 18 | ------- 19 | float: Required probability 20 | """ 21 | diff = pt - sl 22 | a = (freq + tgt_sr ** 2) * diff ** 2 23 | b = diff * (2 * freq * (sl - rf) - tgt_sr ** 2 * diff) 24 | c = freq * (sl - rf) ** 2 25 | p = (-b + (b ** 2 - 4 * a * c) ** .5) / (2. * a) 26 | return p 27 | 28 | 29 | def prob_failure(ret, freq, tgt_sr): 30 | """ 31 | Calculate the probability to fail in achieving 32 | target Sharpe Ratio 33 | 34 | Parameters 35 | ---------- 36 | ret: array-like 37 | Returns of trading 38 | freq: float 39 | Frequency of trading 40 | tgt_sr: float 41 | Aiming Sharpe Ratio 42 | 43 | Returns 44 | ------- 45 | risk: float 46 | """ 47 | r_pos = ret[ret > 0].mean() 48 | r_neg = ret[ret <= 0].mean() 49 | p = ret[ret > 0].shape[0] / float(ret.shape[0]) 50 | th_p = calc_prob_sr(r_pos, r_neg, freq, tgt_sr) 51 | risk = ss.norm.cdf(th_p, p, p * (1 - p)) 52 | return risk -------------------------------------------------------------------------------- /finance_ml/multiprocessing/partition.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def linear_parts(num_atoms, num_threads): 5 | """Linear partitions 6 | 7 | Args: 8 | num_atoms (int): The number of data points 9 | 10 | num_threads (int): The number of partitions to split 11 | 12 | Returns: 13 | array-like: indices of start and end 14 | """ 15 | parts = np.linspace(0, num_atoms, min(num_threads, num_atoms) + 1) 16 | parts = np.ceil(parts).astype(int) 17 | return parts 18 | 19 | 20 | def nested_parts(num_atoms, num_threads, descend=False): 21 | """Nested partitions 22 | 23 | Args: 24 | num_atoms (int): The number of data points 25 | 26 | num_threads (int): The number of partitions to split 27 | 28 | descend (bool, optional): If True, the size of partitions are decreasing. 29 | Defaults to False. 
30 | 
31 |     Returns:
32 |         array-like: indices of start and end
33 |     """
34 |     parts = [0]
35 |     num_threads = min(num_threads, num_atoms)
36 |     for num in range(num_threads):
37 |         part = 1 + 4 * (parts[-1] ** 2 + parts[-1] + num_atoms * (num_atoms + 1.) / num_threads)
38 |         part = 0.5 * (-1 + np.sqrt(part))
39 |         parts.append(part)
40 |     if descend:
41 |         # Computational load decreases as index increases
42 |         parts = np.cumsum(np.diff(parts)[::-1])
43 |         parts = np.append(np.array([0]), parts)
44 |     parts = np.round(parts).astype(int)
45 |     return parts
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # finance_ml
2 | Python implementations of Machine Learning helper functions for Quantitative Finance, based on the books
3 | [Advances in Financial Machine Learning](https://www.amazon.co.jp/Advances-Financial-Machine-Learning-English-ebook/dp/B079KLDW21) and [Machine Learning for Asset Managers](https://www.amazon.com/Machine-Learning-Managers-Elements-Quantitative/dp/1108792898), written by `Marcos Lopez de Prado`.
4 | 
5 | # Installation
6 | Execute the following command:
7 | ```bash
8 | python setup.py install
9 | ```
10 | 
11 | or
12 | 
13 | simply add `your/path/to/finance_ml` to your PYTHONPATH.
14 | 
15 | # Implementation
16 | The following modules are implemented:
17 | * Labeling
18 | * Multiprocessing
19 | * Sampling
20 | * Feature Selection
21 | * Asset Allocation
22 | * Breakout Detection
23 | 
24 | # Examples
25 | Some example notebooks can be found under the folder `MLAssetManagers`.
26 | 
27 | ## multiprocessing
28 | Parallel computing using the `multiprocessing` library.
29 | Here is an example of applying a function to each element in parallel.
30 | ```python
31 | import pandas as pd
32 | import numpy as np
33 | 
34 | def apply_func(x):
35 |     return x ** 2
36 | 
37 | def func(df, timestamps, f):
38 |     df_ = df.loc[timestamps]
39 |     for idx, x in df_.items():
40 |         df_.loc[idx] = f(x)
41 |     return df_
42 | 
43 | df = pd.Series(np.random.randn(10000))
44 | from finance_ml.multiprocessing import mp_pandas_obj
45 | 
46 | results = mp_pandas_obj(func, pd_obj=('timestamps', df.index),
47 |                         num_threads=24, df=df, f=apply_func)
48 | print(results.head())
49 | ```
50 | Output:
51 | ```
52 | 0    0.449278
53 | 1    1.411846
54 | 2    0.157630
55 | 3    4.949410
56 | 4    0.601459
57 | ```
58 | 
59 | For more detail, please refer to the example notebooks!
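Beyond the `multiprocessing` example in the README, the labeling and stats modules combine in the same way as the snippet in `docs/source/index.rst`. The sketch below follows that example but swaps its `get_daily_vol` helper for `get_vol` from `finance_ml.stats`; the synthetic `close` series and the specific parameter values are illustrative assumptions, not outputs of this repository.

```python
import numpy as np
import pandas as pd

from finance_ml.labeling import get_barrier_labels, cusum_filter
from finance_ml.stats import get_vol

# Synthetic daily close prices as a stand-in for real market data
idx = pd.date_range('2018-01-01', periods=1000, freq='B')
close = pd.Series(100 * np.exp(np.random.randn(1000).cumsum() * 0.01), index=idx)

vol = get_vol(close, span=100, days=1)   # EWM volatility of 1-day returns
timestamps = cusum_filter(close, vol)    # event times picked by the CUSUM filter
labels = get_barrier_labels(close, timestamps, vol, sltp=[1, 1],
                            num_days=1, min_ret=0, num_threads=16)
print(labels.head())
```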
-------------------------------------------------------------------------------- /finance_ml/model_selection/hyper.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 2 | from sklearn.ensemble import BaggingClassifier 3 | 4 | from .kfold import PurgedKFold 5 | from .pipeline import Pipeline 6 | 7 | 8 | def clf_hyper_fit(feat, label, t1, pipe_clf, search_params, scoring=None, 9 | n_splits=3, bagging=[0, None, 1.], 10 | rnd_search_iter=0, n_jobs=-1, pct_embargo=0., **fit_params): 11 | # Set default value for scoring 12 | if scoring is None: 13 | if set(label.values) == {0, 1}: 14 | scoring = 'f1' 15 | else: 16 | scoring = 'neg_log_loss' 17 | # HP search on training data 18 | inner_cv = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo) 19 | if rnd_search_iter == 0: 20 | search = GridSearchCV(estimator=pipe_clf, param_grid=search_params, 21 | scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False) 22 | else: 23 | search = RandomizedSearchCV(estimator=pipe_clf, param_distributions=search_params, 24 | scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False) 25 | best_pipe = search.fit(feat, label, **fit_params).best_estimator_ 26 | # Fit validated model on the entirely of data 27 | if bagging[0] > 0: 28 | bag_est = BaggingClassifier(base_estimator=Pipeline(best_pipe.steps), 29 | n_estimators=int(bagging[0]), max_samples=float(bagging[1]), 30 | max_features=float(bagging[2]), n_jobs=n_jobs) 31 | bag_est = best_pipe.fit(feat, label, 32 | sample_weight=fit_params[bag_est.base_estimator.steps[-1][0] + '__sample_weight']) 33 | best_pipe = Pipeline([('bag', bag_est)]) 34 | return best_pipe -------------------------------------------------------------------------------- /finance_ml/features/orth.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get_evec(dot, var_th): 6 | """Calculate eigen values and vectors 7 | 8 | Params 9 | ------ 10 | dot: pd.DataFrame 11 | Z score product dataframe 12 | var_th: float 13 | Threshold for the explanation of variance 14 | 15 | Returns 16 | ------- 17 | e_val: pd.Series, eigen values 18 | e_vec: pd.DataFrame, eigen vectors 19 | """ 20 | # Compute and sort eigen vectors and values for dot product matrix 21 | e_val, e_vec = np.linalg.eigh(dot) 22 | idx = e_val.argsort()[::-1] 23 | e_val, e_vec = e_val[idx], e_vec[:, idx] 24 | # Labeling features 25 | e_val = pd.Series(e_val, index=['PC_' + str(i + 1) for i in range(e_val.shape[0])]) 26 | e_vec = pd.DataFrame(e_vec, index=dot.index, columns=e_val.index) 27 | e_vec = e_vec.loc[:, e_val.index] 28 | # Reduce dimension from threshold 29 | cum_var = e_val.cumsum() / e_val.sum() 30 | dim = cum_var.searchsorted(var_th)[0] 31 | e_val = e_val.iloc[:dim + 1] 32 | e_vec = e_vec.iloc[:, :dim + 1] 33 | return e_val, e_vec 34 | 35 | 36 | def ortho_feats(dfX, var_th=.95): 37 | """Compute orthgonal features with threshold 38 | 39 | Params 40 | ------ 41 | dfX: pd.DataFrame 42 | Feataures dataframe 43 | var_th: float 44 | Threshold for the explanation of variance 45 | 46 | Returns 47 | ------- 48 | pd.DataFrame: orthogonal feature 49 | """ 50 | Z = (dfX.values - dfX.mean().values) / dfX.std().values 51 | dot = pd.DataFrame(np.dot(Z.T, Z), index=dfX.columns, columns=dfX.columns) 52 | e_val, e_vec = get_evec(dot, var_th) 53 | dfP = pd.DataFrame(np.dot(Z, e_vec), index=dfX.index, 54 | columns=['PC_' + str(i + 1) for i in 
range(e_vec.shape[1])]) 55 | return dfP -------------------------------------------------------------------------------- /finance_ml/labeling/trend.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import multiprocessing as mp 4 | 5 | import statsmodels.api as sm 6 | 7 | from ..multiprocessing import mp_pandas_obj 8 | 9 | 10 | def t_val_linreg(close): 11 | x = np.ones((close.shape[0], 2)) 12 | x[:, 1] = np.arange(close.shape[0]) 13 | ols = sm.OLS(close, x).fit() 14 | return ols.tvalues[1] 15 | 16 | def _get_bins_from_trend(molecule, close, min_step, max_step, step): 17 | out = pd.DataFrame(index=molecule, columns=['t1', 't_val','bin']) 18 | hrzns = list(range(min_step, max_step + 1, step)) 19 | for dt0 in molecule: 20 | iloc0 = close.index.get_loc(dt0) 21 | if iloc0 + max(hrzns) > close.shape[0]: 22 | continue 23 | df0 = pd.Series() 24 | for hrzn in hrzns: 25 | dt1 = close.index[iloc0 + hrzn - 1] 26 | df1 = close.loc[dt0:dt1] 27 | df0.loc[dt1] = t_val_linreg(df1.values) 28 | # Get maximum tstats point 29 | dt1 = df0.replace([-np.inf, np.inf, np.nan], 0).abs().idxmax() 30 | out.loc[dt0, ['t1', 't_val', 'bin']] = df0.index[-1], df0[dt1], np.sign(df0[dt1]) 31 | out['t1'] = pd.to_datetime(out['t1']) 32 | out['bin'] = pd.to_numeric(out['bin'], downcast='signed') 33 | return out.dropna(subset=['bin']) 34 | 35 | 36 | def get_bins_from_trend(close, max_step, min_step=3, step=1, num_threads=None): 37 | if num_threads is None: 38 | num_threads = mp.cpu_count() 39 | output = mp_pandas_obj(func=_get_bins_from_trend, 40 | pd_obj=('molecule', close.index), 41 | num_threads=num_threads, 42 | close=close, 43 | max_step=max_step, 44 | min_step=min_step, 45 | step=step) 46 | return output 47 | 48 | 49 | -------------------------------------------------------------------------------- /finance_ml/multiprocessing/pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from .partition import linear_parts, nested_parts 4 | from .utils import process_jobs 5 | 6 | 7 | def mp_pandas_obj(func, pd_obj, num_threads=1, mp_batches=1, 8 | linear_mols=True, 9 | descend=False, **kwargs): 10 | """Return multiprocessed results 11 | 12 | Args: 13 | func (function object) 14 | 15 | pd_obj (list): 16 | pd_obj[0], The name of parameters to be parallelized 17 | pd_obj[1], List of parameters to be parallelized 18 | 19 | mp_batches (int): The number of batches processed for each thread. 
20 | 21 | linear_mols (bool): 22 | If True, use linear partition 23 | If False, use nested partition 24 | 25 | descend (bool): The parameter for nested partitions 26 | 27 | kwargs: optional parameters of `func` 28 | 29 | Returns: 30 | The same type as the output of func 31 | """ 32 | if linear_mols: 33 | parts = linear_parts(len(pd_obj[1]), num_threads * mp_batches) 34 | else: 35 | parts = nested_parts(len(pd_obj[1]), num_threads * mp_batches, descend) 36 | jobs = [] 37 | for i in range(1, len(parts)): 38 | job = {pd_obj[0]: pd_obj[1][parts[i - 1]: parts[i]], 'func': func} 39 | job.update(kwargs) 40 | jobs.append(job) 41 | outputs = [x[0] for x in process_jobs(jobs, num_threads=num_threads)] 42 | # You can use either of pd.Series or pd.DatFrame 43 | if isinstance(outputs[0], pd.Series): 44 | df = pd.Series() 45 | elif isinstance(outputs[0], pd.DataFrame): 46 | df = pd.DataFrame() 47 | else: 48 | return outputs 49 | # The case of multiple threads 50 | for output in outputs: 51 | df = df.append(output) 52 | df = df.sort_index() 53 | return df -------------------------------------------------------------------------------- /finance_ml/sampling/time_weight.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from ..multiprocessing import mp_pandas_obj 5 | 6 | 7 | def mp_sample_weight(series, t1, num_co_events, molecule): 8 | weight = pd.Series(index=molecule) 9 | for t_in, t_out in t1.loc[weight.index].iteritems(): 10 | weight.loc[t_in] = ( 11 | series.loc[t_in:t_out] / num_co_events.loc[t_in:t_out]).sum() 12 | return weight.abs() 13 | 14 | 15 | def get_sample_weight(series, t1, num_co_events, num_threads=1): 16 | """Calculate sampeling weight with considering some attributes 17 | 18 | Params 19 | ------ 20 | series: pd.Series 21 | Used for assigning weight. Larger value, larger weight e.g., log return 22 | t1: pd.Series 23 | num_co_events: pd.Series 24 | num_threads: int 25 | 26 | Return 27 | ------ 28 | pd.Series 29 | """ 30 | weight = mp_pandas_obj( 31 | mp_sample_weight, ('molecule', t1.index), 32 | num_threads, 33 | series=series, 34 | t1=t1, 35 | num_co_events=num_co_events) 36 | return weight * weight.shape[0] / weight.sum() 37 | 38 | 39 | def mp_uniq_weight(t1, num_co_events, molecule): 40 | """Calculate time sample weight utilizing occurence events information""" 41 | wght = pd.Series(index=molecule) 42 | for t_in, t_out in t1.loc[wght.index].iteritems(): 43 | wght.loc[t_in] = (1. / num_co_events.loc[t_in:t_out]).mean() 44 | return wght 45 | 46 | 47 | def get_uniq_weight(t1, num_co_events, num_threads=1): 48 | """Calculate time sample weight utilizing occurence events information 49 | 50 | Params 51 | ------ 52 | t1: pd.Series 53 | num_co_events: pd.Series 54 | The number of co-occurence events 55 | num_threads: int 56 | 57 | Returns 58 | pd.Series 59 | """ 60 | return mp_pandas_obj( 61 | mp_uniq_weight, ('molecule', t1.index), 62 | num_threads, 63 | t1=t1, 64 | num_co_events=num_co_events) 65 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../..')) 16 | sys.setrecursionlimit(1500) 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'finance_ml' 21 | copyright = '2019, jjakimoto' 22 | author = 'jjakimoto' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = ['sphinx.ext.todo', 'sphinx.ext.viewcode', 'sphinx.ext.autodoc'] 31 | 32 | # Add any paths that contain templates here, relative to this directory. 33 | templates_path = ['_templates'] 34 | 35 | # List of patterns, relative to source directory, that match files and 36 | # directories to ignore when looking for source files. 37 | # This pattern also affects html_static_path and html_extra_path. 38 | exclude_patterns = [] 39 | 40 | 41 | # -- Options for HTML output ------------------------------------------------- 42 | 43 | # The theme to use for HTML and HTML Help pages. See the documentation for 44 | # a list of builtin themes. 45 | # 46 | html_theme = 'alabaster' 47 | 48 | # Add any paths that contain custom static files (such as style sheets) here, 49 | # relative to this directory. They are copied after the builtin static files, 50 | # so a file named "default.css" will overwrite the builtin "default.css". 
51 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /finance_ml/breakout/get_bsadf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def get_bsadf(series, tau, constant, lags): 5 | y, x = get_yx(series, constant=constant, lags=lags) 6 | if not isinstance(lags, int): 7 | lags = np.max(lags) 8 | start_points = range(0, y.shape[0] - tau + 1) 9 | basdf = None 10 | all_adf = [] 11 | for start in start_points: 12 | y_ = y[start:] 13 | x_ = x[start:] 14 | b_mean, b_var = get_betas(y_, x_) 15 | b_mean = b_mean[0,] 16 | b_std = b_var[0, 0] ** 0.5 17 | all_adf.append(b_mean / b_std) 18 | all_adf = np.array(all_adf) 19 | bsadf = np.max(all_adf[np.isfinite(all_adf)]) 20 | out = {'Time': series.index[-1], 'bsadf': bsadf} 21 | return out 22 | 23 | 24 | def get_yx(series, constant, lags): 25 | diff = series.diff().dropna() 26 | lag_feat = get_lag_features(diff, lags).dropna() 27 | # Add non diff feature 28 | lag_feat[series.name] = series.shift(1) 29 | index = lag_feat.dropna().index & diff.dropna().index 30 | x = lag_feat.loc[index].values 31 | y = diff.loc[index].values 32 | # Set constant value 33 | if constant != 'nc': 34 | const = np.ones((x.shape[0], 1)) 35 | x = np.hstack((x, const)) 36 | if constant[:2] == 'ct': 37 | trend = np.arange(x.shape[0]).reshape(-1, 1) 38 | x = np.hstack((x, trend)) 39 | if constant == 'ctt': 40 | x = np.hstack((x, trend ** 2)) 41 | return y, x 42 | 43 | 44 | def get_lag_features(series, lags): 45 | lag_feat = pd.DataFrame() 46 | if isinstance(lags, int): 47 | lags = range(1, lags + 1) 48 | else: 49 | lags = [int(lag) for lag in lags] 50 | for lag in lags: 51 | lag_feat[f'{series.name}_{lag}'] = series.shift(lag).copy(deep=True) 52 | return lag_feat 53 | 54 | def get_betas(y, x, lam=0): 55 | xy = np.dot(x.T, y) 56 | xx = np.dot(x.T, x) 57 | xxinv = np.linalg.inv(xx + lam) 58 | beta_mean = np.dot(xxinv, xy) 59 | err = y - np.dot(x, beta_mean) 60 | beta_var = np.dot(err.T, err) / (x.shape[0] - x.shape[1]) * xxinv 61 | return beta_mean, beta_var -------------------------------------------------------------------------------- /finance_ml/labeling/betsides.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import pandas as pd 3 | import numpy as np 4 | import multiprocessing as mp 5 | 6 | from ..multiprocessing import mp_pandas_obj 7 | 8 | 9 | def _cusum_side(diff, h, k=0, molecule=None): 10 | side = [] 11 | s_pos, s_neg = 0, 0 12 | timestamps = [] 13 | th = None 14 | for t in molecule: 15 | if th is None: 16 | th = h.loc[t] 17 | s_pos = max(0, s_pos + diff.loc[t] - k) 18 | s_neg = min(0, s_neg + diff.loc[t] + k) 19 | if s_pos > th: 20 | s_pos = 0 21 | timestamps.append(t) 22 | th = h.loc[t] 23 | side.append(1) 24 | elif s_neg < -th: 25 | s_neg = 0 26 | timestamps.append(t) 27 | th = h.loc[t] 28 | side.append(-1) 29 | side = pd.Series(side, index=pd.DatetimeIndex(timestamps)) 30 | return side 31 | 32 | 33 | def cusum_side(close, h, k=0, use_log=True, num_threads=None): 34 | """Sample points with CUSUM Filter and use its direction as betting side 35 | 36 | Args: 37 | close (pd.Series): Price series 38 | 39 | h (float or pd.Series): Threasholds to sampmle points.\ 40 | If specified with float, translate to pd.Series(h, index=close.index) 41 | 42 | k (float, optional): Minimum speed parameter to hit threashold.\ 43 | Defaults to 0, which means inactive 44 | 45 | Returns: 
46 | pd.Series: Betting sides at sampled points 47 | """ 48 | if num_threads is None: 49 | num_threads = mp.cpu_count() 50 | # asssum that E y_t = y_{t-1} 51 | side = [] 52 | s_pos, s_neg = 0, 0 53 | if use_log: 54 | diff = np.log(close).diff().dropna() 55 | else: 56 | diff = close.diff().dropna() 57 | # time variant threshold 58 | if isinstance(h, numbers.Number): 59 | h = pd.Series(h, index=diff.index) 60 | h = h.reindex(diff.index, method='bfill') 61 | h = h.dropna() 62 | side = mp_pandas_obj(func=_cusum_side, 63 | pd_obj=('molecule', h.index), 64 | num_threads=num_threads, 65 | diff=diff, h=h, k=k) 66 | return side -------------------------------------------------------------------------------- /finance_ml/hierarchical_clustering/allocation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.cluster.hierarchy as sch 4 | 5 | from .metrics import get_corr_dist 6 | from .quasi import get_quasi_diag 7 | 8 | 9 | def get_rec_bipart(cov, sort_idx): 10 | """Compute portfolio weight by recursive bisection 11 | 12 | Params 13 | ------ 14 | cov: pd.DataFrame 15 | sort_idx: pd.Series 16 | Sorted index by quasi diagonalization 17 | 18 | Returns 19 | ------- 20 | pd.Series 21 | """ 22 | weight = pd.Series(1, index=sort_idx) 23 | # Initialize all in one cluster 24 | cl_items = [sort_idx] 25 | while len(cl_items) > 0: 26 | cl_items_ = [] 27 | for cl in cl_items: 28 | # Split into half for each cluter 29 | if len(cl) >= 2: 30 | cl_items_.append(cl[0:len(cl) // 2]) 31 | cl_items_.append(cl[len(cl) // 2:len(cl)]) 32 | # Update cluster 33 | cl_items = cl_items_ 34 | for i in range(0, len(cl_items), 2): 35 | cl0 = cl_items[i] 36 | cl1 = cl_items[i + 1] 37 | var0 = get_cluster_var(cov, cl0) 38 | var1 = get_cluster_var(cov, cl1) 39 | alpha = var1 / (var0 + var1) 40 | weight[cl0] *= alpha 41 | weight[cl1] *= 1 - alpha 42 | return weight 43 | 44 | 45 | def get_ivp(cov): 46 | """Compute inverse variance portfolio 47 | 48 | Params 49 | ------ 50 | cov: pd.DataFrame 51 | 52 | Returns 53 | ------- 54 | np.array 55 | """ 56 | ivp = 1. 
/ np.diag(cov) 57 | ivp /= ivp.sum() 58 | return ivp 59 | 60 | 61 | def get_cluster_var(cov, cl_items): 62 | """Compute variance per cluster 63 | 64 | Params 65 | ------ 66 | cov: pd.DataFrame 67 | cl_items: pd.Series 68 | 69 | Returns 70 | ------- 71 | float 72 | """ 73 | cov_cl = cov.loc[cl_items, cl_items] 74 | w = get_ivp(cov_cl).reshape(-1, 1) 75 | cl_var = np.dot(np.dot(w.T, cov_cl), w)[0, 0] 76 | return cl_var 77 | 78 | 79 | def get_hrp(cov, corr): 80 | """Construct a hierarchical portfolio 81 | 82 | Params 83 | ------ 84 | cov: pd.DataFrame 85 | corr: pd.DataFrame 86 | 87 | Returns 88 | ------- 89 | pd.Series 90 | """ 91 | dist = get_corr_dist(corr) 92 | link = sch.linkage(dist, 'single') 93 | sort_idx = get_quasi_diag(link) 94 | # Recover label 95 | sort_idx = corr.index[sort_idx].tolist() 96 | hrp = get_rec_bipart(cov, sort_idx) 97 | return hrp.sort_index() -------------------------------------------------------------------------------- /finance_ml/distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as ss 4 | from sklearn.metrics import mutual_info_score 5 | 6 | def _fix_corr(corr): 7 | corr[corr > 1] = 1 8 | corr[corr < -1] = -1 9 | return corr.fillna(0) 10 | 11 | def corr_metric(corr, use_abs=False): 12 | corr = _fix_corr(corr) 13 | if use_abs: 14 | return np.sqrt(1 - np.abs(corr)) 15 | else: 16 | return np.sqrt(0.5 * (1 - corr)) 17 | 18 | def corr_metric_xy(x, y, use_abs=False): 19 | corr = np.corrcoef(x, y)[0, 1] 20 | return corr_metric(corr, use_abs) 21 | 22 | def _get_zeta(N): 23 | return (8 + 324 * N + 12 * (36 * N + 729 * N ** 2) ** 0.5) ** (1./3) 24 | 25 | def _num_bins(n_obs, corr=None): 26 | if corr is None or 1. - corr ** 2 < 1e-8: 27 | zeta = _get_zeta(n_obs) 28 | b = round(zeta / 6. + 2. / (3 * zeta) + 1. / 3) 29 | else: 30 | b = round(2 ** -0.5 * (1 + (1 + 24 * n_obs / (1. 
- corr ** 2)) ** 0.5) ** 0.5) 31 | return max(int(b), 2) 32 | 33 | 34 | def entropy(x, bx=None, is_cont=False): 35 | if bx is None: 36 | bx = _num_bins(x.shape[0]) 37 | hx = ss.entropy(np.histogram(x, bx)[0]) 38 | if is_cont: 39 | delta = (x.max() - x.min()) / bx 40 | hx += np.log(delta) 41 | return hx 42 | 43 | def joint_entropy(x, y, bxy=None, is_cont=False): 44 | if bxy is None: 45 | bxy = _num_bins(x.shape[0], corr=np.corrcoef(x, y)[0, 1]) 46 | cxy = np.histogram2d(x, y, bxy)[0] 47 | hx = ss.entropy(np.histogram(x, bxy)[0]) 48 | hy = ss.entropy(np.histogram(y, bxy)[0]) 49 | ixy = mutual_info_score(None, None, contingency=cxy) 50 | hxy = hx + hy - ixy 51 | if is_cont: 52 | deltax = (x.max() - x.min()) / bxy 53 | deltay = (y.max() - y.min()) / bxy 54 | hxy += np.log(deltax) + np.log(deltay) 55 | return hxy 56 | 57 | def cond_entropy(x, y, bxy=None, is_cont=False): 58 | if bxy is None: 59 | bxy = _num_bins(x.shape[0], corr=np.corrcoef(x, y)[0, 1]) 60 | cxy = np.histogram2d(x, y, bxy)[0] 61 | hx = ss.entropy(np.histogram(x, bxy)[0]) 62 | hy = ss.entropy(np.histogram(y, bxy)[0]) 63 | ixy = mutual_info_score(None, None, contingency=cxy) 64 | hxy = hx + hy - ixy 65 | if is_cont: 66 | deltax = (x.max() - x.min()) / bxy 67 | deltay = (y.max() - y.min()) / bxy 68 | hxy += np.log(deltax) + np.log(deltay) 69 | hy += np.log(deltay) 70 | return hxy - hy 71 | 72 | def variation_info(x, y, normalize=False): 73 | bxy = _num_bins(x.shape[0], corr=np.corrcoef(x, y)[0, 1]) 74 | cxy = np.histogram2d(x, y, bxy)[0] 75 | hx = ss.entropy(np.histogram(x, bxy)[0]) 76 | hy = ss.entropy(np.histogram(y, bxy)[0]) 77 | ixy = mutual_info_score(None, None, contingency=cxy) 78 | varxy = hx + hy - 2 * ixy 79 | if normalize: 80 | hxy = hx + hy - ixy 81 | varxy /= hxy 82 | return varxy 83 | 84 | def mutual_info(x, y, normalize=False): 85 | bxy = _num_bins(x.shape[0], corr=np.corrcoef(x, y)[0, 1]) 86 | cxy = np.histogram2d(x, y, bxy)[0] 87 | ixy = mutual_info_score(None, None, contingency=cxy) 88 | if normalize: 89 | hx = ss.entropy(np.histogram(x, bxy)[0]) 90 | hy = ss.entropy(np.histogram(y, bxy)[0]) 91 | ixy /= min(hx, hy) 92 | return ixy -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. finance_ml documentation master file, created by 2 | sphinx-quickstart on Sat Dec 28 14:57:57 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to ``finance_ml``'s documentation! 7 | =============================================== 8 | Python implementations of Machine Learning helper functions for Quantiative Finance based on a book, 9 | `Advances in Financial Machine Learning`_, written by ``Marcos Lopez de Prado``. 10 | 11 | .. 
_Advances in Financial Machine Learning: https://www.amazon.co.jp/Advances-Financial-Machine-Learning-English-ebook/dp/B079KLDW21 12 | 13 | 14 | Installation 15 | -------------- 16 | Excute the following command :: 17 | 18 | python setup.py install 19 | 20 | Examples 21 | -------------- 22 | labeling 23 | ~~~~~~~~~ 24 | Triple Barriers Labeling and CUSUM sampling:: 25 | 26 | from finance_ml.labeling import get_barrier_labels, cusum_filter 27 | from finance_ml.stats import get_daily_vol 28 | 29 | vol = get_daily_vol(close) 30 | trgt = vol 31 | timestamps = cusum_filter(close, vol) 32 | labels = get_barrier_labels(close, timestamps, trgt, sltp=[1, 1], 33 | num_days=1, min_ret=0, num_threads=16) 34 | print(labels.show()) 35 | 36 | Return the following pandas.Series:: 37 | 38 | 2000-01-05 -1.0 39 | 2000-01-06 1.0 40 | 2000-01-10 -1.0 41 | 2000-01-11 1.0 42 | 2000-01-12 1.0 43 | 44 | multiprocessing 45 | ~~~~~~~~~~~~~~~~ 46 | Parallel computing using ``multiprocessing`` library. 47 | Here is the example of applying function to each element with parallelization.:: 48 | 49 | import pandas as pd 50 | import numpy as np 51 | 52 | def apply_func(x): 53 | return x ** 2 54 | 55 | def func(df, timestamps, f): 56 | df_ = df.loc[timestamps] 57 | for idx, x in df_.items(): 58 | df_.loc[idx] = f(x) 59 | return df_ 60 | 61 | df = pd.Series(np.random.randn(10000)) 62 | from finance_ml.multiprocessing import mp_pandas_obj 63 | 64 | results = mp_pandas_obj(func, pd_obj=('timestamps', df.index), 65 | num_threads=24, df=df, f=apply_func) 66 | print(results.head()) 67 | 68 | Output:: 69 | 70 | 0 0.449278 71 | 1 1.411846 72 | 2 0.157630 73 | 3 4.949410 74 | 4 0.601459 75 | 76 | 77 | Documentation for the Code 78 | ============================ 79 | .. toctree:: 80 | :maxdepth: 2 81 | :caption: Contents: 82 | 83 | Labeling 84 | --------- 85 | .. automodule:: finance_ml.labeling.barriers 86 | :members: 87 | 88 | .. automodule:: finance_ml.labeling.sampling 89 | :members: 90 | 91 | .. automodule:: finance_ml.labeling.sides 92 | :members: 93 | 94 | .. automodule:: finance_ml.labeling.sizes 95 | :members: 96 | 97 | .. automodule:: finance_ml.labeling.utils 98 | :members: 99 | 100 | Multiprocessing 101 | ------------------ 102 | .. automodule:: finance_ml.multiprocessing.pandas 103 | :members: 104 | 105 | .. automodule:: finance_ml.multiprocessing.partition 106 | :members: 107 | 108 | .. automodule:: finance_ml.multiprocessing.utils 109 | :members: 110 | 111 | 112 | Indices and tables 113 | ================== 114 | 115 | * :ref:`genindex` 116 | * :ref:`modindex` 117 | * :ref:`search` 118 | -------------------------------------------------------------------------------- /finance_ml/model_selection/score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .kfold import PurgedKFold, CPKFold 4 | from .utils import evaluate 5 | 6 | 7 | def cv_score(clf, 8 | X, 9 | y, 10 | sample_weight=None, 11 | scoring='neg_log_loss', 12 | n_splits=3, 13 | t1=None, 14 | cv_gen=None, 15 | pct_embargo=0., 16 | purging=True, 17 | return_combs=False, 18 | ret=None, 19 | num_threads=1, 20 | **kwargs): 21 | """Cross Validation with default purging and embargo 22 | 23 | Params 24 | ------ 25 | X: pd.DataFrame 26 | y: pd.Series, optional 27 | sample_weight: pd.Series, optional 28 | If specified, apply this to bot testing and training 29 | scoring: str, default 'neg_log_loss' 30 | The name of scoring methods. 
'precision', 'recall', 'f1', 'precision_recall', 31 | 'roc', 'accuracy' or 'neg_log_loss' 32 | n_splits: int 33 | The number of splits for cross validation 34 | t1: pd.Series 35 | Index and value correspond to the begining and end of information 36 | cv_gen: KFold instance 37 | If not specified, use PurgedKfold. If cv_gen == 'cp', use CPKFold 38 | pct_embargo: float, default 0 39 | The percentage of applying embargo 40 | purging: bool, default True 41 | If true, apply purging method 42 | return_combs: bool, default False 43 | If True and use CPKFold, return combinatorics location 44 | num_threads: int, default 1 45 | The number of threads for purging 46 | kwargs: Parameters for scoring function 47 | 48 | Returns 49 | ------- 50 | array: scores of cross validation 51 | """ 52 | if cv_gen is None: 53 | if t1 is not None: 54 | cv_gen = PurgedKFold( 55 | n_splits=n_splits, 56 | t1=t1, 57 | pct_embargo=pct_embargo, 58 | purging=purging, 59 | num_threads=num_threads) 60 | else: 61 | cv_gen = KFold(n_splits=n_splits) 62 | elif cv_gen == 'cp': 63 | cv_gen = CPKFold( 64 | n_splits=n_splits, 65 | t1=t1, 66 | pct_embargo=pct_embargo, 67 | purging=purging, 68 | num_threads=num_threads) 69 | scores = [] 70 | for train, test in cv_gen.split(X=X): 71 | train_params = dict() 72 | test_params = dict() 73 | # Sample weight is an optional parameter 74 | if sample_weight is not None: 75 | train_params['sample_weight'] = sample_weight.iloc[train].values 76 | test_params['sample_weight'] = sample_weight.iloc[test].values 77 | test_params.update(kwargs) 78 | clf_fit = clf.fit( 79 | X=X.iloc[train, :].values, y=y.iloc[train].values, **train_params) 80 | if hasattr(clf_fit, 'classes_'): 81 | test_params['labels'] = clf_fit.classes_ 82 | if ret is not None: 83 | test_params['ret'] = ret.iloc[test] 84 | # Scoring 85 | score_ = evaluate(clf_fit, X.iloc[test, :].values, y.iloc[test].values, 86 | scoring, **test_params) 87 | scores.append(score_) 88 | if scoring not in ['roc', 'precision_recall']: 89 | scores = np.array(scores) 90 | if return_combs: 91 | return scores, cv_gen.get_test_combs() 92 | else: 93 | return scores -------------------------------------------------------------------------------- /finance_ml/features/fraction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from statsmodels.tsa.stattools import adfuller 4 | from tqdm import tqdm, tqdm_notebook 5 | 6 | 7 | def get_weights_FFD(d, thres, max_size=10000): 8 | """Get coefficient for calculating fractional derivative 9 | 10 | Args: 11 | d (int) 12 | 13 | thres (float) 14 | 15 | max_size (int, optional) Defauts to 1e4.\ 16 | Set the maximum size for stability 17 | 18 | Returns: 19 | array-like 20 | """ 21 | w = [1.] 
22 | for k in range(1, max_size): 23 | w_ = -w[-1] / k * (d - k + 1) 24 | if abs(w_) <= thres: 25 | break 26 | w.append(w_) 27 | w = np.array(w) 28 | return w 29 | 30 | 31 | def frac_diff_FFD(series, d, lag=1, thres=1e-5, max_size=10000): 32 | """Compute Fractional Differentiation 33 | 34 | Args: 35 | series (pd.Series) 36 | 37 | d (float): the degree of differentiation 38 | 39 | lag (int, optional): Defaults to 1.\ 40 | The lag scale used when differencing, as in series.diff(lag) 41 | 42 | thres (float, optional): Defaults to 1e-5.\ 43 | Threshold to determine the fixed-length window 44 | max_size (int, optional): Defaults to 10000. Upper bound on the window length 45 | Returns: 46 | pd.Series 47 | """ 48 | max_size = int(max_size / lag) 49 | w = get_weights_FFD(d, thres, max_size) 50 | width = len(w) 51 | series_ = series.ffill().dropna() 52 | rolling_array = [] 53 | for i in range(width): 54 | rolling_array.append(series_.shift(i * lag).values) 55 | rolling_array = np.array(rolling_array) 56 | series_val = np.dot(rolling_array.T, w) 57 | series = pd.Series(index=series.index, dtype='float64') 58 | timestamps = series.index[-len(series_val):] 59 | series.loc[timestamps] = series_val 60 | return series 61 | 62 | 63 | def get_opt_d(series, ds=None, lag=1, thres=1e-5, max_size=10000, 64 | p_thres=1e-2, autolag=None, verbose=1, **kwargs): 65 | """Find the minimum differencing degree that makes the series stationary 66 | 67 | Args: 68 | series (pd.Series) 69 | 70 | ds (array-like, optional): Defaults to np.linspace(0, 1, 100)\ 71 | Search space of degree. 72 | 73 | lag (int, optional): Defaults to 1.\ 74 | The lag scale used when differencing, as in series.diff(lag) 75 | 76 | thres (float, optional): Defaults to 1e-5.\ 77 | Threshold to determine the fixed-length window 78 | 79 | p_thres (float, optional): Defaults to 1e-2.\ 80 | p-value threshold for the ADF stationarity test 81 | autolag (str, optional): passed to statsmodels' adfuller 82 | 83 | verbose (int, optional): Defaults to 1.\ 84 | If 1 or 2, show the progress bar; use
2 for notebook 85 | 86 | kwargs (optional): paramters for ADF 87 | 88 | Returns: 89 | int: optimal degree 90 | """ 91 | if ds is None: 92 | ds = np.linspace(0, 1, 100) 93 | # Sort to ascending order 94 | ds = np.array(ds) 95 | sort_idx = np.argsort(ds) 96 | ds = ds[sort_idx] 97 | if verbose == 2: 98 | iter_ds = tqdm_notebook(ds) 99 | elif verbose == 1: 100 | iter_ds = tqdm(ds) 101 | else: 102 | iter_ds = ds 103 | opt_d = ds[-1] 104 | # Compute pval for each d 105 | for d in iter_ds: 106 | diff = frac_diff_FFD(series, d=d, thres=thres, max_size=max_size) 107 | pval = adfuller(diff.dropna().values, autolag=autolag, **kwargs)[1] 108 | if pval < p_thres: 109 | opt_d = d 110 | break 111 | return opt_d -------------------------------------------------------------------------------- /finance_ml/denoising.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.neighbors.kde import KernelDensity 4 | from scipy.optimize import minimize 5 | 6 | 7 | def cov2corr(cov): 8 | std = np.sqrt(np.diag(cov)) 9 | corr = cov / np.outer(std, std) 10 | corr[corr < -1] = -1 11 | corr[corr > 1] = 1 12 | return corr 13 | 14 | def corr2cov(corr, std): 15 | return corr * np.outer(std, std) 16 | 17 | def mp_pdf(var, q, pts): 18 | # Marcenko-Pastur Distribution 19 | # q = T/N 20 | e_min = var * (1 - (1./q) ** 0.5) ** 2 21 | e_max = var * (1 + (1./q) ** 0.5) ** 2 22 | e_val = np.linspace(e_min, e_max, pts) 23 | pdf = q * ((e_max - e_val) * (e_val - e_min)) ** 0.5 / (2 * np.pi * var * e_val) 24 | return pd.Series(pdf, index=e_val) 25 | 26 | def getPCA(matrix): 27 | e_val, e_vec = np.linalg.eigh(matrix) 28 | indices = e_val.argsort()[::-1] 29 | e_val = e_val[indices] 30 | e_vec = e_vec[:, indices] 31 | e_val = np.diagflat(e_val) 32 | return e_val, e_vec 33 | 34 | def fitKDE(obs, bwidth=0.25, kernel='gaussian', x=None): 35 | if len(obs.shape) == 1: 36 | obs = obs.reshape(-1, 1) 37 | kde = KernelDensity(kernel=kernel, bandwidth=bwidth).fit(obs) 38 | if x is None: 39 | x = np.unique(obs).reshape(-1 , 1) 40 | if len(x.shape) == 1: 41 | x = x.reshape(-1, 1) 42 | log_prob = kde.score_samples(x) 43 | pdf = pd.Series(np.exp(log_prob), index=x.flatten()) 44 | return pdf 45 | 46 | def err_pdf(var, e_val, q, bwidth, pts=1000): 47 | pdf0 = mp_pdf(var[0], q, pts) 48 | pdf1 = fitKDE(e_val, bwidth, x=pdf0.index.values) 49 | sse = np.sum((pdf1 - pdf0) ** 2) 50 | return sse 51 | 52 | def find_max_eigen_val(e_val, q, bwidth, min_var=1e-5, max_var=1-1e-5): 53 | out = minimize(lambda *x: err_pdf(*x), .5, args=(e_val, q, bwidth), bounds=((min_var, max_var),)) 54 | if out["success"]: 55 | var = out['x'][0] 56 | else: 57 | var = 1 58 | e_max = var * (1 + (1./q) ** 0.5) ** 2 59 | return e_max, var 60 | 61 | 62 | def denoise_corr(e_val, e_vec, n_facts, shrinkage=False, alpha=0): 63 | if shrinkage: 64 | e_val_l, e_vec_l = e_val[:n_facts, :n_facts], e_vec[:, :n_facts] 65 | e_val_r, e_vec_r = e_val[n_facts:, n_facts:], e_vec[:, n_facts:] 66 | corr_l = np.dot(e_vec_l, e_val_l).dot(e_vec_l.T) 67 | corr_r = np.dot(e_vec_r, e_val_r).dot(e_vec_r.T) 68 | corr1 = corr_l + alpha * corr_r + (1 - alpha) * np.diag(np.diag(corr_r)) 69 | else: 70 | e_val_ = np.diag(e_val).copy() 71 | e_val_[n_facts:] = e_val_[n_facts:].sum() / float(e_val_.shape[0] - n_facts) 72 | e_val_ = np.diag(e_val_) 73 | corr1 = np.dot(e_vec, e_val_).dot(e_vec.T) 74 | # Renormalize to keep trace 1 75 | corr1 = cov2corr(corr1) 76 | return corr1 77 | 78 | 79 | def detone_corr(e_val, e_vec, n_facts, 
shrinkage=False, alpha=0): 80 | if shrinkage: 81 | e_val_r, e_vec_r = e_val[n_facts:, n_facts:], e_vec[:, n_facts:] 82 | corr_r = np.dot(e_vec_r, e_val_r).dot(e_vec_r.T) 83 | corr1 = alpha * corr_r + (1 - alpha) * np.diag(np.diag(corr_r)) 84 | # Renormalize to keep trace 1 85 | corr1 = cov2corr(corr1) 86 | else: 87 | e_val_ = np.diag(e_val).copy() 88 | e_val_[:n_facts] = 0 89 | e_val_ = np.diag(e_val_) 90 | corr1 = np.dot(e_vec, e_val_).dot(e_vec.T) 91 | # Renormalize to keep trace 1 92 | corr1 = cov2corr(corr1) 93 | return corr1 94 | 95 | 96 | def denoise_cov(cov, q, bwidth): 97 | corr0 = cov2corr(cov) 98 | e_val0, e_vec0 = getPCA(corr0) 99 | e_max0, var0 = find_max_eigen_val(np.diag(e_val0), q, bwidth) 100 | nfacts0 = e_val0.shape[0] - np.diag(e_val0)[::-1].searchsorted(e_max0) 101 | corr1 = denoise_corr(e_val0, e_vec0, nfacts0) 102 | cov1 = corr2cov(corr1, np.diag(cov) ** .5) 103 | return cov1 104 | 105 | def opt_portfolio(cov, mu=None): 106 | inv = np.linalg.inv(cov) 107 | ones = np.ones(shape=(inv.shape[0], 1)) 108 | if mu is None: 109 | mu = ones 110 | w = np.dot(inv, mu) 111 | w /= np.dot(ones.T, w) 112 | return w 113 | -------------------------------------------------------------------------------- /finance_ml/experiments.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.linalg import block_diag 4 | from sklearn.covariance import LedoitWolf 5 | from sklearn.utils import check_random_state 6 | from sklearn.datasets import make_classification 7 | 8 | from .denoising import corr2cov, cov2corr 9 | 10 | # Matrix 11 | ################################################ 12 | def form_block_matrix(n_blocks, bsize, bcorr): 13 | block = np.ones((bsize, bsize)) * bcorr 14 | for i in range(bsize): 15 | block[i, i] = 1 16 | corr = block_diag(*[block] * n_blocks) 17 | return corr 18 | 19 | def form_true_matrix(n_blocks, bsize, bcorr, is_shuffle=True): 20 | corr0 = form_block_matrix(n_blocks, bsize, bcorr) 21 | corr0 = pd.DataFrame(corr0) 22 | cols = corr0.columns.tolist() 23 | if is_shuffle: 24 | np.random.shuffle(cols) 25 | corr0 = corr0[cols].loc[cols].copy(deep=True) 26 | std0 = np.random.uniform(0.05, 0.2, corr0.shape[0]) 27 | cov0 = corr2cov(corr0, std0) 28 | mu0 = np.random.normal(std0, std0, cov0.shape[0]).reshape(-1, 1) 29 | return mu0, cov0 30 | 31 | def simulate_mu_cov(mu, cov, n_obs, shrink=False): 32 | x = np.random.multivariate_normal(mu.flatten(), cov, size=n_obs) 33 | mu1 = x.mean(axis=0).reshape(-1, 1) 34 | if shrink: 35 | cov1 = LedoitWolf().fit(x).covariance_ 36 | else: 37 | cov1 = np.cov(x, rowvar=0) 38 | return mu1, cov1 39 | 40 | def get_random_cov(n_cols, n_facts): 41 | w = np.random.normal(size=(n_cols, n_facts)) 42 | cov = np.dot(w, w.T) 43 | cov += np.diag(np.random.uniform(size=n_cols)) 44 | return cov 45 | 46 | def get_cov_sub(n_obs, n_cols, sigma, random_state=None): 47 | rng = check_random_state(random_state) 48 | if n_cols == 1: 49 | return np.ones((1, 1)) 50 | ar0 = rng.normal(size=(n_obs, 1)) 51 | ar0 = np.repeat(ar0, n_cols, axis=1) 52 | ar0 += rng.normal(scale=sigma, size=ar0.shape) 53 | ar0 = np.cov(ar0, rowvar=False) 54 | return ar0 55 | 56 | def get_random_block_cov(n_cols, n_blocks, min_block_size=2, sigma=1., random_state=None): 57 | rng = check_random_state(random_state) 58 | # Generate Size of each block 59 | parts = rng.choice(range(1, n_cols - (min_block_size - 1) * n_blocks), n_blocks-1, replace=False) 60 | parts.sort() 61 | parts = np.append(parts, n_cols - 
(min_block_size - 1) * n_blocks) 62 | parts = np.append(parts[0], np.diff(parts)) - 1 + min_block_size 63 | # Combine blocks as diagonal matrix 64 | cov = None 65 | for n_cols_ in parts: 66 | cov_ = get_cov_sub(int(max(n_cols_ * (n_cols_ + 1) / 2., 100)), n_cols_, sigma, random_state=rng) 67 | if cov is None: 68 | cov = cov_.copy() 69 | else: 70 | cov = block_diag(cov, cov_) 71 | return cov 72 | 73 | def get_random_block_corr(n_cols, n_blocks, random_state=None, min_block_size=2, sigma=1., is_shuffle=False): 74 | rng = check_random_state(random_state) 75 | cov0 = get_random_block_cov(n_cols, n_blocks, min_block_size=min_block_size, sigma=sigma * 0.5, random_state=rng) 76 | # Add noise 77 | cov1 = get_random_block_cov(n_cols, 1, min_block_size=min_block_size, sigma=sigma, random_state=rng) 78 | cov0 += cov1 79 | # Generate Correlation 80 | corr0 = cov2corr(cov0) 81 | corr0 = pd.DataFrame(corr0) 82 | if is_shuffle: 83 | orig_cols = corr0.columns.tolist() 84 | cols = corr0.columns.tolist() 85 | np.random.shuffle(cols) 86 | corr0 = pd.DataFrame(corr0[cols].loc[cols].values, index=orig_cols, columns=orig_cols) 87 | return corr0 88 | 89 | def get_classification_data(n_features=100, n_informative=25, n_reduntant=25, n_samples=10000, random_state=0, sigma=.0): 90 | np.random.seed(random_state) 91 | X, y = make_classification(n_samples=n_samples, n_features=n_features - n_reduntant, 92 | n_informative=n_informative, n_redundant=0, shuffle=False) 93 | cols = [f"I_{i}" for i in range(n_informative)] 94 | cols += [f"N_{i}" for i in range(n_features - n_reduntant - n_informative)] 95 | X = pd.DataFrame(X, columns=cols) 96 | y = pd.Series(y) 97 | rdt_choices = np.random.choice(range(n_informative), size=n_reduntant) 98 | for i, choice in enumerate(rdt_choices): 99 | X[f"R_{i}"] = X[f"I_{choice}"] + np.random.normal(size=X.shape[0]) * sigma 100 | return X, y 101 | -------------------------------------------------------------------------------- /finance_ml/clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.cluster import KMeans 4 | from sklearn.metrics import silhouette_samples, silhouette_score 5 | 6 | from .distance import corr_metric 7 | 8 | 9 | _eps = 1e-16 10 | 11 | def cluster_kmeans_base(corr0, max_num_clusters=10, min_num_clusters=4, n_init=10, debug=False): 12 | dist = corr_metric(corr0, False) 13 | silh = None 14 | kmeans = None 15 | q_val = None 16 | max_num_clusters = min(max_num_clusters, int(np.floor(dist.shape[0]/2))) 17 | min_num_clusters = max(2, min_num_clusters) 18 | for init in range(n_init): 19 | for n_clusters in range(min_num_clusters, max_num_clusters + 1): 20 | kmeans_ = KMeans(n_clusters=n_clusters, n_jobs=1, n_init=1) 21 | kmeans_ = kmeans_.fit(dist.values) 22 | silh_ = silhouette_samples(dist.values, kmeans_.labels_) 23 | q_val_ = silh_.mean() / max(silh_.std(), _eps) 24 | if q_val is None or q_val_ > q_val: 25 | silh = silh_ 26 | kmeans = kmeans_ 27 | q_val = q_val_ 28 | if debug: 29 | print(kmeans) 30 | print(q_val, silh) 31 | silhouette_avg = silhouette_score(dist.values, kmeans_.labels_) 32 | print(f"For n_clusters={n_clusters}, slih_std: {silh_.std()} The average silhouette_score is : {silhouette_avg}") 33 | print("********") 34 | new_idx = np.argsort(kmeans.labels_) 35 | corr1 = corr0.iloc[new_idx] 36 | corr1 = corr1.iloc[:, new_idx] 37 | clstrs = {i:corr0.columns[np.where(kmeans.labels_ == i)[0]].tolist() for i in np.unique(kmeans.labels_)} 38 | silh = 
pd.Series(silh, index=dist.index) 39 | return corr1, clstrs, silh 40 | 41 | def make_new_outputs(corr0, clstrs1, clstrs2): 42 | clstrs_new = dict() 43 | for i in clstrs1.keys(): 44 | clstrs_new[len(clstrs_new.keys())] = list(clstrs1[i]) 45 | for i in clstrs2.keys(): 46 | clstrs_new[len(clstrs_new.keys())] = list(clstrs2[i]) 47 | new_idx = [j for i in clstrs_new.keys() for j in clstrs_new[i]] 48 | corr_new = corr0.loc[new_idx, new_idx] 49 | dist = corr_metric(corr0, False) 50 | kmeans_labels = np.zeros(len(dist.columns)) 51 | for i in clstrs_new.keys(): 52 | idxs = [dist.index.get_loc(k) for k in clstrs_new[i]] 53 | kmeans_labels[idxs] = i 54 | silh_new = pd.Series(silhouette_samples(dist.values, kmeans_labels), index=dist.index) 55 | return corr_new, clstrs_new, silh_new 56 | 57 | def cluster_kmeans_top(corr0, max_num_clusters=None, min_num_clusters=4, n_init=10, debug=False): 58 | if max_num_clusters is None: 59 | max_num_clusters = corr0.shape[1] - 1 60 | max_num_clusters = min(max_num_clusters, corr0.shape[1] - 1) 61 | corr1, clstrs, silh = cluster_kmeans_base(corr0, 62 | max_num_clusters=max_num_clusters, 63 | min_num_clusters=min_num_clusters, 64 | n_init=n_init, debug=debug) 65 | clstrs_tstats = {i:np.mean(silh[clstrs[i]]) / max(np.std(silh[clstrs[i]]), _eps) for i in clstrs.keys()} 66 | tstats_mean = np.mean(list(clstrs_tstats.values())) 67 | redo_clstrs = [i for i in clstrs_tstats.keys() if clstrs_tstats[i] < tstats_mean] 68 | if len(redo_clstrs) <= 2: 69 | return corr1, clstrs, silh 70 | else: 71 | keys_redo = [j for i in redo_clstrs for j in clstrs[i]] 72 | corr_tmp = corr0.loc[keys_redo, keys_redo] 73 | corr2, clstrs2, silh2 = cluster_kmeans_base(corr_tmp, 74 | max_num_clusters=min(max_num_clusters, corr_tmp.shape[1] - 1), 75 | min_num_clusters=2, 76 | n_init=n_init, 77 | debug=debug) 78 | clstrs1 = {i: clstrs[i] for i in clstrs.keys() if i not in redo_clstrs} 79 | corr_new, clstrs_new, silh_new = make_new_outputs(corr0, clstrs1, clstrs2) 80 | new_clstrs_tstats = {i:np.mean(silh_new[i]) / max(np.std(silh_new[i]), _eps) for i in clstrs_new.keys()} 81 | tstats_mean = np.mean(list(clstrs_tstats.values())) 82 | new_tstats_mean = np.mean(list(new_clstrs_tstats.values())) 83 | if new_tstats_mean <= tstats_mean: 84 | return corr1, clstrs, silh 85 | else: 86 | return corr_new, clstrs_new, silh_new -------------------------------------------------------------------------------- /finance_ml/features/entropy.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | from ..multiprocessing import mp_pandas_obj 8 | 9 | 10 | def plug_in(data, window): 11 | """Plug in Entropy Estimator 12 | 13 | Args: 14 | data (list) 15 | 16 | window (int) 17 | 18 | Returns: 19 | float: Estimated entropy 20 | 21 | dict: Probability mass function 22 | """ 23 | pmf = calc_pmf(data, window) 24 | out = -sum([pmf[key] * np.log2(pmf[key]) for key in pmf.keys()]) 25 | return out, pmf 26 | 27 | 28 | def calc_pmf(data, window): 29 | """Calculate probability mass function 30 | 31 | Args: 32 | data (list) 33 | 34 | window (int) 35 | 36 | Returns: 37 | dict 38 | """ 39 | lib = {} 40 | for i in range(window, len(data)): 41 | x = '_'.join([str(data_i) for data_i in data[i - window:i]]) 42 | if x not in lib: 43 | lib[x] = [i - window] 44 | else: 45 | lib[x] += [ 46 | i - window, 47 | ] 48 | num_samples = float(len(data) - window) 49 | pmf = {key: len(lib[key]) / num_samples for 
key in lib} 50 | return pmf 51 | 52 | 53 | def lempel_zib_lib(data): 54 | """Calculate Lampel Ziv dictionary 55 | 56 | Args: 57 | data (list) 58 | 59 | Returns: 60 | dict 61 | """ 62 | i = 1 63 | lib = [str(data[0])] 64 | while i < len(data): 65 | for j in range(i, len(data)): 66 | x = '_'.join([str(data_i) for data_i in data[i:j + 1]]) 67 | if x not in lib: 68 | lib.append(x) 69 | break 70 | i = j + 1 71 | return lib 72 | 73 | 74 | def match_length(data, i, n): 75 | """Calculate math length 76 | 77 | Args: 78 | data (list) 79 | 80 | i (int): start point 81 | 82 | n (int): window size 83 | 84 | Returns: 85 | int: length of the longest matched substring + 1 86 | 87 | str: the longest mathed substring 88 | """ 89 | sub_str = '' 90 | for l in range(n): 91 | msg1 = '_'.join([str(data_i) for data_i in data[i:i + l + 1]]) 92 | for j in range(max(i - n, 0), i): 93 | msg0 = '_'.join([str(data_i) for data_i in data[j:j + l + 1]]) 94 | if msg1 == msg0: 95 | sub_str = msg1 96 | break 97 | return len(sub_str.split('_')) + 1, sub_str 98 | 99 | 100 | def konto(data, window=None, verbose=0): 101 | """Calculate Kontonyiasnnis' LZ entropy estimate 102 | 103 | Args: 104 | data (list) 105 | 106 | window (int, optional 107 | 108 | verbose (int, optional) Defaults to 0.\ 109 | If 1, show the progress bar 110 | """ 111 | out = {'num': 0, 'sum': 0, 'sub_str': []} 112 | if window is None: 113 | points = range(1, len(data) // 2 + 1) 114 | else: 115 | window = min(window, len(data) // 2) 116 | poitns = range(window, len(data) - window + 1) 117 | if verbose == 1: 118 | points = tqdm(points) 119 | for i in points: 120 | if window is None: 121 | l, msg = match_length(data, i, i) 122 | out['sum'] += np.log2(i + 1) / l 123 | else: 124 | l, msg = match_length(data, i, window) 125 | out['sum'] += np.log(i + 1) / l 126 | out['sub_str'].append(msg) 127 | out['num'] += 1 128 | out['h'] = out['sum'] / out['num'] 129 | out['r'] = 1 - out['h'] / np.log2(len(data)) 130 | return out 131 | 132 | 133 | def mp_get_entropy_rate(series, lag, molecule): 134 | delta = timedelta(seconds=lag) 135 | entropy = pd.Series(index=molecule) 136 | for t in molecule: 137 | series_ = series[t - delta:t] 138 | entropy_t = konto(series_.values, verbose=0) 139 | entropy.loc[t] = entropy_t['h'] 140 | return entropy 141 | 142 | 143 | def get_entropy_rate(series, lag, num_threads=1): 144 | """Calculate entropy rate for time series 145 | 146 | Args: 147 | series (pd.Series) 148 | 149 | lag (int): Time slide length (seconds) 150 | 151 | num_threads (int): Defaults to 1 152 | 153 | Returns: 154 | pd.Series 155 | """ 156 | start = series.index[0] + timedelta(seconds=lag) 157 | return mp_pandas_obj( 158 | mp_get_entropy_rate, ('molecule', series[start:].index), 159 | num_threads, 160 | series=series, 161 | lag=lag) 162 | -------------------------------------------------------------------------------- /finance_ml/multiprocessing/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime 3 | import sys 4 | from copy import deepcopy 5 | import multiprocessing as mp 6 | import multiprocessing.pool 7 | from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor 8 | from concurrent.futures import _base 9 | from concurrent.futures.process import _global_shutdown, BrokenProcessPool, _WorkItem 10 | 11 | 12 | class MyProcessPoolExecutor(ProcessPoolExecutor): 13 | def submit(*args, **kwargs): 14 | if len(args) >= 2: 15 | self, fn, *args = args 16 | elif not args: 17 | raise 
TypeError("descriptor 'submit' of 'ProcessPoolExecutor' object " 18 | "needs an argument") 19 | elif 'fn' in kwargs: 20 | fn = kwargs.pop('fn') 21 | self, *args = args 22 | else: 23 | raise TypeError('submit expected at least 1 positional argument, ' 24 | 'got %d' % (len(args)-1)) 25 | 26 | with self._shutdown_lock: 27 | if self._broken: 28 | print(f"Broken Parameters: {args}, {kwargs}") 29 | raise BrokenProcessPool(self._broken) 30 | if self._shutdown_thread: 31 | raise RuntimeError( 32 | 'cannot schedule new futures after shutdown') 33 | if _global_shutdown: 34 | raise RuntimeError('cannot schedule new futures after ' 35 | 'interpreter shutdown') 36 | 37 | f = _base.Future() 38 | w = _WorkItem(f, fn, args, kwargs) 39 | 40 | self._pending_work_items[self._queue_count] = w 41 | self._work_ids.put(self._queue_count) 42 | self._queue_count += 1 43 | # Wake up queue management thread 44 | self._queue_management_thread_wakeup.wakeup() 45 | 46 | self._start_queue_management_thread() 47 | return f 48 | 49 | 50 | def expand_call(kwargs): 51 | """Execute function from dictionary input""" 52 | func = kwargs['func'] 53 | del kwargs['func'] 54 | optional_argument = None 55 | if "optional_argument" in kwargs: 56 | optional_argument = kwargs["optional_argument"] 57 | del kwargs["optional_argument"] 58 | 59 | transform = None 60 | if 'transform' in kwargs: 61 | transform = kwargs['transform'] 62 | del kwargs['transform'] 63 | 64 | def wrapped_func(**input_kwargs): 65 | if transform is not None: 66 | input_kwargs = transform(input_kwargs) 67 | try: 68 | return func(**input_kwargs) 69 | except Exception as e: 70 | print(e) 71 | print(f"paramteres: {input_kwargs}") 72 | return e 73 | out = wrapped_func(**kwargs) 74 | if optional_argument is None: 75 | return (out, kwargs) 76 | else: 77 | return (out, kwargs, optional_argument) 78 | 79 | 80 | def report_progress(job_idx, num_jobs, time0, task): 81 | """Report progress to system output""" 82 | msg = [float(job_idx) / num_jobs, (time.time() - time0) / 60.] 83 | msg.append(msg[1] * (1 / msg[0] - 1)) 84 | time_stamp = str(datetime.fromtimestamp(time.time())) 85 | msg_ = time_stamp + ' ' + str( 86 | round(msg[0] * 100, 2)) + '% ' + task + ' done after ' + \ 87 | str(round(msg[1], 2)) + ' minutes. Remaining ' + str( 88 | round(msg[2], 2)) + ' minutes.' 89 | if job_idx < num_jobs: 90 | sys.stderr.write(msg_ + '\r') 91 | else: 92 | sys.stderr.write(msg_ + '\n') 93 | 94 | 95 | def process_jobs(jobs, task=None, num_threads=mp.cpu_count(), use_thread=False): 96 | """Execute parallelized jobs 97 | 98 | Parameters 99 | ---------- 100 | jobs: list(dict) 101 | Each element contains `function` and its parameters 102 | task: str, optional 103 | The name of task. 
If not specified, function name is used 104 | num_threads, (default max count) 105 | The number of threads for parallelization 106 | 107 | Returns 108 | ------- 109 | List: each element is results of each part 110 | """ 111 | if task is None: 112 | if hasattr(jobs[0]['func'], '__name__'): 113 | task = jobs[0]['func'].__name__ 114 | else: 115 | task = 'function' 116 | out = [] 117 | if num_threads > 1: 118 | if use_thread: 119 | executor = ThreadPoolExecutor(max_workers=num_threads) 120 | else: 121 | executor = MyProcessPoolExecutor(max_workers=num_threads) 122 | outputs = executor.map(expand_call, jobs, 123 | chunksize=1) 124 | time0 = time.time() 125 | # Execute programs here 126 | for i, out_ in enumerate(outputs, 1): 127 | out.append(out_) 128 | report_progress(i, len(jobs), time0, task) 129 | else: 130 | for job in jobs: 131 | job = deepcopy(job) 132 | out_ = expand_call(job) 133 | out.append(out_) 134 | return out 135 | -------------------------------------------------------------------------------- /finance_ml/model_selection/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.metrics import log_loss, accuracy_score, f1_score, recall_score, precision_score,\ 4 | precision_recall_curve, roc_curve 5 | 6 | from finance_ml.multiprocessing import mp_pandas_obj 7 | 8 | 9 | def mp_train_times(train_times, test_times, molecule): 10 | trn = train_times[molecule].copy(deep=True) 11 | for init, end in test_times.iteritems(): 12 | df0 = trn[(init <= trn.index) & (trn.index <= end)].index 13 | df1 = trn[(init <= trn) & (trn <= end)].index 14 | df2 = trn[(trn.index <= init) & (end <= trn)].index 15 | trn = trn.drop(df0 | df1 | df2) 16 | return trn 17 | 18 | 19 | def get_train_times(train_times, test_times, num_threads=1): 20 | """Sample train points without overlapping with test period 21 | 22 | Params 23 | ------ 24 | train_times: pd.Series 25 | Trainig points with index for initial and values for end time 26 | test_times: pd.Series 27 | Testing points with index for initial and values for end time 28 | num_threads: int, default 1 29 | The number of thrads for multiprocessing 30 | 31 | Returns 32 | ------- 33 | pd.Series 34 | """ 35 | return mp_pandas_obj( 36 | mp_train_times, ('molecule', train_times.index), 37 | num_threads, 38 | train_times=train_times, 39 | test_times=test_times) 40 | 41 | 42 | def get_embargo_times(times, pct_embargo): 43 | """Get embargo time index for each timestamp 44 | 45 | times: 46 | times: Timestamps 47 | Entire timestamps which you want to apply embargo 48 | pct_embargo: float ranged at [0, 1] 49 | The ratio to embargo with respect to the size of timestamps 50 | 51 | Returns: 52 | pd.Series: For each valud corresponds to a point which you should take 53 | out before from the other forward dataset 54 | """ 55 | step = int(times.shape[0] * pct_embargo) 56 | if step == 0: 57 | embg = pd.Series(times, index=times) 58 | else: 59 | embg = pd.Series(times[step:], index=times[:-step]) 60 | embg = embg.append(pd.Series(times[-1], index=times[-step:])) 61 | return embg 62 | 63 | 64 | def performance(ret, proba, step=0.01): 65 | if isinstance(ret, pd.Series): 66 | ret = ret.values 67 | n_step = int(.5 / step) + 1 68 | pnls = [] 69 | sharpes = [] 70 | won_ratios = [] 71 | ths = np.linspace(.5, 1, n_step) 72 | for th in ths: 73 | neg_idx = proba[:, 0] <= th 74 | pos_idx = proba[:, 1] >= th 75 | neg_ret = ret[neg_idx] 76 | pos_ret = ret[pos_idx] 77 | won_count = 
len(neg_ret[neg_ret < 0]) + len(pos_ret[pos_ret > 0]) 78 | total_count = len(neg_ret) + len(pos_ret) 79 | if total_count == 0: 80 | won_ratio = 0 81 | else: 82 | won_ratio = won_count / total_count 83 | won_ratios.append(won_ratio) 84 | idx = neg_idx | pos_idx 85 | ret_ = ret[idx] 86 | if len(ret_) == 0: 87 | pnl = 0 88 | sharpe = 0 89 | elif len(ret_) == 1: 90 | pnl = float(ret_) 91 | sharpe = 0 92 | else: 93 | pnl = np.sum(ret_) 94 | sharpe = np.mean(ret_) / np.std(ret_) 95 | pnls.append(pnl) 96 | sharpes.append(sharpe) 97 | return ths, np.array(pnls), np.array(sharpes), np.array(won_ratios) 98 | 99 | 100 | def meta_performance(ret, proba, step=0.01): 101 | if isinstance(ret, pd.Series): 102 | ret = ret.values 103 | n_step = int(1. / step) + 1 104 | pnls = [] 105 | sharpes = [] 106 | won_ratios = [] 107 | ths = np.linspace(0, 1, n_step) 108 | for th in ths: 109 | idx = proba[:, 1] >= th 110 | bet_ret = ret[idx] 111 | won_count = len(bet_ret[bet_ret > 0]) 112 | total_count = len(bet_ret) 113 | if total_count == 0: 114 | won_ratio = 0 115 | else: 116 | won_ratio = won_count / total_count 117 | won_ratios.append(won_ratio) 118 | if len(bet_ret) == 0: 119 | pnl = 0 120 | sharpe = 0 121 | elif len(bet_ret) == 1: 122 | pnl = float(bet_ret) 123 | sharpe = 0 124 | else: 125 | pnl = np.sum(bet_ret) 126 | sharpe = np.mean(bet_ret) / np.std(bet_ret) 127 | pnls.append(pnl) 128 | sharpes.append(sharpe) 129 | return ths, np.array(pnls), np.array(sharpes), np.array(won_ratios) 130 | 131 | 132 | def evaluate(model, 133 | X, 134 | y, 135 | method, 136 | sample_weight=None, 137 | pos_idx=1, 138 | pos_label=1, 139 | ret=None): 140 | """Calculate score 141 | 142 | Params 143 | ------ 144 | model: Trained classifier instance 145 | X: array-like, Input feature 146 | y: array-like, Label 147 | method: str 148 | The name of scoring methods. 
'precision', 'recall', 'f1', 'precision_recall', 149 | 'roc', 'accuracy' or 'neg_log_loss' 150 | sample_weight: pd.Series, optional 151 | If specified, apply this to bot testing and training 152 | labels: array-like, optional 153 | The name of labels 154 | 155 | Returns 156 | ------- 157 | list of scores 158 | """ 159 | if method == 'f1': 160 | labels = model.classes_ 161 | pred = model.predict(X) 162 | score = f1_score(y, pred, sample_weight=sample_weight, labels=labels) 163 | elif method == 'neg_log_loss': 164 | labels = model.classes_ 165 | prob = model.predict_proba(X) 166 | score = -log_loss(y, prob, sample_weight=sample_weight, labels=labels) 167 | elif method == 'precision': 168 | pred = model.predict(X) 169 | score = precision_score( 170 | y, pred, pos_label=pos_label, sample_weight=sample_weight) 171 | elif method == 'recall': 172 | pred = model.predict(X) 173 | score = recall_score( 174 | y, pred, pos_label=pos_label, sample_weight=sample_weight) 175 | elif method == 'precision_recall': 176 | prob = model.predict_proba(X)[:, pos_idx] 177 | score = precision_recall_curve( 178 | y, prob, pos_label=pos_label, sample_weight=sample_weight) 179 | elif method == 'roc': 180 | prob = model.predict_proba(X)[:, pos_idx] 181 | score = roc_curve( 182 | y, prob, pos_label=pos_label, sample_weight=sample_weight) 183 | elif method == 'accuracy': 184 | pred = model.predict(X) 185 | score = accuracy_score(y, pred, sample_weight=sample_weight) 186 | elif method == 'performance': 187 | prob = model.predict_proba(X) 188 | score = performance(ret, prob) 189 | elif method == 'meta_performance': 190 | prob = model.predict_proba(X) 191 | score = meta_performance(ret, prob) 192 | else: 193 | raise Exception(f'No Implementation method={method}') 194 | return score -------------------------------------------------------------------------------- /finance_ml/labeling/betsizes.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import numpy as np 3 | import pandas as pd 4 | from scipy.stats import norm, t 5 | 6 | from ..multiprocessing import mp_pandas_obj 7 | 8 | 9 | # Specific Betting Size Calculation 10 | ############################################################### 11 | def get_gaussian_betsize(probs, num_classes=2, eps=1e-4): 12 | """Translate probability to bettingsize 13 | 14 | Args: 15 | probs (array-like) 16 | num_classes (int, optional): Defaults to 2 17 | 18 | Returns: 19 | array-like: Signals after gaussian transform 20 | """ 21 | max_prob = 1 - eps 22 | min_prob = eps 23 | if isinstance(probs, numbers.Number): 24 | if probs >= min_prob and probs <= max_prob: 25 | signal = (probs - 1. / num_classes) / np.sqrt(probs * (1 - probs)) 26 | signal = 2 * norm.cdf(signal) - 1 27 | elif probs < min_prob: 28 | signal = -1 29 | elif probs > max_prob: 30 | signal = 1 31 | else: 32 | raise ValueError(f"Unkonwn probabilty: {probs}") 33 | else: 34 | signal = probs.copy() 35 | signal[probs >= max_prob] = 1 36 | signal[probs <= min_prob] = -1 37 | cond = (probs < max_prob) & (probs > min_prob) 38 | signal[cond] = (probs[cond] - 1. 
/ num_classes) / np.sqrt(probs[cond] * (1 - probs[cond])) 39 | signal[cond] = 2 * norm.cdf(signal[cond]) - 1 40 | return signal 41 | 42 | 43 | def get_tstats_betsize(probs, N, num_classes=2, eps=1e-4): 44 | """Translate probability to bettingsize 45 | 46 | Args: 47 | probs (array-like) 48 | N (int): The number of estimators used for generating probs 49 | num_classes (int, optional): Defaults to 2 50 | 51 | Returns: 52 | array-like: Signals after gaussian transform 53 | """ 54 | max_prob = 1 - eps 55 | min_prob = eps 56 | if isinstance(probs, numbers.Number): 57 | if probs >= min_prob and probs <= max_prob: 58 | signal = (probs - 1. / num_classes) / np.sqrt(probs * (1 - probs)) * np.sqrt(N) 59 | signal = 2 * t.cdf(signal, df=N-1) - 1 60 | elif probs < min_prob: 61 | signal = -1 62 | elif probs > max_prob: 63 | signal = 1 64 | else: 65 | raise ValueError(f"Unkonwn probabilty: {probs}") 66 | else: 67 | signal = probs.copy() 68 | signal[probs >= max_prob] = 1 69 | signal[probs <= min_prob] = -1 70 | cond = (probs < max_prob) & (probs > min_prob) 71 | signal[cond] = (probs[cond] - 1. / num_classes) / np.sqrt(probs[cond] * (1 - probs[cond])) * np.sqrt(N) 72 | signal[cond] = 2 * t.cdf(signal[cond], df=N-1) - 1 73 | return signal 74 | 75 | 76 | # Aggregate Signals 77 | ##################################################################### 78 | def discrete_signals(signals, step_size): 79 | """Discretize signals 80 | 81 | Args: 82 | signals (pd.Series or float): Signals for betting size ranged [-1, 1] 83 | 84 | step_size (float): Discrete size ranged [0, 1] 85 | 86 | Returns: 87 | pd.Series or float: Discretized signals. If signals is pd.Series,\ 88 | return value is pd.Series. If signals is float, return value\ 89 | is float 90 | """ 91 | if isinstance(signals, numbers.Number): 92 | signals = round(signals / step_size) * step_size 93 | signals = min(1, signals) 94 | signals = max(-1, signals) 95 | else: 96 | signals = (signals / step_size).round() * step_size 97 | signals[signals > 1] = 1 98 | signals[signals < -1] = -1 99 | return signals 100 | 101 | 102 | def avg_active_signals(signals, num_threads=1, timestamps=None): 103 | """Average active signals 104 | 105 | Args: 106 | signals (pd.DataFrame): With keys: 't1' and 'signal' 107 | - t1, signal effective time boundary. 108 | - signal, signal value 109 | 110 | num_threads (int, optional): The number of processor used for calculation.\ 111 | Defaults to 1. 112 | 113 | timestamps (list, optional): Timestamps used for output. When there is no active signal,\ 114 | value will be zero on that point. If not specified, use signals.index. 
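    Example (illustrative sketch with a tiny hand-made ``signals`` frame):

        signals = pd.DataFrame(
            {'t1': pd.to_datetime(['2020-01-03', '2020-01-04']),
             'signal': [1.0, -0.5]},
            index=pd.to_datetime(['2020-01-01', '2020-01-02']))
        avg = avg_active_signals(signals, num_threads=1)
        # Both signals are active on 2020-01-02, so the averaged value
        # there is (1.0 - 0.5) / 2 = 0.25; timestamps where no signal is
        # active are filled with 0.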
115 | 116 | Returns: 117 | pd.Series: Averaged signals 118 | """ 119 | if timestamps is None: 120 | timestamps = set(signals['t1'].dropna().values) 121 | timestamps = list(timestamps.union(set(signals.index.values))) 122 | timestamps.sort() 123 | out = mp_pandas_obj( 124 | mp_avg_active_signals, ('molecule', timestamps), 125 | num_threads, 126 | signals=signals) 127 | return out 128 | 129 | 130 | def mp_avg_active_signals(signals, molecule): 131 | """Function to calculate averaging with multiprocessing""" 132 | out = pd.Series() 133 | for loc in molecule: 134 | loc = pd.Timestamp(loc) 135 | cond = (signals.index <= loc) & ( 136 | (loc < signals['t1']) | pd.isnull(signals['t1'])) 137 | active_idx = signals[cond].index 138 | if len(active_idx) > 0: 139 | out[loc] = signals.loc[active_idx, 'signal'].mean() 140 | else: 141 | out[loc] = 0 142 | return out 143 | 144 | 145 | # Signal Translation 146 | ################################################################################# 147 | def get_betsize(probs, 148 | events=None, 149 | scale=1, 150 | step_size=None, 151 | signal_func=None, 152 | num_classes=2, 153 | num_threads=1, 154 | **kwargs): 155 | """Average and discretize signals from probability 156 | 157 | Args: 158 | events (pd.DataFrame): With the following keys 159 | - time, time of barrier 160 | - type, type of barrier - tp, sl, or t1 161 | - trgt, horizontal barrier width 162 | - side, position side 163 | 164 | probs (pd.Series): Probability signals 165 | 166 | scale (float): Betting size scale 167 | 168 | step_size (float, optional): If specified, discretize signals.\ 169 | The value is ranged [0, 1] 170 | 171 | num_classes (int, optional): The number of classes. Defaults to 2. 172 | 173 | num_threads (int, optional): The number of threads used for averaging bets.\ 174 | Defaults to 1. 
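    Example (illustrative sketch; the probabilities are made up):

        probs = pd.Series([0.55, 0.70, 0.95],
                          index=pd.date_range('2020-01-01', periods=3))
        bets = get_betsize(probs, step_size=0.1)
        # With the default Gaussian transform, probabilities near
        # 1 / num_classes map to bets near 0 and probabilities near 1 map
        # to bets near +1; step_size then rounds to multiples of 0.1.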
175 | 176 | Returns: 177 | pd.Series: bet size signals 178 | """ 179 | # Get Signals 180 | if probs.shape[0] == 0: 181 | return pd.Series() 182 | if signal_func is None: 183 | signal_func = get_gaussian_betsize 184 | signal = pd.Series(signal_func(probs, num_classes=num_classes, **kwargs), index=probs.index) 185 | if events and 'side' in events: 186 | signal = signal * events.loc[signal.index, 'side'] 187 | if step_size is not None: 188 | signal = discrete_signals(signal, step_size=step_size) 189 | signal = scale * signal 190 | return signal -------------------------------------------------------------------------------- /finance_ml/model_selection/kfold.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import combinations 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.model_selection._split import _BaseKFold 7 | 8 | from .utils import get_train_times 9 | 10 | 11 | class PurgedKFold(_BaseKFold): 12 | """Cross Validation with purging and embargo 13 | 14 | Params 15 | ------ 16 | n_splits: int 17 | The number of splits for cross validation 18 | t1: pd.Series 19 | Index and value correspond to the begining and end of information 20 | pct_embargo: float, default 0 21 | The percentage of applying embargo 22 | purging: bool, default True 23 | If true, apply purging method 24 | num_threads: int, default 1 25 | The number of threads for purging 26 | """ 27 | 28 | def __init__(self, 29 | n_splits=3, 30 | t1=None, 31 | pct_embargo=0., 32 | purging=True, 33 | num_threads=1): 34 | super(PurgedKFold, self).__init__( 35 | n_splits=n_splits, shuffle=False, random_state=None) 36 | if not isinstance(t1, pd.Series): 37 | raise ValueError('t1 must be pd.Series') 38 | self.t1 = t1 39 | self.pct_embargo = pct_embargo 40 | self.purging = purging 41 | self.num_threads = num_threads 42 | 43 | def split(self, X, y=None, groups=None): 44 | """Get train and test times stamps 45 | 46 | Params 47 | ------ 48 | X: pd.DataFrame 49 | y: pd.Series, optional 50 | 51 | Returns 52 | ------- 53 | train_indices, test_indices: np.array 54 | """ 55 | if (X.index == self.t1.index).sum() != len(self.t1): 56 | raise ValueError('X and t1 must have the same index') 57 | indices = np.arange(X.shape[0]) 58 | # Embargo width 59 | embg_size = int(X.shape[0] * self.pct_embargo) 60 | # Pandas is close set when using [t0:t1] 61 | test_ranges = [(i[0], i[-1] + 1) 62 | for i in np.array_split(indices, self.n_splits)] 63 | for st, end in test_ranges: 64 | test_indices = indices[st:end] 65 | t0 = self.t1.index[st] 66 | # Avoid look ahead leakage here 67 | train_indices = self.t1.index.searchsorted( 68 | self.t1[self.t1 <= t0].index) 69 | # Edge point of test set in the most recent side 70 | max_t1_idx = self.t1.index.searchsorted( 71 | self.t1[test_indices].max()) 72 | if max_t1_idx < X.shape[0]: 73 | # Adding indices after test set 74 | train_indices = np.concatenate( 75 | (train_indices, indices[max_t1_idx + embg_size:])) 76 | # Purging 77 | if self.purging: 78 | train_t1 = self.t1.iloc[train_indices] 79 | test_t1 = self.t1.iloc[test_indices] 80 | train_t1 = get_train_times( 81 | train_t1, test_t1, num_threads=self.num_threads) 82 | train_indices = self.t1.index.searchsorted(train_t1.index) 83 | yield train_indices, test_indices 84 | 85 | 86 | class CPKFold(object): 87 | """Cross Validation with purging and embargo 88 | 89 | Params 90 | ------ 91 | n_splits: tuple 92 | Combinatorial of (n_splits[0], n_splits[1]). 
n_splits[1] is the number of test. 93 | t1: pd.Series 94 | Index and value correspond to the begining and end of information 95 | pct_embargo: float, default 0 96 | The percentage of applying embargo 97 | purging: bool, default True 98 | If true, apply purging method 99 | num_threads: int, default 1 100 | The number of threads for purging 101 | """ 102 | 103 | def __init__(self, 104 | n_splits, 105 | t1=None, 106 | pct_embargo=0., 107 | purging=True, 108 | num_threads=1): 109 | if not isinstance(t1, pd.Series): 110 | raise ValueError('t1 must be pd.Series') 111 | self.n_splits = n_splits 112 | self.t1 = t1 113 | self.pct_embargo = pct_embargo 114 | self.purging = purging 115 | self.num_threads = num_threads 116 | 117 | def split(self, X, y=None, groups=None): 118 | """Get train and test times stamps 119 | 120 | Params 121 | ------ 122 | X: pd.DataFrame 123 | y: pd.Series, optional 124 | 125 | Returns 126 | ------- 127 | train_indices, test_indices: np.array 128 | """ 129 | if (X.index == self.t1.index).sum() != len(self.t1): 130 | raise ValueError('X and t1 must have the same index') 131 | indices = np.arange(X.shape[0]) 132 | # Embargo width 133 | embg_size = int(X.shape[0] * self.pct_embargo) 134 | # Generate Combinatorial Pairs for training 135 | split_indices = np.array_split(indices, self.n_splits[0]) 136 | self._split_locs = np.arange(self.n_splits[0]) 137 | self._test_loc = { 138 | i: X.index[idx] 139 | for i, idx in enumerate(split_indices) 140 | } 141 | self._test_combs = np.array( 142 | list(combinations(self._split_locs, self.n_splits[1]))) 143 | train_combs = [] 144 | for comb_idx in self._test_combs: 145 | train_comb = list(set(self._split_locs).difference(set(comb_idx))) 146 | train_combs.append(train_comb) 147 | 148 | train_indices_embg = [] 149 | train_indices = [] 150 | for comb_idx in train_combs: 151 | train_index_embg = [] 152 | train_index = [] 153 | for i in comb_idx: 154 | if i < self.n_splits[0] - 1: 155 | train_index_ = np.hstack( 156 | (split_indices[i], split_indices[i + 1][:embg_size])) 157 | train_index_embg.append(train_index_) 158 | train_index.append(split_indices[i]) 159 | else: 160 | train_index_embg.append(split_indices[i]) 161 | train_index.append(split_indices[i]) 162 | train_indices_embg.append( 163 | np.array(list(set(np.hstack(train_index_embg))))) 164 | train_indices.append(np.array(list(set(np.hstack(train_index))))) 165 | 166 | for train_index, train_index_embg in zip(train_indices, 167 | train_indices_embg): 168 | test_index = np.array( 169 | list(set(indices).difference(set(train_index)))) 170 | # Purging 171 | if self.purging: 172 | train_t1 = self.t1.iloc[train_index] 173 | test_t1 = self.t1.iloc[test_index] 174 | train_t1 = get_train_times( 175 | train_t1, test_t1, num_threads=self.num_threads) 176 | train_index = self.t1.index.searchsorted(train_t1.index) 177 | yield train_index, test_index 178 | 179 | def get_test_combs(self): 180 | return self._test_combs, self._test_loc 181 | 182 | 183 | def generate_signals(clf, 184 | X, 185 | y, 186 | sample_weight=None, 187 | n_splits=(4, 2), 188 | t1=None, 189 | pct_embargo=0., 190 | purging=True, 191 | num_threads=1, 192 | **kwargs): 193 | """Cross Validation with default purging and embargo 194 | 195 | Params 196 | ------ 197 | X: pd.DataFrame 198 | y: pd.Series, optional 199 | sample_weight: pd.Series, optional 200 | If specified, apply this to bot testing and training 201 | n_splits: tuple 202 | Combinatorial of (n_splits[0], n_splits[1]). n_splits[1] is the number of test. 
203 | t1: pd.Series 204 | Index and value correspond to the begining and end of information 205 | pct_embargo: float, default 0 206 | The percentage of applying embargo 207 | purging: bool, default True 208 | If true, apply purging method 209 | num_threads: int, default 1 210 | The number of threads for purging 211 | kwargs: Parameters for scoring function 212 | 213 | Returns 214 | ------- 215 | result: dict(list) 216 | Each element is signal generated from classifier 217 | test_times: timestamps 218 | """ 219 | cv_gen = CPKFold( 220 | n_splits=n_splits, 221 | t1=t1, 222 | pct_embargo=pct_embargo, 223 | purging=purging, 224 | num_threads=num_threads) 225 | signals = [] 226 | for train, test in cv_gen.split(X=X): 227 | train_params = dict() 228 | test_params = dict() 229 | # Sample weight is an optional parameter 230 | if sample_weight is not None: 231 | train_params['sample_weight'] = sample_weight.iloc[train].values 232 | test_params['sample_weight'] = sample_weight.iloc[test].values 233 | test_params.update(kwargs) 234 | clf_fit = clf.fit( 235 | X=X.iloc[train, :].values, y=y.iloc[train].values, **train_params) 236 | # Scoring 237 | signal = clf_fit.predict_proba(X.iloc[test, :].values) 238 | signal = pd.DataFrame(signal, index=X.iloc[test].index) 239 | signals.append(signal) 240 | 241 | combs = cv_gen.get_test_combs() 242 | result = defaultdict(list) 243 | test_times = combs[1] 244 | for signal, comb in zip(signals, combs[0]): 245 | for i in comb: 246 | result[i].append(signal.loc[test_times[i]]) 247 | return result, test_times -------------------------------------------------------------------------------- /finance_ml/labeling/barriers.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import multiprocessing as mp 6 | 7 | from ..multiprocessing import mp_pandas_obj 8 | from ..constants import LONG, SHORT 9 | 10 | 11 | def get_touch_idx(close, events, sltp, molecule=None): 12 | """Return timestamps of when data points touch the barriers 13 | 14 | Args: 15 | close (pd.Series): Close price series 16 | 17 | events (pd.DataFrame): With columns: 't1', 'trgt', and 'side' 18 | t1, time stamp of vertical barrier, could be np.nan 19 | trgt, unit of width of horizontal barriers 20 | side, Side label for metalabeling 21 | 22 | sltp (list): Coefficients of width of Stop Loss and Take Profit.\ 23 | sltp[0] and sltp[1] correspond to width of stop loss\ 24 | and take profit, respectively. If 0 or negative, the barrier\ 25 | is turned off. 
26 | 27 | molecule (list, optional): Subset of indices of events to be processed 28 | 29 | Returns: 30 | pd.DataFrame: each colum corresponds to the time to touch each barrier 31 | """ 32 | # Sample a subset with specific indices 33 | if molecule is not None: 34 | _events = events.loc[molecule] 35 | else: 36 | _events = events 37 | touch_idx = pd.DataFrame(index=_events.index) 38 | # Set Stop Loss and Take Profoit 39 | if sltp[0] > 0: 40 | sls = -sltp[0] * _events["trgt"] 41 | else: 42 | # Switch off stop loss 43 | sls = pd.Series(index=_events.index) 44 | if sltp[1] > 0: 45 | tps = sltp[1] * _events["trgt"] 46 | else: 47 | # Switch off profit taking 48 | tps = pd.Series(index=_events.index) 49 | # Replace undefined value with the last time index 50 | vertical_lines = _events["t1"].fillna(close.index[-1]) 51 | for loc, t1 in vertical_lines.iteritems(): 52 | df = close[loc:t1] 53 | # Change the direction depending on the side 54 | df = (df / close[loc] - 1) * _events.at[loc, 'side'] 55 | touch_idx.at[loc, 'sl'] = df[df < sls[loc]].index.min() 56 | touch_idx.at[loc, 'tp'] = df[df > tps[loc]].index.min() 57 | touch_idx['t1'] = _events['t1'].copy(deep=True) 58 | return touch_idx 59 | 60 | 61 | def get_events(close, timestamps, sltp=None, trgt=None, min_trgt=0, 62 | num_threads=1, t1=None, side=None): 63 | """Return DataFrame containing infomation defining barriers 64 | 65 | Args: 66 | close (pd.Series): Close price series 67 | 68 | timestamps (pd.DatetimeIndex): sampled points to analyze 69 | 70 | sltp (list or int, optional): Coefficients of width of Stop Loss and Take Profit.\ 71 | sltp[0] and sltp[1] correspond to width of stop loss\ 72 | and take profit, respectively. If 0 or negative, the barrier\ 73 | is turned off. If not specified, use only vertical line.\ 74 | 75 | trgt (pd.Series, optional): Time series of threashold.\ 76 | If not specified, we will switch off horizontal thresholds 77 | 78 | min_trgt (float, optional): Minimum value of threashold to label either of negative\ 79 | or positive. Defaults to 0. 80 | 81 | num_threads (int, optional): The number of threads to use.\ 82 | Defaults to 1. 83 | 84 | t1 (pd.Series, optional): Vertical lines\ 85 | 86 | side (pd.Series, optional): Side of trading positions 87 | 88 | Returns: 89 | pd.DataFrame: With the following keys: 90 | - t1, timestamp of labeled point 91 | - trgt, target threashold value 92 | - type, the type of labeled point, either of `t1`, `tp`, or `sl`. 
93 | - side, Only if you use metalabeling, this key is available 94 | """ 95 | if trgt is None: 96 | # Switch off horizontal barriers 97 | trgt = pd.Series(1 + min_trgt, index=timestamps) 98 | sltp = -1 99 | elif isinstance(trgt, numbers.Number): 100 | trgt = pd.Series(trgt, index=timestamps) 101 | # Get sampled target values 102 | trgt = trgt.loc[timestamps] 103 | trgt = trgt[trgt > min_trgt] 104 | if len(trgt) == 0: 105 | return pd.DataFrame(columns=['t1', 'trgt', 'side']) 106 | # Get time boundary t1 107 | if t1 is None: 108 | t1 = pd.Series(pd.NaT, index=timestamps) 109 | # slpt has to be either of integer, list or tuple 110 | if isinstance(sltp, list) or isinstance(sltp, tuple): 111 | _sltp = sltp[:2] 112 | else: 113 | _sltp = [sltp, sltp] 114 | # Define the side 115 | if side is None: 116 | # Default is LONG 117 | _side = pd.Series(LONG, index=trgt.index) 118 | else: 119 | _side = side.loc[trgt.index] 120 | events = pd.concat({'t1': t1, 'trgt': trgt, 'side': _side}, axis=1) 121 | events = events.dropna(subset=['trgt']) 122 | time_idx = mp_pandas_obj(func=get_touch_idx, 123 | pd_obj=('molecule', events.index), 124 | num_threads=num_threads, 125 | close=close, events=events, sltp=_sltp) 126 | # Skip when all of barrier are not touched 127 | time_idx = time_idx.dropna(how='all') 128 | events['type'] = time_idx.idxmin(axis=1) 129 | events['t1'] = time_idx.min(axis=1) 130 | if side is None: 131 | events = events.drop('side', axis=1) 132 | return events 133 | 134 | 135 | def get_t1(close, timestamps, seconds=None): 136 | """Return horizontal timestamps 137 | 138 | Note: 139 | Not include the case to hit the vertical line at the end of close.index 140 | 141 | Args: 142 | close (pd.Series) 143 | 144 | timestamps (pd.DatetimeIndex) 145 | 146 | seconds (int, optional): 147 | The number of forward dates or seconds for vertical barrier. 148 | 149 | Returns: 150 | pd.Series: Vertical barrier timestamps 151 | """ 152 | delta = pd.Timedelta(seconds=seconds) 153 | t1 = close.index.searchsorted(timestamps + delta) 154 | t1 = t1[t1 < close.shape[0]] 155 | t1 = pd.Series(close.index[t1], index=timestamps[:t1.shape[0]]) 156 | return t1 157 | 158 | 159 | def get_labels(close, events, min_ret=0, sign_label=True, zero_label=0): 160 | """Return label 161 | 162 | Args: 163 | close (pd.Series) 164 | 165 | events (pd.DataFrame): 166 | t1: time of barrier 167 | type: type of barrier - tp, sl, or t1 168 | trgt: horizontal barrier width 169 | side: position side 170 | 171 | min_ret (float): Minimum of absolute value for labeling non zero label. min_ret >=0 172 | 173 | sign_label (bool, opyionsl): If True, assign label for points touching vertical\ 174 | line accroing to return's sign. Defaults to True. 175 | 176 | zero_label (int, optional): 177 | If specified, use it for the label of zero value of return\ 178 | If not, get rid of samples. Defaults to 0. 179 | 180 | Returns: 181 | pd.DataFrame: With the following keys: 182 | - ret, return value for label 183 | - t1, timestamp of labeled point 184 | - label, label values 185 | - type, the type of labeled point, either of `t1`, `tp`, or `sl`. 
186 | - side, Only if you use metalabeling, this key is available 187 | """ 188 | # Prices algined with events 189 | events = events.dropna(subset=['t1']) 190 | # All used indices 191 | time_idx = events.index.union(events['t1'].values).drop_duplicates() 192 | close = close.reindex(time_idx, method='bfill') 193 | # Create out object 194 | out = pd.DataFrame(index=events.index) 195 | out['ret'] = close.loc[events['t1'].values].values / close.loc[ 196 | events.index] - 1. 197 | # Modify return according to the side 198 | if 'side' in events: 199 | out['ret'] *= events['side'] 200 | out['side'] = events['side'] 201 | # Assign labels 202 | out = out.dropna() 203 | out['label'] = np.sign(out['ret']) 204 | if 'side' in events: 205 | out.loc[out['ret'] <= min_ret, 'label'] = zero_label 206 | else: 207 | out.loc[(out['ret'] <= min_ret) & (out['ret'] >= -min_ret), 'label'] = zero_label 208 | if not sign_label: 209 | out['label'].loc[events['type'] == 't1'] = zero_label 210 | out['t1'] = events['t1'] 211 | out['type'] = events['type'] 212 | return out 213 | 214 | 215 | 216 | def get_barrier_labels(close, timestamps=None, trgt=None, sltp=[1, 1], 217 | seconds=None, min_trgt=0, min_ret=0, 218 | num_threads=None, side=None, sign_label=False, zero_label=0): 219 | """Return Labels for triple barrier 220 | 221 | Args: 222 | close (pd.Series) 223 | 224 | timestamps (pd.DatetimeIndex, optional): Sampled points to analyze.\ 225 | If not specified, use close.index 226 | 227 | trgt (pd.Series, optional): Time series of threshold.\ 228 | If not specified, it will switch off horizontal barriers 229 | 230 | sltp (list, optional): Coefficients of width of Stop Loss and Take Profit.\ 231 | sltp[0] and sltp[1] correspond to width of stop loss\ 232 | and take profit, respectively. If 0 or negative, the barrier\ 233 | is switched off. Defaults to [1, 1].\ 234 | 235 | seconds (float, optional): The length of vertical barrier. 236 | 237 | min_trgt (float, optional): Minimum value of threshold to label positive or negative.\ 238 | Deafults to 0. 239 | 240 | num_threads (int, optional): The number of threads to use. If not specified,\ 241 | use maximum number of threads. 242 | 243 | side (pd.Series, optional): Side of trading positions 244 | 245 | sign_label (bool, optional): If True, assign label for points touching vertical\ 246 | line according to return's sign. Defaults to True. 247 | 248 | zero_label (int, optional): The label for zero value of returns 249 | 250 | Returns: 251 | pd.DataFrame: With the following keys: 252 | - ret, return value for label 253 | - t1, timestamp of labeled point 254 | - label, label values 255 | - type, the type of labeled point, either of `t1`, `tp`, or `sl`. 
256 | - side, Only if you use metalabeling, this key is available 257 | """ 258 | if timestamps is None: 259 | if side is None: 260 | timestamps = close.index 261 | else: 262 | timestamps = side.index 263 | t1 = get_t1(close, timestamps, seconds=seconds) 264 | if num_threads is None: 265 | num_threads = mp.cpu_count() 266 | events = get_events(close, timestamps, 267 | sltp=sltp, 268 | trgt=trgt, 269 | min_trgt=min_trgt, 270 | num_threads=num_threads, 271 | t1=t1, side=side) 272 | labels = get_labels(close, events, min_ret=min_ret, sign_label=sign_label, zero_label=zero_label) 273 | return labels -------------------------------------------------------------------------------- /finance_ml/importance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sklearn.model_selection import KFold 5 | from sklearn.metrics import log_loss, mean_squared_error 6 | 7 | from .model_selection import PurgedKFold, cv_score, evaluate 8 | 9 | 10 | def mp_feat_imp_SFI(clf, X, y, feat_names, sample_weight=None, scoring='neg_log_loss', 11 | n_splits=3, t1=None, cv_gen=None, pct_embargo=0, purging=True): 12 | imp = pd.DataFrame(columns=['mean', 'std']) 13 | for feat_name in feat_names: 14 | scores = cv_score(clf, X=X[[feat_name]], y=y, 15 | sample_weight=sample_weight, 16 | scoring=scoring, 17 | cv_gen=cv_gen, 18 | n_splits=n_splits, 19 | t1=t1, 20 | pct_embargo=pct_embargo, 21 | purging=purging) 22 | imp.loc[feat_name, 'mean'] = scores.mean() 23 | imp.loc[feat_name, 'std'] = scores.std() * scores.shape[0] ** -0.5 24 | return imp 25 | 26 | 27 | def feat_imp_SFI(clf, X, y, sample_weight=None, scoring='neg_log_loss', 28 | n_splits=5, t1=None, cv_gen=None, pct_embargo=0, purging=True, num_threads=1): 29 | """Calculate Single Feature Importance 30 | 31 | Args: 32 | clf: Classifier instance 33 | X: pd.DataFrame, Input feature 34 | y: pd.Series, Label 35 | clstrs: dict[list] 36 | Clustering labels: key is the name of cluster and value is list of belonging columns 37 | sample_weight: pd.Series, optional 38 | If specified, apply this to testing and training 39 | scoring: str, default 'neg_log_loss' 40 | The name of scoring methods. 'f1', 'accuracy' or 'neg_log_loss' 41 | n_splits: int, default 3 42 | The number of splits for cross validation 43 | t1: pd.Series 44 | Index and value correspond to the begining and end of information. 
It is required for purging and embargo 45 | cv_gen: KFold instance 46 | If not specified, use PurgedKfold 47 | pct_embargo: float, default 0 48 | The percentage of applying embargo 49 | purging: bool, default True 50 | If true, apply purging method 51 | num_threads: int, default 1 52 | The number of threads for purging 53 | 54 | Returns: 55 | pd.DataFrame: Importance means and standard deviations 56 | - mean: Mean of importance 57 | - std: Standard deviation of importance 58 | """ 59 | imp = mp_pandas_obj(mp_feat_imp_SFI, ('feat_names', X.columns), 60 | num_threads, clf=clf, X=X, y=y, sample_weight=sample_weight, 61 | scoring=scoring, n_splits=n_splits, t1=t1, cv_gen=cv_gen, 62 | pct_embargo=pct_embargo, purging=purging) 63 | return imp 64 | 65 | 66 | def feat_imp_MDI(fit, feat_names): 67 | """Compute Mean Decrease Impurity 68 | 69 | Args: 70 | forest (Forest Classifier instance) 71 | feat_names (list(str)): List of names of features 72 | 73 | Returns: 74 | pd.DataFrame: Importance means and standard deviations 75 | - mean: Mean of importance 76 | - std: Standard deviation of importance 77 | """ 78 | df0 = {i: tree.feature_importances_ for i, tree in enumerate(fit.estimators_)} 79 | df0 = pd.DataFrame.from_dict(df0, orient='index') 80 | df0.columns = feat_names 81 | df0 = df0.replace(0, np.nan) 82 | imp = pd.concat({"mean": df0.mean(), "std": df0.std() * (df0.shape[0] ** -0.5)}, axis=1) 83 | imp /= imp["mean"].sum() 84 | return imp 85 | 86 | 87 | def feat_imp_MDA(clf, X, y, sample_weight=None, scoring='neg_log_loss', n_splits=5, t1=None, 88 | cv_gen=None, pct_embargo=0, purging=True, num_threads=1): 89 | """Calculate Mean Decrease Accuracy 90 | 91 | Note: 92 | You can use any classifier to estimate importance 93 | 94 | Args: 95 | clf: Classifier instance 96 | X: pd.DataFrame, Input feature 97 | y: pd.Series, Label 98 | sample_weight: pd.Series, optional 99 | If specified, apply this to testing and training 100 | scoring: str, default 'neg_log_loss' 101 | The name of scoring methods. 'f1', 'accuracy' or 'neg_log_loss' 102 | n_splits: int, default 3 103 | The number of splits for cross validation 104 | t1: pd.Series 105 | Index and value correspond to the begining and end of information. 
        cv_gen: KFold instance
            If not specified, use PurgedKFold
        pct_embargo: float, default 0
            The percentage of applying embargo
        purging: bool, default True
            If True, apply purging method
        num_threads: int, default 1
            The number of threads for purging

    Returns:
        pd.DataFrame: Importance means and standard deviations
            - mean: Mean of importance
            - std: Standard deviation of importance
    """

    if cv_gen is None:
        if t1 is not None:
            cv_gen = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo,
                                 purging=purging, num_threads=num_threads)
        else:
            cv_gen = KFold(n_splits=n_splits)
    index = np.arange(n_splits)
    scores = pd.Series(index=index, dtype=float)
    scores_perm = pd.DataFrame(index=index, columns=X.columns)
    for idx, (train, test) in zip(index, cv_gen.split(X=X)):
        X_train = X.iloc[train]
        y_train = y.iloc[train]
        if sample_weight is not None:
            w_train = sample_weight.iloc[train].values
        else:
            w_train = None
        X_test = X.iloc[test]
        y_test = y.iloc[test]
        if sample_weight is not None:
            w_test = sample_weight.iloc[test].values
        else:
            w_test = None
        clf_fit = clf.fit(X_train, y_train, sample_weight=w_train)
        scores.loc[idx] = evaluate(clf_fit, X_test, y_test, scoring,
                                   sample_weight=w_test)

        for col in X.columns:
            X_test_ = X_test.copy(deep=True)
            # Shuffle a single feature so it carries no information
            np.random.shuffle(X_test_[col].values)
            scores_perm.loc[idx, col] = evaluate(clf_fit, X_test_, y_test, scoring,
                                                 sample_weight=w_test)
    # (original score) - (permuted score)
    imprv = (-scores_perm).add(scores, axis=0)
    # Normalize by the maximum achievable improvement
    if scoring == 'neg_log_loss':
        max_imprv = -scores_perm
    else:
        max_imprv = 1. - scores_perm
    imp = imprv / max_imprv
    return pd.concat({"mean": imp.mean(), "std": imp.std() * (imp.shape[0] ** -0.5)}, axis=1)


def group_mean_std(df0, clstrs):
    # Aggregate per-tree importances over each cluster of columns
    out = pd.DataFrame(columns=['mean', 'std'])
    for key, elements in clstrs.items():
        df1 = df0[elements].sum(axis=1)
        out.loc[f"C_{key}", 'mean'] = df1.mean()
        out.loc[f"C_{key}", 'std'] = df1.std() * df1.shape[0] ** -.5
    return out


def feat_imp_MDI_clustered(fit, feat_names, clstrs):
    """Compute Clustered Mean Decrease Impurity

    Args:
        fit: Fitted tree-ensemble classifier instance (e.g. a random forest)
        feat_names (list(str)): List of names of features
        clstrs: dict[list]
            Clustering labels: key is the name of the cluster and value is the list of member columns

    Returns:
        pd.DataFrame: Importance means and standard deviations
            - mean: Mean of importance
            - std: Standard deviation of importance
    """
    df0 = {i: tree.feature_importances_ for i, tree in enumerate(fit.estimators_)}
    df0 = pd.DataFrame.from_dict(df0, orient='index')
    df0.columns = feat_names
    df0 = df0.replace(0, np.nan)  # because max_features=1
    imp = group_mean_std(df0, clstrs)
    imp /= imp['mean'].sum()
    return imp


def feat_imp_MDA_clustered(clf, X, y, clstrs,
                           sample_weight=None,
                           scoring='neg_log_loss',
                           n_splits=5, t1=None,
                           cv_gen=None, pct_embargo=0,
                           purging=True, num_threads=1):
    """Calculate Clustered Mean Decrease Accuracy

    Note:
        You can use any classifier to estimate importance

    Args:
        clf: Classifier instance
        X: pd.DataFrame, Input feature
        y: pd.Series, Label
        clstrs: dict[list]
            Clustering labels: key is the name of the cluster and value is the list of member columns
        sample_weight: pd.Series, optional
            If specified, apply this to testing and training
        scoring: str, default 'neg_log_loss'
            The name of the scoring method: 'f1', 'accuracy' or 'neg_log_loss'
        n_splits: int, default 5
            The number of splits for cross validation
        t1: pd.Series
            Index and value correspond to the beginning and end of information.
            It is required for purging and embargo
        cv_gen: KFold instance
            If not specified, use PurgedKFold
        pct_embargo: float, default 0
            The percentage of applying embargo
        purging: bool, default True
            If True, apply purging method
        num_threads: int, default 1
            The number of threads for purging

    Returns:
        pd.DataFrame: Importance means and standard deviations
            - mean: Mean of importance
            - std: Standard deviation of importance
    """

    if cv_gen is None:
        if t1 is not None:
            cv_gen = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo,
                                 purging=purging, num_threads=num_threads)
        else:
            cv_gen = KFold(n_splits=n_splits)
    index = np.arange(n_splits)
    scores = pd.Series(index=index, dtype=float)
    scores_perm = pd.DataFrame(index=index, columns=clstrs.keys())
    for idx, (train, test) in zip(index, cv_gen.split(X=X)):
        X_train = X.iloc[train]
        y_train = y.iloc[train]
        if sample_weight is not None:
            w_train = sample_weight.iloc[train].values
        else:
            w_train = None
        X_test = X.iloc[test]
        y_test = y.iloc[test]
        if sample_weight is not None:
            w_test = sample_weight.iloc[test].values
        else:
            w_test = None
        clf_fit = clf.fit(X_train, y_train, sample_weight=w_train)
        scores.loc[idx] = evaluate(clf_fit, X_test, y_test, scoring,
                                   sample_weight=w_test)

        for clstr_name in clstrs.keys():
            X_test_ = X_test.copy(deep=True)
            # Shuffle every column in the cluster so the whole cluster carries no information
            for k in clstrs[clstr_name]:
                np.random.shuffle(X_test_[k].values)
            scores_perm.loc[idx, clstr_name] = evaluate(clf_fit, X_test_, y_test,
                                                        scoring, sample_weight=w_test)
    # (original score) - (permuted score)
    imprv = (-scores_perm).add(scores, axis=0)
    # Normalize by the maximum achievable improvement
    if scoring == 'neg_log_loss':
        max_imprv = -scores_perm
    else:
        max_imprv = 1. - scores_perm
    imp = imprv / max_imprv
    imp = pd.concat({'mean': imp.mean(), 'std': imp.std() * imp.shape[0] ** -0.5}, axis=1)
    imp.index = [f"C_{i}" for i in imp.index]
    return imp
--------------------------------------------------------------------------------
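
Usage sketch (not part of the repository): a minimal, hedged example of driving the helpers in importance.py. It assumes scikit-learn is installed; the synthetic make_classification data, the RandomForestClassifier settings, the feat_i column names, and the arbitrary two-cluster split are all illustrative assumptions, not the library's prescribed workflow. The import path follows the repository layout shown above.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from finance_ml.importance import (feat_imp_MDI, feat_imp_MDA,
                                   feat_imp_MDI_clustered, feat_imp_MDA_clustered)

# Synthetic classification problem with named feature columns (illustrative only)
X_arr, y_arr = make_classification(n_samples=500, n_features=8,
                                   n_informative=4, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"feat_{i}" for i in range(X_arr.shape[1])])
y = pd.Series(y_arr)

# MDI works on an already-fitted tree ensemble; max_features=1 matches the
# assumption behind replacing zero importances with NaN
forest = RandomForestClassifier(n_estimators=100, max_features=1,
                                random_state=0).fit(X, y)
mdi = feat_imp_MDI(forest, feat_names=X.columns)
print(mdi.sort_values('mean', ascending=False))

# MDA refits the classifier inside cross validation; with t1=None it falls
# back to plain KFold, so no purging or embargo is applied in this toy case
mda = feat_imp_MDA(RandomForestClassifier(n_estimators=100, random_state=0),
                   X, y, scoring='accuracy', n_splits=5)
print(mda.sort_values('mean', ascending=False))

# Clustered variants take a dict mapping a cluster name to its member columns;
# this two-way split is arbitrary and purely for illustration
clstrs = {0: [f"feat_{i}" for i in range(4)],
          1: [f"feat_{i}" for i in range(4, 8)]}
print(feat_imp_MDI_clustered(forest, feat_names=X.columns, clstrs=clstrs))
print(feat_imp_MDA_clustered(RandomForestClassifier(n_estimators=100, random_state=0),
                             X, y, clstrs, scoring='accuracy', n_splits=5))

Note that purging and embargo only engage when t1 (event end times) is supplied, in which case both MDA helpers construct a PurgedKFold instead of scikit-learn's plain KFold.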