├── quant.jpg ├── End_of_Script.mp3 ├── modules ├── __init__.py ├── optimizer.py ├── risk_model.py ├── utils_s.py ├── factorize.py ├── feature_weights.py └── account.py ├── .gitignore ├── output └── optimal_weights_regularized.csv ├── LICENSE ├── README.md └── alpha_research.ipynb /quant.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keyvantaj/Quantitative/HEAD/quant.jpg -------------------------------------------------------------------------------- /End_of_Script.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/keyvantaj/Quantitative/HEAD/End_of_Script.mp3 -------------------------------------------------------------------------------- /modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimizer import OptimalHoldingsRegularization 2 | from .account import AccountManagement 3 | from .feature_weights import Learner 4 | from .risk_model import RiskManagement 5 | from .factorize import FactorManagement 6 | from .utils_s import Util -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | secret_key.txt 4 | data/ 5 | Stock_Basic_Statistics.ipynb 6 | sa_alpha_research.ipynb 7 | account_ids.txt 8 | feature_weights_research.ipynb 9 | alpha_research_bis.ipynb 10 | xgboost_feature_weights_research.ipynb 11 | test_unit/ 12 | ml_binary_classifier/ 13 | -------------------------------------------------------------------------------- /output/optimal_weights_regularized.csv: -------------------------------------------------------------------------------- 1 | asset,optimal_weights 2 | ABT,-0.05559935271460593 3 | ADBE,1.0832878961941056e-07 4 | ALXN,-0.04639549743620592 5 | ANET,-0.0075108010833871515 6 | ATVI,0.03684950700657413 7 | AYX,0.008458496465202412 8 | BMRN,2.9112691538854106e-08 9 | COF,-0.003174570534715477 10 | CTXS,0.08546509965309716 11 | DGX,-6.860808310964508e-09 12 | FAST,1.321257772051097e-08 13 | FTNT,0.10863169997235622 14 | GLW,-1.067262237337786e-08 15 | ILMN,0.005335203379046998 16 | INCY,1.4033489504143296e-08 17 | LH,-1.3187581249270227e-08 18 | LMT,0.10300616702004722 19 | LULU,0.029993615133875027 20 | MXIM,-0.09999999750319191 21 | NOC,0.05418114216093162 22 | PAYX,-7.247215980599917e-09 23 | PSA,-0.09999999791350304 24 | SPLK,0.026081529006246936 25 | SYK,-0.00435943680227242 26 | TTD,-0.0285204681381319 27 | TXN,-0.054439716071083916 28 | ZEN,-0.09999999794328515 29 | ZNGA,0.04199722994476514 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Keyvan Tajbakhsh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /modules/optimizer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from datetime import datetime 4 | from abc import ABC, abstractmethod 5 | import scipy.stats as stats 6 | import cvxpy as cvx 7 | 8 | 9 | class AbstractOptimalHoldings(ABC): 10 | @abstractmethod 11 | def _get_obj(self, weights, alpha_vector): 12 | raise NotImplementedError() 13 | 14 | @abstractmethod 15 | def _get_constraints(self, weights, factor_betas, risk): 16 | raise NotImplementedError() 17 | 18 | @staticmethod 19 | def _get_risk(weights, factor_betas, alpha_vector_index, factor_cov_matrix, idiosyncratic_var_vector): 20 | f = factor_betas.loc[alpha_vector_index].values.T * weights 21 | X = factor_cov_matrix 22 | S = np.diag(idiosyncratic_var_vector.loc[alpha_vector_index].values.flatten()) 23 | 24 | return cvx.quad_form(f, X) + cvx.quad_form(weights, S) 25 | 26 | def find(self, alpha_vector, factor_betas, factor_cov_matrix, idiosyncratic_var_vector): 27 | weights = cvx.Variable(len(alpha_vector)) 28 | risk = self._get_risk(weights, factor_betas, alpha_vector.index, factor_cov_matrix, idiosyncratic_var_vector) 29 | 30 | obj = self._get_obj(weights, alpha_vector) 31 | 32 | constraints = self._get_constraints(weights, factor_betas.loc[alpha_vector.index].values, risk) 33 | 34 | prob = cvx.Problem(obj, constraints) 35 | prob.solve(max_iters=500) 36 | 37 | optimal_weights = np.asarray(weights.value) 38 | 39 | return pd.DataFrame(data=optimal_weights, index=alpha_vector.index, columns=['optimal_weights']) 40 | 41 | 42 | class OptimalHoldings(AbstractOptimalHoldings): 43 | 44 | def __init__(self, risk_cap=0.05, factor_max=10.0, factor_min=-10.0, weights_max=0.5, weights_min=-0.5): 45 | self.risk_cap = risk_cap 46 | self.factor_max = factor_max 47 | self.factor_min = factor_min 48 | self.weights_max = weights_max 49 | self.weights_min = weights_min 50 | 51 | def _get_obj(self, weights, alpha_vector): 52 | assert (len(alpha_vector.columns) == 1) 53 | 54 | objective = cvx.Maximize(weights * alpha_vector) 55 | 56 | return objective 57 | 58 | def _get_constraints(self, weights, factor_betas, risk): 59 | assert (len(factor_betas.shape) == 2) 60 | 61 | Constraints = [ 62 | risk <= self.risk_cap ** 2, 63 | factor_betas.T * weights <= self.factor_max, 64 | factor_betas.T * weights >= self.factor_min, 65 | cvx.sum(weights.T) == 0, 66 | cvx.sum(cvx.abs(weights)) <= 1.0, 67 | weights >= self.weights_min, 68 | weights <= self.weights_max 69 | ] 70 | 71 | return Constraints 72 | 73 | 74 | class OptimalHoldingsRegularization(OptimalHoldings): 75 | 76 | def __init__(self, lambda_reg=0.5, risk_cap=0.05, factor_max=10.0, factor_min=-10.0, weights_max=0.2, 77 | weights_min=-0.2): 78 | super().__init__(risk_cap, factor_max, factor_min, weights_max, weights_min) 79 | self.lambda_reg = lambda_reg 80 | self.risk_cap = risk_cap 81 | self.factor_max = factor_max 82 | 
self.factor_min = factor_min 83 | self.weights_max = weights_max 84 | self.weights_min = weights_min 85 | 86 | def _get_obj(self, weights, alpha_vector): 87 | """ 88 | Parameters 89 | ---------- 90 | weights : CVXPY Variable 91 | Portfolio weights 92 | alpha_vector : DataFrame 93 | Alpha vector 94 | 95 | Returns 96 | ------- 97 | objective : CVXPY Objective 98 | Objective function 99 | """ 100 | assert (len(alpha_vector.columns) == 1) 101 | 102 | objective = cvx.Maximize(weights * alpha_vector - self.lambda_reg * cvx.norm(weights, p=2, axis=None)) 103 | 104 | return objective 105 | -------------------------------------------------------------------------------- /modules/risk_model.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | from datetime import datetime 6 | from sklearn.decomposition import PCA 7 | from abc import ABC, abstractmethod 8 | import scipy.stats as stats 9 | import cvxpy as cvx 10 | 11 | 12 | class RiskManagement: 13 | 14 | def __init__(self): 15 | 16 | pass 17 | 18 | @staticmethod 19 | def fit_pca(close, num_factor_exposures, svd_solver): 20 | 21 | returns = close.apply(lambda x: (x - x.shift(1)) / x).iloc[1:, :].fillna(0) 22 | pca = PCA(n_components=num_factor_exposures, svd_solver=svd_solver) 23 | pca.fit(returns) 24 | 25 | return pca 26 | 27 | @staticmethod 28 | def factor_betas(pca, returns): 29 | 30 | factor_beta_columns = pca.components_.shape[0] 31 | factor_betas = pd.DataFrame(pca.components_.T, returns.columns.values, np.arange(factor_beta_columns)) 32 | return factor_betas 33 | 34 | @staticmethod 35 | def factor_returns(pca, returns, factor_return_indices, factor_return_columns): 36 | 37 | factor_returns = pd.DataFrame(pca.transform(returns), factor_return_indices, factor_return_columns) 38 | return factor_returns 39 | 40 | @staticmethod 41 | def factor_cov_matrix(factor_returns, ann_factor): 42 | 43 | annualized_factor_covariance_matrix = np.diag(factor_returns.var(axis=0, ddof=1) * ann_factor) 44 | 45 | return annualized_factor_covariance_matrix 46 | 47 | @staticmethod 48 | def idiosyncratic_var_matrix(returns, factor_returns, factor_betas, ann_factor): 49 | 50 | common_returns_ = pd.DataFrame(np.dot(factor_returns, factor_betas.T), returns.index, returns.columns) 51 | 52 | residuals_ = (returns - common_returns_) 53 | specific_risk_matrix = pd.DataFrame(np.diag(np.var(residuals_)) * ann_factor, returns.columns, returns.columns) 54 | return specific_risk_matrix 55 | 56 | @staticmethod 57 | def idiosyncratic_var_vector(returns, idiosyncratic_var_matrix): 58 | 59 | idiosyncratic_var_vector = pd.DataFrame(np.diag(idiosyncratic_var_matrix), index=returns.columns) 60 | return idiosyncratic_var_vector 61 | 62 | @staticmethod 63 | def predict_portfolio_risk(factor_betas, factor_cov_matrix, idiosyncratic_var_matrix, weights): 64 | 65 | K = factor_betas.dot(factor_cov_matrix).dot(factor_betas.T) + idiosyncratic_var_matrix 66 | 67 | predicted_portfolio_risk = np.sqrt(weights.T.dot(K).dot(weights)) 68 | 69 | return predicted_portfolio_risk.values[0][0] 70 | 71 | def portfolio_risk(self, close, num_factor_exposures, weights): 72 | 73 | try: 74 | close.index = pd.to_datetime(close.index, format='%Y%m%d') 75 | except: 76 | pass 77 | 78 | returns = close.apply(lambda x: (x - x.shift(1)) / x).iloc[1:, :].fillna(0) 79 | 80 | pca = self.fit_pca(close=close, num_factor_exposures=num_factor_exposures, svd_solver='full') 81 | 82 | 
plt.title('Explained Variance') 83 | plt.bar(np.arange(num_factor_exposures), pca.explained_variance_ratio_) 84 | plt.grid(alpha=0.5) 85 | 86 | Risk_Model = {'factor_betas': self.factor_betas(pca, returns), 'factor_returns': self.factor_returns( 87 | pca, 88 | returns, 89 | returns.index, 90 | np.arange(num_factor_exposures))} 91 | 92 | Risk_Model['factor_returns'].cumsum().plot(legend=None) 93 | plt.grid(alpha=0.5) 94 | 95 | ann_factor = 252 96 | Risk_Model['factor_cov_matrix'] = self.factor_cov_matrix(Risk_Model['factor_returns'], ann_factor) 97 | 98 | Risk_Model['idiosyncratic_var_matrix'] = self.idiosyncratic_var_matrix(returns, Risk_Model['factor_returns'], 99 | Risk_Model['factor_betas'], ann_factor) 100 | 101 | Risk_Model['idiosyncratic_var_vector'] = self.idiosyncratic_var_vector(returns, 102 | Risk_Model['idiosyncratic_var_matrix']) 103 | 104 | predicted_portfolio_risk = self.predict_portfolio_risk( 105 | Risk_Model['factor_betas'], 106 | Risk_Model['factor_cov_matrix'], 107 | Risk_Model['idiosyncratic_var_matrix'], 108 | weights) 109 | 110 | return predicted_portfolio_risk, Risk_Model 111 | 112 | @staticmethod 113 | def portfolio_calculation(portfolio): 114 | 115 | long = portfolio[portfolio['Position'] > 0] 116 | short = portfolio[portfolio['Position'] < 0] 117 | long_value = long['marketValue'].sum() 118 | short_value = short['marketValue'].sum() 119 | grv = long_value + (short_value * -1) 120 | 121 | weights = [] 122 | for tick in portfolio.index: 123 | weights.append(np.round(portfolio.loc[tick, 'marketValue'] / grv, 4)) 124 | 125 | all_weights = pd.DataFrame(weights, portfolio.index, columns=['weights']) 126 | 127 | return all_weights, long, short, grv 128 | -------------------------------------------------------------------------------- /modules/utils_s.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | from bs4 import BeautifulSoup 6 | from datetime import datetime 7 | from time import sleep, strftime, localtime, time 8 | 9 | sleeptime = 2 10 | 11 | 12 | # PREPROCESSING 13 | 14 | class Util: 15 | 16 | @staticmethod 17 | def cleaning_dataframe(df, pernan_to_drop): 18 | print('cleaning data') 19 | 20 | df_cleaned = df.apply(pd.to_numeric, errors='coerce') 21 | subset = df_cleaned.select_dtypes(exclude=[np.float, np.int]) 22 | col_len = len(subset.columns) 23 | if col_len != 0: 24 | df_cleaned.drop(subset.columns, axis=1, inplace=True) 25 | else: 26 | print('columns are clean') 27 | pass 28 | # df_cleaned.replace([0,-1], np.nan, inplace=True) 29 | 30 | total = df_cleaned.isna().sum().sort_values(ascending=False) 31 | percent = (df_cleaned.isna().sum() / df_cleaned.isna().count()).sort_values(ascending=False) 32 | nan_df_cleaned = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) 33 | 34 | f, ax = plt.subplots(figsize=(20, 10)) 35 | 36 | for tick in ax.xaxis.get_major_ticks(): 37 | tick.label.set_fontsize(5) 38 | plt.xticks(rotation='90') 39 | 40 | sns.barplot(x=nan_df_cleaned.index, y=nan_df_cleaned['Percent']) 41 | plt.xlabel('Features', fontsize=15) 42 | plt.ylabel('Percent of nan values', fontsize=15) 43 | plt.title('Percent nan data by feature dataset', fontsize=15) 44 | plt.grid() 45 | plt.show() 46 | 47 | features_to_drop = nan_df_cleaned[nan_df_cleaned['Percent'] > pernan_to_drop].index 48 | cleaned_dropna = df_cleaned.drop(features_to_drop, axis=1) 49 | cleaned_df_all = 
cleaned_dropna.fillna(cleaned_dropna.mean(level=1))
50 | 
51 | print('The percentage of dropped columns is {}%.'.format(
52 | int(((df_cleaned.shape[1] - cleaned_dropna.shape[1]) / df_cleaned.shape[1]) * 100)))
53 | print('Dropped {} columns out of {}'.format(len(df.columns) - len(cleaned_df_all.columns), len(df.columns)))
54 | 
55 | return cleaned_df_all
56 | 
57 | @staticmethod
58 | def q_indexing(quantile_to_analyse, df):
59 | 
60 | # noinspection PyBroadException
61 | try:
62 | q_list = []
63 | for i in quantile_to_analyse:
64 | q_list.append((df['quantile'] == i))
65 | 
66 | df_merge = q_list[0]
67 | for d in q_list[1:]:
68 | df_merge = df_merge ^ d
69 | 
70 | q_final_vector = df[df_merge]
71 | 
72 | except:
73 | 
74 | print('no specific quantile selected')
75 | q_final_vector = df
76 | 
77 | return q_final_vector
78 | 
79 | @staticmethod
80 | def quantilize(quantile_portions, df, weights_col, q_col, sec_col, sec_df):
81 | 
82 | quantile_optimal_stacked = pd.DataFrame(index=df.index,
83 | columns=[weights_col, q_col, sec_col])
84 | quantiles = np.linspace(0, 1, quantile_portions + 1)
85 | labels = [i + 1 for i in range(len(quantiles) - 1)]
86 | 
87 | for date in df.index.levels[0]:
88 | x = df[weights_col].loc[date, :]
89 | 
90 | quantile_optimal_stacked.loc[date, q_col] = pd.qcut(x, quantiles,
91 | labels=labels)
92 | 
93 | quantile_optimal_stacked.loc[:, weights_col] = df[weights_col]
94 | quantile_optimal_stacked.loc[:, sec_col] = sec_df[sec_col]
95 | 
96 | return quantile_optimal_stacked
97 | 
98 | @staticmethod
99 | def select_sector(df, drop_long_sec, drop_short_sec, sec_col, factor_col, labels):
100 | # labels: the quantile labels produced by quantilize (labels[-1] = top, labels[0] = bottom)
101 | try:
102 | drop_rows_list = []
103 | for i in drop_long_sec:
104 | drop_rows_list.append((df.sector == i) & (df[factor_col] == labels[-1]))
105 | 
106 | for i in drop_short_sec:
107 | drop_rows_list.append((df.sector == i) & (df[factor_col] == labels[0]))
108 | 
109 | if len(drop_rows_list) == 1:
110 | df_merge = drop_rows_list[0]
111 | else:
112 | df_merge = drop_rows_list[0]
113 | 
114 | for d in drop_rows_list[1:]:
115 | df_merge = df_merge ^ d
116 | 
117 | final_vector = df[~df_merge]
118 | sectors = final_vector[sec_col]
119 | 
120 | except Exception:
121 | final_vector = df
122 | sectors = final_vector[sec_col]
123 | 
124 | return final_vector, sectors
125 | 
126 | @staticmethod
127 | def rebalancing_to_leverage(df, percent_long_leverage_target, percent_short_leverage_target):
128 | 
129 | try:
130 | for date in df.index.levels[0]:
131 | long_balance = np.abs(df.loc[date, 'optimal_weights'][df.loc[date, 'optimal_weights'] > 0].sum())
132 | short_balance = np.abs(df.loc[date, 'optimal_weights'][df.loc[date, 'optimal_weights'] < 0].sum())
133 | 
134 | long_ratio = percent_long_leverage_target / long_balance
135 | short_ratio = percent_short_leverage_target / short_balance
136 | 
137 | df.loc[date, 'optimal_weights'][df.loc[date, 'optimal_weights'] > 0] = df.loc[date, 'optimal_weights'][
138 | df.loc[
139 | date, 'optimal_weights'] > 0] * long_ratio
140 | df.loc[date, 'optimal_weights'][df.loc[date, 'optimal_weights'] < 0] = df.loc[date, 'optimal_weights'][
141 | df.loc[
142 | date, 'optimal_weights'] < 0] * short_ratio
143 | 
144 | except Exception:
145 | pass
146 | 
147 | return df
148 | -------------------------------------------------------------------------------- /modules/factorize.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt
2 | import seaborn as sns
3 | import pandas as pd
4 | import numpy as np
5 | import talib
6 | from datetime import datetime
7 | from sklearn import preprocessing 8 | import statsmodels.api as sm 9 | import statsmodels.formula.api as smf 10 | from statsmodels.regression.rolling import RollingOLS 11 | 12 | 13 | # from sklearn.impute import SimpleImputer 14 | # imp = SimpleImputer(missing_values=[np.nan, np.inf, -np.inf], strategy='most_frequent') 15 | 16 | class FactorManagement: 17 | 18 | def __init__(self): 19 | 20 | pass 21 | 22 | def momentum(self, close, window_length): 23 | 24 | momentum = self.log_Returns(close, window_length) 25 | momentum_drz = pd.DataFrame(data=preprocessing.scale(momentum), 26 | index=momentum.index, 27 | columns=momentum.columns) 28 | return momentum_drz 29 | 30 | @staticmethod 31 | def smooth(factor, window_length): 32 | 33 | smooth_factor = factor.rolling(window=window_length).mean().iloc[(window_length - 1):, :].fillna(0) 34 | 35 | return smooth_factor 36 | 37 | @staticmethod 38 | def returns(close, window_length): 39 | 40 | returns = close.pct_change(window_length).fillna(0) 41 | 42 | return returns 43 | 44 | @staticmethod 45 | def log_Returns(close, window_length): 46 | 47 | returns = (np.log(close / close.shift(window_length)).iloc[(window_length - 1):, :]).fillna(0) 48 | 49 | return returns 50 | 51 | @staticmethod 52 | def volatility(close, window_length, trailing_window): 53 | 54 | vol = close.pct_change().rolling(window_length).std(ddof=0).rolling(trailing_window).sum() 55 | 56 | vol_drz = pd.DataFrame(data=preprocessing.scale(vol), 57 | index=vol.index, 58 | columns=vol.columns) 59 | 60 | return vol_drz 61 | 62 | @staticmethod 63 | def overnight_sentiment(close, openn, window_length, trailing_window): 64 | 65 | return_over = pd.DataFrame(index=close.index, columns=close.columns) 66 | close_shifted = close.apply(lambda x: x.shift(window_length)) 67 | 68 | for date in close.index: 69 | return_over.loc[date] = (openn.loc[date] - close_shifted.loc[date]) / close_shifted.loc[date] 70 | 71 | overnight_sentiment = return_over.rolling(trailing_window).sum() 72 | 73 | overnight_sentiment_drz = pd.DataFrame(data=preprocessing.scale(overnight_sentiment), 74 | index=overnight_sentiment.index, 75 | columns=overnight_sentiment.columns) 76 | 77 | return overnight_sentiment_drz 78 | 79 | @staticmethod 80 | def direction(close, openn, trailing_window): 81 | 82 | p = ((close - openn) / close) * -1 83 | 84 | p.replace([np.inf, -np.inf], np.nan, inplace=True) 85 | rolling_p = p.rolling(trailing_window).sum() 86 | 87 | direction_scaled = pd.DataFrame(data=preprocessing.scale(rolling_p), 88 | index=rolling_p.index, 89 | columns=rolling_p.columns) 90 | 91 | return direction_scaled 92 | 93 | @staticmethod 94 | def sma(close, window_length): 95 | 96 | df = pd.DataFrame(index=close.index) 97 | 98 | try: 99 | for tick in close.columns: 100 | df[tick] = talib.SMA(close[tick].values, timeperiod=window_length) 101 | except: 102 | pass 103 | 104 | sma_min = ((close - df) / df) * -1 105 | return sma_min 106 | 107 | @staticmethod 108 | def sentiment(close, high, low, sent, trailing_window, universe): 109 | 110 | indexer = close.index 111 | 112 | total = sent['news_volume'].unstack('ticker')[universe] 113 | score = sent['sentiment'].unstack('ticker')[universe] 114 | 115 | close = close[universe] 116 | high = high[universe] 117 | low = low[universe] 118 | 119 | assert len(close.columns) == len(total.columns) == len(score.columns) 120 | 121 | p = ((high - low) / close) 122 | v = p.rolling(trailing_window).sum() 123 | s = (total * score).rolling(trailing_window).sum() 124 | final = (v * s) * -1 125 | 126 | 
assert len(final.columns) == len(close.columns)
127 | 
128 | sent_factor_scaled = pd.DataFrame(data=preprocessing.scale(final),
129 | index=final.index,
130 | columns=final.columns).reindex(indexer)
131 | 
132 | return sent_factor_scaled[universe]
133 | 
134 | @staticmethod
135 | def sector_neutral(sectors: dict, df):
136 | 
137 | result = []
138 | for sec in sectors.keys():
139 | result.append(df[sectors[sec]].sub(df[sectors[sec]].mean(axis=1), axis=0))
140 | 
141 | df_neutralized = pd.concat(result, axis=1)
142 | df_neutralized_scaled = pd.DataFrame(data=preprocessing.scale(df_neutralized),
143 | index=df_neutralized.index,
144 | columns=df_neutralized.columns)
145 | 
146 | return df_neutralized_scaled
147 | 
148 | def capm(self, close, market, window_length_return, window_length_beta):
149 | 
150 | r_market = self.log_Returns(market, window_length_return).loc[slice(close.index[0], close.index[-1])]
151 | 
152 | exog = sm.add_constant(r_market)
153 | 
154 | cap_beta = pd.DataFrame(columns=close.columns)
155 | 
156 | for tick in close.columns:
157 | r_assets = self.log_Returns(close[[tick]], window_length_return)
158 | 
159 | endog = r_assets
160 | rols = RollingOLS(endog, exog, window=window_length_beta)
161 | rres = rols.fit()
162 | capm = rres.params.dropna()
163 | capm.columns = ['intercept', 'beta']
164 | cap_beta.loc[:, tick] = capm['beta']
165 | 
166 | return cap_beta
167 | 
168 | @staticmethod
169 | def channels(close, window_length):
170 | 
171 | df_ch = pd.DataFrame(index=close.index, columns=close.columns)
172 | 
173 | sl = len(close.index) // window_length
174 | 
175 | for tick in close.columns:
176 | 
177 | for i in range(0, len(close.index), window_length):
178 | 
179 | j = i + window_length
180 | 
181 | if i == 0:
182 | distance = max(close[tick].iloc[-j:]) - min(close[tick].iloc[-j:])
183 | df_ch[tick].iloc[-j:] = (close[tick].iloc[-j:] - min(close[tick].iloc[-j:])) / distance
184 | 
185 | elif i == sl * window_length:
186 | distance = max(close[tick].iloc[:-i]) - min(close[tick].iloc[:-i])
187 | df_ch[tick].iloc[:-i] = (close[tick].iloc[:-i] - min(close[tick].iloc[:-i])) / distance
188 | 
189 | else:
190 | distance = max(close[tick].iloc[-j:-i]) - min(close[tick].iloc[-j:-i])
191 | df_ch[tick].iloc[-j:-i] = (close[tick].iloc[-j:-i] - min(close[tick].iloc[-j:-i])) / distance
192 | res = df_ch * -1
193 | return res
194 | -------------------------------------------------------------------------------- /modules/feature_weights.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt
2 | import datetime as datetime
3 | import numpy as np
4 | import pandas as pd
5 | import talib
6 | import seaborn as sns
7 | from time import time
8 | from sklearn import preprocessing
9 | from pandas.plotting import register_matplotlib_converters
10 | from .factorize import FactorManagement
11 | import scipy.stats as stats
12 | import cvxpy as cvx
13 | import zipfile
14 | import os
15 | from sklearn import linear_model, decomposition, ensemble, preprocessing, isotonic, metrics
16 | from sklearn.impute import SimpleImputer
17 | import xgboost
18 | 
19 | register_matplotlib_converters()
20 | 
21 | 
22 | class Learner:
23 | 
24 | def __init__(self):
25 | pass
26 | 
27 | @staticmethod
28 | def shift_mask_data(X, Y, upper_percentile, lower_percentile, n_fwd_days):
29 | # Shift X to match factors at t to returns at t+n_fwd_days (we want to predict future returns after all)
30 | shifted_X = np.roll(X, n_fwd_days + 1, axis=0)
31 | 
32 | # Slice off rolled elements
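# (After the roll, row t of shifted_X holds the factors from day t - (n_fwd_days + 1),
# so each factor row is paired with the return n_fwd_days + 1 days after it; the first
# n_fwd_days + 1 rows wrapped around from the end of the array and are dropped below.)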
33 | X = shifted_X[n_fwd_days + 1:]
34 | Y = Y[n_fwd_days + 1:]
35 | 
36 | n_time, n_stocks, n_factors = X.shape
37 | 
38 | # Look for biggest up and down movers
39 | upper = np.nanpercentile(Y, upper_percentile, axis=1)[:, np.newaxis]
40 | lower = np.nanpercentile(Y, lower_percentile, axis=1)[:, np.newaxis]
41 | 
42 | upper_mask = (Y >= upper)
43 | lower_mask = (Y <= lower)
44 | 
45 | mask = upper_mask | lower_mask  # This also drops nans
46 | mask = mask.flatten()
47 | 
48 | # Only try to predict whether a stock moved up/down relative to other stocks
49 | Y_binary = np.zeros(n_time * n_stocks)
50 | Y_binary[upper_mask.flatten()] = 1
51 | Y_binary[lower_mask.flatten()] = -1
52 | 
53 | # Flatten X
54 | X = X.reshape((n_time * n_stocks, n_factors))
55 | 
56 | # Drop stocks that did not move much (i.e. are between the lower and upper percentiles)
57 | X = X[mask]
58 | Y_binary = Y_binary[mask]
59 | 
60 | return X, Y_binary
61 | 
62 | def feature_importance_adaboost(self, n_fwd_days, close, all_factors, n_estimators, train_size,
63 | upper_percentile, lower_percentile):
64 | pipe = all_factors
65 | pipe.index = pipe.index.set_levels([pd.to_datetime(pipe.index.levels[0]), pipe.index.levels[1]])
66 | 
67 | close = close[pipe.index.levels[1]]
68 | close.index = pd.to_datetime(close.index)
69 | 
70 | chunk_start = pipe.index.levels[0][0]
71 | chunk_end = pipe.index.levels[0][-1]
72 | 
73 | returns = FactorManagement().log_Returns(close, 1).loc[slice(chunk_start, chunk_end), :]
74 | returns_stacked = returns.stack().to_frame('Returns')
75 | 
76 | results = pd.concat([pipe, returns_stacked], axis=1)
77 | results.index.set_names(['date', 'asset'], inplace=True)
78 | 
79 | results_wo_returns = results.copy()
80 | returns = results_wo_returns.pop('Returns')
81 | Y = returns.unstack().values
82 | X = results_wo_returns.to_xarray().to_array()
83 | X = np.array(X)
84 | X = X.swapaxes(2, 0).swapaxes(0, 1)  # (factors, time, stocks) -> (time, stocks, factors)
85 | 
86 | # Train-test split
87 | train_size_perc = train_size
88 | n_time, n_stocks, n_factors = X.shape
89 | train_size = np.int16(np.round(train_size_perc * n_time))
90 | X_train, Y_train = X[:train_size], Y[:train_size]
91 | X_test, Y_test = X[(train_size + n_fwd_days):], Y[(train_size + n_fwd_days):]
92 | 
93 | X_train_shift, Y_train_shift = self.shift_mask_data(X_train, Y_train, n_fwd_days=n_fwd_days,
94 | lower_percentile=lower_percentile,
95 | upper_percentile=upper_percentile)
96 | 
97 | X_test_shift, Y_test_shift = self.shift_mask_data(X_test, Y_test, n_fwd_days=n_fwd_days,
98 | lower_percentile=lower_percentile,
99 | upper_percentile=upper_percentile)
100 | 
101 | start_timer = time()
102 | 
103 | # Train classifier
104 | imputer = SimpleImputer()
105 | scaler = preprocessing.MinMaxScaler()
106 | clf = ensemble.AdaBoostClassifier(
107 | n_estimators=n_estimators)  # n_estimators controls how many weak classifiers are fit
108 | 
109 | X_train_trans = imputer.fit_transform(X_train_shift)
110 | X_train_trans = scaler.fit_transform(X_train_trans)
111 | clf.fit(X_train_trans, Y_train_shift)
112 | 
113 | end_timer = time()
114 | print('Time to train full ML pipeline: {} secs'.format(end_timer - start_timer))
115 | 
116 | Y_pred = clf.predict(X_train_trans)
117 | print('Accuracy on train set = {:.2f}%'.format(metrics.accuracy_score(Y_train_shift, Y_pred) * 100))
118 | 
119 | # Transform test data
120 | X_test_trans = imputer.transform(X_test_shift)
121 | X_test_trans = scaler.transform(X_test_trans)
122 | 
123 | # Predict!
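# Note: clf.classes_ here is [-1, 1] (the labels built by shift_mask_data),
# so column 1 of predict_proba below is the probability of the
# upper-percentile ("up") class.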
124 | Y_pred = clf.predict(X_test_trans)
125 | Y_pred_prob = clf.predict_proba(X_test_trans)
126 | 
127 | print('Predictions:', Y_pred)
128 | print('Probabilities of class == 1:', Y_pred_prob[:, 1] * 100)
129 | print('Accuracy on test set = {:.2f}%'.format(metrics.accuracy_score(Y_test_shift, Y_pred) * 100))
130 | print('Log-loss = {:.5f}'.format(metrics.log_loss(Y_test_shift, Y_pred_prob)))
131 | 
132 | feature_importances = pd.Series(clf.feature_importances_, index=results_wo_returns.columns)
133 | feature_importances = feature_importances.sort_values(ascending=False)
134 | ax = feature_importances.plot(kind='bar')
135 | ax.set(ylabel='Importance (Gini Coefficient)', title='Feature importances')
136 | 
137 | feature_importances = pd.DataFrame(data=feature_importances.values,
138 | columns=['weights'],
139 | index=feature_importances.index)
140 | feature_importances.index.name = 'factors'
141 | 
142 | return feature_importances
143 | 
144 | def feature_importance_xgb(self, n_fwd_days, close, all_factors, n_estimators, train_size,
145 | upper_percentile, lower_percentile):
146 | pipe = all_factors
147 | pipe.index = pipe.index.set_levels([pd.to_datetime(pipe.index.levels[0]), pipe.index.levels[1]])
148 | 
149 | close = close[pipe.index.levels[1]]
150 | close.index = pd.to_datetime(close.index)
151 | 
152 | chunk_start = pipe.index.levels[0][0]
153 | chunk_end = pipe.index.levels[0][-1]
154 | 
155 | returns = FactorManagement().log_Returns(close, 1).loc[slice(chunk_start, chunk_end), :]
156 | returns_stacked = returns.stack().to_frame('Returns')
157 | 
158 | results = pd.concat([pipe, returns_stacked], axis=1)
159 | results.index.set_names(['date', 'asset'], inplace=True)
160 | 
161 | results_wo_returns = results.copy()
162 | returns = results_wo_returns.pop('Returns')
163 | Y = returns.unstack().values
164 | X = results_wo_returns.to_xarray().to_array()
165 | X = np.array(X)
166 | X = X.swapaxes(2, 0).swapaxes(0, 1)
167 | 
168 | # Train-test split
169 | train_size_perc = train_size
170 | n_time, n_stocks, n_factors = X.shape
171 | train_size = np.int16(np.round(train_size_perc * n_time))
172 | X_train, Y_train = X[:train_size], Y[:train_size]
173 | X_test, Y_test = X[(train_size + n_fwd_days):], Y[(train_size + n_fwd_days):]
174 | 
175 | X_train_shift, Y_train_shift = self.shift_mask_data(X_train, Y_train, n_fwd_days=n_fwd_days,
176 | lower_percentile=lower_percentile,
177 | upper_percentile=upper_percentile)
178 | 
179 | X_test_shift, Y_test_shift = self.shift_mask_data(X_test, Y_test, n_fwd_days=n_fwd_days,
180 | lower_percentile=lower_percentile,
181 | upper_percentile=upper_percentile)
182 | 
183 | start_timer = time()
184 | 
185 | # Train classifier
186 | # imputer = SimpleImputer()
187 | # scaler = preprocessing.MinMaxScaler()
188 | clf = xgboost.XGBClassifier(n_estimators=n_estimators)
189 | 
190 | # X_train_trans = imputer.fit_transform(X_train_shift)
191 | # X_train_trans = scaler.fit_transform(X_train_trans)
192 | clf.fit(X_train_shift, Y_train_shift)
193 | 
194 | end_timer = time()
195 | print('Time to train full ML pipeline: {} secs'.format(end_timer - start_timer))
196 | 
197 | Y_pred_train = clf.predict(X_train_shift)
198 | print('Accuracy on train set = {:.2f}%'.format(metrics.accuracy_score(Y_train_shift, Y_pred_train) * 100))
199 | 
200 | # Transform test data
201 | # X_test_trans = imputer.transform(X_test_shift)
202 | # X_test_trans = scaler.transform(X_test_trans)
203 | 
204 | # Predict!
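# Note: Y_train_shift/Y_test_shift take values in {-1, 1}; depending on the
# installed xgboost version, XGBClassifier may require class labels in
# [0, num_class), in which case they would need to be remapped
# (e.g. (Y + 1) // 2) before the fit/predict calls in this method.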
205 | Y_pred = clf.predict(X_test_shift)
206 | Y_pred_prob = clf.predict_proba(X_test_shift)
207 | 
208 | print('Predictions:', Y_pred)
209 | print('Probabilities of class == 1:', Y_pred_prob[:, 1] * 100)
210 | print('Accuracy on test set = {:.2f}%'.format(metrics.accuracy_score(Y_test_shift, Y_pred) * 100))
211 | print('Log-loss = {:.5f}'.format(metrics.log_loss(Y_test_shift, Y_pred_prob)))
212 | 
213 | feature_importances = pd.Series(clf.feature_importances_, index=results_wo_returns.columns)
214 | feature_importances = feature_importances.sort_values(ascending=False)
215 | ax = feature_importances.plot(kind='bar')
216 | ax.set(ylabel='Importance', title='Feature importances')
217 | 
218 | feature_importances = pd.DataFrame(data=feature_importances.values,
219 | columns=['weights'],
220 | index=feature_importances.index)
221 | feature_importances.index.name = 'factors'
222 | 
223 | return feature_importances
224 | -------------------------------------------------------------------------------- /modules/account.py: -------------------------------------------------------------------------------- 1 | from ibapi.client import EClient
2 | from ibapi.wrapper import EWrapper
3 | from ibapi.contract import Contract
4 | from ibapi.order import Order
5 | from ibapi.scanner import ScannerSubscription
6 | from ibapi.ticktype import TickTypeEnum
7 | from ibapi.common import *
8 | from ibapi.tag_value import TagValue
9 | from ibapi.execution import ExecutionFilter
10 | 
11 | import matplotlib.pyplot as plt
12 | import seaborn as sns
13 | import pandas as pd
14 | import numpy as np
15 | from bs4 import BeautifulSoup
16 | from datetime import datetime
17 | from time import sleep, strftime, localtime, time
18 | 
19 | sleeptime = 5
20 | 
21 | 
22 | class AccountManagement:
23 | 
24 | def read_nextvalidid(self):
25 | 
26 | class TestApp(EWrapper, EClient):
27 | 
28 | def __init__(self):
29 | EClient.__init__(self, self)
30 | 
31 | self.nextValidOrderId = []
32 | 
33 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
34 | if reqId > -1:
35 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
36 | 
37 | def nextValidId(self, orderId):
38 | super().nextValidId(orderId)
39 | self.nextValidOrderId.append(orderId)
40 | print("NextValidId:", orderId)
41 | self.disconnect()
42 | 
43 | app = TestApp()
44 | app.connect('127.0.0.1', 7497, 0)
45 | sleep(sleeptime)
46 | 
47 | app.reqIds(-1)
48 | nid = app.nextValidOrderId
49 | 
50 | app.run()
51 | 
52 | return nid[0]
53 | 
54 | def placing_orders(self, symbol, sec_type, exch, prim_exch, curr, order_type, quantity, action, order_id):
55 | 
56 | contract = Contract()
57 | contract.symbol = symbol
58 | contract.secType = sec_type
59 | contract.exchange = exch
60 | contract.primaryExchange = prim_exch
61 | contract.currency = curr
62 | 
63 | order = Order()
64 | order.orderType = order_type
65 | order.totalQuantity = quantity
66 | order.action = action
67 | 
68 | class TestApp(EWrapper, EClient):
69 | 
70 | def __init__(self):
71 | EClient.__init__(self, self)
72 | 
73 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
74 | if reqId > -1:
75 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
76 | 
77 | app = TestApp()
78 | app.connect('127.0.0.1', 7497, 0)
79 | 
80 | app.placeOrder(orderId=order_id, contract=contract, order=order)
81 | print('order quantity placed for {} is: {} '.format(contract.symbol, order.totalQuantity))
82 | 
83 | sleep(sleeptime)
84 | 
85 | app.disconnect()
86 | 
87 | return order, contract
88 | 
89 | 
90 | def read_positions(self, subscribe, acctCode):
91 | 
92 | class TestApp(EWrapper, EClient):
93 | 
94 | def __init__(self):
95 | EClient.__init__(self, self)
96 | self.up = pd.DataFrame([], columns=['Position', 'marketPrice', 'marketValue', 'averageCost',
97 | 'unrealizedPNL', 'realizedPNL'])
98 | 
99 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
100 | if reqId > -1:
101 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
102 | 
103 | def updatePortfolio(self, contract, position, marketPrice, marketValue, averageCost, unrealizedPNL,
104 | realizedPNL, accountName):
105 | self.up.index.name = 'Symbol'
106 | self.up.loc[
107 | contract.symbol] = position, marketPrice, marketValue, averageCost, unrealizedPNL, realizedPNL
108 | 
109 | def positionEnd(self):
110 | super().positionEnd()
111 | print("PositionEnd")
112 | self.cancelPositions()
113 | self.disconnect()
114 | 
115 | app = TestApp()
116 | app.connect('127.0.0.1', 7497, 0)
117 | sleep(sleeptime)
118 | 
119 | app.reqAccountUpdates(subscribe=subscribe, acctCode=acctCode)
120 | app.reqPositions()
121 | 
122 | update = app.up
123 | 
124 | app.run()
125 | 
126 | print('Reading Portfolio')
127 | rows = update[update['Position'] == 0].index
128 | update.drop(rows, axis=0, inplace=True)
129 | 
130 | return update
131 | 
132 | def read_account(self, subscribe, acctCode):
133 | 
134 | class TestApp(EWrapper, EClient):
135 | 
136 | def __init__(self):
137 | EClient.__init__(self, self)
138 | self.up = pd.DataFrame([], columns=['Values'])
139 | 
140 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
141 | if reqId > -1:
142 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
143 | 
144 | def updateAccountValue(self, key, value, currency, accountName):
145 | self.up.index.name = 'Keys'
146 | self.up.loc[key] = value
147 | 
148 | def accountDownloadEnd(self, account):
149 | print("AccountDownloadEnd. Account:", account)
150 | self.disconnect()
151 | 
152 | app = TestApp()
153 | app.connect('127.0.0.1', 7497, 0)
154 | sleep(sleeptime)
155 | 
156 | app.reqAccountUpdates(subscribe=subscribe, acctCode=acctCode)
157 | 
158 | update = app.up
159 | 
160 | app.reqAccountUpdates(False, acctCode)
161 | 
162 | app.run()
163 | 
164 | print('Reading Account')
165 | return update
166 | 
167 | def cancel_openorders(self):
168 | 
169 | class TestApp(EWrapper, EClient):
170 | 
171 | def __init__(self):
172 | EClient.__init__(self, self)
173 | self.open_orders = pd.DataFrame(columns=['action', 'quantity',
174 | 'type', 'algoStrategy',
175 | 'algoParams', 'pre_status'])
176 | 
177 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
178 | if reqId > -1:
179 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
180 | 
181 | def cancelOrder(self, orderId):
182 | super().cancelOrder(orderId)
183 | print('cancel order ended')
184 | 
185 | def openOrder(self, orderId, Contract, Order, OrderState):
186 | super().openOrder(orderId, Contract, Order, OrderState)
187 | 
188 | self.open_orders.loc[Contract.symbol, :] = [Order.action,
189 | Order.totalQuantity,
190 | Order.orderType,
191 | Order.algoStrategy,
192 | Order.algoParams[0],
193 | OrderState.status]
194 | 
195 | def openOrderEnd(self):
196 | super().openOrderEnd()
197 | print('open order ended')
198 | self.disconnect()
199 | 
200 | app = TestApp()
201 | app.connect('127.0.0.1', 7497, 0)
202 | sleep(sleeptime)
203 | 
204 | app.reqIds(-1)
205 | app.reqAllOpenOrders()
206 | 
207 | open_orders = app.open_orders
208 | app.reqGlobalCancel()
209 | 
210 | app.run()
211 | 
212 | return open_orders
213 | 
214 | def get_openorders(self):
215 | 
216 | class TestApp(EWrapper, EClient):
217 | 
218 | def __init__(self):
219 | EClient.__init__(self, self)
220 | self.open_orders = pd.DataFrame(columns=['action', 'open orders',
221 | 'type', 'algoStrategy',
222 | 'algoParams', 'status'])
223 | 
224 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
225 | if reqId > -1:
226 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
227 | 
228 | def openOrder(self, orderId, Contract, Order, OrderState):
229 | super().openOrder(orderId, Contract, Order, OrderState)
230 | 
231 | self.open_orders.loc[Contract.symbol, :] = [Order.action,
232 | Order.totalQuantity,
233 | Order.orderType,
234 | Order.algoStrategy,
235 | Order.algoParams[0],
236 | OrderState.status]
237 | 
238 | def openOrderEnd(self):
239 | super().openOrderEnd()
240 | print('open order ended')
241 | self.disconnect()
242 | 
243 | app = TestApp()
244 | app.connect('127.0.0.1', 7497, 0)
245 | 
246 | app.reqIds(-1)
247 | app.reqAllOpenOrders()
248 | sleep(sleeptime)
249 | 
250 | open_orders = app.open_orders
251 | 
252 | app.run()
253 | 
254 | return open_orders
255 | 
256 | def closing_positions(self, portfolio, stock_to_close, order_id, ordersPriority, transmit):
257 | 
258 | class TestApp(EWrapper, EClient):
259 | 
260 | def __init__(self):
261 | EClient.__init__(self, self)
262 | 
263 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
264 | if reqId > -1:
265 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
266 | 
267 | app = TestApp()
268 | app.connect('127.0.0.1', 7497, 0)
269 | 
270 | if app.isConnected():
271 | print('app is running ...')
272 | print('closing {} positions which are not present in action'.format(len(stock_to_close)))
273 | # Closing Position
274 | 
275 | for i in stock_to_close:
276 | 
277 | contract = Contract()
278 | contract.symbol = i
279 | contract.secType = 'STK'
280 | contract.exchange = 'SMART'
281 | # contract.primaryExchange = 'ISLAND'
282 | contract.currency = 'USD'
283 | 
284 | order = Order()
285 | order.orderType = 'MKT'
286 | order.totalQuantity = int(np.abs(portfolio.loc[i, 'Position']))
287 | order.transmit = transmit
288 | 
289 | if portfolio.loc[i, 'Position'] > 0:
290 | 
291 | order.action = 'SELL'
292 | # order.cashQty = weight * 1.5 * net_liq
293 | order.algoStrategy = 'Adaptive'
294 | order.algoParams = []
295 | order.algoParams.append(TagValue("adaptivePriority", ordersPriority))
296 | 
297 | app.placeOrder(orderId=order_id, contract=contract, order=order)
298 | sleep(sleeptime)
299 | 
300 | order_id = order_id + 1
301 | print('closing position for {} is: {} '.format(contract.symbol, order.totalQuantity))
302 | 
303 | elif portfolio.loc[i, 'Position'] < 0:
304 | 
305 | order.action = 'BUY'
306 | # order.cashQty = weight * 1.5 * net_liq
307 | order.algoStrategy = 'Adaptive'
308 | order.algoParams = []
309 | order.algoParams.append(TagValue("adaptivePriority", ordersPriority))
310 | 
311 | app.placeOrder(orderId=order_id, contract=contract, order=order)
312 | sleep(sleeptime)
313 | 
314 | order_id = order_id + 1
315 | print('closing position for {} is: {} '.format(contract.symbol, order.totalQuantity))
316 | 
317 | else:
318 | print('app not connected')
319 | 
320 | app.disconnect()
321 | return order_id + 1
322 | 
323 | def rebalancing_to_leverage(self, action_balance, order_id, ordersPriority, transmit):
324 | 
325 | class TestApp(EWrapper, EClient):
326 | 
327 | def __init__(self):
328 | EClient.__init__(self, self)
329 | 
330 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
331 | if reqId > -1:
332 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
333 | 
334 | app = TestApp()
335 | app.connect('127.0.0.1', 7497, 0)
336 | 
337 | if app.isConnected():
338 | print('app is running ...')
339 | print('balancing {} positions'.format(len(action_balance.index)))
340 | # Closing Position
341 | 
342 | for i in action_balance.index:
343 | 
344 | contract = Contract()
345 | contract.symbol = i
346 | contract.secType = 'STK'
347 | contract.exchange = 'SMART'
348 | contract.currency = 'USD'
349 | 
350 | order = Order()
351 | order.orderType = 'MKT'
352 | order.totalQuantity = np.abs(action_balance.loc[i, 'shares'])
353 | order.transmit = transmit
354 | 
355 | if action_balance.loc[i, 'shares'] > 0:
356 | 
357 | order.action = 'BUY'
358 | order.algoStrategy = 'Adaptive'
359 | order.algoParams = []
360 | order.algoParams.append(TagValue("adaptivePriority", ordersPriority))
361 | app.placeOrder(orderId=order_id, contract=contract, order=order)
362 | sleep(sleeptime)
363 | 
364 | order_id = order_id + 1
365 | print(' buy order quantity placed for {} is: {} '.format(contract.symbol, order.totalQuantity))
366 | 
367 | elif action_balance.loc[i, 'shares'] < 0:
368 | 
369 | order.action = 'SELL'
370 | order.algoStrategy = 'Adaptive'
371 | order.algoParams = []
372 | order.algoParams.append(TagValue("adaptivePriority", ordersPriority))
373 | app.placeOrder(orderId=order_id, contract=contract, order=order)
374 | sleep(sleeptime)
375 | 
376 | order_id = order_id + 1
377 | print(' sell order quantity placed for {} is: {} '.format(contract.symbol, order.totalQuantity))
378 | 
379 | else:
380 | print('app not connected')
381 | app.disconnect()
382 | 
383 | def placing_final_orders(self, action_final, order_id, ordersPriority, transmit):
384 | 
385 | class TestApp(EWrapper, EClient):
386 | 
387 | def __init__(self):
388 | EClient.__init__(self, self)
389 | 
390 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
391 | if reqId > -1:
392 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
393 | 
394 | app = TestApp()
395 | app.connect('127.0.0.1', 7497, 0)
396 | 
397 | for ticker in action_final.index:
398 | 
399 | contract = Contract()
400 | contract.symbol = ticker
401 | contract.secType = 'STK'
402 | contract.exchange = 'SMART'
403 | # contract.primaryExchange = 'ISLAND'
404 | contract.currency = 'USD'
405 | 
406 | order = Order()
407 | order.orderType = 'MKT'
408 | order.transmit = transmit
409 | 
410 | order.totalQuantity = np.abs(action_final.loc[ticker])[0]
411 | 
412 | if action_final.loc[ticker][0] > 0:
413 | 
414 | order.action = 'BUY'
415 | order.algoStrategy = 'Adaptive'
416 | order.algoParams = []
417 | order.algoParams.append(TagValue("adaptivePriority", ordersPriority))
418 | 
419 | app.placeOrder(orderId=order_id, contract=contract, order=order)
420 | sleep(sleeptime)
421 | order_id = order_id + 1
422 | print('buy order quantity placed for {} is: {} '.format(contract.symbol, order.totalQuantity))
423 | 
424 | elif action_final.loc[ticker][0] < 0:
425 | 
426 | order.action = 'SELL'
427 | order.algoStrategy = 'Adaptive'
428 | order.algoParams = []
429 | order.algoParams.append(TagValue("adaptivePriority", ordersPriority))
430 | 
431 | app.placeOrder(orderId=order_id, contract=contract, order=order)
432 | sleep(sleeptime)
433 | order_id = order_id + 1
434 | print('sell order quantity placed for {} is: {} '.format(contract.symbol, order.totalQuantity))
435 | 
436 | app.disconnect()
437 | 
438 | def commission_report(self, time, acctCode):
439 | 
440 | class TestApp(EWrapper, EClient):
441 | 
442 | def __init__(self):
443 | EClient.__init__(self, self)
444 | 
445 | self.executed_orders = pd.DataFrame(columns=['ticker',
446 | 'time', 'shares', 'action',
447 | 'price', 'marketValue',
448 | 'RealizedPNL', 'commission'])
449 | self.val = 0
450 | self.val2 = 0
451 | 
452 | def error(self, reqId: TickerId, errorCode: int, errorString: str):
453 | if reqId > -1:
454 | print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)
456 | 
457 | def execDetails(self, reqId, contract, execution):
458 | super().execDetails(reqId, contract, execution)
459 | 
460 | self.executed_orders.loc[self.val, ['ticker',
461 | 'time',
462 | 'shares',
463 | 'action',
464 | 'price',
465 | 'marketValue']] = [contract.symbol,
466 | pd.to_datetime(execution.time),
467 | execution.shares, execution.side,
468 | execution.price,
469 | execution.shares * execution.price]
470 | self.val = self.val + 1
471 | 
472 | def commissionReport(self, commissionReport):
473 | super().commissionReport(commissionReport)
474 | 
475 | self.executed_orders.loc[self.val2, ['RealizedPNL', 'commission']] = [
476 | float(commissionReport.realizedPNL),
477 | float(commissionReport.commission)]
478 | 
479 | self.val2 = self.val2 + 1
480 | 
481 | def execDetailsEnd(self, reqId):
482 | super().execDetailsEnd(reqId)
483 | self.disconnect()
484 | 
485 | app = TestApp()
486 | app.connect('127.0.0.1', 7497, 0)
487 | 
488 | execution_filter = ExecutionFilter()
489 | execution_filter.acctCode = acctCode
490 | execution_filter.time = time
491 | 
492 | app.reqExecutions(0, execution_filter)
493 | sleep(sleeptime)
494 | 
495 | df = app.executed_orders
496 | app.run()
497 | sleep(sleeptime)
498 | 
499 | df.set_index('time', inplace=True)
500 | df.sort_index(inplace=True)
501 | df['RealizedPNL'][df['RealizedPNL'] > 1000000] = 'OPEN'
502 | 
503 | return df
504 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 
2 | 
3 | 
4 | # Quantitative Finance Project
5 | 
6 | Keyvan Tajbakhsh
7 | July 26th, 2020
8 | 
9 | This research project is written in Python and developed in the Jupyter Notebook environment. Before diving into it, please read carefully all requirements and instructions mentioned below.
10 | 
11 | For decades, financial institutions and alpha generation platforms have focused on quantitative investment research rather than the rapid trading of investments. While some of these platforms do allow analysts to take their strategies to market,
12 | others focus solely on the research and development of these highly complex mathematical and statistical models. Quantitative investing uses raw data to calculate potential stock values, earnings forecasts and other metrics that help investors make capital allocation decisions.
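Before the detailed purpose and workflow below, here is a minimal, hypothetical sketch of how the classes exported in `modules/__init__.py` compose end to end. The variable names `close` (a DataFrame of daily close prices, one column per ticker) and `weights` (a one-column DataFrame of current portfolio weights) and all parameter values are illustrative assumptions; the method signatures are the ones defined in this repository's `modules/` package.

```python
from modules import FactorManagement, RiskManagement, OptimalHoldingsRegularization

# 1. Build a one-column alpha vector from a factor (latest cross-section)
alpha_vector = FactorManagement().momentum(close, window_length=20).iloc[[-1]].T

# 2. Fit the PCA risk model and estimate portfolio risk for the current weights
risk, risk_model = RiskManagement().portfolio_risk(close, num_factor_exposures=10, weights=weights)

# 3. Solve for regularized optimal holdings under that risk model
optimal_weights = OptimalHoldingsRegularization(lambda_reg=5.0).find(
    alpha_vector,
    risk_model['factor_betas'],
    risk_model['factor_cov_matrix'],
    risk_model['idiosyncratic_var_vector'])
```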
13 | The purpose of this project is to define a liquid universe of stocks, apply alpha factors to it, and determine through factor analysis whether the results show enough potential to be sent to production.
14 | After selecting and combining factors using Machine Learning techniques, the combined factor is analyzed and improved with an optimizer function and then integrated into the risk model.
15 | 
16 | 
17 | The project workflow comprises the following stages:
18 | 
19 | 1. Parameters
20 | 2. Universe definition
21 | 3. Sector definition
22 | 4. Alpha factors
23 | 5. Factor analysis
24 | 6. Factors combination
25 | 7. Risk analysis for equal weights
26 | 8. Integrating factor data to the optimizer
27 | 9. Optimized alpha vector analysis
28 | 10. Predicted portfolio
29 | 
30 | ## Getting Started
31 | 
32 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
33 | 
34 | ### Prerequisites
35 | 
36 | #### Pypi Packages
37 | 
38 | * [NumPy](https://www.numpy.org/) - A fundamental package for scientific computing with Python.(version == 1.19.1)
39 | * [Pandas](https://pandas.pydata.org/) - A library providing high-performance, easy-to-use data structures and data analysis tools.(version == 0.22.0)
40 | * [ScikitLearn](https://scikit-learn.org/) - Simple and efficient tools for data mining and data analysis.(version == 0.0)
41 | * [Matplotlib](https://matplotlib.org/) - Matplotlib is a Python 2D plotting library which produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms.(version == 3.3.0)
42 | * [Seaborn](https://seaborn.pydata.org/) - Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.(version == 0.10.1)
43 | * [Quandl](https://www.quandl.com/) - Quandl delivers market data from hundreds of sources via API, or directly into Python, R, Excel and many other tools.
44 | * [Datetime](https://docs.python.org/3/library/datetime.html) - The datetime module supplies classes for manipulating dates and times.
45 | * [Pytz](https://pypi.org/project/pytz/) - World timezone definitions, modern and historical.
46 | * [Talib](https://mrjbq7.github.io/ta-lib/) - Talib is used by trading software developers to perform technical analysis of financial market data.(version == 0.4.17)
47 | * [Alphalens](https://quantopian.github.io/alphalens/index.html) - Alphalens is a library for performance analysis of predictive (alpha) stock factors.(version == 0.3.6)
48 | * [Pyfolio](https://quantopian.github.io/pyfolio/) - Pyfolio is a library for performance and risk analysis of financial portfolios developed by Quantopian Inc.(version == latest github)
49 | * [Itertools](https://docs.python.org/3/library/itertools.html) - This module implements a number of iterator building blocks inspired by constructs from APL, Haskell, and SML. Each has been recast in a form suitable for Python.
50 | * [Warnings](https://docs.python.org/3/library/warnings.html) - Warning messages are typically issued in situations where it is useful to alert the user of some condition in a program.
51 | * [Os](https://docs.python.org/3/library/os.html) - This module provides a portable way of using operating system dependent functionality.
52 | * [Zipfile](https://docs.python.org/3/library/zipfile.html) - The ZIP file format is a common archive and compression standard. This module provides tools to create, read, write, append, and list a ZIP file.
53 | * [Time](https://docs.python.org/3/library/time.html) - This module provides various time-related functions.
54 | * [Yfinance](https://pypi.org/project/yfinance/) - Yahoo! Finance market data downloader (version == 0.1.54)
55 | * [cvxpy](https://www.cvxpy.org/) - CVXPY is a Python-embedded modeling language for convex optimization problems. It allows you to express your problem in a natural way that follows the math, rather than in the restrictive standard form required by solvers.(version == 1.0.11)
56 | * [ibapi](https://interactivebrokers.github.io/tws-api/index.html) - The TWS API is a simple yet powerful interface through which IB clients can automate their trading strategies, request market data and monitor their account balance and portfolio in real time.(version == 9.76.1)
57 | 
58 | #### Local Modules
59 | 
60 | * [risk_model](https://github.com/keyvantaj/Quantitative/blob/master/modules/risk_model.py) - This module provides functions used in risk modeling and risk management.
61 | * [factorize](https://github.com/keyvantaj/Quantitative/blob/master/modules/factorize.py) - This module groups useful functions for factorizing raw data.
62 | * [account](https://github.com/keyvantaj/Quantitative/blob/master/modules/account.py) - A module of functions built on the IBKR API for portfolio management.
63 | * [utils_s](https://github.com/keyvantaj/Quantitative/blob/master/modules/utils_s.py) - This module delivers functions used in preprocessing and cleaning data.
64 | * [feature_weights](https://github.com/keyvantaj/Quantitative/blob/master/modules/feature_weights.py) - This Machine Learning module is implemented to calculate the optimal weight distribution of factors for alpha factor combination.
65 | 
66 | ### Code
67 | 
68 | The project is divided into two parts. The code is provided in the `alpha_research.ipynb` and `portfolio_management.ipynb` notebook files.
69 | To execute the code, you will need a Quandl API access key to download data and an Interactive Brokers account for trading.
70 | 
71 | ### Run
72 | 
73 | In a terminal or command window, navigate to the top-level project directory `Quantitative/` (that contains this README) and run one of the following commands:
74 | 
75 | ```bash
76 | jupyter notebook alpha_research.ipynb
77 | ```
78 | 
79 | This will open the Jupyter Notebook software and project file in your browser.
80 | 
81 | 
82 | ### Data
83 | 
84 | For this project, multiple sources of data have been used, from
85 | [Sharadar](https://www.quandl.com/publishers/sharadar) and
86 | [IFT](https://www.quandl.com/publishers/ift), as described below:
87 | 
88 | - Sharadar Equity Prices ([SHARADAR/SEP](https://www.quandl.com/databases/SEP/data))
89 | Updated daily, End-of-Day (EOD) price (ohlcv) data for more than 14,000 US public companies.
90 | - Indicator Descriptions ([SHARADAR/INDICATORS](https://www.quandl.com/databases/SF1/data))
91 | Description of indicators listed in the SF1 table for more than 14,000 US public companies.
92 | - Tickers and Metadata ([SHARADAR/TICKERS](https://www.quandl.com/databases/SF1/data))
93 | Information and metadata for more than 14,000 US public companies.
94 | - Core US Fundamentals ([SHARADAR/SF1](https://www.quandl.com/databases/SF1/data))
95 | 150 essential fundamental indicators and financial ratios, for more than 14,000 US public companies.
96 | - Daily Metrics ([SHARADAR/DAILY](https://www.quandl.com/databases/SF1/data))
97 | 5 essential metrics and financial ratios, updated daily, for more than 14,000 US public companies.
98 | - Sentiment Analysis and News Analytics ([IFT/NSA](https://www.quandl.com/databases/NS1/data))
99 | News, blogs, social media and proprietary sources for thousands of stocks.
100 | 
101 | #### Features
102 | 
103 | ##### Tickers and Metadata [SHARADAR/TICKERS] features
104 | 
105 | - table : Sharadar Table : The database table which the ticker is featured in. Examples are: "SF1" or "SEP".
106 | - permaticker : Permanent Ticker Symbol : The permaticker is a unique and unchanging identifier for an issuer in the dataset which is issued by Sharadar.
107 | - name : Issuer Name : The name of the security issuer.
108 | - exchange : Stock Exchange : The exchange on which the security trades. Examples are: "NASDAQ";"NYSE";"NYSEARCA";"BATS";"OTC" and "NYSEMKT" (previously the American Stock exchange).
109 | - isdelisted : Is Delisted? : Is the security delisted? [Y]es or [N]o.
110 | - category : Issuer Category : The category of the issuer: "Domestic"; "Canadian" or "ADR".
111 | - cusips : CUSIPs : A security identifier. Space delimited in the event of multiple identifiers.
112 | - siccode : Standard Industrial Classification (SIC) Code : The Standard Industrial Classification (SIC) is a system for classifying industries by a four-digit code; as sourced from SEC filings. More on the SIC system here: https://en.wikipedia.org/wiki/Standard_Industrial_Classification
113 | - sicsector : SIC Sector : The SIC sector is based on the SIC code and the division tabled here: https://en.wikipedia.org/wiki/Standard_Industrial_Classification
114 | - sicindustry : SIC Industry : The SIC industry is based on the SIC code and the industry tabled here: https://www.sec.gov/info/edgar/siccodes.htm
115 | - famasector : Fama Sector : Not currently active - coming in a future update.
116 | - famaindustry : Fama Industry : Industry classifications based on the SIC code and classifications by Fama and French here: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/det_48_ind_port.html
117 | - sector : Sector : Sharadar's sector classification based on SIC codes in a format which approximates to GICS.
118 | - industry : Industry : Sharadar's industry classification based on SIC codes in a format which approximates to GICS.
119 | - scalemarketcap : Company Scale - Market Cap : This field is experimental and subject to change. It categorises the company according to its maximum observed market cap as follows: 1 - Nano < 50m; 2 - Micro < 300m; 3 - Small < 2bn; 4 - Mid < 10bn; 5 - Large < 200bn; 6 - Mega >= 200bn
120 | - scalerevenue : Company Scale - Revenue : This field is experimental and subject to change. It categorises the company according to its maximum observed annual revenue as follows: 1 - Nano < 50m; 2 - Micro < 300m; 3 - Small < 2bn; 4 - Mid < 10bn; 5 - Large < 200bn; 6 - Mega >= 200bn
121 | - relatedtickers : Related Tickers : Where related tickers have been identified this field is populated. Related tickers can include the prior ticker before a ticker change; and tickers for alternative share classes.
122 | - currency : Currency : The company functional reporting currency for the SF1 Fundamentals table or the currency for EOD prices in SEP and SFP.
123 | - location : Location : The company location as registered with the Securities and Exchange Commission.
101 | #### Features
102 | 
103 | ##### Tickers and Metadata [SHARADAR/TICKERS] features
104 | 
105 | - table : Sharadar Table : The database table which the ticker is featured in. Examples are: "SF1" or "SEP".
106 | - permaticker : Permanent Ticker Symbol : The permaticker is a unique and unchanging identifier for an issuer in the dataset which is issued by Sharadar.
107 | - name : Issuer Name : The name of the security issuer.
108 | - exchange : Stock Exchange : The exchange on which the security trades. Examples are: "NASDAQ";"NYSE";"NYSEARCA";"BATS";"OTC" and "NYSEMKT" (previously the American Stock exchange).
109 | - isdelisted : Is Delisted? : Is the security delisted? [Y]es or [N]o.
110 | - category : Issuer Category : The category of the issuer: "Domestic"; "Canadian" or "ADR".
111 | - cusips : CUSIPs : A security identifier. Space delimited in the event of multiple identifiers.
112 | - siccode : Standard Industrial Classification (SIC) Code : The Standard Industrial Classification (SIC) is a system for classifying industries by a four-digit code; as sourced from SEC filings. More on the SIC system here: https://en.wikipedia.org/wiki/Standard_Industrial_Classification
113 | - sicsector : SIC Sector : The SIC sector is based on the SIC code and the division tabled here: https://en.wikipedia.org/wiki/Standard_Industrial_Classification
114 | - sicindustry : SIC Industry : The SIC industry is based on the SIC code and the industry tabled here: https://www.sec.gov/info/edgar/siccodes.htm
115 | - famasector : Fama Sector : Not currently active - coming in a future update.
116 | - famaindustry : Fama Industry : Industry classifications based on the SIC code and classifications by Fama and French here: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/det_48_ind_port.html
117 | - sector : Sector : Sharadar's sector classification based on SIC codes in a format which approximates to GICS.
118 | - industry : Industry : Sharadar's industry classification based on SIC codes in a format which approximates to GICS.
119 | - scalemarketcap : Company Scale - Market Cap : This field is experimental and subject to change. It categorises the company according to its maximum observed market cap as follows: 1 - Nano < 50m; 2 - Micro < 300m; 3 - Small < 2bn; 4 - Mid < 10bn; 5 - Large < 200bn; 6 - Mega >= 200bn
120 | - scalerevenue : Company Scale - Revenue : This field is experimental and subject to change. It categorises the company according to its maximum observed annual revenue as follows: 1 - Nano < 50m; 2 - Micro < 300m; 3 - Small < 2bn; 4 - Mid < 10bn; 5 - Large < 200bn; 6 - Mega >= 200bn
121 | - relatedtickers : Related Tickers : Where related tickers have been identified this field is populated. Related tickers can include the prior ticker before a ticker change; and tickers for alternative share classes.
122 | - currency : Currency : The company functional reporting currency for the SF1 Fundamentals table or the currency for EOD prices in SEP and SFP.
123 | - location : Location : The company location as registered with the Securities and Exchange Commission.
124 | - lastupdated : Last Updated Date : Last Updated represents the last date that this database entry was updated; which is useful to users when updating their local records.
125 | - firstadded : First Added Date : The date that the ticker was first added to coverage in the dataset.
126 | - firstpricedate : First Price Date : The date of the first price observation for a given ticker. Can be used as a proxy for IPO date. Minimum value of 1986-01-01 for IPO's that occurred prior to this date. Note: this does not necessarily represent the first price date available in our datasets since our end of day price history currently starts in December 1998.
127 | - lastpricedate : Last Price Date : The most recent price observation available.
128 | - firstquarter : First Quarter : The first financial quarter available in the dataset.
129 | - lastquarter : Last Quarter : The last financial quarter available in the dataset.
130 | - secfilings : SEC Filings URL : The URL pointing to the SEC filings which also contains the Central Index Key (CIK).
131 | - companysite : Company Website URL : The URL pointing to the company website.
132 | 
133 | ##### Core US Fundamentals [SHARADAR/SF1] features
134 | 
135 | - accoci : Accumulated Other Comprehensive Income : [Balance Sheet] A component of [Equity] representing the accumulated change in equity from transactions and other events and circumstances from non-owner sources; net of tax effect; at period end. Includes foreign currency translation items; certain pension adjustments; unrealized gains and losses on certain investments in debt and equity securities.
136 | - assets : Total Assets : [Balance Sheet] Sum of the carrying amounts as of the balance sheet date of all assets that are recognized. Major components are [CashnEq]; [Investments];[Intangibles]; [PPNENet];[TaxAssets] and [Receivables].
137 | - assetsc : Current Assets : [Balance Sheet] The current portion of [Assets]; reported if a company operates a classified balance sheet that segments current and non-current assets.
138 | - assetsnc : Assets Non-Current : [Balance Sheet] Amount of non-current assets; for companies that operate a classified balance sheet. Calculated as the difference between Total Assets [Assets] and Current Assets [AssetsC].
139 | - bvps : Book Value per Share : [Metrics] Measures the ratio between [Equity] and [SharesWA] as adjusted by [ShareFactor].
140 | - capex : Capital Expenditure : [Cash Flow Statement] A component of [NCFI] representing the net cash inflow (outflow) associated with the acquisition & disposal of long-lived; physical & intangible assets that are used in the normal conduct of business to produce goods and services and are not intended for resale. Includes cash inflows/outflows to pay for construction of self-constructed assets & software.
141 | - cashneq : Cash and Equivalents : [Balance Sheet] A component of [Assets] representing the amount of currency on hand as well as demand deposits with banks or financial institutions.
142 | - cashnequsd : Cash and Equivalents (USD) : [Balance Sheet] [CashnEq] in USD; converted by [FXUSD].
143 | - cor : Cost of Revenue : [Income Statement] The aggregate cost of goods produced and sold and services rendered during the reporting period.
144 | - consolinc : Consolidated Income : [Income Statement] The portion of profit or loss for the period; net of income taxes; which is attributable to the consolidated entity; before the deduction of [NetIncNCI].
145 | - currentratio : Current Ratio : [Metrics] The ratio between [AssetsC] and [LiabilitiesC]; for companies that operate a classified balance sheet.
146 | - de : Debt to Equity Ratio : [Metrics] Measures the ratio between [Liabilities] and [Equity].
147 | - debt : Total Debt : [Balance Sheet] A component of [Liabilities] representing the total amount of current and non-current debt owed. Includes secured and unsecured bonds issued; commercial paper; notes payable; credit facilities; lines of credit; capital lease obligations; operating lease obligations; and convertible notes.
148 | - debtc : Debt Current : [Balance Sheet] The current portion of [Debt]; reported if the company operates a classified balance sheet that segments current and non-current liabilities.
149 | - debtnc : Debt Non-Current : [Balance Sheet] The non-current portion of [Debt] reported if the company operates a classified balance sheet that segments current and non-current liabilities.
150 | - debtusd : Total Debt (USD) : [Balance Sheet] [Debt] in USD; converted by [FXUSD].
151 | - deferredrev : Deferred Revenue : [Balance Sheet] A component of [Liabilities] representing the carrying amount of consideration received or receivable on potential earnings that were not recognized as revenue; including sales; license fees; and royalties; but excluding interest income.
152 | - depamor : Depreciation Amortization & Accretion : [Cash Flow Statement] A component of operating cash flow representing the aggregate net amount of depreciation; amortization; and accretion recognized during an accounting period. As a non-cash item; the net amount is added back to net income when calculating cash provided by or used in operations using the indirect method.
153 | - deposits : Deposit Liabilities : [Balance Sheet] A component of [Liabilities] representing the total of all deposit liabilities held; including foreign and domestic; interest and noninterest bearing. May include demand deposits; saving deposits; Negotiable Order of Withdrawal and time deposits among others.
154 | - divyield : Dividend Yield : [Metrics] Dividend Yield measures the ratio between a company's [DPS] and its [Price].
155 | - dps : Dividends per Basic Common Share : [Income Statement] Aggregate dividends declared during the period for each split-adjusted share of common stock outstanding. Includes spinoffs where identified.
156 | - ebit : Earning Before Interest & Taxes (EBIT) : [Income Statement] Earnings Before Interest and Tax is calculated by adding [TaxExp] and [IntExp] back to [NetInc].
157 | - ebitda : Earnings Before Interest Taxes & Depreciation Amortization (EBITDA) : [Metrics] EBITDA is a non-GAAP accounting metric that is widely used when assessing the performance of companies; calculated by adding [DepAmor] back to [EBIT].
158 | - ebitdamargin : EBITDA Margin : [Metrics] Measures the ratio between a company's [EBITDA] and [Revenue].
159 | - ebitdausd : Earnings Before Interest Taxes & Depreciation Amortization (USD) : [Metrics] [EBITDA] in USD; converted by [FXUSD].
160 | - ebitusd : Earning Before Interest & Taxes (USD) : [Income Statement] [EBIT] in USD; converted by [FXUSD].
161 | - ebt : Earnings before Tax : [Metrics] Earnings Before Tax is calculated by adding [TaxExp] back to [NetInc].
162 | - eps : Earnings per Basic Share : [Income Statement] Earnings per share as calculated and reported by the company. Approximates to the amount of [NetIncCmn] for the period per each [SharesWA] after adjusting for [ShareFactor].
163 | - epsdil : Earnings per Diluted Share : [Income Statement] Earnings per diluted share as calculated and reported by the company. Approximates to the amount of [NetIncCmn] for the period per each [SharesWADil] after adjusting for [ShareFactor].
164 | - epsusd : Earnings per Basic Share (USD) : [Income Statement] [EPS] in USD; converted by [FXUSD].
165 | - equity : Shareholders Equity : [Balance Sheet] A principal component of the balance sheet; in addition to [Liabilities] and [Assets]; that represents the total of all stockholders' equity (deficit) items; net of receivables from officers; directors; owners; and affiliates of the entity which are attributable to the parent.
166 | - equityusd : Shareholders Equity (USD) : [Balance Sheet] [Equity] in USD; converted by [FXUSD].
167 | - ev : Enterprise Value : [Metrics] Enterprise value is a measure of the value of a business as a whole; calculated as [MarketCap] plus [DebtUSD] minus [CashnEqUSD].
168 | - evebit : Enterprise Value over EBIT : [Metrics] Measures the ratio between [EV] and [EBITUSD].
169 | - evebitda : Enterprise Value over EBITDA : [Metrics] Measures the ratio between [EV] and [EBITDAUSD].
170 | - fcf : Free Cash Flow : [Metrics] Free Cash Flow is a measure of financial performance calculated as [NCFO] minus [CapEx].
171 | - fcfps : Free Cash Flow per Share : [Metrics] Free Cash Flow per Share is a valuation metric calculated by dividing [FCF] by [SharesWA] and [ShareFactor].
172 | - fxusd : Foreign Currency to USD Exchange Rate : [Metrics] The exchange rate used for the conversion of foreign currency to USD for non-US companies that do not report in USD.
173 | - gp : Gross Profit : [Income Statement] Aggregate revenue [Revenue] less cost of revenue [CoR] directly attributable to the revenue generation activity.
174 | - grossmargin : Gross Margin : [Metrics] Gross Margin measures the ratio between a company's [GP] and [Revenue].
175 | - intangibles : Goodwill and Intangible Assets : [Balance Sheet] A component of [Assets] representing the carrying amounts of all intangible assets and goodwill as of the balance sheet date; net of accumulated amortization and impairment charges.
176 | - intexp : Interest Expense : [Income Statement] Amount of the cost of borrowed funds accounted for as interest expense.
177 | - invcap : Invested Capital : [Metrics] Invested capital is an input into the calculation of [ROIC]; and is calculated as: [Debt] plus [Assets] minus [Intangibles] minus [CashnEq] minus [LiabilitiesC]. Please note this calculation method is subject to change.
178 | - inventory : Inventory : [Balance Sheet] A component of [Assets] representing the amount after valuation and reserves of inventory expected to be sold; or consumed within one year or operating cycle; if longer.
179 | - investments : Investments : [Balance Sheet] A component of [Assets] representing the total amount of marketable and non-marketable securities; loans receivable and other invested assets.
180 | - investmentsc : Investments Current : [Balance Sheet] The current portion of [Investments]; reported if the company operates a classified balance sheet that segments current and non-current assets.
181 | - investmentsnc : Investments Non-Current : [Balance Sheet] The non-current portion of [Investments]; reported if the company operates a classified balance sheet that segments current and non-current assets.
182 | - liabilities : Total Liabilities : [Balance Sheet] Sum of the carrying amounts as of the balance sheet date of all liabilities that are recognized. Principal components are [Debt]; [DeferredRev]; [Payables];[Deposits]; and [TaxLiabilities].
183 | - liabilitiesc : Current Liabilities : [Balance Sheet] The current portion of [Liabilities]; reported if the company operates a classified balance sheet that segments current and non-current liabilities.
184 | - liabilitiesnc : Liabilities Non-Current : [Balance Sheet] The non-current portion of [Liabilities]; reported if the company operates a classified balance sheet that segments current and non-current liabilities.
185 | - marketcap : Market Capitalization : [Metrics] Represents the product of [SharesBas]; [Price] and [ShareFactor].
186 | - ncf : Net Cash Flow / Change in Cash & Cash Equivalents : [Cash Flow Statement] Principal component of the cash flow statement representing the amount of increase (decrease) in cash and cash equivalents. Includes [NCFO]; investing [NCFI] and financing [NCFF] for continuing and discontinued operations; and the effect of exchange rate changes on cash [NCFX].
187 | - ncfbus : Net Cash Flow - Business Acquisitions and Disposals : [Cash Flow Statement] A component of [NCFI] representing the net cash inflow (outflow) associated with the acquisition & disposal of businesses; joint-ventures; affiliates; and other named investments.
188 | - ncfcommon : Issuance (Purchase) of Equity Shares : [Cash Flow Statement] A component of [NCFF] representing the net cash inflow (outflow) from common equity changes. Includes additional capital contributions from share issuances and exercise of stock options; and outflow from share repurchases.
189 | - ncfdebt : Issuance (Repayment) of Debt Securities : [Cash Flow Statement] A component of [NCFF] representing the net cash inflow (outflow) from issuance (repayment) of debt securities.
190 | - ncfdiv : Payment of Dividends & Other Cash Distributions : [Cash Flow Statement] A component of [NCFF] representing dividends and dividend equivalents paid on common stock and restricted stock units.
191 | - ncff : Net Cash Flow from Financing : [Cash Flow Statement] A component of [NCF] representing the amount of cash inflow (outflow) from financing activities; from continuing and discontinued operations. Principal components of financing cash flow are: issuance (purchase) of equity shares; issuance (repayment) of debt securities; and payment of dividends & other cash distributions.
192 | - ncfi : Net Cash Flow from Investing : [Cash Flow Statement] A component of [NCF] representing the amount of cash inflow (outflow) from investing activities; from continuing and discontinued operations. Principal components of investing cash flow are: capital (expenditure) disposal of equipment [CapEx]; business (acquisitions) disposition [NCFBus] and investment (acquisition) disposal [NCFInv].
193 | - ncfinv : Net Cash Flow - Investment Acquisitions and Disposals : [Cash Flow Statement] A component of [NCFI] representing the net cash inflow (outflow) associated with the acquisition & disposal of investments; including marketable securities and loan originations.
194 | - ncfo : Net Cash Flow from Operations : [Cash Flow Statement] A component of [NCF] representing the amount of cash inflow (outflow) from operating activities; from continuing and discontinued operations.
195 | - ncfx : Effect of Exchange Rate Changes on Cash : [Cash Flow Statement] A component of Net Cash Flow [NCF] representing the amount of increase (decrease) from the effect of exchange rate changes on cash and cash equivalent balances held in foreign currencies.
196 | - netinc : Net Income : [Income Statement] The portion of profit or loss for the period; net of income taxes; which is attributable to the parent after the deduction of [NetIncNCI] from [ConsolInc]; and before the deduction of [PrefDivIS].
197 | - netinccmn : Net Income Common Stock : [Income Statement] The amount of net income (loss) for the period due to common shareholders. Typically differs from [NetInc] to the parent entity due to the deduction of [PrefDivIS].
198 | - netinccmnusd : Net Income Common Stock (USD) : [Income Statement] [NetIncCmn] in USD; converted by [FXUSD].
199 | - netincdis : Net Loss Income from Discontinued Operations : [Income Statement] Amount of loss (income) from a disposal group; net of income tax; reported as a separate component of income.
200 | - netincnci : Net Income to Non-Controlling Interests : [Income Statement] The portion of income which is attributable to non-controlling interest shareholders; subtracted from [ConsolInc] in order to obtain [NetInc].
201 | - netmargin : Profit Margin : [Metrics] Measures the ratio between a company's [NetIncCmn] and [Revenue].
202 | - opex : Operating Expenses : [Income Statement] Operating expenses represents the total expenditure on [SGnA]; [RnD] and other operating expense items; it excludes [CoR].
203 | - opinc : Operating Income : [Income Statement] Operating income is a measure of financial performance before the deduction of [IntExp]; [TaxExp] and other Non-Operating items. It is calculated as [GP] minus [OpEx].
204 | - payables : Trade and Non-Trade Payables : [Balance Sheet] A component of [Liabilities] representing trade and non-trade payables.
205 | - payoutratio : Payout Ratio : [Metrics] The percentage of earnings paid as dividends to common stockholders. Calculated by dividing [DPS] by [EPSUSD].
206 | - pb : Price to Book Value : [Metrics] Measures the ratio between [MarketCap] and [EquityUSD].
207 | - pe : Price Earnings (Damodaran Method) : [Metrics] Measures the ratio between [MarketCap] and [NetIncCmnUSD].
208 | - pe1 : Price to Earnings Ratio : [Metrics] An alternative to [PE] representing the ratio between [Price] and [EPSUSD].
209 | - ppnenet : Property Plant & Equipment Net : [Balance Sheet] A component of [Assets] representing the amount after accumulated depreciation; depletion and amortization of physical assets used in the normal conduct of business to produce goods and services and not intended for resale. Includes Operating Right of Use Assets.
210 | - prefdivis : Preferred Dividends Income Statement Impact : [Income Statement] Income statement item reflecting dividend payments to preferred stockholders. Subtracted from Net Income to Parent [NetInc] to obtain Net Income to Common Stockholders [NetIncCmn].
211 | - price : Share Price (Adjusted Close) : [Entity] The price per common share adjusted for stock splits but not adjusted for dividends; used in the computation of [PE1]; [PS1]; [DivYield] and [SPS].
212 | - ps : Price Sales (Damodaran Method) : [Metrics] Measures the ratio between [MarketCap] and [RevenueUSD].
213 | - ps1 : Price to Sales Ratio : [Metrics] An alternative calculation method to [PS]; that measures the ratio between a company's [Price] and its [SPS].
214 | - receivables : Trade and Non-Trade Receivables : [Balance Sheet] A component of [Assets] representing trade and non-trade receivables.
215 | - retearn : Accumulated Retained Earnings (Deficit) : [Balance Sheet] A component of [Equity] representing the cumulative amount of the entity's undistributed earnings or deficit. May only be reported annually by certain companies; rather than quarterly.
216 | - revenue : Revenues : [Income Statement] Amount of Revenue recognized from goods sold; services rendered; insurance premiums; or other activities that constitute an earning process. Interest income for financial institutions is reported net of interest expense and provision for credit losses.
217 | - revenueusd : Revenues (USD) : [Income Statement] [Revenue] in USD; converted by [FXUSD].
218 | - rnd : Research and Development Expense : [Income Statement] A component of [OpEx] representing the aggregate costs incurred in a planned search or critical investigation aimed at discovery of new knowledge with the hope that such knowledge will be useful in developing a new product or service.
219 | - sbcomp : Share Based Compensation : [Cash Flow Statement] A component of [NCFO] representing the total amount of noncash; equity-based employee remuneration. This may include the value of stock or unit options; amortization of restricted stock or units; and adjustment for officers' compensation. As noncash; this element is an add back when calculating net cash generated by operating activities using the indirect method.
220 | - sgna : Selling General and Administrative Expense : [Income Statement] A component of [OpEx] representing the aggregate total costs related to selling a firm's product and services; as well as all other general and administrative expenses. Direct selling expenses (for example; credit; warranty; and advertising) are expenses that can be directly linked to the sale of specific products. Indirect selling expenses are expenses that cannot be directly linked to the sale of specific products; for example telephone expenses; Internet; and postal charges. General and administrative expenses include salaries of non-sales personnel; rent; utilities; communication; etc.
221 | - sharefactor : Share Factor : [Entity] Share factor is a multiplicand in the calculation of [MarketCap] and is used to adjust for: American Depository Receipts (ADRs) that represent more or less than 1 underlying share; and; companies which have different earnings share for different share classes (eg Berkshire Hathaway - BRK.B).
222 | - sharesbas : Shares (Basic) : [Entity] The number of shares or other units outstanding of the entity's capital or common stock or other ownership interests; as stated on the cover of related periodic report (10-K/10-Q); after adjustment for stock splits.
223 | - shareswa : Weighted Average Shares : [Income Statement] The weighted average number of shares or units issued and outstanding that are used by the company to calculate [EPS]; determined based on the timing of issuance of shares or units in the period.
224 | - shareswadil : Weighted Average Shares Diluted : [Income Statement] The weighted average number of shares or units issued and outstanding that are used by the company to calculate [EPSDil]; determined based on the timing of issuance of shares or units in the period.
225 | - sps : Sales per Share : [Metrics] Sales per Share measures the ratio between [RevenueUSD] and [SharesWA] as adjusted by [ShareFactor].
226 | - tangibles : Tangible Asset Value : [Metrics] The value of tangible assets calculated as the difference between [Assets] and [Intangibles].
227 | - taxassets : Tax Assets : [Balance Sheet] A component of [Assets] representing tax assets and receivables.
228 | - taxexp : Income Tax Expense : [Income Statement] Amount of current income tax expense (benefit) and deferred income tax expense (benefit) pertaining to continuing operations.
229 | - taxliabilities : Tax Liabilities : [Balance Sheet] A component of [Liabilities] representing outstanding tax liabilities.
230 | - tbvps : Tangible Assets Book Value per Share : [Metrics] Measures the ratio between [Tangibles] and [SharesWA] as adjusted by [ShareFactor].
231 | - workingcapital : Working Capital : [Metrics] Working capital measures the difference between [AssetsC] and [LiabilitiesC].
232 | - roe : Return on Average Equity : [Metrics] Return on equity measures a corporation's profitability by calculating the amount of [NetIncCmn] returned as a percentage of [EquityAvg].
233 | - roa : Return on Average Assets : [Metrics] Return on assets measures how profitable a company is ([NetIncCmn]) relative to its total assets ([AssetsAvg]).
234 | 
235 | ##### Sharadar Equity Prices [SHARADAR/SEP] features
236 | 
237 | - open : Open Price - Split Adjusted : The opening share price, adjusted for stock splits and stock dividends.
238 | - high : High Price - Split Adjusted : The high share price, adjusted for stock splits and stock dividends.
239 | - low : Low Price - Split Adjusted : The low share price, adjusted for stock splits and stock dividends.
240 | - close : Close Price - Split Adjusted : The closing share price, adjusted for stock splits and stock dividends.
241 | - volume : Volume - Split Adjusted : The traded volume, adjusted for stock splits and stock dividends.
242 | 
243 | ##### Daily Metrics [SHARADAR/DAILY] features
244 | 
245 | - ev : Enterprise Value - Daily : Enterprise value is a measure of the value of a business as a whole; calculated as [MarketCap] plus [DebtUSD] minus [CashnEqUSD]. [MarketCap] is calculated by us, and the remaining figures are sourced from the most recent SEC form 10 filings.
246 | - evebit : Enterprise Value over EBIT - Daily : Measures the ratio between [EV] and [EBITUSD]. EBITUSD is derived from the most recent SEC form 10 filings.
247 | - evebitda : Enterprise Value over EBITDA - Daily : Measures the ratio between [EV] and [EBITDAUSD]. EBITDAUSD is derived from the most recent SEC form 10 filings.
248 | - marketcap : Market Capitalization - Daily : Represents the product of [SharesBas]; [Price] and [ShareFactor]. [SharesBas] is sourced from the most recent SEC form 10 filing.
249 | - pb : Price to Book Value - Daily : Measures the ratio between [MarketCap] and [EquityUSD]. [EquityUSD] is sourced from the most recent SEC form 10 filing.
250 | - pe : Price Earnings (Damodaran Method) - Daily : Measures the ratio between [MarketCap] and [NetIncCmnUSD]. [NetIncCmnUSD] is sourced from the most recent SEC form 10 filings.
251 | - ps : Price Sales (Damodaran Method) - Daily : Measures the ratio between [MarketCap] and [RevenueUSD]. [RevenueUSD] is sourced from the most recent SEC form 10 filings.
252 | 
253 | ##### Sentiment Analysis and News Analytics [IFT/NSA] features
254 | 
255 | - sentiment: a numeric measure of the bullishness / bearishness of news coverage of the stock.
256 | - sentiment_high: highest intraday sentiment scores.
257 | - sentiment_low: lowest intraday sentiment scores.
258 | - news_volume: the absolute number of news articles covering the stock.
259 | - news_buzz: a numeric measure of the change in coverage volume for the stock.
260 | 
261 | 
262 | ## Factor Analysis Target Variables
263 | 
264 | The factor analysis is performed using [alphalens](https://quantopian.github.io/alphalens/index.html) and [Pyfolio](https://quantopian.github.io/pyfolio/). These packages provide APIs useful for data processing and factor analysis over the pre-defined periods. These metrics are described below (a usage sketch follows the list):
265 | 
266 | - Cleaning and preparing data `alphalens.utils.get_clean_factor_and_forward_returns`: Formats the factor data, pricing data, and group mappings into a DataFrame that contains aligned MultiIndex indices of timestamp and asset. The returned data will be formatted to be suitable for Alphalens functions.
267 | - Cumulative factor return `alphalens.performance.factor_returns`: Builds cumulative returns from ‘period’ returns. This function simulates the cumulative effect that a series of gains or losses (the ‘returns’) have on an original amount of capital over a period of time.
268 | - Mean quantile return `alphalens.performance.mean_return_by_quantile`: Computes mean returns for factor quantiles across provided forward returns columns.
269 | - Factor Rank Autocorrelation `alphalens.performance.factor_rank_autocorrelation`: Computes autocorrelation of mean factor ranks in specified time spans. We must compare period to period factor ranks rather than factor values to account for systematic shifts in the factor values of all names or names within a group. This metric is useful for measuring the turnover of a factor. If the value of a factor for each name changes randomly from period to period, we’d expect an autocorrelation of 0.
270 | - Sharpe ratio `sharpe_ratio`: This function computes the annualized Sharpe ratio. This metric is used to understand the return of an investment compared to its risk. The ratio is the average excess return earned per unit of volatility or total risk. Volatility is a measure of the factor return fluctuations of an asset.
271 | 
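A minimal sketch of how these Alphalens calls chain together (assuming `my_factor` is a pandas Series indexed by (date, asset) and `pricing` is a dates-by-assets DataFrame of prices; both names are placeholders, not objects defined in this repository):

```python
import alphalens as al

# Align the raw factor values with forward returns over the chosen horizons.
factor_data = al.utils.get_clean_factor_and_forward_returns(
    factor=my_factor,   # pd.Series with a (date, asset) MultiIndex (placeholder)
    prices=pricing,     # pd.DataFrame of prices, dates x assets (placeholder)
    quantiles=5,
    periods=(1, 5, 10))

# Factor-weighted portfolio returns for each forward-return column.
returns = al.performance.factor_returns(factor_data)

# Mean return per factor quantile, with standard errors.
mean_ret, std_err = al.performance.mean_return_by_quantile(factor_data)

# Rank autocorrelation, a proxy for factor turnover.
autocorr = al.performance.factor_rank_autocorrelation(factor_data, period=1)
```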
272 | 
273 | ## The Combined Alpha Vector
274 | 
275 | To get a single score for each stock, we have to combine the selected factors. This is an area where machine learning can be very helpful. In this context, the [feature_weights](https://github.com/keyvantaj/Quantitative/blob/master/feature_weights.py) module is implemented to give us optimal weights for the selected alpha factors, resulting in the best combination.
276 | 
277 | 
278 | ## Risk Management
279 | 
280 | We measure the predicted portfolio risk using the [risk_model](https://github.com/keyvantaj/Quantitative/blob/master/risk_model.py) module. For this purpose, the portfolio risk formula is √(Xᵀ(BFBᵀ + S)X), where (a numerical sketch follows the list):
281 | 
282 | * X is the vector of portfolio weights
283 | * B is the matrix of factor betas
284 | * F is the factor covariance matrix
285 | * S is the idiosyncratic variance matrix
286 | 
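A minimal numerical sketch of this formula, using toy dimensions and random placeholder inputs (none of these values come from the project data):

```python
import numpy as np

rng = np.random.default_rng(0)

n_assets, n_factors = 10, 3
X = rng.uniform(-0.1, 0.1, n_assets)              # portfolio weights (toy values)
B = rng.normal(size=(n_assets, n_factors))        # factor betas
F = np.diag(rng.uniform(0.01, 0.05, n_factors))   # factor covariance matrix
S = np.diag(rng.uniform(0.001, 0.01, n_assets))   # idiosyncratic variance matrix

# Predicted portfolio risk: sqrt( X^T (B F B^T + S) X )
portfolio_risk = np.sqrt(X @ (B @ F @ B.T + S) @ X)
print(portfolio_risk)
```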
287 | 
288 | ## Optimization
289 | 
290 | Once an alpha model and a risk model are generated, we want to find a portfolio that trades as close as possible to the alpha model while limiting risk as measured by the [risk_model](https://github.com/keyvantaj/Quantitative/blob/master/risk_model.py). The [cvxpy](https://www.cvxpy.org/) package is used to implement the [optimizer](https://github.com/keyvantaj/Quantitative/blob/master/optimizer.py).
291 | 
292 | The CVXPY objective function is to maximize αᵀx, where x is the vector of portfolio weights and α is the alpha vector.
293 | 
294 | The optimization is subject to the following constraints:
295 | 
296 | * r ≤ risk_cap²
297 | * Bᵀx ⪯ factor_max
298 | * Bᵀx ⪰ factor_min
299 | * xᵀ𝟙 = 0
300 | * ‖x‖₁ ≤ 1
301 | * x ⪰ weights_min
302 | * x ⪯ weights_max
303 | 
304 | Where x is the portfolio weights, B is the factor betas, and r is the portfolio risk calculated in the [risk model](https://github.com/keyvantaj/Quantitative/blob/master/risk_model.py) module.
305 | 
306 | The first constraint is that the predicted risk be less than some maximum limit. The second and third constraints are the maximum and minimum limits on portfolio factor exposures. The fourth constraint is the "market neutral" constraint: the sum of the weights must be zero. The fifth constraint is the leverage constraint: the sum of the absolute values of the weights must be less than or equal to 1.0. The last two are minimum and maximum limits on individual holdings.
307 | 
308 | 
309 | 
310 | 
311 | 
312 | 
313 | 
--------------------------------------------------------------------------------
/alpha_research.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Alpha Research\n",
8 | "\n",
9 | "The goal of this project is to define a liquid universe of stocks, apply our factors to it, and determine through factor analysis whether there is enough potential to send the results to [production](https://github.com/keyvantaj/Quantitative/blob/master/portfolio_management.ipynb). After selecting and combining factors using machine learning techniques, the combined factor is analyzed and improved with an optimizer function to integrate our risk model. \n",
10 | "\n",
11 | "This project workflow comprises distinct stages, including: \n",
12 | "\n",
13 | "1. Parameters\n",
14 | "2. Universe definition\n",
15 | "3. Sector definition\n",
16 | "4. Alpha factors\n",
17 | "5. Factor analysis\n",
18 | "6. Factors combination\n",
19 | "7. Risk analysis for equal weights\n",
20 | "8. Integrating factor data to the optimizer\n",
21 | "9. Optimized alpha vector analysis \n",
22 | "10. Predicted portfolio\n",
23 | "\n",
24 | "In this context we have used different sources of data provided by \n",
25 | "[Sharadar](https://www.quandl.com/publishers/sharadar) and \n",
26 | "[IFT](https://www.quandl.com/publishers/ift) as described below:\n",
27 | "\n",
28 | "- Sharadar Equity Prices ([SHARADAR/SEP](https://www.quandl.com/databases/SEP/data))\n",
29 | "Updated daily, End-Of-Day (EOD) price (ohlcv) data for more than 14,000 US public companies. \n",
\n", 30 | "- Indicator Descriptions ([SHARADAR/INDICATORS](https://www.quandl.com/databases/SF1/data))\n", 31 | "Description of indicators listed in SF1 table for more than 14,000 US public companies.\n", 32 | "- Tickers and Metadata ([SHARADAR/TICKERS](https://www.quandl.com/databases/SF1/data))\n", 33 | "Information and metadata for more than 14,000 US public companies.\n", 34 | "- Core US Fundamentals ([SHARADAR/SF1](https://www.quandl.com/databases/SF1/data))\n", 35 | " 150 essential fundamental indicators and financial ratios, for more than 14,000 US public companies.\n", 36 | "- Daily Metrics ([SHARADAR/DAILY](https://www.quandl.com/databases/SF1/data))\n", 37 | " 5 essential metrics indicators and financial ratios daily updated, for more than 14,000 US public companies.\n", 38 | "- Sentiment Analysis and News Analytics ([IFT/NSA](https://www.quandl.com/databases/NS1/data)) \n", 39 | "News, blogs, social media and proprietary sources for thousands of stocks." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### Tickers and Metadata [SHARADAR/TICKERS] features" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "- table : Sharadar Table : The database table which the ticker is featured in. Examples are: \"SF1\" or \"SEP. \n", 54 | "\n", 55 | "- permaticker : Permanent Ticker Symbol : The permaticker is a unique and unchanging identifier for an issuer in the dataset which is issued by Sharadar. \n", 56 | "\n", 57 | "- name : Issuer Name : The name of the security issuer. \n", 58 | "\n", 59 | "- exchange : Stock Exchange : The exchange on which the security trades. Examples are: \"NASDAQ\";\"NYSE\";\"NYSEARCA\";\"BATS\";\"OTC\" and \"NYSEMKT\" (previously the American Stock exchange). \n", 60 | "\n", 61 | "- isdelisted : Is Delisted? : Is the security delisted? [Y]es or [N]o. \n", 62 | "\n", 63 | "- category : Issuer Category : The category of the issuer: \"Domestic\"; \"Canadian\" or \"ADR\". \n", 64 | "\n", 65 | "- cusips : CUSIPs : A security identifier. Space delimited in the event of multiple identifiers. \n", 66 | "\n", 67 | "- siccode : Standard Industrial Classification (SIC) Code : The Standard Industrial Classification (SIC) is a system for classifying industries by a four-digit code; as sourced from SEC filings. More on the SIC system here: https://en.wikipedia.org/wiki/Standard_Industrial_Classification \n", 68 | "\n", 69 | "- sicsector : SIC Sector : The SIC sector is based on the SIC code and the division tabled here: https://en.wikipedia.org/wiki/Standard_Industrial_Classification \n", 70 | "\n", 71 | "- sicindustry : SIC Industry : The SIC industry is based on the SIC code and the industry tabled here: https://www.sec.gov/info/edgar/siccodes.htm \n", 72 | "\n", 73 | "- famasector : Fama Sector : Not currently active - coming in a future update. \n", 74 | "\n", 75 | "- famaindustry : Fama Industry : Industry classifications based on the SIC code and classifications by Fama and French here: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/det_48_ind_port.html \n", 76 | "\n", 77 | "- sector : Sector : Sharadar's sector classification based on SIC codes in a format which approximates to GICS. \n", 78 | "\n", 79 | "- industry : Industry : Sharadar's industry classification based on SIC codes in a format which approximates to GICS. \n", 80 | "\n", 81 | "- scalemarketcap : Company Scale - Market Cap : This field is experimental and subject to change. 
82 | "\n",
83 | "- scalerevenue : Company Scale - Revenue : This field is experimental and subject to change. It categorises the company according to its maximum observed annual revenue as follows: 1 - Nano < 50m; 2 - Micro < 300m; 3 - Small < 2bn; 4 - Mid < 10bn; 5 - Large < 200bn; 6 - Mega >= 200bn \n",
84 | "\n",
85 | "- relatedtickers : Related Tickers : Where related tickers have been identified this field is populated. Related tickers can include the prior ticker before a ticker change; and tickers for alternative share classes. \n",
86 | "\n",
87 | "- currency : Currency : The company functional reporting currency for the SF1 Fundamentals table or the currency for EOD prices in SEP and SFP. \n",
88 | "\n",
89 | "- location : Location : The company location as registered with the Securities and Exchange Commission. \n",
90 | "\n",
91 | "- lastupdated : Last Updated Date : Last Updated represents the last date that this database entry was updated; which is useful to users when updating their local records. \n",
92 | "\n",
93 | "- firstadded : First Added Date : The date that the ticker was first added to coverage in the dataset. \n",
94 | "\n",
95 | "- firstpricedate : First Price Date : The date of the first price observation for a given ticker. Can be used as a proxy for IPO date. Minimum value of 1986-01-01 for IPO's that occurred prior to this date. Note: this does not necessarily represent the first price date available in our datasets since our end of day price history currently starts in December 1998. \n",
96 | "\n",
97 | "- lastpricedate : Last Price Date : The most recent price observation available. \n",
98 | "\n",
99 | "- firstquarter : First Quarter : The first financial quarter available in the dataset. \n",
100 | "\n",
101 | "- lastquarter : Last Quarter : The last financial quarter available in the dataset. \n",
102 | "\n",
103 | "- secfilings : SEC Filings URL : The URL pointing to the SEC filings which also contains the Central Index Key (CIK). \n",
104 | "\n",
105 | "- companysite : Company Website URL : The URL pointing to the company website. "
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "### Core US Fundamentals [SHARADAR/SF1] features"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "- accoci : Accumulated Other Comprehensive Income : [Balance Sheet] A component of [Equity] representing the accumulated change in equity from transactions and other events and circumstances from non-owner sources; net of tax effect; at period end. Includes foreign currency translation items; certain pension adjustments; unrealized gains and losses on certain investments in debt and equity securities. \n",
120 | "\n",
121 | "- assets : Total Assets : [Balance Sheet] Sum of the carrying amounts as of the balance sheet date of all assets that are recognized. Major components are [CashnEq]; [Investments];[Intangibles]; [PPNENet];[TaxAssets] and [Receivables]. \n",
122 | "\n",
123 | "- assetsc : Current Assets : [Balance Sheet] The current portion of [Assets]; reported if a company operates a classified balance sheet that segments current and non-current assets. \n",
\n", 124 | "\n", 125 | "- assetsnc : Assets Non-Current : [Balance Sheet] Amount of non-current assets; for companies that operate a classified balance sheet. Calculated as the different between Total Assets [Assets] and Current Assets [AssetsC]. \n", 126 | "\n", 127 | "- bvps : Book Value per Share : [Metrics] Measures the ratio between [Equity] and [SharesWA] as adjusted by [ShareFactor]. \n", 128 | "\n", 129 | "- capex : Capital Expenditure : [Cash Flow Statement] A component of [NCFI] representing the net cash inflow (outflow) associated with the acquisition & disposal of long-lived; physical & intangible assets that are used in the normal conduct of business to produce goods and services and are not intended for resale. Includes cash inflows/outflows to pay for construction of self-constructed assets & software. \n", 130 | "\n", 131 | "- cashneq : Cash and Equivalents : [Balance Sheet] A component of [Assets] representing the amount of currency on hand as well as demand deposits with banks or financial institutions. \n", 132 | "\n", 133 | "- cashnequsd : Cash and Equivalents (USD) : [Balance Sheet] [CashnEq] in USD; converted by [FXUSD]. \n", 134 | "\n", 135 | "- cor : Cost of Revenue : [Income Statement] The aggregate cost of goods produced and sold and services rendered during the reporting period. \n", 136 | "\n", 137 | "- consolinc : Consolidated Income : [Income Statement] The portion of profit or loss for the period; net of income taxes; which is attributable to the consolidated entity; before the deduction of [NetIncNCI]. \n", 138 | "\n", 139 | "- currentratio : Current Ratio : [Metrics] The ratio between [AssetsC] and [LiabilitiesC]; for companies that operate a classified balance sheet. \n", 140 | "\n", 141 | "- de : Debt to Equity Ratio : [Metrics] Measures the ratio between [Liabilities] and [Equity]. \n", 142 | "\n", 143 | "- debt : Total Debt : [Balance Sheet] A component of [Liabilities] representing the total amount of current and non-current debt owed. Includes secured and unsecured bonds issued; commercial paper; notes payable; credit facilities; lines of credit; capital lease obligations; operating lease obligations; and convertible notes. \n", 144 | "\n", 145 | "- debtc : Debt Current : [Balance Sheet] The current portion of [Debt]; reported if the company operates a classified balance sheet that segments current and non-current liabilities. \n", 146 | "\n", 147 | "- debtnc : Debt Non-Current : [Balance Sheet] The non-current portion of [Debt] reported if the company operates a classified balance sheet that segments current and non-current liabilities. \n", 148 | "\n", 149 | "- debtusd : Total Debt (USD) : [Balance Sheet] [Debt] in USD; converted by [FXUSD]. \n", 150 | "\n", 151 | "- deferredrev : Deferred Revenue : [Balance Sheet] A component of [Liabilities] representing the carrying amount of consideration received or receivable on potential earnings that were not recognized as revenue; including sales; license fees; and royalties; but excluding interest income. \n", 152 | "\n", 153 | "- depamor : Depreciation Amortization & Accretion : [Cash Flow Statement] A component of operating cash flow representing the aggregate net amount of depreciation; amortization; and accretion recognized during an accounting period. As a non-cash item; the net amount is added back to net income when calculating cash provided by or used in operations using the indirect method. 
\n", 154 | "\n", 155 | "- deposits : Deposit Liabilities : [Balance Sheet] A component of [Liabilities] representing the total of all deposit liabilities held; including foreign and domestic; interest and noninterest bearing. May include demand deposits; saving deposits; Negotiable Order of Withdrawal and time deposits among others. \n", 156 | "\n", 157 | "- divyield : Dividend Yield : [Metrics] Dividend Yield measures the ratio between a company's [DPS] and its [Price]. \n", 158 | "\n", 159 | "- dps : Dividends per Basic Common Share : [Income Statement] Aggregate dividends declared during the period for each split-adjusted share of common stock outstanding. Includes spinoffs where identified. \n", 160 | "\n", 161 | "- ebit : Earning Before Interest & Taxes (EBIT) : [Income Statement] Earnings Before Interest and Tax is calculated by adding [TaxExp] and [IntExp] back to [NetInc]. \n", 162 | "\n", 163 | "- ebitda : Earnings Before Interest Taxes & Depreciation Amortization (EBITDA) : [Metrics] EBITDA is a non-GAAP accounting metric that is widely used when assessing the performance of companies; calculated by adding [DepAmor] back to [EBIT]. \n", 164 | "\n", 165 | "- ebitdamargin : EBITDA Margin : [Metrics] Measures the ratio between a company's [EBITDA] and [Revenue]. \n", 166 | "\n", 167 | "- ebitdausd : Earnings Before Interest Taxes & Depreciation Amortization (USD) : [Metrics] [EBITDA] in USD; converted by [FXUSD]. \n", 168 | "\n", 169 | "- ebitusd : Earning Before Interest & Taxes (USD) : [Income Statement] [EBIT] in USD; converted by [FXUSD]. \n", 170 | "\n", 171 | "- ebt : Earnings before Tax : [Metrics] Earnings Before Tax is calculated by adding [TaxExp] back to [NetInc]. \n", 172 | "\n", 173 | "- eps : Earnings per Basic Share : [Income Statement] Earnings per share as calculated and reported by the company. Approximates to the amount of [NetIncCmn] for the period per each [SharesWA] after adjusting for [ShareFactor]. \n", 174 | "\n", 175 | "- epsdil : Earnings per Diluted Share : [Income Statement] Earnings per diluted share as calculated and reported by the company. Approximates to the amount of [NetIncCmn] for the period per each [SharesWADil] after adjusting for [ShareFactor].. \n", 176 | "\n", 177 | "- epsusd : Earnings per Basic Share (USD) : [Income Statement] [EPS] in USD; converted by [FXUSD]. \n", 178 | "\n", 179 | "- equity : Shareholders Equity : [Balance Sheet] A principal component of the balance sheet; in addition to [Liabilities] and [Assets]; that represents the total of all stockholders' equity (deficit) items; net of receivables from officers; directors; owners; and affiliates of the entity which are attributable to the parent. \n", 180 | "\n", 181 | "- equityusd : Shareholders Equity (USD) : [Balance Sheet] [Equity] in USD; converted by [FXUSD]. \n", 182 | "\n", 183 | "- ev : Enterprise Value : [Metrics] Enterprise value is a measure of the value of a business as a whole; calculated as [MarketCap] plus [DebtUSD] minus [CashnEqUSD]. \n", 184 | "\n", 185 | "- evebit : Enterprise Value over EBIT : [Metrics] Measures the ratio between [EV] and [EBITUSD]. \n", 186 | "\n", 187 | "- evebitda : Enterprise Value over EBITDA : [Metrics] Measures the ratio between [EV] and [EBITDAUSD]. \n", 188 | "\n", 189 | "- fcf : Free Cash Flow : [Metrics] Free Cash Flow is a measure of financial performance calculated as [NCFO] minus [CapEx]. 
\n", 190 | "\n", 191 | "- fcfps : Free Cash Flow per Share : [Metrics] Free Cash Flow per Share is a valuation metric calculated by dividing [FCF] by [SharesWA] and [ShareFactor]. \n", 192 | "\n", 193 | "- fxusd : Foreign Currency to USD Exchange Rate : [Metrics] The exchange rate used for the conversion of foreign currency to USD for non-US companies that do not report in USD. \n", 194 | "\n", 195 | "- gp : Gross Profit : [Income Statement] Aggregate revenue [Revenue] less cost of revenue [CoR] directly attributable to the revenue generation activity. \n", 196 | "\n", 197 | "- grossmargin : Gross Margin : [Metrics] Gross Margin measures the ratio between a company's [GP] and [Revenue]. \n", 198 | "\n", 199 | "- intangibles : Goodwill and Intangible Assets : [Balance Sheet] A component of [Assets] representing the carrying amounts of all intangible assets and goodwill as of the balance sheet date; net of accumulated amortization and impairment charges. \n", 200 | "\n", 201 | "- intexp : Interest Expense : [Income Statement] Amount of the cost of borrowed funds accounted for as interest expense. \n", 202 | "\n", 203 | "- invcap : Invested Capital : [Metrics] Invested capital is an input into the calculation of [ROIC]; and is calculated as: [Debt] plus [Assets] minus [Intangibles] minus [CashnEq] minus [LiabilitiesC]. Please note this calculation method is subject to change. \n", 204 | "\n", 205 | "- inventory : Inventory : [Balance Sheet] A component of [Assets] representing the amount after valuation and reserves of inventory expected to be sold; or consumed within one year or operating cycle; if longer. \n", 206 | "\n", 207 | "- investments : Investments : [Balance Sheet] A component of [Assets] representing the total amount of marketable and non-marketable securties; loans receivable and other invested assets. \n", 208 | "\n", 209 | "- investmentsc : Investments Current : [Balance Sheet] The current portion of [Investments]; reported if the company operates a classified balance sheet that segments current and non-current assets. \n", 210 | "\n", 211 | "- investmentsnc : Investments Non-Current : [Balance Sheet] The non-current portion of [Investments]; reported if the company operates a classified balance sheet that segments current and non-current assets. \n", 212 | "\n", 213 | "- liabilities : Total Liabilities : [Balance Sheet] Sum of the carrying amounts as of the balance sheet date of all liabilities that are recognized. Principal components are [Debt]; [DeferredRev]; [Payables];[Deposits]; and [TaxLiabilities]. \n", 214 | "\n", 215 | "- liabilitiesc : Current Liabilities : [Balance Sheet] The current portion of [Liabilities]; reported if the company operates a classified balance sheet that segments current and non-current liabilities. \n", 216 | "\n", 217 | "- liabilitiesnc : Liabilities Non-Current : [Balance Sheet] The non-current portion of [Liabilities]; reported if the company operates a classified balance sheet that segments current and non-current liabilities. \n", 218 | "\n", 219 | "- marketcap : Market Capitalization : [Metrics] Represents the product of [SharesBas]; [Price] and [ShareFactor]. \n", 220 | "\n", 221 | "- ncf : Net Cash Flow / Change in Cash & Cash Equivalents : [Cash Flow Statement] Principal component of the cash flow statement representing the amount of increase (decrease) in cash and cash equivalents. Includes [NCFO]; investing [NCFI] and financing [NCFF] for continuing and discontinued operations; and the effect of exchange rate changes on cash [NCFX]. 
\n", 222 | "\n", 223 | "- ncfbus : Net Cash Flow - Business Acquisitions and Disposals : [Cash Flow Statement] A component of [NCFI] representing the net cash inflow (outflow) associated with the acquisition & disposal of businesses; joint-ventures; affiliates; and other named investments. \n", 224 | "\n", 225 | "- ncfcommon : Issuance (Purchase) of Equity Shares : [Cash Flow Statement] A component of [NCFF] representing the net cash inflow (outflow) from common equity changes. Includes additional capital contributions from share issuances and exercise of stock options; and outflow from share repurchases. \n", 226 | "\n", 227 | "- ncfdebt : Issuance (Repayment) of Debt Securities : [Cash Flow Statement] A component of [NCFF] representing the net cash inflow (outflow) from issuance (repayment) of debt securities. \n", 228 | "\n", 229 | "- ncfdiv : Payment of Dividends & Other Cash Distributions : [Cash Flow Statement] A component of [NCFF] representing dividends and dividend equivalents paid on common stock and restricted stock units. \n", 230 | "\n", 231 | "- ncff : Net Cash Flow from Financing : [Cash Flow Statement] A component of [NCF] representing the amount of cash inflow (outflow) from financing activities; from continuing and discontinued operations. Principal components of financing cash flow are: issuance (purchase) of equity shares; issuance (repayment) of debt securities; and payment of dividends & other cash distributions. \n", 232 | "\n", 233 | "- ncfi : Net Cash Flow from Investing : [Cash Flow Statement] A component of [NCF] representing the amount of cash inflow (outflow) from investing activities; from continuing and discontinued operations. Principal components of investing cash flow are: capital (expenditure) disposal of equipment [CapEx]; business (acquisitions) disposition [NCFBus] and investment (acquisition) disposal [NCFInv]. \n", 234 | "\n", 235 | "- ncfinv : Net Cash Flow - Investment Acquisitions and Disposals : [Cash Flow Statement] A component of [NCFI] representing the net cash inflow (outflow) associated with the acquisition & disposal of investments; including marketable securities and loan originations. \n", 236 | "\n", 237 | "- ncfo : Net Cash Flow from Operations : [Cash Flow Statement] A component of [NCF] representing the amount of cash inflow (outflow) from operating activities; from continuing and discontinued operations. \n", 238 | "\n", 239 | "- ncfx : Effect of Exchange Rate Changes on Cash : [Cash Flow Statement] A component of Net Cash Flow [NCF] representing the amount of increase (decrease) from the effect of exchange rate changes on cash and cash equivalent balances held in foreign currencies. \n", 240 | "\n", 241 | "- netinc : Net Income : [Income Statement] The portion of profit or loss for the period; net of income taxes; which is attributable to the parent after the deduction of [NetIncNCI] from [ConsolInc]; and before the deduction of [PrefDivIS]. \n", 242 | "\n", 243 | "- netinccmn : Net Income Common Stock : [Income Statement] The amount of net income (loss) for the period due to common shareholders. Typically differs from [NetInc] to the parent entity due to the deduction of [PrefDivIS]. \n", 244 | "\n", 245 | "- netinccmnusd : Net Income Common Stock (USD) : [Income Statement] [NetIncCmn] in USD; converted by [FXUSD]. \n", 246 | "\n", 247 | "- netincdis : Net Loss Income from Discontinued Operations : [Income Statement] Amount of loss (income) from a disposal group; net of income tax; reported as a separate component of income. 
\n", 248 | "\n", 249 | "- netincnci : Net Income to Non-Controlling Interests : [Income Statement] The portion of income which is attributable to non-controlling interest shareholders; subtracted from [ConsolInc] in order to obtain [NetInc]. \n", 250 | "\n", 251 | "- netmargin : Profit Margin : [Metrics] Measures the ratio between a company's [NetIncCmn] and [Revenue]. \n", 252 | "\n", 253 | "- opex : Operating Expenses : [Income Statement] Operating expenses represents the total expenditure on [SGnA]; [RnD] and other operating expense items; it excludes [CoR]. \n", 254 | "\n", 255 | "- opinc : Operating Income : [Income Statement] Operating income is a measure of financial performance before the deduction of [IntExp]; [TaxExp] and other Non-Operating items. It is calculated as [GP] minus [OpEx]. \n", 256 | "\n", 257 | "- payables : Trade and Non-Trade Payables : [Balance Sheet] A component of [Liabilities] representing trade and non-trade payables. \n", 258 | "\n", 259 | "- payoutratio : Payout Ratio : [Metrics] The percentage of earnings paid as dividends to common stockholders. - Calculated by dividing [DPS] by [EPSUSD]. \n", 260 | "\n", 261 | "- pb : Price to Book Value : [Metrics] Measures the ratio between [MarketCap] and [EquityUSD]. \n", 262 | "\n", 263 | "- pe : Price Earnings (Damodaran Method) : [Metrics] Measures the ratio between [MarketCap] and [NetIncCmnUSD] \n", 264 | "\n", 265 | "- pe1 : Price to Earnings Ratio : [Metrics] An alternative to [PE] representing the ratio between [Price] and [EPSUSD]. \n", 266 | "\n", 267 | "- ppnenet : Property Plant & Equipment Net : [Balance Sheet] A component of [Assets] representing the amount after accumulated depreciation; depletion and amortization of physical assets used in the normal conduct of business to produce goods and services and not intended for resale. Includes Operating Right of Use Assets. \n", 268 | "\n", 269 | "- prefdivis : Preferred Dividends Income Statement Impact : [Income Statement] Income statement item reflecting dividend payments to preferred stockholders. Subtracted from Net Income to Parent [NetInc] to obtain Net Income to Common Stockholders [NetIncCmn]. \n", 270 | "\n", 271 | "- price : Share Price (Adjusted Close) : [Entity] The price per common share adjusted for stock splits but not adjusted for dividends; used in the computation of [PE1]; [PS1]; [DivYield] and [SPS]. \n", 272 | "\n", 273 | "- ps : Price Sales (Damodaran Method) : [Metrics] Measures the ratio between [MarketCap] and [RevenueUSD]. \n", 274 | "\n", 275 | "- ps1 : Price to Sales Ratio : [Metrics] An alternative calculation method to [PS]; that measures the ratio between a company's [Price] and it's [SPS]. \n", 276 | "\n", 277 | "- receivables : Trade and Non-Trade Receivables : [Balance Sheet] A component of [Assets] representing trade and non-trade receivables. \n", 278 | "\n", 279 | "- retearn : Accumulated Retained Earnings (Deficit) : [Balance Sheet] A component of [Equity] representing the cumulative amount of the entities undistributed earnings or deficit. May only be reported annually by certain companies; rather than quarterly. \n", 280 | "\n", 281 | "- revenue : Revenues : [Income Statement] Amount of Revenue recognized from goods sold; services rendered; insurance premiums; or other activities that constitute an earning process. Interest income for financial institutions is reported net of interest expense and provision for credit losses. 
\n", 282 | "\n", 283 | "- revenueusd : Revenues (USD) : [Income Statement] [Revenue] in USD; converted by [FXUSD]. \n", 284 | "\n", 285 | "- rnd : Research and Development Expense : [Income Statement] A component of [OpEx] representing the aggregate costs incurred in a planned search or critical investigation aimed at discovery of new knowledge with the hope that such knowledge will be useful in developing a new product or service. \n", 286 | "\n", 287 | "- sbcomp : Share Based Compensation : [Cash Flow Statement] A component of [NCFO] representing the total amount of noncash; equity-based employee remuneration. This may include the value of stock or unit options; amortization of restricted stock or units; and adjustment for officers' compensation. As noncash; this element is an add back when calculating net cash generated by operating activities using the indirect method. \n", 288 | "\n", 289 | "- sgna : Selling General and Administrative Expense : [Income Statement] A component of [OpEx] representing the aggregate total costs related to selling a firm's product and services; as well as all other general and administrative expenses. Direct selling expenses (for example; credit; warranty; and advertising) are expenses that can be directly linked to the sale of specific products. Indirect selling expenses are expenses that cannot be directly linked to the sale of specific products; for example telephone expenses; Internet; and postal charges. General and administrative expenses include salaries of non-sales personnel; rent; utilities; communication; etc. \n", 290 | "\n", 291 | "- sharefactor : Share Factor : [Entity] Share factor is a multiplicant in the calculation of [MarketCap] and is used to adjust for: American Depository Receipts (ADRs) that represent more or less than 1 underlying share; and; companies which have different earnings share for different share classes (eg Berkshire Hathaway - BRK.B). \n", 292 | "\n", 293 | "- sharesbas : Shares (Basic) : [Entity] The number of shares or other units outstanding of the entity's capital or common stock or other ownership interests; as stated on the cover of related periodic report (10-K/10-Q); after adjustment for stock splits. \n", 294 | "\n", 295 | "- shareswa : Weighted Average Shares : [Income Statement] The weighted average number of shares or units issued and outstanding that are used by the company to calculate [EPS]; determined based on the timing of issuance of shares or units in the period. \n", 296 | "\n", 297 | "- shareswadil : Weighted Average Shares Diluted : [Income Statement] The weighted average number of shares or units issued and outstanding that are used by the company to calculate [EPSDil]; determined based on the timing of issuance of shares or units in the period. \n", 298 | "\n", 299 | "- sps : Sales per Share : [Metrics] Sales per Share measures the ratio between [RevenueUSD] and [SharesWA] as adjusted by [ShareFactor]. \n", 300 | "\n", 301 | "- tangibles : Tangible Asset Value : [Metrics] The value of tangibles assets calculated as the difference between [Assets] and [Intangibles]. \n", 302 | "\n", 303 | "- taxassets : Tax Assets : [Balance Sheet] A component of [Assets] representing tax assets and receivables. \n", 304 | "\n", 305 | "- taxexp : Income Tax Expense : [Income Statement] Amount of current income tax expense (benefit) and deferred income tax expense (benefit) pertaining to continuing operations. 
\n", 306 | "\n", 307 | "- taxliabilities : Tax Liabilities : [Balance Sheet] A component of [Liabilities] representing outstanding tax liabilities. \n", 308 | "\n", 309 | "- tbvps : Tangible Assets Book Value per Share : [Metrics] Measures the ratio between [Tangibles] and [SharesWA] as adjusted by [ShareFactor]. \n", 310 | "\n", 311 | "- workingcapital : Working Capital : [Metrics] Working capital measures the difference between [AssetsC] and [LiabilitiesC]. \n", 312 | "\n", 313 | "- roe: Return on Average Equity : [Metrics] Return on equity measures a corporation's profitability by calculating the amount of [NetIncCmn] returned as a percentage of [EquityAvg]. \n", 314 | "\n", 315 | "- roa : Return on Average Assets : [Metrics] Return on assets measures how profitable a company is [NetIncCmn] relative to its total assets [AssetsAvg]." 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "### Sharadar Equity Prices [SHARADAR/SEP] features" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "- open : Open Price - Split Adjusted : The opening share price, adjusted for stock splits and stock dividends. \n", 330 | "- high : High Price - Split Adjusted : The high share price, adjusted for stock splits and stock dividends. \n", 331 | "- low : Low Price - Split Adjusted : The low share price, adjusted for stock splits and stock dividends. \n", 332 | "- close : Close Price - Split Adjusted : The open share closing, adjusted for stock splits and stock dividends. \n", 333 | "- volume : Volume - Split Adjusted : The traded volume, adjusted for stock splits and stock dividends." 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### Daily Metrics ([SHARADAR/DAILY] features" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "- ev : Enterprise Value - Daily : Enterprise value is a measure of the value of a business as a whole; calculated as [MarketCap] plus [DebtUSD] minus [CashnEqUSD]. [MarketCap] is calculated by us, and the remaining figures are sourced from the most recent SEC form 10 filings. \n", 348 | "- evebit : Enterprise Value over EBIT - Daily : Measures the ratio between [EV] and [EBITUSD]. EBITUSD is derived from the most recent SEC form 10 filings. \n", 349 | "- evebitda : Enterprise Value over EBITDA - Daily : Measures the ratio between [EV] and [EBITDAUSD]. EBITDAUSD is derived from the most recent SEC form 10 filings. \n", 350 | "- marketcap : Market Capitalization - Daily : Represents the product of [SharesBas]; [Price] and [ShareFactor]. [SharesBas] is sourced from the most recent SEC form 10 filing. \n", 351 | "- pb : Price to Book Value - Daily : Measures the ratio between [MarketCap] and [EquityUSD]. [EquityUSD] is sourced from the most recent SEC form 10 filing. \n", 352 | "- pe : Price Earnings (Damodaran Method) - Daily : Measures the ratio between [MarketCap] and [NetIncCmnUSD]. [NetIncCmnUSD] is sourced from the most recent SEC form 10 filings. \n", 353 | "- ps : Price Sales (Damodaran Method) - Daily : Measures the ratio between [MarketCap] and [RevenueUSD]. [RevenueUSD] is sourced from the most recent SEC form 10 filings. 
" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "### Sentiment Analysis and News Analytics ([IFT/NSA] features" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "- sentiment Score: a numeric measure of the bullishness / bearishness of news coverage of the stock.\n", 368 | "- sentiment_high: highest intraday sentiment scores.\n", 369 | "- sentiment_low: lowest intraday sentiment scores.\n", 370 | "- news_volume: the absolute number of news articles covering the stock.\n", 371 | "- news_buzz: a numeric measure of the change in coverage volume for the stock." 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 1, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# outsource packages\n", 381 | "import warnings\n", 382 | "warnings.filterwarnings(\"ignore\")\n", 383 | "\n", 384 | "import pyfolio as pf\n", 385 | "import alphalens as al\n", 386 | "import zipfile\n", 387 | "import os\n", 388 | "import alphalens as al\n", 389 | "import quandl\n", 390 | "import matplotlib.pyplot as plt\n", 391 | "import datetime as datetime\n", 392 | "import pandas as pd\n", 393 | "import numpy as np\n", 394 | "import seaborn as sns\n", 395 | "from time import time,sleep\n", 396 | "from sklearn import preprocessing\n", 397 | "import pytz\n", 398 | "import itertools\n", 399 | "\n", 400 | "from pandas.plotting import register_matplotlib_converters\n", 401 | "register_matplotlib_converters()\n", 402 | "\n", 403 | "import yfinance as yf\n", 404 | "yf.pdr_override()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 2, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "# local packages\n", 414 | "from modules import FactorManagement as FM\n", 415 | "from modules import RiskManagement as RM\n", 416 | "from modules import Learner as LE\n", 417 | "from modules import OptimalHoldingsRegularization as OHR\n", 418 | "from modules import Util as UT" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 3, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "secret_key = pd.read_csv('secret_key.txt',header=None)\n", 428 | "quandl.ApiConfig.api_key = secret_key[0][0]" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "# 1 - Parameters" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "Before diving into factor research and quantitative analysis, we have to define parameters that will be used in different stages of this project. The purpose is to try different parameters to optimize our output portfolio during the time. The first parameter we set here below is `update_data` that is used to decide if we want to update the data or not." 
443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 4, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "ready to update data\n" 455 | ] 456 | } 457 | ], 458 | "source": [ 459 | "update_data = True\n", 460 | "if update_data:\n", 461 | " print ('ready to update data')\n", 462 | "else:\n", 463 | " print ('data is already updated') " 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "## 1 - 1 - Time series data parameters" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "In this part, we set date parameters which are used for calling data from Quandl API. In this context, we call SEP/SHARADAR, DAILY/SHARADAR, and IFT/NSA for three years of data and SF1/SHARADAR for four years. The reason for this slicing is related to our final factor data where we look back one year. For example, some factors window length is set to one year, which means we need to load a minimum of three years data to chunk the final data frame in one year slice." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 5, 483 | "metadata": {}, 484 | "outputs": [ 485 | { 486 | "name": "stdout", 487 | "output_type": "stream", 488 | "text": [ 489 | "current date: 2020-11-14\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "tod = datetime.datetime.today().date()\n", 495 | "print ('current date: {}'.format(tod))" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 6, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "name": "stdout", 505 | "output_type": "stream", 506 | "text": [ 507 | "SF1 data starting date: 2016-11-14\n" 508 | ] 509 | } 510 | ], 511 | "source": [ 512 | "# SF1\n", 513 | "some_years = str(tod.year - 4)\n", 514 | "month = str(tod.month)\n", 515 | "day = str(tod.day)\n", 516 | "start_f = '{}-{}-{}'.format(some_years,month,day)\n", 517 | "print ('SF1 data starting date: {}'.format(start_f))" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 7, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "SEP,Daily and Sentiment data starting date: 2017-11-14\n" 530 | ] 531 | } 532 | ], 533 | "source": [ 534 | "# SEP & daily\n", 535 | "two_years = str(tod.year - 3)\n", 536 | "month = str(tod.month)\n", 537 | "day = str(tod.day)\n", 538 | "start_sep = '{}-{}-{}'.format(two_years,month,day)\n", 539 | "print ('SEP,Daily and Sentiment data starting date: {}'.format(start_sep))" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "## 1 - 2 - Factor data period parameters" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "We set our final date parameter to one year factor data. The reason is that we believe more than one-year factor analysis will decrease our efficiency in prediction and could affect our interpretation of results." 
554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 8, 559 | "metadata": {}, 560 | "outputs": [ 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "Final slicing date for 1 year: 2019-11-14\n" 566 | ] 567 | } 568 | ], 569 | "source": [ 570 | "# Slicing data for 1y\n", 571 | "years_to_slice = 1\n", 572 | "year = str(tod.year - years_to_slice)\n", 573 | "month = str(tod.month)\n", 574 | "day = str(tod.day)\n", 575 | "ayear = '{}-{}-{}'.format(year,month,day)\n", 576 | "\n", 577 | "start = ayear\n", 578 | "end = str(tod)\n", 579 | "print ('Final slicing date for {} year: {}'.format(years_to_slice,ayear))" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "## 1 - 3 - Universe parameters" 587 | ] 588 | }, 589 | { 590 | "cell_type": "markdown", 591 | "metadata": {}, 592 | "source": [ 593 | "The universe definition is an important step in this project. These parameters set here below will impact our analysis and need to be tuned as well as other parameters. metadata parameters are composed of `cap_select`, `exchange_select`, `currency_select` and `delisted_select` which are used to define our fisrt universe of stocks. For the second universe, `filteration_number` represents the number of liquid securities selected by dollar volume function and `smoothing_universe_period` represents the moving average window length in dollar volume function to select liquid securities smoothed over time. The following parameters are selected from the following values:\n", 594 | "\n", 595 | "#### Market Cap:\n", 596 | "\n", 597 | "- 1 - Nano\n", 598 | "- 2 - Micro \n", 599 | "- 3 - Small \n", 600 | "- 4 - Mid \n", 601 | "- 5 - Large \n", 602 | "- 6 - Mega\n", 603 | "\n", 604 | "#### Exhcange:\n", 605 | "\n", 606 | "- NASDAQ \n", 607 | "- NYSE\n", 608 | "- BATS\n", 609 | "- NYSEARCA\n", 610 | "- NYSEMKT\n", 611 | "- OTC\n", 612 | "\n", 613 | "#### Currency:\n", 614 | "\n", 615 | "USD, EUR, ARS, AUD, BRL, CAD, CHF, CLP,
\n", 616 | "CNY, COP, DKK, GBP, HKD, IDR, ILS, INR,
\n", 617 | "JPY, KRW, MXN, MYR, NOK, NZD, PEN,PHP,
\n", 618 | "PLN, RUB, SEK, TRY, TWD, ZAR\n", 619 | "\n", 620 | "#### Delisted:\n", 621 | "\n", 622 | "'Y' or 'N'" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 9, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "# metadata parameters\n", 632 | "cap_select = ['6 - Mega', '5 - Large', '4 - Mid']\n", 633 | "exchange_select = ['NYSE','NASDAQ','BATS']\n", 634 | "currency_select = ['USD']\n", 635 | "delisted_select = ['N']" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 10, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "# dollar volume parameters\n", 645 | "filteration_number = 800\n", 646 | "smoothing_universe_period = 120" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "## 1 - 4 - Pipeline parameters" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "Creating factor data in a different way to obtain significant results is a critical subject treated in this project. The goal is to tune up the factor parameter to get optimal results. This section lets us test different parameters for these factors and navigates over different factor data. `smoothed_value` is the window length moving average parameter used to remove the noise created by factor variation. We will discuss other parameters in the factors section. " 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 11, 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "smoothed_value = 5" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 12, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "fundamental_in = ['ncf']\n", 679 | "momentum_in = {'momentum_252d':252}\n", 680 | "sma_in = {'sma200':200}\n", 681 | "daily_in = {'marketcap':120, 'evebitda':100, 'ps':100, 'pe':100, 'pb':100}\n", 682 | "over_in = {'overnight_sentiment_60d':60}\n", 683 | "direction_in = {'direction_100d':100}\n", 684 | "sent_in = {'sentiment_10d':10,'sentiment_60d':60}\n", 685 | "vol_in = {'volatility_5d':5,'volatility_20d':20}\n", 686 | "capm_in = {'capm_60d':60,'capm_20d':20,'capm_10d':10,'capm_5d':5}\n", 687 | "channels_in = {'chan_60d':60, 'chan_100d':100}" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "## 1 - 5 - Factor analysis parameters" 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "metadata": {}, 700 | "source": [ 701 | "Here below we select periods to analyze our multi-factor output. These periods are selected according to our trading strategy and portfolio management methods. The `rebalance_period` is also an important parameter to choose carefully and consider commission fees in our portfolio management system. \n", 702 | "\n", 703 | "We use [Principal Component Analysis](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) in our risk management model to reduce the dimensionality of the risk factors. `factor exposures` parameter is the dimension that we want to reduce to it." 
704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 13, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "combined_periods = (5,10,20)\n", 713 | "rebalance_period = 10" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 14, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "# Risk analysis\n", 723 | "factor_exposures = 10" 724 | ] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": {}, 729 | "source": [ 730 | "## 1 - 6 - Optimizer parameters" 731 | ] 732 | }, 733 | { 734 | "cell_type": "markdown", 735 | "metadata": {}, 736 | "source": [ 737 | "After combining factors we put it into the optimizer function which will maximize alpha factor and consider our risk model in counterpart. `risk_cap` is used to set the risk exposure parameter. More the risk is and more the alpha will be. `lambda_reg` is used regularized our optimized function. it is operating like a portfolio enhancer." 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 15, 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [ 746 | "risk_cap = 0.07\n", 747 | "lambda_reg = 0.5\n", 748 | "factor_max = 10\n", 749 | "factor_min = -10 \n", 750 | "weights_max = 0.2\n", 751 | "weights_min = -0.1" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 16, 757 | "metadata": {}, 758 | "outputs": [], 759 | "source": [ 760 | "assert lambda_reg < 1" 761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "metadata": {}, 766 | "source": [ 767 | "## 1 - 7 - Sector parameters" 768 | ] 769 | }, 770 | { 771 | "cell_type": "markdown", 772 | "metadata": {}, 773 | "source": [ 774 | "Our factor data is preprocessed and grouped by sector. Here below we can select sectors to drop, `sec_to_drop` regrouped sectors to not appear in factor data and our analysis to avoid sectors which have poor results. We also have the choice to drop specific sectors from long universe or short universe. For example the Tehnology sector shows good results in long qunatile and poor results in short quantile, it would be interesting to add this sector to `drop_short_sec`. 
The following parameter is selected from the following values:\n", 775 | "\n", 776 | "#### Sectors:\n", 777 | "\n", 778 | "- Basic Materials\n", 779 | "- Communication Services\n", 780 | "- Consumer Cyclical\n", 781 | "- Consumer Defensive\n", 782 | "- Energy\n", 783 | "- Financial Services\n", 784 | "- Healthcare\n", 785 | "- Industrials\n", 786 | "- Real Estate\n", 787 | "- Technology\n", 788 | "- Utilities\n", 789 | "- None" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": 17, 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "sec_to_drop = ['Communication Services']" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": 18, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "drop_long_sec = []\n", 808 | "drop_short_sec = []" 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": 19, 814 | "metadata": {}, 815 | "outputs": [], 816 | "source": [ 817 | "assert set(sec_to_drop) & set(drop_long_sec) == set()\n", 818 | "assert set(sec_to_drop) & set(drop_short_sec) == set()\n", 819 | "assert set(drop_long_sec) & set(drop_short_sec) == set()" 820 | ] 821 | }, 822 | { 823 | "cell_type": "markdown", 824 | "metadata": {}, 825 | "source": [ 826 | "## 1 - 9 - Quantiles" 827 | ] 828 | }, 829 | { 830 | "cell_type": "markdown", 831 | "metadata": {}, 832 | "source": [ 833 | "Here after the quantiles parameters are defining the equal portions of datas to be analyzed and considered for the future portfolio. `quantile_portions` is the number of qunatiles we want to analyze and work with. In the other hand `quantile_to_analysis` is the quantiles selected for the final analyze and portfolio. We use to select extremety quantiles to get the best results. " 834 | ] 835 | }, 836 | { 837 | "cell_type": "code", 838 | "execution_count": 20, 839 | "metadata": {}, 840 | "outputs": [], 841 | "source": [ 842 | "qunatile_portions = 10\n", 843 | "quantile_to_analyse = [1,qunatile_portions]" 844 | ] 845 | }, 846 | { 847 | "cell_type": "markdown", 848 | "metadata": {}, 849 | "source": [ 850 | "# 2 - Universe definition" 851 | ] 852 | }, 853 | { 854 | "cell_type": "markdown", 855 | "metadata": {}, 856 | "source": [ 857 | "In this project, the universe is defined as a group of assets having high liquidy over a period. This universe will be used to compare systematic factors. Before proceeding to the selection of stocks we use `get_table` from Quandl API to load metadata of all stocks.\n", 858 | "\n", 859 | "In the next step, we use our universe parameters to select the fisrt universe of stocks. The criteria for this selection are as followed:\n", 860 | "\n", 861 | "- Exchange\n", 862 | "- Market cap\n", 863 | "- Currency\n", 864 | "- Delisted\n", 865 | "\n", 866 | "The following codes are used to define the first universe described above." 
867 | ] 868 | }, 869 | { 870 | "cell_type": "markdown", 871 | "metadata": {}, 872 | "source": [ 873 | "## 2 - 1 - Metadata" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": 21, 879 | "metadata": {}, 880 | "outputs": [], 881 | "source": [ 882 | "meta = quandl.get_table('SHARADAR/TICKERS', table='SF1',paginate=True)\n", 883 | "meta.set_index('ticker',inplace=True, drop=True)" 884 | ] 885 | }, 886 | { 887 | "cell_type": "markdown", 888 | "metadata": {}, 889 | "source": [ 890 | "## 2 - 1 - 1 - First universe" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": 22, 896 | "metadata": {}, 897 | "outputs": [], 898 | "source": [ 899 | "if exchange_select:\n", 900 | " exchange = []\n", 901 | " for i in exchange_select:\n", 902 | " exchange.append(list(meta[(meta['exchange'] == i)].index))\n", 903 | " meta_ex = meta.loc[list(itertools.chain.from_iterable(exchange))]\n", 904 | "else:\n", 905 | " meta_ex = meta\n", 906 | "\n", 907 | "if currency_select:\n", 908 | " currency = []\n", 909 | " for i in currency_select:\n", 910 | " currency.append(list(meta_ex[(meta_ex['currency'] == i)].index))\n", 911 | " meta_ex_cu = meta_ex.loc[list(itertools.chain.from_iterable(currency))]\n", 912 | "else:\n", 913 | " meta_ex_cu = meta_ex\n", 914 | "\n", 915 | "if delisted_select:\n", 916 | " delisted = []\n", 917 | " for i in delisted_select:\n", 918 | " delisted.append(list(meta_ex_cu[(meta_ex_cu['isdelisted'] == i)].index))\n", 919 | " meta_ex_cu_de = meta_ex_cu.loc[list(itertools.chain.from_iterable(delisted))]\n", 920 | "else:\n", 921 | " meta_ex_cu_de = meta_ex_cu\n", 922 | " \n", 923 | "if cap_select: \n", 924 | " cap = []\n", 925 | " for i in cap_select:\n", 926 | " cap.append(list(meta_ex_cu_de[(meta_ex_cu_de['scalemarketcap'] == i)].index))\n", 927 | " meta_ex_cu_de_cap = meta_ex_cu_de.loc[list(itertools.chain.from_iterable(cap))]\n", 928 | "else:\n", 929 | " meta_ex_cu_de_cap = meta_ex_cu_de" 930 | ] 931 | }, 932 | { 933 | "cell_type": "code", 934 | "execution_count": 23, 935 | "metadata": {}, 936 | "outputs": [], 937 | "source": [ 938 | "u1 = list(meta_ex_cu_de_cap.index)" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 24, 944 | "metadata": {}, 945 | "outputs": [ 946 | { 947 | "name": "stdout", 948 | "output_type": "stream", 949 | "text": [ 950 | "2040 assets selected in first selection\n" 951 | ] 952 | } 953 | ], 954 | "source": [ 955 | "print ('{} assets selected in first selection'.format(len(u1)))" 956 | ] 957 | }, 958 | { 959 | "cell_type": "markdown", 960 | "metadata": {}, 961 | "source": [ 962 | "## 2 - 2 - OHLCV data" 963 | ] 964 | }, 965 | { 966 | "cell_type": "markdown", 967 | "metadata": {}, 968 | "source": [ 969 | "Once the `first_universe` is defined, we use it to get SHARADAR/SEP data and store this table as a zip file on the local drive. This table gives us the ohlcv data started at `start_sep` defined in the parameters section and ended at the current date `tod`. The zip file is extracted, sorted into a multi-index data frame, and finally cleaned using our local function `cleaning_dataframe` imported from `utils_s.py`." 
970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": null, 975 | "metadata": {}, 976 | "outputs": [], 977 | "source": [ 978 | "if update_data:\n", 979 | " quandl.export_table('SHARADAR/SEP',\n", 980 | " ticker = u1, \n", 981 | " date = {'gte': start_sep, 'lte': str(end)}, \n", 982 | " filename = 'data/ohlcv.zip')" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": null, 988 | "metadata": {}, 989 | "outputs": [], 990 | "source": [ 991 | "with zipfile.ZipFile('data/ohlcv.zip', 'r') as zip_ref:\n", 992 | " zip_ref.extractall()\n", 993 | "\n", 994 | "for item in os.listdir(os.getcwd()): # loop through items in dir\n", 995 | " if item.endswith('.csv') and item.split('_')[0] == 'SHARADAR' and item.split('_')[1] == 'SEP':\n", 996 | " \n", 997 | " ohlcv = pd.read_csv(item)\n", 998 | " ohlcv['date'] = pd.to_datetime(ohlcv['date'])\n", 999 | " ohlcv = ohlcv.set_index(['date', 'ticker']).sort_index(level=[0,1], ascending=[True, False])\n", 1000 | " ohlcv.drop(['lastupdated','dividends','closeunadj'],axis=1,inplace=True)\n", 1001 | "\n", 1002 | " os.remove(item)" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": null, 1008 | "metadata": {}, 1009 | "outputs": [], 1010 | "source": [ 1011 | "ohlcv = UT.cleaning_dataframe(df = ohlcv,\n", 1012 | " pernan_to_drop = 0.2)" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "markdown", 1017 | "metadata": {}, 1018 | "source": [ 1019 | "## 2 - 2 - 1 - Second universe" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "markdown", 1024 | "metadata": {}, 1025 | "source": [ 1026 | "In this section, the `dollar_volume_universe` attends to select liquid stocks with significant market cap. In this context, the close frame is multiplied to volume to obtain the market cap data frame. Then we sort assets having the highest market cap. the number of assets selected is set in the parameters section through `filteration_number` and smoothed over time with `smoothing_universe_period`." 
1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": null, 1032 | "metadata": {}, 1033 | "outputs": [], 1034 | "source": [ 1035 | "def dollar_volume_universe(tickers_num, ohlcv, sma_period):\n", 1036 | " \n", 1037 | " ohlcv['dollar_volume'] = ohlcv['close']*ohlcv['volume']\n", 1038 | " dollar_vol = ohlcv['dollar_volume'].unstack('ticker')\n", 1039 | " \n", 1040 | " sma = FM().sma(dollar_vol,sma_period)\n", 1041 | " \n", 1042 | " last = sma.iloc[-1,:]\n", 1043 | " dol = pd.DataFrame(data = last.values,index = last.index, columns = ['dv'])\n", 1044 | " dol.dropna(inplace = True)\n", 1045 | " \n", 1046 | " return list(dol.sort_values(by='dv', ascending=False).iloc[:tickers_num].index) " 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "code", 1051 | "execution_count": null, 1052 | "metadata": {}, 1053 | "outputs": [], 1054 | "source": [ 1055 | "universe = dollar_volume_universe(tickers_num = filteration_number, ohlcv = ohlcv, sma_period = smoothing_universe_period)" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": null, 1061 | "metadata": {}, 1062 | "outputs": [], 1063 | "source": [ 1064 | "print ('{} assets selected out of {} for the second selection'.format(len(universe),len(u1)))" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "code", 1069 | "execution_count": null, 1070 | "metadata": {}, 1071 | "outputs": [], 1072 | "source": [ 1073 | "assert len(universe) == filteration_number" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "markdown", 1078 | "metadata": {}, 1079 | "source": [ 1080 | "## 2 - 2 - 2 - Third universe" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "metadata": {}, 1086 | "source": [ 1087 | "A sector selection model is a systematic tool that tilts a portfolio towards sectors that are predicted to outperform and underweights those that are predicted to underperform. If the process can effectively discern winning/losing industry groups, it can enhance the value added from a stock-selection methodology. Here below we use `sec_to_drop` which is composed of sectors to not include in the final assets (third universe). the output of this function will be the final selection." 
1088 | ] 1089 | }, 1090 | { 1091 | "cell_type": "code", 1092 | "execution_count": null, 1093 | "metadata": {}, 1094 | "outputs": [], 1095 | "source": [ 1096 | "universe_sectors = pd.DataFrame(index=universe, columns=['sectors'])\n", 1097 | "for i in universe:\n", 1098 | " try:\n", 1099 | " universe_sectors.loc[i] = meta_ex_cu_de_cap.loc[i]['sector']\n", 1100 | " except:\n", 1101 | " universe_sectors.loc[i] = np.nan\n", 1102 | " try:\n", 1103 | " for sec in sec_to_drop:\n", 1104 | " if meta_ex_cu_de_cap.loc[i]['sector'] == sec:\n", 1105 | " universe_sectors.drop(i, axis=0,inplace=True)\n", 1106 | " except:\n", 1107 | " pass" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "code", 1112 | "execution_count": null, 1113 | "metadata": {}, 1114 | "outputs": [], 1115 | "source": [ 1116 | "universe = list(universe_sectors.index)" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": { 1123 | "scrolled": false 1124 | }, 1125 | "outputs": [], 1126 | "source": [ 1127 | "print ('{} assets selected after sector cleaning out of {} for the third selection'.format(len(universe),len(u1)))" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "markdown", 1132 | "metadata": {}, 1133 | "source": [ 1134 | "## 2 - 3 - Benchmark" 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "markdown", 1139 | "metadata": {}, 1140 | "source": [ 1141 | "Before starting to define the universe, we have to prepare the benchmark. For this project the S&P500 will be the reference index and we will use this data for our factor analysis." 1142 | ] 1143 | }, 1144 | { 1145 | "cell_type": "code", 1146 | "execution_count": null, 1147 | "metadata": { 1148 | "inputHidden": false, 1149 | "outputHidden": false 1150 | }, 1151 | "outputs": [], 1152 | "source": [ 1153 | "# input\n", 1154 | "market = '^GSPC'\n", 1155 | "# read data \n", 1156 | "dfm = yf.download(market,ohlcv.index.levels[0][0],ohlcv.index.levels[0][-1] + pd.Timedelta(days=1))" 1157 | ] 1158 | }, 1159 | { 1160 | "cell_type": "code", 1161 | "execution_count": null, 1162 | "metadata": { 1163 | "inputHidden": false, 1164 | "outputHidden": false 1165 | }, 1166 | "outputs": [], 1167 | "source": [ 1168 | "dfm = dfm.rename(columns={'Open': 'open', \n", 1169 | " 'High': 'high', \n", 1170 | " 'Low':'low', \n", 1171 | " 'Close':'close',\n", 1172 | " 'Volume':'volume'}).drop('Adj Close', axis=1)\n", 1173 | "dfm.index.name = 'date'\n", 1174 | "# return\n", 1175 | "benchmark = dfm['close'].pct_change().loc[slice(start,end)]\n", 1176 | "benchmark.index = benchmark.index.tz_localize('UTC')" 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "execution_count": null, 1182 | "metadata": {}, 1183 | "outputs": [], 1184 | "source": [ 1185 | "benchmark.head()" 1186 | ] 1187 | }, 1188 | { 1189 | "cell_type": "markdown", 1190 | "metadata": {}, 1191 | "source": [ 1192 | "## 2 - 4 - Fundamental data" 1193 | ] 1194 | }, 1195 | { 1196 | "cell_type": "markdown", 1197 | "metadata": {}, 1198 | "source": [ 1199 | "After defining the final `universe`, we get fundamental SHARADAR/SF1 data and store this table as a zip file on the local drive. This table gives us the fundamental data of Most Recent Quarter (MRQ) Reported started at `start_f` defined in the parameters section and ended at the current date `tod`. The zip file is extracted, sorted into a multi-index data frame, and finally cleaned using our local function `cleaning_dataframe` imported from `utils_s.py`." 
1200 | ] 1201 | }, 1202 | { 1203 | "cell_type": "code", 1204 | "execution_count": null, 1205 | "metadata": {}, 1206 | "outputs": [], 1207 | "source": [ 1208 | "if update_data:\n", 1209 | " #qopts={\"columns\":ind}\n", 1210 | " quandl.export_table('SHARADAR/SF1',\n", 1211 | " ticker=universe, \n", 1212 | " dimension = 'MRQ', \n", 1213 | " calendardate={'gte':start_f,'lte':str(tod)}, \n", 1214 | " filename='data/fundamental.zip')" 1215 | ] 1216 | }, 1217 | { 1218 | "cell_type": "code", 1219 | "execution_count": null, 1220 | "metadata": {}, 1221 | "outputs": [], 1222 | "source": [ 1223 | "with zipfile.ZipFile('data/fundamental.zip', 'r') as zip_ref:\n", 1224 | " zip_ref.extractall()\n", 1225 | "\n", 1226 | "for item in os.listdir(os.getcwd()): # loop through items in dir\n", 1227 | " \n", 1228 | " if item.endswith('.csv') and item.split('_')[0] == 'SHARADAR' and item.split('_')[1] == 'SF1':\n", 1229 | " sf1 = pd.read_csv(item)\n", 1230 | " sf1 = sf1.set_index(['calendardate', 'ticker']).sort_index(level=[0,1], ascending=[True, False])\n", 1231 | " sf1.drop(['datekey','reportperiod','lastupdated','dimension'],axis=1,inplace=True)\n", 1232 | " os.remove(item) " 1233 | ] 1234 | }, 1235 | { 1236 | "cell_type": "code", 1237 | "execution_count": null, 1238 | "metadata": {}, 1239 | "outputs": [], 1240 | "source": [ 1241 | "sf1 = UT.cleaning_dataframe(sf1,0.2)" 1242 | ] 1243 | }, 1244 | { 1245 | "cell_type": "code", 1246 | "execution_count": null, 1247 | "metadata": {}, 1248 | "outputs": [], 1249 | "source": [ 1250 | "sf1['roe'] = sf1['netinc'] / sf1['equity']\n", 1251 | "sf1['roa'] = sf1['netinc'] / sf1['assets']" 1252 | ] 1253 | }, 1254 | { 1255 | "cell_type": "markdown", 1256 | "metadata": {}, 1257 | "source": [ 1258 | "## 2 - 5 - Daily metrics data" 1259 | ] 1260 | }, 1261 | { 1262 | "cell_type": "markdown", 1263 | "metadata": {}, 1264 | "source": [ 1265 | "We continue to download data, here get daily metrics SHARADAR/DAILY data and store this table as a zip file on the local drive. This table gives us the data that with discussed in previously started at `start_sep` defined in the parameters section and ended at the current date `tod`. The zip file is extracted, sorted into a multi-index data frame, and finally cleaned using our local function `cleaning_dataframe` imported from `utils_s.py`." 
1266 | ] 1267 | }, 1268 | { 1269 | "cell_type": "code", 1270 | "execution_count": null, 1271 | "metadata": {}, 1272 | "outputs": [], 1273 | "source": [ 1274 | "if update_data:\n", 1275 | " quandl.export_table('SHARADAR/DAILY',\n", 1276 | " ticker=universe, \n", 1277 | " date={'gte': start_sep, 'lte': str(tod)}, \n", 1278 | " filename='data/daily.zip')" 1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "code", 1283 | "execution_count": null, 1284 | "metadata": {}, 1285 | "outputs": [], 1286 | "source": [ 1287 | "with zipfile.ZipFile('data/daily.zip', 'r') as zip_ref:\n", 1288 | " zip_ref.extractall()\n", 1289 | "\n", 1290 | "for item in os.listdir(os.getcwd()): # loop through items in dir\n", 1291 | " \n", 1292 | " if item.endswith('.csv') and item.split('_')[0] == 'SHARADAR' and item.split('_')[1] == 'DAILY':\n", 1293 | " daily = pd.read_csv(item)\n", 1294 | " daily['date'] = pd.to_datetime(daily['date'])\n", 1295 | " daily = daily.set_index(['date', 'ticker']).sort_index(level=[0,1], ascending=[True, False])\n", 1296 | " daily.drop(['lastupdated'],axis=1,inplace=True)\n", 1297 | " os.remove(item) " 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "code", 1302 | "execution_count": null, 1303 | "metadata": {}, 1304 | "outputs": [], 1305 | "source": [ 1306 | "daily = UT.cleaning_dataframe(daily,0.2)" 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "markdown", 1311 | "metadata": {}, 1312 | "source": [ 1313 | "## 2 - 6 - Sentiment data" 1314 | ] 1315 | }, 1316 | { 1317 | "cell_type": "markdown", 1318 | "metadata": {}, 1319 | "source": [ 1320 | "The last data we get is sentiment data IFT/NSA data and store this table as a zip file on the local drive. This table gives us the data that discussed previously started at `start_sep` defined in the parameters section and ended at the current date `tod`. The zip file is extracted, sorted into a multi-index data frame, and finally cleaned using our local function `cleaning_dataframe` imported from `utils_s.py`." 
1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "code", 1325 | "execution_count": null, 1326 | "metadata": {}, 1327 | "outputs": [], 1328 | "source": [ 1329 | "if update_data:\n", 1330 | " quandl.export_table('IFT/NSA',\n", 1331 | " ticker=universe, \n", 1332 | " date={'gte': start_sep, 'lte': str(end)}, \n", 1333 | " filename='data/sent_test.zip')" 1334 | ] 1335 | }, 1336 | { 1337 | "cell_type": "code", 1338 | "execution_count": null, 1339 | "metadata": {}, 1340 | "outputs": [], 1341 | "source": [ 1342 | "with zipfile.ZipFile('data/sent.zip', 'r') as zip_ref:\n", 1343 | " zip_ref.extractall()\n", 1344 | "\n", 1345 | "for item in os.listdir(os.getcwd()): # loop through items in dir\n", 1346 | " \n", 1347 | " if item.endswith('.csv') and item.split('_')[0] == 'IFT' and item.split('_')[1] == 'NSA':\n", 1348 | " sent = pd.read_csv(item)\n", 1349 | " sent['date'] = pd.to_datetime(sent['date'])\n", 1350 | " sent = sent.set_index(['date', 'ticker']).sort_index(level=[0,1], ascending=[True, False])\n", 1351 | " sent = sent[sent['exchange_cd']=='US']\n", 1352 | " sent.drop(['name','exchange_cd'],axis=1,inplace=True)\n", 1353 | " os.remove(item) " 1354 | ] 1355 | }, 1356 | { 1357 | "cell_type": "code", 1358 | "execution_count": null, 1359 | "metadata": {}, 1360 | "outputs": [], 1361 | "source": [ 1362 | "sent = UT.cleaning_dataframe(sent,0.2)" 1363 | ] 1364 | }, 1365 | { 1366 | "cell_type": "markdown", 1367 | "metadata": {}, 1368 | "source": [ 1369 | "# 3 - Universe intersection" 1370 | ] 1371 | }, 1372 | { 1373 | "cell_type": "markdown", 1374 | "metadata": {}, 1375 | "source": [ 1376 | "All the data is collected and stacked separately into a multi-index data frame. However, some of the assets defined previously in the universe section are dropped or not collected during the download and cleaning process. Therefore, the goal in this section is to get the intersection of assets represented for each data frame (`sent`,`sf1`,`daily` and `ohlcv`)." 
1377 | ] 1378 | }, 1379 | { 1380 | "cell_type": "code", 1381 | "execution_count": null, 1382 | "metadata": {}, 1383 | "outputs": [], 1384 | "source": [ 1385 | "new_universe =list(set(sent['sentiment'].unstack('ticker').columns) & set(sf1.index.levels[1]) & set(daily.index.levels[1]) & set(ohlcv.index.levels[1]))" 1386 | ] 1387 | }, 1388 | { 1389 | "cell_type": "code", 1390 | "execution_count": null, 1391 | "metadata": {}, 1392 | "outputs": [], 1393 | "source": [ 1394 | "print ('{} assets dropped after intersectioning'.format((len(universe) - len(new_universe))))\n", 1395 | "print ('{} assets are loaded'.format(len(new_universe)))" 1396 | ] 1397 | }, 1398 | { 1399 | "cell_type": "markdown", 1400 | "metadata": {}, 1401 | "source": [ 1402 | "## 3 - 1- Sectors" 1403 | ] 1404 | }, 1405 | { 1406 | "cell_type": "markdown", 1407 | "metadata": {}, 1408 | "source": [ 1409 | "The sectors represented in the cleaned data are as followed:" 1410 | ] 1411 | }, 1412 | { 1413 | "cell_type": "code", 1414 | "execution_count": null, 1415 | "metadata": {}, 1416 | "outputs": [], 1417 | "source": [ 1418 | "meta_cleaned = meta_ex_cu_de_cap.loc[new_universe,:]\n", 1419 | "print ('Sectors in cleaned data: \\n')\n", 1420 | "sectors = {}\n", 1421 | "for i in set(meta_cleaned['sector']):\n", 1422 | " print (' ',i)\n", 1423 | " sectors[i] = list(meta_cleaned[meta_cleaned['sector'] == i].index)" 1424 | ] 1425 | }, 1426 | { 1427 | "cell_type": "markdown", 1428 | "metadata": {}, 1429 | "source": [ 1430 | "# 4 - Alpha factors" 1431 | ] 1432 | }, 1433 | { 1434 | "cell_type": "markdown", 1435 | "metadata": {}, 1436 | "source": [ 1437 | "Factor-driven alpha investment strategies, designed to delivering market-beating returns, come in a number of different forms. In this porject, the following alpha factors are discussed:\n", 1438 | "\n", 1439 | "- Daily Metrics\n", 1440 | "- Simple Moving Average\n", 1441 | "- Overnight Sentiment\n", 1442 | "- Mean Reversion\n", 1443 | "- Direction\n", 1444 | "- Sentiment Analysis\n", 1445 | "- Volatility\n", 1446 | "- Capm\n", 1447 | "- Fundamentals" 1448 | ] 1449 | }, 1450 | { 1451 | "cell_type": "markdown", 1452 | "metadata": {}, 1453 | "source": [ 1454 | "## 4 - 1 - Factorizing" 1455 | ] 1456 | }, 1457 | { 1458 | "cell_type": "markdown", 1459 | "metadata": {}, 1460 | "source": [ 1461 | "These factors are generated as followed:\n", 1462 | "\n", 1463 | "1 - Formulation
\n", 1464 | "2 - Scaling
\n", 1465 | "3 - Smoothing
\n", 1466 | "4 - Slicing
\n", 1467 | "5 - Neutralizing by Sector
\n", 1468 | "6 - Scaling
\n", 1469 | "\n", 1470 | "Each of these factors of stored into a dictionary to be fetch into a final multiindex dataframe `all_factors` as the ouput of our pipleline. Once this is done, the multiindex data frame will be used as the input of other functions for analysis and optimization etc." 1471 | ] 1472 | }, 1473 | { 1474 | "cell_type": "code", 1475 | "execution_count": null, 1476 | "metadata": {}, 1477 | "outputs": [], 1478 | "source": [ 1479 | "close = ohlcv['close'].unstack('ticker')[new_universe]\n", 1480 | "openn = ohlcv['open'].unstack('ticker')[new_universe]\n", 1481 | "high = ohlcv['high'].unstack('ticker')[new_universe]\n", 1482 | "low = ohlcv['low'].unstack('ticker')[new_universe]" 1483 | ] 1484 | }, 1485 | { 1486 | "cell_type": "code", 1487 | "execution_count": null, 1488 | "metadata": {}, 1489 | "outputs": [], 1490 | "source": [ 1491 | "pricing = openn.loc[slice(start,end),:].tz_localize('UTC')[new_universe]" 1492 | ] 1493 | }, 1494 | { 1495 | "cell_type": "markdown", 1496 | "metadata": {}, 1497 | "source": [ 1498 | "### 4 - 1 - 1 - Daily Metrics" 1499 | ] 1500 | }, 1501 | { 1502 | "cell_type": "code", 1503 | "execution_count": null, 1504 | "metadata": {}, 1505 | "outputs": [], 1506 | "source": [ 1507 | "daily_data = {}\n", 1508 | "\n", 1509 | "for i in daily_in.keys():\n", 1510 | " \n", 1511 | " df_daily = daily[i].unstack('ticker')\n", 1512 | " # formulation and scaling\n", 1513 | " reversion = FM().momentum(df_daily,daily_in[i])*-1\n", 1514 | " # smoothing\n", 1515 | " smoothed_reversion = FM().smooth(reversion,smoothed_value)\n", 1516 | " # slicing\n", 1517 | " smoothed_reversion.index = pd.to_datetime(smoothed_reversion.index)\n", 1518 | " smoothed_reversion = smoothed_reversion.loc[slice(start,end),:]\n", 1519 | " # neutralizing and scaling\n", 1520 | " smoothed_reversion_neutralized_scaled = FM().sector_neutral(sectors, smoothed_reversion)\n", 1521 | "\n", 1522 | " \n", 1523 | " daily_data[i] = smoothed_reversion_neutralized_scaled[new_universe]\n" 1524 | ] 1525 | }, 1526 | { 1527 | "cell_type": "markdown", 1528 | "metadata": {}, 1529 | "source": [ 1530 | "### 4 - 1 - 2 - Simple Moving Average" 1531 | ] 1532 | }, 1533 | { 1534 | "cell_type": "code", 1535 | "execution_count": null, 1536 | "metadata": {}, 1537 | "outputs": [], 1538 | "source": [ 1539 | "sma_data = {}\n", 1540 | "\n", 1541 | "for name,period in sma_in.items():\n", 1542 | " # formulation and scaling\n", 1543 | " sma_min = FM().sma(close, period)\n", 1544 | " # smoothing\n", 1545 | " smoothed_sma_min = FM().smooth(sma_min,smoothed_value)\n", 1546 | " # slicing\n", 1547 | " smoothed_sma_min.index = pd.to_datetime(smoothed_sma_min.index)\n", 1548 | " smoothed_sma_min = smoothed_sma_min.loc[slice(start,end),:]\n", 1549 | " # neutralizing and scaling\n", 1550 | " smoothed_sma_min_neutralized_scaled = FM().sector_neutral(sectors, smoothed_sma_min)\n", 1551 | " sma_data[name] = smoothed_sma_min_neutralized_scaled[new_universe]\n", 1552 | " " 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "markdown", 1557 | "metadata": {}, 1558 | "source": [ 1559 | "### 4 - 1 - 3 - Overnight Sentiment" 1560 | ] 1561 | }, 1562 | { 1563 | "cell_type": "code", 1564 | "execution_count": null, 1565 | "metadata": {}, 1566 | "outputs": [], 1567 | "source": [ 1568 | "over_data = {}\n", 1569 | "\n", 1570 | "for name,period in over_in.items():\n", 1571 | " # formulation and scaling\n", 1572 | " overnight_sentiment = FM().overnight_sentiment(close, openn, 2, trailing_window=period)\n", 1573 | " # smoothing\n", 1574 | " 
smoothed_overnight_sentiment = FM().smooth(overnight_sentiment,smoothed_value)\n", 1575 | " # slicing \n", 1576 | " smoothed_overnight_sentiment.index = pd.to_datetime(smoothed_overnight_sentiment.index)\n", 1577 | " smoothed_overnight_sentiment = smoothed_overnight_sentiment.loc[slice(start,end),:]\n", 1578 | " # neutralizing and scaling\n", 1579 | " smoothed_overnight_sentiment_neutralized_scaled = FM().sector_neutral(sectors, smoothed_overnight_sentiment)\n", 1580 | " over_data[name] = (smoothed_overnight_sentiment_neutralized_scaled*-1)[new_universe]" 1581 | ] 1582 | }, 1583 | { 1584 | "cell_type": "markdown", 1585 | "metadata": {}, 1586 | "source": [ 1587 | "### 4 - 1 - 4 - Mean Reversion" 1588 | ] 1589 | }, 1590 | { 1591 | "cell_type": "code", 1592 | "execution_count": null, 1593 | "metadata": {}, 1594 | "outputs": [], 1595 | "source": [ 1596 | "momentum_data = {}\n", 1597 | "\n", 1598 | "for name,period in momentum_in.items():\n", 1599 | " # formulation and scaling\n", 1600 | " mean_reversion = FM().momentum(close,period)*-1\n", 1601 | " # smoothing\n", 1602 | " smoothed_mean_reversion = FM().smooth(mean_reversion,smoothed_value)\n", 1603 | " # slicing\n", 1604 | " smoothed_mean_reversion.index = pd.to_datetime(smoothed_mean_reversion.index)\n", 1605 | " smoothed_mean_reversion = smoothed_mean_reversion.loc[slice(start,end),:]\n", 1606 | " # neutralizing and scaling\n", 1607 | " smoothed_mean_reversion_neutralized_scaled = FM().sector_neutral(sectors, smoothed_mean_reversion)\n", 1608 | " momentum_data[name] = smoothed_mean_reversion_neutralized_scaled[new_universe]" 1609 | ] 1610 | }, 1611 | { 1612 | "cell_type": "markdown", 1613 | "metadata": {}, 1614 | "source": [ 1615 | "### 4 - 1 - 5 - Direction" 1616 | ] 1617 | }, 1618 | { 1619 | "cell_type": "code", 1620 | "execution_count": null, 1621 | "metadata": {}, 1622 | "outputs": [], 1623 | "source": [ 1624 | "direction_data = {}\n", 1625 | "\n", 1626 | "for name,period in direction_in.items():\n", 1627 | " \n", 1628 | " # formulation and scaling\n", 1629 | " direct = FM().direction(close, openn, 1, period)\n", 1630 | " # smoothing\n", 1631 | " smoothed_direct = FM().smooth(direct,smoothed_value)\n", 1632 | " # slicing\n", 1633 | " smoothed_direct.index = pd.to_datetime(smoothed_direct.index)\n", 1634 | " smoothed_direct = smoothed_direct.loc[slice(start,end),:]\n", 1635 | " # neutralizing and scaling\n", 1636 | " smoothed_direct_neutralized_scaled = FM().sector_neutral(sectors, smoothed_direct)\n", 1637 | " direction_data[name] = smoothed_direct_neutralized_scaled[new_universe]" 1638 | ] 1639 | }, 1640 | { 1641 | "cell_type": "markdown", 1642 | "metadata": {}, 1643 | "source": [ 1644 | "### 4 - 1 - 6 - Sentiment Analysis" 1645 | ] 1646 | }, 1647 | { 1648 | "cell_type": "code", 1649 | "execution_count": null, 1650 | "metadata": {}, 1651 | "outputs": [], 1652 | "source": [ 1653 | "# sentiment\n", 1654 | "sent_data = {}\n", 1655 | "\n", 1656 | "for name,period in sent_in.items():\n", 1657 | " \n", 1658 | " # formulation and scaling\n", 1659 | " sentiment = FM().sentiment(close, high, low, sent, period, new_universe)\n", 1660 | " # smoothing\n", 1661 | " smoothed_sentiment= FM().smooth(sentiment,smoothed_value)\n", 1662 | " # slicing\n", 1663 | " smoothed_sentiment.index = pd.to_datetime(smoothed_sentiment.index)\n", 1664 | " smoothed_sentiment = smoothed_sentiment.loc[slice(start,end),:]\n", 1665 | " # neutralizing and scaling\n", 1666 | " smoothed_sentiment_neutralized_scaled = FM().sector_neutral(sectors, smoothed_sentiment)\n", 1667 
| " sent_data[name] = smoothed_sentiment_neutralized_scaled[new_universe]" 1668 | ] 1669 | }, 1670 | { 1671 | "cell_type": "markdown", 1672 | "metadata": {}, 1673 | "source": [ 1674 | "### 4 - 1 - 7 - Volatility" 1675 | ] 1676 | }, 1677 | { 1678 | "cell_type": "code", 1679 | "execution_count": null, 1680 | "metadata": {}, 1681 | "outputs": [], 1682 | "source": [ 1683 | "volatility_data = {}\n", 1684 | "\n", 1685 | "for name,period in vol_in.items():\n", 1686 | " \n", 1687 | " # formulation and scaling\n", 1688 | " vol = FM().volatility(close, 5, period)\n", 1689 | " # smoothing\n", 1690 | " smoothed_vol = FM().smooth(vol,smoothed_value)\n", 1691 | " # slicing\n", 1692 | " smoothed_vol.index = pd.to_datetime(smoothed_vol.index)\n", 1693 | " smoothed_vol = smoothed_vol.loc[slice(start,end),:]\n", 1694 | " # neutralizing and scaling\n", 1695 | " smoothed_vol_neutralized_scaled = FM().sector_neutral(sectors, smoothed_vol)\n", 1696 | " volatility_data[name] = smoothed_vol_neutralized_scaled[new_universe]" 1697 | ] 1698 | }, 1699 | { 1700 | "cell_type": "markdown", 1701 | "metadata": {}, 1702 | "source": [ 1703 | "### 4 - 1 - 8 - Capm" 1704 | ] 1705 | }, 1706 | { 1707 | "cell_type": "code", 1708 | "execution_count": null, 1709 | "metadata": {}, 1710 | "outputs": [], 1711 | "source": [ 1712 | "capm_data = {}\n", 1713 | "\n", 1714 | "for name,period in capm_in.items():\n", 1715 | " \n", 1716 | " # formulation and scaling\n", 1717 | " cap = FM().capm(close, dfm[['close']], 1, period)\n", 1718 | " # smoothing\n", 1719 | " smoothed_cap = FM().smooth(cap,smoothed_value)\n", 1720 | " # slicing\n", 1721 | " smoothed_cap.index = pd.to_datetime(smoothed_cap.index)\n", 1722 | " smoothed_cap = smoothed_cap.loc[slice(start,end),:]\n", 1723 | " # neutralizing and scaling\n", 1724 | " #smoothed_cap_neutralized_scaled = FM().sector_neutral(sectors, smoothed_cap)\n", 1725 | " capm_data[name] = smoothed_cap[new_universe]\n" 1726 | ] 1727 | }, 1728 | { 1729 | "cell_type": "markdown", 1730 | "metadata": {}, 1731 | "source": [ 1732 | "### 4 - 1 - 9 - Channels " 1733 | ] 1734 | }, 1735 | { 1736 | "cell_type": "code", 1737 | "execution_count": null, 1738 | "metadata": {}, 1739 | "outputs": [], 1740 | "source": [ 1741 | "chan_data = {}\n", 1742 | "\n", 1743 | "for name,period in channels_in.items():\n", 1744 | " \n", 1745 | " # formulation and scaling\n", 1746 | " chan = FM().channels(close, period)\n", 1747 | " # smoothing\n", 1748 | " smoothed_chan = FM().smooth(chan,20)\n", 1749 | " # slicing\n", 1750 | " smoothed_chan.index = pd.to_datetime(smoothed_chan.index)\n", 1751 | " smoothed_chan = smoothed_chan.loc[slice(start,end),:]\n", 1752 | " # neutralizing and scaling\n", 1753 | " smoothed_chan_neutralized_scaled = FM().sector_neutral(sectors, smoothed_chan)\n", 1754 | " chan_data[name] = smoothed_chan_neutralized_scaled[new_universe]" 1755 | ] 1756 | }, 1757 | { 1758 | "cell_type": "markdown", 1759 | "metadata": {}, 1760 | "source": [ 1761 | "### 4 - 1 - 10 - Fundamentals" 1762 | ] 1763 | }, 1764 | { 1765 | "cell_type": "code", 1766 | "execution_count": null, 1767 | "metadata": {}, 1768 | "outputs": [], 1769 | "source": [ 1770 | "fund_data = {}\n", 1771 | "\n", 1772 | "for i in fundamental_in:\n", 1773 | " \n", 1774 | " df = sf1[i].unstack('ticker')[new_universe]\n", 1775 | " df.fillna(df.mean(axis=0),inplace=True)\n", 1776 | " \n", 1777 | " # formulation\n", 1778 | " returns = FM().returns(df,1)\n", 1779 | " returns.replace([np.inf, -np.inf], np.nan, inplace=True) \n", 1780 | "\n", 1781 | " # neutralizing and 
scaling\n", 1782 | " returns_neutralize_scaled = FM().sector_neutral(sectors, returns)\n", 1783 | " \n", 1784 | " chunk = (ohlcv.index.levels[0][-1]+datetime.timedelta(days=1)).date()\n", 1785 | " chunk_minus = ohlcv.index.levels[0][-1].date()\n", 1786 | " # resampling\n", 1787 | " if datetime.datetime.strptime(sf1.index.levels[0][-1], '%Y-%m-%d').timestamp() > ohlcv.index.levels[0][-1].timestamp():\n", 1788 | " pass\n", 1789 | " else:\n", 1790 | " returns_neutralize_scaled.loc[chunk,:] = np.nan\n", 1791 | " \n", 1792 | " returns_neutralize_scaled.index = pd.to_datetime(returns_neutralize_scaled.index)\n", 1793 | " returns_neutralize_resampled = returns_neutralize_scaled.resample('D').pad()\n", 1794 | " returns_neutralize_resampled = returns_neutralize_resampled.loc[ohlcv.index.levels[0][0]:chunk_minus,:]\n", 1795 | " \n", 1796 | " # drop holidays\n", 1797 | " holidays = returns_neutralize_resampled.index ^ ohlcv.index.levels[0]\n", 1798 | " returns_neutralize_resampled.drop(list(holidays),axis=0,inplace = True)\n", 1799 | " returns_neutralize_resampled.index.name = ohlcv.index.levels[0].name\n", 1800 | " \n", 1801 | " # slicing\n", 1802 | " returns_neutralize_resampled_1y = returns_neutralize_resampled.loc[slice(start,end),:]\n", 1803 | "\n", 1804 | " fund_data[i] = returns_neutralize_resampled_1y[new_universe]" 1805 | ] 1806 | }, 1807 | { 1808 | "cell_type": "markdown", 1809 | "metadata": {}, 1810 | "source": [ 1811 | "## 4 - 4 - Multiindex factors" 1812 | ] 1813 | }, 1814 | { 1815 | "cell_type": "markdown", 1816 | "metadata": {}, 1817 | "source": [ 1818 | "As mentioned before, factor data are unpacked and stacked into a multi-index data frame to finally be regrouped in a final multi-index data frame `all_factors`. The index is composed of two levels respectively `date` and `ticker`. The column is composed of factors created in previous sections relative to the date and ticker." 
1819 | ] 1820 | }, 1821 | { 1822 | "cell_type": "code", 1823 | "execution_count": null, 1824 | "metadata": {}, 1825 | "outputs": [], 1826 | "source": [ 1827 | "result = []\n", 1828 | "\n", 1829 | "for i in fundamental_in:\n", 1830 | " \n", 1831 | " fund_return_neutralized_stacked = fund_data[i].stack().to_frame(i)\n", 1832 | " result.append(fund_return_neutralized_stacked)\n", 1833 | "\n", 1834 | "for i in daily_in.keys():\n", 1835 | "\n", 1836 | " daily_stacked = daily_data[i].stack().to_frame('daily_{}{}days'.format(i,daily_in[i]))\n", 1837 | " result.append(daily_stacked)\n", 1838 | "\n", 1839 | "for i in sma_data.keys():\n", 1840 | "\n", 1841 | " sma_stacked = sma_data[i].stack().to_frame('sma{}'.format(sma_in[i]))\n", 1842 | " result.append(sma_stacked)\n", 1843 | "\n", 1844 | "for i in momentum_data.keys():\n", 1845 | " \n", 1846 | " momentum_stacked = momentum_data[i].stack().to_frame('momentum{}days'.format(momentum_in[i]))\n", 1847 | " result.append(momentum_stacked)\n", 1848 | " \n", 1849 | "for i in over_data.keys():\n", 1850 | " \n", 1851 | " over_stacked = over_data[i].stack().to_frame('overnight_sent{}days'.format(over_in[i]))\n", 1852 | " result.append(over_stacked) \n", 1853 | "\n", 1854 | "for i in direction_data.keys():\n", 1855 | " \n", 1856 | " direct_stacked = direction_data[i].stack().to_frame('direction{}days'.format(direction_in[i]))\n", 1857 | " result.append(direct_stacked)\n", 1858 | " \n", 1859 | "for i in volatility_data.keys():\n", 1860 | " \n", 1861 | " vol_stacked = volatility_data[i].stack().to_frame('volatility{}days'.format(vol_in[i]))\n", 1862 | " result.append(vol_stacked)\n", 1863 | " \n", 1864 | "for i in capm_data.keys():\n", 1865 | " \n", 1866 | " capm_stacked = capm_data[i].stack().to_frame('capm{}days'.format(capm_in[i]))\n", 1867 | " result.append(capm_stacked)\n", 1868 | " \n", 1869 | "for i in chan_data.keys():\n", 1870 | " \n", 1871 | " chan_stacked = chan_data[i].stack().to_frame('channels{}days'.format(channels_in[i]))\n", 1872 | " result.append(chan_stacked)\n", 1873 | " \n", 1874 | "for i in sent_data.keys():\n", 1875 | " \n", 1876 | " sent_stacked = sent_data[i].stack().to_frame('sentiment{}days'.format(sent_in[i]))\n", 1877 | " result.append(sent_stacked) \n", 1878 | " \n", 1879 | "all_factors = pd.concat(result,axis=1)\n", 1880 | "\n", 1881 | "all_factors.index.set_names(['date', 'asset'], inplace=True)\n", 1882 | "\n", 1883 | "all_factors.index = all_factors.index\\\n", 1884 | " .set_levels([all_factors.index.levels[0].tz_localize('UTC'), all_factors.index.levels[1]])\n", 1885 | "\n", 1886 | "all_factors" 1887 | ] 1888 | }, 1889 | { 1890 | "cell_type": "code", 1891 | "execution_count": null, 1892 | "metadata": {}, 1893 | "outputs": [], 1894 | "source": [ 1895 | "all_factors['ncf_reversed'] = all_factors['ncf'] * -1\n", 1896 | "#all_factors['fcfps_reversed'] = all_factors['fcfps'] * -1" 1897 | ] 1898 | }, 1899 | { 1900 | "cell_type": "code", 1901 | "execution_count": null, 1902 | "metadata": {}, 1903 | "outputs": [], 1904 | "source": [ 1905 | "# nan values per columns\n", 1906 | "all_factors.isna().sum()" 1907 | ] 1908 | }, 1909 | { 1910 | "cell_type": "code", 1911 | "execution_count": null, 1912 | "metadata": {}, 1913 | "outputs": [], 1914 | "source": [ 1915 | "all_factors.describe()" 1916 | ] 1917 | }, 1918 | { 1919 | "cell_type": "markdown", 1920 | "metadata": {}, 1921 | "source": [ 1922 | "# 5 - All factors analysis" 1923 | ] 1924 | }, 1925 | { 1926 | "cell_type": "markdown", 1927 | "metadata": {}, 1928 | "source": [ 1929 | "Now we 
have processed and regrouped the factor data, we are ready to analyze the factors one by one to see whether they have the potential to be combined or not. In this context, [alphalens](https://quantopian.github.io/alphalens/index.html) is used for the analysis. This package regroups APIs useful for data processing and factor analysis over the pre-defined period `rebalance_period`. These metrics are as follows:\n", 1930 | "\n", 1931 | "\n", 1932 | "- Cleaning and preparing data `alphalens.utils.get_clean_factor_and_forward_returns`: Formats the factor data, pricing data, and group mappings into a DataFrame that contains aligned MultiIndex indices of timestamp and asset. The returned data will be formatted to be suitable for Alphalens functions. \n", 1933 | "\n", 1934 | "- Cumulated factor return `alphalens.performance.factor_returns`: Computes period-wise returns for a portfolio weighted by factor values. Compounding these returns, as done below, simulates the cumulative effect that a series of gains or losses (the ‘returns’) have on an original amount of capital over a period of time.\n", 1935 | "\n", 1936 | "- Mean quantile return `alphalens.performance.mean_return_by_quantile`: Computes mean returns for factor quantiles across provided forward returns columns.\n", 1937 | "\n", 1938 | "- Factor Rank Autocorrelation `alphalens.performance.factor_rank_autocorrelation`: Computes the autocorrelation of mean factor ranks in specified time spans. We must compare period-to-period factor ranks rather than factor values to account for systematic shifts in the factor values of all names or names within a group. This metric is useful for measuring the turnover of a factor. If the value of a factor for each name changes randomly from period to period, we’d expect an autocorrelation of 0.\n", 1939 | "\n", 1940 | "- Sharpe ratio `sharpe_ratio`: This function computes the annualized Sharpe ratio. This metric is used to understand the return of an investment compared to its risk. The ratio is the average excess return earned per unit of volatility or total risk. Volatility is a measure of the factor return fluctuations of an asset."
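,
    "\n",
    "For reference, with daily factor returns $r_t$, the annualized Sharpe ratio computed in section 5 - 4 reduces to\n",
    "\n",
    "$$\\mathrm{Sharpe}_{\\mathrm{ann}} = \\sqrt{252}\\,\\frac{\\bar{r}}{\\sigma_r}$$\n",
    "\n",
    "where $\\bar{r}$ and $\\sigma_r$ are the mean and standard deviation of the daily factor returns."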
1941 | ] 1942 | }, 1943 | { 1944 | "cell_type": "code", 1945 | "execution_count": null, 1946 | "metadata": {}, 1947 | "outputs": [], 1948 | "source": [ 1949 | "print ('Rebalance period set to {} days for all factors analysis'.format(rebalance_period))" 1950 | ] 1951 | }, 1952 | { 1953 | "cell_type": "code", 1954 | "execution_count": null, 1955 | "metadata": {}, 1956 | "outputs": [], 1957 | "source": [ 1958 | "clean_factor_data = {\n", 1959 | " factor: al.utils.get_clean_factor_and_forward_returns(factor=factor_data, \n", 1960 | " prices=pricing, \n", 1961 | " periods=[rebalance_period], \n", 1962 | " quantiles=5,\n", 1963 | " filter_zscore=20,\n", 1964 | " max_loss=0.35)\n", 1965 | " for factor, factor_data in all_factors[['capm20days',\n", 1966 | " 'sma200',\n", 1967 | " 'daily_ps100days',\n", 1968 | " #'daily_pb100days',\n", 1969 | " 'direction100days',\n", 1970 | " 'momentum252days',\n", 1971 | " 'sentiment10days',\n", 1972 | " 'overnight_sent60days',\n", 1973 | " 'volatility20days',\n", 1974 | " 'daily_marketcap120days',\n", 1975 | " 'daily_evebitda100days',\n", 1976 | " #'daily_pe100days',\n", 1977 | " 'channels100days',\n", 1978 | " ]].items()}\n", 1979 | "\n", 1980 | "unixt_factor_data = {\n", 1981 | " factor: factor_data.set_index(pd.MultiIndex.from_tuples(\n", 1982 | " [(x.timestamp(), y) for x, y in factor_data.index.values],\n", 1983 | " names=['date', 'asset']))\n", 1984 | " for factor, factor_data in clean_factor_data.items()}" 1985 | ] 1986 | }, 1987 | { 1988 | "cell_type": "markdown", 1989 | "metadata": {}, 1990 | "source": [ 1991 | "### 5 - 1 - Cumulative factor return" 1992 | ] 1993 | }, 1994 | { 1995 | "cell_type": "code", 1996 | "execution_count": null, 1997 | "metadata": { 1998 | "scrolled": true 1999 | }, 2000 | "outputs": [], 2001 | "source": [ 2002 | "plt.style.use('ggplot')\n", 2003 | "plt.rcParams['figure.figsize'] = (25, 15)\n", 2004 | "\n", 2005 | "ls_factor_returns = pd.DataFrame()\n", 2006 | "\n", 2007 | "for factor, factor_data in clean_factor_data.items():\n", 2008 | " ls_factor_returns[factor] = al.performance.factor_returns(factor_data).iloc[:, 0]\n", 2009 | "\n", 2010 | "plt.plot((1+ls_factor_returns).cumprod(), lw=3);" 2011 | ] 2012 | }, 2013 | { 2014 | "cell_type": "markdown", 2015 | "metadata": {}, 2016 | "source": [ 2017 | "### 5 - 2 - Quantile analysis" 2018 | ] 2019 | }, 2020 | { 2021 | "cell_type": "code", 2022 | "execution_count": null, 2023 | "metadata": {}, 2024 | "outputs": [], 2025 | "source": [ 2026 | "qr_factor_returns = pd.DataFrame()\n", 2027 | "\n", 2028 | "for factor, factor_data in unixt_factor_data.items():\n", 2029 | " qr_factor_returns[factor] = al.performance.mean_return_by_quantile(factor_data)[0].iloc[:, 0]\n", 2030 | "\n", 2031 | "(10000*qr_factor_returns).plot.bar(\n", 2032 | " subplots=True,\n", 2033 | " sharey=True,\n", 2034 | " layout=(20,2),\n", 2035 | " figsize=(14, 50),\n", 2036 | " legend=False, fontsize=2);" 2037 | ] 2038 | }, 2039 | { 2040 | "cell_type": "markdown", 2041 | "metadata": {}, 2042 | "source": [ 2043 | "### 5 - 3 - Factor Rank Autocorrelation" 2044 | ] 2045 | }, 2046 | { 2047 | "cell_type": "code", 2048 | "execution_count": null, 2049 | "metadata": {}, 2050 | "outputs": [], 2051 | "source": [ 2052 | "ls_FRA = pd.DataFrame()\n", 2053 | "\n", 2054 | "for factor, factor_data in unixt_factor_data.items():\n", 2055 | " ls_FRA[factor] = al.performance.factor_rank_autocorrelation(factor_data,period=rebalance_period)\n", 2056 | "\n", 2057 | "plt.plot(ls_FRA,lw=2)\n", 2058 | "plt.title(\"Factor Rank
Autocorrelation\");" 2059 | ] 2060 | }, 2061 | { 2062 | "cell_type": "markdown", 2063 | "metadata": {}, 2064 | "source": [ 2065 | "### 5 - 4 - Sharpe ratio" 2066 | ] 2067 | }, 2068 | { 2069 | "cell_type": "code", 2070 | "execution_count": null, 2071 | "metadata": {}, 2072 | "outputs": [], 2073 | "source": [ 2074 | "def sharpe_ratio(factor_returns, annualization_factor):\n", 2075 | "\n", 2076 | " df_sharpe = pd.Series(annualization_factor*factor_returns.mean()/factor_returns.std())\n", 2077 | " \n", 2078 | " return df_sharpe" 2079 | ] 2080 | }, 2081 | { 2082 | "cell_type": "code", 2083 | "execution_count": null, 2084 | "metadata": {}, 2085 | "outputs": [], 2086 | "source": [ 2087 | "daily_annualization_factor = np.sqrt(252)\n", 2088 | "df_sharpe = sharpe_ratio(ls_factor_returns, daily_annualization_factor).round(2)" 2089 | ] 2090 | }, 2091 | { 2092 | "cell_type": "code", 2093 | "execution_count": null, 2094 | "metadata": {}, 2095 | "outputs": [], 2096 | "source": [ 2097 | "df_sharpe.sort_values(ascending=False)" 2098 | ] 2099 | }, 2100 | { 2101 | "cell_type": "markdown", 2102 | "metadata": {}, 2103 | "source": [ 2104 | "# 6 - Combined factors" 2105 | ] 2106 | }, 2107 | { 2108 | "cell_type": "markdown", 2109 | "metadata": {}, 2110 | "source": [ 2111 | "## 6 - 1 - Combining selceted factors" 2112 | ] 2113 | }, 2114 | { 2115 | "cell_type": "code", 2116 | "execution_count": null, 2117 | "metadata": { 2118 | "scrolled": true 2119 | }, 2120 | "outputs": [], 2121 | "source": [ 2122 | "\n", 2123 | "selected_factors = [\n", 2124 | " 'sma200',\n", 2125 | " #'daily_ps100days',\n", 2126 | " #'daily_pb100days',\n", 2127 | " 'direction100days',\n", 2128 | " 'momentum252days',\n", 2129 | " #'sentiment10days',\n", 2130 | " #'overnight_sent60days',\n", 2131 | " #'volatility20days',\n", 2132 | " 'daily_marketcap120days',\n", 2133 | " #'daily_evebitda100days',\n", 2134 | " #'daily_pe100days',\n", 2135 | " #'capm20days'\n", 2136 | " 'channels100days'\n", 2137 | " ]\n", 2138 | "\n", 2139 | "print('Selected Factors:\\n{} '.format(',\\n'.join(selected_factors)))\n", 2140 | "\n", 2141 | "all_factors = all_factors[selected_factors]" 2142 | ] 2143 | }, 2144 | { 2145 | "cell_type": "code", 2146 | "execution_count": null, 2147 | "metadata": {}, 2148 | "outputs": [], 2149 | "source": [ 2150 | "all_factors.to_csv('data/all_factors_test.csv')" 2151 | ] 2152 | }, 2153 | { 2154 | "cell_type": "code", 2155 | "execution_count": null, 2156 | "metadata": {}, 2157 | "outputs": [], 2158 | "source": [ 2159 | "feature_importances = LE().feature_importance_xgb(n_fwd_days = rebalance_period, \n", 2160 | " close = close, \n", 2161 | " all_factors = all_factors,\n", 2162 | " lower_percentile = 40,\n", 2163 | " upper_percentile = 60,\n", 2164 | " n_estimators = 150, \n", 2165 | " train_size = 0.7)" 2166 | ] 2167 | }, 2168 | { 2169 | "cell_type": "code", 2170 | "execution_count": null, 2171 | "metadata": { 2172 | "scrolled": true 2173 | }, 2174 | "outputs": [], 2175 | "source": [ 2176 | "feature_importances.sort_values(by='weights')" 2177 | ] 2178 | }, 2179 | { 2180 | "cell_type": "code", 2181 | "execution_count": null, 2182 | "metadata": {}, 2183 | "outputs": [], 2184 | "source": [ 2185 | "all_factors_copy = all_factors.copy() " 2186 | ] 2187 | }, 2188 | { 2189 | "cell_type": "code", 2190 | "execution_count": null, 2191 | "metadata": {}, 2192 | "outputs": [], 2193 | "source": [ 2194 | "for factor in selected_factors:\n", 2195 | " all_factors_copy.loc[:,factor] = feature_importances.loc[factor][0] * all_factors.loc[:,factor]\n", 2196 | 
"all_factors_copy.loc[:,'alpha_vector'] = all_factors.sum(axis=1)\n", 2197 | "all_factors = all_factors_copy " 2198 | ] 2199 | }, 2200 | { 2201 | "cell_type": "code", 2202 | "execution_count": null, 2203 | "metadata": {}, 2204 | "outputs": [], 2205 | "source": [ 2206 | "vec = pd.DataFrame(data=all_factors['alpha_vector'],columns = ['alpha_vector','sector'])\n", 2207 | "for date in vec.index.levels[0]:\n", 2208 | " vec.loc[date,['sector']] = meta_ex_cu_de_cap.loc[vec.index.levels[1]]['sector'].values\n", 2209 | "sectors = vec['sector']" 2210 | ] 2211 | }, 2212 | { 2213 | "cell_type": "code", 2214 | "execution_count": null, 2215 | "metadata": {}, 2216 | "outputs": [], 2217 | "source": [ 2218 | "vec.shape" 2219 | ] 2220 | }, 2221 | { 2222 | "cell_type": "markdown", 2223 | "metadata": {}, 2224 | "source": [ 2225 | "\n", 2226 | "\n", 2227 | "## 6 - 2 - Creating clean factor data" 2228 | ] 2229 | }, 2230 | { 2231 | "cell_type": "code", 2232 | "execution_count": null, 2233 | "metadata": {}, 2234 | "outputs": [], 2235 | "source": [ 2236 | "factor_data = al.utils.get_clean_factor_and_forward_returns(factor = all_factors['alpha_vector'], \n", 2237 | " prices = pricing, \n", 2238 | " periods = combined_periods,\n", 2239 | " quantiles = qunatile_portions,\n", 2240 | " groupby=sectors,\n", 2241 | " binning_by_group=False,\n", 2242 | " filter_zscore=20,\n", 2243 | " max_loss=0.15)" 2244 | ] 2245 | }, 2246 | { 2247 | "cell_type": "markdown", 2248 | "metadata": {}, 2249 | "source": [ 2250 | "## 6 - 3 - Creating tear sheets " 2251 | ] 2252 | }, 2253 | { 2254 | "cell_type": "code", 2255 | "execution_count": null, 2256 | "metadata": {}, 2257 | "outputs": [], 2258 | "source": [ 2259 | "pf_returns, pf_positions, pf_benchmark = \\\n", 2260 | "al.performance.create_pyfolio_input(factor_data,\n", 2261 | " period='{}D'.format(rebalance_period),\n", 2262 | " capital=1,\n", 2263 | " long_short=True,\n", 2264 | " group_neutral=False,\n", 2265 | " equal_weight=False,\n", 2266 | " #quantiles=[1,2,4,5],\n", 2267 | " groups=sectors)" 2268 | ] 2269 | }, 2270 | { 2271 | "cell_type": "code", 2272 | "execution_count": null, 2273 | "metadata": {}, 2274 | "outputs": [], 2275 | "source": [ 2276 | "pf_benchmark = benchmark.loc[slice(pf_returns.index[0],pf_returns.index[-1])]" 2277 | ] 2278 | }, 2279 | { 2280 | "cell_type": "code", 2281 | "execution_count": null, 2282 | "metadata": {}, 2283 | "outputs": [], 2284 | "source": [ 2285 | "sec_mappings = sectors.loc[pd.IndexSlice[pf_returns.index[0]]]" 2286 | ] 2287 | }, 2288 | { 2289 | "cell_type": "code", 2290 | "execution_count": null, 2291 | "metadata": {}, 2292 | "outputs": [], 2293 | "source": [ 2294 | "sector_mappings = {}\n", 2295 | "for i in sec_mappings.index:\n", 2296 | " sector_mappings[i] = sec_mappings.loc[i]" 2297 | ] 2298 | }, 2299 | { 2300 | "cell_type": "code", 2301 | "execution_count": null, 2302 | "metadata": { 2303 | "scrolled": true 2304 | }, 2305 | "outputs": [], 2306 | "source": [ 2307 | "# pf.create_full_tear_sheet(returns = pf_returns, \n", 2308 | "# positions = pf_positions,\n", 2309 | "# sector_mappings = sector_mappings,\n", 2310 | "# benchmark_rets = pf_benchmark )" 2311 | ] 2312 | }, 2313 | { 2314 | "cell_type": "code", 2315 | "execution_count": null, 2316 | "metadata": {}, 2317 | "outputs": [], 2318 | "source": [ 2319 | "al.tears.create_full_tear_sheet(factor_data, by_group=True, long_short=True, group_neutral=False);" 2320 | ] 2321 | }, 2322 | { 2323 | "cell_type": "code", 2324 | "execution_count": null, 2325 | "metadata": {}, 2326 | "outputs": [], 2327 
| "source": [ 2328 | "qr_factor_returns = pd.DataFrame()\n", 2329 | "\n", 2330 | "qr_factor_returns = al.performance.mean_return_by_quantile(factor_data)[0]\n", 2331 | "\n", 2332 | "(10000*qr_factor_returns).plot.bar(\n", 2333 | " subplots=True,\n", 2334 | " sharey=True,\n", 2335 | " layout=(4,2),\n", 2336 | " figsize=(14, 14),\n", 2337 | " legend=False);" 2338 | ] 2339 | }, 2340 | { 2341 | "cell_type": "code", 2342 | "execution_count": null, 2343 | "metadata": {}, 2344 | "outputs": [], 2345 | "source": [ 2346 | "ls_factor_returns = al.performance.factor_returns(factor_data)" 2347 | ] 2348 | }, 2349 | { 2350 | "cell_type": "code", 2351 | "execution_count": null, 2352 | "metadata": {}, 2353 | "outputs": [], 2354 | "source": [ 2355 | "daily_annualization_factor = np.sqrt(252)\n", 2356 | "sharpe_ratio(ls_factor_returns, daily_annualization_factor).round(2)" 2357 | ] 2358 | }, 2359 | { 2360 | "cell_type": "markdown", 2361 | "metadata": {}, 2362 | "source": [ 2363 | "# 7 - Risk analysis " 2364 | ] 2365 | }, 2366 | { 2367 | "cell_type": "code", 2368 | "execution_count": null, 2369 | "metadata": {}, 2370 | "outputs": [], 2371 | "source": [ 2372 | "dff = pd.DataFrame()\n", 2373 | "dff['factor'] = all_factors['alpha_vector']\n", 2374 | "df_all_weights = al.performance.factor_weights(dff, demeaned=True, group_adjust=False, equal_weight=False)\n", 2375 | "all_weights = df_all_weights.loc[pd.IndexSlice[all_factors.index.levels[0][-1]]]\n", 2376 | "all_weights = pd.DataFrame(data = all_weights.values, \n", 2377 | " columns = ['optimal_weights'],\n", 2378 | " index = all_weights.index)\n", 2379 | "all_weights.index.name = 'asset'" 2380 | ] 2381 | }, 2382 | { 2383 | "cell_type": "code", 2384 | "execution_count": null, 2385 | "metadata": {}, 2386 | "outputs": [], 2387 | "source": [ 2388 | "assets = all_weights.index" 2389 | ] 2390 | }, 2391 | { 2392 | "cell_type": "code", 2393 | "execution_count": null, 2394 | "metadata": {}, 2395 | "outputs": [], 2396 | "source": [ 2397 | "predicted_portfolio_risk,Risk_Model = RM().portfolio_risk(close[assets],num_factor_exposures=13,weights=all_weights)" 2398 | ] 2399 | }, 2400 | { 2401 | "cell_type": "code", 2402 | "execution_count": null, 2403 | "metadata": {}, 2404 | "outputs": [], 2405 | "source": [ 2406 | "print ('Predicted Risk: {} %'.format(np.round((predicted_portfolio_risk*100),2)))" 2407 | ] 2408 | }, 2409 | { 2410 | "cell_type": "markdown", 2411 | "metadata": {}, 2412 | "source": [ 2413 | "# 8 - Integrating factor data to optimzer" 2414 | ] 2415 | }, 2416 | { 2417 | "cell_type": "markdown", 2418 | "metadata": {}, 2419 | "source": [ 2420 | "Once alpha model and a risk model are generated, we want to find a portfolio that trades as close as possible to the alpha model but limiting risk as measured by the [risk_model](https://github.com/keyvantaj/Quantitative/blob/master/risk_model.py). 
The [cvxpy](https://www.cvxpy.org/) package is used to implement the [optimizer](https://github.com/keyvantaj/Quantitative/blob/master/optimizer.py).\n", 2421 | "\n", 2422 | "The CVXPY objective is to maximize $\alpha^T x$, where $x$ is the vector of portfolio weights and $\alpha$ is the alpha vector.\n", 2423 | "\n", 2424 | "The optimization is subject to the following constraints:\n", 2425 | "\n", 2426 | "- $ r \leq risk_{\text{cap}}^2 $\n", 2427 | "- $ B^T x \preceq factor_{\text{max}} $\n", 2428 | "- $ B^T x \succeq factor_{\text{min}} $\n", 2429 | "- $ x^T\mathbb{1} = 0 $\n", 2430 | "- $ \|x\|_1 \leq 1 $\n", 2431 | "- $ x \succeq weights_{\text{min}} $\n", 2432 | "- $ x \preceq weights_{\text{max}} $\n", 2433 | "\n", 2434 | "Here $x$ is the vector of portfolio weights, $B$ is the matrix of factor betas, and $r$ is the portfolio risk calculated in the [risk model](https://github.com/keyvantaj/Quantitative/blob/master/risk_model.py) module.\n", 2435 | "\n", 2436 | "The first constraint is that the predicted risk be less than some maximum limit. The second and third constraints are the maximum and minimum portfolio factor exposures. The fourth is the \"market neutral\" constraint: the sum of the weights must be zero. The fifth is the leverage constraint: the sum of the absolute values of the weights must be less than or equal to 1.0. The last two set minimum and maximum limits on the individual holdings." 2437 | ] 2438 | }, 2439 | { 2440 | "cell_type": "code", 2441 | "execution_count": null, 2442 | "metadata": {}, 2443 | "outputs": [], 2444 | "source": [ 2445 | "optimal = pd.DataFrame(index = all_factors.index.levels[0], columns = all_factors.index.levels[1])\n", 2446 | "for date in all_factors.index.levels[0]:\n", 2447 | " \n", 2448 | " x = all_factors[['alpha_vector']].loc[date,:]\n", 2449 | " optimal.loc[date] = OHR(lambda_reg = lambda_reg,\n", 2450 | " factor_max = factor_max, factor_min = factor_min, \n", 2451 | " weights_max = weights_max, weights_min = weights_min,\n", 2452 | " risk_cap = risk_cap).find(\n", 2453 | " x, \n", 2454 | " Risk_Model['factor_betas'], \n", 2455 | " Risk_Model['factor_cov_matrix'], \n", 2456 | " Risk_Model['idiosyncratic_var_vector']).values.flatten()\n", 2457 | " \n", 2458 | "optimal = optimal.astype(float)\n", 2459 | "optimal_stacked = optimal.stack().to_frame('optimal_weights')" 2460 | ] 2461 | }, 2462 | { 2463 | "cell_type": "code", 2464 | "execution_count": null, 2465 | "metadata": {}, 2466 | "outputs": [], 2467 | "source": [ 2468 | "assert optimal_stacked.shape[0] == vec.shape[0]" 2469 | ] 2470 | }, 2471 | { 2472 | "cell_type": "markdown", 2473 | "metadata": {}, 2474 | "source": [ 2475 | "## 8 - 1 - Quantile data" 2476 | ] 2477 | }, 2478 | { 2479 | "cell_type": "markdown", 2480 | "metadata": {}, 2481 | "source": [ 2482 | "Before starting the factor analysis of the optimized alpha vector, we define quantiles for the data of each date. For this purpose, we use the `qunatile_portions` parameter defined previously to cut the data into specific portions. We then iterate over `optimal_stacked`, apply the [pandas qcut](https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.qcut.html) function for each date, and finally stack the results into a pandas DataFrame, as sketched below."
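, "\n", "A rough sketch of what this per-date cut looks like (the project's `UT.quantilize` helper additionally attaches sector labels; the exact form of `qunatile_portions` is assumed here to be a valid `q` argument for `pd.qcut`):\n", "\n", "```python\n", "import pandas as pd\n", "\n", "parts = []\n", "# group the stacked weights by date and label each asset with its quantile\n", "for date, group in optimal_stacked.groupby(level='date'):\n", "    q = pd.qcut(group['optimal_weights'], q=qunatile_portions,\n", "                labels=False, duplicates='drop') + 1\n", "    parts.append(q.to_frame('quantile'))\n", "quantile_sketch = pd.concat(parts)\n", "```"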
2483 | ] 2484 | }, 2485 | { 2486 | "cell_type": "code", 2487 | "execution_count": null, 2488 | "metadata": {}, 2489 | "outputs": [], 2490 | "source": [ 2491 | "quantile_optimal_stacked = UT.quantilize(qunatile_portions, \n", 2492 | " optimal_stacked,\n", 2493 | " weights_col='optimal_weights',\n", 2494 | " q_col='quantile',\n", 2495 | " sec_col='sector',\n", 2496 | " sec_df=vec)" 2497 | ] 2498 | }, 2499 | { 2500 | "cell_type": "code", 2501 | "execution_count": null, 2502 | "metadata": {}, 2503 | "outputs": [], 2504 | "source": [ 2505 | "print ('quantiles:', list(set(quantile_optimal_stacked['quantile'].values)))" 2506 | ] 2507 | }, 2508 | { 2509 | "cell_type": "markdown", 2510 | "metadata": {}, 2511 | "source": [ 2512 | "## 8 - 2 - Quantiles indexing" 2513 | ] 2514 | }, 2515 | { 2516 | "cell_type": "markdown", 2517 | "metadata": {}, 2518 | "source": [ 2519 | "The goal here is to select the extreme quantiles (the edges) using `quantile_to_analyse`, previously defined in the parameters section." 2520 | ] 2521 | }, 2522 | { 2523 | "cell_type": "code", 2524 | "execution_count": null, 2525 | "metadata": {}, 2526 | "outputs": [], 2527 | "source": [ 2528 | "q_final_vector = UT.q_indexing(quantile_to_analyse, quantile_optimal_stacked)" 2529 | ] 2530 | }, 2531 | { 2532 | "cell_type": "code", 2533 | "execution_count": null, 2534 | "metadata": {}, 2535 | "outputs": [], 2536 | "source": [ 2537 | "percent_q_dropped = ((quantile_optimal_stacked.shape[0] - q_final_vector.shape[0])/quantile_optimal_stacked.shape[0])*100\n", 2538 | "print ('{} % dropped after quantile select'.format(np.round(percent_q_dropped,2)))" 2539 | ] 2540 | }, 2541 | { 2542 | "cell_type": "code", 2543 | "execution_count": null, 2544 | "metadata": {}, 2545 | "outputs": [], 2546 | "source": [ 2547 | "print ('selected quantiles:', list(set(q_final_vector['quantile'].values)))" 2548 | ] 2549 | }, 2550 | { 2551 | "cell_type": "markdown", 2552 | "metadata": {}, 2553 | "source": [ 2554 | "## 8 - 3 - Sector selection" 2555 | ] 2556 | }, 2557 | { 2558 | "cell_type": "markdown", 2559 | "metadata": {}, 2560 | "source": [] 2561 | }, 2562 | { 2563 | "cell_type": "code", 2564 | "execution_count": null, 2565 | "metadata": {}, 2566 | "outputs": [], 2567 | "source": [ 2568 | "final_vector,sectors = UT.select_sector(q_final_vector, drop_long_sec, drop_short_sec,\n", 2569 | " sec_col='sector',factor_col = 'quantile')" 2570 | ] 2571 | }, 2572 | { 2573 | "cell_type": "code", 2574 | "execution_count": null, 2575 | "metadata": {}, 2576 | "outputs": [], 2577 | "source": [ 2578 | "print ('{} rows dropped after sector select'.format(q_final_vector.shape[0] - final_vector.shape[0]))" 2579 | ] 2580 | }, 2581 | { 2582 | "cell_type": "markdown", 2583 | "metadata": {}, 2584 | "source": [ 2585 | "# 9 - Optimized alpha vector analysis " 2586 | ] 2587 | }, 2588 | { 2589 | "cell_type": "markdown", 2590 | "metadata": {}, 2591 | "source": [ 2592 | "## 9 - 1 - Creating clean factor data" 2593 | ] 2594 | }, 2595 | { 2596 | "cell_type": "code", 2597 | "execution_count": null, 2598 | "metadata": {}, 2599 | "outputs": [], 2600 | "source": [ 2601 | "factor_data = al.utils.get_clean_factor_and_forward_returns(factor = final_vector['optimal_weights'], \n", 2602 | " prices = pricing, \n", 2603 | " periods = combined_periods,\n", 2604 | " quantiles = len(quantile_to_analyse),\n", 2605 | " groupby=sectors,\n", 2606 | " binning_by_group=False,\n", 2607 | " filter_zscore=20,\n", 2608 | " max_loss=0.15)" 2609 | ] 2610 | }, 2611 | { 2612 | "cell_type": "markdown", 2613 | "metadata": {}, 2614 | "source": [ 2615 |
"## 9 - 2 - Sector selection for factor data" 2616 | ] 2617 | }, 2618 | { 2619 | "cell_type": "code", 2620 | "execution_count": null, 2621 | "metadata": {}, 2622 | "outputs": [], 2623 | "source": [ 2624 | "factor_data, sectors = UT.select_sector(factor_data, drop_long_sec, drop_short_sec,\n", 2625 | " sec_col='group',factor_col = 'factor_quantile')" 2626 | ] 2627 | }, 2628 | { 2629 | "cell_type": "code", 2630 | "execution_count": null, 2631 | "metadata": {}, 2632 | "outputs": [], 2633 | "source": [ 2634 | "print ('{} rows dropped from factor data'.format(factor_data.shape[0] - factor_data.shape[0]))" 2635 | ] 2636 | }, 2637 | { 2638 | "cell_type": "markdown", 2639 | "metadata": {}, 2640 | "source": [ 2641 | "## 9 - 3 - Creating tear sheets " 2642 | ] 2643 | }, 2644 | { 2645 | "cell_type": "code", 2646 | "execution_count": null, 2647 | "metadata": {}, 2648 | "outputs": [], 2649 | "source": [ 2650 | "pf_returns, pf_positions, pf_benchmark = \\\n", 2651 | "al.performance.create_pyfolio_input(factor_data,\n", 2652 | " period = '{}D'.format(rebalance_period),\n", 2653 | " capital = 1,\n", 2654 | " long_short = True,\n", 2655 | " group_neutral = False,\n", 2656 | " equal_weight = False,\n", 2657 | " groups = sectors)\n", 2658 | "\n", 2659 | "pf_benchmark = benchmark.loc[slice(pf_returns.index[0],pf_returns.index[-1])]\n", 2660 | "sec_mappings = sectors.loc[pd.IndexSlice[pf_returns.index[0]]]\n", 2661 | "\n", 2662 | "sector_mappings = {}\n", 2663 | "for i in sec_mappings.index:\n", 2664 | " sector_mappings[i] = sec_mappings.loc[i]" 2665 | ] 2666 | }, 2667 | { 2668 | "cell_type": "code", 2669 | "execution_count": null, 2670 | "metadata": { 2671 | "scrolled": false 2672 | }, 2673 | "outputs": [], 2674 | "source": [ 2675 | "# pf.create_full_tear_sheet(returns = pf_returns, \n", 2676 | "# positions = pf_positions,\n", 2677 | "# sector_mappings = sector_mappings,\n", 2678 | "# benchmark_rets = pf_benchmark,\n", 2679 | "# factor_returns = factor_data[['factor']] )" 2680 | ] 2681 | }, 2682 | { 2683 | "cell_type": "code", 2684 | "execution_count": null, 2685 | "metadata": { 2686 | "scrolled": false 2687 | }, 2688 | "outputs": [], 2689 | "source": [ 2690 | "al.tears.create_full_tear_sheet(factor_data, by_group=True, long_short=True, group_neutral=False);" 2691 | ] 2692 | }, 2693 | { 2694 | "cell_type": "code", 2695 | "execution_count": null, 2696 | "metadata": {}, 2697 | "outputs": [], 2698 | "source": [ 2699 | "ls_factor_returns = al.performance.factor_returns(factor_data)" 2700 | ] 2701 | }, 2702 | { 2703 | "cell_type": "code", 2704 | "execution_count": null, 2705 | "metadata": {}, 2706 | "outputs": [], 2707 | "source": [ 2708 | "daily_annualization_factor = np.sqrt(252)\n", 2709 | "sharpe_ratio(ls_factor_returns, daily_annualization_factor).round(2)" 2710 | ] 2711 | }, 2712 | { 2713 | "cell_type": "markdown", 2714 | "metadata": {}, 2715 | "source": [ 2716 | "# 10 - Predicted Portfolio" 2717 | ] 2718 | }, 2719 | { 2720 | "cell_type": "code", 2721 | "execution_count": null, 2722 | "metadata": {}, 2723 | "outputs": [], 2724 | "source": [ 2725 | "optimal_weights_regularized = final_vector[['optimal_weights']].loc[final_vector.index.levels[0][-1]]" 2726 | ] 2727 | }, 2728 | { 2729 | "cell_type": "code", 2730 | "execution_count": null, 2731 | "metadata": {}, 2732 | "outputs": [], 2733 | "source": [ 2734 | "print ('for {} assets with end date: {}'.format(len(optimal_weights_regularized.index),optimal.index[-1]))\n", 2735 | "optimal_weights_regularized.plot.bar(legend=None, title='Portfolio % Holdings by 
Stock')\n", 2736 | "plt.grid(alpha=0.8)" 2737 | ] 2738 | }, 2739 | { 2740 | "cell_type": "code", 2741 | "execution_count": null, 2742 | "metadata": {}, 2743 | "outputs": [], 2744 | "source": [ 2745 | "optimal_weights_regularized = pd.DataFrame(data = optimal_weights_regularized.values, \n", 2746 | " columns = ['optimal_weights'],\n", 2747 | " index = optimal_weights_regularized.index)\n", 2748 | "optimal_weights_regularized.index.name = 'asset'" 2749 | ] 2750 | }, 2751 | { 2752 | "cell_type": "code", 2753 | "execution_count": null, 2754 | "metadata": {}, 2755 | "outputs": [], 2756 | "source": [ 2757 | "q5 = optimal_weights_regularized.sort_values(by='optimal_weights',ascending=False)[:len(optimal_weights_regularized.index)//5]\n", 2758 | "q1 = optimal_weights_regularized.sort_values(by='optimal_weights',ascending=True)[:len(optimal_weights_regularized.index)//5]" 2759 | ] 2760 | }, 2761 | { 2762 | "cell_type": "code", 2763 | "execution_count": null, 2764 | "metadata": {}, 2765 | "outputs": [], 2766 | "source": [ 2767 | "print (q5.iloc[0])\n", 2768 | "print (q1.iloc[0])" 2769 | ] 2770 | }, 2771 | { 2772 | "cell_type": "markdown", 2773 | "metadata": {}, 2774 | "source": [ 2775 | "## 10 - 1 - Risk analysis with optimized weights" 2776 | ] 2777 | }, 2778 | { 2779 | "cell_type": "code", 2780 | "execution_count": null, 2781 | "metadata": {}, 2782 | "outputs": [], 2783 | "source": [ 2784 | "predicted_portfolio_risk,Risk_Model = RM().portfolio_risk(close[optimal_weights_regularized.index],num_factor_exposures=factor_exposures,weights=optimal_weights_regularized)" 2785 | ] 2786 | }, 2787 | { 2788 | "cell_type": "code", 2789 | "execution_count": null, 2790 | "metadata": {}, 2791 | "outputs": [], 2792 | "source": [ 2793 | "print ('Predicted Risk: {} %'.format(np.round((predicted_portfolio_risk*100),2)))" 2794 | ] 2795 | }, 2796 | { 2797 | "cell_type": "code", 2798 | "execution_count": null, 2799 | "metadata": {}, 2800 | "outputs": [], 2801 | "source": [ 2802 | "close.to_csv('data/close.csv')\n", 2803 | "optimal_weights_regularized.to_csv('output/optimal_weights_regularized.csv')" 2804 | ] 2805 | }, 2806 | { 2807 | "cell_type": "code", 2808 | "execution_count": null, 2809 | "metadata": {}, 2810 | "outputs": [], 2811 | "source": [ 2812 | "print ('the total leverage is {}'.format(optimal_weights_regularized.abs().sum().round(2)[0]))\n", 2813 | "print ('the long/short leverage balance is {}'.format(optimal_weights_regularized.sum().round(2)[0]))" 2814 | ] 2815 | } 2816 | ], 2817 | "metadata": { 2818 | "kernelspec": { 2819 | "display_name": "Python 3", 2820 | "language": "python", 2821 | "name": "python3" 2822 | }, 2823 | "language_info": { 2824 | "codemirror_mode": { 2825 | "name": "ipython", 2826 | "version": 3 2827 | }, 2828 | "file_extension": ".py", 2829 | "mimetype": "text/x-python", 2830 | "name": "python", 2831 | "nbconvert_exporter": "python", 2832 | "pygments_lexer": "ipython3", 2833 | "version": "3.8.5" 2834 | } 2835 | }, 2836 | "nbformat": 4, 2837 | "nbformat_minor": 2 2838 | } 2839 | --------------------------------------------------------------------------------