├── codes ├── pyeconometrics │ ├── __pycache__ │ │ ├── 1 │ │ ├── base.cpython-38.pyc │ │ ├── utils.cpython-38.pyc │ │ ├── __init__.cpython-38.pyc │ │ └── panel_discrete_models.cpython-38.pyc │ ├── requirements.txt │ ├── setup.py │ ├── README.md │ ├── utils.py │ ├── base.py │ ├── censored_data_models.py │ └── panel_discrete_models.py ├── pyeconometrics.egg-info │ ├── dependency_links.txt │ ├── top_level.txt │ ├── SOURCES.txt │ └── PKG-INFO ├── datasets │ ├── FraudTrain.txt │ ├── bs_v.3.txt │ ├── diff_test_msft.csv │ ├── diff_test_aapl.csv │ ├── diff_train_msft.csv │ └── diff_train_aapl.csv ├── chp_1.ipynb ├── chp_3.ipynb ├── chp_10.ipynb ├── chp_7.ipynb ├── chp_2.ipynb └── chp_9.ipynb ├── README.md ├── requirements.txt └── License.txt /codes/pyeconometrics/__pycache__/1: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /codes/pyeconometrics.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /codes/pyeconometrics.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pyeconometrics 2 | -------------------------------------------------------------------------------- /codes/pyeconometrics/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | matplotlib 4 | sklearn 5 | pandas -------------------------------------------------------------------------------- /codes/pyeconometrics/__pycache__/base.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdullahkarasan/mlfrm/HEAD/codes/pyeconometrics/__pycache__/base.cpython-38.pyc -------------------------------------------------------------------------------- 
/codes/pyeconometrics/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdullahkarasan/mlfrm/HEAD/codes/pyeconometrics/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /codes/pyeconometrics/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdullahkarasan/mlfrm/HEAD/codes/pyeconometrics/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /codes/pyeconometrics/__pycache__/panel_discrete_models.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abdullahkarasan/mlfrm/HEAD/codes/pyeconometrics/__pycache__/panel_discrete_models.cpython-38.pyc -------------------------------------------------------------------------------- /codes/datasets/FraudTrain.txt: -------------------------------------------------------------------------------- 1 | As FraudTrain dataset is larger than 25MB, I prefer uploading this into google drive, 2 | you can easily go to the following link and download the dataset: 3 | 4 | https://drive.google.com/file/d/1Ko15MscTWzgVIKH64yT0zVeUt0OqmUff/view 5 | -------------------------------------------------------------------------------- /codes/datasets/bs_v.3.txt: -------------------------------------------------------------------------------- 1 | As bs_v.3 dataset is larger than 25MB, I prefer uploading this into google drive, 2 | you can easily go to the following link and download the dataset: 3 | 4 | https://drive.google.com/file/d/1NlVFxDoZXl-eVSyL1ZWXUX4fNt07QopR/view?usp=sharing 5 | -------------------------------------------------------------------------------- /codes/pyeconometrics.egg-info/SOURCES.txt: 
-------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | pyeconometrics/__init__.py 4 | pyeconometrics/base.py 5 | pyeconometrics/censored_data_models.py 6 | pyeconometrics/panel_discrete_models.py 7 | pyeconometrics/setup.py 8 | pyeconometrics/utils.py 9 | pyeconometrics.egg-info/PKG-INFO 10 | pyeconometrics.egg-info/SOURCES.txt 11 | pyeconometrics.egg-info/dependency_links.txt 12 | pyeconometrics.egg-info/top_level.txt -------------------------------------------------------------------------------- /codes/pyeconometrics/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='pyeconometrics', 5 | version='1.0.2', 6 | description='Econometrics Models for Python', 7 | long_description=open('README.md').read(), 8 | author='Nicolas HENNETIER', 9 | author_email='nicolashennetier2@gmail.com', 10 | packages=['pyeconometrics'], 11 | requires=['numpy', 'pandas', 'scipy', 'matplotlib', 'sklearn'] 12 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning for Financial Risk Management with Python 2 | 3 | This repository provides Python code and Jupyter Notebooks accompanying the Machine Learning for Financial Risk Management with Python book published by O'Reilly. 4 | 5 | Buy the book on [Amazon.](https://www.amazon.com/Machine-Learning-Financial-Management-Python/dp/1492085251) 6 | 7 | 8 | github_cover 9 | -------------------------------------------------------------------------------- /codes/pyeconometrics/README.md: -------------------------------------------------------------------------------- 1 | Python Econometrics Models 2 | =========================== 3 | Python package to build econometrics models. 
4 | 5 | Available models 6 | ---------------- 7 | 8 | - Fixed Effects Logistic Regression (Logit) 9 | - Random Effects Logistic Regression (Logit and Probit) 10 | - Tobit I (Linear Regression for truncated data) 11 | 12 | 13 | Installing from Source 14 | ---------------------- 15 | 16 | Download and extract the source distribution from Github 17 | 18 | https://github.com/nicolashennetier/pyeconometrics 19 | 20 | Or clone the bleeding edge code from our repository on github at 21 | 22 | git clone git://github.com/nicolashennetier/pyeconometrics.git 23 | 24 | In the pyeconometrics directory do (with proper permissions) 25 | 26 | python setup.py install -------------------------------------------------------------------------------- /codes/datasets/diff_test_msft.csv: -------------------------------------------------------------------------------- 1 | Date,MSFT 2 | 2020-11-24,3.75 3 | 2020-11-25,0.0099945068359375 4 | 2020-11-27,1.3600006103515625 5 | 2020-11-30,-1.1599884033203125 6 | 2020-12-01,2.1399993896484375 7 | 2020-12-02,-0.8400115966796875 8 | 2020-12-03,-1.1299896240234375 9 | 2020-12-04,0.1199951171875 10 | 2020-12-07,-0.07000732421875 11 | 2020-12-08,1.720001220703125 12 | 2020-12-09,-4.209991455078125 13 | 2020-12-10,-1.279998779296875 14 | 2020-12-11,2.739990234375 15 | 2020-12-14,0.94000244140625 16 | 2020-12-15,-0.0699920654296875 17 | 2020-12-16,5.149993896484375 18 | 2020-12-17,0.1399993896484375 19 | 2020-12-18,-0.8300018310546875 20 | 2020-12-21,4.0 21 | 2020-12-22,1.350006103515625 22 | 2020-12-23,-2.9199981689453125 23 | 2020-12-24,1.7299957275390625 24 | 2020-12-28,2.2100067138671875 25 | 2020-12-29,-0.8100128173828125 26 | 2020-12-30,-2.470001220703125 27 | 2020-12-31,0.7400054931640625 28 | -------------------------------------------------------------------------------- /codes/datasets/diff_test_aapl.csv: -------------------------------------------------------------------------------- 1 | Date,AAPL 2 | 2020-11-24,1.3199996948242188 3 | 
2020-11-25,0.8600006103515625 4 | 2020-11-27,0.55999755859375 5 | 2020-11-30,2.4600067138671875 6 | 2020-12-01,3.6699981689453125 7 | 2020-12-02,0.3600006103515625 8 | 2020-12-03,-0.1399993896484375 9 | 2020-12-04,-0.69000244140625 10 | 2020-12-07,1.5 11 | 2020-12-08,0.6299972534179688 12 | 2020-12-09,-2.5999984741210938 13 | 2020-12-10,1.4599990844726562 14 | 2020-12-11,-0.8299942016601562 15 | 2020-12-14,-0.6300048828125 16 | 2020-12-15,6.099998474121094 17 | 2020-12-16,-0.06999969482421875 18 | 2020-12-17,0.8899993896484375 19 | 2020-12-18,-2.0399932861328125 20 | 2020-12-21,1.5699920654296875 21 | 2020-12-22,3.6500091552734375 22 | 2020-12-23,-0.9199981689453125 23 | 2020-12-24,1.0099945068359375 24 | 2020-12-28,4.720001220703125 25 | 2020-12-29,-1.82000732421875 26 | 2020-12-30,-1.149993896484375 27 | 2020-12-31,-1.029998779296875 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | #requirements 2 | #Note: At the time the book is written, costcla library works compatible with sklearn version of 0.22. 
3 | arch==5.0.1 4 | arviz==0.11.2 5 | basemap==1.2.1 6 | copulae==0.7.5 7 | copulas==0.5.1 8 | costcla==0.6 9 | ctgan==0.4.3 10 | decorator==4.4.2 11 | gap==0.4.6 12 | gap-stat==2.0.1 13 | graphviz==0.17 14 | hmmlearn==0.2.6 15 | keras==2.6.0 16 | missingno==0.5.0 17 | mpl-toolkits.clifford==0.0.3 18 | numpy==1.21.2 19 | numpy-financial==1.0.0 20 | pandas==1.1.4 21 | pandas-datareader==0.10.0 22 | plotly==5.2.1 23 | pmdarima==1.8.2 24 | portfoliolab==0.3.0 25 | py4j==0.10.9 26 | pyensae==1.3.884 27 | pymc3==3.11.4 28 | pyportfolioopt==1.4.2 29 | python-dateutil==2.8.0 30 | Quandl==3.6.1 31 | quantecon==0.5.1 32 | scikit-image==0.18.1 33 | scikit-learn==1.0.1 34 | scipy==1.6.0 35 | sklearn-som==1.1.0 36 | table-evaluator==1.2.2.post1 37 | tables==3.6.1 38 | tensorboard==2.6.0 39 | tensorflow==2.6.0 40 | xgboost==1.4.2 41 | yellowbrick==1.3.post1 42 | yfinance==0.1.63 -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | All the contents and codes, Jupyter Notebooks and other materials in this Github repository related to Machine Learning for Financial Risk Management by Abdullah Karasan, PhD are copyrighted and only intended for personal use. 2 | 3 | Any kind of sharing, distribution, duplication, etc. without written permission by the O'Reilly is prohibited. 4 | 5 | The contents, Python codes, Jupyter Notebooks and other materials come without warranties or representations, to the extent permitted by applicable law. 6 | 7 | Notice that the code provided might be work in progress and that substantial additions, changes, updates, etc. can take place in the future. It is advised to regularly check for updates. 8 | 9 | None of the material represents any kind of recommendation or investment advice. The material is only meant as a technical illustration. 
Leveraged and unleveraged trading of financial instruments, and contracts for difference (CFDs) in particular, involves a number of risks. Make sure to understand and manage these risks. 10 | 11 | (c) Abdullah Karasan, December 2021. 12 | -------------------------------------------------------------------------------- /codes/pyeconometrics.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: pyeconometrics 3 | Version: 1.0.2 4 | Summary: Econometrics Models for Python 5 | Home-page: UNKNOWN 6 | Author: Nicolas HENNETIER 7 | Author-email: nicolashennetier2@gmail.com 8 | License: UNKNOWN 9 | Description: Python Econometrics Models 10 | =========================== 11 | Python package to build econometrics models. 12 | 13 | Available models 14 | ---------------- 15 | 16 | - Fixed Effects Logistic Regression (Logit) 17 | - Random Effects Logistic Regression (Logit and Probit) 18 | - Tobit I (Linear Regression for truncated data) 19 | 20 | 21 | Installing from Source 22 | ---------------------- 23 | 24 | Download and extract the source distribution from Github 25 | 26 | https://github.com/nicolashennetier/pyeconometrics 27 | 28 | Or clone the bleeding edge code from our repository on github at 29 | 30 | git clone git://github.com/nicolashennetier/pyeconometrics.git 31 | 32 | In the pyeconometrics directory do (with proper permissions) 33 | 34 | python setup.py install 35 | Platform: UNKNOWN 36 | Requires: numpy 37 | Requires: pandas 38 | Requires: scipy 39 | Requires: matplotlib 40 | Requires: sklearn 41 | -------------------------------------------------------------------------------- /codes/pyeconometrics/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from math import sqrt, factorial, exp 3 | import scipy.stats as st 4 | 5 | 6 | def norm_cdf(x): 7 | a1 = 0.254829592 8 | a2 = -0.284496736 9 | a3 = 1.421413741 10 | 
# ---------------------------------------------------------------------------
# pyeconometrics/utils.py
# ---------------------------------------------------------------------------
from math import exp, factorial, sqrt
import warnings

import numpy as np
import scipy.stats as st


def norm_cdf(x):
    """Approximate the standard normal CDF at the scalar ``x``.

    The error function is evaluated with the five-coefficient polynomial
    approximation of Abramowitz & Stegun (formula 7.1.26) at
    ``|x| / sqrt(2)`` and mirrored for negative arguments.

    Parameters
    ----------
    x : float
        Point at which the CDF is evaluated.

    Returns
    -------
    float
        Approximation of ``P(Z <= x)`` for a standard normal ``Z``.
    """
    a1 = 0.254829592
    a2 = -0.284496736
    a3 = 1.421413741
    a4 = -1.453152027
    a5 = 1.061405429
    p = 0.3275911

    sign = -1 if x < 0 else 1
    x = abs(x) / sqrt(2.0)

    t = 1.0 / (1.0 + p * x)
    y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-x*x)
    return 0.5 * (1.0 + sign * y)


def unique_permutations(seq):
    """Yield each distinct permutation of ``seq`` in ascending order.

    The elements are sorted first and the classic "next lexicographic
    permutation" step is applied until the sequence is fully descending,
    so repeated elements never produce duplicate permutations.

    Every permutation is yielded as a *fresh* list.  The working list is
    mutated in place between steps, so yielding it directly would make
    ``list(unique_permutations(...))`` a list of aliases of one object.

    Parameters
    ----------
    seq : iterable
        Elements to permute; they must be mutually comparable with ``<``.

    Yields
    ------
    list
        The next permutation.
    """
    i_indices = range(len(seq) - 1, -1, -1)
    k_indices = i_indices[1:]
    seq = sorted(seq)
    while True:
        yield list(seq)
        # Rightmost position whose element is smaller than its successor.
        for k in k_indices:
            if seq[k] < seq[k + 1]:
                break
        else:
            # Sequence is non-increasing: this was the last permutation.
            return
        k_val = seq[k]
        # Rightmost element strictly greater than the pivot.
        for i in i_indices:
            if k_val < seq[i]:
                break
        seq[k], seq[i] = seq[i], seq[k]
        # Reverse the suffix to get the smallest arrangement after the pivot.
        seq[k + 1:] = seq[-1:k:-1]


def nCr(n, r):
    """Return the binomial coefficient ``n choose r`` as a float.

    Returns the sentinel ``101`` when the coefficient cannot be computed:
    a negative or non-integer argument, ``r > n`` (``factorial`` rejects
    the negative ``n - r``), or a value too large for float division.
    NOTE(review): callers presumably read 101 as "more than 100
    combinations" - confirm against panel_discrete_models before changing.
    """
    try:
        return factorial(n) / factorial(r) / factorial(n - r)
    except (ValueError, TypeError, OverflowError):
        return 101


def inverse_mills_ratio(x):
    """Return ``pdf(x) / cdf(x)`` for the standard normal distribution.

    The ratio is computed in log space.  Dividing the raw densities gives
    0/0 (NaN) once both underflow in the far left tail, whereas the
    difference of the logs stays finite there.
    """
    return np.exp(st.norm.logpdf(x) - st.norm.logcdf(x))


def derivate_inverse_mills_ratio(x):
    """Return the derivative of :func:`inverse_mills_ratio` at ``x``.

    Uses the identity ``lambda'(x) = -lambda(x) * (x + lambda(x))``.
    """
    ratio = inverse_mills_ratio(x)
    return -ratio * (x + ratio)


# ---------------------------------------------------------------------------
# pyeconometrics/base.py
# (norm_cdf is the function defined above, i.e. pyeconometrics.utils.norm_cdf)
# ---------------------------------------------------------------------------

# NOTE: kept from the original module - importing it silences *all* warnings
# process-wide.
warnings.filterwarnings('ignore')


class Results():
    """Console reporting helpers shared by the fitted models.

    The methods only read attributes that the concrete model sets while
    fitting: ``output``, ``name``, ``nb_obs``, ``init_ll``, ``final_ll``,
    ``converged``, ``variables``, ``beta``, ``beta_se``,
    ``confidence_interval`` and, for censored models, ``sigma``,
    ``nb_censored_obs`` and ``nb_uncensored_obs``.
    """

    def model_description(self):
        """Print the header block (fit statistics) of the summary table."""
        row = '%-17s %30s %1s %-20s %8s'
        pseudo_r2 = 1 - self.final_ll / self.init_ll
        llr_p_value = 1 - st.chi2.cdf(2 * (self.final_ll - self.init_ll),
                                      len(self.beta))

        print('=' * 80)
        print(row % ('Dep. Variable:', self.output, ' ', 'Pseudo R-squ.:',
                     "%.5f" % pseudo_r2))
        print(row % ('Model:', self.name, ' ', 'Log-Likelihood:',
                     "%.3f" % self.final_ll))
        print(row % ('Method:', 'MLE', ' ', 'LL-Null:',
                     "%.3f" % self.init_ll))
        print(row % ('No. Observations:', self.nb_obs, ' ', 'LLR p-value:',
                     "%.3f" % llr_p_value))
        print(row % ('Df Model:', len(self.beta) - 1, ' ', ' ', ' '))
        print(row % ('Converged:', self.converged, ' ', ' ', ' '))
        print('=' * 80)

    def columns_header(self):
        """Print the column titles of the coefficient table."""
        print('%25s %8s %8s %8s %8s %18s'
              % (' ', 'coef', 'std err', 't', 'P>|t|', '[95.0% Conf. Int.]'))
        print('-' * 80)

    def beta_description(self):
        """Print one row per explanatory variable.

        The ``P>|t|`` column is the two-sided p-value of the ratio
        ``coef / std err`` under a standard normal reference, which is
        what the column title denotes.
        """
        regressors = [x for x in self.variables if x != self.output]
        for i, var in enumerate(regressors):
            coef = self.beta[i]
            std_err = self.beta_se[i]
            p_value = 2 * st.norm.sf(abs(coef) / std_err)
            print('%-24s %8s %8s %8s %8s %9s %9s'
                  % (var,
                     "%.4f" % coef,
                     "%.3f" % std_err,
                     "%.3f" % (coef / std_err),
                     "%.3f" % p_value,
                     "%.3f" % self.confidence_interval[i, 0],
                     "%.3f" % self.confidence_interval[i, 1]))
        print('-' * 80)

    def sigma_description(self):
        """Print the row for the scale parameter (last entry of ``beta_se``)."""
        print('%-24s %8s %8s %8s %8s %9s %9s'
              % ('/sigma',
                 "%.4f" % self.sigma,
                 "%.3f" % self.beta_se[-1],
                 "",
                 "",
                 "%.3f" % self.confidence_interval[-1, 0],
                 "%.3f" % self.confidence_interval[-1, 1]))
        print('-' * 80)

    def censored_data_description(self):
        """Print the counts of censored and uncensored observations."""
        print('%27s %-52s'
              % ('Obs. summary:',
                 '%s censored observations' % self.nb_censored_obs))
        print('%27s %-52s'
              % ('',
                 '%s uncensored observations' % self.nb_uncensored_obs))


class BaseModel(Results):
    '''Base class inherited by other models
    Not intended to be used separately
    '''

    def input_data_preparation(self, X, drop_na=None, fill_value=None):
        """Return ``X`` after missing-value handling (see below)."""
        return self.handle_missing_values(X, drop_na, fill_value)

    def handle_missing_values(self, X, drop_na=None, fill_value=None):
        """Drop or fill missing values of ``X`` in place and return it.

        Parameters
        ----------
        X : DataFrame
            Data to clean; it is modified in place.
        drop_na : bool or None
            ``None`` leaves ``X`` untouched, ``True`` drops incomplete
            rows, ``False`` fills according to ``fill_value``.
        fill_value : {'mean', 'median'}, mapping or None
            Only used when ``drop_na`` is ``False``.  A mapping gives the
            replacement per column name; columns it does not mention are
            left as they are.

        Raises
        ------
        ValueError
            If ``fill_value`` is neither an accepted keyword nor a mapping.
        """
        if drop_na is None:
            return X

        if drop_na:
            X.dropna(inplace=True)
        elif fill_value is None:
            pass
        elif isinstance(fill_value, str) and fill_value == 'mean':
            X.fillna(X.mean(), inplace=True)
        elif isinstance(fill_value, str) and fill_value == 'median':
            X.fillna(X.median(), inplace=True)
        else:
            try:
                per_column = {var: fill_value.get(var) for var in X.columns}
            except AttributeError as err:
                raise ValueError('\'fill_value\' argument must be in list '
                    + '[\'mean\', \'median\'] or of type dict. See docstring for more info.') from err
            # Only fill the columns the caller actually provided a value for.
            per_column = {var: val for var, val in per_column.items()
                          if val is not None}
            if per_column:
                # Frame-level fillna: the column-wise inplace form works on
                # a temporary under pandas copy-on-write and changes nothing.
                X.fillna(value=per_column, inplace=True)

        return X

    def plot_trace_estimators(self):
        """Plot the value of every estimator along the fitting iterations."""
        if self.beta is None:
            raise AttributeError('Fit method should be called before evaluating of the model')

        # Imported here so the models stay usable without a plotting backend.
        import matplotlib.pyplot as plt

        colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
        iterations = np.arange(1, len(self.beta_est) + 1)
        for k in range(len(self.beta)):
            plt.plot(iterations,
                     self.beta_est[:, k],
                     color=colors[(k - 1) % len(colors)],
                     label="Beta_%s" % k)

        plt.xlim((1, len(self.beta_est) * 1.2))
        plt.xlabel('Iterations')
        plt.ylabel('Estimators')
        plt.title('Trace plot of estimators of beta', size=16)
        plt.legend(loc='best')
        plt.show()

    def _design_matrix(self, X):
        """Return a prepared copy of ``X`` with a leading ``_cons`` column.

        A copy is used because inserting ``_cons`` into the caller's frame
        makes any second prediction on the same frame fail.
        """
        X = self.input_data_preparation(X.copy())
        X.insert(0, '_cons', 1)
        return X

    def predict(self, X):
        """Return the predicted label (0 or 1) for each row of ``X``.

        Raises
        ------
        AttributeError
            If the model has not been fitted (``beta`` is ``None``).
        ValueError
            If the result cannot be cast to integers (missing data).
        """
        if self.beta is None:
            raise AttributeError('Fit method should be called before evaluating the model.')

        Z = self.response_function(self._design_matrix(X), self.beta)
        result = (np.sign(Z) + 1) / 2

        try:
            return result.astype(int)
        except (ValueError, TypeError) as err:
            raise ValueError('One or several data are missing.') from err

    def predict_proba(self, X):
        """Return ``norm_cdf`` of the response for each row of ``X``.

        Raises
        ------
        AttributeError
            If the model has not been fitted (``beta`` is ``None``).
        """
        if self.beta is None:
            raise AttributeError('Fit method should be called before evaluating the model.')

        Z = self.response_function(self._design_matrix(X), self.beta)
        if hasattr(Z, 'apply'):
            return Z.apply(norm_cdf)
        # Plain arrays have no ``apply``; map element-wise instead.
        return np.vectorize(norm_cdf, otypes=[float])(Z)

    def summary(self):
        """Print the fit statistics and the coefficient table."""
        if self.beta is None:
            raise AttributeError('Fit method should be called before evaluating of the model.')

        self.model_description()
        self.columns_header()
        self.beta_description()


class PanelBaseModel(BaseModel):
    '''Base class inherited by other models
    Not intended to be used separately
    '''

    def input_data_preparation(self, X, drop_na=None, fill_value=None):
        """Convert ``X`` with ``to_frame`` when available, else validate it.

        Objects without a ``to_frame`` method (e.g. a DataFrame) must
        carry a two-level index.

        Raises
        ------
        ValueError
            If ``X`` has no ``to_frame`` and its index is not two-level.
        """
        try:
            X = X.to_frame()
        except AttributeError:
            if len(X.index.names) != 2:
                raise ValueError("Only 2-level MultiIndex and Panel are supported.")

        return self.handle_missing_values(X, drop_na, fill_value)


class CensoredBaseModel(BaseModel):
    '''Base class inherited by other models
    Not intended to be used separately
    '''

    def input_data_preparation(self, X, drop_na=None, fill_value=None):
        """Validate the output column, then handle missing values.

        When ``self.output`` is a column of ``X`` it must contain no
        negative value and at least one value that is exactly 0.

        Raises
        ------
        ValueError
            If the output column violates either condition.
        """
        if self.output in X.columns:
            y = X[self.output]
            if (y < 0).any():
                raise ValueError("Negative values were found in output variable. "
                    + "Please set all censored observations to 0 before fitting the model.")
            if not (y == 0).any():
                raise ValueError("No censored observations were found. "
                    + "Please set output of all censored observations to 0 before fitting the model.")

        return self.handle_missing_values(X, drop_na, fill_value)

    def summary(self):
        """Print the base summary followed by the sigma and censoring rows."""
        super().summary()
        self.sigma_description()
        self.censored_data_description()
\ 180 | + "Please set output of all censored observations to 0 before fitting the model.") 181 | 182 | X = self.handle_missing_values(X, drop_na, fill_value) 183 | return X 184 | 185 | def summary(self): 186 | BaseModel.summary(self) 187 | self.sigma_description() 188 | self.censored_data_description() 189 | 190 | -------------------------------------------------------------------------------- /codes/chp_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Risk-Return" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "#!pip install plotly\n", 17 | "import statsmodels.api as sm\n", 18 | "import numpy as np\n", 19 | "import plotly.graph_objs as go\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "import plotly\n", 22 | "import warnings\n", 23 | "warnings.filterwarnings('ignore')\n", 24 | "plt.rcParams['figure.dpi'] = 300\n", 25 | "plt.rcParams['savefig.dpi'] = 300" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "n_assets = 5\n", 35 | "n_simulation = 500" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "returns = np.random.randn(n_assets, n_simulation)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "rand = np.random.rand(n_assets)\n", 54 | "weights = rand/sum(rand)\n", 55 | "\n", 56 | "\n", 57 | "def port_return(returns):\n", 58 | " rets = np.mean(returns, axis=1)\n", 59 | " cov = np.cov(rets.T, aweights=weights, ddof=1)\n", 60 | " portfolio_returns = np.dot(weights, rets.T)\n", 61 | " portfolio_std_dev = np.sqrt(np.dot(weights, np.dot(cov, weights)))\n", 62 | " 
return portfolio_returns, portfolio_std_dev" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "portfolio_returns, portfolio_std_dev = port_return(returns)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "print(portfolio_returns)\n", 81 | "print(portfolio_std_dev)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "portfolio = np.array([port_return(np.random.randn(n_assets, i))\n", 91 | " for i in range(1, 101)])" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "best_fit = sm.OLS(portfolio[:, 1], sm.add_constant(portfolio[:, 0]))\\\n", 101 | " .fit().fittedvalues" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "scrolled": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "fig = go.Figure()\n", 113 | "fig.add_trace(go.Scatter(name='Risk-Return Relationship',\n", 114 | " x=portfolio[:, 0],\n", 115 | " y=portfolio[:, 1], mode='markers'))\n", 116 | "fig.add_trace(go.Scatter(name='Best Fit Line',\n", 117 | " x=portfolio[:, 0],\n", 118 | " y=best_fit, mode='lines'))\n", 119 | "fig.update_layout(xaxis_title = 'Return',\n", 120 | " yaxis_title = 'Standard Deviation',\n", 121 | " width=900, height=470)\n", 122 | "fig.show()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "## Adverse Selection" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import matplotlib.pyplot as plt\n", 139 | "import numpy as np\n", 140 | "plt.style.use('seaborn')" 141 | ] 142 | }, 143 | { 144 | "cell_type": 
"code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "def utility(x):\n", 150 | " return(np.exp(x**gamma))" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "pi = np.random.uniform(0,1,20)\n", 160 | "pi = np.sort(pi)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "print('The highest three probability of losses are {}'\n", 170 | " .format(pi[-3:]))" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "y = 2\n", 180 | "c = 1.5\n", 181 | "Q = 5\n", 182 | "D = 0.01\n", 183 | "gamma = 0.4" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "def supply(Q):\n", 193 | " return(np.mean(pi[-Q:]) * c)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "def demand(D):\n", 203 | " return(np.sum(utility(y - D) > pi * utility(y - c) + (1 - pi) \n", 204 | " * utility(y)))" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "plt.figure()\n", 214 | "plt.plot([demand(i) for i in np.arange(0, 1.9, 0.02)],\n", 215 | " np.arange(0, 1.9, 0.02), \n", 216 | " 'r', label='insurance demand')\n", 217 | "plt.plot(range(1,21), [supply(j) for j in range(1,21)],\n", 218 | " 'g', label='insurance supply')\n", 219 | "plt.ylabel(\"Average Cost\")\n", 220 | "plt.xlabel(\"Number of People\")\n", 221 | "plt.legend()\n", 222 | "plt.show()\n" 223 | ] 224 | } 225 | ], 226 | "metadata": { 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": 
"python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.8.8" 243 | }, 244 | "latex_envs": { 245 | "LaTeX_envs_menu_present": true, 246 | "autoclose": false, 247 | "autocomplete": true, 248 | "bibliofile": "biblio.bib", 249 | "cite_by": "apalike", 250 | "current_citInitial": 1, 251 | "eqLabelWithNumbers": true, 252 | "eqNumInitial": 1, 253 | "hotkeys": { 254 | "equation": "Ctrl-E", 255 | "itemize": "Ctrl-I" 256 | }, 257 | "labels_anchors": false, 258 | "latex_user_defs": false, 259 | "report_style_numbering": false, 260 | "user_envs_cfg": false 261 | }, 262 | "toc": { 263 | "base_numbering": 1, 264 | "nav_menu": {}, 265 | "number_sections": false, 266 | "sideBar": true, 267 | "skip_h1_title": false, 268 | "title_cell": "Table of Contents", 269 | "title_sidebar": "Contents", 270 | "toc_cell": false, 271 | "toc_position": {}, 272 | "toc_section_display": true, 273 | "toc_window_display": false 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 4 278 | } 279 | -------------------------------------------------------------------------------- /codes/pyeconometrics/censored_data_models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import scipy.stats as st 4 | 5 | import warnings 6 | warnings.filterwarnings('ignore') 7 | 8 | from numpy.linalg import inv 9 | from math import exp, sqrt, log, pi 10 | 11 | from pyeconometrics.base import CensoredBaseModel 12 | from pyeconometrics.utils import inverse_mills_ratio, derivate_inverse_mills_ratio 13 | 14 | 15 | 16 | class TobitModel(CensoredBaseModel): 17 | '''Fixed Effects Logit model for Panel Data 18 | Estimation of parameters with the Conditional Maximum 
Likelihood method 19 | ''' 20 | def __init__(self): 21 | self.name = 'Tobit I Model' 22 | self.output = None 23 | self.variables = None 24 | self.nb_obs = None 25 | self.nb_censored_obs = None 26 | self.nb_uncensored_obs = None 27 | self.init_ll = None 28 | self.beta = None 29 | self.sigma = None 30 | self.beta_est = None 31 | self.beta_se = None 32 | self.confidence_interval = None 33 | self.final_ll = None 34 | self.converged = None 35 | 36 | def response_function(self, X, beta): 37 | A = X.copy() 38 | try: 39 | A.drop(self.output, axis=1, inplace=True) 40 | except: 41 | pass 42 | 43 | return np.array(A).dot(beta) 44 | 45 | def __log_likelihood_censored(self, X, beta, sigma): 46 | Z = np.array(self.response_function(X, beta)) 47 | Z = Z/sigma 48 | 49 | norm_cdf_vec = np.vectorize(st.norm.cdf) 50 | result = np.sum(np.log(norm_cdf_vec(Z))) 51 | 52 | return result 53 | 54 | def __log_likelihood_uncensored(self, X, beta, sigma): 55 | Z = np.array(self.response_function(X, beta)) 56 | y = np.array(X[self.output]) 57 | Z = 0.5 * np.multiply((y - Z)/sigma, (y - Z)/sigma) 58 | result = np.sum(Z) 59 | 60 | return result 61 | 62 | def __log_likelihood(self, X, beta, sigma): 63 | X_cens = X[X[self.output]==0] 64 | X_uncens = X[X[self.output]>0] 65 | 66 | result = - self.__log_likelihood_censored(X, beta, sigma) \ 67 | - self.__log_likelihood_uncensored(X, beta, sigma) \ 68 | - len(X_uncens) * log(sigma * sqrt(2*pi)) 69 | 70 | return result 71 | 72 | def __grad_b_log_likelihood(self, X, b, s): 73 | X_cens = X[X[self.output]==0] 74 | X_uncens = X[X[self.output]>0] 75 | y_uncens = X_uncens[self.output] 76 | X_cens.drop(self.output, axis=1, inplace=True) 77 | X_uncens.drop(self.output, axis=1, inplace=True) 78 | 79 | inverse_mills_ratio_vec = np.vectorize(inverse_mills_ratio) 80 | 81 | grad_cens = inverse_mills_ratio_vec(np.array(self.response_function(X_cens, b), ndmin=2)) 82 | grad_cens = - np.sum(np.array(X_cens) * grad_cens.T, axis=0) 83 | 84 | grad_uncens = s * 
np.array(y_uncens, ndmin=2) - np.array(self.response_function(X_uncens, b), ndmin=2) 85 | grad_uncens = np.sum(np.array(X_uncens) * grad_uncens.T, axis=0) 86 | 87 | result = grad_cens + grad_uncens 88 | return result 89 | 90 | def __derivate_s_log_likelihood(self, X, b, s): 91 | X_uncens = X[X[self.output]>0] 92 | y_uncens = X_uncens[self.output] 93 | X_uncens.drop(self.output, axis=1, inplace=True) 94 | 95 | inverse_mills_ratio_vec = np.vectorize(inverse_mills_ratio) 96 | 97 | grad_uncens = s * np.array(y_uncens) - np.array(self.response_function(X_uncens, b)) 98 | grad_uncens = - np.sum(np.multiply(y_uncens, grad_uncens)) 99 | 100 | result = grad_uncens + len(X_uncens)/s 101 | return result 102 | 103 | def __score(self, X, b, s): 104 | return np.concatenate([self.__grad_b_log_likelihood(X, b, s), 105 | np.array(self.__derivate_s_log_likelihood(X, b, s), ndmin=1)]) 106 | 107 | def __hessian_b_b(self, X, b, s): 108 | X_uncens = X[X[self.output]>0] 109 | y_uncens = X_uncens[self.output] 110 | X_uncens.drop(self.output, axis=1, inplace=True) 111 | 112 | derivate_inverse_mills_ratio_vec = np.vectorize(derivate_inverse_mills_ratio) 113 | hessian_uncens = 1 + derivate_inverse_mills_ratio_vec(-np.array(self.response_function(X_uncens, b), ndmin=2)) 114 | 115 | list_XXT = [] 116 | for i in range(X_uncens.shape[0]): 117 | row = np.array(np.array(X_uncens)[i,:], ndmin=2) 118 | list_XXT.append(row.T.dot(row)) 119 | hessian_uncens = [-hessian_uncens[0,i]*list_XXT[i] for i in range(len(list_XXT))] 120 | 121 | result = sum(hessian_uncens) 122 | return result 123 | 124 | 125 | def __hessian_s_s(self, X, b, s): 126 | X_uncens = X[X[self.output]>0] 127 | y_uncens = X_uncens[self.output] 128 | 129 | item1 = -np.multiply(np.array(y_uncens), np.array(y_uncens)) 130 | item2 = -1/s**2 131 | result = np.sum(item1 + item2) 132 | return result 133 | 134 | def __hessian_b_s(self, X, b, s): 135 | X_uncens = X[X[self.output]>0] 136 | y_uncens = X_uncens[self.output] 137 | 
def __hessian(self, X, b, s):
    '''Assemble the full Hessian matrix from its three blocks

    The blocks returned by ``__hessian_b_b``, ``__hessian_b_s`` and
    ``__hessian_s_s`` are all evaluated at the same point (b, s) and
    stacked as [[H_bb, H_bs], [H_bs', H_ss]].

    Parameters:
    ----------
    X: Dataframe
        Data handed unchanged to the three block helpers

    b: array
        Coefficient part of the point at which the Hessian is evaluated

    s: float
        Scale part of the point at which the Hessian is evaluated

    Returns:
    -------
    2-D numpy array with one more row and one more column than the
    b-b block
    '''
    # Every block gets its own name.  The cross block used to be stored
    # in ``b``, so ``__hessian_s_s`` received the cross-derivatives
    # instead of the coefficients it is supposed to be evaluated at.
    hess_bb = self.__hessian_b_b(X, b, s)
    hess_bs = self.__hessian_b_s(X, b, s)
    hess_ss = self.__hessian_s_s(X, b, s)

    # Upper rows: b-b block with the cross block appended as a column.
    upper = np.concatenate([hess_bb, np.array(hess_bs, ndmin=2).T], axis=1)
    # Last row: the cross block followed by the s-s term.
    lower = np.array(
        np.concatenate([hess_bs.T, np.array(hess_ss, ndmin=1)]), ndmin=2)
    return np.concatenate([upper, lower], axis=0)


def fit(self, X, output, nb_iter=20, drop_na=True, fill_value=None, verbose=False):
    '''Maximum Likelihood Estimation
    Implement a Newton-Raphson algorithm to estimate parameters

    Parameters:
    ----------
    X: Dataframe
        Database to fit the model

    output: string
        Name of the variable to predict

    nb_iter: integer (optional, default 20)
        Maximal number of iteration before the end of the Newton-Raphson algorithm

    drop_na: boolean (optional, default True)
        Indicate the method to handle missing values in X
        If drop_na = False, fill_value has to be given

    fill_value: string or dict (optional, default None)
        Considered only if drop_na = False
        Possible values:
            - 'mean': missing values of a column are replaced by the mean of that column
            - 'median': missing values of a column are replaced by the median of that column
            - dict: keys must be variables' names and associated values the values used to fill Nan

    verbose: boolean (optional, default False)
        If set to True, allows prints of Newton-Raphson algorithm's progress

    Returns:
    -------
    self, with the attributes output, nb_obs, nb_censored_obs,
    nb_uncensored_obs, variables, beta_est, init_ll, beta, sigma,
    beta_se, confidence_interval, final_ll and converged set

    Raises:
    ------
    ValueError
        If the Hessian cannot be inverted during an iteration
    '''
    self.output = output
    X = self.input_data_preparation(X.copy(), drop_na, fill_value)
    X.insert(0, '_cons', 1)

    self.nb_obs = len(X)
    self.nb_censored_obs = len(X[X[self.output] == 0])
    self.nb_uncensored_obs = len(X[X[self.output] > 0])

    self.variables = [x for x in X.columns if x != self.output]

    # Start from zero coefficients; the last slot holds sigma, set to 1.
    beta_init = [0] * len(self.variables) + [1]
    self.beta_est = np.zeros((nb_iter, len(beta_init)))
    self.beta_est[0] = beta_init

    self.init_ll = self.__log_likelihood(X, beta_init[:-1], beta_init[-1])

    if verbose:
        print('Initial log-likelihood : '+ str(self.init_ll))
        print('Parameters estimation in progress.')

    current_ll = self.init_ll
    prev_ll = self.init_ll
    j = 1
    # Always take the first step, then continue only while the
    # log-likelihood still improves by more than 0.01.
    while (j < nb_iter) and (j == 1 or (current_ll - prev_ll > 0.01)):
        # The Newton step is taken on (b, s) = (beta / sigma, 1 / sigma).
        b = self.beta_est[j-1, :-1] / self.beta_est[j-1, -1]
        s = 1 / self.beta_est[j-1, -1]

        score = self.__score(X, b, s)
        hessian = self.__hessian(X, b, s)

        try:
            step = inv(hessian).dot(score)
        except np.linalg.LinAlgError as err:
            # Only a failed inversion is translated; anything else
            # (including KeyboardInterrupt) propagates untouched.
            # NOTE(review): the message wording looks copied from a
            # classification model -- kept as-is for callers matching it.
            raise ValueError('Improper classification problem'
                             + ', should be 2 different labels') from err

        b -= step[:-1]
        s -= step[-1]
        # NOTE(review): sigma is mapped back with 1/s but b is stored
        # without dividing by s, although the next iteration divides the
        # stored value by sigma again -- confirm the intended
        # parametrisation against __log_likelihood.
        self.beta_est[j] = np.concatenate([b, np.array(1/s, ndmin=1)])

        prev_ll = current_ll
        if self.beta_est[j, -1] > 0:
            current_ll = self.__log_likelihood(X, self.beta_est[j, :-1],
                                               self.beta_est[j, -1])
            if verbose:
                print('Iteration %s, log_likelihood : %s'\
                    % (j, current_ll))
        else:
            # A non-positive sigma is rejected: force the loop to stop.
            current_ll = prev_ll - 1
        j += 1

    # On exit prev_ll is the log-likelihood of iterate j-2, which is the
    # value reported as final_ll below, so that same iterate is the one
    # reported as the estimate.  (Index j-3 returned the previous iterate
    # and wrapped around to the last, unused row when j == 2.)
    self.beta = self.beta_est[j-2, :-1]
    self.sigma = self.beta_est[j-2, -1]
    self.beta_est = self.beta_est[:j-1, :]

    sqrt_vec = np.vectorize(sqrt)
    b = self.beta/self.sigma
    s = 1/self.sigma
    hessian = self.__hessian(X, b, s)
    self.beta_se = sqrt_vec(-inv(hessian).diagonal())

    # Two-sided 95% normal quantile, computed once.
    z = st.norm.ppf(0.975)
    self.confidence_interval = np.array(
        [[self.beta[i] - z * self.beta_se[i],
          self.beta[i] + z * self.beta_se[i]]
         for i in range(len(self.beta))]
        + [[self.sigma - z * self.beta_se[-1],
            self.sigma + z * self.beta_se[-1]]])

    self.final_ll = prev_ll

    # Stopping before the iteration budget is used up means the
    # improvement criterion ended the loop.
    self.converged = j < nb_iter

    return self
activation='linear'))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "subsequent-distinction", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "model.compile(optimizer='rmsprop',\n", 71 | " loss='mean_squared_error',\n", 72 | " metrics=['mse'])" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "russian-donna", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "def split_sequence(sequence, n_steps):\n", 83 | " X, y = [], []\n", 84 | " for i in range(len(sequence)):\n", 85 | " end_ix = i + n_steps\n", 86 | " if end_ix > len(sequence) - 1:\n", 87 | " break\n", 88 | " seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]\n", 89 | " X.append(seq_x)\n", 90 | " y.append(seq_y)\n", 91 | " return np.array(X), np.array(y)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "generic-missouri", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "ticker = ['AAPL', 'MSFT']\n", 102 | "start = datetime.datetime(2019, 1, 1)\n", 103 | "end = datetime.datetime(2020, 1 ,1)\n", 104 | "stock_prices = yf.download(ticker,start=start, end = end, interval='1d')\\\n", 105 | " .Close" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "informational-steering", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "diff_stock_prices = stock_prices.diff().dropna()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "moral-knock", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "split = int(len(diff_stock_prices['AAPL'].values) * 0.95)\n", 126 | "diff_train_aapl = diff_stock_prices['AAPL'].iloc[:split]\n", 127 | "diff_test_aapl = diff_stock_prices['AAPL'].iloc[split:]\n", 128 | "diff_train_msft = diff_stock_prices['MSFT'].iloc[:split]\n", 129 | "diff_test_msft = diff_stock_prices['MSFT'].iloc[split:]" 130 | ] 131 | }, 
132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "obvious-logging", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "X_aapl, y_aapl = split_sequence(diff_train_aapl, n_steps)\n", 140 | "X_aapl = X_aapl.reshape((X_aapl.shape[0], X_aapl.shape[1],\n", 141 | " n_features))" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "narrow-department", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "history = model.fit(X_aapl, y_aapl, \n", 152 | " epochs=400, batch_size=150, verbose=0, \n", 153 | " validation_split = 0.10)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "unlimited-attachment", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "start = X_aapl[X_aapl.shape[0] - n_steps]\n", 164 | "x_input = start\n", 165 | "x_input = x_input.reshape((1, n_steps, n_features))" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "experimental-sight", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "tempList_aapl = []\n", 176 | "for i in range(len(diff_test_aapl)):\n", 177 | " x_input = x_input.reshape((1, n_steps, n_features))\n", 178 | " yhat = model.predict(x_input, verbose=0)\n", 179 | " x_input = np.append(x_input, yhat)\n", 180 | " x_input = x_input[1:]\n", 181 | " tempList_aapl.append(yhat)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "appropriate-killer", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "X_msft, y_msft = split_sequence(diff_train_msft, n_steps)\n", 192 | "X_msft = X_msft.reshape((X_msft.shape[0], X_msft.shape[1],\n", 193 | " n_features))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "animated-school", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "history = model.fit(X_msft, y_msft, 
\n", 204 | " epochs=400, batch_size=150, verbose=0, \n", 205 | " validation_split = 0.10)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "proprietary-limit", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "start = X_msft[X_msft.shape[0] - n_steps]\n", 216 | "x_input = start\n", 217 | "x_input = x_input.reshape((1, n_steps, n_features))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "invalid-episode", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "tempList_msft = []\n", 228 | "for i in range(len(diff_test_msft)):\n", 229 | " x_input = x_input.reshape((1, n_steps, n_features))\n", 230 | " yhat = model.predict(x_input, verbose=0)\n", 231 | " x_input = np.append(x_input, yhat)\n", 232 | " x_input = x_input[1:]\n", 233 | " tempList_msft.append(yhat)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "hidden-johnston", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "fig, ax = plt.subplots(2,1, figsize=(18,15))\n", 244 | "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n", 245 | "ax[0].plot(diff_test_aapl.index, np.array(tempList_aapl).flatten(),\n", 246 | " linestyle='solid', label=\"Prediction\")\n", 247 | "ax[0].set_title('Predicted Stock Price-Apple')\n", 248 | "ax[0].legend(loc='best')\n", 249 | "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n", 250 | "ax[1].plot(diff_test_msft.index,np.array(tempList_msft).flatten(),\n", 251 | " linestyle='solid', label=\"Prediction\")\n", 252 | "ax[1].set_title('Predicted Stock Price-Microsoft')\n", 253 | "ax[1].legend(loc='best')\n", 254 | "\n", 255 | "\n", 256 | "for ax in ax.flat:\n", 257 | " ax.set(xlabel='Date', ylabel='Differenced Price')\n", 258 | "plt.show()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "id": "quiet-concentration", 264 | "metadata": {}, 265 | 
"source": [ 266 | "## LSTM" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "id": "light-validity", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "from tensorflow.keras.layers import LSTM\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "id": "short-advocacy", 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "n_steps = 13\n", 287 | "n_features = 1" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "boring-binding", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "model = Sequential()\n", 298 | "model.add(LSTM(512, activation='relu',\n", 299 | " input_shape=(n_steps, n_features),\n", 300 | " return_sequences=True))\n", 301 | "model.add(Dropout(0.2))\n", 302 | "model.add(LSTM(256,activation='relu'))\n", 303 | "model.add(Flatten())\n", 304 | "model.add(Dense(1, activation='linear'))" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "id": "great-meter", 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "model.compile(optimizer='rmsprop', loss='mean_squared_error',\n", 315 | " metrics=['mse'])" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "id": "bound-supervisor", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "history = model.fit(X_aapl, y_aapl, \n", 326 | " epochs=400, batch_size=150, verbose=0, \n", 327 | " validation_split = 0.10)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "graduate-truth", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "start = X_aapl[X_aapl.shape[0] - 13]\n", 338 | "x_input = start\n", 339 | "x_input = x_input.reshape((1, n_steps, n_features))" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "polished-internship", 346 | 
"metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "tempList_aapl = []\n", 350 | "for i in range(len(diff_test_aapl)):\n", 351 | " x_input = x_input.reshape((1, n_steps, n_features))\n", 352 | " yhat = model.predict(x_input, verbose=0)\n", 353 | " x_input = np.append(x_input, yhat)\n", 354 | " x_input = x_input[1:]\n", 355 | " tempList_aapl.append(yhat)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "utility-adaptation", 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "history = model.fit(X_msft, y_msft, \n", 366 | " epochs=400, batch_size=150, verbose=0, \n", 367 | " validation_split = 0.10)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "id": "given-copyright", 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "start = X_msft[X_msft.shape[0] - 13]\n", 378 | "x_input = start\n", 379 | "x_input = x_input.reshape((1, n_steps, n_features))" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "id": "adjusted-discipline", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "tempList_msft = []\n", 390 | "for i in range(len(diff_test_msft)):\n", 391 | " x_input = x_input.reshape((1, n_steps, n_features))\n", 392 | " yhat = model.predict(x_input, verbose=0)\n", 393 | " x_input = np.append(x_input, yhat)\n", 394 | " x_input = x_input[1:]\n", 395 | " tempList_msft.append(yhat)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "id": "hispanic-labor", 402 | "metadata": { 403 | "scrolled": true 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "fig, ax = plt.subplots(2, 1, figsize=(18, 15))\n", 408 | "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n", 409 | "ax[0].plot(diff_test_aapl.index, np.array(tempList_aapl).flatten(),\n", 410 | " linestyle='solid', label=\"Prediction\")\n", 411 | "ax[0].set_title('Predicted Stock 
Price-Apple')\n", 412 | "ax[0].legend(loc='best')\n", 413 | "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n", 414 | "ax[1].plot(diff_test_msft.index, np.array(tempList_msft).flatten(),\n", 415 | " linestyle='solid', label=\"Prediction\")\n", 416 | "ax[1].set_title('Predicted Stock Price-Microsoft')\n", 417 | "ax[1].legend(loc='best')\n", 418 | "\n", 419 | "for ax in ax.flat:\n", 420 | " ax.set(xlabel='Date', ylabel='$')\n", 421 | "plt.show()" 422 | ] 423 | } 424 | ], 425 | "metadata": { 426 | "kernelspec": { 427 | "display_name": "Python 3", 428 | "language": "python", 429 | "name": "python3" 430 | }, 431 | "language_info": { 432 | "codemirror_mode": { 433 | "name": "ipython", 434 | "version": 3 435 | }, 436 | "file_extension": ".py", 437 | "mimetype": "text/x-python", 438 | "name": "python", 439 | "nbconvert_exporter": "python", 440 | "pygments_lexer": "ipython3", 441 | "version": "3.8.8" 442 | }, 443 | "latex_envs": { 444 | "LaTeX_envs_menu_present": true, 445 | "autoclose": false, 446 | "autocomplete": true, 447 | "bibliofile": "biblio.bib", 448 | "cite_by": "apalike", 449 | "current_citInitial": 1, 450 | "eqLabelWithNumbers": true, 451 | "eqNumInitial": 1, 452 | "hotkeys": { 453 | "equation": "Ctrl-E", 454 | "itemize": "Ctrl-I" 455 | }, 456 | "labels_anchors": false, 457 | "latex_user_defs": false, 458 | "report_style_numbering": false, 459 | "user_envs_cfg": false 460 | }, 461 | "toc": { 462 | "base_numbering": 1, 463 | "nav_menu": {}, 464 | "number_sections": true, 465 | "sideBar": true, 466 | "skip_h1_title": false, 467 | "title_cell": "Table of Contents", 468 | "title_sidebar": "Contents", 469 | "toc_cell": false, 470 | "toc_position": {}, 471 | "toc_section_display": true, 472 | "toc_window_display": false 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 5 477 | } 478 | -------------------------------------------------------------------------------- /codes/datasets/diff_train_msft.csv: 
-------------------------------------------------------------------------------- 1 | Date,MSFT 2 | 2019-01-02,-0.4499969482421875 3 | 2019-01-03,-3.720001220703125 4 | 2019-01-04,4.529998779296875 5 | 2019-01-07,0.12999725341796875 6 | 2019-01-08,0.7400054931640625 7 | 2019-01-09,1.4699935913085938 8 | 2019-01-10,-0.6699981689453125 9 | 2019-01-11,-0.7999954223632812 10 | 2019-01-14,-0.75 11 | 2019-01-15,2.9599990844726562 12 | 2019-01-16,0.3699951171875 13 | 2019-01-17,0.7400054931640625 14 | 2019-01-18,1.589996337890625 15 | 2019-01-22,-2.029998779296875 16 | 2019-01-23,1.029998779296875 17 | 2019-01-24,-0.5100021362304688 18 | 2019-01-25,0.970001220703125 19 | 2019-01-28,-2.089996337890625 20 | 2019-01-29,-2.1399993896484375 21 | 2019-01-30,3.4399948120117188 22 | 2019-01-31,-1.9499969482421875 23 | 2019-02-01,-1.6500015258789062 24 | 2019-02-04,2.9599990844726562 25 | 2019-02-05,1.4800033569335938 26 | 2019-02-06,-1.19000244140625 27 | 2019-02-07,-0.7600021362304688 28 | 2019-02-08,0.40000152587890625 29 | 2019-02-11,-0.4199981689453125 30 | 2019-02-12,1.6399993896484375 31 | 2019-02-13,-0.0800018310546875 32 | 2019-02-14,0.09000396728515625 33 | 2019-02-15,1.3199996948242188 34 | 2019-02-19,-0.0500030517578125 35 | 2019-02-20,-1.0199966430664062 36 | 2019-02-21,2.2600021362304688 37 | 2019-02-22,1.55999755859375 38 | 2019-02-25,0.6199951171875 39 | 2019-02-26,0.7700042724609375 40 | 2019-02-27,-0.19000244140625 41 | 2019-02-28,-0.1399993896484375 42 | 2019-03-01,0.5 43 | 2019-03-04,-0.26999664306640625 44 | 2019-03-05,-0.5600051879882812 45 | 2019-03-06,0.0500030517578125 46 | 2019-03-07,-1.3600006103515625 47 | 2019-03-08,0.12000274658203125 48 | 2019-03-11,2.3199996948242188 49 | 2019-03-12,0.7900009155273438 50 | 2019-03-13,0.8799972534179688 51 | 2019-03-14,0.089996337890625 52 | 2019-03-15,1.32000732421875 53 | 2019-03-18,1.6599960327148438 54 | 2019-03-19,0.0800018310546875 55 | 2019-03-20,-0.1300048828125 56 | 2019-03-21,2.7000045776367188 57 | 
2019-03-22,-3.1699981689453125 58 | 2019-03-25,0.6100006103515625 59 | 2019-03-26,0.25 60 | 2019-03-27,-1.1400070190429688 61 | 2019-03-28,0.160003662109375 62 | 2019-03-29,1.0100021362304688 63 | 2019-04-01,1.0799942016601562 64 | 2019-04-02,0.17000579833984375 65 | 2019-04-03,0.779998779296875 66 | 2019-04-04,-0.6100006103515625 67 | 2019-04-05,0.529998779296875 68 | 2019-04-08,0.04000091552734375 69 | 2019-04-09,-0.6500015258789062 70 | 2019-04-10,0.910003662109375 71 | 2019-04-11,0.1399993896484375 72 | 2019-04-12,0.6199951171875 73 | 2019-04-15,0.100006103515625 74 | 2019-04-16,-0.28000640869140625 75 | 2019-04-17,1.0 76 | 2019-04-18,1.600006103515625 77 | 2019-04-22,0.3899993896484375 78 | 2019-04-23,1.6800003051757812 79 | 2019-04-24,-0.43000030517578125 80 | 2019-04-25,4.139991760253906 81 | 2019-04-26,0.7400054931640625 82 | 2019-04-29,-0.1199951171875 83 | 2019-04-30,0.8300018310546875 84 | 2019-05-01,-2.7200088500976562 85 | 2019-05-02,-1.6699981689453125 86 | 2019-05-03,2.6899948120117188 87 | 2019-05-06,-0.75 88 | 2019-05-07,-2.6299972534179688 89 | 2019-05-08,-0.0099945068359375 90 | 2019-05-09,-0.01000213623046875 91 | 2019-05-10,1.6299972534179688 92 | 2019-05-13,-3.779998779296875 93 | 2019-05-14,1.3800048828125 94 | 2019-05-15,1.2899932861328125 95 | 2019-05-16,2.9099960327148438 96 | 2019-05-17,-0.8599853515625 97 | 2019-05-20,-1.850006103515625 98 | 2019-05-21,0.6800003051757812 99 | 2019-05-22,0.7699966430664062 100 | 2019-05-23,-1.4899978637695312 101 | 2019-05-24,0.05999755859375 102 | 2019-05-28,-0.07999420166015625 103 | 2019-05-29,-1.220001220703125 104 | 2019-05-30,0.7900009155273438 105 | 2019-05-31,-2.0500030517578125 106 | 2019-06-03,-3.8400039672851562 107 | 2019-06-04,3.32000732421875 108 | 2019-06-05,2.6699981689453125 109 | 2019-06-06,1.9899978637695312 110 | 2019-06-07,3.5799942016601562 111 | 2019-06-10,1.20001220703125 112 | 2019-06-11,-0.5 113 | 2019-06-12,-0.6100006103515625 114 | 2019-06-13,0.8300018310546875 115 | 
2019-06-14,0.1299896240234375 116 | 2019-06-17,0.4000091552734375 117 | 2019-06-18,2.30999755859375 118 | 2019-06-19,0.529998779296875 119 | 2019-06-20,1.2599945068359375 120 | 2019-06-21,0.0200042724609375 121 | 2019-06-24,0.80999755859375 122 | 2019-06-25,-4.350006103515625 123 | 2019-06-26,0.5 124 | 2019-06-27,0.220001220703125 125 | 2019-06-28,-0.1899871826171875 126 | 2019-07-01,1.7199859619140625 127 | 2019-07-02,0.9000091552734375 128 | 2019-07-03,0.8800048828125 129 | 2019-07-05,-0.4000091552734375 130 | 2019-07-08,-0.0999908447265625 131 | 2019-07-09,-0.5 132 | 2019-07-10,1.3899993896484375 133 | 2019-07-11,0.54998779296875 134 | 2019-07-12,0.5 135 | 2019-07-15,0.0 136 | 2019-07-16,-1.8199920654296875 137 | 2019-07-17,-0.80999755859375 138 | 2019-07-18,0.149993896484375 139 | 2019-07-19,0.1999969482421875 140 | 2019-07-22,1.80999755859375 141 | 2019-07-23,0.8600006103515625 142 | 2019-07-24,1.4300079345703125 143 | 2019-07-25,-0.529998779296875 144 | 2019-07-26,1.149993896484375 145 | 2019-07-29,-0.30999755859375 146 | 2019-07-30,-0.67999267578125 147 | 2019-07-31,-4.0800018310546875 148 | 2019-08-01,1.7899932861328125 149 | 2019-08-02,-1.160003662109375 150 | 2019-08-05,-4.6899871826171875 151 | 2019-08-06,2.4799957275390625 152 | 2019-08-07,0.589996337890625 153 | 2019-08-08,3.6100006103515625 154 | 2019-08-09,-1.17999267578125 155 | 2019-08-12,-1.920013427734375 156 | 2019-08-13,2.8100128173828125 157 | 2019-08-14,-4.6200103759765625 158 | 2019-08-15,-0.3000030517578125 159 | 2019-08-16,2.45001220703125 160 | 2019-08-19,2.279998779296875 161 | 2019-08-20,-1.1500091552734375 162 | 2019-08-21,1.529998779296875 163 | 2019-08-22,-1.0099945068359375 164 | 2019-08-23,-4.3899993896484375 165 | 2019-08-26,2.05999755859375 166 | 2019-08-27,0.290008544921875 167 | 2019-08-28,-0.1800079345703125 168 | 2019-08-29,2.55999755859375 169 | 2019-08-30,-0.2599945068359375 170 | 2019-09-03,-1.82000732421875 171 | 2019-09-04,1.5900115966796875 172 | 
2019-09-05,2.4199981689453125 173 | 2019-09-06,-0.9499969482421875 174 | 2019-09-09,-1.5800018310546875 175 | 2019-09-10,-1.44000244140625 176 | 2019-09-11,0.0399932861328125 177 | 2019-09-12,1.4000091552734375 178 | 2019-09-13,-0.1999969482421875 179 | 2019-09-16,-0.9900054931640625 180 | 2019-09-17,1.05999755859375 181 | 2019-09-18,1.1300048828125 182 | 2019-09-19,2.5500030517578125 183 | 2019-09-20,-1.6300048828125 184 | 2019-09-23,-0.3000030517578125 185 | 2019-09-24,-1.7599945068359375 186 | 2019-09-25,1.9799957275390625 187 | 2019-09-26,0.17999267578125 188 | 2019-09-27,-1.80999755859375 189 | 2019-09-30,1.3000030517578125 190 | 2019-10-01,-1.959991455078125 191 | 2019-10-02,-2.420013427734375 192 | 2019-10-03,1.6300048828125 193 | 2019-10-04,1.839996337890625 194 | 2019-10-07,-1.0 195 | 2019-10-08,-1.4499969482421875 196 | 2019-10-09,2.57000732421875 197 | 2019-10-10,0.8600006103515625 198 | 2019-10-11,0.579986572265625 199 | 2019-10-14,-0.1299896240234375 200 | 2019-10-15,2.0200042724609375 201 | 2019-10-16,-1.160003662109375 202 | 2019-10-17,-0.720001220703125 203 | 2019-10-18,-2.279998779296875 204 | 2019-10-21,1.019989013671875 205 | 2019-10-22,-2.05999755859375 206 | 2019-10-23,0.8700103759765625 207 | 2019-10-24,2.6999969482421875 208 | 2019-10-25,0.7899932861328125 209 | 2019-10-28,3.4600067138671875 210 | 2019-10-29,-1.3600006103515625 211 | 2019-10-30,1.779998779296875 212 | 2019-10-31,-1.2400054931640625 213 | 2019-11-01,0.350006103515625 214 | 2019-11-04,0.8300018310546875 215 | 2019-11-05,-0.089996337890625 216 | 2019-11-06,-0.4000091552734375 217 | 2019-11-07,0.1999969482421875 218 | 2019-11-08,1.70001220703125 219 | 2019-11-11,0.149993896484375 220 | 2019-11-12,0.9600067138671875 221 | 2019-11-13,0.239990234375 222 | 2019-11-14,0.75 223 | 2019-11-15,1.910003662109375 224 | 2019-11-18,0.3699951171875 225 | 2019-11-19,0.0500030517578125 226 | 2019-11-20,-0.7700042724609375 227 | 2019-11-21,-0.1399993896484375 228 | 2019-11-22,0.1100006103515625 
229 | 2019-11-25,1.6399993896484375 230 | 2019-11-26,0.8000030517578125 231 | 2019-11-27,0.290008544921875 232 | 2019-11-29,-0.94000244140625 233 | 2019-12-02,-1.8300018310546875 234 | 2019-12-03,-0.2400054931640625 235 | 2019-12-04,0.540008544921875 236 | 2019-12-05,0.079986572265625 237 | 2019-12-06,1.82000732421875 238 | 2019-12-09,-0.3899993896484375 239 | 2019-12-10,-0.2299957275390625 240 | 2019-12-11,0.5699920654296875 241 | 2019-12-12,1.540008544921875 242 | 2019-12-13,1.2899932861328125 243 | 2019-12-16,1.0 244 | 2019-12-17,-0.839996337890625 245 | 2019-12-18,-0.32000732421875 246 | 2019-12-19,1.3400115966796875 247 | 2019-12-20,1.6999969482421875 248 | 2019-12-23,0.0 249 | 2019-12-24,-0.029998779296875 250 | 2019-12-26,1.2899932861328125 251 | 2019-12-27,0.290008544921875 252 | 2019-12-30,-1.3700103759765625 253 | 2019-12-31,0.1100006103515625 254 | 2020-01-02,2.9199981689453125 255 | 2020-01-03,-2.0 256 | 2020-01-06,0.410003662109375 257 | 2020-01-07,-1.4499969482421875 258 | 2020-01-08,2.5099945068359375 259 | 2020-01-09,2.0 260 | 2020-01-10,-0.75 261 | 2020-01-13,1.94000244140625 262 | 2020-01-14,-1.149993896484375 263 | 2020-01-15,1.04998779296875 264 | 2020-01-16,2.9900054931640625 265 | 2020-01-17,0.9300079345703125 266 | 2020-01-21,-0.600006103515625 267 | 2020-01-22,-0.8000030517578125 268 | 2020-01-23,1.0200042724609375 269 | 2020-01-24,-1.6800079345703125 270 | 2020-01-27,-2.7599945068359375 271 | 2020-01-28,3.1800079345703125 272 | 2020-01-29,2.579986572265625 273 | 2020-01-30,4.7400054931640625 274 | 2020-01-31,-2.5500030517578125 275 | 2020-02-03,4.1500091552734375 276 | 2020-02-04,5.739990234375 277 | 2020-02-05,-0.220001220703125 278 | 2020-02-06,3.730010986328125 279 | 2020-02-07,0.2599945068359375 280 | 2020-02-10,4.80999755859375 281 | 2020-02-11,-4.2599945068359375 282 | 2020-02-12,0.2700042724609375 283 | 2020-02-13,-1.0 284 | 2020-02-14,1.6399993896484375 285 | 2020-02-18,1.8799896240234375 286 | 2020-02-19,0.0500030517578125 287 | 
2020-02-20,-2.8600006103515625 288 | 2020-02-21,-5.8300018310546875 289 | 2020-02-24,-7.6999969482421875 290 | 2020-02-25,-2.8199920654296875 291 | 2020-02-26,2.0999908447265625 292 | 2020-02-27,-11.990005493164062 293 | 2020-02-28,3.8300018310546875 294 | 2020-03-02,10.779998779296875 295 | 2020-03-03,-8.279998779296875 296 | 2020-03-04,6.040008544921875 297 | 2020-03-05,-4.279998779296875 298 | 2020-03-06,-4.6999969482421875 299 | 2020-03-09,-10.95001220703125 300 | 2020-03-10,10.300003051757812 301 | 2020-03-11,-7.2899932861328125 302 | 2020-03-12,-14.57000732421875 303 | 2020-03-13,19.770004272460938 304 | 2020-03-16,-23.410003662109375 305 | 2020-03-17,11.150009155273438 306 | 2020-03-18,-6.170013427734375 307 | 2020-03-19,2.3100128173828125 308 | 2020-03-20,-5.3600006103515625 309 | 2020-03-23,-1.3700103759765625 310 | 2020-03-24,12.360000610351562 311 | 2020-03-25,-1.4199981689453125 312 | 2020-03-26,9.19000244140625 313 | 2020-03-27,-6.410003662109375 314 | 2020-03-30,10.529998779296875 315 | 2020-03-31,-2.519989013671875 316 | 2020-04-01,-5.600006103515625 317 | 2020-04-02,3.149993896484375 318 | 2020-04-03,-1.42999267578125 319 | 2020-04-06,11.44000244140625 320 | 2020-04-07,-1.779998779296875 321 | 2020-04-08,1.6399993896484375 322 | 2020-04-09,0.0099945068359375 323 | 2020-04-13,0.3699951171875 324 | 2020-04-14,8.19000244140625 325 | 2020-04-15,-1.8199920654296875 326 | 2020-04-16,5.1599884033203125 327 | 2020-04-17,1.5600128173828125 328 | 2020-04-20,-3.540008544921875 329 | 2020-04-21,-7.239990234375 330 | 2020-04-22,5.6999969482421875 331 | 2020-04-23,-2.100006103515625 332 | 2020-04-24,3.1300048828125 333 | 2020-04-27,-0.5 334 | 2020-04-28,-4.2400054931640625 335 | 2020-04-29,7.6199951171875 336 | 2020-04-30,1.7800140380859375 337 | 2020-05-01,-4.6399993896484375 338 | 2020-05-04,4.269989013671875 339 | 2020-05-05,1.9199981689453125 340 | 2020-05-06,1.779998779296875 341 | 2020-05-07,1.0600128173828125 342 | 2020-05-08,1.079986572265625 343 | 
2020-05-11,2.0600128173828125 344 | 2020-05-12,-4.230010986328125 345 | 2020-05-13,-2.7599945068359375 346 | 2020-05-14,0.779998779296875 347 | 2020-05-15,2.6300048828125 348 | 2020-05-18,1.75 349 | 2020-05-19,-1.279998779296875 350 | 2020-05-20,2.029998779296875 351 | 2020-05-21,-2.230010986328125 352 | 2020-05-22,0.0800018310546875 353 | 2020-05-26,-1.9399871826171875 354 | 2020-05-27,0.239990234375 355 | 2020-05-28,-0.410003662109375 356 | 2020-05-29,1.850006103515625 357 | 2020-06-01,-0.4199981689453125 358 | 2020-06-02,2.0800018310546875 359 | 2020-06-03,0.4499969482421875 360 | 2020-06-04,-2.44000244140625 361 | 2020-06-05,4.279998779296875 362 | 2020-06-08,1.160003662109375 363 | 2020-06-09,1.44000244140625 364 | 2020-06-10,7.0399932861328125 365 | 2020-06-11,-10.569992065429688 366 | 2020-06-12,1.470001220703125 367 | 2020-06-15,1.1999969482421875 368 | 2020-06-16,4.6300048828125 369 | 2020-06-17,0.6699981689453125 370 | 2020-06-18,2.0800018310546875 371 | 2020-06-19,-1.170013427734375 372 | 2020-06-22,5.420013427734375 373 | 2020-06-23,1.339996337890625 374 | 2020-06-24,-4.07000732421875 375 | 2020-06-25,2.5 376 | 2020-06-26,-4.0099945068359375 377 | 2020-06-29,2.1100006103515625 378 | 2020-06-30,5.0699920654296875 379 | 2020-07-01,1.19000244140625 380 | 2020-07-02,1.55999755859375 381 | 2020-07-06,4.44000244140625 382 | 2020-07-07,-2.4499969482421875 383 | 2020-07-08,4.5800018310546875 384 | 2020-07-09,1.4900054931640625 385 | 2020-07-10,-0.6500091552734375 386 | 2020-07-13,-6.5999908447265625 387 | 2020-07-14,1.279998779296875 388 | 2020-07-15,-0.3100128173828125 389 | 2020-07-16,-4.1199951171875 390 | 2020-07-17,-1.0399932861328125 391 | 2020-07-20,8.720001220703125 392 | 2020-07-21,-2.850006103515625 393 | 2020-07-22,3.0 394 | 2020-07-23,-9.210006713867188 395 | 2020-07-24,-1.239990234375 396 | 2020-07-27,2.5500030517578125 397 | 2020-07-28,-1.8300018310546875 398 | 2020-07-29,2.0399932861328125 399 | 2020-07-30,-0.160003662109375 400 | 
2020-07-31,1.1100006103515625 401 | 2020-08-03,11.529998779296875 402 | 2020-08-04,-3.25 403 | 2020-08-05,-0.3499908447265625 404 | 2020-08-06,3.410003662109375 405 | 2020-08-07,-3.8700103759765625 406 | 2020-08-10,-4.2299957275390625 407 | 2020-08-11,-4.8699951171875 408 | 2020-08-12,5.80999755859375 409 | 2020-08-13,-0.4900054931640625 410 | 2020-08-14,0.1999969482421875 411 | 2020-08-17,1.3800048828125 412 | 2020-08-18,1.2100067138671875 413 | 2020-08-19,-1.790008544921875 414 | 2020-08-20,4.8800048828125 415 | 2020-08-21,-1.55999755859375 416 | 2020-08-24,0.6699981689453125 417 | 2020-08-25,2.779998779296875 418 | 2020-08-26,4.67999267578125 419 | 2020-08-27,5.4300079345703125 420 | 2020-08-28,2.3300018310546875 421 | 2020-08-31,-3.3800048828125 422 | 2020-09-01,1.7400054931640625 423 | 2020-09-02,4.3799896240234375 424 | 2020-09-03,-14.349990844726562 425 | 2020-09-04,-3.0500030517578125 426 | 2020-09-08,-11.589996337890625 427 | 2020-09-09,8.629989624023438 428 | 2020-09-10,-5.9199981689453125 429 | 2020-09-11,-1.339996337890625 430 | 2020-09-14,1.3800048828125 431 | 2020-09-15,3.3699951171875 432 | 2020-09-16,-3.7299957275390625 433 | 2020-09-17,-2.1399993896484375 434 | 2020-09-18,-2.5200042724609375 435 | 2020-09-21,2.149993896484375 436 | 2020-09-22,4.8800048828125 437 | 2020-09-23,-6.8300018310546875 438 | 2020-09-24,2.600006103515625 439 | 2020-09-25,4.6300048828125 440 | 2020-09-28,1.6199951171875 441 | 2020-09-29,-2.1800079345703125 442 | 2020-09-30,3.07000732421875 443 | 2020-10-01,2.1300048828125 444 | 2020-10-02,-6.2700042724609375 445 | 2020-10-05,4.19000244140625 446 | 2020-10-06,-4.470001220703125 447 | 2020-10-07,3.9199981689453125 448 | 2020-10-08,0.75 449 | 2020-10-09,5.2299957275390625 450 | 2020-10-12,5.589996337890625 451 | 2020-10-13,1.4600067138671875 452 | 2020-10-14,-2.0 453 | 2020-10-15,-1.1999969482421875 454 | 2020-10-16,0.0 455 | 2020-10-19,-5.44000244140625 456 | 2020-10-20,0.42999267578125 457 | 2020-10-21,0.1500091552734375 458 
| 2020-10-22,0.089996337890625 459 | 2020-10-23,1.339996337890625 460 | 2020-10-26,-6.149993896484375 461 | 2020-10-27,3.1699981689453125 462 | 2020-10-28,-10.57000732421875 463 | 2020-10-29,2.040008544921875 464 | 2020-10-30,-2.25 465 | 2020-11-02,-0.1399993896484375 466 | 2020-11-03,4.0999908447265625 467 | 2020-11-04,9.960006713867188 468 | 2020-11-05,6.899993896484375 469 | 2020-11-06,0.4300079345703125 470 | 2020-11-09,-5.3300018310546875 471 | 2020-11-10,-7.3800048828125 472 | 2020-11-11,5.540008544921875 473 | 2020-11-12,-1.1100006103515625 474 | 2020-11-13,1.0699920654296875 475 | 2020-11-16,0.720001220703125 476 | 2020-11-17,-2.769989013671875 477 | 2020-11-18,-3.3800048828125 478 | 2020-11-19,1.339996337890625 479 | 2020-11-20,-2.029998779296875 480 | 2020-11-23,-0.279998779296875 481 | -------------------------------------------------------------------------------- /codes/datasets/diff_train_aapl.csv: -------------------------------------------------------------------------------- 1 | Date,AAPL 2 | 2019-01-02,0.0449981689453125 3 | 2019-01-03,-3.9324989318847656 4 | 2019-01-04,1.5174980163574219 5 | 2019-01-07,-0.08250045776367188 6 | 2019-01-08,0.7050018310546875 7 | 2019-01-09,0.6399993896484375 8 | 2019-01-10,0.12250137329101562 9 | 2019-01-11,-0.37750244140625 10 | 2019-01-14,-0.5724983215332031 11 | 2019-01-15,0.7675018310546875 12 | 2019-01-16,0.467498779296875 13 | 2019-01-17,0.22999954223632812 14 | 2019-01-18,0.24000167846679688 15 | 2019-01-22,-0.8800010681152344 16 | 2019-01-23,0.154998779296875 17 | 2019-01-24,-0.30500030517578125 18 | 2019-01-25,1.2649993896484375 19 | 2019-01-28,-0.36499786376953125 20 | 2019-01-29,-0.4050025939941406 21 | 2019-01-30,2.6425018310546875 22 | 2019-01-31,0.2975006103515625 23 | 2019-02-01,0.020000457763671875 24 | 2019-02-04,1.1824989318847656 25 | 2019-02-05,0.7324981689453125 26 | 2019-02-06,0.015003204345703125 27 | 2019-02-07,-0.8250007629394531 28 | 2019-02-08,-0.13249969482421875 29 | 
2019-02-11,-0.24500274658203125 30 | 2019-02-12,0.3650016784667969 31 | 2019-02-13,-0.17750167846679688 32 | 2019-02-14,0.15500259399414062 33 | 2019-02-15,-0.095001220703125 34 | 2019-02-19,0.12749862670898438 35 | 2019-02-20,0.27500152587890625 36 | 2019-02-21,-0.24250030517578125 37 | 2019-02-22,0.47750091552734375 38 | 2019-02-25,0.3149986267089844 39 | 2019-02-26,0.02500152587890625 40 | 2019-02-27,0.13499832153320312 41 | 2019-02-28,-0.43000030517578125 42 | 2019-03-01,0.4550018310546875 43 | 2019-03-04,0.220001220703125 44 | 2019-03-05,-0.0800018310546875 45 | 2019-03-06,-0.2524986267089844 46 | 2019-03-07,-0.5050010681152344 47 | 2019-03-08,0.10250091552734375 48 | 2019-03-11,1.49749755859375 49 | 2019-03-12,0.50250244140625 50 | 2019-03-13,0.20000076293945312 51 | 2019-03-14,0.5049972534179688 52 | 2019-03-15,0.5974998474121094 53 | 2019-03-18,0.4750022888183594 54 | 2019-03-19,-0.3725013732910156 55 | 2019-03-20,0.407501220703125 56 | 2019-03-21,1.7324981689453125 57 | 2019-03-22,-1.0099983215332031 58 | 2019-03-25,-0.5774993896484375 59 | 2019-03-26,-0.4875030517578125 60 | 2019-03-27,0.4200019836425781 61 | 2019-03-28,0.0625 62 | 2019-03-29,0.3074989318847656 63 | 2019-04-01,0.32250213623046875 64 | 2019-04-02,0.6949996948242188 65 | 2019-04-03,0.3325004577636719 66 | 2019-04-04,0.08499908447265625 67 | 2019-04-05,0.3274993896484375 68 | 2019-04-08,0.7750015258789062 69 | 2019-04-09,-0.15000152587890625 70 | 2019-04-10,0.279998779296875 71 | 2019-04-11,-0.4174995422363281 72 | 2019-04-12,-0.020000457763671875 73 | 2019-04-15,0.09000015258789062 74 | 2019-04-16,0.005001068115234375 75 | 2019-04-17,0.970001220703125 76 | 2019-04-18,0.18249893188476562 77 | 2019-04-22,0.16749954223632812 78 | 2019-04-23,0.7374992370605469 79 | 2019-04-24,-0.07999801635742188 80 | 2019-04-25,-0.470001220703125 81 | 2019-04-26,-0.24499893188476562 82 | 2019-04-29,0.0774993896484375 83 | 2019-04-30,-0.9850006103515625 84 | 2019-05-01,2.4625015258789062 85 | 
2019-05-02,-0.3425025939941406 86 | 2019-05-03,0.6500015258789062 87 | 2019-05-06,-0.8175010681152344 88 | 2019-05-07,-1.404998779296875 89 | 2019-05-08,0.009998321533203125 90 | 2019-05-09,-0.5449981689453125 91 | 2019-05-10,-0.8850021362304688 92 | 2019-05-13,-2.8649978637695312 93 | 2019-05-14,0.7350006103515625 94 | 2019-05-15,0.5649986267089844 95 | 2019-05-16,-0.20999908447265625 96 | 2019-05-17,-0.2700004577636719 97 | 2019-05-20,-1.4775009155273438 98 | 2019-05-21,0.87750244140625 99 | 2019-05-22,-0.9550018310546875 100 | 2019-05-23,-0.779998779296875 101 | 2019-05-24,-0.1725006103515625 102 | 2019-05-28,-0.18500137329101562 103 | 2019-05-29,-0.21249771118164062 104 | 2019-05-30,0.22999954223632812 105 | 2019-05-31,-0.8074989318847656 106 | 2019-06-03,-0.4425010681152344 107 | 2019-06-04,1.5849990844726562 108 | 2019-06-05,0.7249984741210938 109 | 2019-06-06,0.6700019836425781 110 | 2019-06-07,1.2324981689453125 111 | 2019-06-10,0.6075019836425781 112 | 2019-06-11,0.5574989318847656 113 | 2019-06-12,-0.154998779296875 114 | 2019-06-13,-0.01000213623046875 115 | 2019-06-14,-0.3524971008300781 116 | 2019-06-17,0.28749847412109375 117 | 2019-06-18,1.1399993896484375 118 | 2019-06-19,-0.14500045776367188 119 | 2019-06-20,0.3975028991699219 120 | 2019-06-21,-0.17000198364257812 121 | 2019-06-24,-0.049999237060546875 122 | 2019-06-25,-0.7524986267089844 123 | 2019-06-26,1.0574989318847656 124 | 2019-06-27,-0.0149993896484375 125 | 2019-06-28,-0.4550018310546875 126 | 2019-07-01,0.907501220703125 127 | 2019-07-02,0.2949981689453125 128 | 2019-07-03,0.4200019836425781 129 | 2019-07-05,-0.045001983642578125 130 | 2019-07-08,-1.0524978637695312 131 | 2019-07-09,0.30500030517578125 132 | 2019-07-10,0.49749755859375 133 | 2019-07-11,-0.3699989318847656 134 | 2019-07-12,0.3875007629394531 135 | 2019-07-15,0.47750091552734375 136 | 2019-07-16,-0.17750167846679688 137 | 2019-07-17,-0.28749847412109375 138 | 2019-07-18,0.5774993896484375 139 | 
2019-07-19,-0.7675018310546875 140 | 2019-07-22,1.157501220703125 141 | 2019-07-23,0.404998779296875 142 | 2019-07-24,-0.042499542236328125 143 | 2019-07-25,-0.41249847412109375 144 | 2019-07-26,0.18000030517578125 145 | 2019-07-29,0.4849967956542969 146 | 2019-07-30,-0.22499847412109375 147 | 2019-07-31,1.0649986267089844 148 | 2019-08-01,-1.1525001525878906 149 | 2019-08-02,-1.1024971008300781 150 | 2019-08-05,-2.670001983642578 151 | 2019-08-06,0.9150009155273438 152 | 2019-08-07,0.5099983215332031 153 | 2019-08-08,1.0974998474121094 154 | 2019-08-09,-0.6099967956542969 155 | 2019-08-12,-0.12750244140625 156 | 2019-08-13,2.1225013732910156 157 | 2019-08-14,-1.5550003051757812 158 | 2019-08-15,-0.2524986267089844 159 | 2019-08-16,1.1899986267089844 160 | 2019-08-19,0.9625015258789062 161 | 2019-08-20,0.002498626708984375 162 | 2019-08-21,0.5699996948242188 163 | 2019-08-22,-0.0449981689453125 164 | 2019-08-23,-2.4550018310546875 165 | 2019-08-26,0.9625015258789062 166 | 2019-08-27,-0.5825004577636719 167 | 2019-08-28,0.342498779296875 168 | 2019-08-29,0.8699989318847656 169 | 2019-08-30,-0.06749725341796875 170 | 2019-09-03,-0.7600021362304688 171 | 2019-09-04,0.8725013732910156 172 | 2019-09-05,1.0224990844726562 173 | 2019-09-06,-0.005001068115234375 174 | 2019-09-09,0.22750091552734375 175 | 2019-09-10,0.6324996948242188 176 | 2019-09-11,1.7224998474121094 177 | 2019-09-12,-0.125 178 | 2019-09-13,-1.0849990844726562 179 | 2019-09-16,0.28749847412109375 180 | 2019-09-17,0.20000076293945312 181 | 2019-09-18,0.5175018310546875 182 | 2019-09-19,-0.4524993896484375 183 | 2019-09-20,-0.8075027465820312 184 | 2019-09-23,0.24750137329101562 185 | 2019-09-24,-0.26000213623046875 186 | 2019-09-25,0.8375015258789062 187 | 2019-09-26,-0.2849998474121094 188 | 2019-09-27,-0.2674980163574219 189 | 2019-09-30,1.2874984741210938 190 | 2019-10-01,0.154998779296875 191 | 2019-10-02,-1.4074974060058594 192 | 2019-10-03,0.4650001525878906 193 | 2019-10-04,1.5474967956542969 194 | 
2019-10-07,0.012500762939453125 195 | 2019-10-08,-0.6650009155273438 196 | 2019-10-09,0.657501220703125 197 | 2019-10-10,0.7649993896484375 198 | 2019-10-11,1.5300025939941406 199 | 2019-10-14,-0.08500289916992188 200 | 2019-10-15,-0.1374969482421875 201 | 2019-10-16,-0.2375030517578125 202 | 2019-10-17,0.22750091552734375 203 | 2019-10-18,0.282501220703125 204 | 2019-10-21,1.0249977111816406 205 | 2019-10-22,-0.1374969482421875 206 | 2019-10-23,0.8049964904785156 207 | 2019-10-24,0.10000228881835938 208 | 2019-10-25,0.75 209 | 2019-10-28,0.6175003051757812 210 | 2019-10-29,-1.44000244140625 211 | 2019-10-30,-0.00749969482421875 212 | 2019-10-31,1.375 213 | 2019-11-01,1.7650032043457031 214 | 2019-11-04,0.4199981689453125 215 | 2019-11-05,-0.092498779296875 216 | 2019-11-06,0.027496337890625 217 | 2019-11-07,0.5475006103515625 218 | 2019-11-08,0.1775054931640625 219 | 2019-11-11,0.5149993896484375 220 | 2019-11-12,-0.06000518798828125 221 | 2019-11-13,0.62750244140625 222 | 2019-11-14,-0.45749664306640625 223 | 2019-11-15,0.779998779296875 224 | 2019-11-18,0.33499908447265625 225 | 2019-11-19,-0.2024993896484375 226 | 2019-11-20,-0.7750015258789062 227 | 2019-11-21,-0.2949981689453125 228 | 2019-11-22,-0.05750274658203125 229 | 2019-11-25,1.1474990844726562 230 | 2019-11-26,-0.5199966430664062 231 | 2019-11-27,0.8874969482421875 232 | 2019-11-29,-0.14749908447265625 233 | 2019-12-02,-0.7724990844726562 234 | 2019-12-03,-1.1774978637695312 235 | 2019-12-04,0.5724945068359375 236 | 2019-12-05,0.9599990844726562 237 | 2019-12-06,1.282501220703125 238 | 2019-12-09,-0.9474945068359375 239 | 2019-12-10,0.3899993896484375 240 | 2019-12-11,0.5724945068359375 241 | 2019-12-12,0.1725006103515625 242 | 2019-12-13,0.9225006103515625 243 | 2019-12-16,1.1774978637695312 244 | 2019-12-17,0.13750457763671875 245 | 2019-12-18,-0.16750335693359375 246 | 2019-12-19,0.06999969482421875 247 | 2019-12-20,-0.14499664306640625 248 | 2019-12-23,1.1399993896484375 249 | 
2019-12-24,0.06749725341796875 250 | 2019-12-26,1.410003662109375 251 | 2019-12-27,-0.02750396728515625 252 | 2019-12-30,0.43000030517578125 253 | 2019-12-31,0.532501220703125 254 | 2020-01-02,1.6750030517578125 255 | 2020-01-03,-0.7300033569335938 256 | 2020-01-06,0.592498779296875 257 | 2020-01-07,-0.3524932861328125 258 | 2020-01-08,1.1999969482421875 259 | 2020-01-09,1.6100006103515625 260 | 2020-01-10,0.17499542236328125 261 | 2020-01-13,1.657501220703125 262 | 2020-01-14,-1.0699996948242188 263 | 2020-01-15,-0.33499908447265625 264 | 2020-01-16,0.9749984741210938 265 | 2020-01-17,0.8725051879882812 266 | 2020-01-21,-0.5400009155273438 267 | 2020-01-22,0.282501220703125 268 | 2020-01-23,0.38249969482421875 269 | 2020-01-24,-0.23000335693359375 270 | 2020-01-27,-2.339996337890625 271 | 2020-01-28,2.18499755859375 272 | 2020-01-29,1.6624984741210938 273 | 2020-01-30,-0.11750030517578125 274 | 2020-01-31,-3.589996337890625 275 | 2020-02-03,-0.21250152587890625 276 | 2020-02-04,2.5475006103515625 277 | 2020-02-05,0.6500015258789062 278 | 2020-02-06,0.9399948120117188 279 | 2020-02-07,-1.2949981689453125 280 | 2020-02-10,0.37999725341796875 281 | 2020-02-11,-0.4850006103515625 282 | 2020-02-12,1.8975067138671875 283 | 2020-02-13,-0.5825042724609375 284 | 2020-02-14,0.0200042724609375 285 | 2020-02-18,-1.4875030517578125 286 | 2020-02-19,1.154998779296875 287 | 2020-02-20,-0.8300018310546875 288 | 2020-02-21,-1.8125 289 | 2020-02-24,-3.717498779296875 290 | 2020-02-25,-2.5250015258789062 291 | 2020-02-26,1.1425018310546875 292 | 2020-02-27,-4.782501220703125 293 | 2020-02-28,-0.04000091552734375 294 | 2020-03-02,6.3625030517578125 295 | 2020-03-03,-2.37249755859375 296 | 2020-03-04,3.3549957275390625 297 | 2020-03-05,-2.4549942016601562 298 | 2020-03-06,-0.972503662109375 299 | 2020-03-09,-5.714996337890625 300 | 2020-03-10,4.7924957275390625 301 | 2020-03-11,-2.4775009155273438 302 | 2020-03-12,-6.799999237060547 303 | 2020-03-13,7.435001373291016 304 | 
2020-03-16,-8.939998626708984 305 | 2020-03-17,2.6624984741210938 306 | 2020-03-18,-1.5475006103515625 307 | 2020-03-19,-0.4724998474121094 308 | 2020-03-20,-3.884998321533203 309 | 2020-03-23,-1.2175025939941406 310 | 2020-03-24,5.62750244140625 311 | 2020-03-25,-0.3400001525878906 312 | 2020-03-26,3.229999542236328 313 | 2020-03-27,-2.674999237060547 314 | 2020-03-30,1.7674980163574219 315 | 2020-03-31,-0.13000106811523438 316 | 2020-04-01,-3.3449974060058594 317 | 2020-04-02,1.0049972534179688 318 | 2020-04-03,-0.8799972534179688 319 | 2020-04-06,5.2649993896484375 320 | 2020-04-07,-0.7600021362304688 321 | 2020-04-08,1.660003662109375 322 | 2020-04-09,0.4799957275390625 323 | 2020-04-13,1.31500244140625 324 | 2020-04-14,3.4499969482421875 325 | 2020-04-15,-0.654998779296875 326 | 2020-04-16,0.56500244140625 327 | 2020-04-17,-0.972503662109375 328 | 2020-04-20,-1.467498779296875 329 | 2020-04-21,-2.1399993896484375 330 | 2020-04-22,1.9325027465820312 331 | 2020-04-23,-0.2675018310546875 332 | 2020-04-24,1.9850006103515625 333 | 2020-04-27,0.0500030517578125 334 | 2020-04-28,-1.1475067138671875 335 | 2020-04-29,2.287506103515625 336 | 2020-04-30,1.5174942016601562 337 | 2020-05-01,-1.1824951171875 338 | 2020-05-04,1.0224990844726562 339 | 2020-05-05,1.0999984741210938 340 | 2020-05-06,0.7675018310546875 341 | 2020-05-07,0.777496337890625 342 | 2020-05-08,1.597503662109375 343 | 2020-05-11,1.220001220703125 344 | 2020-05-12,-0.9000015258789062 345 | 2020-05-13,-0.94000244140625 346 | 2020-05-14,0.472503662109375 347 | 2020-05-15,-0.4575042724609375 348 | 2020-05-18,1.8125 349 | 2020-05-19,-0.45499420166015625 350 | 2020-05-20,1.5224990844726562 351 | 2020-05-21,-0.595001220703125 352 | 2020-05-22,0.5100021362304688 353 | 2020-05-26,-0.5400009155273438 354 | 2020-05-27,0.34499359130859375 355 | 2020-05-28,0.035003662109375 356 | 2020-05-29,-0.0774993896484375 357 | 2020-06-01,0.9775009155273438 358 | 2020-06-02,0.37249755859375 359 | 2020-06-03,0.44499969482421875 
360 | 2020-06-04,-0.6999969482421875 361 | 2020-06-05,2.2949981689453125 362 | 2020-06-08,0.48999786376953125 363 | 2020-06-09,2.6324996948242188 364 | 2020-06-10,2.2125015258789062 365 | 2020-06-11,-4.2350006103515625 366 | 2020-06-12,0.7249984741210938 367 | 2020-06-15,1.0475006103515625 368 | 2020-06-16,2.2724990844726562 369 | 2020-06-17,-0.12249755859375 370 | 2020-06-18,0.035003662109375 371 | 2020-06-19,-0.50250244140625 372 | 2020-06-22,2.2874984741210938 373 | 2020-06-23,1.9150009155273438 374 | 2020-06-24,-1.6175003051757812 375 | 2020-06-25,1.1949996948242188 376 | 2020-06-26,-2.8024978637695312 377 | 2020-06-29,2.0374984741210938 378 | 2020-06-30,0.7549972534179688 379 | 2020-07-01,-0.1725006103515625 380 | 2020-07-02,0.0 381 | 2020-07-06,2.4350051879882812 382 | 2020-07-07,-0.29000091552734375 383 | 2020-07-08,2.1699981689453125 384 | 2020-07-09,0.410003662109375 385 | 2020-07-10,0.1674957275390625 386 | 2020-07-13,-0.44249725341796875 387 | 2020-07-14,1.5800018310546875 388 | 2020-07-15,0.6674957275390625 389 | 2020-07-16,-1.2024993896484375 390 | 2020-07-17,-0.19499969482421875 391 | 2020-07-20,2.029998779296875 392 | 2020-07-21,-1.3574981689453125 393 | 2020-07-22,0.27249908447265625 394 | 2020-07-23,-4.427497863769531 395 | 2020-07-24,-0.23000335693359375 396 | 2020-07-27,2.1949996948242188 397 | 2020-07-28,-1.5574951171875 398 | 2020-07-29,1.7874984741210938 399 | 2020-07-30,1.1500015258789062 400 | 2020-07-31,10.069999694824219 401 | 2020-08-03,2.6774978637695312 402 | 2020-08-04,0.7275009155273438 403 | 2020-08-05,0.39749908447265625 404 | 2020-08-06,3.839996337890625 405 | 2020-08-07,-2.7899932861328125 406 | 2020-08-10,1.6149978637695312 407 | 2020-08-11,-3.3525009155273438 408 | 2020-08-12,3.6350021362304688 409 | 2020-08-13,2.0 410 | 2020-08-14,-0.10250091552734375 411 | 2020-08-17,-0.3000030517578125 412 | 2020-08-18,0.9550018310546875 413 | 2020-08-19,0.14499664306640625 414 | 2020-08-20,2.5675048828125 415 | 2020-08-21,6.095001220703125 
416 | 2020-08-24,1.4874954223632812 417 | 2020-08-25,-1.032501220703125 418 | 2020-08-26,1.6975021362304688 419 | 2020-08-27,-1.5124969482421875 420 | 2020-08-28,-0.2024993896484375 421 | 2020-08-31,4.232490539550781 422 | 2020-09-01,5.1399993896484375 423 | 2020-09-02,-2.779998779296875 424 | 2020-09-03,-10.519996643066406 425 | 2020-09-04,0.0800018310546875 426 | 2020-09-08,-8.139999389648438 427 | 2020-09-09,4.5 428 | 2020-09-10,-3.8300018310546875 429 | 2020-09-11,-1.4899978637695312 430 | 2020-09-14,3.3600006103515625 431 | 2020-09-15,0.18000030517578125 432 | 2020-09-16,-3.410003662109375 433 | 2020-09-17,-1.7900009155273438 434 | 2020-09-18,-3.5 435 | 2020-09-21,3.2400054931640625 436 | 2020-09-22,1.7299957275390625 437 | 2020-09-23,-4.689994812011719 438 | 2020-09-24,1.0999984741210938 439 | 2020-09-25,4.05999755859375 440 | 2020-09-28,2.6800003051757812 441 | 2020-09-29,-0.8700027465820312 442 | 2020-09-30,1.720001220703125 443 | 2020-10-01,0.9800033569335938 444 | 2020-10-02,-3.7700042724609375 445 | 2020-10-05,3.4800033569335938 446 | 2020-10-06,-3.339996337890625 447 | 2020-10-07,1.9199981689453125 448 | 2020-10-08,-0.1100006103515625 449 | 2020-10-09,2.0 450 | 2020-10-12,7.430000305175781 451 | 2020-10-13,-3.3000030517578125 452 | 2020-10-14,0.09000396728515625 453 | 2020-10-15,-0.48000335693359375 454 | 2020-10-16,-1.69000244140625 455 | 2020-10-19,-3.0399932861328125 456 | 2020-10-20,1.529998779296875 457 | 2020-10-21,-0.6399993896484375 458 | 2020-10-22,-1.1200027465820312 459 | 2020-10-23,-0.7099990844726562 460 | 2020-10-26,0.01000213623046875 461 | 2020-10-27,1.5499954223632812 462 | 2020-10-28,-5.400001525878906 463 | 2020-10-29,4.120002746582031 464 | 2020-10-30,-6.459999084472656 465 | 2020-11-02,-0.09000396728515625 466 | 2020-11-03,1.6700057983398438 467 | 2020-11-04,4.5099945068359375 468 | 2020-11-05,4.0800018310546875 469 | 2020-11-06,-0.339996337890625 470 | 2020-11-09,-2.3700027465820312 471 | 2020-11-10,-0.34999847412109375 472 | 
# -*- coding: utf-8 -*-
'''Binary-outcome models for panel data.

FixedEffectPanelModel: conditional (fixed effects) logit.
RandomEffectsPanelModel: random effects model with a logistic link.

Both models expect a 2-level MultiIndex DataFrame whose first index level
identifies the individual; estimation groups the rows on that level.
'''
import warnings
from math import exp, sqrt, log

import numpy as np
import scipy.stats as st
import scipy.integrate as spint
from numpy.linalg import inv

# Module-level side effect kept from the original file: every warning is
# silenced for the whole process, before the package imports below run.
warnings.filterwarnings('ignore')

from pyeconometrics.base import PanelBaseModel
from pyeconometrics.utils import nCr, unique_permutations


# Up to this many distinct arrangements of an individual's outcomes are
# enumerated exactly; beyond it they are approximated by random shuffles.
_MAX_EXACT_ARRANGEMENTS = 100
# Number of random shuffles drawn when exact enumeration is too large.
_NB_SAMPLED_ARRANGEMENTS = 100


def _outcome_arrangements(y):
    '''Return the arrangements of the outcome sequence `y` to sum over.

    All distinct permutations when there are at most
    _MAX_EXACT_ARRANGEMENTS of them, otherwise _NB_SAMPLED_ARRANGEMENTS
    random permutations (so results are then random as well).
    '''
    if nCr(len(y), sum(y)) <= _MAX_EXACT_ARRANGEMENTS:
        return unique_permutations(y)
    return [np.random.permutation(y)
            for _ in range(_NB_SAMPLED_ARRANGEMENTS)]


def _without_output(X, output):
    '''Return `X` without the column `output`, never modifying `X`.

    `X` is returned unchanged when it has no `drop` method (e.g. an array)
    or when the column cannot be dropped (absent, or `output` is None).
    '''
    if hasattr(X, 'drop'):
        try:
            return X.drop(output, axis=1)
        except (KeyError, ValueError, TypeError):
            pass
    return X


def _log_sum_exp(values):
    '''Return log(sum(exp(values))) without overflowing on large values.'''
    return np.logaddexp.reduce(np.asarray(values, dtype=float))


class FixedEffectPanelModel(PanelBaseModel):
    '''Fixed Effects Logit model for Panel Data
    Estimation of parameters with the Conditional Maximum Likelihood method
    '''
    def __init__(self):
        self.name = 'Panel Fixed Effects Logit'
        self.output = None
        self.variables = None
        self.nb_obs = None
        self.init_ll = None
        self.beta = None
        self.beta_est = None
        self.beta_se = None
        self.confidence_interval = None
        self.final_ll = None
        self.converged = None

    def response_function(self, X, beta):
        '''Return the linear index X.beta as a numpy array.

        The output column is ignored when present; X is not modified.
        '''
        return np.asarray(_without_output(X, self.output)).dot(beta)

    def __log_likelihood_obs(self, X, y, beta):
        '''Conditional log-likelihood of one individual (one group).'''
        # unique_permutations indexes its argument by position, hence the
        # fresh 0..n-1 index (on a copy: the group itself stays untouched).
        y = y.reset_index(drop=True)
        Z = np.asarray(self.response_function(X, beta))

        exponents = [Z.dot(np.asarray(a)) for a in _outcome_arrangements(y)]
        return Z.dot(np.asarray(y)) - _log_sum_exp(exponents)

    def __log_likelihood(self, X, beta):
        '''Sum of the individual log-likelihoods over the groups of X.'''
        return sum(
            self.__log_likelihood_obs(group, group[self.output], beta)
            for _, group in X)

    def __conditional_probability(self, X, y, beta):
        '''Probability of the outcome sequence `y` given its total.

        X holds the regressors only (no output column).
        '''
        index = np.asarray(X).dot(beta)
        exponents = [np.asarray(z).dot(index)
                     for z in _outcome_arrangements(y)]
        # exp(a - logsumexp) equals exp(a) / sum(exp), without overflow.
        return np.exp(np.asarray(y).dot(index) - _log_sum_exp(exponents))

    def __score_obs(self, X, y, beta):
        '''Gradient of one individual's conditional log-likelihood.'''
        regressors = np.asarray(X.drop(self.output, axis=1))
        y = y.reset_index(drop=True)

        total = sum(y)
        if total == 0 or total == len(y):
            # Outcomes that never vary carry no information on beta.
            return np.zeros(regressors.shape[1])

        expected = np.sum(
            [np.asarray(z) * self.__conditional_probability(regressors, z, beta)
             for z in _outcome_arrangements(y)],
            axis=0)
        return regressors.T.dot(np.asarray(y) - expected)

    def __score(self, X, beta):
        '''Sum of the individual scores over the groups of X.'''
        return sum(
            self.__score_obs(group, group[self.output], beta)
            for _, group in X)

    def __hessian_obs(self, X, y, beta):
        '''Curvature matrix of one individual's conditional log-likelihood.

        NOTE(review): the moments below are inner products (scalars), so
        the result is -(E[z.z] - E[z].E[z]) * X'X rather than the exact
        -X'Cov(z)X. This is kept on purpose: the exact form looks singular
        in the `_cons` direction added by `fit` - confirm before changing.
        '''
        regressors = np.asarray(X.drop(self.output, axis=1))
        y = y.reset_index(drop=True)
        nb_vars = regressors.shape[1]

        total = sum(y)
        if total == 0 or total == len(y):
            return np.zeros((nb_vars, nb_vars))

        expected = 0
        second_moment = 0
        for z in _outcome_arrangements(y):
            proba = self.__conditional_probability(regressors, z, beta)
            z = np.asarray(z)
            expected = expected + z * proba
            second_moment = second_moment + z.dot(z) * proba

        spread = second_moment - expected.dot(expected)
        return -spread * regressors.T.dot(regressors)

    def __hessian(self, X, beta):
        '''Sum of the individual curvature matrices over the groups of X.'''
        return sum(
            self.__hessian_obs(group, group[self.output], beta)
            for _, group in X)

    def fit(self, X, output, nb_iter=20, drop_na=True, fill_value=None, verbose=False):
        '''Maximum Likelihood Estimation
        Implement a Newton-Raphson algorithm to estimate parameters

        Parameters:
        ----------
        X: 2-level MultiIndex Dataframe
            Database to fit the model

        output: string
            Name of the variable to predict

        nb_iter: integer (optional, default 20)
            Maximal number of iteration before the end of the Newton-Raphson algorithm

        drop_na: boolean (optional, default True)
            Indicate the method to handle missing values in X
            If drop_na = False, fill_value has to be given

        fill_value: string or dict (optional, default None)
            Considered only if drop_na = False
            Possible values:
                - 'mean': missing values of a column are replaced by the mean of that column
                - 'median': missing values of a column are replaced by the median of that column
                - dict: keys must be variables' names and associated values the values used to fill Nan

        verbose: boolean (optional, default False)
            If set to True, allows prints of Newton-Raphson algorithm's progress

        Returns:
        -------
        self, with beta, beta_se, confidence_interval, final_ll and
        converged filled in

        Raises:
        ------
        ValueError if nb_iter < 1, if the output does not take exactly the
        values 0 and 1, or if the curvature matrix cannot be inverted
        '''
        if nb_iter < 1:
            raise ValueError('nb_iter must be at least 1')

        self.output = output
        X = self.input_data_preparation(X.copy(), drop_na, fill_value)
        X.insert(0, '_cons', 1)

        labels = list(np.unique(X[self.output]))
        if labels != [0,1]:
            raise ValueError("Labels must be in the unit interval.")

        self.nb_obs = len(X)
        self.variables = [x for x in X.columns if x != self.output]

        beta_init = [0 for _ in self.variables]
        self.beta_est = np.zeros((nb_iter, len(beta_init)))
        self.beta_est[0] = beta_init

        X = X.groupby(level=0)

        self.init_ll = self.__log_likelihood(X, beta_init)

        if verbose:
            print('Initial log-likelihood : '+ str(self.init_ll))
            print('Parameters estimation in progress.')

        current_ll = self.init_ll
        prev_ll = self.init_ll
        j = 1
        while (j < nb_iter) \
                and (j == 1 or (current_ll - prev_ll > 0.01)):

            score = self.__score(X, self.beta_est[j-1])
            hessian = self.__hessian(X, self.beta_est[j-1])

            try:
                step = inv(hessian).dot(score)
            except np.linalg.LinAlgError as err:
                raise ValueError('Improper classification problem' \
                    + ', should be 2 different labels') from err
            self.beta_est[j] = self.beta_est[j-1] - step

            prev_ll = current_ll
            current_ll = self.__log_likelihood(X, self.beta_est[j])
            if verbose:
                print('Iteration %s, log-likelihood : %s'\
                    % (j, current_ll))
            j += 1

        # The last step is the one that stopped the loop, so the estimate
        # kept is the one before it (whose log-likelihood is prev_ll).
        # max() only matters when nb_iter == 1 and no step was taken.
        self.beta = self.beta_est[max(j-2, 0)]
        self.beta_est = self.beta_est[:max(j-1, 1),:]

        sqrt_vec = np.vectorize(sqrt)
        hessian = self.__hessian(X, self.beta)
        self.beta_se = sqrt_vec(-inv(hessian).diagonal())

        half_width = st.norm.ppf(0.975) * self.beta_se
        self.confidence_interval = np.column_stack(
            (self.beta - half_width, self.beta + half_width))

        self.final_ll = prev_ll
        self.converged = j < nb_iter

        return self


class RandomEffectsPanelModel(PanelBaseModel):
    '''Random Effects model for binary Panel Data

    Each observation has linear index mu + sigma * w + x.beta passed
    through a logistic link, where w is an individual effect integrated
    out numerically. residual_dist selects the density of w:
    'probit' (standard normal) or 'logit' (standard logistic).
    '''
    def __init__(self, residual_dist):
        self.name = 'Panel Random Effects Model'
        self.residual_dist = residual_dist
        self.output = None
        self.variables = None
        self.nb_obs = None
        self.init_ll = None
        self.beta = None
        self.mu = None
        self.sigma = None
        self.beta_est = None
        self.beta_se = None
        self.confidence_interval = None
        self.final_ll = None
        self.converged = None

    def response_function(self, X, beta, mu):
        '''Return mu + sum(beta[i] * X[variable i]) as a Series named 'response'.'''
        A = _without_output(X, self.output)

        Z = mu
        for i, var in enumerate(self.variables):
            # Rebinding rather than `+=`: a mutable `mu` owned by the
            # caller must never be modified in place.
            Z = Z + beta[i] * A[var]

        return Z.rename('response')

    def __calculus_tools(self, X, w, beta, mu, sigma):
        '''Return (z, gamma), both of shape (len(beta) + 2, nb rows of X).

        Column t of z is [1, w, x_t]; every column of gamma is
        [mu, sigma, beta...].
        '''
        X = np.asarray(X)
        nb_rows = X.shape[0]

        z = np.column_stack((np.ones(nb_rows), np.full(nb_rows, w), X)).T
        params = np.concatenate(([mu, sigma], np.ravel(beta)))
        gamma = np.repeat(params[:, np.newaxis], nb_rows, axis=1)

        return z, gamma

    def __linear_index(self, X, w, beta, mu, sigma):
        '''Return (z, index) with index_t = mu + sigma * w + x_t.beta.'''
        z, gamma = self.__calculus_tools(X, w, beta, mu, sigma)
        # One column of gamma is enough: they are all identical.
        return z, z.T.dot(gamma[:, 0])

    def __conditional_density_obs(self, X, w, y, beta, mu, sigma):
        '''Probability of one individual's outcomes given the effect w.'''
        _, index = self.__linear_index(X, w, beta, mu, sigma)
        # log of prod(exp(y * index) / (1 + exp(index))), overflow-free.
        return np.exp(np.sum(np.asarray(y) * index - np.logaddexp(0, index)))

    def __grad_conditional_density_obs(self, X, w, y, beta, mu, sigma):
        '''Gradient of the conditional density w.r.t. [mu, sigma, beta...].'''
        z, index = self.__linear_index(X, w, beta, mu, sigma)
        # Same value as exp(index) / (1 + exp(index)), without inf / inf.
        proba = 1.0 / (1.0 + np.exp(-index))

        result = z.dot(np.asarray(y) - proba)
        return result * self.__conditional_density_obs(X, w, y, beta, mu, sigma)

    def __integrate(self, func, sigma):
        '''Integrate func(w) * density(w) over the individual effect w.

        Raises ValueError when residual_dist is neither 'probit' nor 'logit'.
        '''
        if self.residual_dist == 'probit':
            pdf = st.norm(0, 1).pdf
        elif self.residual_dist == 'logit':
            pdf = st.logistic(0, 1).pdf
        else:
            raise ValueError('Unknown value for argument residual_dist')

        # abs(): a Newton step can make sigma negative, which used to
        # reverse the bounds and turn the likelihood negative.
        # NOTE(review): w is standardised, so fixed bounds of +/-3 may have
        # been intended instead of +/-3 * sigma - confirm.
        bound = 3 * abs(sigma)
        return spint.quad(lambda w: func(w) * pdf(w), -bound, bound)[0]

    def __log_likelihood_obs(self, X, y, beta, mu, sigma):
        '''Log-likelihood of one individual; X may still hold the output.'''
        regressors = np.asarray(_without_output(X, self.output))
        y = np.asarray(y)

        result = self.__integrate(
            lambda w: self.__conditional_density_obs(
                regressors, w, y, beta, mu, sigma),
            sigma)

        return log(result)

    def __log_likelihood(self, X, beta, mu, sigma):
        '''Sum of the individual log-likelihoods over the groups of X.'''
        return sum(
            self.__log_likelihood_obs(group, group[self.output], beta, mu, sigma)
            for _, group in X)

    def __score_obs(self, X, y, beta, mu, sigma):
        '''Gradient of one individual's log-likelihood, ordered [mu, sigma, beta...].'''
        regressors = np.asarray(_without_output(X, self.output))
        y = np.asarray(y)

        result = np.array([
            self.__integrate(
                lambda w, i=i: self.__grad_conditional_density_obs(
                    regressors, w, y, beta, mu, sigma)[i],
                sigma)
            for i in range(len(beta) + 2)])

        return result / exp(self.__log_likelihood_obs(regressors, y, beta, mu, sigma))

    def __score(self, X, beta, mu, sigma):
        '''Return (one score row per individual, their sum).'''
        list_score_obs = np.array(
            [self.__score_obs(group, group[self.output], beta, mu, sigma)
             for _, group in X],
            dtype=float)
        return (list_score_obs, np.sum(list_score_obs, axis=0))

    def __hessian(self, list_score_obs):
        '''Outer-product-of-scores estimate of the information matrix.

        The result is positive semi-definite: it estimates MINUS the
        Hessian of the log-likelihood.
        NOTE(review): the centring term divides by nb_obs (number of rows),
        not by the number of individuals - confirm this is intended.
        '''
        scores = np.array([np.ravel(s) for s in list_score_obs], dtype=float)
        total = scores.sum(axis=0)

        return scores.T.dot(scores) - np.outer(total, total) / self.nb_obs

    def fit(self, X, output, nb_iter=20, drop_na=True, fill_value=None, verbose=False):
        '''Maximum Likelihood Estimation
        Implement a Newton-Raphson algorithm to estimate parameters

        Parameters:
        ----------
        X: 2-level MultiIndex Dataframe
            Database to fit the model

        output: string
            Name of the variable to predict

        nb_iter: integer (optional, default 20)
            Maximal number of iteration before the end of the Newton-Raphson algorithm

        drop_na: boolean (optional, default True)
            Indicate the method to handle missing values in X
            If drop_na = False, fill_value has to be given

        fill_value: string or dict (optional, default None)
            Considered only if drop_na = False
            Possible values:
                - 'mean': missing values of a column are replaced by the mean of that column
                - 'median': missing values of a column are replaced by the median of that column
                - dict: keys must be variables' names and associated values the values used to fill Nan

        verbose: boolean (optional, default False)
            If set to True, allows prints of Newton-Raphson algorithm's progress

        Returns:
        -------
        self, with beta, mu, sigma, beta_se, confidence_interval, final_ll
        and converged filled in. beta_se is ordered [mu, sigma, beta...]

        Raises:
        ------
        ValueError if nb_iter < 1, if the output does not take exactly the
        values 0 and 1, if residual_dist is unknown, or if the information
        matrix cannot be inverted
        '''
        if nb_iter < 1:
            raise ValueError('nb_iter must be at least 1')

        self.output = output
        X = self.input_data_preparation(X.copy(), drop_na, fill_value)

        labels = list(np.unique(X[self.output]))
        if labels != [0,1]:
            raise ValueError("Labels must be in the unit interval.")

        self.nb_obs = len(X)
        self.variables = [x for x in X.columns if x != self.output]

        # Parameter vector layout: [mu, sigma, beta...]
        beta_init = [0, 1] + [0 for _ in self.variables]
        self.beta_est = np.zeros((nb_iter, len(beta_init)))
        self.beta_est[0] = beta_init

        X = X.groupby(level=0)

        self.init_ll = self.__log_likelihood(X, beta_init[2:], 0, 1)

        if verbose:
            print('Initial log-likelihood : '+ str(self.init_ll))
            print('Parameters estimation in progress.')

        current_ll = self.init_ll
        prev_ll = self.init_ll
        j = 1
        while (j < nb_iter) \
                and (j == 1 or (current_ll - prev_ll > 0.01)):

            list_score_obs, score = self.__score(X, self.beta_est[j-1,2:],
                self.beta_est[j-1,0], self.beta_est[j-1,1])
            hessian = self.__hessian(list_score_obs)

            try:
                step = inv(hessian).dot(score)
            except np.linalg.LinAlgError as err:
                raise ValueError('Improper classification problem' \
                    + ', should be 2 different labels') from err
            # `hessian` estimates minus the Hessian (see __hessian), so the
            # ascent step is ADDED; subtracting it moved downhill.
            self.beta_est[j] = self.beta_est[j-1] + step

            prev_ll = current_ll
            current_ll = self.__log_likelihood(X, self.beta_est[j,2:],
                self.beta_est[j,0], self.beta_est[j,1])
            if verbose:
                print('Iteration %s, log-likelihood : %s'\
                    % (j, current_ll))
            j += 1

        # The last step is the one that stopped the loop, so the estimate
        # kept is the one before it (whose log-likelihood is prev_ll).
        # max() only matters when nb_iter == 1 and no step was taken.
        last = max(j-2, 0)
        self.beta = self.beta_est[last,2:]
        self.mu = self.beta_est[last,0]
        self.sigma = self.beta_est[last,1]
        self.beta_est = self.beta_est[:max(j-1, 1),:]

        sqrt_vec = np.vectorize(sqrt)
        list_score_obs, _ = self.__score(X, self.beta,
            self.mu, self.sigma)
        hessian = self.__hessian(list_score_obs)
        self.beta_se = sqrt_vec(inv(hessian).diagonal())

        # beta_se starts with the standard errors of mu and sigma; skip
        # them so that each beta is paired with its own standard error.
        half_width = st.norm.ppf(0.975) * self.beta_se[2:]
        self.confidence_interval = np.column_stack(
            (self.beta - half_width, self.beta + half_width))

        self.final_ll = prev_ll
        self.converged = j < nb_iter

        return self
"synt_sample = ctgan.sample(len(california_housing_df))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "2d195987", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "california_housing_df.describe()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "753f84fc", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "synt_sample.describe()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "f3abe89f", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "from sdv.evaluation import evaluate\n", 102 | "\n", 103 | "evaluate(synt_sample, california_housing_df)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "4ed12311", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "from table_evaluator import TableEvaluator\n", 114 | "\n", 115 | "table_evaluator = TableEvaluator(california_housing_df, synt_sample)\n", 116 | "\n", 117 | "table_evaluator.visual_evaluation()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "8f6abe40", 123 | "metadata": {}, 124 | "source": [ 125 | "### Synthetic data from model" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "a17caa4f", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from sklearn.datasets import make_regression\n", 136 | "import matplotlib.pyplot as plt\n", 137 | "from matplotlib import cm" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "6b6fed38", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "X, y = make_regression(n_samples=1000, n_features=3, noise=0.2,\n", 148 | " random_state=123)\n", 149 | "\n", 150 | "plt.scatter(X[:, 0], X[:, 1], alpha= 0.3, cmap='Greys', c=y)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": 
"6d82677e", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "plt.figure(figsize=(18, 18))\n", 161 | "k = 0\n", 162 | "\n", 163 | "for i in range(0, 10):\n", 164 | " X, y = make_regression(n_samples=100, n_features=3, noise=i,\n", 165 | " random_state=123) \n", 166 | " k+=1\n", 167 | " plt.subplot(5, 2, k)\n", 168 | " profit_margin_orange = np.asarray([20, 35, 40])\n", 169 | " plt.scatter(X[:, 0], X[:, 1], alpha=0.3, cmap=cm.Greys, c=y)\n", 170 | " plt.title('Synthetic Data with Different Noises: ' + str(i))\n", 171 | "plt.show()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "2865b966", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "from sklearn.datasets import make_classification" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "e6d249dc", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "plt.figure(figsize=(18, 18))\n", 192 | "k = 0\n", 193 | "\n", 194 | "for i in range(2, 6):\n", 195 | " X, y = make_classification(n_samples=100,\n", 196 | " n_features=4,\n", 197 | " n_classes=i,\n", 198 | " n_redundant=0,\n", 199 | " n_informative=4,\n", 200 | " random_state=123)\n", 201 | " k+=1\n", 202 | " plt.subplot(2, 2, k)\n", 203 | " plt.scatter(X[: ,0], X[:, 1], alpha=0.8, cmap='gray', c=y)\n", 204 | " plt.title('Synthetic Data with Different Classes: ' + str(i))\n", 205 | "plt.show()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "f1b543a0", 211 | "metadata": {}, 212 | "source": [ 213 | "## Synthetic Data for Unsupervised Learning" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "d7601cd9", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "from sklearn.datasets import make_blobs" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "715ab91e", 230 | "metadata": {}, 231 | "outputs": [], 232 | 
"source": [ 233 | "X, y = make_blobs(n_samples=100, centers=2, \n", 234 | " n_features=2, random_state=0)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "459ec07c", 241 | "metadata": { 242 | "scrolled": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "plt.figure(figsize=(18, 18))\n", 247 | "k = 0\n", 248 | "for i in range(2, 6):\n", 249 | " X, y = make_blobs(n_samples=100, centers=i,\n", 250 | " n_features=2, random_state=0)\n", 251 | " k += 1\n", 252 | " plt.subplot(2, 2, k)\n", 253 | " my_scatter_plot = plt.scatter(X[:, 0], X[:, 1],\n", 254 | " alpha=0.3, cmap='gray', c=y)\n", 255 | " plt.title('Synthetic Data with Different Clusters: ' + str(i))\n", 256 | "plt.show()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "id": "16258b7a", 262 | "metadata": {}, 263 | "source": [ 264 | "## HMM" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "1c46f351", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "ff = pd.read_csv('datasets/FF3.csv', skiprows=4)\n", 275 | "ff = ff.rename(columns={'Unnamed: 0': 'Date'})\n", 276 | "ff = ff.iloc[:-1]\n", 277 | "ff.head()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "e14b804b", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "ff.info()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "39442b02", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "ff['Date'] = pd.to_datetime(ff['Date'])\n", 298 | "ff.set_index('Date', inplace=True)\n", 299 | "ff_trim = ff.loc['2000-01-01':]" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "id": "51699036", 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "ff_trim.head()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "id": "1acb838b", 
316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "ticker = 'SPY'\n", 320 | "start = datetime.datetime(2000, 1, 3)\n", 321 | "end = datetime.datetime(2021, 4, 30)\n", 322 | "SP_ETF = yf.download(ticker, start, end, interval='1d').Close" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "id": "a4080d52", 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "ff_merge = pd.merge(ff_trim, SP_ETF, how='inner', on='Date')" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "id": "7740f0bd", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "SP = pd.DataFrame()\n", 343 | "SP['Close']= ff_merge['Close']" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "2edfb00b", 350 | "metadata": { 351 | "scrolled": true 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "SP['return'] = (SP['Close'] / SP['Close'].shift(1))-1" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "d7760a2f", 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "from hmmlearn import hmm" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "06dfcf72", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "hmm_model = hmm.GaussianHMM(n_components=3,\n", 376 | " covariance_type=\"full\",\n", 377 | " n_iter=100)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "id": "652b909f", 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "hmm_model.fit(np.array(SP['return'].dropna()).reshape(-1, 1))\n", 388 | "hmm_predict = hmm_model.predict(np.array(SP['return'].dropna())\n", 389 | " .reshape(-1, 1))\n", 390 | "df_hmm = pd.DataFrame(hmm_predict)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "id": "cf3f93fe", 397 | "metadata": {}, 398 | 
"outputs": [], 399 | "source": [ 400 | "ret_merged = pd.concat([df_hmm,SP['return'].dropna().reset_index()],\n", 401 | " axis=1)\n", 402 | "ret_merged.drop('Date',axis=1, inplace=True)\n", 403 | "ret_merged.rename(columns={0:'states'}, inplace=True)\n", 404 | "ret_merged.dropna().head()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "b1c16930", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "ret_merged['states'].value_counts()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "id": "0338d6d6", 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "state_means = []\n", 425 | "state_std = []\n", 426 | "\n", 427 | "for i in range(3):\n", 428 | " state_means.append(ret_merged[ret_merged.states == i]['return']\n", 429 | " .mean())\n", 430 | " state_std.append(ret_merged[ret_merged.states == i]['return']\n", 431 | " .std())\n", 432 | "print('State Means are: {}'.format(state_means))\n", 433 | "print('State Standard Deviations are: {}'.format(state_std))" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "id": "bd141016", 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "print(f'HMM means\\n {hmm_model.means_}')\n", 444 | "print(f'HMM covariances\\n {hmm_model.covars_}')\n", 445 | "print(f'HMM transition matrix\\n {hmm_model.transmat_}')\n", 446 | "print(f'HMM initial probability\\n {hmm_model.startprob_}')" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "id": "4d3adf16", 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "sp_ret = SP['return'].dropna().values.reshape(-1,1)\n", 457 | "n_components = np.arange(1, 10)\n", 458 | "clusters = [hmm.GaussianHMM(n_components=n, \n", 459 | " covariance_type=\"full\").fit(sp_ret)\n", 460 | " for n in n_components]\n", 461 | "plt.plot(n_components, [m.score(np.array(SP['return'].dropna())\\\n", 462 | 
" .reshape(-1,1)) for m in clusters])\n", 463 | "plt.title('Optimum Number of States')\n", 464 | "plt.xlabel('n_components')\n", 465 | "plt.ylabel('Log Likelihood')" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "id": "1a6f7bcf", 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "hmm_model = hmm.GaussianHMM(n_components=3, \n", 476 | " covariance_type=\"full\", \n", 477 | " random_state=123).fit(sp_ret)\n", 478 | "hidden_states = hmm_model.predict(sp_ret)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "id": "8d276d64", 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "from matplotlib.dates import YearLocator, MonthLocator\n", 489 | "from matplotlib import cm" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "id": "64648060", 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "df_sp_ret = SP['return'].dropna()\n", 500 | "\n", 501 | "hmm_model = hmm.GaussianHMM(n_components=3, \n", 502 | " covariance_type=\"full\", \n", 503 | " random_state=123).fit(sp_ret)\n", 504 | "\n", 505 | "hidden_states = hmm_model.predict(sp_ret)\n", 506 | "\n", 507 | "fig, axs = plt.subplots(hmm_model.n_components, sharex=True,\n", 508 | " sharey=True, figsize=(12, 9))\n", 509 | "colors = cm.gray(np.linspace(0, 0.7, hmm_model.n_components))\n", 510 | "\n", 511 | "for i, (ax, color) in enumerate(zip(axs, colors)):\n", 512 | " mask = hidden_states == i\n", 513 | " ax.plot_date(df_sp_ret.index.values[mask],\n", 514 | " df_sp_ret.values[mask],\n", 515 | " \".-\", c=color)\n", 516 | " ax.set_title(\"Hidden state {}\".format(i + 1), fontsize=16)\n", 517 | " ax.xaxis.set_minor_locator(MonthLocator())\n", 518 | "plt.tight_layout()" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "id": "0f3bc462", 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | 
"ret_merged.groupby('states')['return'].mean()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "id": "315b6180", 534 | "metadata": {}, 535 | "source": [ 536 | "## Fama-French Model vs. HMM" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "id": "5f3574be", 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "ff_merge['return'] = ff_merge['Close'].pct_change()\n", 547 | "ff_merge.dropna(inplace=True)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "id": "d2b35ffe", 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "split = int(len(ff_merge) * 0.9)\n", 558 | "train_ff= ff_merge.iloc[:split].dropna()\n", 559 | "test_ff = ff_merge.iloc[split:].dropna()" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "id": "b656a45f", 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "hmm_model = hmm.GaussianHMM(n_components=3,\n", 570 | " covariance_type=\"full\",\n", 571 | " n_iter=100, init_params=\" \")" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "id": "8cd9f8f5", 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "predictions = []\n", 582 | "\n", 583 | "for i in range(len(test_ff)):\n", 584 | " hmm_model.fit(train_ff)\n", 585 | " adjustment = np.dot(hmm_model.transmat_, hmm_model.means_)\n", 586 | " predictions.append(test_ff.iloc[i] + adjustment[0])\n", 587 | "predictions = pd.DataFrame(predictions)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "id": "3d0df7ea", 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [ 597 | "std_dev = predictions['return'].std()\n", 598 | "sharpe = predictions['return'].mean() / std_dev\n", 599 | "print('Sharpe ratio with HMM is {:.4f}'.format(sharpe))" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "id": "82993b8a", 605 | "metadata": {}, 
606 | "source": [ 607 | "## Fama-French Model with OLS" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "id": "c195097c", 614 | "metadata": {}, 615 | "outputs": [], 616 | "source": [ 617 | "import statsmodels.api as sm" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "id": "a9c57902", 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "Y = train_ff['return']\n", 628 | "X = train_ff[['Mkt-RF', 'SMB', 'HML']]" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "id": "6bcec34d", 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "model = sm.OLS(Y, X)\n", 639 | "ff_ols = model.fit()\n", 640 | "print(ff_ols.summary())" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "id": "dc59afd3", 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "ff_pred = ff_ols.predict(test_ff[[\"Mkt-RF\", \"SMB\", \"HML\"]])\n", 651 | "ff_pred.head()" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": null, 657 | "id": "70e91c46", 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "std_dev = ff_pred.std()\n", 662 | "sharpe = ff_pred.mean() / std_dev\n", 663 | "print('Sharpe ratio with FF 3 factor model is {:.4f}'.format(sharpe))" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "id": "94dd1023", 670 | "metadata": {}, 671 | "outputs": [], 672 | "source": [ 673 | "split = int(len(SP['return']) * 0.9)\n", 674 | "train_ret_SP = SP['return'].iloc[split:].dropna()\n", 675 | "test_ret_SP = SP['return'].iloc[:split].dropna()" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "id": "afc83e38", 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "hmm_model = hmm.GaussianHMM(n_components=3,\n", 686 | " covariance_type=\"full\",\n", 687 | " n_iter=100)\n", 688 | 
"hmm_model.fit(np.array(train_ret_SP).reshape(-1, 1))\n", 689 | "hmm_predict_vol = hmm_model.predict(np.array(test_ret_SP)\n", 690 | " .reshape(-1, 1))\n", 691 | "pd.DataFrame(hmm_predict_vol).value_counts()" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "id": "41a110d6", 697 | "metadata": {}, 698 | "source": [ 699 | "## Synthetic Data Generation and Hidden Markov" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "id": "24aed2a5", 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "startprob = hmm_model.startprob_\n", 710 | "transmat = hmm_model.transmat_\n", 711 | "means = hmm_model.means_ \n", 712 | "covars = hmm_model.covars_" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": null, 718 | "id": "81f550c3", 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "syn_hmm = hmm.GaussianHMM(n_components=3, covariance_type=\"full\")" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": null, 728 | "id": "2b28defb", 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "syn_hmm.startprob_ = startprob\n", 733 | "syn_hmm.transmat_ = transmat \n", 734 | "syn_hmm.means_ = means \n", 735 | "syn_hmm.covars_ = covars" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "id": "6a7992d1", 742 | "metadata": {}, 743 | "outputs": [], 744 | "source": [ 745 | "syn_data, _ = syn_hmm.sample(n_samples=1000)" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "id": "772ca78a", 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "plt.hist(syn_data)\n", 756 | "plt.title('Histogram of Synthetic Data')\n", 757 | "plt.show()" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "id": "f59b8002", 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [ 767 | "plt.plot(syn_data, \"--\")\n", 768 | "plt.title('Line Plot of 
Synthetic Data')\n", 769 | "plt.show()" 770 | ] 771 | } 772 | ], 773 | "metadata": { 774 | "kernelspec": { 775 | "display_name": "Python 3", 776 | "language": "python", 777 | "name": "python3" 778 | }, 779 | "language_info": { 780 | "codemirror_mode": { 781 | "name": "ipython", 782 | "version": 3 783 | }, 784 | "file_extension": ".py", 785 | "mimetype": "text/x-python", 786 | "name": "python", 787 | "nbconvert_exporter": "python", 788 | "pygments_lexer": "ipython3", 789 | "version": "3.8.8" 790 | }, 791 | "latex_envs": { 792 | "LaTeX_envs_menu_present": true, 793 | "autoclose": false, 794 | "autocomplete": true, 795 | "bibliofile": "biblio.bib", 796 | "cite_by": "apalike", 797 | "current_citInitial": 1, 798 | "eqLabelWithNumbers": true, 799 | "eqNumInitial": 1, 800 | "hotkeys": { 801 | "equation": "Ctrl-E", 802 | "itemize": "Ctrl-I" 803 | }, 804 | "labels_anchors": false, 805 | "latex_user_defs": false, 806 | "report_style_numbering": false, 807 | "user_envs_cfg": false 808 | }, 809 | "toc": { 810 | "base_numbering": 1, 811 | "nav_menu": {}, 812 | "number_sections": false, 813 | "sideBar": true, 814 | "skip_h1_title": false, 815 | "title_cell": "Table of Contents", 816 | "title_sidebar": "Contents", 817 | "toc_cell": false, 818 | "toc_position": {}, 819 | "toc_section_display": true, 820 | "toc_window_display": false 821 | } 822 | }, 823 | "nbformat": 4, 824 | "nbformat_minor": 5 825 | } 826 | -------------------------------------------------------------------------------- /codes/chp_7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "c314ee7b", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import warnings\n", 14 | "warnings.filterwarnings(\"ignore\")\n", 15 | "plt.rcParams['figure.figsize'] = (10, 6)\n", 16 | 
"pd.set_option('use_inf_as_na', True)\n", 17 | "plt.rcParams['figure.dpi'] = 300\n", 18 | "plt.rcParams['savefig.dpi'] = 300" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "6dbe8daf", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "liq_data = pd.read_csv('datasets/bid_ask.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "ec9ef45b", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "liq_data.head()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "7c1d7e13", 45 | "metadata": { 46 | "code_folding": [] 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "rolling_five = []\n", 51 | "\n", 52 | "for j in liq_data.TICKER.unique():\n", 53 | " for i in range(len(liq_data[liq_data.TICKER == j])):\n", 54 | " rolling_five.append(liq_data[i:i+5].agg({'BIDLO': 'min',\n", 55 | " 'ASKHI': 'max',\n", 56 | " 'VOL': 'sum',\n", 57 | " 'SHROUT': 'mean',\n", 58 | " 'PRC': 'mean'}))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "3f642423", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "rolling_five_df = pd.DataFrame(rolling_five)\n", 69 | "rolling_five_df.columns = ['bidlo_min', 'askhi_max', 'vol_sum',\n", 70 | " 'shrout_mean', 'prc_mean']\n", 71 | "liq_vol_all = pd.concat([liq_data,rolling_five_df], axis=1)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "b11274b7", 77 | "metadata": {}, 78 | "source": [ 79 | "## Volume Based Measure" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "fdaec6b2", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "liq_ratio = []\n", 90 | "\n", 91 | "for j in liq_vol_all.TICKER.unique():\n", 92 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 93 | " liq_ratio.append((liq_vol_all['PRC'][i+1:i+6] * \n", 94 | " liq_vol_all['VOL'][i+1:i+6]).sum()/\n", 95 | " 
(np.abs(liq_vol_all['PRC'][i+1:i+6].mean() - \n", 96 | " liq_vol_all['PRC'][i:i+5].mean())))" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "3cbe3e36", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "Lhh = []\n", 107 | "\n", 108 | "for j in liq_vol_all.TICKER.unique():\n", 109 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 110 | " Lhh.append((liq_vol_all['PRC'][i:i+5].max() - \n", 111 | " liq_vol_all['PRC'][i:i+5].min()) / \n", 112 | " liq_vol_all['PRC'][i:i+5].min() / \n", 113 | " (liq_vol_all['VOL'][i:i+5].sum() / \n", 114 | " liq_vol_all['SHROUT'][i:i+5].mean() * \n", 115 | " liq_vol_all['PRC'][i:i+5].mean()))" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "b825b590", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "turnover_ratio = []\n", 126 | "\n", 127 | "for j in liq_vol_all.TICKER.unique():\n", 128 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 129 | " turnover_ratio.append((1/liq_vol_all['VOL'].count()) * \n", 130 | " (np.sum(liq_vol_all['VOL'][i:i+1]) / \n", 131 | " np.sum(liq_vol_all['SHROUT'][i:i+1])))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "5e2edf81", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "liq_vol_all['liq_ratio'] = pd.DataFrame(liq_ratio)\n", 142 | "liq_vol_all['Lhh'] = pd.DataFrame(Lhh)\n", 143 | "liq_vol_all['turnover_ratio'] = pd.DataFrame(turnover_ratio)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "id": "e135f29f", 149 | "metadata": {}, 150 | "source": [ 151 | "## Transaction Cost Based Measures" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "b4cccab2", 157 | "metadata": {}, 158 | "source": [ 159 | "### Bid-Ask Spreads" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "d3cc91ef", 166 | "metadata": {}, 167 | 
"outputs": [], 168 | "source": [ 169 | "liq_vol_all['mid_price'] = (liq_vol_all.ASKHI + liq_vol_all.BIDLO) / 2\n", 170 | "liq_vol_all['percent_quoted_ba'] = (liq_vol_all.ASKHI - \n", 171 | " liq_vol_all.BIDLO) / \\\n", 172 | " liq_vol_all.mid_price\n", 173 | "liq_vol_all['percent_effective_ba'] = 2 * abs((liq_vol_all.PRC - \n", 174 | " liq_vol_all.mid_price)) / \\\n", 175 | " liq_vol_all.mid_price" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "192ca49a", 181 | "metadata": {}, 182 | "source": [ 183 | "### Roll's Spread" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "72e37d34", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "liq_vol_all['price_diff'] = liq_vol_all.groupby('TICKER')['PRC']\\\n", 194 | " .apply(lambda x:x.diff())\n", 195 | "liq_vol_all.dropna(inplace=True)\n", 196 | "roll = []\n", 197 | "\n", 198 | "for j in liq_vol_all.TICKER.unique():\n", 199 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 200 | " roll_cov = np.cov(liq_vol_all['price_diff'][i:i+5], \n", 201 | " liq_vol_all['price_diff'][i+1:i+6])\n", 202 | " if roll_cov[0,1] < 0:\n", 203 | " roll.append(2 * np.sqrt(-roll_cov[0, 1]))\n", 204 | " else:\n", 205 | " roll.append(2 * np.sqrt(np.abs(roll_cov[0, 1])))\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "5472c49c", 211 | "metadata": {}, 212 | "source": [ 213 | "### Corwin and Schultz (2012)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "6ec507fd", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "gamma = []\n", 224 | "\n", 225 | "for j in liq_vol_all.TICKER.unique():\n", 226 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 227 | " gamma.append((max(liq_vol_all['ASKHI'].iloc[i+1], \n", 228 | " liq_vol_all['ASKHI'].iloc[i]) - \n", 229 | " min(liq_vol_all['BIDLO'].iloc[i+1], \n", 230 | " liq_vol_all['BIDLO'].iloc[i])) ** 2)\n", 231 | 
" gamma_array = np.array(gamma)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "ab72f09a", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "beta = []\n", 242 | "\n", 243 | "for j in liq_vol_all.TICKER.unique():\n", 244 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 245 | " beta.append((liq_vol_all['ASKHI'].iloc[i+1] - \n", 246 | " liq_vol_all['BIDLO'].iloc[i+1]) ** 2 + \n", 247 | " (liq_vol_all['ASKHI'].iloc[i] - \n", 248 | " liq_vol_all['BIDLO'].iloc[i]) ** 2)\n", 249 | " beta_array = np.array(beta)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "id": "0c6da937", 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "alpha = ((np.sqrt(2 * beta_array) - np.sqrt(beta_array)) / \n", 260 | " (3 - (2 * np.sqrt(2)))) - np.sqrt(gamma_array / \n", 261 | " (3 - (2 * np.sqrt(2))))\n", 262 | "CS_spread = (2 * np.exp(alpha - 1)) / (1 + np.exp(alpha))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "id": "c260377d", 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "liq_vol_all = liq_vol_all.reset_index()\n", 273 | "liq_vol_all['roll'] = pd.DataFrame(roll)\n", 274 | "liq_vol_all['CS_spread'] = pd.DataFrame(CS_spread)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "id": "989ee3e6", 280 | "metadata": {}, 281 | "source": [ 282 | "## Price Based Measures" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "62c9209d", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "dvol = []\n", 293 | "\n", 294 | "for j in liq_vol_all.TICKER.unique():\n", 295 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 296 | " dvol.append((liq_vol_all['PRC'][i:i+5] *\n", 297 | " liq_vol_all['VOL'][i:i+5]).sum())\n", 298 | "liq_vol_all['dvol'] = pd.DataFrame(dvol)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 
303 | "execution_count": null, 304 | "id": "35252634", 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "amihud = []\n", 309 | "\n", 310 | "for j in liq_vol_all.TICKER.unique():\n", 311 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 312 | " amihud.append((1 / liq_vol_all['RET'].count()) * \n", 313 | " (np.sum(np.abs(liq_vol_all['RET'][i:i+1])) / \n", 314 | " np.sum(liq_vol_all['dvol'][i:i+1])))" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "id": "06803a21", 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "florackis = []\n", 325 | "\n", 326 | "for j in liq_vol_all.TICKER.unique():\n", 327 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 328 | " florackis.append((1 / liq_vol_all['RET'].count()) * \n", 329 | " (np.sum(np.abs(liq_vol_all['RET'][i:i+1]) / \n", 330 | " liq_vol_all['turnover_ratio'][i:i+1])))" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "id": "b4540974", 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "liq_vol_all['vol_diff_pct'] = liq_vol_all.groupby('TICKER')['VOL']\\\n", 341 | " .apply(lambda x: x.diff()).pct_change()\n", 342 | "liq_vol_all['price_diff_pct'] = liq_vol_all.groupby('TICKER')['PRC']\\\n", 343 | " .apply(lambda x: x.diff()).pct_change()" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "b46df604", 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "cet = []\n", 354 | "\n", 355 | "for j in liq_vol_all.TICKER.unique():\n", 356 | " for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n", 357 | " cet.append(np.sum(liq_vol_all['vol_diff_pct'][i:i+1])/\n", 358 | " np.sum(liq_vol_all['price_diff_pct'][i:i+1]))" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "id": "57cbfa54", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | 
"liq_vol_all['amihud'] = pd.DataFrame(amihud)\n", 369 | "liq_vol_all['florackis'] = pd.DataFrame(florackis)\n", 370 | "liq_vol_all['cet'] = pd.DataFrame(cet)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "id": "b33015fc", 376 | "metadata": {}, 377 | "source": [ 378 | "## Market Impact Measures" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "1eff896d", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "import statsmodels.api as sm" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "id": "4a4255c6", 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "liq_vol_all['VOL_pct_change'] = liq_vol_all.groupby('TICKER')['VOL']\\\n", 399 | " .apply(lambda x: x.pct_change())\n", 400 | "liq_vol_all.dropna(subset=['VOL_pct_change'], inplace=True)\n", 401 | "liq_vol_all = liq_vol_all.reset_index()" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "id": "61843be0", 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "unsys_resid = []\n", 412 | "\n", 413 | "for i in liq_vol_all.TICKER.unique():\n", 414 | " X1 = liq_vol_all[liq_vol_all['TICKER'] == i]['vwretx']\n", 415 | " y = liq_vol_all[liq_vol_all['TICKER'] == i]['RET']\n", 416 | " ols = sm.OLS(y, X1).fit()\n", 417 | " unsys_resid.append(ols.resid)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "id": "c82d8f75", 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "market_impact = {}\n", 428 | "\n", 429 | "for i, j in zip(liq_vol_all.TICKER.unique(), \n", 430 | " range(len(liq_vol_all['TICKER'].unique()))):\n", 431 | " X2 = liq_vol_all[liq_vol_all['TICKER'] == i]['VOL_pct_change']\n", 432 | " ols = sm.OLS(unsys_resid[j] ** 2, X2).fit()\n", 433 | " print('***' * 30)\n", 434 | " print(f'OLS Result for {i}')\n", 435 | " print(ols.summary())\n", 436 | " market_impact[j] = ols.resid" 437 | 
] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "id": "6faa0827", 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "append1 = market_impact[0].append(market_impact[1])\n", 447 | "liq_vol_all['market_impact'] = append1.append(market_impact[2])" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "id": "f612c974", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "cols = ['vol_diff_pct', 'price_diff_pct', 'price_diff',\n", 458 | " 'VOL_pct_change', 'dvol', 'mid_price']\n", 459 | "liq_measures_all = liq_vol_all.drop(liq_vol_all[cols], axis=1)\\\n", 460 | " .iloc[:, -11:]\n", 461 | "liq_measures_all.dropna(inplace=True)\n", 462 | "liq_measures_all.describe().T" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "id": "d6e951cc", 468 | "metadata": {}, 469 | "source": [ 470 | "## GMM" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "id": "20dd3c79", 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "from sklearn.mixture import GaussianMixture\n", 481 | "from sklearn.preprocessing import StandardScaler" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "id": "41f5b22e", 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "liq_measures_all2 = liq_measures_all.dropna()\n", 492 | "scaled_liq = StandardScaler().fit_transform(liq_measures_all2)" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "id": "52393f11", 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "kwargs = dict(alpha=0.5, bins=50, stacked=True)\n", 503 | "plt.hist(liq_measures_all.loc[:, 'percent_quoted_ba'],\n", 504 | " **kwargs, label='TC-based')\n", 505 | "plt.hist(liq_measures_all.loc[:, 'turnover_ratio'],\n", 506 | " **kwargs, label='Volume-based')\n", 507 | "plt.hist(liq_measures_all.loc[:, 'market_impact'],\n", 508 | " 
**kwargs, label='Market-based')\n", 509 | "plt.title('Multimodality of the Liquidity Measures')\n", 510 | "plt.legend()\n", 511 | "plt.show()" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "id": "65ba2e29", 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "n_components = np.arange(1, 10)\n", 522 | "clusters = [GaussianMixture(n, covariance_type='spherical',\n", 523 | " random_state=0).fit(scaled_liq)\n", 524 | " for n in n_components]\n", 525 | "plt.plot(n_components, [m.bic(scaled_liq) for m in clusters])\n", 526 | "plt.title('Optimum Number of Components')\n", 527 | "plt.xlabel('n_components')\n", 528 | "plt.ylabel('BIC values')\n", 529 | "plt.show()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "id": "1ee26c41", 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "def cluster_state(data, nstates):\n", 540 | " gmm = GaussianMixture(n_components=nstates,\n", 541 | " covariance_type='spherical',\n", 542 | " init_params='kmeans')\n", 543 | " gmm_fit = gmm.fit(scaled_liq)\n", 544 | " labels = gmm_fit.predict(scaled_liq)\n", 545 | " state_probs = gmm.predict_proba(scaled_liq)\n", 546 | " state_probs_df = pd.DataFrame(state_probs, \n", 547 | " columns=['state-1','state-2','state-3'])\n", 548 | " state_prob_means = [state_probs_df.iloc[:, i].mean() \n", 549 | " for i in range(len(state_probs_df.columns))]\n", 550 | " if np.max(state_prob_means) == state_prob_means[0]:\n", 551 | " print('State-1 is likely to occur with a probability of {:4f}'\n", 552 | " .format(state_prob_means[0]))\n", 553 | " elif np.max(state_prob_means) == state_prob_means[1]:\n", 554 | " print('State-2 is likely to occur with a probability of {:4f}'\n", 555 | " .format(state_prob_means[1]))\n", 556 | " else:\n", 557 | " print('State-3 is likely to occur with a probability of {:4f}'\n", 558 | " .format(state_prob_means[2]))\n", 559 | " return state_probs" 560 | ] 561 | }, 562 | { 
563 | "cell_type": "code", 564 | "execution_count": null, 565 | "id": "41148b88", 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "state_probs = cluster_state(scaled_liq, 3)\n", 570 | "print(f'State probabilities are {state_probs.mean(axis=0)}')" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "id": "90da06dc", 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "from sklearn.decomposition import PCA" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "id": "b3740fbf", 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "pca = PCA(n_components=11)\n", 591 | "components = pca.fit_transform(scaled_liq)\n", 592 | "plt.plot(pca.explained_variance_ratio_)\n", 593 | "plt.title('Scree Plot')\n", 594 | "plt.xlabel('Number of Components')\n", 595 | "plt.ylabel('% of Explained Variance')\n", 596 | "plt.show()" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "id": "d8588a7a", 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "def gmm_pca(data, nstate):\n", 607 | " pca = PCA(n_components=3)\n", 608 | " components = pca.fit_transform(data)\n", 609 | " mxtd = GaussianMixture(n_components=nstate,\n", 610 | " covariance_type='spherical')\n", 611 | " gmm = mxtd.fit(components)\n", 612 | " labels = gmm.predict(components)\n", 613 | " state_probs = gmm.predict_proba(components)\n", 614 | " return state_probs,pca" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "id": "ef610512", 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "state_probs, pca = gmm_pca(scaled_liq, 3)\n", 625 | "print(f'State probabilities are {state_probs.mean(axis=0)}')" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "id": "53e4a0f0", 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "def wpc():\n", 636 | " 
state_probs_df = pd.DataFrame(state_probs,\n", 637 | " columns=['state-1', 'state-2',\n", 638 | " 'state-3'])\n", 639 | " state_prob_means = [state_probs_df.iloc[:, i].mean() \n", 640 | " for i in range(len(state_probs_df.columns))]\n", 641 | " if np.max(state_prob_means) == state_prob_means[0]:\n", 642 | " print('State-1 is likely to occur with a probability of {:4f}'\n", 643 | " .format(state_prob_means[0]))\n", 644 | " elif np.max(state_prob_means) == state_prob_means[1]:\n", 645 | " print('State-2 is likely to occur with a probability of {:4f}'\n", 646 | " .format(state_prob_means[1]))\n", 647 | " else:\n", 648 | " print('State-3 is likely to occur with a probability of {:4f}'\n", 649 | " .format(state_prob_means[2]))\n", 650 | "wpc()" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "id": "2f538d4f", 657 | "metadata": {}, 658 | "outputs": [], 659 | "source": [ 660 | "loadings = pca.components_.T * np.sqrt(pca.explained_variance_)\n", 661 | "loading_matrix = pd.DataFrame(loadings, \n", 662 | " columns=['PC1', 'PC2', 'PC3'],\n", 663 | " index=liq_measures_all.columns)\n", 664 | "loading_matrix" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "id": "213db006", 670 | "metadata": {}, 671 | "source": [ 672 | "## GMCM" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "id": "4f3d5149", 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "from copulae.mixtures.gmc.gmc import GaussianMixtureCopula" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "id": "b702ba5c", 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "_, dim = scaled_liq.shape\n", 693 | "gmcm = GaussianMixtureCopula(n_clusters=3, ndim=dim)" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "id": "8e26e609", 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "gmcm_fit = gmcm.fit(scaled_liq, 
method='kmeans',\n", 704 | " criteria='GMCM', eps=0.0001)\n", 705 | "state_prob = gmcm_fit.params.prob\n", 706 | "print(f'The state {np.argmax(state_prob) + 1} is likely to occur')\n", 707 | "print(f'State probabilities based on GMCM are {state_prob}')" 708 | ] 709 | } 710 | ], 711 | "metadata": { 712 | "celltoolbar": "Raw Cell Format", 713 | "kernelspec": { 714 | "display_name": "Python 3", 715 | "language": "python", 716 | "name": "python3" 717 | }, 718 | "language_info": { 719 | "codemirror_mode": { 720 | "name": "ipython", 721 | "version": 3 722 | }, 723 | "file_extension": ".py", 724 | "mimetype": "text/x-python", 725 | "name": "python", 726 | "nbconvert_exporter": "python", 727 | "pygments_lexer": "ipython3", 728 | "version": "3.8.8" 729 | }, 730 | "latex_envs": { 731 | "LaTeX_envs_menu_present": true, 732 | "autoclose": false, 733 | "autocomplete": true, 734 | "bibliofile": "biblio.bib", 735 | "cite_by": "apalike", 736 | "current_citInitial": 1, 737 | "eqLabelWithNumbers": true, 738 | "eqNumInitial": 1, 739 | "hotkeys": { 740 | "equation": "Ctrl-E", 741 | "itemize": "Ctrl-I" 742 | }, 743 | "labels_anchors": false, 744 | "latex_user_defs": false, 745 | "report_style_numbering": false, 746 | "user_envs_cfg": false 747 | }, 748 | "toc": { 749 | "base_numbering": 1, 750 | "nav_menu": {}, 751 | "number_sections": false, 752 | "sideBar": true, 753 | "skip_h1_title": false, 754 | "title_cell": "Table of Contents", 755 | "title_sidebar": "Contents", 756 | "toc_cell": false, 757 | "toc_position": {}, 758 | "toc_section_display": true, 759 | "toc_window_display": false 760 | } 761 | }, 762 | "nbformat": 4, 763 | "nbformat_minor": 5 764 | } 765 | -------------------------------------------------------------------------------- /codes/chp_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Oil Price Graph" 8 | ] 9 | }, 10 | { 11 | 
"cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import quandl\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import warnings\n", 19 | "warnings.filterwarnings('ignore')\n", 20 | "plt.style.use('seaborn')\n", 21 | "plt.rcParams['figure.dpi'] = 300\n", 22 | "plt.rcParams['savefig.dpi'] = 300" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "oil = quandl.get(\"NSE/OIL\", authtoken=\"vEjGTysiCFBuN-z5bjGP\",#insert you api token\n", 32 | " start_date=\"1980-01-01\",\n", 33 | " end_date=\"2020-01-01\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "plt.figure(figsize=(10, 6))\n", 43 | "plt.plot(oil.Close)\n", 44 | "plt.ylabel('$')\n", 45 | "plt.xlabel('Date')\n", 46 | "plt.show()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Trend" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "import yfinance as yf\n", 63 | "import numpy as np\n", 64 | "import pandas as pd\n", 65 | "import datetime\n", 66 | "import statsmodels.api as sm\n", 67 | "from statsmodels.tsa.stattools import adfuller\n", 68 | "from statsmodels.tsa.seasonal import seasonal_decompose" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "ticker = '^GSPC'\n", 78 | "start = datetime.datetime(2015, 1, 1)\n", 79 | "end = datetime.datetime(2021, 1, 1)\n", 80 | "SP_prices = yf.download(ticker, start=start, end=end, interval='1mo')\\\n", 81 | " .Close" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "seasonal_decompose(SP_prices, 
period=12).plot()\n", 91 | "plt.show()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "plt.figure(figsize=(10, 6))\n", 101 | "plt.plot(SP_prices)\n", 102 | "plt.title('S&P-500 Prices')\n", 103 | "plt.ylabel('$')\n", 104 | "plt.xlabel('Date')\n", 105 | "plt.show()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "sm.graphics.tsa.plot_acf(SP_prices, lags=30)\n", 115 | "plt.xlabel('Number of Lags')\n", 116 | "plt.show()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "sm.graphics.tsa.plot_pacf(SP_prices, lags=30)\n", 126 | "plt.xlabel('Number of Lags')\n", 127 | "plt.show()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | " ## Seasonality" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "from fredapi import Fred\n", 144 | "import statsmodels.api as sm" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "fred = Fred(api_key='78b14ec6ba46f484b94db43694468bb1')#insert you api key" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "energy = fred.get_series(\"CAPUTLG2211A2S\", \n", 163 | " observation_start=\"2010-01-01\", \n", 164 | " observation_end=\"2020-12-31\")\n", 165 | "energy.head(12)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "plt.plot(energy)\n", 175 | "plt.title('Energy Capacity Utilization')\n", 176 | 
"plt.ylabel('$')\n", 177 | "plt.xlabel('Date')\n", 178 | "plt.show()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "sm.graphics.tsa.plot_acf(energy, lags=30)\n", 188 | "plt.xlabel('Number of Lags')\n", 189 | "plt.show()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## Stationarity" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "stat_test = adfuller(SP_prices)[0:2]\n", 206 | "print(\"The test statistic and p-value of ADF test are {}\"\n", 207 | " .format(stat_test))" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "diff_SP_price = SP_prices.diff()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "scrolled": true 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "plt.figure(figsize=(10, 6))\n", 228 | "plt.plot(diff_SP_price)\n", 229 | "plt.title('Differenced S&P-500 Price')\n", 230 | "plt.ylabel('$')\n", 231 | "plt.xlabel('Date')\n", 232 | "plt.show()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "sm.graphics.tsa.plot_acf(diff_SP_price.dropna(),lags=30)\n", 242 | "plt.xlabel('Number of Lags')\n", 243 | "plt.show()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "stat_test2 = adfuller(diff_SP_price.dropna())[0:2]\n", 253 | "print(\"The test statistic and p-value of ADF test after differencing are {}\"\\\n", 254 | " .format(stat_test2))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": 
{}, 261 | "outputs": [], 262 | "source": [ 263 | "seasonal_index = energy.resample('Q').mean()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "dates = energy.index.year.unique()\n", 273 | "deseasonalized = []\n", 274 | "for i in dates:\n", 275 | " for j in range(1, 13):\n", 276 | " deseasonalized.append((energy[str(i)][energy[str(i)]\\\n", 277 | " .index.month==j]))\n", 278 | "concat_deseasonalized = np.concatenate(deseasonalized)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "deseason_energy = []\n", 288 | "for i,s in zip(range(0, len(energy), 3), range(len(seasonal_index))):\n", 289 | " deseason_energy.append(concat_deseasonalized[i:i+3] / \n", 290 | " seasonal_index.iloc[s])\n", 291 | "concat_deseason_energy = np.concatenate(deseason_energy)\n", 292 | "deseason_energy = pd.DataFrame(concat_deseason_energy, \n", 293 | " index=energy.index)\n", 294 | "deseason_energy.columns = ['Deaseasonalized Energy']\n", 295 | "deseason_energy.head()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "sm.graphics.tsa.plot_acf(deseason_energy, lags=10)\n", 305 | "plt.xlabel('Number of Lags')\n", 306 | "plt.show()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "sm.graphics.tsa.plot_pacf(deseason_energy, lags=10)\n", 316 | "plt.xlabel('Number of Lags')\n", 317 | "plt.show()" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "## White Noise" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "mu = 0\n", 334 | "std = 1 \n", 335 | 
"WN = np.random.normal(mu, std, 1000)\n", 336 | "\n", 337 | "plt.plot(WN)\n", 338 | "plt.xlabel('Number of Simulations')\n", 339 | "plt.show()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "## Moving Average" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "ticker = ['AAPL', 'MSFT']\n", 356 | "start = datetime.datetime(2019, 1, 1)\n", 357 | "end = datetime.datetime(2021, 1, 1)\n", 358 | "stock_prices = yf.download(ticker, start, end, interval='1d')\\\n", 359 | " .Close" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "stock_prices = stock_prices.dropna()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "for i in ticker:\n", 378 | " stat_test = adfuller(stock_prices[i])[0:2]\n", 379 | " print(\"The ADF test statistic and p-value of {} are {}\"\\\n", 380 | " .format(i, stat_test))" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "diff_stock_prices = stock_prices.diff().dropna()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "split = int(len(diff_stock_prices['AAPL'].values) * 0.95)\n", 399 | "diff_train_aapl = diff_stock_prices['AAPL'].iloc[:split]\n", 400 | "diff_test_aapl = diff_stock_prices['AAPL'].iloc[split:]\n", 401 | "diff_train_msft = diff_stock_prices['MSFT'].iloc[:split]\n", 402 | "diff_test_msft = diff_stock_prices['MSFT'].iloc[split:]" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | 
"diff_train_aapl.to_csv('diff_train_aapl.csv')\n", 412 | "diff_test_aapl.to_csv('diff_test_aapl.csv')\n", 413 | "diff_train_msft.to_csv('diff_train_msft.csv')\n", 414 | "diff_test_msft.to_csv('diff_test_msft.csv')" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "fig, ax = plt.subplots(2, 1, figsize=(10, 6))\n", 424 | "plt.tight_layout() \n", 425 | "sm.graphics.tsa.plot_acf(diff_train_aapl,lags=30,\n", 426 | " ax=ax[0], title='ACF - Apple')\n", 427 | "sm.graphics.tsa.plot_acf(diff_train_msft,lags=30,\n", 428 | " ax=ax[1], title='ACF - Microsoft')\n", 429 | "plt.show()" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": { 436 | "scrolled": true 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "short_moving_average_appl = diff_train_aapl.rolling(window=9).mean()\n", 441 | "long_moving_average_appl = diff_train_aapl.rolling(window=22).mean()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": { 448 | "scrolled": true 449 | }, 450 | "outputs": [], 451 | "source": [ 452 | "fig, ax = plt.subplots(figsize=(10, 6))\n", 453 | "ax.plot(diff_train_aapl.loc[start:end].index, \n", 454 | " diff_train_aapl.loc[start:end],\n", 455 | " label='Stock Price', linestyle='--')\n", 456 | "ax.plot(short_moving_average_appl.loc[start:end].index, \n", 457 | " short_moving_average_appl.loc[start:end],\n", 458 | " label = 'Short MA', linestyle='solid')\n", 459 | "ax.plot(long_moving_average_appl.loc[start:end].index, \n", 460 | " long_moving_average_appl.loc[start:end],\n", 461 | " label = 'Long MA', linestyle='solid')\n", 462 | "ax.legend(loc='best')\n", 463 | "ax.set_ylabel('Differenced Price')\n", 464 | "ax.set_title('Stock Prediction-Apple')\n", 465 | "plt.show()" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": 
[], 473 | "source": [ 474 | "short_moving_average_msft = diff_train_msft.rolling(window=2).mean()\n", 475 | "long_moving_average_msft = diff_train_msft.rolling(window=22).mean()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "fig, ax = plt.subplots(figsize=(10, 6))\n", 485 | "ax.plot(diff_train_msft.loc[start:end].index,\n", 486 | " diff_train_msft.loc[start:end],\n", 487 | " label='Stock Price', linestyle='--')\n", 488 | "ax.plot(short_moving_average_msft.loc[start:end].index,\n", 489 | " short_moving_average_msft.loc[start:end],\n", 490 | " label = 'Short MA', linestyle='solid')\n", 491 | "ax.plot(long_moving_average_msft.loc[start:end].index,\n", 492 | " long_moving_average_msft.loc[start:end],\n", 493 | " label = 'Long MA', linestyle='-.')\n", 494 | "ax.legend(loc='best')\n", 495 | "ax.set_ylabel('Differenced Price')\n", 496 | "ax.set_xlabel('Date')\n", 497 | "ax.set_title('Stock Prediction-Microsoft')\n", 498 | "plt.show()" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "## Autoregressive Model" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "sm.graphics.tsa.plot_pacf(diff_train_aapl, lags=30)\n", 515 | "plt.title('PACF of Apple')\n", 516 | "plt.xlabel('Number of Lags')\n", 517 | "plt.show()" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "sm.graphics.tsa.plot_pacf(diff_train_msft, lags=30)\n", 527 | "plt.title('PACF of Microsoft')\n", 528 | "plt.xlabel('Number of Lags')\n", 529 | "plt.show()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "from statsmodels.tsa.ar_model import AutoReg\n", 539 | 
"import warnings\n", 540 | "warnings.filterwarnings('ignore')" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "ar_aapl = AutoReg(diff_train_aapl.values, lags=29)\n", 550 | "ar_fitted_aapl = ar_aapl.fit()" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "ar_predictions_aapl = ar_fitted_aapl.predict(start=len(diff_train_aapl), \n", 560 | " end=len(diff_train_aapl)\\\n", 561 | " + len(diff_test_aapl) - 1, \n", 562 | " dynamic=False)" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "for i in range(len(ar_predictions_aapl)):\n", 572 | " print('==' * 25)\n", 573 | " print('predicted values:{:.4f} & actual values:{:.4f}'\\\n", 574 | " .format(ar_predictions_aapl[i], diff_test_aapl[i]))" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "ar_predictions_aapl = pd.DataFrame(ar_predictions_aapl)\n", 584 | "ar_predictions_aapl.index = diff_test_aapl.index" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "ar_msft = AutoReg(diff_train_msft.values, lags=26)\n", 594 | "ar_fitted_msft = ar_msft.fit()" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "ar_predictions_msft = ar_fitted_msft.predict(start=len(diff_train_msft), \n", 604 | " end=len(diff_train_msft)\\\n", 605 | " +len(diff_test_msft) - 1,\n", 606 | " dynamic=False)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | 
"ar_predictions_msft = pd.DataFrame(ar_predictions_msft)\n", 616 | "ar_predictions_msft.index = diff_test_msft.index" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "fig, ax = plt.subplots(2,1, figsize=(18, 15))\n", 626 | " \n", 627 | "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n", 628 | "ax[0].plot(ar_predictions_aapl, linestyle='solid', label=\"Prediction\")\n", 629 | "ax[0].set_title('Predicted Stock Price-Apple')\n", 630 | "ax[0].legend(loc='best')\n", 631 | "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n", 632 | "ax[1].plot(ar_predictions_msft, linestyle='solid', label=\"Prediction\")\n", 633 | "ax[1].set_title('Predicted Stock Price-Microsoft')\n", 634 | "ax[1].legend(loc='best')\n", 635 | "for ax in ax.flat:\n", 636 | " ax.set(xlabel='Date', ylabel='Differenced Price')\n", 637 | "plt.show()" 638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "metadata": {}, 643 | "source": [ 644 | "## ARIMA Model" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": null, 650 | "metadata": {}, 651 | "outputs": [], 652 | "source": [ 653 | "from statsmodels.tsa.arima_model import ARIMA" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": null, 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [ 662 | "split = int(len(stock_prices['AAPL'].values) * 0.95)\n", 663 | "train_aapl = stock_prices['AAPL'].iloc[:split]\n", 664 | "test_aapl = stock_prices['AAPL'].iloc[split:]\n", 665 | "train_msft = stock_prices['MSFT'].iloc[:split]\n", 666 | "test_msft = stock_prices['MSFT'].iloc[split:]" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": null, 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "arima_aapl = ARIMA(train_aapl,order=(9, 1, 9))\n", 676 | "arima_fit_aapl = arima_aapl.fit()" 677 | ] 678 | }, 679 | { 680 | 
"cell_type": "code", 681 | "execution_count": null, 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "arima_msft = ARIMA(train_msft, order=(6, 1, 6))\n", 686 | "arima_fit_msft = arima_msft.fit()" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": null, 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "arima_predict_aapl = arima_fit_aapl.predict(start=len(train_aapl), \n", 696 | " end=len(train_aapl)\\\n", 697 | " + len(test_aapl) - 1, \n", 698 | " dynamic=False)\n", 699 | "arima_predict_msft = arima_fit_msft.predict(start=len(train_msft), \n", 700 | " end=len(train_msft)\\\n", 701 | " + len(test_msft) - 1, \n", 702 | " dynamic=False)" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "arima_predict_aapl = pd.DataFrame(arima_predict_aapl)\n", 712 | "arima_predict_aapl.index = diff_test_aapl.index\n", 713 | "arima_predict_msft = pd.DataFrame(arima_predict_msft)\n", 714 | "arima_predict_msft.index = diff_test_msft.index" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": null, 720 | "metadata": {}, 721 | "outputs": [], 722 | "source": [ 723 | "fig, ax = plt.subplots(2, 1, figsize=(18, 15))\n", 724 | " \n", 725 | "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n", 726 | "ax[0].plot(arima_predict_aapl, linestyle='solid', label=\"Prediction\")\n", 727 | "ax[0].set_title('Predicted Stock Price-Apple')\n", 728 | "ax[0].legend(loc='best')\n", 729 | "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n", 730 | "ax[1].plot(arima_predict_msft, linestyle='solid', label=\"Prediction\")\n", 731 | "ax[1].set_title('Predicted Stock Price-Microsoft')\n", 732 | "ax[1].legend(loc='best')\n", 733 | "for ax in ax.flat:\n", 734 | " ax.set(xlabel='Date', ylabel='Differenced Price')\n", 735 | "plt.show()\n" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 
| "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [ 744 | "import itertools" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "p = q = range(0, 9)\n", 754 | "d = range(0, 3)\n", 755 | "pdq = list(itertools.product(p, d, q))\n", 756 | "arima_results_aapl = []\n", 757 | "for param_set in pdq:\n", 758 | " try:\n", 759 | " arima_aapl = ARIMA(train_aapl, order=param_set)\n", 760 | " arima_fitted_aapl = arima_aapl.fit()\n", 761 | " arima_results_aapl.append(arima_fitted_aapl.aic)\n", 762 | " except:\n", 763 | " continue\n", 764 | "print('**'*25)\n", 765 | "print('The Lowest AIC score is {:.4f} and the corresponding parameters are {}'\n", 766 | " .format(pd.DataFrame(arima_results_aapl)\n", 767 | " .where(pd.DataFrame(arima_results_aapl).T.notnull().all()).min()[0], \n", 768 | " pdq[arima_results_aapl.index(min(arima_results_aapl))]))" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": null, 774 | "metadata": {}, 775 | "outputs": [], 776 | "source": [ 777 | "arima_aapl = ARIMA(train_aapl, order=(4, 1, 4))\n", 778 | "arima_fit_aapl = arima_aapl.fit()" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": null, 784 | "metadata": {}, 785 | "outputs": [], 786 | "source": [ 787 | "p = q = range(0, 6)\n", 788 | "d = range(0, 3)\n", 789 | "pdq = list(itertools.product(p, d, q))\n", 790 | "arima_results_msft = []\n", 791 | "for param_set in pdq:\n", 792 | " try:\n", 793 | " arima_msft = ARIMA(stock_prices['MSFT'], order=param_set)\n", 794 | " arima_fitted_msft = arima_msft.fit()\n", 795 | " arima_results_msft.append(arima_fitted_msft.aic)\n", 796 | " except:\n", 797 | " continue\n", 798 | "print('**' * 25)\n", 799 | "print('The lowest AIC score is {:.4f} and parameters are {}'\n", 800 | " .format(pd.DataFrame(arima_results_msft)\n", 801 | " .where(pd.DataFrame(arima_results_msft).T.notnull()\\\n", 
802 | " .all()).min()[0], \n", 803 | " pdq[arima_results_msft.index(min(arima_results_msft))]))" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "arima_msft = ARIMA(stock_prices['MSFT'], order=(4, 2 ,4))\n", 813 | "arima_fit_msft= arima_msft.fit()" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "metadata": {}, 820 | "outputs": [], 821 | "source": [ 822 | "arima_predict_aapl = arima_fit_aapl.predict(start=len(train_aapl), \n", 823 | " end=len(train_aapl)\\\n", 824 | " +len(test_aapl) - 1, \n", 825 | " dynamic=False)\n", 826 | "arima_predict_msft = arima_fit_msft.predict(start=len(train_msft), \n", 827 | " end=len(train_msft)\\\n", 828 | " + len(test_msft) - 1, \n", 829 | " dynamic=False)" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": null, 835 | "metadata": {}, 836 | "outputs": [], 837 | "source": [ 838 | "arima_predict_aapl = pd.DataFrame(arima_predict_aapl)\n", 839 | "arima_predict_aapl.index = diff_test_aapl.index\n", 840 | "arima_predict_msft = pd.DataFrame(arima_predict_msft)\n", 841 | "arima_predict_msft.index = diff_test_msft.index" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": null, 847 | "metadata": {}, 848 | "outputs": [], 849 | "source": [ 850 | "fig, ax = plt.subplots(2, 1, figsize=(18, 15))\n", 851 | " \n", 852 | "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n", 853 | "ax[0].plot(arima_predict_aapl, linestyle='solid', label=\"Prediction\")\n", 854 | "ax[0].set_title('Predicted Stock Price-Apple')\n", 855 | "ax[0].legend(loc='best')\n", 856 | "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n", 857 | "ax[1].plot(arima_predict_msft, linestyle='solid', label=\"Prediction\")\n", 858 | "ax[1].set_title('Predicted Stock Price-Microsoft')\n", 859 | "ax[1].legend(loc='best')\n", 860 | "for ax in ax.flat:\n", 
861 | " ax.set(xlabel='Date', ylabel='Differenced Price')\n", 862 | "plt.show()" 863 | ] 864 | } 865 | ], 866 | "metadata": { 867 | "kernelspec": { 868 | "display_name": "Python 3", 869 | "language": "python", 870 | "name": "python3" 871 | }, 872 | "language_info": { 873 | "codemirror_mode": { 874 | "name": "ipython", 875 | "version": 3 876 | }, 877 | "file_extension": ".py", 878 | "mimetype": "text/x-python", 879 | "name": "python", 880 | "nbconvert_exporter": "python", 881 | "pygments_lexer": "ipython3", 882 | "version": "3.8.8" 883 | }, 884 | "latex_envs": { 885 | "LaTeX_envs_menu_present": true, 886 | "autoclose": false, 887 | "autocomplete": true, 888 | "bibliofile": "biblio.bib", 889 | "cite_by": "apalike", 890 | "current_citInitial": 1, 891 | "eqLabelWithNumbers": true, 892 | "eqNumInitial": 1, 893 | "hotkeys": { 894 | "equation": "Ctrl-E", 895 | "itemize": "Ctrl-I" 896 | }, 897 | "labels_anchors": false, 898 | "latex_user_defs": false, 899 | "report_style_numbering": false, 900 | "user_envs_cfg": false 901 | }, 902 | "toc": { 903 | "base_numbering": 1, 904 | "nav_menu": {}, 905 | "number_sections": true, 906 | "sideBar": true, 907 | "skip_h1_title": false, 908 | "title_cell": "Table of Contents", 909 | "title_sidebar": "Contents", 910 | "toc_cell": false, 911 | "toc_position": {}, 912 | "toc_section_display": true, 913 | "toc_window_display": false 914 | } 915 | }, 916 | "nbformat": 4, 917 | "nbformat_minor": 4 918 | } 919 | -------------------------------------------------------------------------------- /codes/chp_9.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "import numpy as np\n", 12 | "import seaborn as sns; sns.set()\n", 13 | "pd.set_option('use_inf_as_na', True)\n", 14 | "import warnings\n", 15 | 
"warnings.filterwarnings('ignore')\n", 16 | "plt.rcParams['figure.dpi'] = 300\n", 17 | "plt.rcParams['savefig.dpi'] = 300" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "crash_data = pd.read_csv('datasets/crash_data.csv')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "crash_data.head()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "crash_data.date = pd.to_datetime(crash_data.date, format='%Y%m%d')\n", 45 | "crash_data = crash_data.set_index('date')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "crash_dataw = crash_data.groupby('TICKER').resample('W').\\\n", 55 | " agg({'RET':'mean', 'vwretx':'mean', 'VOL':'mean',\n", 56 | " 'BIDLO':'mean', 'ASKHI':'mean', 'PRC':'mean'})" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "crash_dataw = crash_dataw.reset_index()\n", 66 | "crash_dataw.dropna(inplace=True)\n", 67 | "stocks = crash_dataw.TICKER.unique()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "plt.figure(figsize=(12, 8))\n", 77 | "k = 1\n", 78 | "\n", 79 | "for i in stocks[: 4]:\n", 80 | " plt.subplot(2, 2, k)\n", 81 | " plt.hist(crash_dataw[crash_dataw.TICKER == i]['RET'])\n", 82 | " plt.title('Histogram of '+i)\n", 83 | " k+=1\n", 84 | "plt.show()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Firm-specific return" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 
100 | "import statsmodels.api as sm\n", 101 | "residuals = []\n", 102 | "\n", 103 | "for i in stocks:\n", 104 | " Y = crash_dataw.loc[crash_dataw['TICKER'] == i]['RET'].values\n", 105 | " X = crash_dataw.loc[crash_dataw['TICKER'] == i]['vwretx'].values\n", 106 | " X = sm.add_constant(X) # NOTE(review): summing the shifted slices below yields one combined regressor (constant column = 5), not five lead/lag terms - confirm intended\n", 107 | " ols = sm.OLS(Y[2:-2], X[2:-2] + X[1:-3] + X[0:-4] + \\\n", 108 | " X[3:-1] + X[4:]).fit()\n", 109 | " residuals.append(ols.resid)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "residuals = list(map(lambda x: np.log(1 + x), residuals))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Keep rows [2:-2] of every ticker so the frame lines up with the OLS residuals (fit on Y[2:-2]).\n", 128 | "# A single pd.concat replaces the per-ticker DataFrame.append, which pandas 2.0 removed.\n", 129 | "crash_data_sliced = pd.concat(\n", 130 | " [crash_dataw.loc[crash_dataw.TICKER == i][2:-2]\n", 131 | " for i in stocks])\n", 132 | "crash_data_sliced.head()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## Elliptic Envelope " 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "from sklearn.covariance import EllipticEnvelope\n", 149 | "envelope = EllipticEnvelope(contamination=0.02, support_fraction=1)\n", 150 | "ee_predictions = {}\n", 151 | "\n", 152 | "for i, j in zip(range(len(stocks)), stocks):\n", 153 | " envelope.fit(np.array(residuals[i]).reshape(-1, 1))\n", 154 | " ee_predictions[j] = envelope.predict(np.array(residuals[i])\n", 155 | " .reshape(-1, 1))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "transform = []\n", 165 | "\n", 166 | "for i in stocks:\n", 167 | " for j in range(len(ee_predictions[i])):\n", 168 | "
transform.append(np.where(ee_predictions[i][j] == 1, 0, -1))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "crash_data_sliced = crash_data_sliced.reset_index()\n", 178 | "crash_data_sliced['residuals'] = np.concatenate(residuals)\n", 179 | "crash_data_sliced['neg_outliers'] = np.where((np.array(transform)) == -1, 1, 0)\n", 180 | "crash_data_sliced.loc[(crash_data_sliced.neg_outliers == 1) &\n", 181 | " (crash_data_sliced.residuals > 0),\n", 182 | " 'neg_outliers'] = 0" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "crash_data_sliced['neg_outliers'].value_counts()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "plt.figure(figsize=(12, 8)) \n", 201 | "k=1\n", 202 | "\n", 203 | "for i in stocks[8:12]:\n", 204 | " plt.subplot(2, 2, k)\n", 205 | " crash_data_sliced['residuals'][crash_data_sliced.TICKER == i]\\\n", 206 | " .hist(label='normal', bins=30, color='gray')\n", 207 | " outliers = crash_data_sliced['residuals'][(crash_data_sliced.TICKER == i) &\n", 208 | " (crash_data_sliced.neg_outliers > 0)]\n", 209 | " outliers.hist(color='black', label='anomaly') \n", 210 | " plt.title(i)\n", 211 | " plt.legend()\n", 212 | " k+=1\n" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "crash_data_sliced = crash_data_sliced.set_index('date')\n", 222 | "crash_data_sliced.index = pd.to_datetime(crash_data_sliced.index)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "std = crash_data.groupby('TICKER')['RET'].resample('W').std()\\\n", 232 | " .reset_index()\n", 233 |
"crash_dataw['std'] = pd.DataFrame(std['RET'])" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "yearly_data = crash_data_sliced.groupby('TICKER')['residuals']\\\n", 243 | " .resample('Y').agg(['mean', 'std'])\\\n", 244 | " .reset_index()\n", 245 | "yearly_data.columns = ['TICKER', 'date', 'mean', 'std'] # positional labels: the agg list above fixes the mean/std order (a set literal would not)\n", 246 | "yearly_data.head()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "merge_crash = pd.merge(crash_data_sliced.reset_index(), yearly_data,\n", 256 | " how='outer', on=['TICKER', 'date'])" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "merge_crash[['annual_mean', 'annual_std']] = merge_crash\\\n", 266 | " .sort_values(by=['TICKER',\n", 267 | " 'date'])\\\n", 268 | " .iloc[:, -2:]\\\n", 269 | " .fillna(method='bfill')\n", 270 | "merge_crash['residuals'] = merge_crash.sort_values(by=['TICKER',\n", 271 | " 'date'])\\\n", 272 | " ['residuals']\\\n", 273 | " .fillna(method='ffill')\n", 274 | "merge_crash = merge_crash.drop(merge_crash.iloc[: ,-4:-2], axis=1)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "crash_risk_out = []\n", 284 | "\n", 285 | "for j in stocks:\n", 286 | " for k in range(len(merge_crash[merge_crash.TICKER == j])):\n", 287 | " if merge_crash[merge_crash.TICKER == j]['residuals'].iloc[k] < \\\n", 288 | " merge_crash[merge_crash.TICKER == j]['annual_mean'].iloc[k] - \\\n", 289 | " 3.09 * \\\n", 290 | " merge_crash[merge_crash.TICKER == j]['annual_std'].iloc[k]:\n", 291 | " crash_risk_out.append(1)\n", 292 | " else:\n", 293 | " crash_risk_out.append(0)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 |
"execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "merge_crash['crash_risk'] = crash_risk_out\n", 303 | "merge_crash['crash_risk'].value_counts()" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "merge_crash = merge_crash.set_index('date')\n", 313 | "merge_crash_annual = merge_crash.groupby('TICKER')\\\n", 314 | " .resample('1Y')['crash_risk'].sum().reset_index()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "down = []\n", 324 | "\n", 325 | "for j in range(len(merge_crash)):\n", 326 | " if merge_crash['residuals'].iloc[j] < \\\n", 327 | " merge_crash['annual_mean'].iloc[j]:\n", 328 | " down.append(1)\n", 329 | " else:\n", 330 | " down.append(0)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "merge_crash = merge_crash.reset_index()\n", 340 | "merge_crash['down'] = pd.DataFrame(down)\n", 341 | "merge_crash['up'] = 1 - merge_crash['down']\n", 342 | "down_residuals = merge_crash[merge_crash.down == 1]\\\n", 343 | " [['residuals', 'TICKER', 'date']]\n", 344 | "up_residuals = merge_crash[merge_crash.up == 1]\\\n", 345 | " [['residuals', 'TICKER', 'date']]" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "down_residuals['residuals_down_sq'] = down_residuals['residuals'] ** 2\n", 355 | "down_residuals['residuals_down_cubic'] = down_residuals['residuals'] **3\n", 356 | "up_residuals['residuals_up_sq'] = up_residuals['residuals'] ** 2\n", 357 | "up_residuals['residuals_up_cubic'] = up_residuals['residuals'] ** 3\n", 358 | "down_residuals['down_residuals'] = down_residuals['residuals']\n", 359 | "up_residuals['up_residuals'] = 
up_residuals['residuals']\n", 360 | "del down_residuals['residuals']\n", 361 | "del up_residuals['residuals']" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "merge_crash['residuals_sq'] = merge_crash['residuals'] ** 2\n", 371 | "merge_crash['residuals_cubic'] = merge_crash['residuals'] ** 3" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "merge_crash_all = merge_crash.merge(down_residuals,\n", 381 | " on=['TICKER', 'date'],\n", 382 | " how='outer')\n", 383 | "merge_crash_all = merge_crash_all.merge(up_residuals,\n", 384 | " on=['TICKER', 'date'],\n", 385 | " how='outer')" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "cols = ['BIDLO', 'ASKHI', 'residuals', \n", 395 | " 'annual_std', 'residuals_sq', 'residuals_cubic',\n", 396 | " 'down', 'up', 'residuals_up_sq', 'residuals_down_sq',\n", 397 | " 'neg_outliers']\n", 398 | "merge_crash_all = merge_crash_all.set_index('date')\n", 399 | "merge_grouped = merge_crash_all.groupby('TICKER')[cols]\\\n", 400 | " .resample('1Y').sum().reset_index()\n", 401 | "merge_grouped['neg_outliers'] = np.where(merge_grouped.neg_outliers >=\n", 402 | " 1, 1, 0)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "merge_grouped = merge_grouped.set_index('date')\n", 412 | "merge_all = merge_grouped.groupby('TICKER')\\\n", 413 | " .resample('1Y').agg({'down':['sum', 'count'],\n", 414 | " 'up':['sum', 'count']})\\\n", 415 | " .reset_index()\n", 416 | "merge_all.head()" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "merge_grouped['down'] = 
merge_all['down']['sum'].values\n", 426 | "merge_grouped['up'] = merge_all['up']['sum'].values\n", 427 | "merge_grouped['count'] = merge_grouped['down'] + merge_grouped['up']" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "merge_grouped = merge_grouped.reset_index()" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "merge_grouped['duvol'] = np.log(((merge_grouped['up'] - 1) * \n", 446 | " merge_grouped['residuals_down_sq']) /\n", 447 | " ((merge_grouped['down'] - 1) * \n", 448 | " merge_grouped['residuals_up_sq']))" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "merge_grouped['duvol'].mean()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "merge_grouped['ncskew'] = - (((merge_grouped['count'] * \n", 467 | " (merge_grouped['count'] - 1) **\n", 468 | " (3 / 2)) * \n", 469 | " merge_grouped['residuals_cubic']) / \n", 470 | " (((merge_grouped['count'] - 1) * \n", 471 | " (merge_grouped['count'] - 2)) * \n", 472 | " merge_grouped['residuals_sq'] **\n", 473 | " (3 / 2)))" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "merge_grouped['ncskew'].mean()" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": { 489 | "scrolled": true 490 | }, 491 | "outputs": [], 492 | "source": [ 493 | "merge_grouped['crash_risk'] = merge_crash_annual['crash_risk']\n", 494 | "merge_grouped['crash_risk'] = np.where(merge_grouped.crash_risk >= \n", 495 | " 1, 1, 0)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 
null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "merge_crash_all_grouped2 = merge_crash_all.groupby('TICKER')\\\n", 505 | " [['VOL', 'PRC']]\\\n", 506 | " .resample('1Y').mean().reset_index()\n", 507 | "merge_grouped[['VOL', 'PRC']] = merge_crash_all_grouped2[['VOL', 'PRC']]" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "merge_grouped[['ncskew','duvol']].corr()" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "## Balance Sheet Data" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "bs = pd.read_csv('datasets/bs_v.3.csv')\n", 533 | "bs['Date'] = pd.to_datetime(bs.datadate, format='%Y%m%d')\n", 534 | "bs['annual_date'] = bs['Date'].dt.year" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "bs['RoA'] = bs['ni'] / bs['at']\n", 544 | "bs['leverage'] = bs['lt'] / bs['at']" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "merge_grouped['annual_date'] = merge_grouped['date'].dt.year\n", 554 | "bs['TICKER'] = bs.tic\n", 555 | "del bs['tic']" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "merge_ret_bs = pd.merge(bs, merge_grouped,\n", 565 | " on=['TICKER', 'annual_date'])" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "merge_ret_bs2 = merge_ret_bs.set_index('Date')\n", 575 | "merge_ret_bs2 = merge_ret_bs2.groupby('TICKER').resample('Y').mean()\n", 576 |
"merge_ret_bs2.reset_index(inplace=True)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "merge_ret_bs2['vol_csho_diff'] = (merge_ret_bs2.groupby('TICKER')\n", 586 | " ['VOL'].shift(-1) / \n", 587 | " merge_ret_bs2.groupby('TICKER')\n", 588 | " ['csho'].shift(-1))\n", 589 | "merge_ret_bs2['dturn1'] = merge_ret_bs2['VOL'] / merge_ret_bs2['csho']\n", 590 | "merge_ret_bs2['dturn'] = merge_ret_bs2['vol_csho_diff'] - \\\n", 591 | " merge_ret_bs2['dturn1']" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "merge_ret_bs2['p/e'] = merge_ret_bs2['PRC'] / merge_ret_bs2['ni']\n", 601 | "merge_ret_bs2['turnover_rate'] = merge_ret_bs2['VOL'] / \\\n", 602 | " merge_ret_bs2['csho']\n", 603 | "merge_ret_bs2['equity_share'] = merge_ret_bs2['ceq'] / \\\n", 604 | " (merge_ret_bs2['ceq'] +\n", 605 | " merge_ret_bs2['dt'])\n", 606 | "merge_ret_bs2['firm_size'] = np.log(merge_ret_bs2['at'])\n", 607 | "merge_ret_bs2['cefd'] = (((merge_ret_bs2['at'] -\n", 608 | " merge_ret_bs2['lt']) / merge_ret_bs2['csho']) - \n", 609 | " merge_ret_bs2['PRC']) / ((merge_ret_bs2['at'] - \n", 610 | " merge_ret_bs2['lt']) / merge_ret_bs2['csho']) # divide by (at - lt) / csho, the same per-share term the numerator starts from" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "merge_ret_bs2 = merge_ret_bs2.set_index('Date')\n", 620 | "merge_ret_bs2['buying_volume'] = merge_ret_bs2['VOL'] * \\\n", 621 | " (merge_ret_bs2['PRC'] - \n", 622 | " merge_ret_bs2['BIDLO']) / \\\n", 623 | " (merge_ret_bs2['ASKHI'] - \n", 624 | " merge_ret_bs2['BIDLO'])\n", 625 | "merge_ret_bs2['selling_volume'] = merge_ret_bs2['VOL'] * \\\n", 626 | " (merge_ret_bs2['ASKHI'] - \n", 627 | " merge_ret_bs2['PRC']) / \\\n", 628 | " (merge_ret_bs2['ASKHI'] - \n", 629 | " merge_ret_bs2['BIDLO'])\n", 630 | "buying_volume =
merge_ret_bs2.groupby('TICKER')['buying_volume'] \\\n", 631 | " .resample('Y').sum().reset_index()\n", 632 | "selling_volume = merge_ret_bs2.groupby('TICKER')['selling_volume'] \\\n", 633 | " .resample('Y').sum().reset_index()\n", 634 | "del buying_volume['TICKER']\n", 635 | "del buying_volume['Date']" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "buy_sel_vol = pd.concat([buying_volume,selling_volume], axis=1)\n", 645 | "buy_sel_vol['bsi'] = (buy_sel_vol.buying_volume - \n", 646 | " buy_sel_vol.selling_volume) / \\\n", 647 | " (buy_sel_vol.buying_volume + \n", 648 | " buy_sel_vol.selling_volume)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "merge_ret_bs2 = merge_ret_bs2.reset_index()\n", 658 | "merge_ret_bs2 = pd.merge(buy_sel_vol ,merge_ret_bs2,\n", 659 | " on=['TICKER', 'Date'])" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "## Firm Sentiment via PCA" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": null, 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "from sklearn.preprocessing import StandardScaler\n", 676 | "from sklearn.decomposition import PCA" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": null, 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "firm_sentiment = merge_ret_bs2[['p/e', 'turnover_rate',\n", 686 | " 'equity_share', 'cefd',\n", 687 | " 'leverage', 'bsi']]\n", 688 | "firm_sentiment = firm_sentiment.apply(lambda x: x.fillna(x.mean()),\n", 689 | " axis=0)" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": null, 695 | "metadata": { 696 | "scrolled": true 697 | }, 698 | "outputs": [], 699 | "source": [ 700 | "firm_sentiment_std = 
StandardScaler().fit_transform(firm_sentiment)\n", 701 | "pca = PCA(n_components=6)\n", 702 | "pca_market_sentiment = pca.fit_transform(firm_sentiment_std)\n", 703 | "print('Explained Variance Ratios per Component are:\\n {}'\\\n", 704 | " .format(pca.explained_variance_ratio_))" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [ 713 | "loadings_1 = pd.DataFrame(pca.components_.T * \n", 714 | " np.sqrt(pca.explained_variance_), \n", 715 | " columns=['PC1', 'PC2', 'PC3',\n", 716 | " 'PC4', 'PC5', 'PC6'],\n", 717 | " index=firm_sentiment.columns)\n", 718 | "loadings_1" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [ 727 | "df_loading1 = pd.DataFrame(loadings_1.mean(axis=1))\n", 728 | "df_loading1" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [ 737 | "firm_sentiment = pd.DataFrame(np.dot(pca_market_sentiment,\n", 738 | " np.array(df_loading1)))\n", 739 | "merge_ret_bs2['firm_sent'] = firm_sentiment" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "## Panel Data Application" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": null, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "merge_ret_bs2['log_size'] = np.log(merge_ret_bs2['at'])" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": null, 761 | "metadata": {}, 762 | "outputs": [], 763 | "source": [ 764 | "merge_ret_bs2.set_index(['TICKER', 'Date'], inplace=True)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "X = (merge_ret_bs2[['log_size', 'rect', 'ppegt', 'dturn',\n", 774 | " 'ncskew', 'residuals', 'RoA', 
'annual_std',\n", 775 | " 'firm_sent']]).shift(1)\n", 776 | "X['neg_outliers'] = merge_ret_bs2['neg_outliers']" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": null, 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [ 785 | "from pyeconometrics.panel_discrete_models import FixedEffectPanelModel\n", 786 | "from sklearn.model_selection import train_test_split\n", 787 | "from sklearn.metrics import accuracy_score" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": { 794 | "scrolled": true 795 | }, 796 | "outputs": [], 797 | "source": [ 798 | "FE_ML = FixedEffectPanelModel()\n", 799 | "FE_ML.fit(X, 'neg_outliers')\n", 800 | "FE_ML.summary()" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [ 809 | "del X['neg_outliers']\n", 810 | "X['crash_risk'] = merge_ret_bs2['crash_risk']" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": null, 816 | "metadata": { 817 | "scrolled": false 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "FE_crash = FixedEffectPanelModel()\n", 822 | "FE_crash.fit(X, 'crash_risk')\n", 823 | "FE_crash.summary()" 824 | ] 825 | } 826 | ], 827 | "metadata": { 828 | "kernelspec": { 829 | "display_name": "Python 3", 830 | "language": "python", 831 | "name": "python3" 832 | }, 833 | "language_info": { 834 | "codemirror_mode": { 835 | "name": "ipython", 836 | "version": 3 837 | }, 838 | "file_extension": ".py", 839 | "mimetype": "text/x-python", 840 | "name": "python", 841 | "nbconvert_exporter": "python", 842 | "pygments_lexer": "ipython3", 843 | "version": "3.8.8" 844 | }, 845 | "latex_envs": { 846 | "LaTeX_envs_menu_present": true, 847 | "autoclose": false, 848 | "autocomplete": true, 849 | "bibliofile": "biblio.bib", 850 | "cite_by": "apalike", 851 | "current_citInitial": 1, 852 | "eqLabelWithNumbers": true, 853 | "eqNumInitial": 1, 854 | 
"hotkeys": { 855 | "equation": "Ctrl-E", 856 | "itemize": "Ctrl-I" 857 | }, 858 | "labels_anchors": false, 859 | "latex_user_defs": false, 860 | "report_style_numbering": false, 861 | "user_envs_cfg": false 862 | }, 863 | "toc": { 864 | "base_numbering": 1, 865 | "nav_menu": {}, 866 | "number_sections": false, 867 | "sideBar": true, 868 | "skip_h1_title": false, 869 | "title_cell": "Table of Contents", 870 | "title_sidebar": "Contents", 871 | "toc_cell": false, 872 | "toc_position": {}, 873 | "toc_section_display": true, 874 | "toc_window_display": false 875 | } 876 | }, 877 | "nbformat": 4, 878 | "nbformat_minor": 4 879 | } 880 | --------------------------------------------------------------------------------