├── codes
    ├── pyeconometrics
    │   ├── __pycache__
    │   │   ├── 1
    │   │   ├── base.cpython-38.pyc
    │   │   ├── utils.cpython-38.pyc
    │   │   ├── __init__.cpython-38.pyc
    │   │   └── panel_discrete_models.cpython-38.pyc
    │   ├── requirements.txt
    │   ├── setup.py
    │   ├── README.md
    │   ├── utils.py
    │   ├── base.py
    │   ├── censored_data_models.py
    │   └── panel_discrete_models.py
    ├── pyeconometrics.egg-info
    │   ├── dependency_links.txt
    │   ├── top_level.txt
    │   ├── SOURCES.txt
    │   └── PKG-INFO
    ├── datasets
    │   ├── FraudTrain.txt
    │   ├── bs_v.3.txt
    │   ├── diff_test_msft.csv
    │   ├── diff_test_aapl.csv
    │   ├── diff_train_msft.csv
    │   └── diff_train_aapl.csv
    ├── chp_1.ipynb
    ├── chp_3.ipynb
    ├── chp_10.ipynb
    ├── chp_7.ipynb
    ├── chp_2.ipynb
    └── chp_9.ipynb
├── README.md
├── requirements.txt
└── License.txt


/codes/pyeconometrics/__pycache__/1:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/codes/pyeconometrics.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/codes/pyeconometrics.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | pyeconometrics
2 | 


--------------------------------------------------------------------------------
/codes/pyeconometrics/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | matplotlib
4 | scikit-learn
5 | pandas


--------------------------------------------------------------------------------
/codes/pyeconometrics/__pycache__/base.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abdullahkarasan/mlfrm/HEAD/codes/pyeconometrics/__pycache__/base.cpython-38.pyc


--------------------------------------------------------------------------------
/codes/pyeconometrics/__pycache__/utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abdullahkarasan/mlfrm/HEAD/codes/pyeconometrics/__pycache__/utils.cpython-38.pyc


--------------------------------------------------------------------------------
/codes/pyeconometrics/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abdullahkarasan/mlfrm/HEAD/codes/pyeconometrics/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/codes/pyeconometrics/__pycache__/panel_discrete_models.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abdullahkarasan/mlfrm/HEAD/codes/pyeconometrics/__pycache__/panel_discrete_models.cpython-38.pyc


--------------------------------------------------------------------------------
/codes/datasets/FraudTrain.txt:
--------------------------------------------------------------------------------
1 | As the FraudTrain dataset is larger than 25 MB, I have uploaded it to Google Drive.
2 | You can download the dataset from the following link:
3 | 
4 | https://drive.google.com/file/d/1Ko15MscTWzgVIKH64yT0zVeUt0OqmUff/view
5 | 


--------------------------------------------------------------------------------
/codes/datasets/bs_v.3.txt:
--------------------------------------------------------------------------------
1 | As the bs_v.3 dataset is larger than 25 MB, I have uploaded it to Google Drive.
2 | You can download the dataset from the following link:
3 | 
4 | https://drive.google.com/file/d/1NlVFxDoZXl-eVSyL1ZWXUX4fNt07QopR/view?usp=sharing
5 | 


--------------------------------------------------------------------------------
/codes/pyeconometrics.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | README.md
 2 | setup.py
 3 | pyeconometrics/__init__.py
 4 | pyeconometrics/base.py
 5 | pyeconometrics/censored_data_models.py
 6 | pyeconometrics/panel_discrete_models.py
 7 | pyeconometrics/setup.py
 8 | pyeconometrics/utils.py
 9 | pyeconometrics.egg-info/PKG-INFO
10 | pyeconometrics.egg-info/SOURCES.txt
11 | pyeconometrics.egg-info/dependency_links.txt
12 | pyeconometrics.egg-info/top_level.txt


--------------------------------------------------------------------------------
/codes/pyeconometrics/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |     name='pyeconometrics',
 5 |     version='1.0.2',
 6 |     description='Econometrics Models for Python',
 7 |     long_description=open('README.md').read(),
 8 |     author='Nicolas HENNETIER',
 9 |     author_email='nicolashennetier2@gmail.com',
10 |     packages=['pyeconometrics'],
11 |     requires=['numpy', 'pandas', 'scipy', 'matplotlib', 'sklearn']
12 | )


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine Learning for Financial Risk Management with Python
2 | 
3 | This repository provides Python code and Jupyter Notebooks accompanying the Machine Learning for Financial Risk Management with Python book published by O'Reilly.
4 | 
5 | Buy the book on [Amazon](https://www.amazon.com/Machine-Learning-Financial-Management-Python/dp/1492085251).
6 | 
7 | 
8 | <img width="596" alt="github_cover" src="https://user-images.githubusercontent.com/67332480/144302816-b5caa990-ffd4-4250-b3bb-6246e1a17677.png">
9 | 


--------------------------------------------------------------------------------
/codes/pyeconometrics/README.md:
--------------------------------------------------------------------------------
 1 | Python Econometrics Models
 2 | ===========================
 3 | Python package to build econometrics models.
 4 | 
 5 | Available models
 6 | ----------------
 7 | 
 8 | - Fixed Effects Logistic Regression (Logit)
 9 | - Random Effects Logistic Regression (Logit and Probit)
10 | - Tobit I (Linear Regression for truncated data)
11 | 
12 | 
13 | Installing from Source
14 | ----------------------
15 | 
16 | Download and extract the source distribution from Github
17 | 
18 |     https://github.com/nicolashennetier/pyeconometrics
19 | 
20 | Or clone the bleeding edge code from our repository on github at
21 | 
 22 |     git clone https://github.com/nicolashennetier/pyeconometrics.git
23 | 
24 | In the pyeconometrics directory do (with proper permissions)
25 | 
26 |     python setup.py install


--------------------------------------------------------------------------------
/codes/datasets/diff_test_msft.csv:
--------------------------------------------------------------------------------
 1 | Date,MSFT
 2 | 2020-11-24,3.75
 3 | 2020-11-25,0.0099945068359375
 4 | 2020-11-27,1.3600006103515625
 5 | 2020-11-30,-1.1599884033203125
 6 | 2020-12-01,2.1399993896484375
 7 | 2020-12-02,-0.8400115966796875
 8 | 2020-12-03,-1.1299896240234375
 9 | 2020-12-04,0.1199951171875
10 | 2020-12-07,-0.07000732421875
11 | 2020-12-08,1.720001220703125
12 | 2020-12-09,-4.209991455078125
13 | 2020-12-10,-1.279998779296875
14 | 2020-12-11,2.739990234375
15 | 2020-12-14,0.94000244140625
16 | 2020-12-15,-0.0699920654296875
17 | 2020-12-16,5.149993896484375
18 | 2020-12-17,0.1399993896484375
19 | 2020-12-18,-0.8300018310546875
20 | 2020-12-21,4.0
21 | 2020-12-22,1.350006103515625
22 | 2020-12-23,-2.9199981689453125
23 | 2020-12-24,1.7299957275390625
24 | 2020-12-28,2.2100067138671875
25 | 2020-12-29,-0.8100128173828125
26 | 2020-12-30,-2.470001220703125
27 | 2020-12-31,0.7400054931640625
28 | 


--------------------------------------------------------------------------------
/codes/datasets/diff_test_aapl.csv:
--------------------------------------------------------------------------------
 1 | Date,AAPL
 2 | 2020-11-24,1.3199996948242188
 3 | 2020-11-25,0.8600006103515625
 4 | 2020-11-27,0.55999755859375
 5 | 2020-11-30,2.4600067138671875
 6 | 2020-12-01,3.6699981689453125
 7 | 2020-12-02,0.3600006103515625
 8 | 2020-12-03,-0.1399993896484375
 9 | 2020-12-04,-0.69000244140625
10 | 2020-12-07,1.5
11 | 2020-12-08,0.6299972534179688
12 | 2020-12-09,-2.5999984741210938
13 | 2020-12-10,1.4599990844726562
14 | 2020-12-11,-0.8299942016601562
15 | 2020-12-14,-0.6300048828125
16 | 2020-12-15,6.099998474121094
17 | 2020-12-16,-0.06999969482421875
18 | 2020-12-17,0.8899993896484375
19 | 2020-12-18,-2.0399932861328125
20 | 2020-12-21,1.5699920654296875
21 | 2020-12-22,3.6500091552734375
22 | 2020-12-23,-0.9199981689453125
23 | 2020-12-24,1.0099945068359375
24 | 2020-12-28,4.720001220703125
25 | 2020-12-29,-1.82000732421875
26 | 2020-12-30,-1.149993896484375
27 | 2020-12-31,-1.029998779296875
28 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | #requirements
 2 | # Note: At the time the book was written, the costcla library was compatible with scikit-learn version 0.22.
 3 | arch==5.0.1
 4 | arviz==0.11.2
 5 | basemap==1.2.1
 6 | copulae==0.7.5
 7 | copulas==0.5.1
 8 | costcla==0.6
 9 | ctgan==0.4.3
10 | decorator==4.4.2
11 | gap==0.4.6
12 | gap-stat==2.0.1
13 | graphviz==0.17
14 | hmmlearn==0.2.6
15 | keras==2.6.0
16 | missingno==0.5.0
17 | mpl-toolkits.clifford==0.0.3
18 | numpy==1.21.2
19 | numpy-financial==1.0.0
20 | pandas==1.1.4
21 | pandas-datareader==0.10.0
22 | plotly==5.2.1
23 | pmdarima==1.8.2
24 | portfoliolab==0.3.0
25 | py4j==0.10.9
26 | pyensae==1.3.884
27 | pymc3==3.11.4
28 | pyportfolioopt==1.4.2
29 | python-dateutil==2.8.0
30 | Quandl==3.6.1
31 | quantecon==0.5.1
32 | scikit-image==0.18.1
33 | scikit-learn==1.0.1
34 | scipy==1.6.0
35 | sklearn-som==1.1.0
36 | table-evaluator==1.2.2.post1
37 | tables==3.6.1
38 | tensorboard==2.6.0
39 | tensorflow==2.6.0
40 | xgboost==1.4.2
41 | yellowbrick==1.3.post1
42 | yfinance==0.1.63


--------------------------------------------------------------------------------
/License.txt:
--------------------------------------------------------------------------------
 1 | All the contents and codes, Jupyter Notebooks and other materials in this Github repository related to Machine Learning for Financial Risk Management by Abdullah Karasan, PhD are copyrighted and only intended for personal use.
 2 | 
 3 | Any kind of sharing, distribution, duplication, etc. without written permission by the O'Reilly is prohibited.
 4 | 
 5 | The contents, Python codes, Jupyter Notebooks and other materials come without warranties or representations, to the extent permitted by applicable law.
 6 | 
 7 | Notice that the code provided might be work in progress and that substantial additions, changes, updates, etc. can take place in the future. It is advised to regularly check for updates.
 8 | 
 9 | None of the material represents any kind of recommendation or investment advice. The material is only meant as a technical illustration. Leveraged and unleveraged trading of financial instruments, and contracts for difference (CFDs) in particular, involves a number of risks. Make sure to understand and manage these risks.
10 | 
11 | (c) Abdullah Karasan, December 2021.
12 | 


--------------------------------------------------------------------------------
/codes/pyeconometrics.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: pyeconometrics
 3 | Version: 1.0.2
 4 | Summary: Econometrics Models for Python
 5 | Home-page: UNKNOWN
 6 | Author: Nicolas HENNETIER
 7 | Author-email: nicolashennetier2@gmail.com
 8 | License: UNKNOWN
 9 | Description: Python Econometrics Models
10 |         ===========================
11 |         Python package to build econometrics models.
12 |         
13 |         Available models
14 |         ----------------
15 |         
16 |         - Fixed Effects Logistic Regression (Logit)
17 |         - Random Effects Logistic Regression (Logit and Probit)
18 |         - Tobit I (Linear Regression for truncated data)
19 |         
20 |         
21 |         Installing from Source
22 |         ----------------------
23 |         
24 |         Download and extract the source distribution from Github
25 |         
26 |             https://github.com/nicolashennetier/pyeconometrics
27 |         
28 |         Or clone the bleeding edge code from our repository on github at
29 |         
30 |             git clone git://github.com/nicolashennetier/pyeconometrics.git
31 |         
32 |         In the pyeconometrics directory do (with proper permissions)
33 |         
34 |             python setup.py install
35 | Platform: UNKNOWN
36 | Requires: numpy
37 | Requires: pandas
38 | Requires: scipy
39 | Requires: matplotlib
40 | Requires: sklearn
41 | 


--------------------------------------------------------------------------------
/codes/pyeconometrics/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from math import sqrt, factorial, exp
 3 | import scipy.stats as st
 4 | 
 5 | 
 6 | def norm_cdf(x):
 7 |     a1 =  0.254829592
 8 |     a2 = -0.284496736
 9 |     a3 =  1.421413741
10 |     a4 = -1.453152027
11 |     a5 =  1.061405429
12 |     p  =  0.3275911
13 |     sign = 1
14 |     if x < 0:
15 |         sign = -1
16 |     x = abs(x)/sqrt(2.0)
17 |     t = 1.0/(1.0 + p*x)
18 |     y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-x*x)
19 |     return 0.5*(1.0 + sign*y)
20 |     
def unique_permutations(seq):
    '''Yield each distinct permutation of ``seq`` once, in ascending order.

    ``seq`` is sorted first, then successive permutations are produced by
    rearranging a working list in place. Each permutation is yielded as a
    new list, so results can be stored or mutated by the caller (for example
    ``list(unique_permutations(seq))``) without being overwritten by later
    iterations.
    '''
    current = sorted(seq)
    last = len(current) - 1
    while True:
        # Hand out a copy: ``current`` is rearranged in place below, and
        # yielding it directly would make every yielded value alias the
        # same list.
        yield list(current)

        # Rightmost position whose element is smaller than its successor.
        for k in range(last - 1, -1, -1):
            if current[k] < current[k + 1]:
                break
        else:
            # The list is non-increasing: that was the last permutation.
            return

        # Rightmost element strictly greater than the pivot; one always
        # exists to the right of k because current[k] < current[k + 1].
        for i in range(last, k, -1):
            if current[k] < current[i]:
                break
        current[k], current[i] = current[i], current[k]
        # Reverse the tail so it becomes the smallest arrangement again.
        current[k + 1:] = current[:k:-1]
38 | 
def nCr(n,r):
    '''Return the binomial coefficient ``n! / (r! * (n-r)!)`` as a float.

    When the coefficient cannot be evaluated, the sentinel value ``101`` is
    returned instead of raising. This happens for a negative or non-integer
    argument (which includes ``r > n``) and when the quotient is too large
    to be represented as a float.
    NOTE(review): callers presumably treat 101 as "too many combinations";
    verify against the call sites.
    '''
    try:
        return factorial(n) / factorial(r) / factorial(n-r)
    except (ValueError, TypeError, OverflowError):
        # Only the arithmetic failures are mapped to the sentinel, so
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return 101
44 | 
45 | 
def inverse_mills_ratio(x):
    '''Return ``pdf(x) / cdf(x)`` of the standard normal distribution.

    The ratio is evaluated as ``exp(logpdf(x) - logcdf(x))``. Far in the
    lower tail both ``pdf`` and ``cdf`` underflow to 0, so the direct
    quotient yields ``nan``; the log-space form stays finite there. ``x``
    may be a scalar or an array-like, and the result has the same shape.
    '''
    # Local import: this module does not import numpy at file level.
    import numpy as np

    return np.exp(st.norm.logpdf(x) - st.norm.logcdf(x))
48 | 
49 | 
def derivate_inverse_mills_ratio(x):
    '''Return ``-imr(x) * (x + imr(x))`` where ``imr`` is ``inverse_mills_ratio``.'''
    ratio = inverse_mills_ratio(x)
    return -ratio * (x + ratio)


--------------------------------------------------------------------------------
/codes/pyeconometrics/base.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | import scipy.stats as st
  4 | import matplotlib.pyplot as plt
  5 | 
  6 | import warnings
  7 | warnings.filterwarnings('ignore')
  8 | 
  9 | from pyeconometrics.utils import norm_cdf
 10 | 
 11 | 
 12 | 
 13 | class Results():
 14 |     def model_description(self):
 15 |         print('='*80)
 16 |         print('%-17s %30s %1s %-20s %8s' \
 17 |             % ('Dep. Variable:', self.output, ' ', 'Pseudo R-squ.:', 
 18 |                 "%.5f" % (1-self.final_ll/self.init_ll)))
 19 |         print('%-17s %30s %1s %-20s %8s' \
 20 |             % ('Model:', self.name, ' ', 'Log-Likelihood:', "%.3f" % self.final_ll))
 21 |         print('%-17s %30s %1s %-20s %8s' \
 22 |             % ('Method:', 'MLE', ' ', 'LL-Null:', "%.3f" % self.init_ll))
 23 |         print('%-17s %30s %1s %-20s %8s' \
 24 |             % ('No. Observations:', self.nb_obs, ' ', 'LLR p-value:',
 25 |                 "%.3f" % (1-st.chi2.cdf(2 * (self.final_ll - self.init_ll),
 26 |                 len(self.beta)))))
 27 |         print('%-17s %30s %1s %-20s %8s' \
 28 |             % ('Df Model:', len(self.beta)-1, ' ', ' ', ' '))
 29 |         print('%-17s %30s %1s %-20s %8s' \
 30 |             % ('Converged:', self.converged, ' ', ' ', ' '))
 31 |         print('='*80)
 32 | 
 33 |     def columns_header(self):
 34 |         print('%25s %8s %8s %8s %8s %18s' \
 35 |             % (' ', 'coef', 'std err', 't', 'P>|t|', '[95.0% Conf. Int.]'))
 36 |         print('-'*80)
 37 | 
 38 |     def beta_description(self):
 39 |         for i,var in enumerate([x for x in self.variables if x != self.output]):
 40 |             print('%-24s %8s %8s %8s %8s %9s %9s' \
 41 |                 % (var, 
 42 |                    "%.4f" % self.beta[i], 
 43 |                    "%.3f" % self.beta_se[i], 
 44 |                    "%.3f" % (self.beta[i] / self.beta_se[i]), 
 45 |                    "%.3f" % (1-st.norm.cdf(abs(self.beta[i]) / self.beta_se[i])), 
 46 |                    "%.3f" % self.confidence_interval[i, 0], 
 47 |                    "%.3f" % self.confidence_interval[i, 1])
 48 |                 )
 49 |         print('-'*80)
 50 | 
 51 |     def sigma_description(self):
 52 |         print('%-24s %8s %8s %8s %8s %9s %9s' \
 53 |             % ('/sigma', 
 54 |                "%.4f" % self.sigma, 
 55 |                "%.3f" % self.beta_se[-1], 
 56 |                "", 
 57 |                "", 
 58 |                "%.3f" % self.confidence_interval[-1, 0], 
 59 |                "%.3f" % self.confidence_interval[-1, 1])
 60 |             )
 61 |         print('-'*80)
 62 | 
 63 |     def censored_data_description(self):
 64 |         print('%27s %-52s' \
 65 |             % ('Obs. summary:',
 66 |                 '%s censored observations' % self.nb_censored_obs))
 67 |         print('%27s %-52s' \
 68 |             % ('',
 69 |                 '%s uncensored observations' % self.nb_uncensored_obs))
 70 | 
 71 | 
 72 | class BaseModel(Results):
 73 |     '''Base class inherited by other models
 74 |     Not intended to be used separately
 75 |     '''
 76 |     def input_data_preparation(self, X, drop_na=None, fill_value=None):
 77 |         X = self.handle_missing_values(X, drop_na, fill_value)
 78 |         return X
 79 | 
 80 |     def handle_missing_values(self, X, drop_na=None, fill_value=None):
 81 |         if drop_na is not None:
 82 |             if drop_na:
 83 |                 X.dropna(inplace=True)
 84 |             elif fill_value == 'mean':
 85 |                 X.fillna(X.mean(), inplace=True)
 86 |             elif fill_value == 'median':
 87 |                 X.fillna(X.median(), inplace=True)
 88 |             elif fill_value is not None:
 89 |                 for var in X.columns:
 90 |                     try:
 91 |                         X[var].fillna(fill_value.get(var), inplace=True)
 92 |                     except:
 93 |                         raise ValueError('\'fill_value\' argument must be in list ' \
 94 |                             + '[\'mean\', \'median\'] or of type dict. See docstring for more info.')
 95 | 
 96 |         return X
 97 | 
 98 |     def plot_trace_estimators(self):
 99 |         if self.beta is None:
100 |             raise AttributeError('Fit method should be called before evaluating of the model')
101 |             
102 |         colors = ['b','g','r','c','m','y','k']
103 |         for k in range(len(self.beta)):
104 |             plt.plot(np.arange(1, len(self.beta_est)+1),
105 |                      self.beta_est[:,k],
106 |                      color=colors[(k-1) % len(colors)],
107 |                      label="Beta_%s" % k)
108 | 
109 |         plt.xlim((1,len(self.beta_est)*1.2))
110 |         plt.xlabel('Iterations')
111 |         plt.ylabel('Estimators')
112 |         plt.title('Trace plot of estimators of beta', size=16)
113 |         plt.legend(loc='best')
114 |         plt.show()
115 | 
116 |     def predict(self, X):
117 |         if self.beta is None:
118 |             raise AttributeError('Fit method should be called before evaluating the model.')
119 | 
120 |         X = self.input_data_preparation(X)
121 |         X.insert(0, '_cons', 1)
122 | 
123 |         Z = self.response_function(X, self.beta)
124 |         result = (np.sign(Z)+1)/2
125 | 
126 |         try:
127 |             result = result.astype(int)
128 |         except:
129 |             raise ValueError('One or several data are missing.')
130 | 
131 |         return result
132 |         
133 |     def predict_proba(self, X):
134 |         if self.beta is None:
135 |             raise AttributeError('Fit method should be called before evaluating the model.')
136 | 
137 |         X = self.input_data_preparation(X)
138 |         X.insert(0, '_cons', 1)
139 | 
140 |         Z = self.response_function(X,self.beta)
141 |         return Z.apply(lambda x : norm_cdf(x))
142 | 
143 |     def summary(self):
144 |         if self.beta is None:
145 |             raise AttributeError('Fit method should be called before evaluating of the model.')
146 | 
147 |         self.model_description()
148 |         self.columns_header()
149 |         self.beta_description()
150 | 
151 | 
152 | 
class PanelBaseModel(BaseModel):
    '''Base class inherited by other models
    Not intended to be used separately
    '''
    def input_data_preparation(self, X, drop_na=None, fill_value=None):
        '''Convert ``X`` with ``to_frame()`` when it has one, then handle missing values.

        Objects without a ``to_frame`` method must carry an index with
        exactly two levels, otherwise ``ValueError`` is raised.
        '''
        try:
            X = X.to_frame()
        except AttributeError:
            # No ``to_frame``: X is used as is, provided its index has two
            # levels. Only the missing-method case is caught, so unrelated
            # failures and KeyboardInterrupt are no longer masked.
            if len(X.index.names) != 2:
                raise ValueError("Only 2-level MultiIndex and Panel are supported.") from None

        X = self.handle_missing_values(X, drop_na, fill_value)
        return X
166 | 
167 | 
class CensoredBaseModel(BaseModel):
    '''Base class inherited by other models
    Not intended to be used separately
    '''
    def input_data_preparation(self, X, drop_na=None, fill_value=None):
        '''Validate the output column (when present) and handle missing values.

        When ``self.output`` is a column of ``X`` it must contain no
        negative value and at least one value equal to 0; otherwise
        ``ValueError`` is raised. Missing values are then handled by
        ``handle_missing_values``.
        '''
        if self.output in X.columns:
            outcome = X[self.output]
            if (outcome < 0).any():
                raise ValueError("Negative values were found in output variable. "
                    "Please set all censored observations to 0 before fitting the model.")
            # No negatives at this point, so ``<= 0`` means "equal to 0".
            if not (outcome <= 0).any():
                raise ValueError("No censored observations were found. "
                    "Please set output of all censored observations to 0 before fitting the model.")

        X = self.handle_missing_values(X, drop_na, fill_value)
        return X

    def summary(self):
        '''Print the base summary followed by the sigma row and the
        censored/uncensored observation counts.'''
        super().summary()
        self.sigma_description()
        self.censored_data_description()
189 |      
190 |         


--------------------------------------------------------------------------------
/codes/chp_1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Risk-Return"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "#!pip install plotly\n",
 17 |     "import statsmodels.api as sm\n",
 18 |     "import numpy as np\n",
 19 |     "import plotly.graph_objs as go\n",
 20 |     "import matplotlib.pyplot as plt\n",
 21 |     "import plotly\n",
 22 |     "import warnings\n",
 23 |     "warnings.filterwarnings('ignore')\n",
 24 |     "plt.rcParams['figure.dpi'] = 300\n",
 25 |     "plt.rcParams['savefig.dpi'] = 300"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "n_assets = 5\n",
 35 |     "n_simulation = 500"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "returns = np.random.randn(n_assets, n_simulation)"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": null,
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "rand = np.random.rand(n_assets)\n",
 54 |     "weights = rand/sum(rand)\n",
 55 |     "\n",
 56 |     "\n",
 57 |     "def port_return(returns):\n",
 58 |     "    rets = np.mean(returns, axis=1)\n",
 59 |     "    cov = np.cov(rets.T, aweights=weights, ddof=1)\n",
 60 |     "    portfolio_returns = np.dot(weights, rets.T)\n",
 61 |     "    portfolio_std_dev = np.sqrt(np.dot(weights, np.dot(cov, weights)))\n",
 62 |     "    return portfolio_returns, portfolio_std_dev"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": null,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "portfolio_returns, portfolio_std_dev = port_return(returns)"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "print(portfolio_returns)\n",
 81 |     "print(portfolio_std_dev)"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "portfolio = np.array([port_return(np.random.randn(n_assets, i))\n",
 91 |     "                      for i in range(1, 101)])"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "best_fit = sm.OLS(portfolio[:, 1], sm.add_constant(portfolio[:, 0]))\\\n",
101 |     "           .fit().fittedvalues"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {
108 |     "scrolled": true
109 |    },
110 |    "outputs": [],
111 |    "source": [
112 |     "fig = go.Figure()\n",
113 |     "fig.add_trace(go.Scatter(name='Risk-Return Relationship',\n",
114 |     "                         x=portfolio[:, 0],\n",
115 |     "                         y=portfolio[:, 1], mode='markers'))\n",
116 |     "fig.add_trace(go.Scatter(name='Best Fit Line',\n",
117 |     "                         x=portfolio[:, 0],\n",
118 |     "                         y=best_fit, mode='lines'))\n",
119 |     "fig.update_layout(xaxis_title = 'Return',\n",
120 |     "                  yaxis_title = 'Standard Deviation',\n",
121 |     "                  width=900, height=470)\n",
122 |     "fig.show()"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "metadata": {},
128 |    "source": [
129 |     "## Adverse Selection"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "import matplotlib.pyplot as plt\n",
139 |     "import numpy as np\n",
140 |     "plt.style.use('seaborn')"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "def utility(x):\n",
150 |     "    return(np.exp(x**gamma))"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {},
157 |    "outputs": [],
158 |    "source": [
159 |     "pi = np.random.uniform(0,1,20)\n",
160 |     "pi = np.sort(pi)"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {},
167 |    "outputs": [],
168 |    "source": [
169 |     "print('The highest three probability of losses are {}'\n",
170 |     "      .format(pi[-3:]))"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": null,
176 |    "metadata": {},
177 |    "outputs": [],
178 |    "source": [
179 |     "y = 2\n",
180 |     "c = 1.5\n",
181 |     "Q = 5\n",
182 |     "D = 0.01\n",
183 |     "gamma = 0.4"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "def supply(Q):\n",
193 |     "    return(np.mean(pi[-Q:]) * c)"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {},
200 |    "outputs": [],
201 |    "source": [
202 |     "def demand(D):\n",
203 |     "    return(np.sum(utility(y - D) > pi * utility(y - c) + (1 - pi) \n",
204 |     "                  * utility(y)))"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "plt.figure()\n",
214 |     "plt.plot([demand(i) for i in np.arange(0, 1.9, 0.02)],\n",
215 |     "         np.arange(0, 1.9, 0.02), \n",
216 |     "         'r', label='insurance demand')\n",
217 |     "plt.plot(range(1,21), [supply(j) for j in range(1,21)],\n",
218 |     "         'g', label='insurance supply')\n",
219 |     "plt.ylabel(\"Average Cost\")\n",
220 |     "plt.xlabel(\"Number of People\")\n",
221 |     "plt.legend()\n",
222 |     "plt.show()\n"
223 |    ]
224 |   }
225 |  ],
226 |  "metadata": {
227 |   "kernelspec": {
228 |    "display_name": "Python 3",
229 |    "language": "python",
230 |    "name": "python3"
231 |   },
232 |   "language_info": {
233 |    "codemirror_mode": {
234 |     "name": "ipython",
235 |     "version": 3
236 |    },
237 |    "file_extension": ".py",
238 |    "mimetype": "text/x-python",
239 |    "name": "python",
240 |    "nbconvert_exporter": "python",
241 |    "pygments_lexer": "ipython3",
242 |    "version": "3.8.8"
243 |   },
244 |   "latex_envs": {
245 |    "LaTeX_envs_menu_present": true,
246 |    "autoclose": false,
247 |    "autocomplete": true,
248 |    "bibliofile": "biblio.bib",
249 |    "cite_by": "apalike",
250 |    "current_citInitial": 1,
251 |    "eqLabelWithNumbers": true,
252 |    "eqNumInitial": 1,
253 |    "hotkeys": {
254 |     "equation": "Ctrl-E",
255 |     "itemize": "Ctrl-I"
256 |    },
257 |    "labels_anchors": false,
258 |    "latex_user_defs": false,
259 |    "report_style_numbering": false,
260 |    "user_envs_cfg": false
261 |   },
262 |   "toc": {
263 |    "base_numbering": 1,
264 |    "nav_menu": {},
265 |    "number_sections": false,
266 |    "sideBar": true,
267 |    "skip_h1_title": false,
268 |    "title_cell": "Table of Contents",
269 |    "title_sidebar": "Contents",
270 |    "toc_cell": false,
271 |    "toc_position": {},
272 |    "toc_section_display": true,
273 |    "toc_window_display": false
274 |   }
275 |  },
276 |  "nbformat": 4,
277 |  "nbformat_minor": 4
278 | }
279 | 


--------------------------------------------------------------------------------
/codes/pyeconometrics/censored_data_models.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | import scipy.stats as st
  4 | 
  5 | import warnings
  6 | warnings.filterwarnings('ignore')
  7 | 
  8 | from numpy.linalg import inv
  9 | from math import exp, sqrt, log, pi
 10 | 
 11 | from pyeconometrics.base import CensoredBaseModel
 12 | from pyeconometrics.utils import inverse_mills_ratio, derivate_inverse_mills_ratio
 13 | 
 14 | 
 15 | 
 16 | class TobitModel(CensoredBaseModel):
 17 |     '''Tobit type I model for data censored at zero
 18 |     Estimation of parameters with the Maximum Likelihood method
 19 |     '''
 20 |     def __init__(self):
 21 |         self.name = 'Tobit I Model'
 22 |         self.output = None
 23 |         self.variables = None
 24 |         self.nb_obs = None
 25 |         self.nb_censored_obs = None
 26 |         self.nb_uncensored_obs = None
 27 |         self.init_ll = None
 28 |         self.beta = None
 29 |         self.sigma = None
 30 |         self.beta_est = None
 31 |         self.beta_se = None
 32 |         self.confidence_interval = None
 33 |         self.final_ll = None
 34 |         self.converged = None
 35 | 
 36 |     def response_function(self, X, beta):
 37 |         A = X.copy()
 38 |         try:
 39 |             A.drop(self.output, axis=1, inplace=True)
 40 |         except:
 41 |             pass
 42 | 
 43 |         return np.array(A).dot(beta)
 44 |         
 45 |     def __log_likelihood_censored(self, X, beta, sigma):
 46 |         Z = np.array(self.response_function(X, beta))
 47 |         Z = Z/sigma
 48 | 
 49 |         norm_cdf_vec = np.vectorize(st.norm.cdf)
 50 |         result = np.sum(np.log(norm_cdf_vec(Z)))
 51 | 
 52 |         return result
 53 |             
 54 |     def __log_likelihood_uncensored(self, X, beta, sigma):
 55 |         Z = np.array(self.response_function(X, beta))
 56 |         y = np.array(X[self.output])
 57 |         Z = 0.5 * np.multiply((y - Z)/sigma, (y - Z)/sigma)
 58 |         result = np.sum(Z)
 59 | 
 60 |         return result
 61 |             
 62 |     def __log_likelihood(self, X, beta, sigma):
 63 |         X_cens = X[X[self.output]==0]
 64 |         X_uncens = X[X[self.output]>0]
 65 | 
 66 |         result = - self.__log_likelihood_censored(X, beta, sigma) \
 67 |             - self.__log_likelihood_uncensored(X, beta, sigma) \
 68 |             - len(X_uncens) * log(sigma * sqrt(2*pi))
 69 | 
 70 |         return result
 71 |         
 72 |     def __grad_b_log_likelihood(self, X, b, s):
 73 |         X_cens = X[X[self.output]==0]
 74 |         X_uncens = X[X[self.output]>0]
 75 |         y_uncens = X_uncens[self.output]
 76 |         X_cens.drop(self.output, axis=1, inplace=True)
 77 |         X_uncens.drop(self.output, axis=1, inplace=True)
 78 | 
 79 |         inverse_mills_ratio_vec = np.vectorize(inverse_mills_ratio)
 80 | 
 81 |         grad_cens = inverse_mills_ratio_vec(np.array(self.response_function(X_cens, b), ndmin=2))
 82 |         grad_cens = - np.sum(np.array(X_cens) * grad_cens.T, axis=0)
 83 |         
 84 |         grad_uncens = s * np.array(y_uncens, ndmin=2) - np.array(self.response_function(X_uncens, b), ndmin=2)
 85 |         grad_uncens = np.sum(np.array(X_uncens) * grad_uncens.T, axis=0)
 86 | 
 87 |         result = grad_cens + grad_uncens
 88 |         return result
 89 | 
 90 |     def __derivate_s_log_likelihood(self, X, b, s):
 91 |         X_uncens = X[X[self.output]>0]
 92 |         y_uncens = X_uncens[self.output]
 93 |         X_uncens.drop(self.output, axis=1, inplace=True)
 94 | 
 95 |         inverse_mills_ratio_vec = np.vectorize(inverse_mills_ratio)
 96 | 
 97 |         grad_uncens = s * np.array(y_uncens) - np.array(self.response_function(X_uncens, b))
 98 |         grad_uncens = - np.sum(np.multiply(y_uncens, grad_uncens))
 99 | 
100 |         result = grad_uncens + len(X_uncens)/s
101 |         return result
102 | 
103 |     def __score(self, X, b, s):
104 |         return np.concatenate([self.__grad_b_log_likelihood(X, b, s),
105 |             np.array(self.__derivate_s_log_likelihood(X, b, s), ndmin=1)])
106 |             
107 |     def __hessian_b_b(self, X, b, s):
108 |         X_uncens = X[X[self.output]>0]
109 |         y_uncens = X_uncens[self.output]
110 |         X_uncens.drop(self.output, axis=1, inplace=True)
111 | 
112 |         derivate_inverse_mills_ratio_vec = np.vectorize(derivate_inverse_mills_ratio)
113 |         hessian_uncens = 1 + derivate_inverse_mills_ratio_vec(-np.array(self.response_function(X_uncens, b), ndmin=2))
114 | 
115 |         list_XXT = []
116 |         for i in range(X_uncens.shape[0]):
117 |             row = np.array(np.array(X_uncens)[i,:], ndmin=2)
118 |             list_XXT.append(row.T.dot(row))
119 |         hessian_uncens = [-hessian_uncens[0,i]*list_XXT[i] for i in range(len(list_XXT))]
120 |         
121 |         result = sum(hessian_uncens)
122 |         return result
123 |         
124 | 
125 |     def __hessian_s_s(self, X, b, s):
126 |         X_uncens = X[X[self.output]>0]
127 |         y_uncens = X_uncens[self.output]
128 | 
129 |         item1 = -np.multiply(np.array(y_uncens), np.array(y_uncens))
130 |         item2 = -1/s**2 
131 |         result = np.sum(item1 + item2)
132 |         return result
133 | 
134 |     def __hessian_b_s(self, X, b, s):
135 |         X_uncens = X[X[self.output]>0]
136 |         y_uncens = X_uncens[self.output]
137 |         X_uncens.drop(self.output, axis=1, inplace=True)
138 | 
139 |         result = np.sum(np.array(X_uncens) * np.array(y_uncens, ndmin=2).T, axis=0)
140 |         return result
141 |     
142 |     def __hessian(self, X, b, s):
143 |         a = self.__hessian_b_b(X, b, s)
144 |         b = self.__hessian_b_s(X, b, s)
145 |         c = self.__hessian_s_s(X, b, s)
146 | 
147 |         item1 = np.concatenate([a,np.array(b, ndmin=2).T], axis=1)
148 |         item2 = np.array(np.concatenate([b.T, np.array(c, ndmin=1)]), ndmin=2)
149 |         result = np.concatenate([item1, item2], axis=0)
150 |         return result
151 | 
152 |     def fit(self, X, output, nb_iter=20, drop_na=True, fill_value=None, verbose=False):
153 |         '''Maximum Likelihhod Estimation
154 |         Implement a Newton-Raphson algorithm to estimate parameters
155 | 
156 |         Parameters:
157 |         ----------
158 |         X: Dataframe
159 |             Database to fit the model
160 | 
161 |         output: string
162 |             Name of the variable to predict
163 | 
164 |         nb_iter: integer (optional, default 20)
165 |             Maximal number of iteration before the end of the Newton-Raphson algorithm
166 | 
167 |         drop_na: boolean (optional, default True)
168 |             Indicate the method to handle missing values in X
169 |             If drop_na = False, fill_value has to be given
170 | 
171 |         fill_value: string or dict (optional, defaul None)
172 |             Considered only if drop_na = False
173 |             Possible values:
174 |                 - 'mean': missing values of a column are replaced by the mean of that column
175 |                 - 'median': missing values of a column are replaced by the median of that column
176 |                 - dict: keys must be variables' names and associated values the values used to fill Nan
177 | 
178 |         verbose: boolean (optional, default False)
179 |             If set to True, allows prints of Newton-Raphson algorithm's progress
180 |         '''
181 |         self.output = output
182 |         X = self.input_data_preparation(X.copy(), drop_na, fill_value)
183 |         X.insert(0, '_cons', 1)
184 | 
185 |         self.nb_obs = len(X)
186 |         self.nb_censored_obs = len(X[X[self.output] == 0])
187 |         self.nb_uncensored_obs = len(X[X[self.output] > 0])
188 | 
189 |         self.variables = [x for x in X.columns if x != self.output]
190 | 
191 |         beta_init = [0 for _ in range(len(self.variables))] + [1]   
192 |         self.beta_est = np.zeros((nb_iter,len(beta_init)))
193 |         self.beta_est[0] = beta_init
194 | 
195 |         self.init_ll = self.__log_likelihood(X, beta_init[:-1], beta_init[-1])
196 | 
197 |         if verbose:
198 |             print('Initial log-likelihood : '+ str(self.init_ll))
199 |             print('Parameters estimation in progress.')
200 |         
201 |         current_ll = self.init_ll
202 |         prev_ll = self.init_ll
203 |         j = 1
204 |         while (j < nb_iter) \
205 |             and (j == 1 or (current_ll - prev_ll > 0.01)):
206 |             b = self.beta_est[j-1,:-1]/self.beta_est[j-1,-1]
207 |             s = 1/self.beta_est[j-1,-1]
208 |             
209 |             score = self.__score(X, b, s)
210 |             hessian = self.__hessian(X, b, s)
211 | 
212 |             try:
213 |                 step = inv(hessian).dot(score)
214 |             except:
215 |                 raise ValueError('Improper classification problem' \
216 |                     + ', should be 2 different labels')
217 | 
218 |             b -= step[:-1]
219 |             s -= step[-1]
220 |             self.beta_est[j] = np.concatenate([b, np.array(1/s, ndmin=1)])
221 |             
222 |             prev_ll = current_ll
223 |             if self.beta_est[j,-1] > 0:
224 |                 current_ll = self.__log_likelihood(X, self.beta_est[j,:-1],
225 |                     self.beta_est[j,-1])
226 |                 if verbose:              
227 |                     print('Iteration %s, log_likelihood : %s'\
228 |                         % (j, current_ll))
229 |             else:
230 |                 current_ll = prev_ll - 1
231 |             j += 1
232 | 
233 |         self.beta = self.beta_est[j-3,:-1]
234 |         self.sigma = self.beta_est[j-3,-1]
235 |         self.beta_est = self.beta_est[:j-2,:]
236 | 
237 |         sqrt_vec = np.vectorize(sqrt)
238 |         b = self.beta/self.sigma
239 |         s = 1/self.sigma
240 |         hessian = self.__hessian(X, b, s)
241 |         self.beta_se = sqrt_vec(-inv(hessian).diagonal())
242 | 
243 |         self.confidence_interval = np.array(
244 |                 [[self.beta[i] - st.norm.ppf(0.975) * self.beta_se[i],
245 |                     self.beta[i] + st.norm.ppf(0.975) * self.beta_se[i]]
246 |                     for i in range(len(self.beta))]
247 |                 + [[self.sigma - st.norm.ppf(0.975) * self.beta_se[-1],
248 |                       self.sigma + st.norm.ppf(0.975) * self.beta_se[-1]]])
249 | 
250 |         self.final_ll = prev_ll
251 | 
252 |         if j < nb_iter:
253 |             self.converged = True
254 |         else:
255 |             self.converged = False
256 | 
257 |         return self


--------------------------------------------------------------------------------
/codes/chp_3.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "exciting-springer",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "## Recurrent Neural Network"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "id": "mineral-doctor",
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import numpy as np\n",
 19 |     "import pandas as pd\n",
 20 |     "import math\n",
 21 |     "import datetime\n",
 22 |     "import yfinance as yf\n",
 23 |     "import matplotlib.pyplot as plt\n",
 24 |     "import tensorflow as tf\n",
 25 |     "from tensorflow.keras.models import Sequential\n",
 26 |     "from tensorflow.keras.callbacks import EarlyStopping\n",
 27 |     "from tensorflow.keras.layers import (Dense, Dropout, \n",
 28 |     "                                     Activation, Flatten, \n",
 29 |     "                                     MaxPooling2D, SimpleRNN)\n",
 30 |     "from sklearn.model_selection import train_test_split\n",
 31 |     "plt.rcParams['figure.dpi'] = 300\n",
 32 |     "plt.rcParams['savefig.dpi'] = 300"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "id": "adapted-great",
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "n_steps = 13\n",
 43 |     "n_features = 1"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "id": "distinct-distinction",
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "model = Sequential()\n",
 54 |     "model.add(SimpleRNN(512, activation='relu', \n",
 55 |     "                    input_shape=(n_steps, n_features),\n",
 56 |     "                    return_sequences=True))\n",
 57 |     "model.add(Dropout(0.2))\n",
 58 |     "model.add(Dense(256, activation = 'relu'))\n",
 59 |     "model.add(Flatten())\n",
 60 |     "model.add(Dense(1, activation='linear'))"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "id": "subsequent-distinction",
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "model.compile(optimizer='rmsprop',\n",
 71 |     "              loss='mean_squared_error',\n",
 72 |     "              metrics=['mse'])"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "id": "russian-donna",
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "def split_sequence(sequence, n_steps):\n",
 83 |     "    X, y = [], []\n",
 84 |     "    for i in range(len(sequence)):\n",
 85 |     "        end_ix = i + n_steps\n",
 86 |     "        if end_ix > len(sequence) - 1:\n",
 87 |     "            break\n",
 88 |     "        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]\n",
 89 |     "        X.append(seq_x)\n",
 90 |     "        y.append(seq_y)\n",
 91 |     "    return np.array(X), np.array(y)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "id": "generic-missouri",
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "ticker = ['AAPL', 'MSFT']\n",
102 |     "start = datetime.datetime(2019, 1, 1)\n",
103 |     "end = datetime.datetime(2020, 1 ,1)\n",
104 |     "stock_prices = yf.download(ticker,start=start, end = end, interval='1d')\\\n",
105 |     "               .Close"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "id": "informational-steering",
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "diff_stock_prices = stock_prices.diff().dropna()"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "id": "moral-knock",
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "split = int(len(diff_stock_prices['AAPL'].values) * 0.95)\n",
126 |     "diff_train_aapl = diff_stock_prices['AAPL'].iloc[:split]\n",
127 |     "diff_test_aapl = diff_stock_prices['AAPL'].iloc[split:]\n",
128 |     "diff_train_msft = diff_stock_prices['MSFT'].iloc[:split]\n",
129 |     "diff_test_msft = diff_stock_prices['MSFT'].iloc[split:]"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "id": "obvious-logging",
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "X_aapl, y_aapl = split_sequence(diff_train_aapl, n_steps)\n",
140 |     "X_aapl = X_aapl.reshape((X_aapl.shape[0],  X_aapl.shape[1],\n",
141 |     "                         n_features))"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "id": "narrow-department",
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "history = model.fit(X_aapl, y_aapl, \n",
152 |     "                    epochs=400, batch_size=150, verbose=0, \n",
153 |     "                    validation_split = 0.10)"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "id": "unlimited-attachment",
160 |    "metadata": {},
161 |    "outputs": [],
162 |    "source": [
163 |     "start = X_aapl[X_aapl.shape[0] - n_steps]\n",
164 |     "x_input = start\n",
165 |     "x_input = x_input.reshape((1, n_steps, n_features))"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "id": "experimental-sight",
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "tempList_aapl = []\n",
176 |     "for i in range(len(diff_test_aapl)):\n",
177 |     "    x_input = x_input.reshape((1, n_steps, n_features))\n",
178 |     "    yhat = model.predict(x_input, verbose=0)\n",
179 |     "    x_input = np.append(x_input, yhat)\n",
180 |     "    x_input = x_input[1:]\n",
181 |     "    tempList_aapl.append(yhat)"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "id": "appropriate-killer",
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "X_msft, y_msft = split_sequence(diff_train_msft, n_steps)\n",
192 |     "X_msft = X_msft.reshape((X_msft.shape[0],  X_msft.shape[1],\n",
193 |     "                         n_features))"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "id": "animated-school",
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "history = model.fit(X_msft, y_msft, \n",
204 |     "                    epochs=400, batch_size=150, verbose=0, \n",
205 |     "                    validation_split = 0.10)"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "id": "proprietary-limit",
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "start = X_msft[X_msft.shape[0] - n_steps]\n",
216 |     "x_input = start\n",
217 |     "x_input = x_input.reshape((1, n_steps, n_features))"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "code",
222 |    "execution_count": null,
223 |    "id": "invalid-episode",
224 |    "metadata": {},
225 |    "outputs": [],
226 |    "source": [
227 |     "tempList_msft = []\n",
228 |     "for i in range(len(diff_test_msft)):\n",
229 |     "    x_input = x_input.reshape((1, n_steps, n_features))\n",
230 |     "    yhat = model.predict(x_input, verbose=0)\n",
231 |     "    x_input = np.append(x_input, yhat)\n",
232 |     "    x_input = x_input[1:]\n",
233 |     "    tempList_msft.append(yhat)"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "id": "hidden-johnston",
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "fig, ax = plt.subplots(2,1, figsize=(18,15))\n",
244 |     "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n",
245 |     "ax[0].plot(diff_test_aapl.index, np.array(tempList_aapl).flatten(),\n",
246 |     "           linestyle='solid', label=\"Prediction\")\n",
247 |     "ax[0].set_title('Predicted Stock Price-Apple')\n",
248 |     "ax[0].legend(loc='best')\n",
249 |     "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n",
250 |     "ax[1].plot(diff_test_msft.index,np.array(tempList_msft).flatten(),\n",
251 |     "           linestyle='solid', label=\"Prediction\")\n",
252 |     "ax[1].set_title('Predicted Stock Price-Microsoft')\n",
253 |     "ax[1].legend(loc='best')\n",
254 |     "\n",
255 |     "\n",
256 |     "for ax in ax.flat:\n",
257 |     "    ax.set(xlabel='Date', ylabel='Differenced Price')\n",
258 |     "plt.show()"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "markdown",
263 |    "id": "quiet-concentration",
264 |    "metadata": {},
265 |    "source": [
266 |     "## LSTM"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "id": "light-validity",
273 |    "metadata": {},
274 |    "outputs": [],
275 |    "source": [
276 |     "from tensorflow.keras.layers import LSTM\n"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": null,
282 |    "id": "short-advocacy",
283 |    "metadata": {},
284 |    "outputs": [],
285 |    "source": [
286 |     "n_steps = 13\n",
287 |     "n_features = 1"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "id": "boring-binding",
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "model = Sequential()\n",
298 |     "model.add(LSTM(512, activation='relu',\n",
299 |     "          input_shape=(n_steps, n_features),\n",
300 |     "          return_sequences=True))\n",
301 |     "model.add(Dropout(0.2))\n",
302 |     "model.add(LSTM(256,activation='relu'))\n",
303 |     "model.add(Flatten())\n",
304 |     "model.add(Dense(1, activation='linear'))"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "code",
309 |    "execution_count": null,
310 |    "id": "great-meter",
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "model.compile(optimizer='rmsprop', loss='mean_squared_error',\n",
315 |     "              metrics=['mse'])"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": null,
321 |    "id": "bound-supervisor",
322 |    "metadata": {},
323 |    "outputs": [],
324 |    "source": [
325 |     "history = model.fit(X_aapl, y_aapl, \n",
326 |     "                    epochs=400, batch_size=150, verbose=0, \n",
327 |     "                    validation_split = 0.10)"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": null,
333 |    "id": "graduate-truth",
334 |    "metadata": {},
335 |    "outputs": [],
336 |    "source": [
337 |     "start = X_aapl[X_aapl.shape[0] - 13]\n",
338 |     "x_input = start\n",
339 |     "x_input = x_input.reshape((1, n_steps, n_features))"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": null,
345 |    "id": "polished-internship",
346 |    "metadata": {},
347 |    "outputs": [],
348 |    "source": [
349 |     "tempList_aapl = []\n",
350 |     "for i in range(len(diff_test_aapl)):\n",
351 |     "    x_input = x_input.reshape((1, n_steps, n_features))\n",
352 |     "    yhat = model.predict(x_input, verbose=0)\n",
353 |     "    x_input = np.append(x_input, yhat)\n",
354 |     "    x_input = x_input[1:]\n",
355 |     "    tempList_aapl.append(yhat)"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": null,
361 |    "id": "utility-adaptation",
362 |    "metadata": {},
363 |    "outputs": [],
364 |    "source": [
365 |     "history = model.fit(X_msft, y_msft, \n",
366 |     "                    epochs=400, batch_size=150, verbose=0, \n",
367 |     "                    validation_split = 0.10)"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": null,
373 |    "id": "given-copyright",
374 |    "metadata": {},
375 |    "outputs": [],
376 |    "source": [
377 |     "start = X_msft[X_msft.shape[0] - 13]\n",
378 |     "x_input = start\n",
379 |     "x_input = x_input.reshape((1, n_steps, n_features))"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": null,
385 |    "id": "adjusted-discipline",
386 |    "metadata": {},
387 |    "outputs": [],
388 |    "source": [
389 |     "tempList_msft = []\n",
390 |     "for i in range(len(diff_test_msft)):\n",
391 |     "    x_input = x_input.reshape((1, n_steps, n_features))\n",
392 |     "    yhat = model.predict(x_input, verbose=0)\n",
393 |     "    x_input = np.append(x_input, yhat)\n",
394 |     "    x_input = x_input[1:]\n",
395 |     "    tempList_msft.append(yhat)"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": null,
401 |    "id": "hispanic-labor",
402 |    "metadata": {
403 |     "scrolled": true
404 |    },
405 |    "outputs": [],
406 |    "source": [
407 |     "fig, ax = plt.subplots(2, 1, figsize=(18, 15))\n",
408 |     "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n",
409 |     "ax[0].plot(diff_test_aapl.index, np.array(tempList_aapl).flatten(),\n",
410 |     "           linestyle='solid', label=\"Prediction\")\n",
411 |     "ax[0].set_title('Predicted Stock Price-Apple')\n",
412 |     "ax[0].legend(loc='best')\n",
413 |     "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n",
414 |     "ax[1].plot(diff_test_msft.index, np.array(tempList_msft).flatten(),\n",
415 |     "           linestyle='solid', label=\"Prediction\")\n",
416 |     "ax[1].set_title('Predicted Stock Price-Microsoft')\n",
417 |     "ax[1].legend(loc='best')\n",
418 |     "\n",
419 |     "for ax in ax.flat:\n",
420 |     "    ax.set(xlabel='Date', ylabel='$')\n",
421 |     "plt.show()"
422 |    ]
423 |   }
424 |  ],
425 |  "metadata": {
426 |   "kernelspec": {
427 |    "display_name": "Python 3",
428 |    "language": "python",
429 |    "name": "python3"
430 |   },
431 |   "language_info": {
432 |    "codemirror_mode": {
433 |     "name": "ipython",
434 |     "version": 3
435 |    },
436 |    "file_extension": ".py",
437 |    "mimetype": "text/x-python",
438 |    "name": "python",
439 |    "nbconvert_exporter": "python",
440 |    "pygments_lexer": "ipython3",
441 |    "version": "3.8.8"
442 |   },
443 |   "latex_envs": {
444 |    "LaTeX_envs_menu_present": true,
445 |    "autoclose": false,
446 |    "autocomplete": true,
447 |    "bibliofile": "biblio.bib",
448 |    "cite_by": "apalike",
449 |    "current_citInitial": 1,
450 |    "eqLabelWithNumbers": true,
451 |    "eqNumInitial": 1,
452 |    "hotkeys": {
453 |     "equation": "Ctrl-E",
454 |     "itemize": "Ctrl-I"
455 |    },
456 |    "labels_anchors": false,
457 |    "latex_user_defs": false,
458 |    "report_style_numbering": false,
459 |    "user_envs_cfg": false
460 |   },
461 |   "toc": {
462 |    "base_numbering": 1,
463 |    "nav_menu": {},
464 |    "number_sections": true,
465 |    "sideBar": true,
466 |    "skip_h1_title": false,
467 |    "title_cell": "Table of Contents",
468 |    "title_sidebar": "Contents",
469 |    "toc_cell": false,
470 |    "toc_position": {},
471 |    "toc_section_display": true,
472 |    "toc_window_display": false
473 |   }
474 |  },
475 |  "nbformat": 4,
476 |  "nbformat_minor": 5
477 | }
478 | 


--------------------------------------------------------------------------------
/codes/datasets/diff_train_msft.csv:
--------------------------------------------------------------------------------
  1 | Date,MSFT
  2 | 2019-01-02,-0.4499969482421875
  3 | 2019-01-03,-3.720001220703125
  4 | 2019-01-04,4.529998779296875
  5 | 2019-01-07,0.12999725341796875
  6 | 2019-01-08,0.7400054931640625
  7 | 2019-01-09,1.4699935913085938
  8 | 2019-01-10,-0.6699981689453125
  9 | 2019-01-11,-0.7999954223632812
 10 | 2019-01-14,-0.75
 11 | 2019-01-15,2.9599990844726562
 12 | 2019-01-16,0.3699951171875
 13 | 2019-01-17,0.7400054931640625
 14 | 2019-01-18,1.589996337890625
 15 | 2019-01-22,-2.029998779296875
 16 | 2019-01-23,1.029998779296875
 17 | 2019-01-24,-0.5100021362304688
 18 | 2019-01-25,0.970001220703125
 19 | 2019-01-28,-2.089996337890625
 20 | 2019-01-29,-2.1399993896484375
 21 | 2019-01-30,3.4399948120117188
 22 | 2019-01-31,-1.9499969482421875
 23 | 2019-02-01,-1.6500015258789062
 24 | 2019-02-04,2.9599990844726562
 25 | 2019-02-05,1.4800033569335938
 26 | 2019-02-06,-1.19000244140625
 27 | 2019-02-07,-0.7600021362304688
 28 | 2019-02-08,0.40000152587890625
 29 | 2019-02-11,-0.4199981689453125
 30 | 2019-02-12,1.6399993896484375
 31 | 2019-02-13,-0.0800018310546875
 32 | 2019-02-14,0.09000396728515625
 33 | 2019-02-15,1.3199996948242188
 34 | 2019-02-19,-0.0500030517578125
 35 | 2019-02-20,-1.0199966430664062
 36 | 2019-02-21,2.2600021362304688
 37 | 2019-02-22,1.55999755859375
 38 | 2019-02-25,0.6199951171875
 39 | 2019-02-26,0.7700042724609375
 40 | 2019-02-27,-0.19000244140625
 41 | 2019-02-28,-0.1399993896484375
 42 | 2019-03-01,0.5
 43 | 2019-03-04,-0.26999664306640625
 44 | 2019-03-05,-0.5600051879882812
 45 | 2019-03-06,0.0500030517578125
 46 | 2019-03-07,-1.3600006103515625
 47 | 2019-03-08,0.12000274658203125
 48 | 2019-03-11,2.3199996948242188
 49 | 2019-03-12,0.7900009155273438
 50 | 2019-03-13,0.8799972534179688
 51 | 2019-03-14,0.089996337890625
 52 | 2019-03-15,1.32000732421875
 53 | 2019-03-18,1.6599960327148438
 54 | 2019-03-19,0.0800018310546875
 55 | 2019-03-20,-0.1300048828125
 56 | 2019-03-21,2.7000045776367188
 57 | 2019-03-22,-3.1699981689453125
 58 | 2019-03-25,0.6100006103515625
 59 | 2019-03-26,0.25
 60 | 2019-03-27,-1.1400070190429688
 61 | 2019-03-28,0.160003662109375
 62 | 2019-03-29,1.0100021362304688
 63 | 2019-04-01,1.0799942016601562
 64 | 2019-04-02,0.17000579833984375
 65 | 2019-04-03,0.779998779296875
 66 | 2019-04-04,-0.6100006103515625
 67 | 2019-04-05,0.529998779296875
 68 | 2019-04-08,0.04000091552734375
 69 | 2019-04-09,-0.6500015258789062
 70 | 2019-04-10,0.910003662109375
 71 | 2019-04-11,0.1399993896484375
 72 | 2019-04-12,0.6199951171875
 73 | 2019-04-15,0.100006103515625
 74 | 2019-04-16,-0.28000640869140625
 75 | 2019-04-17,1.0
 76 | 2019-04-18,1.600006103515625
 77 | 2019-04-22,0.3899993896484375
 78 | 2019-04-23,1.6800003051757812
 79 | 2019-04-24,-0.43000030517578125
 80 | 2019-04-25,4.139991760253906
 81 | 2019-04-26,0.7400054931640625
 82 | 2019-04-29,-0.1199951171875
 83 | 2019-04-30,0.8300018310546875
 84 | 2019-05-01,-2.7200088500976562
 85 | 2019-05-02,-1.6699981689453125
 86 | 2019-05-03,2.6899948120117188
 87 | 2019-05-06,-0.75
 88 | 2019-05-07,-2.6299972534179688
 89 | 2019-05-08,-0.0099945068359375
 90 | 2019-05-09,-0.01000213623046875
 91 | 2019-05-10,1.6299972534179688
 92 | 2019-05-13,-3.779998779296875
 93 | 2019-05-14,1.3800048828125
 94 | 2019-05-15,1.2899932861328125
 95 | 2019-05-16,2.9099960327148438
 96 | 2019-05-17,-0.8599853515625
 97 | 2019-05-20,-1.850006103515625
 98 | 2019-05-21,0.6800003051757812
 99 | 2019-05-22,0.7699966430664062
100 | 2019-05-23,-1.4899978637695312
101 | 2019-05-24,0.05999755859375
102 | 2019-05-28,-0.07999420166015625
103 | 2019-05-29,-1.220001220703125
104 | 2019-05-30,0.7900009155273438
105 | 2019-05-31,-2.0500030517578125
106 | 2019-06-03,-3.8400039672851562
107 | 2019-06-04,3.32000732421875
108 | 2019-06-05,2.6699981689453125
109 | 2019-06-06,1.9899978637695312
110 | 2019-06-07,3.5799942016601562
111 | 2019-06-10,1.20001220703125
112 | 2019-06-11,-0.5
113 | 2019-06-12,-0.6100006103515625
114 | 2019-06-13,0.8300018310546875
115 | 2019-06-14,0.1299896240234375
116 | 2019-06-17,0.4000091552734375
117 | 2019-06-18,2.30999755859375
118 | 2019-06-19,0.529998779296875
119 | 2019-06-20,1.2599945068359375
120 | 2019-06-21,0.0200042724609375
121 | 2019-06-24,0.80999755859375
122 | 2019-06-25,-4.350006103515625
123 | 2019-06-26,0.5
124 | 2019-06-27,0.220001220703125
125 | 2019-06-28,-0.1899871826171875
126 | 2019-07-01,1.7199859619140625
127 | 2019-07-02,0.9000091552734375
128 | 2019-07-03,0.8800048828125
129 | 2019-07-05,-0.4000091552734375
130 | 2019-07-08,-0.0999908447265625
131 | 2019-07-09,-0.5
132 | 2019-07-10,1.3899993896484375
133 | 2019-07-11,0.54998779296875
134 | 2019-07-12,0.5
135 | 2019-07-15,0.0
136 | 2019-07-16,-1.8199920654296875
137 | 2019-07-17,-0.80999755859375
138 | 2019-07-18,0.149993896484375
139 | 2019-07-19,0.1999969482421875
140 | 2019-07-22,1.80999755859375
141 | 2019-07-23,0.8600006103515625
142 | 2019-07-24,1.4300079345703125
143 | 2019-07-25,-0.529998779296875
144 | 2019-07-26,1.149993896484375
145 | 2019-07-29,-0.30999755859375
146 | 2019-07-30,-0.67999267578125
147 | 2019-07-31,-4.0800018310546875
148 | 2019-08-01,1.7899932861328125
149 | 2019-08-02,-1.160003662109375
150 | 2019-08-05,-4.6899871826171875
151 | 2019-08-06,2.4799957275390625
152 | 2019-08-07,0.589996337890625
153 | 2019-08-08,3.6100006103515625
154 | 2019-08-09,-1.17999267578125
155 | 2019-08-12,-1.920013427734375
156 | 2019-08-13,2.8100128173828125
157 | 2019-08-14,-4.6200103759765625
158 | 2019-08-15,-0.3000030517578125
159 | 2019-08-16,2.45001220703125
160 | 2019-08-19,2.279998779296875
161 | 2019-08-20,-1.1500091552734375
162 | 2019-08-21,1.529998779296875
163 | 2019-08-22,-1.0099945068359375
164 | 2019-08-23,-4.3899993896484375
165 | 2019-08-26,2.05999755859375
166 | 2019-08-27,0.290008544921875
167 | 2019-08-28,-0.1800079345703125
168 | 2019-08-29,2.55999755859375
169 | 2019-08-30,-0.2599945068359375
170 | 2019-09-03,-1.82000732421875
171 | 2019-09-04,1.5900115966796875
172 | 2019-09-05,2.4199981689453125
173 | 2019-09-06,-0.9499969482421875
174 | 2019-09-09,-1.5800018310546875
175 | 2019-09-10,-1.44000244140625
176 | 2019-09-11,0.0399932861328125
177 | 2019-09-12,1.4000091552734375
178 | 2019-09-13,-0.1999969482421875
179 | 2019-09-16,-0.9900054931640625
180 | 2019-09-17,1.05999755859375
181 | 2019-09-18,1.1300048828125
182 | 2019-09-19,2.5500030517578125
183 | 2019-09-20,-1.6300048828125
184 | 2019-09-23,-0.3000030517578125
185 | 2019-09-24,-1.7599945068359375
186 | 2019-09-25,1.9799957275390625
187 | 2019-09-26,0.17999267578125
188 | 2019-09-27,-1.80999755859375
189 | 2019-09-30,1.3000030517578125
190 | 2019-10-01,-1.959991455078125
191 | 2019-10-02,-2.420013427734375
192 | 2019-10-03,1.6300048828125
193 | 2019-10-04,1.839996337890625
194 | 2019-10-07,-1.0
195 | 2019-10-08,-1.4499969482421875
196 | 2019-10-09,2.57000732421875
197 | 2019-10-10,0.8600006103515625
198 | 2019-10-11,0.579986572265625
199 | 2019-10-14,-0.1299896240234375
200 | 2019-10-15,2.0200042724609375
201 | 2019-10-16,-1.160003662109375
202 | 2019-10-17,-0.720001220703125
203 | 2019-10-18,-2.279998779296875
204 | 2019-10-21,1.019989013671875
205 | 2019-10-22,-2.05999755859375
206 | 2019-10-23,0.8700103759765625
207 | 2019-10-24,2.6999969482421875
208 | 2019-10-25,0.7899932861328125
209 | 2019-10-28,3.4600067138671875
210 | 2019-10-29,-1.3600006103515625
211 | 2019-10-30,1.779998779296875
212 | 2019-10-31,-1.2400054931640625
213 | 2019-11-01,0.350006103515625
214 | 2019-11-04,0.8300018310546875
215 | 2019-11-05,-0.089996337890625
216 | 2019-11-06,-0.4000091552734375
217 | 2019-11-07,0.1999969482421875
218 | 2019-11-08,1.70001220703125
219 | 2019-11-11,0.149993896484375
220 | 2019-11-12,0.9600067138671875
221 | 2019-11-13,0.239990234375
222 | 2019-11-14,0.75
223 | 2019-11-15,1.910003662109375
224 | 2019-11-18,0.3699951171875
225 | 2019-11-19,0.0500030517578125
226 | 2019-11-20,-0.7700042724609375
227 | 2019-11-21,-0.1399993896484375
228 | 2019-11-22,0.1100006103515625
229 | 2019-11-25,1.6399993896484375
230 | 2019-11-26,0.8000030517578125
231 | 2019-11-27,0.290008544921875
232 | 2019-11-29,-0.94000244140625
233 | 2019-12-02,-1.8300018310546875
234 | 2019-12-03,-0.2400054931640625
235 | 2019-12-04,0.540008544921875
236 | 2019-12-05,0.079986572265625
237 | 2019-12-06,1.82000732421875
238 | 2019-12-09,-0.3899993896484375
239 | 2019-12-10,-0.2299957275390625
240 | 2019-12-11,0.5699920654296875
241 | 2019-12-12,1.540008544921875
242 | 2019-12-13,1.2899932861328125
243 | 2019-12-16,1.0
244 | 2019-12-17,-0.839996337890625
245 | 2019-12-18,-0.32000732421875
246 | 2019-12-19,1.3400115966796875
247 | 2019-12-20,1.6999969482421875
248 | 2019-12-23,0.0
249 | 2019-12-24,-0.029998779296875
250 | 2019-12-26,1.2899932861328125
251 | 2019-12-27,0.290008544921875
252 | 2019-12-30,-1.3700103759765625
253 | 2019-12-31,0.1100006103515625
254 | 2020-01-02,2.9199981689453125
255 | 2020-01-03,-2.0
256 | 2020-01-06,0.410003662109375
257 | 2020-01-07,-1.4499969482421875
258 | 2020-01-08,2.5099945068359375
259 | 2020-01-09,2.0
260 | 2020-01-10,-0.75
261 | 2020-01-13,1.94000244140625
262 | 2020-01-14,-1.149993896484375
263 | 2020-01-15,1.04998779296875
264 | 2020-01-16,2.9900054931640625
265 | 2020-01-17,0.9300079345703125
266 | 2020-01-21,-0.600006103515625
267 | 2020-01-22,-0.8000030517578125
268 | 2020-01-23,1.0200042724609375
269 | 2020-01-24,-1.6800079345703125
270 | 2020-01-27,-2.7599945068359375
271 | 2020-01-28,3.1800079345703125
272 | 2020-01-29,2.579986572265625
273 | 2020-01-30,4.7400054931640625
274 | 2020-01-31,-2.5500030517578125
275 | 2020-02-03,4.1500091552734375
276 | 2020-02-04,5.739990234375
277 | 2020-02-05,-0.220001220703125
278 | 2020-02-06,3.730010986328125
279 | 2020-02-07,0.2599945068359375
280 | 2020-02-10,4.80999755859375
281 | 2020-02-11,-4.2599945068359375
282 | 2020-02-12,0.2700042724609375
283 | 2020-02-13,-1.0
284 | 2020-02-14,1.6399993896484375
285 | 2020-02-18,1.8799896240234375
286 | 2020-02-19,0.0500030517578125
287 | 2020-02-20,-2.8600006103515625
288 | 2020-02-21,-5.8300018310546875
289 | 2020-02-24,-7.6999969482421875
290 | 2020-02-25,-2.8199920654296875
291 | 2020-02-26,2.0999908447265625
292 | 2020-02-27,-11.990005493164062
293 | 2020-02-28,3.8300018310546875
294 | 2020-03-02,10.779998779296875
295 | 2020-03-03,-8.279998779296875
296 | 2020-03-04,6.040008544921875
297 | 2020-03-05,-4.279998779296875
298 | 2020-03-06,-4.6999969482421875
299 | 2020-03-09,-10.95001220703125
300 | 2020-03-10,10.300003051757812
301 | 2020-03-11,-7.2899932861328125
302 | 2020-03-12,-14.57000732421875
303 | 2020-03-13,19.770004272460938
304 | 2020-03-16,-23.410003662109375
305 | 2020-03-17,11.150009155273438
306 | 2020-03-18,-6.170013427734375
307 | 2020-03-19,2.3100128173828125
308 | 2020-03-20,-5.3600006103515625
309 | 2020-03-23,-1.3700103759765625
310 | 2020-03-24,12.360000610351562
311 | 2020-03-25,-1.4199981689453125
312 | 2020-03-26,9.19000244140625
313 | 2020-03-27,-6.410003662109375
314 | 2020-03-30,10.529998779296875
315 | 2020-03-31,-2.519989013671875
316 | 2020-04-01,-5.600006103515625
317 | 2020-04-02,3.149993896484375
318 | 2020-04-03,-1.42999267578125
319 | 2020-04-06,11.44000244140625
320 | 2020-04-07,-1.779998779296875
321 | 2020-04-08,1.6399993896484375
322 | 2020-04-09,0.0099945068359375
323 | 2020-04-13,0.3699951171875
324 | 2020-04-14,8.19000244140625
325 | 2020-04-15,-1.8199920654296875
326 | 2020-04-16,5.1599884033203125
327 | 2020-04-17,1.5600128173828125
328 | 2020-04-20,-3.540008544921875
329 | 2020-04-21,-7.239990234375
330 | 2020-04-22,5.6999969482421875
331 | 2020-04-23,-2.100006103515625
332 | 2020-04-24,3.1300048828125
333 | 2020-04-27,-0.5
334 | 2020-04-28,-4.2400054931640625
335 | 2020-04-29,7.6199951171875
336 | 2020-04-30,1.7800140380859375
337 | 2020-05-01,-4.6399993896484375
338 | 2020-05-04,4.269989013671875
339 | 2020-05-05,1.9199981689453125
340 | 2020-05-06,1.779998779296875
341 | 2020-05-07,1.0600128173828125
342 | 2020-05-08,1.079986572265625
343 | 2020-05-11,2.0600128173828125
344 | 2020-05-12,-4.230010986328125
345 | 2020-05-13,-2.7599945068359375
346 | 2020-05-14,0.779998779296875
347 | 2020-05-15,2.6300048828125
348 | 2020-05-18,1.75
349 | 2020-05-19,-1.279998779296875
350 | 2020-05-20,2.029998779296875
351 | 2020-05-21,-2.230010986328125
352 | 2020-05-22,0.0800018310546875
353 | 2020-05-26,-1.9399871826171875
354 | 2020-05-27,0.239990234375
355 | 2020-05-28,-0.410003662109375
356 | 2020-05-29,1.850006103515625
357 | 2020-06-01,-0.4199981689453125
358 | 2020-06-02,2.0800018310546875
359 | 2020-06-03,0.4499969482421875
360 | 2020-06-04,-2.44000244140625
361 | 2020-06-05,4.279998779296875
362 | 2020-06-08,1.160003662109375
363 | 2020-06-09,1.44000244140625
364 | 2020-06-10,7.0399932861328125
365 | 2020-06-11,-10.569992065429688
366 | 2020-06-12,1.470001220703125
367 | 2020-06-15,1.1999969482421875
368 | 2020-06-16,4.6300048828125
369 | 2020-06-17,0.6699981689453125
370 | 2020-06-18,2.0800018310546875
371 | 2020-06-19,-1.170013427734375
372 | 2020-06-22,5.420013427734375
373 | 2020-06-23,1.339996337890625
374 | 2020-06-24,-4.07000732421875
375 | 2020-06-25,2.5
376 | 2020-06-26,-4.0099945068359375
377 | 2020-06-29,2.1100006103515625
378 | 2020-06-30,5.0699920654296875
379 | 2020-07-01,1.19000244140625
380 | 2020-07-02,1.55999755859375
381 | 2020-07-06,4.44000244140625
382 | 2020-07-07,-2.4499969482421875
383 | 2020-07-08,4.5800018310546875
384 | 2020-07-09,1.4900054931640625
385 | 2020-07-10,-0.6500091552734375
386 | 2020-07-13,-6.5999908447265625
387 | 2020-07-14,1.279998779296875
388 | 2020-07-15,-0.3100128173828125
389 | 2020-07-16,-4.1199951171875
390 | 2020-07-17,-1.0399932861328125
391 | 2020-07-20,8.720001220703125
392 | 2020-07-21,-2.850006103515625
393 | 2020-07-22,3.0
394 | 2020-07-23,-9.210006713867188
395 | 2020-07-24,-1.239990234375
396 | 2020-07-27,2.5500030517578125
397 | 2020-07-28,-1.8300018310546875
398 | 2020-07-29,2.0399932861328125
399 | 2020-07-30,-0.160003662109375
400 | 2020-07-31,1.1100006103515625
401 | 2020-08-03,11.529998779296875
402 | 2020-08-04,-3.25
403 | 2020-08-05,-0.3499908447265625
404 | 2020-08-06,3.410003662109375
405 | 2020-08-07,-3.8700103759765625
406 | 2020-08-10,-4.2299957275390625
407 | 2020-08-11,-4.8699951171875
408 | 2020-08-12,5.80999755859375
409 | 2020-08-13,-0.4900054931640625
410 | 2020-08-14,0.1999969482421875
411 | 2020-08-17,1.3800048828125
412 | 2020-08-18,1.2100067138671875
413 | 2020-08-19,-1.790008544921875
414 | 2020-08-20,4.8800048828125
415 | 2020-08-21,-1.55999755859375
416 | 2020-08-24,0.6699981689453125
417 | 2020-08-25,2.779998779296875
418 | 2020-08-26,4.67999267578125
419 | 2020-08-27,5.4300079345703125
420 | 2020-08-28,2.3300018310546875
421 | 2020-08-31,-3.3800048828125
422 | 2020-09-01,1.7400054931640625
423 | 2020-09-02,4.3799896240234375
424 | 2020-09-03,-14.349990844726562
425 | 2020-09-04,-3.0500030517578125
426 | 2020-09-08,-11.589996337890625
427 | 2020-09-09,8.629989624023438
428 | 2020-09-10,-5.9199981689453125
429 | 2020-09-11,-1.339996337890625
430 | 2020-09-14,1.3800048828125
431 | 2020-09-15,3.3699951171875
432 | 2020-09-16,-3.7299957275390625
433 | 2020-09-17,-2.1399993896484375
434 | 2020-09-18,-2.5200042724609375
435 | 2020-09-21,2.149993896484375
436 | 2020-09-22,4.8800048828125
437 | 2020-09-23,-6.8300018310546875
438 | 2020-09-24,2.600006103515625
439 | 2020-09-25,4.6300048828125
440 | 2020-09-28,1.6199951171875
441 | 2020-09-29,-2.1800079345703125
442 | 2020-09-30,3.07000732421875
443 | 2020-10-01,2.1300048828125
444 | 2020-10-02,-6.2700042724609375
445 | 2020-10-05,4.19000244140625
446 | 2020-10-06,-4.470001220703125
447 | 2020-10-07,3.9199981689453125
448 | 2020-10-08,0.75
449 | 2020-10-09,5.2299957275390625
450 | 2020-10-12,5.589996337890625
451 | 2020-10-13,1.4600067138671875
452 | 2020-10-14,-2.0
453 | 2020-10-15,-1.1999969482421875
454 | 2020-10-16,0.0
455 | 2020-10-19,-5.44000244140625
456 | 2020-10-20,0.42999267578125
457 | 2020-10-21,0.1500091552734375
458 | 2020-10-22,0.089996337890625
459 | 2020-10-23,1.339996337890625
460 | 2020-10-26,-6.149993896484375
461 | 2020-10-27,3.1699981689453125
462 | 2020-10-28,-10.57000732421875
463 | 2020-10-29,2.040008544921875
464 | 2020-10-30,-2.25
465 | 2020-11-02,-0.1399993896484375
466 | 2020-11-03,4.0999908447265625
467 | 2020-11-04,9.960006713867188
468 | 2020-11-05,6.899993896484375
469 | 2020-11-06,0.4300079345703125
470 | 2020-11-09,-5.3300018310546875
471 | 2020-11-10,-7.3800048828125
472 | 2020-11-11,5.540008544921875
473 | 2020-11-12,-1.1100006103515625
474 | 2020-11-13,1.0699920654296875
475 | 2020-11-16,0.720001220703125
476 | 2020-11-17,-2.769989013671875
477 | 2020-11-18,-3.3800048828125
478 | 2020-11-19,1.339996337890625
479 | 2020-11-20,-2.029998779296875
480 | 2020-11-23,-0.279998779296875
481 | 


--------------------------------------------------------------------------------
/codes/datasets/diff_train_aapl.csv:
--------------------------------------------------------------------------------
  1 | Date,AAPL
  2 | 2019-01-02,0.0449981689453125
  3 | 2019-01-03,-3.9324989318847656
  4 | 2019-01-04,1.5174980163574219
  5 | 2019-01-07,-0.08250045776367188
  6 | 2019-01-08,0.7050018310546875
  7 | 2019-01-09,0.6399993896484375
  8 | 2019-01-10,0.12250137329101562
  9 | 2019-01-11,-0.37750244140625
 10 | 2019-01-14,-0.5724983215332031
 11 | 2019-01-15,0.7675018310546875
 12 | 2019-01-16,0.467498779296875
 13 | 2019-01-17,0.22999954223632812
 14 | 2019-01-18,0.24000167846679688
 15 | 2019-01-22,-0.8800010681152344
 16 | 2019-01-23,0.154998779296875
 17 | 2019-01-24,-0.30500030517578125
 18 | 2019-01-25,1.2649993896484375
 19 | 2019-01-28,-0.36499786376953125
 20 | 2019-01-29,-0.4050025939941406
 21 | 2019-01-30,2.6425018310546875
 22 | 2019-01-31,0.2975006103515625
 23 | 2019-02-01,0.020000457763671875
 24 | 2019-02-04,1.1824989318847656
 25 | 2019-02-05,0.7324981689453125
 26 | 2019-02-06,0.015003204345703125
 27 | 2019-02-07,-0.8250007629394531
 28 | 2019-02-08,-0.13249969482421875
 29 | 2019-02-11,-0.24500274658203125
 30 | 2019-02-12,0.3650016784667969
 31 | 2019-02-13,-0.17750167846679688
 32 | 2019-02-14,0.15500259399414062
 33 | 2019-02-15,-0.095001220703125
 34 | 2019-02-19,0.12749862670898438
 35 | 2019-02-20,0.27500152587890625
 36 | 2019-02-21,-0.24250030517578125
 37 | 2019-02-22,0.47750091552734375
 38 | 2019-02-25,0.3149986267089844
 39 | 2019-02-26,0.02500152587890625
 40 | 2019-02-27,0.13499832153320312
 41 | 2019-02-28,-0.43000030517578125
 42 | 2019-03-01,0.4550018310546875
 43 | 2019-03-04,0.220001220703125
 44 | 2019-03-05,-0.0800018310546875
 45 | 2019-03-06,-0.2524986267089844
 46 | 2019-03-07,-0.5050010681152344
 47 | 2019-03-08,0.10250091552734375
 48 | 2019-03-11,1.49749755859375
 49 | 2019-03-12,0.50250244140625
 50 | 2019-03-13,0.20000076293945312
 51 | 2019-03-14,0.5049972534179688
 52 | 2019-03-15,0.5974998474121094
 53 | 2019-03-18,0.4750022888183594
 54 | 2019-03-19,-0.3725013732910156
 55 | 2019-03-20,0.407501220703125
 56 | 2019-03-21,1.7324981689453125
 57 | 2019-03-22,-1.0099983215332031
 58 | 2019-03-25,-0.5774993896484375
 59 | 2019-03-26,-0.4875030517578125
 60 | 2019-03-27,0.4200019836425781
 61 | 2019-03-28,0.0625
 62 | 2019-03-29,0.3074989318847656
 63 | 2019-04-01,0.32250213623046875
 64 | 2019-04-02,0.6949996948242188
 65 | 2019-04-03,0.3325004577636719
 66 | 2019-04-04,0.08499908447265625
 67 | 2019-04-05,0.3274993896484375
 68 | 2019-04-08,0.7750015258789062
 69 | 2019-04-09,-0.15000152587890625
 70 | 2019-04-10,0.279998779296875
 71 | 2019-04-11,-0.4174995422363281
 72 | 2019-04-12,-0.020000457763671875
 73 | 2019-04-15,0.09000015258789062
 74 | 2019-04-16,0.005001068115234375
 75 | 2019-04-17,0.970001220703125
 76 | 2019-04-18,0.18249893188476562
 77 | 2019-04-22,0.16749954223632812
 78 | 2019-04-23,0.7374992370605469
 79 | 2019-04-24,-0.07999801635742188
 80 | 2019-04-25,-0.470001220703125
 81 | 2019-04-26,-0.24499893188476562
 82 | 2019-04-29,0.0774993896484375
 83 | 2019-04-30,-0.9850006103515625
 84 | 2019-05-01,2.4625015258789062
 85 | 2019-05-02,-0.3425025939941406
 86 | 2019-05-03,0.6500015258789062
 87 | 2019-05-06,-0.8175010681152344
 88 | 2019-05-07,-1.404998779296875
 89 | 2019-05-08,0.009998321533203125
 90 | 2019-05-09,-0.5449981689453125
 91 | 2019-05-10,-0.8850021362304688
 92 | 2019-05-13,-2.8649978637695312
 93 | 2019-05-14,0.7350006103515625
 94 | 2019-05-15,0.5649986267089844
 95 | 2019-05-16,-0.20999908447265625
 96 | 2019-05-17,-0.2700004577636719
 97 | 2019-05-20,-1.4775009155273438
 98 | 2019-05-21,0.87750244140625
 99 | 2019-05-22,-0.9550018310546875
100 | 2019-05-23,-0.779998779296875
101 | 2019-05-24,-0.1725006103515625
102 | 2019-05-28,-0.18500137329101562
103 | 2019-05-29,-0.21249771118164062
104 | 2019-05-30,0.22999954223632812
105 | 2019-05-31,-0.8074989318847656
106 | 2019-06-03,-0.4425010681152344
107 | 2019-06-04,1.5849990844726562
108 | 2019-06-05,0.7249984741210938
109 | 2019-06-06,0.6700019836425781
110 | 2019-06-07,1.2324981689453125
111 | 2019-06-10,0.6075019836425781
112 | 2019-06-11,0.5574989318847656
113 | 2019-06-12,-0.154998779296875
114 | 2019-06-13,-0.01000213623046875
115 | 2019-06-14,-0.3524971008300781
116 | 2019-06-17,0.28749847412109375
117 | 2019-06-18,1.1399993896484375
118 | 2019-06-19,-0.14500045776367188
119 | 2019-06-20,0.3975028991699219
120 | 2019-06-21,-0.17000198364257812
121 | 2019-06-24,-0.049999237060546875
122 | 2019-06-25,-0.7524986267089844
123 | 2019-06-26,1.0574989318847656
124 | 2019-06-27,-0.0149993896484375
125 | 2019-06-28,-0.4550018310546875
126 | 2019-07-01,0.907501220703125
127 | 2019-07-02,0.2949981689453125
128 | 2019-07-03,0.4200019836425781
129 | 2019-07-05,-0.045001983642578125
130 | 2019-07-08,-1.0524978637695312
131 | 2019-07-09,0.30500030517578125
132 | 2019-07-10,0.49749755859375
133 | 2019-07-11,-0.3699989318847656
134 | 2019-07-12,0.3875007629394531
135 | 2019-07-15,0.47750091552734375
136 | 2019-07-16,-0.17750167846679688
137 | 2019-07-17,-0.28749847412109375
138 | 2019-07-18,0.5774993896484375
139 | 2019-07-19,-0.7675018310546875
140 | 2019-07-22,1.157501220703125
141 | 2019-07-23,0.404998779296875
142 | 2019-07-24,-0.042499542236328125
143 | 2019-07-25,-0.41249847412109375
144 | 2019-07-26,0.18000030517578125
145 | 2019-07-29,0.4849967956542969
146 | 2019-07-30,-0.22499847412109375
147 | 2019-07-31,1.0649986267089844
148 | 2019-08-01,-1.1525001525878906
149 | 2019-08-02,-1.1024971008300781
150 | 2019-08-05,-2.670001983642578
151 | 2019-08-06,0.9150009155273438
152 | 2019-08-07,0.5099983215332031
153 | 2019-08-08,1.0974998474121094
154 | 2019-08-09,-0.6099967956542969
155 | 2019-08-12,-0.12750244140625
156 | 2019-08-13,2.1225013732910156
157 | 2019-08-14,-1.5550003051757812
158 | 2019-08-15,-0.2524986267089844
159 | 2019-08-16,1.1899986267089844
160 | 2019-08-19,0.9625015258789062
161 | 2019-08-20,0.002498626708984375
162 | 2019-08-21,0.5699996948242188
163 | 2019-08-22,-0.0449981689453125
164 | 2019-08-23,-2.4550018310546875
165 | 2019-08-26,0.9625015258789062
166 | 2019-08-27,-0.5825004577636719
167 | 2019-08-28,0.342498779296875
168 | 2019-08-29,0.8699989318847656
169 | 2019-08-30,-0.06749725341796875
170 | 2019-09-03,-0.7600021362304688
171 | 2019-09-04,0.8725013732910156
172 | 2019-09-05,1.0224990844726562
173 | 2019-09-06,-0.005001068115234375
174 | 2019-09-09,0.22750091552734375
175 | 2019-09-10,0.6324996948242188
176 | 2019-09-11,1.7224998474121094
177 | 2019-09-12,-0.125
178 | 2019-09-13,-1.0849990844726562
179 | 2019-09-16,0.28749847412109375
180 | 2019-09-17,0.20000076293945312
181 | 2019-09-18,0.5175018310546875
182 | 2019-09-19,-0.4524993896484375
183 | 2019-09-20,-0.8075027465820312
184 | 2019-09-23,0.24750137329101562
185 | 2019-09-24,-0.26000213623046875
186 | 2019-09-25,0.8375015258789062
187 | 2019-09-26,-0.2849998474121094
188 | 2019-09-27,-0.2674980163574219
189 | 2019-09-30,1.2874984741210938
190 | 2019-10-01,0.154998779296875
191 | 2019-10-02,-1.4074974060058594
192 | 2019-10-03,0.4650001525878906
193 | 2019-10-04,1.5474967956542969
194 | 2019-10-07,0.012500762939453125
195 | 2019-10-08,-0.6650009155273438
196 | 2019-10-09,0.657501220703125
197 | 2019-10-10,0.7649993896484375
198 | 2019-10-11,1.5300025939941406
199 | 2019-10-14,-0.08500289916992188
200 | 2019-10-15,-0.1374969482421875
201 | 2019-10-16,-0.2375030517578125
202 | 2019-10-17,0.22750091552734375
203 | 2019-10-18,0.282501220703125
204 | 2019-10-21,1.0249977111816406
205 | 2019-10-22,-0.1374969482421875
206 | 2019-10-23,0.8049964904785156
207 | 2019-10-24,0.10000228881835938
208 | 2019-10-25,0.75
209 | 2019-10-28,0.6175003051757812
210 | 2019-10-29,-1.44000244140625
211 | 2019-10-30,-0.00749969482421875
212 | 2019-10-31,1.375
213 | 2019-11-01,1.7650032043457031
214 | 2019-11-04,0.4199981689453125
215 | 2019-11-05,-0.092498779296875
216 | 2019-11-06,0.027496337890625
217 | 2019-11-07,0.5475006103515625
218 | 2019-11-08,0.1775054931640625
219 | 2019-11-11,0.5149993896484375
220 | 2019-11-12,-0.06000518798828125
221 | 2019-11-13,0.62750244140625
222 | 2019-11-14,-0.45749664306640625
223 | 2019-11-15,0.779998779296875
224 | 2019-11-18,0.33499908447265625
225 | 2019-11-19,-0.2024993896484375
226 | 2019-11-20,-0.7750015258789062
227 | 2019-11-21,-0.2949981689453125
228 | 2019-11-22,-0.05750274658203125
229 | 2019-11-25,1.1474990844726562
230 | 2019-11-26,-0.5199966430664062
231 | 2019-11-27,0.8874969482421875
232 | 2019-11-29,-0.14749908447265625
233 | 2019-12-02,-0.7724990844726562
234 | 2019-12-03,-1.1774978637695312
235 | 2019-12-04,0.5724945068359375
236 | 2019-12-05,0.9599990844726562
237 | 2019-12-06,1.282501220703125
238 | 2019-12-09,-0.9474945068359375
239 | 2019-12-10,0.3899993896484375
240 | 2019-12-11,0.5724945068359375
241 | 2019-12-12,0.1725006103515625
242 | 2019-12-13,0.9225006103515625
243 | 2019-12-16,1.1774978637695312
244 | 2019-12-17,0.13750457763671875
245 | 2019-12-18,-0.16750335693359375
246 | 2019-12-19,0.06999969482421875
247 | 2019-12-20,-0.14499664306640625
248 | 2019-12-23,1.1399993896484375
249 | 2019-12-24,0.06749725341796875
250 | 2019-12-26,1.410003662109375
251 | 2019-12-27,-0.02750396728515625
252 | 2019-12-30,0.43000030517578125
253 | 2019-12-31,0.532501220703125
254 | 2020-01-02,1.6750030517578125
255 | 2020-01-03,-0.7300033569335938
256 | 2020-01-06,0.592498779296875
257 | 2020-01-07,-0.3524932861328125
258 | 2020-01-08,1.1999969482421875
259 | 2020-01-09,1.6100006103515625
260 | 2020-01-10,0.17499542236328125
261 | 2020-01-13,1.657501220703125
262 | 2020-01-14,-1.0699996948242188
263 | 2020-01-15,-0.33499908447265625
264 | 2020-01-16,0.9749984741210938
265 | 2020-01-17,0.8725051879882812
266 | 2020-01-21,-0.5400009155273438
267 | 2020-01-22,0.282501220703125
268 | 2020-01-23,0.38249969482421875
269 | 2020-01-24,-0.23000335693359375
270 | 2020-01-27,-2.339996337890625
271 | 2020-01-28,2.18499755859375
272 | 2020-01-29,1.6624984741210938
273 | 2020-01-30,-0.11750030517578125
274 | 2020-01-31,-3.589996337890625
275 | 2020-02-03,-0.21250152587890625
276 | 2020-02-04,2.5475006103515625
277 | 2020-02-05,0.6500015258789062
278 | 2020-02-06,0.9399948120117188
279 | 2020-02-07,-1.2949981689453125
280 | 2020-02-10,0.37999725341796875
281 | 2020-02-11,-0.4850006103515625
282 | 2020-02-12,1.8975067138671875
283 | 2020-02-13,-0.5825042724609375
284 | 2020-02-14,0.0200042724609375
285 | 2020-02-18,-1.4875030517578125
286 | 2020-02-19,1.154998779296875
287 | 2020-02-20,-0.8300018310546875
288 | 2020-02-21,-1.8125
289 | 2020-02-24,-3.717498779296875
290 | 2020-02-25,-2.5250015258789062
291 | 2020-02-26,1.1425018310546875
292 | 2020-02-27,-4.782501220703125
293 | 2020-02-28,-0.04000091552734375
294 | 2020-03-02,6.3625030517578125
295 | 2020-03-03,-2.37249755859375
296 | 2020-03-04,3.3549957275390625
297 | 2020-03-05,-2.4549942016601562
298 | 2020-03-06,-0.972503662109375
299 | 2020-03-09,-5.714996337890625
300 | 2020-03-10,4.7924957275390625
301 | 2020-03-11,-2.4775009155273438
302 | 2020-03-12,-6.799999237060547
303 | 2020-03-13,7.435001373291016
304 | 2020-03-16,-8.939998626708984
305 | 2020-03-17,2.6624984741210938
306 | 2020-03-18,-1.5475006103515625
307 | 2020-03-19,-0.4724998474121094
308 | 2020-03-20,-3.884998321533203
309 | 2020-03-23,-1.2175025939941406
310 | 2020-03-24,5.62750244140625
311 | 2020-03-25,-0.3400001525878906
312 | 2020-03-26,3.229999542236328
313 | 2020-03-27,-2.674999237060547
314 | 2020-03-30,1.7674980163574219
315 | 2020-03-31,-0.13000106811523438
316 | 2020-04-01,-3.3449974060058594
317 | 2020-04-02,1.0049972534179688
318 | 2020-04-03,-0.8799972534179688
319 | 2020-04-06,5.2649993896484375
320 | 2020-04-07,-0.7600021362304688
321 | 2020-04-08,1.660003662109375
322 | 2020-04-09,0.4799957275390625
323 | 2020-04-13,1.31500244140625
324 | 2020-04-14,3.4499969482421875
325 | 2020-04-15,-0.654998779296875
326 | 2020-04-16,0.56500244140625
327 | 2020-04-17,-0.972503662109375
328 | 2020-04-20,-1.467498779296875
329 | 2020-04-21,-2.1399993896484375
330 | 2020-04-22,1.9325027465820312
331 | 2020-04-23,-0.2675018310546875
332 | 2020-04-24,1.9850006103515625
333 | 2020-04-27,0.0500030517578125
334 | 2020-04-28,-1.1475067138671875
335 | 2020-04-29,2.287506103515625
336 | 2020-04-30,1.5174942016601562
337 | 2020-05-01,-1.1824951171875
338 | 2020-05-04,1.0224990844726562
339 | 2020-05-05,1.0999984741210938
340 | 2020-05-06,0.7675018310546875
341 | 2020-05-07,0.777496337890625
342 | 2020-05-08,1.597503662109375
343 | 2020-05-11,1.220001220703125
344 | 2020-05-12,-0.9000015258789062
345 | 2020-05-13,-0.94000244140625
346 | 2020-05-14,0.472503662109375
347 | 2020-05-15,-0.4575042724609375
348 | 2020-05-18,1.8125
349 | 2020-05-19,-0.45499420166015625
350 | 2020-05-20,1.5224990844726562
351 | 2020-05-21,-0.595001220703125
352 | 2020-05-22,0.5100021362304688
353 | 2020-05-26,-0.5400009155273438
354 | 2020-05-27,0.34499359130859375
355 | 2020-05-28,0.035003662109375
356 | 2020-05-29,-0.0774993896484375
357 | 2020-06-01,0.9775009155273438
358 | 2020-06-02,0.37249755859375
359 | 2020-06-03,0.44499969482421875
360 | 2020-06-04,-0.6999969482421875
361 | 2020-06-05,2.2949981689453125
362 | 2020-06-08,0.48999786376953125
363 | 2020-06-09,2.6324996948242188
364 | 2020-06-10,2.2125015258789062
365 | 2020-06-11,-4.2350006103515625
366 | 2020-06-12,0.7249984741210938
367 | 2020-06-15,1.0475006103515625
368 | 2020-06-16,2.2724990844726562
369 | 2020-06-17,-0.12249755859375
370 | 2020-06-18,0.035003662109375
371 | 2020-06-19,-0.50250244140625
372 | 2020-06-22,2.2874984741210938
373 | 2020-06-23,1.9150009155273438
374 | 2020-06-24,-1.6175003051757812
375 | 2020-06-25,1.1949996948242188
376 | 2020-06-26,-2.8024978637695312
377 | 2020-06-29,2.0374984741210938
378 | 2020-06-30,0.7549972534179688
379 | 2020-07-01,-0.1725006103515625
380 | 2020-07-02,0.0
381 | 2020-07-06,2.4350051879882812
382 | 2020-07-07,-0.29000091552734375
383 | 2020-07-08,2.1699981689453125
384 | 2020-07-09,0.410003662109375
385 | 2020-07-10,0.1674957275390625
386 | 2020-07-13,-0.44249725341796875
387 | 2020-07-14,1.5800018310546875
388 | 2020-07-15,0.6674957275390625
389 | 2020-07-16,-1.2024993896484375
390 | 2020-07-17,-0.19499969482421875
391 | 2020-07-20,2.029998779296875
392 | 2020-07-21,-1.3574981689453125
393 | 2020-07-22,0.27249908447265625
394 | 2020-07-23,-4.427497863769531
395 | 2020-07-24,-0.23000335693359375
396 | 2020-07-27,2.1949996948242188
397 | 2020-07-28,-1.5574951171875
398 | 2020-07-29,1.7874984741210938
399 | 2020-07-30,1.1500015258789062
400 | 2020-07-31,10.069999694824219
401 | 2020-08-03,2.6774978637695312
402 | 2020-08-04,0.7275009155273438
403 | 2020-08-05,0.39749908447265625
404 | 2020-08-06,3.839996337890625
405 | 2020-08-07,-2.7899932861328125
406 | 2020-08-10,1.6149978637695312
407 | 2020-08-11,-3.3525009155273438
408 | 2020-08-12,3.6350021362304688
409 | 2020-08-13,2.0
410 | 2020-08-14,-0.10250091552734375
411 | 2020-08-17,-0.3000030517578125
412 | 2020-08-18,0.9550018310546875
413 | 2020-08-19,0.14499664306640625
414 | 2020-08-20,2.5675048828125
415 | 2020-08-21,6.095001220703125
416 | 2020-08-24,1.4874954223632812
417 | 2020-08-25,-1.032501220703125
418 | 2020-08-26,1.6975021362304688
419 | 2020-08-27,-1.5124969482421875
420 | 2020-08-28,-0.2024993896484375
421 | 2020-08-31,4.232490539550781
422 | 2020-09-01,5.1399993896484375
423 | 2020-09-02,-2.779998779296875
424 | 2020-09-03,-10.519996643066406
425 | 2020-09-04,0.0800018310546875
426 | 2020-09-08,-8.139999389648438
427 | 2020-09-09,4.5
428 | 2020-09-10,-3.8300018310546875
429 | 2020-09-11,-1.4899978637695312
430 | 2020-09-14,3.3600006103515625
431 | 2020-09-15,0.18000030517578125
432 | 2020-09-16,-3.410003662109375
433 | 2020-09-17,-1.7900009155273438
434 | 2020-09-18,-3.5
435 | 2020-09-21,3.2400054931640625
436 | 2020-09-22,1.7299957275390625
437 | 2020-09-23,-4.689994812011719
438 | 2020-09-24,1.0999984741210938
439 | 2020-09-25,4.05999755859375
440 | 2020-09-28,2.6800003051757812
441 | 2020-09-29,-0.8700027465820312
442 | 2020-09-30,1.720001220703125
443 | 2020-10-01,0.9800033569335938
444 | 2020-10-02,-3.7700042724609375
445 | 2020-10-05,3.4800033569335938
446 | 2020-10-06,-3.339996337890625
447 | 2020-10-07,1.9199981689453125
448 | 2020-10-08,-0.1100006103515625
449 | 2020-10-09,2.0
450 | 2020-10-12,7.430000305175781
451 | 2020-10-13,-3.3000030517578125
452 | 2020-10-14,0.09000396728515625
453 | 2020-10-15,-0.48000335693359375
454 | 2020-10-16,-1.69000244140625
455 | 2020-10-19,-3.0399932861328125
456 | 2020-10-20,1.529998779296875
457 | 2020-10-21,-0.6399993896484375
458 | 2020-10-22,-1.1200027465820312
459 | 2020-10-23,-0.7099990844726562
460 | 2020-10-26,0.01000213623046875
461 | 2020-10-27,1.5499954223632812
462 | 2020-10-28,-5.400001525878906
463 | 2020-10-29,4.120002746582031
464 | 2020-10-30,-6.459999084472656
465 | 2020-11-02,-0.09000396728515625
466 | 2020-11-03,1.6700057983398438
467 | 2020-11-04,4.5099945068359375
468 | 2020-11-05,4.0800018310546875
469 | 2020-11-06,-0.339996337890625
470 | 2020-11-09,-2.3700027465820312
471 | 2020-11-10,-0.34999847412109375
472 | 2020-11-11,3.5199966430664062
473 | 2020-11-12,-0.279998779296875
474 | 2020-11-13,0.0500030517578125
475 | 2020-11-16,1.0400009155273438
476 | 2020-11-17,-0.910003662109375
477 | 2020-11-18,-1.3600006103515625
478 | 2020-11-19,0.6100006103515625
479 | 2020-11-20,-1.3000030517578125
480 | 2020-11-23,-3.4899978637695312
481 | 


--------------------------------------------------------------------------------
/codes/pyeconometrics/panel_discrete_models.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import numpy as np
  3 | import scipy.stats as st
  4 | import scipy.integrate as spint
  5 | 
  6 | import warnings
  7 | warnings.filterwarnings('ignore')
  8 | 
  9 | from numpy.linalg import inv
 10 | from math import exp, sqrt, log
 11 | 
 12 | from pyeconometrics.base import PanelBaseModel
 13 | from pyeconometrics.utils import nCr, unique_permutations
 14 | 
 15 | 
 16 | class FixedEffectPanelModel(PanelBaseModel):
 17 |     '''Fixed Effects Logit model for Panel Data
 18 |     Estimation of parameters with the Conditional Maximum Likelihood method
 19 |     '''
 20 |     def __init__(self):
 21 |         self.name = 'Panel Fixed Effects Logit'
 22 |         self.output = None
 23 |         self.variables = None
 24 |         self.nb_obs = None
 25 |         self.init_ll = None
 26 |         self.beta = None
 27 |         self.beta_est = None
 28 |         self.beta_se = None
 29 |         self.confidence_interval = None
 30 |         self.final_ll = None
 31 |         self.converged = None
 32 | 
 33 |     def response_function(self, X, beta):
 34 |         A = X.copy()
 35 |         try:
 36 |             A.drop(self.output, axis=1, inplace=True)
 37 |         except:
 38 |             pass
 39 |         
 40 |         return np.array(A).dot(beta)
 41 |         
 42 |     def __log_likelihood_obs(self, X, y, beta):
 43 |         X.reset_index(drop=True,inplace=True)
 44 |         y.reset_index(drop=True,inplace=True)
 45 | 
 46 |         Z = np.array(self.response_function(X, beta))
 47 | 
 48 |         if nCr(len(y),sum(y)) <= 100:
 49 |             perms = unique_permutations(y)
 50 |         else:
 51 |             perms = [np.random.permutation(y) for _ in range(100)]
 52 | 
 53 |         result = []
 54 |         for a in perms:
 55 |             result.append(np.exp(Z.dot(a)))
 56 | 
 57 |         result = Z.dot(np.array(y)) - log(sum(result))
 58 |         return result
 59 |             
 60 |     def __log_likelihood(self, X, beta):
 61 |         result = sum(np.array(X.apply(lambda group : \
 62 |             self.__log_likelihood_obs(group,
 63 |             group[self.output], beta))))
 64 | 
 65 |         return result
 66 |         
 67 |     def __conditional_probability(self, X, y, beta):
 68 |         if nCr(len(y),sum(y)) <= 100:
 69 |             perms = unique_permutations(y)
 70 |         else:
 71 |             perms = [np.random.permutation(y) for _ in range(100)]
 72 | 
 73 |         result = []
 74 |         for z in perms:
 75 |             result.append(exp(np.array(z).T.dot(np.array(X).dot(beta))))
 76 | 
 77 |         result = np.sum(np.array(result), axis=0)
 78 |         result = exp(np.array(y).T.dot(np.array(X).dot(beta))) / result
 79 | 
 80 |         return result
 81 |     
 82 |     def __score_obs(self, X, y, beta):
 83 |         X.drop(self.output, axis=1, inplace=True)
 84 | 
 85 |         X.reset_index(drop=True,inplace=True)
 86 |         y.reset_index(drop=True,inplace=True)
 87 | 
 88 |         if sum(y) == 0 or sum(y) == len(y):
 89 |             return np.array([0 for _ in range(len(X.columns))])
 90 | 
 91 |         else:
 92 |             if nCr(len(y),sum(y)) <= 100:
 93 |                 perms = unique_permutations(y)
 94 |             else:
 95 |                 perms = [np.random.permutation(y) for _ in range(100)]
 96 | 
 97 |             result = []
 98 |             for z in perms:
 99 |                 result.append(np.array(z) \
100 |                     * self.__conditional_probability(X,z,beta))
101 | 
102 |             result = np.sum(np.array(result), axis=0)
103 |             result = np.array(X).T.dot(np.array(y) - result)
104 | 
105 |             return result
106 | 
107 |     def __score(self, X, beta):
108 |         return np.sum(np.array(X.apply(lambda group : \
109 |             self.__score_obs(group, group[self.output], beta))), axis=0)
110 |             
111 |     def __hessian_obs(self, X, y, beta):
112 |         X.drop(self.output, axis=1, inplace=True)
113 | 
114 |         X.reset_index(drop=True,inplace=True)
115 |         y.reset_index(drop=True,inplace=True)
116 | 
117 |         if sum(y) == 0 or sum(y) == len(y):
118 |             return np.array([[0 for _ in range(len(X.columns))] \
119 |                 for _ in range(len(X.columns))])
120 | 
121 |         else:
122 |             if nCr(len(y),sum(y)) <= 100:
123 |                 perms = unique_permutations(y)
124 |             else:
125 |                 perms = [list(np.random.permutation(y)) for _ in range(100)]
126 | 
127 |             probas = []
128 |             esp = []
129 |             result = []
130 |             i = 0
131 |             for z in perms:
132 |                 probas.append(self.__conditional_probability(X,z,beta))
133 |                 esp.append(np.array(z) * probas[i])
134 |                 result.append(np.array(z).dot(np.array(z).T) * probas[i])
135 |                 i += 1
136 | 
137 |             esp = np.sum(np.array(esp), axis=0)
138 |             result = np.sum(np.array(result), axis=0)
139 |             result = np.array(X).T.dot(
140 |                 result - esp.T.dot(esp)).dot(np.array(X))
141 | 
142 |             return -result
143 | 
144 |     def __hessian(self, X, beta):
145 |         return np.sum(np.array(X.apply(lambda group : \
146 |             self.__hessian_obs(group,group[self.output], beta))), axis=0)
147 | 
148 |     def fit(self, X, output, nb_iter=20, drop_na=True, fill_value=None, verbose=False):
149 |         '''Maximum Likelihhod Estimation
150 |         Implement a Newton-Raphson algorithm to estimate parameters
151 | 
152 |         Parameters:
153 |         ----------
154 |         X: 2-level MultiIndex Dataframe
155 |             Database to fit the model
156 | 
157 |         output: string
158 |             Name of the variable to predict
159 | 
160 |         nb_iter: integer (optional, default 20)
161 |             Maximal number of iteration before the end of the Newton-Raphson algorithm
162 | 
163 |         drop_na: boolean (optional, default True)
164 |             Indicate the method to handle missing values in X
165 |             If drop_na = False, fill_value has to be given
166 | 
167 |         fill_value: string or dict (optional, defaul None)
168 |             Considered only if drop_na = False
169 |             Possible values:
170 |                 - 'mean': missing values of a column are replaced by the mean of that column
171 |                 - 'median': missing values of a column are replaced by the median of that column
172 |                 - dict: keys must be variables' names and associated values the values used to fill Nan
173 | 
174 |         verbose: boolean (optional, default False)
175 |             If set to True, allows prints of Newton-Raphson algorithm's progress
176 |         '''
177 |         self.output = output
178 |         X = self.input_data_preparation(X.copy(), drop_na, fill_value)
179 |         X.insert(0, '_cons', 1)
180 | 
181 |         labels = list(np.unique(X[self.output]))
182 |         if labels != [0,1]:
183 |             raise ValueError("Labels must be in the unit interval.")
184 |         
185 |         self.nb_obs = len(X)
186 |         self.variables = [x for x in X.columns if x != self.output]
187 |         
188 |         beta_init = [0 for _ in range(len(self.variables))]   
189 |         self.beta_est = np.zeros((nb_iter,len(beta_init)))
190 |         self.beta_est[0] = beta_init
191 | 
192 |         X = X.groupby(level=0)
193 | 
194 |         self.init_ll = self.__log_likelihood(X, beta_init)
195 | 
196 |         if verbose:
197 |             print('Initial log-likelihood : '+ str(self.init_ll))
198 |             print('Parameters estimation in progress.')
199 |         
200 |         current_ll = self.init_ll
201 |         prev_ll = self.init_ll
202 |         j = 1
203 |         while (j < nb_iter) \
204 |             and (j == 1 or (current_ll - prev_ll > 0.01)):
205 |             
206 |             score = self.__score(X, self.beta_est[j-1])
207 |             hessian = self.__hessian(X, self.beta_est[j-1])
208 | 
209 |             try:
210 |                 self.beta_est[j] = self.beta_est[j-1] \
211 |                     - inv(hessian).dot(score)
212 |             except:
213 |                 raise ValueError('Improper classification problem' \
214 |                     + ', should be 2 different labels')
215 | 
216 |             prev_ll = current_ll
217 |             current_ll = self.__log_likelihood(X, self.beta_est[j])
218 |             if verbose:              
219 |                 print('Iteration %s, log-likelihood : %s'\
220 |                     % (j, current_ll))
221 |             j += 1
222 | 
223 |         self.beta = self.beta_est[j-2]
224 |         self.beta_est = self.beta_est[:j-1,:]
225 | 
226 |         sqrt_vec = np.vectorize(sqrt)
227 |         hessian = self.__hessian(X, self.beta_est[j-2])
228 |         self.beta_se = sqrt_vec(-inv(hessian).diagonal())
229 | 
230 |         self.confidence_interval = np.array(
231 |                 [[self.beta[i] - st.norm.ppf(0.975) * self.beta_se[i],
232 |                     self.beta[i] + st.norm.ppf(0.975) * self.beta_se[i]]
233 |                     for i in range(len(self.beta))])
234 | 
235 |         self.final_ll = prev_ll
236 | 
237 |         if j < nb_iter:
238 |             self.converged = True
239 |         else:
240 |             self.converged = False
241 | 
242 |         return self
243 |         
244 |     
245 | 
246 | 
247 | 
248 | 
249 | 
class RandomEffectsPanelModel(PanelBaseModel):
    '''Random effects model for a binary output observed on panel data.

    Within a group the linear index of a row is mu + sigma * w + x'beta and
    the response probability is logistic in that index.  The group effect w
    is integrated out numerically against a standard normal density
    (residual_dist = 'probit') or a standard logistic density
    (residual_dist = 'logit').

    Parameters:
    ----------
    residual_dist: string
        'probit' or 'logit'; any other value raises ValueError when the
        likelihood is evaluated
    '''

    def __init__(self, residual_dist):
        self.name = 'Panel Random Effects Model'
        self.residual_dist = residual_dist
        self.output = None
        self.variables = None
        self.nb_obs = None
        self.init_ll = None
        self.beta = None
        self.mu = None
        self.sigma = None
        self.beta_est = None
        self.beta_se = None
        self.confidence_interval = None
        self.final_ll = None
        self.converged = None

    def response_function(self, X, beta, mu):
        '''Linear index mu + sum_i beta[i] * X[variables[i]].

        The output column, when present in X, is ignored.  Returns a Series
        named 'response'.
        '''
        A = X.copy()
        if self.output in A.columns:
            A = A.drop(self.output, axis=1)

        Z = mu
        for i,var in enumerate(self.variables):
            # Rebind instead of '+=' so that an array-like mu supplied by the
            # caller is never modified in place
            Z = Z + beta[i] * A[var]

        return Z.rename('response')

    def __prepare_group(self, X, y):
        '''Return copies of a group with a fresh index and no output column.

        groupby.apply supplies the group; pandas does not support functions
        that mutate the object they receive, hence the copies.
        '''
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)
        if self.output in X.columns:
            X = X.drop(self.output, axis=1)
        return X, y

    def __mixing_pdf(self):
        '''Density of the group effect w selected by residual_dist.'''
        if self.residual_dist == 'probit':
            return st.norm(0,1).pdf
        if self.residual_dist == 'logit':
            return st.logistic(0,1).pdf
        raise ValueError('Unknown value for argument residual_dist')

    def __calculus_tools(self, X, w, beta, mu, sigma):
        '''Build the regressors z and parameters gamma for one group.

        z has one column per row of X, each holding [1, w, x...]; gamma has
        the same shape with every column equal to [mu, sigma, beta...].
        '''
        z = np.repeat(np.array([[1, w]]), X.shape[0], axis=0)
        z = np.concatenate((z, X), axis=1).T

        gamma = np.repeat(np.array([[mu, sigma]]), X.shape[0], axis=0)
        beta = np.array(beta, ndmin=2)
        beta = np.repeat(beta, X.shape[0], axis=0)
        gamma = np.concatenate((gamma, beta), axis=1).T

        return z, gamma

    def __conditional_density_obs(self, X, w, y, beta, mu, sigma):
        '''Probability of the group's outputs y given the effect w.'''
        z, gamma = self.__calculus_tools(X, w, beta, mu, sigma)
        # All columns of gamma are identical, so the first column of
        # z'gamma is the linear index of every row
        item = z.T.dot(gamma)[:,0]

        num = np.exp(np.multiply(np.array(y), item))
        denom = 1 + np.exp(item)
        result = np.prod(np.divide(num, denom))

        return result

    def __grad_conditional_density_obs(self, X, w, y, beta, mu, sigma):
        '''Gradient of __conditional_density_obs w.r.t. [mu, sigma, beta...].'''
        z, gamma = self.__calculus_tools(X, w, beta, mu, sigma)

        item = np.exp(z.T.dot(gamma)[:,0])
        result = np.array(y) - item / (1+item)

        result = z.dot(result)
        result = result * self.__conditional_density_obs(X, w, y, beta, mu, sigma)

        return result

    def __log_likelihood_obs(self, X, y, beta, mu, sigma):
        '''Log-likelihood of one group, the effect w being integrated out.'''
        X, y = self.__prepare_group(X, y)
        pdf = self.__mixing_pdf()

        # NOTE(review): the limits scale with sigma although w is integrated
        # against a (0, 1) density; a non-positive sigma reverses or
        # collapses the interval -- confirm the intended range.
        result = spint.quad(
            lambda w : self.__conditional_density_obs(X, w, y, beta, mu, sigma) \
                * pdf(w), -3*sigma, 3*sigma)[0]

        return log(result)

    def __log_likelihood(self, X, beta, mu, sigma):
        '''Sum of the per-group log-likelihoods (X is the grouped frame).'''
        result = np.sum(np.array(X.apply(lambda group : \
            self.__log_likelihood_obs(group, group[self.output], beta, mu, sigma))), axis=0)

        return result

    def __score_obs(self, X, y, beta, mu, sigma):
        '''Gradient of one group's log-likelihood, ordered [mu, sigma, beta...].'''
        X, y = self.__prepare_group(X, y)
        pdf = self.__mixing_pdf()

        def integrated_gradient(i):
            # i-th component of the gradient of the group's likelihood
            return spint.quad(
                lambda w : self.__grad_conditional_density_obs(X, w, y, beta, mu, sigma)[i] \
                    * pdf(w), -3*sigma, 3*sigma)[0]

        result = np.array([integrated_gradient(i) for i in range(len(beta)+2)])

        # Dividing by the likelihood turns it into the gradient of the log
        result = result / exp(self.__log_likelihood_obs(X, y, beta, mu, sigma))
        return result

    def __score(self, X, beta, mu, sigma):
        '''Return (per-group scores, their sum over groups).'''
        list_score_obs = X.apply(lambda group : self.__score_obs(
            group, group[self.output], beta, mu, sigma))
        return (list_score_obs, np.sum(np.array(list_score_obs), axis=0))

    def __hessian(self, list_score_obs):
        '''Outer-product-of-scores matrix built from the per-group scores.

        The result is positive semi-definite: it plays the role of minus the
        Hessian of the log-likelihood, which is why fit adds the step it
        yields and takes standard errors from its inverse without a sign
        change.
        '''
        list_score_obs = list_score_obs.apply(lambda array : np.array(array, ndmin=2)).values
        scores = np.concatenate(list(list_score_obs))

        # Sum over groups of s_i s_i'
        sum_score_obs = scores.T.dot(scores)

        score = np.array(np.sum(np.array(scores), axis=0), ndmin=2).T
        # NOTE(review): self.nb_obs is the number of rows set in fit, not the
        # number of groups whose scores are summed here -- confirm the
        # intended divisor.
        result = sum_score_obs - score.dot(score.T) / self.nb_obs

        return result

    def fit(self, X, output, nb_iter=20, drop_na=True, fill_value=None, verbose=False):
        '''Maximum Likelihood Estimation
        Implement a quasi-Newton algorithm (outer product of the scores in
        place of the Hessian) to estimate parameters

        Parameters:
        ----------
        X: 2-level MultiIndex Dataframe
            Database to fit the model

        output: string
            Name of the variable to predict

        nb_iter: integer (optional, default 20)
            Maximal number of iterations before the end of the algorithm

        drop_na: boolean (optional, default True)
            Indicate the method to handle missing values in X
            If drop_na = False, fill_value has to be given

        fill_value: string or dict (optional, default None)
            Considered only if drop_na = False
            Possible values:
                - 'mean': missing values of a column are replaced by the mean of that column
                - 'median': missing values of a column are replaced by the median of that column
                - dict: keys must be variables' names and associated values the values used to fill Nan

        verbose: boolean (optional, default False)
            If set to True, allows prints of the algorithm's progress

        Returns the fitted model (self).  After fitting, beta_est and
        beta_se are ordered [mu, sigma, beta...], while beta and
        confidence_interval cover the explanatory variables only.

        Raises ValueError if the output values are not exactly {0, 1}, if
        residual_dist is unknown or if the update step cannot be computed.
        '''
        self.output = output
        X = self.input_data_preparation(X.copy(), drop_na, fill_value)

        labels = list(np.unique(X[self.output]))
        if labels != [0,1]:
            raise ValueError("Labels must be in the unit interval.")

        self.nb_obs = len(X)
        self.variables = [x for x in X.columns if x != self.output]

        # Parameter vector layout: [mu, sigma, beta_1, ..., beta_k]
        beta_init = [0, 1] + [0 for _ in range(len(self.variables))]
        self.beta_est = np.zeros((nb_iter,len(beta_init)))
        self.beta_est[0] = beta_init

        # One group per value of the first index level
        X = X.groupby(level=0)

        self.init_ll = self.__log_likelihood(X, beta_init[2:], 0, 1)

        if verbose:
            print('Initial log-likelihood : '+ str(self.init_ll))
            print('Parameters estimation in progress.')

        current_ll = self.init_ll
        prev_ll = self.init_ll
        j = 1
        # Iterate while the log-likelihood still improves by more than 0.01
        while (j < nb_iter) \
            and (j == 1 or (current_ll - prev_ll > 0.01)):

            list_score_obs, score = self.__score(X, self.beta_est[j-1,2:],
                self.beta_est[j-1,0], self.beta_est[j-1,1])
            hessian = self.__hessian(list_score_obs)

            try:
                # __hessian returns a positive semi-definite matrix (minus
                # the Hessian), so the ascent step is ADDED; subtracting it
                # moves downhill and stops the loop at the starting values
                self.beta_est[j] = self.beta_est[j-1] \
                    + inv(hessian).dot(score)
            except Exception as err:
                # Keep the original failure attached instead of hiding it,
                # and let KeyboardInterrupt / SystemExit propagate untouched
                raise ValueError('Improper classification problem' \
                    + ', should be 2 different labels') from err

            prev_ll = current_ll
            current_ll = self.__log_likelihood(X, self.beta_est[j,2:],
                self.beta_est[j,0], self.beta_est[j,1])
            if verbose:
                print('Iteration %s, log-likelihood : %s'\
                    % (j, current_ll))
            j += 1

        # The retained estimate is the iterate before the last step, i.e.
        # the one whose log-likelihood is prev_ll
        self.beta = self.beta_est[j-2,2:]
        self.mu = self.beta_est[j-2,0]
        self.sigma = self.beta_est[j-2,1]
        self.beta_est = self.beta_est[:j-1,:]

        sqrt_vec = np.vectorize(sqrt)
        list_score_obs, score = self.__score(X, self.beta,
            self.mu, self.sigma)
        hessian = self.__hessian(list_score_obs)
        self.beta_se = sqrt_vec(inv(hessian).diagonal())

        # beta_se starts with the entries for mu and sigma, so the standard
        # error that belongs to beta[i] sits at position i + 2
        quantile = st.norm.ppf(0.975)
        self.confidence_interval = np.array(
                [[self.beta[i] - quantile * self.beta_se[i+2],
                    self.beta[i] + quantile * self.beta_se[i+2]]
                    for i in range(len(self.beta))])

        self.final_ll = prev_ll

        # Stopping before the iteration budget is exhausted counts as
        # convergence
        self.converged = j < nb_iter

        return self


--------------------------------------------------------------------------------
/codes/chp_10.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "9d94387d",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Synthetic Data Generation"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "c4bf2152",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "### Synthetic data from real data"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": null,
 22 |    "id": "e15547c9",
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "from sklearn.datasets import fetch_california_housing\n",
 27 |     "import pandas as pd\n",
 28 |     "import numpy as np\n",
 29 |     "import matplotlib. pyplot as plt\n",
 30 |     "import yfinance as yf\n",
 31 |     "import datetime\n",
 32 |     "import warnings\n",
 33 |     "warnings.filterwarnings('ignore')\n",
 34 |     "plt.rcParams['figure.dpi'] = 300\n",
 35 |     "plt.rcParams['savefig.dpi'] = 300"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "id": "eb4372d3",
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "X, y = fetch_california_housing(return_X_y=True)"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "id": "a89d8166",
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "import numpy as np\n",
 56 |     "california_housing=np.column_stack([X, y])\n",
 57 |     "california_housing_df=pd.DataFrame(california_housing)"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "id": "3be1ef00",
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "from ctgan import CTGANSynthesizer\n",
 68 |     "\n",
 69 |     "ctgan = CTGANSynthesizer(epochs=10)\n",
 70 |     "ctgan.fit(california_housing_df)\n",
 71 |     "synt_sample = ctgan.sample(len(california_housing_df))"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "id": "2d195987",
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "california_housing_df.describe()"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "id": "753f84fc",
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "synt_sample.describe()"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "id": "f3abe89f",
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "from sdv.evaluation import evaluate\n",
102 |     "\n",
103 |     "evaluate(synt_sample, california_housing_df)"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "id": "4ed12311",
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "from table_evaluator import TableEvaluator\n",
114 |     "\n",
115 |     "table_evaluator =  TableEvaluator(california_housing_df, synt_sample)\n",
116 |     "\n",
117 |     "table_evaluator.visual_evaluation()"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "id": "8f6abe40",
123 |    "metadata": {},
124 |    "source": [
125 |     "### Synthetic data from model"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "id": "a17caa4f",
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "from sklearn.datasets import make_regression\n",
136 |     "import matplotlib.pyplot as plt\n",
137 |     "from matplotlib import cm"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "id": "6b6fed38",
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "X, y = make_regression(n_samples=1000, n_features=3, noise=0.2,\n",
148 |     "                       random_state=123)\n",
149 |     "\n",
150 |     "plt.scatter(X[:, 0], X[:, 1], alpha= 0.3, cmap='Greys', c=y)"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "id": "6d82677e",
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "plt.figure(figsize=(18, 18))\n",
161 |     "k = 0\n",
162 |     "\n",
163 |     "for i in range(0, 10):\n",
164 |     "    X, y = make_regression(n_samples=100, n_features=3, noise=i,\n",
165 |     "                           random_state=123) \n",
166 |     "    k+=1\n",
167 |     "    plt.subplot(5, 2, k)\n",
168 |     "    profit_margin_orange = np.asarray([20, 35, 40])\n",
169 |     "    plt.scatter(X[:, 0], X[:, 1], alpha=0.3, cmap=cm.Greys, c=y)\n",
170 |     "    plt.title('Synthetic Data with Different Noises: ' + str(i))\n",
171 |     "plt.show()"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": null,
177 |    "id": "2865b966",
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "from sklearn.datasets import make_classification"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "id": "e6d249dc",
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "plt.figure(figsize=(18, 18))\n",
192 |     "k = 0\n",
193 |     "\n",
194 |     "for i in range(2, 6):\n",
195 |     "    X, y = make_classification(n_samples=100,\n",
196 |     "                               n_features=4,\n",
197 |     "                               n_classes=i,\n",
198 |     "                               n_redundant=0,\n",
199 |     "                               n_informative=4,\n",
200 |     "                               random_state=123)\n",
201 |     "    k+=1\n",
202 |     "    plt.subplot(2, 2, k)\n",
203 |     "    plt.scatter(X[: ,0], X[:, 1], alpha=0.8, cmap='gray', c=y)\n",
204 |     "    plt.title('Synthetic Data with Different Classes: ' + str(i))\n",
205 |     "plt.show()"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "id": "f1b543a0",
211 |    "metadata": {},
212 |    "source": [
213 |     "## Synthetic Data for Unsupervised Learning"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "id": "d7601cd9",
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "from sklearn.datasets import make_blobs"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "id": "715ab91e",
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "X, y = make_blobs(n_samples=100, centers=2, \n",
234 |     "                      n_features=2, random_state=0)"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "id": "459ec07c",
241 |    "metadata": {
242 |     "scrolled": true
243 |    },
244 |    "outputs": [],
245 |    "source": [
246 |     "plt.figure(figsize=(18, 18))\n",
247 |     "k = 0\n",
248 |     "for i in range(2, 6):\n",
249 |     "    X, y = make_blobs(n_samples=100, centers=i,\n",
250 |     "                      n_features=2, random_state=0)\n",
251 |     "    k += 1\n",
252 |     "    plt.subplot(2, 2, k)\n",
253 |     "    my_scatter_plot = plt.scatter(X[:, 0], X[:, 1],\n",
254 |     "                                  alpha=0.3, cmap='gray', c=y)\n",
255 |     "    plt.title('Synthetic Data with Different Clusters: ' + str(i))\n",
256 |     "plt.show()"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "markdown",
261 |    "id": "16258b7a",
262 |    "metadata": {},
263 |    "source": [
264 |     "## HMM"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "id": "1c46f351",
271 |    "metadata": {},
272 |    "outputs": [],
273 |    "source": [
274 |     "ff = pd.read_csv('datasets/FF3.csv', skiprows=4)\n",
275 |     "ff = ff.rename(columns={'Unnamed: 0': 'Date'})\n",
276 |     "ff = ff.iloc[:-1]\n",
277 |     "ff.head()"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": null,
283 |    "id": "e14b804b",
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "ff.info()"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "id": "39442b02",
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "ff['Date'] = pd.to_datetime(ff['Date'])\n",
298 |     "ff.set_index('Date', inplace=True)\n",
299 |     "ff_trim = ff.loc['2000-01-01':]"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": null,
305 |    "id": "51699036",
306 |    "metadata": {},
307 |    "outputs": [],
308 |    "source": [
309 |     "ff_trim.head()"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "id": "1acb838b",
316 |    "metadata": {},
317 |    "outputs": [],
318 |    "source": [
319 |     "ticker = 'SPY'\n",
320 |     "start = datetime.datetime(2000, 1, 3)\n",
321 |     "end = datetime.datetime(2021, 4, 30)\n",
322 |     "SP_ETF = yf.download(ticker, start, end, interval='1d').Close"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "id": "a4080d52",
329 |    "metadata": {},
330 |    "outputs": [],
331 |    "source": [
332 |     "ff_merge = pd.merge(ff_trim, SP_ETF, how='inner', on='Date')"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": null,
338 |    "id": "7740f0bd",
339 |    "metadata": {},
340 |    "outputs": [],
341 |    "source": [
342 |     "SP = pd.DataFrame()\n",
343 |     "SP['Close']= ff_merge['Close']"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": null,
349 |    "id": "2edfb00b",
350 |    "metadata": {
351 |     "scrolled": true
352 |    },
353 |    "outputs": [],
354 |    "source": [
355 |     "SP['return'] = (SP['Close'] / SP['Close'].shift(1))-1"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": null,
361 |    "id": "d7760a2f",
362 |    "metadata": {},
363 |    "outputs": [],
364 |    "source": [
365 |     "from hmmlearn import hmm"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": null,
371 |    "id": "06dfcf72",
372 |    "metadata": {},
373 |    "outputs": [],
374 |    "source": [
375 |     "hmm_model = hmm.GaussianHMM(n_components=3,\n",
376 |     "                            covariance_type=\"full\",\n",
377 |     "                            n_iter=100)"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": null,
383 |    "id": "652b909f",
384 |    "metadata": {},
385 |    "outputs": [],
386 |    "source": [
387 |     "hmm_model.fit(np.array(SP['return'].dropna()).reshape(-1, 1))\n",
388 |     "hmm_predict = hmm_model.predict(np.array(SP['return'].dropna())\n",
389 |     "                                .reshape(-1, 1))\n",
390 |     "df_hmm = pd.DataFrame(hmm_predict)"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": null,
396 |    "id": "cf3f93fe",
397 |    "metadata": {},
398 |    "outputs": [],
399 |    "source": [
400 |     "ret_merged = pd.concat([df_hmm,SP['return'].dropna().reset_index()],\n",
401 |     "                       axis=1)\n",
402 |     "ret_merged.drop('Date',axis=1, inplace=True)\n",
403 |     "ret_merged.rename(columns={0:'states'}, inplace=True)\n",
404 |     "ret_merged.dropna().head()"
405 |    ]
406 |   },
407 |   {
408 |    "cell_type": "code",
409 |    "execution_count": null,
410 |    "id": "b1c16930",
411 |    "metadata": {},
412 |    "outputs": [],
413 |    "source": [
414 |     "ret_merged['states'].value_counts()"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "code",
419 |    "execution_count": null,
420 |    "id": "0338d6d6",
421 |    "metadata": {},
422 |    "outputs": [],
423 |    "source": [
424 |     "state_means = []\n",
425 |     "state_std = []\n",
426 |     "\n",
427 |     "for i in range(3):\n",
428 |     "    state_means.append(ret_merged[ret_merged.states == i]['return']\n",
429 |     "                       .mean())\n",
430 |     "    state_std.append(ret_merged[ret_merged.states == i]['return']\n",
431 |     "                     .std())\n",
432 |     "print('State Means are: {}'.format(state_means))\n",
433 |     "print('State Standard Deviations are: {}'.format(state_std))"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": null,
439 |    "id": "bd141016",
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "print(f'HMM means\\n {hmm_model.means_}')\n",
444 |     "print(f'HMM covariances\\n {hmm_model.covars_}')\n",
445 |     "print(f'HMM transition matrix\\n {hmm_model.transmat_}')\n",
446 |     "print(f'HMM initial probability\\n {hmm_model.startprob_}')"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "code",
451 |    "execution_count": null,
452 |    "id": "4d3adf16",
453 |    "metadata": {},
454 |    "outputs": [],
455 |    "source": [
456 |     "sp_ret = SP['return'].dropna().values.reshape(-1,1)\n",
457 |     "n_components = np.arange(1, 10)\n",
458 |     "clusters = [hmm.GaussianHMM(n_components=n, \n",
459 |     "                            covariance_type=\"full\").fit(sp_ret)\n",
460 |     "           for n in n_components]\n",
461 |     "plt.plot(n_components, [m.score(np.array(SP['return'].dropna())\\\n",
462 |     "                                .reshape(-1,1)) for m in clusters])\n",
463 |     "plt.title('Optimum Number of States')\n",
464 |     "plt.xlabel('n_components')\n",
465 |     "plt.ylabel('Log Likelihood')"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": null,
471 |    "id": "1a6f7bcf",
472 |    "metadata": {},
473 |    "outputs": [],
474 |    "source": [
475 |     "hmm_model = hmm.GaussianHMM(n_components=3, \n",
476 |     "                        covariance_type=\"full\", \n",
477 |     "                        random_state=123).fit(sp_ret)\n",
478 |     "hidden_states = hmm_model.predict(sp_ret)"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": null,
484 |    "id": "8d276d64",
485 |    "metadata": {},
486 |    "outputs": [],
487 |    "source": [
488 |     "from matplotlib.dates import YearLocator, MonthLocator\n",
489 |     "from matplotlib import cm"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "code",
494 |    "execution_count": null,
495 |    "id": "64648060",
496 |    "metadata": {},
497 |    "outputs": [],
498 |    "source": [
499 |     "df_sp_ret = SP['return'].dropna()\n",
500 |     "\n",
501 |     "hmm_model = hmm.GaussianHMM(n_components=3, \n",
502 |     "                            covariance_type=\"full\", \n",
503 |     "                            random_state=123).fit(sp_ret)\n",
504 |     "\n",
505 |     "hidden_states = hmm_model.predict(sp_ret)\n",
506 |     "\n",
507 |     "fig, axs = plt.subplots(hmm_model.n_components, sharex=True,\n",
508 |     "                        sharey=True, figsize=(12, 9))\n",
509 |     "colors = cm.gray(np.linspace(0, 0.7, hmm_model.n_components))\n",
510 |     "\n",
511 |     "for i, (ax, color) in enumerate(zip(axs, colors)):\n",
512 |     "    mask = hidden_states == i\n",
513 |     "    ax.plot_date(df_sp_ret.index.values[mask],\n",
514 |     "                 df_sp_ret.values[mask],\n",
515 |     "                 \".-\", c=color)\n",
516 |     "    ax.set_title(\"Hidden state {}\".format(i + 1), fontsize=16)\n",
517 |     "    ax.xaxis.set_minor_locator(MonthLocator())\n",
518 |     "plt.tight_layout()"
519 |    ]
520 |   },
521 |   {
522 |    "cell_type": "code",
523 |    "execution_count": null,
524 |    "id": "0f3bc462",
525 |    "metadata": {},
526 |    "outputs": [],
527 |    "source": [
528 |     "ret_merged.groupby('states')['return'].mean()"
529 |    ]
530 |   },
531 |   {
532 |    "cell_type": "markdown",
533 |    "id": "315b6180",
534 |    "metadata": {},
535 |    "source": [
536 |     "## Fama-French Model vs. HMM"
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "code",
541 |    "execution_count": null,
542 |    "id": "5f3574be",
543 |    "metadata": {},
544 |    "outputs": [],
545 |    "source": [
546 |     "ff_merge['return'] = ff_merge['Close'].pct_change()\n",
547 |     "ff_merge.dropna(inplace=True)"
548 |    ]
549 |   },
550 |   {
551 |    "cell_type": "code",
552 |    "execution_count": null,
553 |    "id": "d2b35ffe",
554 |    "metadata": {},
555 |    "outputs": [],
556 |    "source": [
557 |     "split = int(len(ff_merge) * 0.9)\n",
558 |     "train_ff= ff_merge.iloc[:split].dropna()\n",
559 |     "test_ff = ff_merge.iloc[split:].dropna()"
560 |    ]
561 |   },
562 |   {
563 |    "cell_type": "code",
564 |    "execution_count": null,
565 |    "id": "b656a45f",
566 |    "metadata": {},
567 |    "outputs": [],
568 |    "source": [
569 |     "hmm_model = hmm.GaussianHMM(n_components=3,\n",
570 |     "                            covariance_type=\"full\",\n",
571 |     "                            n_iter=100, init_params=\" \")"
572 |    ]
573 |   },
574 |   {
575 |    "cell_type": "code",
576 |    "execution_count": null,
577 |    "id": "8cd9f8f5",
578 |    "metadata": {},
579 |    "outputs": [],
580 |    "source": [
581 |     "predictions = []\n",
582 |     "\n",
583 |     "for i in range(len(test_ff)):\n",
584 |     "    hmm_model.fit(train_ff)\n",
585 |     "    adjustment = np.dot(hmm_model.transmat_, hmm_model.means_)\n",
586 |     "    predictions.append(test_ff.iloc[i] + adjustment[0])\n",
587 |     "predictions = pd.DataFrame(predictions)"
588 |    ]
589 |   },
590 |   {
591 |    "cell_type": "code",
592 |    "execution_count": null,
593 |    "id": "3d0df7ea",
594 |    "metadata": {},
595 |    "outputs": [],
596 |    "source": [
597 |     "std_dev = predictions['return'].std()\n",
598 |     "sharpe = predictions['return'].mean() / std_dev\n",
599 |     "print('Sharpe ratio with HMM is {:.4f}'.format(sharpe))"
600 |    ]
601 |   },
602 |   {
603 |    "cell_type": "markdown",
604 |    "id": "82993b8a",
605 |    "metadata": {},
606 |    "source": [
607 |     "## Fama-French Model with OLS"
608 |    ]
609 |   },
610 |   {
611 |    "cell_type": "code",
612 |    "execution_count": null,
613 |    "id": "c195097c",
614 |    "metadata": {},
615 |    "outputs": [],
616 |    "source": [
617 |     "import statsmodels.api as sm"
618 |    ]
619 |   },
620 |   {
621 |    "cell_type": "code",
622 |    "execution_count": null,
623 |    "id": "a9c57902",
624 |    "metadata": {},
625 |    "outputs": [],
626 |    "source": [
627 |     "Y = train_ff['return']\n",
628 |     "X = train_ff[['Mkt-RF', 'SMB', 'HML']]"
629 |    ]
630 |   },
631 |   {
632 |    "cell_type": "code",
633 |    "execution_count": null,
634 |    "id": "6bcec34d",
635 |    "metadata": {},
636 |    "outputs": [],
637 |    "source": [
638 |     "model = sm.OLS(Y, X)\n",
639 |     "ff_ols = model.fit()\n",
640 |     "print(ff_ols.summary())"
641 |    ]
642 |   },
643 |   {
644 |    "cell_type": "code",
645 |    "execution_count": null,
646 |    "id": "dc59afd3",
647 |    "metadata": {},
648 |    "outputs": [],
649 |    "source": [
650 |     "ff_pred = ff_ols.predict(test_ff[[\"Mkt-RF\", \"SMB\", \"HML\"]])\n",
651 |     "ff_pred.head()"
652 |    ]
653 |   },
654 |   {
655 |    "cell_type": "code",
656 |    "execution_count": null,
657 |    "id": "70e91c46",
658 |    "metadata": {},
659 |    "outputs": [],
660 |    "source": [
661 |     "std_dev = ff_pred.std()\n",
662 |     "sharpe = ff_pred.mean() / std_dev\n",
663 |     "print('Sharpe ratio with FF 3 factor model is {:.4f}'.format(sharpe))"
664 |    ]
665 |   },
666 |   {
667 |    "cell_type": "code",
668 |    "execution_count": null,
669 |    "id": "94dd1023",
670 |    "metadata": {},
671 |    "outputs": [],
672 |    "source": [
673 |     "split = int(len(SP['return']) * 0.9)\n",
674 |     "train_ret_SP = SP['return'].iloc[split:].dropna()\n",
675 |     "test_ret_SP = SP['return'].iloc[:split].dropna()"
676 |    ]
677 |   },
678 |   {
679 |    "cell_type": "code",
680 |    "execution_count": null,
681 |    "id": "afc83e38",
682 |    "metadata": {},
683 |    "outputs": [],
684 |    "source": [
685 |     "hmm_model = hmm.GaussianHMM(n_components=3,\n",
686 |     "                            covariance_type=\"full\",\n",
687 |     "                            n_iter=100)\n",
688 |     "hmm_model.fit(np.array(train_ret_SP).reshape(-1, 1))\n",
689 |     "hmm_predict_vol = hmm_model.predict(np.array(test_ret_SP)\n",
690 |     "                                    .reshape(-1, 1))\n",
691 |     "pd.DataFrame(hmm_predict_vol).value_counts()"
692 |    ]
693 |   },
694 |   {
695 |    "cell_type": "markdown",
696 |    "id": "41a110d6",
697 |    "metadata": {},
698 |    "source": [
699 |     "## Synthetic Data Generation and Hidden Markov Model"
700 |    ]
701 |   },
702 |   {
703 |    "cell_type": "code",
704 |    "execution_count": null,
705 |    "id": "24aed2a5",
706 |    "metadata": {},
707 |    "outputs": [],
708 |    "source": [
709 |     "startprob = hmm_model.startprob_\n",
710 |     "transmat = hmm_model.transmat_\n",
711 |     "means = hmm_model.means_ \n",
712 |     "covars = hmm_model.covars_"
713 |    ]
714 |   },
715 |   {
716 |    "cell_type": "code",
717 |    "execution_count": null,
718 |    "id": "81f550c3",
719 |    "metadata": {},
720 |    "outputs": [],
721 |    "source": [
722 |     "syn_hmm = hmm.GaussianHMM(n_components=3, covariance_type=\"full\")"
723 |    ]
724 |   },
725 |   {
726 |    "cell_type": "code",
727 |    "execution_count": null,
728 |    "id": "2b28defb",
729 |    "metadata": {},
730 |    "outputs": [],
731 |    "source": [
732 |     "syn_hmm.startprob_ = startprob\n",
733 |     "syn_hmm.transmat_ = transmat \n",
734 |     "syn_hmm.means_ = means \n",
735 |     "syn_hmm.covars_ = covars"
736 |    ]
737 |   },
738 |   {
739 |    "cell_type": "code",
740 |    "execution_count": null,
741 |    "id": "6a7992d1",
742 |    "metadata": {},
743 |    "outputs": [],
744 |    "source": [
745 |     "syn_data, _ = syn_hmm.sample(n_samples=1000)"
746 |    ]
747 |   },
748 |   {
749 |    "cell_type": "code",
750 |    "execution_count": null,
751 |    "id": "772ca78a",
752 |    "metadata": {},
753 |    "outputs": [],
754 |    "source": [
755 |     "plt.hist(syn_data)\n",
756 |     "plt.title('Histogram of Synthetic Data')\n",
757 |     "plt.show()"
758 |    ]
759 |   },
760 |   {
761 |    "cell_type": "code",
762 |    "execution_count": null,
763 |    "id": "f59b8002",
764 |    "metadata": {},
765 |    "outputs": [],
766 |    "source": [
767 |     "plt.plot(syn_data, \"--\")\n",
768 |     "plt.title('Line Plot of Synthetic Data')\n",
769 |     "plt.show()"
770 |    ]
771 |   }
772 |  ],
773 |  "metadata": {
774 |   "kernelspec": {
775 |    "display_name": "Python 3",
776 |    "language": "python",
777 |    "name": "python3"
778 |   },
779 |   "language_info": {
780 |    "codemirror_mode": {
781 |     "name": "ipython",
782 |     "version": 3
783 |    },
784 |    "file_extension": ".py",
785 |    "mimetype": "text/x-python",
786 |    "name": "python",
787 |    "nbconvert_exporter": "python",
788 |    "pygments_lexer": "ipython3",
789 |    "version": "3.8.8"
790 |   },
791 |   "latex_envs": {
792 |    "LaTeX_envs_menu_present": true,
793 |    "autoclose": false,
794 |    "autocomplete": true,
795 |    "bibliofile": "biblio.bib",
796 |    "cite_by": "apalike",
797 |    "current_citInitial": 1,
798 |    "eqLabelWithNumbers": true,
799 |    "eqNumInitial": 1,
800 |    "hotkeys": {
801 |     "equation": "Ctrl-E",
802 |     "itemize": "Ctrl-I"
803 |    },
804 |    "labels_anchors": false,
805 |    "latex_user_defs": false,
806 |    "report_style_numbering": false,
807 |    "user_envs_cfg": false
808 |   },
809 |   "toc": {
810 |    "base_numbering": 1,
811 |    "nav_menu": {},
812 |    "number_sections": false,
813 |    "sideBar": true,
814 |    "skip_h1_title": false,
815 |    "title_cell": "Table of Contents",
816 |    "title_sidebar": "Contents",
817 |    "toc_cell": false,
818 |    "toc_position": {},
819 |    "toc_section_display": true,
820 |    "toc_window_display": false
821 |   }
822 |  },
823 |  "nbformat": 4,
824 |  "nbformat_minor": 5
825 | }
826 | 


--------------------------------------------------------------------------------
/codes/chp_7.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "c314ee7b",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import pandas as pd\n",
 11 |     "import numpy as np\n",
 12 |     "import matplotlib.pyplot as plt\n",
 13 |     "import warnings\n",
 14 |     "warnings.filterwarnings(\"ignore\")\n",
 15 |     "plt.rcParams['figure.figsize'] = (10, 6)\n",
 16 |     "pd.set_option('use_inf_as_na', True)\n",
 17 |     "plt.rcParams['figure.dpi'] = 300\n",
 18 |     "plt.rcParams['savefig.dpi'] = 300"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": null,
 24 |    "id": "6dbe8daf",
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "liq_data = pd.read_csv('datasets/bid_ask.csv')"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "id": "ec9ef45b",
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "liq_data.head()"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "id": "7c1d7e13",
 45 |    "metadata": {
 46 |     "code_folding": []
 47 |    },
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "rolling_five = []\n",
 51 |     "\n",
 52 |     "for j in liq_data.TICKER.unique():\n",
 53 |     "    for i in range(len(liq_data[liq_data.TICKER == j])):\n",
 54 |     "        rolling_five.append(liq_data[i:i+5].agg({'BIDLO': 'min',\n",
 55 |     "                                                'ASKHI': 'max',\n",
 56 |     "                                                 'VOL': 'sum',\n",
 57 |     "                                                 'SHROUT': 'mean',\n",
 58 |     "                                                 'PRC': 'mean'}))"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "id": "3f642423",
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "rolling_five_df = pd.DataFrame(rolling_five)\n",
 69 |     "rolling_five_df.columns = ['bidlo_min', 'askhi_max', 'vol_sum',\n",
 70 |     "                           'shrout_mean', 'prc_mean']\n",
 71 |     "liq_vol_all = pd.concat([liq_data,rolling_five_df], axis=1)"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "markdown",
 76 |    "id": "b11274b7",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "## Volume-Based Measures"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "id": "fdaec6b2",
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "liq_ratio = []\n",
 90 |     "\n",
 91 |     "for j in liq_vol_all.TICKER.unique():\n",
 92 |     "    for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
 93 |     "        liq_ratio.append((liq_vol_all['PRC'][i+1:i+6] * \n",
 94 |     "                          liq_vol_all['VOL'][i+1:i+6]).sum()/\n",
 95 |     "                         (np.abs(liq_vol_all['PRC'][i+1:i+6].mean() - \n",
 96 |     "                                 liq_vol_all['PRC'][i:i+5].mean())))"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "id": "3cbe3e36",
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "Lhh = []\n",
107 |     "\n",
108 |     "for j in liq_vol_all.TICKER.unique():\n",
109 |     "    for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
110 |     "        Lhh.append((liq_vol_all['PRC'][i:i+5].max() - \n",
111 |     "                    liq_vol_all['PRC'][i:i+5].min()) /  \n",
112 |     "                   liq_vol_all['PRC'][i:i+5].min() /  \n",
113 |     "                   (liq_vol_all['VOL'][i:i+5].sum() / \n",
114 |     "                    liq_vol_all['SHROUT'][i:i+5].mean() * \n",
115 |     "                    liq_vol_all['PRC'][i:i+5].mean()))"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "id": "b825b590",
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "turnover_ratio = []\n",
126 |     "\n",
127 |     "for j in liq_vol_all.TICKER.unique():\n",
128 |     "    for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
129 |     "        turnover_ratio.append((1/liq_vol_all['VOL'].count()) * \n",
130 |     "                              (np.sum(liq_vol_all['VOL'][i:i+1]) / \n",
131 |     "                               np.sum(liq_vol_all['SHROUT'][i:i+1])))"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "id": "5e2edf81",
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "liq_vol_all['liq_ratio'] = pd.DataFrame(liq_ratio)\n",
142 |     "liq_vol_all['Lhh'] = pd.DataFrame(Lhh)\n",
143 |     "liq_vol_all['turnover_ratio'] = pd.DataFrame(turnover_ratio)"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "id": "e135f29f",
149 |    "metadata": {},
150 |    "source": [
151 |     "## Transaction Cost-Based Measures"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "markdown",
156 |    "id": "b4cccab2",
157 |    "metadata": {},
158 |    "source": [
159 |     "### Bid-Ask Spreads"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "id": "d3cc91ef",
166 |    "metadata": {},
167 |    "outputs": [],
168 |    "source": [
169 |     "liq_vol_all['mid_price'] = (liq_vol_all.ASKHI + liq_vol_all.BIDLO) / 2\n",
170 |     "liq_vol_all['percent_quoted_ba'] = (liq_vol_all.ASKHI - \n",
171 |     "                                    liq_vol_all.BIDLO) / \\\n",
172 |     "                                    liq_vol_all.mid_price\n",
173 |     "liq_vol_all['percent_effective_ba'] = 2 * abs((liq_vol_all.PRC - \n",
174 |     "                                               liq_vol_all.mid_price)) / \\\n",
175 |     "                                               liq_vol_all.mid_price"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "id": "192ca49a",
181 |    "metadata": {},
182 |    "source": [
183 |     "### Roll's Spread"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "id": "72e37d34",
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": [
193 |     "liq_vol_all['price_diff'] = liq_vol_all.groupby('TICKER')['PRC']\\\n",
194 |     "                            .apply(lambda x:x.diff())\n",
195 |     "liq_vol_all.dropna(inplace=True)\n",
196 |     "roll = []\n",
197 |     "\n",
198 |     "for j in liq_vol_all.TICKER.unique():\n",
199 |     "     for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
200 |     "        roll_cov = np.cov(liq_vol_all['price_diff'][i:i+5], \n",
201 |     "                          liq_vol_all['price_diff'][i+1:i+6])\n",
202 |     "        if roll_cov[0,1] < 0:\n",
203 |     "            roll.append(2 * np.sqrt(-roll_cov[0, 1]))\n",
204 |     "        else:\n",
205 |     "             roll.append(2 * np.sqrt(np.abs(roll_cov[0, 1])))\n"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "id": "5472c49c",
211 |    "metadata": {},
212 |    "source": [
213 |     "### Corwin and Schultz (2012)"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "id": "6ec507fd",
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "gamma = []\n",
224 |     "\n",
225 |     "for j in liq_vol_all.TICKER.unique():\n",
226 |     "    for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
227 |     "        gamma.append((max(liq_vol_all['ASKHI'].iloc[i+1], \n",
228 |     "                          liq_vol_all['ASKHI'].iloc[i]) - \n",
229 |     "                      min(liq_vol_all['BIDLO'].iloc[i+1], \n",
230 |     "                          liq_vol_all['BIDLO'].iloc[i])) ** 2)\n",
231 |     "        gamma_array = np.array(gamma)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "id": "ab72f09a",
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "beta = []\n",
242 |     "\n",
243 |     "for j in liq_vol_all.TICKER.unique():\n",
244 |     "    for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
245 |     "        beta.append((liq_vol_all['ASKHI'].iloc[i+1] - \n",
246 |     "                     liq_vol_all['BIDLO'].iloc[i+1]) ** 2 + \n",
247 |     "                    (liq_vol_all['ASKHI'].iloc[i] - \n",
248 |     "                     liq_vol_all['BIDLO'].iloc[i]) ** 2)\n",
249 |     "        beta_array = np.array(beta)"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": null,
255 |    "id": "0c6da937",
256 |    "metadata": {},
257 |    "outputs": [],
258 |    "source": [
259 |     "alpha = ((np.sqrt(2 * beta_array) - np.sqrt(beta_array)) / \n",
260 |     "       (3 - (2 * np.sqrt(2)))) - np.sqrt(gamma_array / \n",
261 |     "                                         (3 - (2 * np.sqrt(2))))\n",
262 |     "CS_spread = (2 * (np.exp(alpha) - 1)) / (1 + np.exp(alpha))"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": null,
268 |    "id": "c260377d",
269 |    "metadata": {},
270 |    "outputs": [],
271 |    "source": [
272 |     "liq_vol_all = liq_vol_all.reset_index()\n",
273 |     "liq_vol_all['roll'] = pd.DataFrame(roll)\n",
274 |     "liq_vol_all['CS_spread'] = pd.DataFrame(CS_spread)"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "markdown",
279 |    "id": "989ee3e6",
280 |    "metadata": {},
281 |    "source": [
282 |     "## Price-Based Measures"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": null,
288 |    "id": "62c9209d",
289 |    "metadata": {},
290 |    "outputs": [],
291 |    "source": [
292 |     "dvol = []\n",
293 |     "\n",
294 |     "for j in liq_vol_all.TICKER.unique():\n",
295 |     "    for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
296 |     "        dvol.append((liq_vol_all['PRC'][i:i+5] *\n",
297 |     "                     liq_vol_all['VOL'][i:i+5]).sum())\n",
298 |     "liq_vol_all['dvol'] = pd.DataFrame(dvol)"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": null,
304 |    "id": "35252634",
305 |    "metadata": {},
306 |    "outputs": [],
307 |    "source": [
308 |     "amihud = []\n",
309 |     "\n",
310 |     "for j in liq_vol_all.TICKER.unique():\n",
311 |     "    for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
312 |     "        amihud.append((1 / liq_vol_all['RET'].count()) * \n",
313 |     "                      (np.sum(np.abs(liq_vol_all['RET'][i:i+1])) / \n",
314 |     "                              np.sum(liq_vol_all['dvol'][i:i+1])))"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": null,
320 |    "id": "06803a21",
321 |    "metadata": {},
322 |    "outputs": [],
323 |    "source": [
324 |     "florackis = []\n",
325 |     "\n",
326 |     "for j in liq_vol_all.TICKER.unique():\n",
327 |     "    for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
328 |     "        florackis.append((1 / liq_vol_all['RET'].count()) * \n",
329 |     "                         (np.sum(np.abs(liq_vol_all['RET'][i:i+1]) / \n",
330 |     "                                 liq_vol_all['turnover_ratio'][i:i+1])))"
331 |    ]
332 |   },
333 |   {
334 |    "cell_type": "code",
335 |    "execution_count": null,
336 |    "id": "b4540974",
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "liq_vol_all['vol_diff_pct'] = liq_vol_all.groupby('TICKER')['VOL']\\\n",
341 |     "                              .apply(lambda x: x.diff()).pct_change()\n",
342 |     "liq_vol_all['price_diff_pct'] = liq_vol_all.groupby('TICKER')['PRC']\\\n",
343 |     "                              .apply(lambda x: x.diff()).pct_change()"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": null,
349 |    "id": "b46df604",
350 |    "metadata": {},
351 |    "outputs": [],
352 |    "source": [
353 |     "cet = []\n",
354 |     "\n",
355 |     "for j in liq_vol_all.TICKER.unique():\n",
356 |     "    for i in range(len(liq_vol_all[liq_vol_all.TICKER == j])):\n",
357 |     "        cet.append(np.sum(liq_vol_all['vol_diff_pct'][i:i+1])/\n",
358 |     "                   np.sum(liq_vol_all['price_diff_pct'][i:i+1]))"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": null,
364 |    "id": "57cbfa54",
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "liq_vol_all['amihud'] = pd.DataFrame(amihud)\n",
369 |     "liq_vol_all['florackis'] = pd.DataFrame(florackis)\n",
370 |     "liq_vol_all['cet'] = pd.DataFrame(cet)"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "markdown",
375 |    "id": "b33015fc",
376 |    "metadata": {},
377 |    "source": [
378 |     "## Market Impact Measures"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": null,
384 |    "id": "1eff896d",
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "import statsmodels.api as sm"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": null,
394 |    "id": "4a4255c6",
395 |    "metadata": {},
396 |    "outputs": [],
397 |    "source": [
398 |     "liq_vol_all['VOL_pct_change'] = liq_vol_all.groupby('TICKER')['VOL']\\\n",
399 |     "                                .apply(lambda x: x.pct_change())\n",
400 |     "liq_vol_all.dropna(subset=['VOL_pct_change'], inplace=True)\n",
401 |     "liq_vol_all = liq_vol_all.reset_index()"
402 |    ]
403 |   },
404 |   {
405 |    "cell_type": "code",
406 |    "execution_count": null,
407 |    "id": "61843be0",
408 |    "metadata": {},
409 |    "outputs": [],
410 |    "source": [
411 |     "unsys_resid = []\n",
412 |     "\n",
413 |     "for i in liq_vol_all.TICKER.unique():\n",
414 |     "    X1 = liq_vol_all[liq_vol_all['TICKER'] == i]['vwretx']\n",
415 |     "    y = liq_vol_all[liq_vol_all['TICKER'] == i]['RET']\n",
416 |     "    ols = sm.OLS(y, X1).fit()\n",
417 |     "    unsys_resid.append(ols.resid)"
418 |    ]
419 |   },
420 |   {
421 |    "cell_type": "code",
422 |    "execution_count": null,
423 |    "id": "c82d8f75",
424 |    "metadata": {},
425 |    "outputs": [],
426 |    "source": [
427 |     "market_impact = {}\n",
428 |     "\n",
429 |     "for i, j in zip(liq_vol_all.TICKER.unique(), \n",
430 |     "                range(len(liq_vol_all['TICKER'].unique()))):\n",
431 |     "    X2 = liq_vol_all[liq_vol_all['TICKER'] == i]['VOL_pct_change']\n",
432 |     "    ols = sm.OLS(unsys_resid[j] ** 2, X2).fit()\n",
433 |     "    print('***' * 30)\n",
434 |     "    print(f'OLS Result for {i}')\n",
435 |     "    print(ols.summary())\n",
436 |     "    market_impact[j] = ols.resid"
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": null,
442 |    "id": "6faa0827",
443 |    "metadata": {},
444 |    "outputs": [],
445 |    "source": [
446 |     "append1 = pd.concat([market_impact[0], market_impact[1]])\n",
447 |     "liq_vol_all['market_impact'] = pd.concat([append1, market_impact[2]])"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": null,
453 |    "id": "f612c974",
454 |    "metadata": {},
455 |    "outputs": [],
456 |    "source": [
457 |     "cols = ['vol_diff_pct', 'price_diff_pct', 'price_diff',\n",
458 |     "        'VOL_pct_change', 'dvol', 'mid_price']\n",
459 |     "liq_measures_all = liq_vol_all.drop(liq_vol_all[cols], axis=1)\\\n",
460 |     "                   .iloc[:, -11:]\n",
461 |     "liq_measures_all.dropna(inplace=True)\n",
462 |     "liq_measures_all.describe().T"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "markdown",
467 |    "id": "d6e951cc",
468 |    "metadata": {},
469 |    "source": [
470 |     "## GMM"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "code",
475 |    "execution_count": null,
476 |    "id": "20dd3c79",
477 |    "metadata": {},
478 |    "outputs": [],
479 |    "source": [
480 |     "from sklearn.mixture import GaussianMixture\n",
481 |     "from sklearn.preprocessing import StandardScaler"
482 |    ]
483 |   },
484 |   {
485 |    "cell_type": "code",
486 |    "execution_count": null,
487 |    "id": "41f5b22e",
488 |    "metadata": {},
489 |    "outputs": [],
490 |    "source": [
491 |     "liq_measures_all2 = liq_measures_all.dropna()\n",
492 |     "scaled_liq = StandardScaler().fit_transform(liq_measures_all2)"
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "code",
497 |    "execution_count": null,
498 |    "id": "52393f11",
499 |    "metadata": {},
500 |    "outputs": [],
501 |    "source": [
502 |     "kwargs = dict(alpha=0.5, bins=50,  stacked=True)\n",
503 |     "plt.hist(liq_measures_all.loc[:, 'percent_quoted_ba'],\n",
504 |     "         **kwargs, label='TC-based')\n",
505 |     "plt.hist(liq_measures_all.loc[:, 'turnover_ratio'],\n",
506 |     "         **kwargs, label='Volume-based')\n",
507 |     "plt.hist(liq_measures_all.loc[:, 'market_impact'],\n",
508 |     "         **kwargs, label='Market-based')\n",
509 |     "plt.title('Multimodality of the Liquidity Measures')\n",
510 |     "plt.legend()\n",
511 |     "plt.show()"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "code",
516 |    "execution_count": null,
517 |    "id": "65ba2e29",
518 |    "metadata": {},
519 |    "outputs": [],
520 |    "source": [
521 |     "n_components = np.arange(1, 10)\n",
522 |     "clusters = [GaussianMixture(n, covariance_type='spherical',\n",
523 |     "                            random_state=0).fit(scaled_liq)\n",
524 |     "          for n in n_components]\n",
525 |     "plt.plot(n_components, [m.bic(scaled_liq) for m in clusters])\n",
526 |     "plt.title('Optimum Number of Components')\n",
527 |     "plt.xlabel('n_components')\n",
528 |     "plt.ylabel('BIC values')\n",
529 |     "plt.show()"
530 |    ]
531 |   },
532 |   {
533 |    "cell_type": "code",
534 |    "execution_count": null,
535 |    "id": "1ee26c41",
536 |    "metadata": {},
537 |    "outputs": [],
538 |    "source": [
539 |     "def cluster_state(data, nstates):\n",
540 |     "    gmm = GaussianMixture(n_components=nstates,\n",
541 |     "                          covariance_type='spherical',\n",
542 |     "                          init_params='kmeans')\n",
543 |     "    gmm_fit = gmm.fit(data)\n",
544 |     "    labels = gmm_fit.predict(data)\n",
545 |     "    state_probs = gmm.predict_proba(data)\n",
546 |     "    state_probs_df = pd.DataFrame(state_probs, \n",
547 |     "                                  columns=['state-1','state-2','state-3'])\n",
548 |     "    state_prob_means = [state_probs_df.iloc[:, i].mean() \n",
549 |     "                        for i in range(len(state_probs_df.columns))]\n",
550 |     "    if np.max(state_prob_means) == state_prob_means[0]:\n",
551 |     "        print('State-1 is likely to occur with a probability of {:.4f}'\n",
552 |     "              .format(state_prob_means[0]))\n",
553 |     "    elif np.max(state_prob_means) == state_prob_means[1]:\n",
554 |     "        print('State-2 is likely to occur with a probability of {:.4f}'\n",
555 |     "              .format(state_prob_means[1]))\n",
556 |     "    else:\n",
557 |     "        print('State-3 is likely to occur with a probability of {:.4f}'\n",
558 |     "              .format(state_prob_means[2]))\n",
559 |     "    return state_probs"
560 |    ]
561 |   },
562 |   {
563 |    "cell_type": "code",
564 |    "execution_count": null,
565 |    "id": "41148b88",
566 |    "metadata": {},
567 |    "outputs": [],
568 |    "source": [
569 |     "state_probs = cluster_state(scaled_liq, 3)\n",
570 |     "print(f'State probabilities are {state_probs.mean(axis=0)}')"
571 |    ]
572 |   },
573 |   {
574 |    "cell_type": "code",
575 |    "execution_count": null,
576 |    "id": "90da06dc",
577 |    "metadata": {},
578 |    "outputs": [],
579 |    "source": [
580 |     "from sklearn.decomposition import PCA"
581 |    ]
582 |   },
583 |   {
584 |    "cell_type": "code",
585 |    "execution_count": null,
586 |    "id": "b3740fbf",
587 |    "metadata": {},
588 |    "outputs": [],
589 |    "source": [
590 |     "pca = PCA(n_components=11)\n",
591 |     "components = pca.fit_transform(scaled_liq)\n",
592 |     "plt.plot(pca.explained_variance_ratio_)\n",
593 |     "plt.title('Scree Plot')\n",
594 |     "plt.xlabel('Number of Components')\n",
595 |     "plt.ylabel('% of Explained Variance')\n",
596 |     "plt.show()"
597 |    ]
598 |   },
599 |   {
600 |    "cell_type": "code",
601 |    "execution_count": null,
602 |    "id": "d8588a7a",
603 |    "metadata": {},
604 |    "outputs": [],
605 |    "source": [
606 |     "def gmm_pca(data, nstate):\n",
607 |     "    pca = PCA(n_components=3)\n",
608 |     "    components = pca.fit_transform(data)\n",
609 |     "    mxtd = GaussianMixture(n_components=nstate,\n",
610 |     "                           covariance_type='spherical')\n",
611 |     "    gmm = mxtd.fit(components)\n",
612 |     "    labels = gmm.predict(components)\n",
613 |     "    state_probs = gmm.predict_proba(components)\n",
614 |     "    return state_probs,pca"
615 |    ]
616 |   },
617 |   {
618 |    "cell_type": "code",
619 |    "execution_count": null,
620 |    "id": "ef610512",
621 |    "metadata": {},
622 |    "outputs": [],
623 |    "source": [
624 |     "state_probs, pca = gmm_pca(scaled_liq, 3)\n",
625 |     "print(f'State probabilities are {state_probs.mean(axis=0)}')"
626 |    ]
627 |   },
628 |   {
629 |    "cell_type": "code",
630 |    "execution_count": null,
631 |    "id": "53e4a0f0",
632 |    "metadata": {},
633 |    "outputs": [],
634 |    "source": [
635 |     "def wpc():\n",
636 |     "    state_probs_df = pd.DataFrame(state_probs,\n",
637 |     "                                  columns=['state-1', 'state-2',\n",
638 |     "                                           'state-3'])\n",
639 |     "    state_prob_means = [state_probs_df.iloc[:, i].mean() \n",
640 |     "                        for i in range(len(state_probs_df.columns))]\n",
641 |     "    if np.max(state_prob_means) == state_prob_means[0]:\n",
642 |     "        print('State-1 is likely to occur with a probability of {:.4f}'\n",
643 |     "              .format(state_prob_means[0]))\n",
644 |     "    elif np.max(state_prob_means) == state_prob_means[1]:\n",
645 |     "        print('State-2 is likely to occur with a probability of {:.4f}'\n",
646 |     "              .format(state_prob_means[1]))\n",
647 |     "    else:\n",
648 |     "        print('State-3 is likely to occur with a probability of {:.4f}'\n",
649 |     "              .format(state_prob_means[2]))\n",
650 |     "wpc()"
651 |    ]
652 |   },
653 |   {
654 |    "cell_type": "code",
655 |    "execution_count": null,
656 |    "id": "2f538d4f",
657 |    "metadata": {},
658 |    "outputs": [],
659 |    "source": [
660 |     "loadings = pca.components_.T * np.sqrt(pca.explained_variance_)\n",
661 |     "loading_matrix = pd.DataFrame(loadings, \n",
662 |     "                              columns=['PC1', 'PC2', 'PC3'],\n",
663 |     "                              index=liq_measures_all.columns)\n",
664 |     "loading_matrix"
665 |    ]
666 |   },
667 |   {
668 |    "cell_type": "markdown",
669 |    "id": "213db006",
670 |    "metadata": {},
671 |    "source": [
672 |     "## GMCM"
673 |    ]
674 |   },
675 |   {
676 |    "cell_type": "code",
677 |    "execution_count": null,
678 |    "id": "4f3d5149",
679 |    "metadata": {},
680 |    "outputs": [],
681 |    "source": [
682 |     "from copulae.mixtures.gmc.gmc import GaussianMixtureCopula"
683 |    ]
684 |   },
685 |   {
686 |    "cell_type": "code",
687 |    "execution_count": null,
688 |    "id": "b702ba5c",
689 |    "metadata": {},
690 |    "outputs": [],
691 |    "source": [
692 |     "_, dim = scaled_liq.shape\n",
693 |     "gmcm = GaussianMixtureCopula(n_clusters=3, ndim=dim)"
694 |    ]
695 |   },
696 |   {
697 |    "cell_type": "code",
698 |    "execution_count": null,
699 |    "id": "8e26e609",
700 |    "metadata": {},
701 |    "outputs": [],
702 |    "source": [
703 |     "gmcm_fit = gmcm.fit(scaled_liq, method='kmeans',\n",
704 |     "                    criteria='GMCM', eps=0.0001)\n",
705 |     "state_prob = gmcm_fit.params.prob\n",
706 |     "print(f'The state {np.argmax(state_prob) + 1} is likely to occur')\n",
707 |     "print(f'State probabilities based on GMCM are {state_prob}')"
708 |    ]
709 |   }
710 |  ],
711 |  "metadata": {
712 |   "celltoolbar": "Raw Cell Format",
713 |   "kernelspec": {
714 |    "display_name": "Python 3",
715 |    "language": "python",
716 |    "name": "python3"
717 |   },
718 |   "language_info": {
719 |    "codemirror_mode": {
720 |     "name": "ipython",
721 |     "version": 3
722 |    },
723 |    "file_extension": ".py",
724 |    "mimetype": "text/x-python",
725 |    "name": "python",
726 |    "nbconvert_exporter": "python",
727 |    "pygments_lexer": "ipython3",
728 |    "version": "3.8.8"
729 |   },
730 |   "latex_envs": {
731 |    "LaTeX_envs_menu_present": true,
732 |    "autoclose": false,
733 |    "autocomplete": true,
734 |    "bibliofile": "biblio.bib",
735 |    "cite_by": "apalike",
736 |    "current_citInitial": 1,
737 |    "eqLabelWithNumbers": true,
738 |    "eqNumInitial": 1,
739 |    "hotkeys": {
740 |     "equation": "Ctrl-E",
741 |     "itemize": "Ctrl-I"
742 |    },
743 |    "labels_anchors": false,
744 |    "latex_user_defs": false,
745 |    "report_style_numbering": false,
746 |    "user_envs_cfg": false
747 |   },
748 |   "toc": {
749 |    "base_numbering": 1,
750 |    "nav_menu": {},
751 |    "number_sections": false,
752 |    "sideBar": true,
753 |    "skip_h1_title": false,
754 |    "title_cell": "Table of Contents",
755 |    "title_sidebar": "Contents",
756 |    "toc_cell": false,
757 |    "toc_position": {},
758 |    "toc_section_display": true,
759 |    "toc_window_display": false
760 |   }
761 |  },
762 |  "nbformat": 4,
763 |  "nbformat_minor": 5
764 | }
765 | 


--------------------------------------------------------------------------------
/codes/chp_2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Oil Price Graph"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import quandl\n",
 17 |     "import matplotlib.pyplot as plt\n",
 18 |     "import warnings\n",
 19 |     "warnings.filterwarnings('ignore')\n",
 20 |     "plt.style.use('seaborn')\n",
 21 |     "plt.rcParams['figure.dpi'] = 300\n",
 22 |     "plt.rcParams['savefig.dpi'] = 300"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "oil = quandl.get(\"NSE/OIL\", authtoken=\"YOUR_QUANDL_API_TOKEN\",  # insert your API token\n",
 32 |     "                 start_date=\"1980-01-01\",\n",
 33 |     "                 end_date=\"2020-01-01\")"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "plt.figure(figsize=(10, 6))\n",
 43 |     "plt.plot(oil.Close)\n",
 44 |     "plt.ylabel('$')\n",
 45 |     "plt.xlabel('Date')\n",
 46 |     "plt.show()"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "markdown",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "## Trend"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "import yfinance as yf\n",
 63 |     "import numpy as np\n",
 64 |     "import pandas as pd\n",
 65 |     "import datetime\n",
 66 |     "import statsmodels.api as sm\n",
 67 |     "from statsmodels.tsa.stattools import adfuller\n",
 68 |     "from statsmodels.tsa.seasonal import seasonal_decompose"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "ticker = '^GSPC'\n",
 78 |     "start = datetime.datetime(2015, 1, 1)\n",
 79 |     "end = datetime.datetime(2021, 1, 1)\n",
 80 |     "SP_prices = yf.download(ticker, start=start, end=end, interval='1mo')\\\n",
 81 |     "            .Close"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "seasonal_decompose(SP_prices, period=12).plot()\n",
 91 |     "plt.show()"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "plt.figure(figsize=(10, 6))\n",
101 |     "plt.plot(SP_prices)\n",
102 |     "plt.title('S&P-500 Prices')\n",
103 |     "plt.ylabel('$')\n",
104 |     "plt.xlabel('Date')\n",
105 |     "plt.show()"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "metadata": {},
112 |    "outputs": [],
113 |    "source": [
114 |     "sm.graphics.tsa.plot_acf(SP_prices, lags=30)\n",
115 |     "plt.xlabel('Number of Lags')\n",
116 |     "plt.show()"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "sm.graphics.tsa.plot_pacf(SP_prices, lags=30)\n",
126 |     "plt.xlabel('Number of Lags')\n",
127 |     "plt.show()"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "## Seasonality"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "from fredapi import Fred\n",
144 |     "import statsmodels.api as sm"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "fred = Fred(api_key='YOUR_FRED_API_KEY')  # insert your API key"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "energy = fred.get_series(\"CAPUTLG2211A2S\", \n",
163 |     "                         observation_start=\"2010-01-01\", \n",
164 |     "                         observation_end=\"2020-12-31\")\n",
165 |     "energy.head(12)"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "plt.plot(energy)\n",
175 |     "plt.title('Energy Capacity Utilization')\n",
176 |     "plt.ylabel('$')\n",
177 |     "plt.xlabel('Date')\n",
178 |     "plt.show()"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": null,
184 |    "metadata": {},
185 |    "outputs": [],
186 |    "source": [
187 |     "sm.graphics.tsa.plot_acf(energy, lags=30)\n",
188 |     "plt.xlabel('Number of Lags')\n",
189 |     "plt.show()"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "markdown",
194 |    "metadata": {},
195 |    "source": [
196 |     "## Stationarity"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": null,
202 |    "metadata": {},
203 |    "outputs": [],
204 |    "source": [
205 |     "stat_test = adfuller(SP_prices)[0:2]\n",
206 |     "print(\"The test statistic and p-value of ADF test are {}\"\n",
207 |     "      .format(stat_test))"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": [
216 |     "diff_SP_price = SP_prices.diff()"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {
223 |     "scrolled": true
224 |    },
225 |    "outputs": [],
226 |    "source": [
227 |     "plt.figure(figsize=(10, 6))\n",
228 |     "plt.plot(diff_SP_price)\n",
229 |     "plt.title('Differenced S&P-500 Price')\n",
230 |     "plt.ylabel('$')\n",
231 |     "plt.xlabel('Date')\n",
232 |     "plt.show()"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": null,
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "sm.graphics.tsa.plot_acf(diff_SP_price.dropna(),lags=30)\n",
242 |     "plt.xlabel('Number of Lags')\n",
243 |     "plt.show()"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": [
252 |     "stat_test2 = adfuller(diff_SP_price.dropna())[0:2]\n",
253 |     "print(\"The test statistic and p-value of ADF test after differencing are {}\"\\\n",
254 |     "      .format(stat_test2))"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "metadata": {},
261 |    "outputs": [],
262 |    "source": [
263 |     "seasonal_index = energy.resample('Q').mean()"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": null,
269 |    "metadata": {},
270 |    "outputs": [],
271 |    "source": [
272 |     "dates = energy.index.year.unique()\n",
273 |     "deseasonalized = []\n",
274 |     "for i in dates:\n",
275 |     "    for j in range(1, 13):\n",
276 |     "        deseasonalized.append((energy[str(i)][energy[str(i)]\\\n",
277 |     "                                              .index.month==j]))\n",
278 |     "concat_deseasonalized = np.concatenate(deseasonalized)"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": null,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "# Scale each 3-observation block by the matching quarterly mean.\n",
288 |     "deseason_energy = []\n",
289 |     "for i, s in zip(range(0, len(energy), 3), range(len(seasonal_index))):\n",
290 |     "    deseason_energy.append(concat_deseasonalized[i:i+3] /\n",
291 |     "                           seasonal_index.iloc[s])\n",
292 |     "deseason_energy = pd.DataFrame(np.concatenate(deseason_energy),\n",
293 |     "                               index=energy.index,\n",
294 |     "                               columns=['Deseasonalized Energy'])\n",
295 |     "deseason_energy.head()"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "code",
300 |    "execution_count": null,
301 |    "metadata": {},
302 |    "outputs": [],
303 |    "source": [
304 |     "sm.graphics.tsa.plot_acf(deseason_energy, lags=10)\n",
305 |     "plt.xlabel('Number of Lags')\n",
306 |     "plt.show()"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": null,
312 |    "metadata": {},
313 |    "outputs": [],
314 |    "source": [
315 |     "sm.graphics.tsa.plot_pacf(deseason_energy, lags=10)\n",
316 |     "plt.xlabel('Number of Lags')\n",
317 |     "plt.show()"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "markdown",
322 |    "metadata": {},
323 |    "source": [
324 |     "## White Noise"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": null,
330 |    "metadata": {},
331 |    "outputs": [],
332 |    "source": [
333 |     "mu = 0\n",
334 |     "std = 1 \n",
335 |     "WN = np.random.normal(mu, std, 1000)\n",
336 |     "\n",
337 |     "plt.plot(WN)\n",
338 |     "plt.xlabel('Number of Simulations')\n",
339 |     "plt.show()"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "markdown",
344 |    "metadata": {},
345 |    "source": [
346 |     "## Moving Average"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": null,
352 |    "metadata": {},
353 |    "outputs": [],
354 |    "source": [
355 |     "ticker = ['AAPL', 'MSFT']\n",
356 |     "start = datetime.datetime(2019, 1, 1)\n",
357 |     "end = datetime.datetime(2021, 1, 1)\n",
358 |     "stock_prices = yf.download(ticker, start, end, interval='1d')\\\n",
359 |     "               .Close"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": null,
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "stock_prices = stock_prices.dropna()"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": null,
374 |    "metadata": {},
375 |    "outputs": [],
376 |    "source": [
377 |     "for i in ticker:\n",
378 |     "    stat_test = adfuller(stock_prices[i])[0:2]\n",
379 |     "    print(\"The ADF test statistic and p-value of {} are {}\"\\\n",
380 |     "          .format(i, stat_test))"
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": null,
386 |    "metadata": {},
387 |    "outputs": [],
388 |    "source": [
389 |     "diff_stock_prices = stock_prices.diff().dropna()"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "code",
394 |    "execution_count": null,
395 |    "metadata": {},
396 |    "outputs": [],
397 |    "source": [
398 |     "split = int(len(diff_stock_prices['AAPL'].values) * 0.95)\n",
399 |     "diff_train_aapl = diff_stock_prices['AAPL'].iloc[:split]\n",
400 |     "diff_test_aapl = diff_stock_prices['AAPL'].iloc[split:]\n",
401 |     "diff_train_msft = diff_stock_prices['MSFT'].iloc[:split]\n",
402 |     "diff_test_msft = diff_stock_prices['MSFT'].iloc[split:]"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": null,
408 |    "metadata": {},
409 |    "outputs": [],
410 |    "source": [
411 |     "diff_train_aapl.to_csv('diff_train_aapl.csv')\n",
412 |     "diff_test_aapl.to_csv('diff_test_aapl.csv')\n",
413 |     "diff_train_msft.to_csv('diff_train_msft.csv')\n",
414 |     "diff_test_msft.to_csv('diff_test_msft.csv')"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "code",
419 |    "execution_count": null,
420 |    "metadata": {},
421 |    "outputs": [],
422 |    "source": [
423 |     "fig, ax = plt.subplots(2, 1, figsize=(10, 6))\n",
424 |     "plt.tight_layout() \n",
425 |     "sm.graphics.tsa.plot_acf(diff_train_aapl,lags=30,\n",
426 |     "                         ax=ax[0], title='ACF - Apple')\n",
427 |     "sm.graphics.tsa.plot_acf(diff_train_msft,lags=30,\n",
428 |     "                         ax=ax[1], title='ACF - Microsoft')\n",
429 |     "plt.show()"
430 |    ]
431 |   },
432 |   {
433 |    "cell_type": "code",
434 |    "execution_count": null,
435 |    "metadata": {
436 |     "scrolled": true
437 |    },
438 |    "outputs": [],
439 |    "source": [
440 |     "short_moving_average_appl = diff_train_aapl.rolling(window=9).mean()\n",
441 |     "long_moving_average_appl = diff_train_aapl.rolling(window=22).mean()"
442 |    ]
443 |   },
444 |   {
445 |    "cell_type": "code",
446 |    "execution_count": null,
447 |    "metadata": {
448 |     "scrolled": true
449 |    },
450 |    "outputs": [],
451 |    "source": [
452 |     "fig, ax = plt.subplots(figsize=(10, 6))\n",
453 |     "ax.plot(diff_train_aapl.loc[start:end].index, \n",
454 |     "        diff_train_aapl.loc[start:end],\n",
455 |     "        label='Stock Price', linestyle='--')\n",
456 |     "ax.plot(short_moving_average_appl.loc[start:end].index, \n",
457 |     "        short_moving_average_appl.loc[start:end],\n",
458 |     "        label = 'Short MA', linestyle='solid')\n",
459 |     "ax.plot(long_moving_average_appl.loc[start:end].index, \n",
460 |     "        long_moving_average_appl.loc[start:end],\n",
461 |     "        label = 'Long MA', linestyle='solid')\n",
462 |     "ax.legend(loc='best')\n",
463 |     "ax.set_ylabel('Differenced Price')\n",
464 |     "ax.set_title('Stock Prediction-Apple')\n",
465 |     "plt.show()"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": null,
471 |    "metadata": {},
472 |    "outputs": [],
473 |    "source": [
474 |     "short_moving_average_msft = diff_train_msft.rolling(window=2).mean()\n",
475 |     "long_moving_average_msft = diff_train_msft.rolling(window=22).mean()"
476 |    ]
477 |   },
478 |   {
479 |    "cell_type": "code",
480 |    "execution_count": null,
481 |    "metadata": {},
482 |    "outputs": [],
483 |    "source": [
484 |     "fig, ax = plt.subplots(figsize=(10, 6))\n",
485 |     "ax.plot(diff_train_msft.loc[start:end].index,\n",
486 |     "        diff_train_msft.loc[start:end],\n",
487 |     "        label='Stock Price', linestyle='--')\n",
488 |     "ax.plot(short_moving_average_msft.loc[start:end].index,\n",
489 |     "        short_moving_average_msft.loc[start:end],\n",
490 |     "        label = 'Short MA', linestyle='solid')\n",
491 |     "ax.plot(long_moving_average_msft.loc[start:end].index,\n",
492 |     "        long_moving_average_msft.loc[start:end],\n",
493 |     "        label = 'Long MA', linestyle='-.')\n",
494 |     "ax.legend(loc='best')\n",
495 |     "ax.set_ylabel('Differenced Price')\n",
496 |     "ax.set_xlabel('Date')\n",
497 |     "ax.set_title('Stock Prediction-Microsoft')\n",
498 |     "plt.show()"
499 |    ]
500 |   },
501 |   {
502 |    "cell_type": "markdown",
503 |    "metadata": {},
504 |    "source": [
505 |     "## Autoregressive Model"
506 |    ]
507 |   },
508 |   {
509 |    "cell_type": "code",
510 |    "execution_count": null,
511 |    "metadata": {},
512 |    "outputs": [],
513 |    "source": [
514 |     "sm.graphics.tsa.plot_pacf(diff_train_aapl, lags=30)\n",
515 |     "plt.title('PACF of Apple')\n",
516 |     "plt.xlabel('Number of Lags')\n",
517 |     "plt.show()"
518 |    ]
519 |   },
520 |   {
521 |    "cell_type": "code",
522 |    "execution_count": null,
523 |    "metadata": {},
524 |    "outputs": [],
525 |    "source": [
526 |     "sm.graphics.tsa.plot_pacf(diff_train_msft, lags=30)\n",
527 |     "plt.title('PACF of Microsoft')\n",
528 |     "plt.xlabel('Number of Lags')\n",
529 |     "plt.show()"
530 |    ]
531 |   },
532 |   {
533 |    "cell_type": "code",
534 |    "execution_count": null,
535 |    "metadata": {},
536 |    "outputs": [],
537 |    "source": [
538 |     "from statsmodels.tsa.ar_model import AutoReg\n",
539 |     "import warnings\n",
540 |     "warnings.filterwarnings('ignore')"
541 |    ]
542 |   },
543 |   {
544 |    "cell_type": "code",
545 |    "execution_count": null,
546 |    "metadata": {},
547 |    "outputs": [],
548 |    "source": [
549 |     "ar_aapl = AutoReg(diff_train_aapl.values, lags=29)\n",
550 |     "ar_fitted_aapl = ar_aapl.fit()"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "code",
555 |    "execution_count": null,
556 |    "metadata": {},
557 |    "outputs": [],
558 |    "source": [
559 |     "ar_predictions_aapl = ar_fitted_aapl.predict(start=len(diff_train_aapl), \n",
560 |     "                                             end=len(diff_train_aapl)\\\n",
561 |     "                                             + len(diff_test_aapl) - 1, \n",
562 |     "                                             dynamic=False)"
563 |    ]
564 |   },
565 |   {
566 |    "cell_type": "code",
567 |    "execution_count": null,
568 |    "metadata": {},
569 |    "outputs": [],
570 |    "source": [
571 |     "for predicted, actual in zip(ar_predictions_aapl, diff_test_aapl):\n",
572 |     "    print('==' * 25)\n",
573 |     "    print('predicted values:{:.4f} & actual values:{:.4f}'\\\n",
574 |     "          .format(predicted, actual))"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "code",
579 |    "execution_count": null,
580 |    "metadata": {},
581 |    "outputs": [],
582 |    "source": [
583 |     "ar_predictions_aapl = pd.DataFrame(ar_predictions_aapl)\n",
584 |     "ar_predictions_aapl.index = diff_test_aapl.index"
585 |    ]
586 |   },
587 |   {
588 |    "cell_type": "code",
589 |    "execution_count": null,
590 |    "metadata": {},
591 |    "outputs": [],
592 |    "source": [
593 |     "ar_msft = AutoReg(diff_train_msft.values, lags=26)\n",
594 |     "ar_fitted_msft = ar_msft.fit()"
595 |    ]
596 |   },
597 |   {
598 |    "cell_type": "code",
599 |    "execution_count": null,
600 |    "metadata": {},
601 |    "outputs": [],
602 |    "source": [
603 |     "ar_predictions_msft = ar_fitted_msft.predict(start=len(diff_train_msft), \n",
604 |     "                                             end=len(diff_train_msft)\\\n",
605 |     "                                             +len(diff_test_msft) - 1,\n",
606 |     "                                             dynamic=False)"
607 |    ]
608 |   },
609 |   {
610 |    "cell_type": "code",
611 |    "execution_count": null,
612 |    "metadata": {},
613 |    "outputs": [],
614 |    "source": [
615 |     "ar_predictions_msft = pd.DataFrame(ar_predictions_msft)\n",
616 |     "ar_predictions_msft.index = diff_test_msft.index"
617 |    ]
618 |   },
619 |   {
620 |    "cell_type": "code",
621 |    "execution_count": null,
622 |    "metadata": {},
623 |    "outputs": [],
624 |    "source": [
625 |     "fig, ax = plt.subplots(2,1, figsize=(18, 15))\n",
626 |     " \n",
627 |     "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n",
628 |     "ax[0].plot(ar_predictions_aapl, linestyle='solid', label=\"Prediction\")\n",
629 |     "ax[0].set_title('Predicted Stock Price-Apple')\n",
630 |     "ax[0].legend(loc='best')\n",
631 |     "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n",
632 |     "ax[1].plot(ar_predictions_msft, linestyle='solid', label=\"Prediction\")\n",
633 |     "ax[1].set_title('Predicted Stock Price-Microsoft')\n",
634 |     "ax[1].legend(loc='best')\n",
635 |     "for ax in ax.flat:\n",
636 |     "    ax.set(xlabel='Date', ylabel='Differenced Price')\n",
637 |     "plt.show()"
638 |    ]
639 |   },
640 |   {
641 |    "cell_type": "markdown",
642 |    "metadata": {},
643 |    "source": [
644 |     "## ARIMA Model"
645 |    ]
646 |   },
647 |   {
648 |    "cell_type": "code",
649 |    "execution_count": null,
650 |    "metadata": {},
651 |    "outputs": [],
652 |    "source": [
653 |     "from statsmodels.tsa.arima_model import ARIMA"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "code",
658 |    "execution_count": null,
659 |    "metadata": {},
660 |    "outputs": [],
661 |    "source": [
662 |     "split = int(len(stock_prices['AAPL'].values) * 0.95)\n",
663 |     "train_aapl = stock_prices['AAPL'].iloc[:split]\n",
664 |     "test_aapl = stock_prices['AAPL'].iloc[split:]\n",
665 |     "train_msft = stock_prices['MSFT'].iloc[:split]\n",
666 |     "test_msft = stock_prices['MSFT'].iloc[split:]"
667 |    ]
668 |   },
669 |   {
670 |    "cell_type": "code",
671 |    "execution_count": null,
672 |    "metadata": {},
673 |    "outputs": [],
674 |    "source": [
675 |     "arima_aapl = ARIMA(train_aapl,order=(9, 1, 9))\n",
676 |     "arima_fit_aapl = arima_aapl.fit()"
677 |    ]
678 |   },
679 |   {
680 |    "cell_type": "code",
681 |    "execution_count": null,
682 |    "metadata": {},
683 |    "outputs": [],
684 |    "source": [
685 |     "arima_msft = ARIMA(train_msft, order=(6, 1, 6))\n",
686 |     "arima_fit_msft = arima_msft.fit()"
687 |    ]
688 |   },
689 |   {
690 |    "cell_type": "code",
691 |    "execution_count": null,
692 |    "metadata": {},
693 |    "outputs": [],
694 |    "source": [
695 |     "arima_predict_aapl = arima_fit_aapl.predict(start=len(train_aapl), \n",
696 |     "                                                  end=len(train_aapl)\\\n",
697 |     "                                                  + len(test_aapl) - 1, \n",
698 |     "                                                  dynamic=False)\n",
699 |     "arima_predict_msft = arima_fit_msft.predict(start=len(train_msft), \n",
700 |     "                                                  end=len(train_msft)\\\n",
701 |     "                                                  + len(test_msft) - 1, \n",
702 |     "                                                  dynamic=False)"
703 |    ]
704 |   },
705 |   {
706 |    "cell_type": "code",
707 |    "execution_count": null,
708 |    "metadata": {},
709 |    "outputs": [],
710 |    "source": [
711 |     "arima_predict_aapl = pd.DataFrame(arima_predict_aapl)\n",
712 |     "arima_predict_aapl.index = diff_test_aapl.index\n",
713 |     "arima_predict_msft = pd.DataFrame(arima_predict_msft)\n",
714 |     "arima_predict_msft.index = diff_test_msft.index"
715 |    ]
716 |   },
717 |   {
718 |    "cell_type": "code",
719 |    "execution_count": null,
720 |    "metadata": {},
721 |    "outputs": [],
722 |    "source": [
723 |     "fig, ax = plt.subplots(2, 1, figsize=(18, 15))\n",
724 |     " \n",
725 |     "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n",
726 |     "ax[0].plot(arima_predict_aapl, linestyle='solid', label=\"Prediction\")\n",
727 |     "ax[0].set_title('Predicted Stock Price-Apple')\n",
728 |     "ax[0].legend(loc='best')\n",
729 |     "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n",
730 |     "ax[1].plot(arima_predict_msft, linestyle='solid', label=\"Prediction\")\n",
731 |     "ax[1].set_title('Predicted Stock Price-Microsoft')\n",
732 |     "ax[1].legend(loc='best')\n",
733 |     "for ax in ax.flat:\n",
734 |     "    ax.set(xlabel='Date', ylabel='Differenced Price')\n",
735 |     "plt.show()\n"
736 |    ]
737 |   },
738 |   {
739 |    "cell_type": "code",
740 |    "execution_count": null,
741 |    "metadata": {},
742 |    "outputs": [],
743 |    "source": [
744 |     "import itertools"
745 |    ]
746 |   },
747 |   {
748 |    "cell_type": "code",
749 |    "execution_count": null,
750 |    "metadata": {},
751 |    "outputs": [],
752 |    "source": [
753 |     "p = q = range(0, 9)\n",
754 |     "d = range(0, 3)\n",
755 |     "pdq = list(itertools.product(p, d, q))\n",
756 |     "arima_results_aapl = []\n",
757 |     "arima_orders_aapl = []  # orders that fitted; pdq positions shift on failure\n",
758 |     "for param_set in pdq:\n",
759 |     "    try:\n",
760 |     "        arima_fitted_aapl = ARIMA(train_aapl, order=param_set).fit()\n",
761 |     "        arima_results_aapl.append(arima_fitted_aapl.aic)\n",
762 |     "        arima_orders_aapl.append(param_set)\n",
763 |     "    except Exception:  # still lets Ctrl-C stop the long search\n",
764 |     "        continue\n",
765 |     "best_aapl = int(np.nanargmin(arima_results_aapl))  # ignores NaN AICs\n",
766 |     "print('**'*25)\n",
767 |     "print('The Lowest AIC score is {:.4f} and the corresponding parameters are {}'\n",
768 |     "      .format(arima_results_aapl[best_aapl], arima_orders_aapl[best_aapl]))"
769 |    ]
770 |   },
771 |   {
772 |    "cell_type": "code",
773 |    "execution_count": null,
774 |    "metadata": {},
775 |    "outputs": [],
776 |    "source": [
777 |     "arima_aapl = ARIMA(train_aapl, order=(4, 1, 4))\n",
778 |     "arima_fit_aapl = arima_aapl.fit()"
779 |    ]
780 |   },
781 |   {
782 |    "cell_type": "code",
783 |    "execution_count": null,
784 |    "metadata": {},
785 |    "outputs": [],
786 |    "source": [
787 |     "p = q = range(0, 6)\n",
788 |     "d = range(0, 3)\n",
789 |     "pdq = list(itertools.product(p, d, q))\n",
790 |     "arima_results_msft = []\n",
791 |     "arima_orders_msft = []  # orders that fitted; pdq positions shift on failure\n",
792 |     "# NOTE(review): fitted on the full MSFT series, not train_msft -- confirm.\n",
793 |     "for param_set in pdq:\n",
794 |     "    try:\n",
795 |     "        arima_fitted_msft = ARIMA(stock_prices['MSFT'], order=param_set).fit()\n",
796 |     "        arima_results_msft.append(arima_fitted_msft.aic)\n",
797 |     "        arima_orders_msft.append(param_set)\n",
798 |     "    except Exception:  # still lets Ctrl-C stop the long search\n",
799 |     "        continue\n",
800 |     "best_msft = int(np.nanargmin(arima_results_msft))  # ignores NaN AICs\n",
801 |     "print('**' * 25)\n",
802 |     "print('The lowest AIC score is {:.4f} and parameters are {}'\n",
803 |     "      .format(arima_results_msft[best_msft], arima_orders_msft[best_msft]))"
804 |    ]
805 |   },
806 |   {
807 |    "cell_type": "code",
808 |    "execution_count": null,
809 |    "metadata": {},
810 |    "outputs": [],
811 |    "source": [
812 |     "arima_msft = ARIMA(stock_prices['MSFT'], order=(4, 2 ,4))\n",
813 |     "arima_fit_msft= arima_msft.fit()"
814 |    ]
815 |   },
816 |   {
817 |    "cell_type": "code",
818 |    "execution_count": null,
819 |    "metadata": {},
820 |    "outputs": [],
821 |    "source": [
822 |     "arima_predict_aapl = arima_fit_aapl.predict(start=len(train_aapl), \n",
823 |     "                                                  end=len(train_aapl)\\\n",
824 |     "                                                  +len(test_aapl) - 1, \n",
825 |     "                                                  dynamic=False)\n",
826 |     "arima_predict_msft = arima_fit_msft.predict(start=len(train_msft), \n",
827 |     "                                                  end=len(train_msft)\\\n",
828 |     "                                                  + len(test_msft) - 1, \n",
829 |     "                                                  dynamic=False)"
830 |    ]
831 |   },
832 |   {
833 |    "cell_type": "code",
834 |    "execution_count": null,
835 |    "metadata": {},
836 |    "outputs": [],
837 |    "source": [
838 |     "arima_predict_aapl = pd.DataFrame(arima_predict_aapl)\n",
839 |     "arima_predict_aapl.index = diff_test_aapl.index\n",
840 |     "arima_predict_msft = pd.DataFrame(arima_predict_msft)\n",
841 |     "arima_predict_msft.index = diff_test_msft.index"
842 |    ]
843 |   },
844 |   {
845 |    "cell_type": "code",
846 |    "execution_count": null,
847 |    "metadata": {},
848 |    "outputs": [],
849 |    "source": [
850 |     "fig, ax = plt.subplots(2, 1, figsize=(18, 15))\n",
851 |     " \n",
852 |     "ax[0].plot(diff_test_aapl, label='Actual Stock Price', linestyle='--')\n",
853 |     "ax[0].plot(arima_predict_aapl, linestyle='solid', label=\"Prediction\")\n",
854 |     "ax[0].set_title('Predicted Stock Price-Apple')\n",
855 |     "ax[0].legend(loc='best')\n",
856 |     "ax[1].plot(diff_test_msft, label='Actual Stock Price', linestyle='--')\n",
857 |     "ax[1].plot(arima_predict_msft, linestyle='solid', label=\"Prediction\")\n",
858 |     "ax[1].set_title('Predicted Stock Price-Microsoft')\n",
859 |     "ax[1].legend(loc='best')\n",
860 |     "for ax in ax.flat:\n",
861 |     "    ax.set(xlabel='Date', ylabel='Differenced Price')\n",
862 |     "plt.show()"
863 |    ]
864 |   }
865 |  ],
866 |  "metadata": {
867 |   "kernelspec": {
868 |    "display_name": "Python 3",
869 |    "language": "python",
870 |    "name": "python3"
871 |   },
872 |   "language_info": {
873 |    "codemirror_mode": {
874 |     "name": "ipython",
875 |     "version": 3
876 |    },
877 |    "file_extension": ".py",
878 |    "mimetype": "text/x-python",
879 |    "name": "python",
880 |    "nbconvert_exporter": "python",
881 |    "pygments_lexer": "ipython3",
882 |    "version": "3.8.8"
883 |   },
884 |   "latex_envs": {
885 |    "LaTeX_envs_menu_present": true,
886 |    "autoclose": false,
887 |    "autocomplete": true,
888 |    "bibliofile": "biblio.bib",
889 |    "cite_by": "apalike",
890 |    "current_citInitial": 1,
891 |    "eqLabelWithNumbers": true,
892 |    "eqNumInitial": 1,
893 |    "hotkeys": {
894 |     "equation": "Ctrl-E",
895 |     "itemize": "Ctrl-I"
896 |    },
897 |    "labels_anchors": false,
898 |    "latex_user_defs": false,
899 |    "report_style_numbering": false,
900 |    "user_envs_cfg": false
901 |   },
902 |   "toc": {
903 |    "base_numbering": 1,
904 |    "nav_menu": {},
905 |    "number_sections": true,
906 |    "sideBar": true,
907 |    "skip_h1_title": false,
908 |    "title_cell": "Table of Contents",
909 |    "title_sidebar": "Contents",
910 |    "toc_cell": false,
911 |    "toc_position": {},
912 |    "toc_section_display": true,
913 |    "toc_window_display": false
914 |   }
915 |  },
916 |  "nbformat": 4,
917 |  "nbformat_minor": 4
918 | }
919 | 


--------------------------------------------------------------------------------
/codes/chp_9.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import matplotlib.pyplot as plt\n",
 11 |     "import numpy as np\n",
 12 |     "import seaborn as sns; sns.set()\n",
 13 |     "pd.set_option('use_inf_as_na', True)\n",
 14 |     "import warnings\n",
 15 |     "warnings.filterwarnings('ignore')\n",
 16 |     "plt.rcParams['figure.dpi'] = 300\n",
 17 |     "plt.rcParams['savefig.dpi'] = 300"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": null,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "crash_data = pd.read_csv('datasets/crash_data.csv')"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "crash_data.head()"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "crash_data.date = pd.to_datetime(crash_data.date, format='%Y%m%d')\n",
 45 |     "crash_data = crash_data.set_index('date')"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "crash_dataw = crash_data.groupby('TICKER').resample('W').\\\n",
 55 |     "              agg({'RET':'mean', 'vwretx':'mean', 'VOL':'mean',\n",
 56 |     "                   'BIDLO':'mean', 'ASKHI':'mean', 'PRC':'mean'})"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "crash_dataw = crash_dataw.reset_index()\n",
 66 |     "crash_dataw.dropna(inplace=True)\n",
 67 |     "stocks = crash_dataw.TICKER.unique()"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "plt.figure(figsize=(12, 8))\n",
 77 |     "k = 1\n",
 78 |     "\n",
 79 |     "for i in stocks[: 4]:\n",
 80 |     "    plt.subplot(2, 2, k)\n",
 81 |     "    plt.hist(crash_dataw[crash_dataw.TICKER == i]['RET'])\n",
 82 |     "    plt.title('Histogram of '+i)\n",
 83 |     "    k+=1\n",
 84 |     "plt.show()"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "## Firm-Specific Return"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "import statsmodels.api as sm\n",
101 |     "residuals = []\n",
102 |     "\n",
103 |     "for i in stocks:\n",
104 |     "    Y = crash_dataw.loc[crash_dataw['TICKER'] == i]['RET'].values\n",
105 |     "    X = crash_dataw.loc[crash_dataw['TICKER'] == i]['vwretx'].values\n",
106 |     "    X = sm.add_constant(X)\n",
107 |     "    ols = sm.OLS(Y[2:-2], X[2:-2] + X[1:-3] + X[0:-4] + \\\n",
108 |     "                 X[3:-1] + X[4:]).fit()\n",
109 |     "    residuals.append(ols.resid)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "residuals = list(map(lambda x: np.log(1 + x), residuals))"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "crash_data_sliced = pd.DataFrame([])\n",
128 |     "for i in stocks:\n",
129 |     "    crash_data_sliced = crash_data_sliced.\\\n",
130 |     "                        append(crash_dataw.loc[crash_dataw.TICKER == i]\n",
131 |     "                               [2:-2])\n",
132 |     "crash_data_sliced.head()"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "## Elliptic Envelope"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "from sklearn.covariance import EllipticEnvelope\n",
149 |     "envelope = EllipticEnvelope(contamination=0.02, support_fraction=1)\n",
150 |     "ee_predictions = {}\n",
151 |     "\n",
152 |     "for i, j in zip(range(len(stocks)), stocks):\n",
153 |     "    envelope.fit(np.array(residuals[i]).reshape(-1, 1))\n",
154 |     "    ee_predictions[j] = envelope.predict(np.array(residuals[i])\n",
155 |     "                                         .reshape(-1, 1))"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "transform = []\n",
165 |     "\n",
166 |     "for i in stocks:\n",
167 |     "    for j in range(len(ee_predictions[i])):\n",
168 |     "        transform.append(np.where(ee_predictions[i][j] == 1, 0, -1))"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": null,
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "crash_data_sliced = crash_data_sliced.reset_index()\n",
178 |     "crash_data_sliced['residuals'] = np.concatenate(residuals)\n",
179 |     "crash_data_sliced['neg_outliers'] = np.where((np.array(transform)) == -1, 1, 0)\n",
180 |     "crash_data_sliced.loc[(crash_data_sliced.neg_outliers == 1) &\n",
181 |     "                      (crash_data_sliced.residuals > 0),\n",
182 |     "                      'neg_outliers'] = 0"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": null,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "crash_data_sliced['neg_outliers'].value_counts()"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "plt.figure(figsize=(12, 8)) \n",
201 |     "k=1\n",
202 |     "\n",
203 |     "for i in stocks[8:12]:\n",
204 |     "    plt.subplot(2, 2, k)\n",
205 |     "    crash_data_sliced['residuals'][crash_data_sliced.TICKER == i]\\\n",
206 |     "    .hist(label='normal', bins=30, color='gray')\n",
207 |     "    outliers = crash_data_sliced['residuals'][(crash_data_sliced.TICKER == i) &\n",
208 |     "    (crash_data_sliced.neg_outliers > 0)]\n",
209 |     "    outliers.hist(color='black', label='anomaly') \n",
210 |     "    plt.title(i)\n",
211 |     "    plt.legend()\n",
212 |     "    k+=1\n"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {},
219 |    "outputs": [],
220 |    "source": [
221 |     "crash_data_sliced = crash_data_sliced.set_index('date')\n",
222 |     "crash_data_sliced.index = pd.to_datetime(crash_data_sliced.index)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": null,
228 |    "metadata": {},
229 |    "outputs": [],
230 |    "source": [
231 |     "std = crash_data.groupby('TICKER')['RET'].resample('W').std()\\\n",
232 |     "      .reset_index()\n",
233 |     "crash_dataw['std'] = pd.DataFrame(std['RET'])"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "yearly_data = crash_data_sliced.groupby('TICKER')['residuals']\\\n",
243 |     "              .resample('Y').agg({'residuals':{'mean', 'std'}})\\\n",
244 |     "              .reset_index()\n",
245 |     "yearly_data.columns = ['TICKER', 'date', 'mean', 'std']\n",
246 |     "yearly_data.head()"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": [
255 |     "merge_crash = pd.merge(crash_data_sliced.reset_index(), yearly_data,\n",
256 |     "                       how='outer', on=['TICKER', 'date'])"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": null,
262 |    "metadata": {},
263 |    "outputs": [],
264 |    "source": [
265 |     "merge_crash[['annual_mean', 'annual_std']] = merge_crash\\\n",
266 |     "                                             .sort_values(by=['TICKER',\n",
267 |     "                                                              'date'])\\\n",
268 |     "                                             .iloc[:, -2:]\\\n",
269 |     "                                             .fillna(method='bfill')\n",
270 |     "merge_crash['residuals'] = merge_crash.sort_values(by=['TICKER',\n",
271 |     "                                                       'date'])\\\n",
272 |     "                                                      ['residuals']\\\n",
273 |     "                                             .fillna(method='ffill')\n",
274 |     "merge_crash = merge_crash.drop(merge_crash.iloc[: ,-4:-2], axis=1)"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": null,
280 |    "metadata": {},
281 |    "outputs": [],
282 |    "source": [
283 |     "crash_risk_out = []\n",
284 |     "\n",
285 |     "for j in stocks:\n",
286 |     "    for k in range(len(merge_crash[merge_crash.TICKER == j])):\n",
287 |     "        if merge_crash[merge_crash.TICKER == j]['residuals'].iloc[k] < \\\n",
288 |     "        merge_crash[merge_crash.TICKER == j]['annual_mean'].iloc[k] - \\\n",
289 |     "        3.09 * \\\n",
290 |     "        merge_crash[merge_crash.TICKER == j]['annual_std'].iloc[k]:\n",
291 |     "            crash_risk_out.append(1)\n",
292 |     "        else:\n",
293 |     "            crash_risk_out.append(0)"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "merge_crash['crash_risk'] = crash_risk_out\n",
303 |     "merge_crash['crash_risk'].value_counts()"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": null,
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "merge_crash = merge_crash.set_index('date')\n",
313 |     "merge_crash_annual = merge_crash.groupby('TICKER')\\\n",
314 |     "                     .resample('1Y')['crash_risk'].sum().reset_index()"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": null,
320 |    "metadata": {},
321 |    "outputs": [],
322 |    "source": [
323 |     "down = []\n",
324 |     "\n",
325 |     "for j in range(len(merge_crash)):\n",
326 |     "    if merge_crash['residuals'].iloc[j] < \\\n",
327 |     "       merge_crash['annual_mean'].iloc[j]:\n",
328 |     "        down.append(1)\n",
329 |     "    else:\n",
330 |     "        down.append(0)"
331 |    ]
332 |   },
333 |   {
334 |    "cell_type": "code",
335 |    "execution_count": null,
336 |    "metadata": {},
337 |    "outputs": [],
338 |    "source": [
339 |     "merge_crash = merge_crash.reset_index()\n",
340 |     "merge_crash['down'] = pd.DataFrame(down)\n",
341 |     "merge_crash['up'] = 1 - merge_crash['down']\n",
342 |     "down_residuals = merge_crash[merge_crash.down == 1]\\\n",
343 |     "                 [['residuals', 'TICKER', 'date']]\n",
344 |     "up_residuals = merge_crash[merge_crash.up == 1]\\\n",
345 |     "               [['residuals', 'TICKER', 'date']]"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": null,
351 |    "metadata": {},
352 |    "outputs": [],
353 |    "source": [
354 |     "down_residuals['residuals_down_sq'] = down_residuals['residuals'] ** 2\n",
355 |     "down_residuals['residuals_down_cubic'] = down_residuals['residuals'] **3\n",
356 |     "up_residuals['residuals_up_sq'] = up_residuals['residuals'] ** 2\n",
357 |     "up_residuals['residuals_up_cubic'] = up_residuals['residuals'] ** 3\n",
358 |     "down_residuals['down_residuals'] = down_residuals['residuals']\n",
359 |     "up_residuals['up_residuals'] = up_residuals['residuals']\n",
360 |     "del down_residuals['residuals']\n",
361 |     "del up_residuals['residuals']"
362 |    ]
363 |   },
364 |   {
365 |    "cell_type": "code",
366 |    "execution_count": null,
367 |    "metadata": {},
368 |    "outputs": [],
369 |    "source": [
370 |     "merge_crash['residuals_sq'] = merge_crash['residuals'] ** 2\n",
371 |     "merge_crash['residuals_cubic'] = merge_crash['residuals'] ** 3"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": null,
377 |    "metadata": {},
378 |    "outputs": [],
379 |    "source": [
380 |     "merge_crash_all = merge_crash.merge(down_residuals,\n",
381 |     "                                    on=['TICKER', 'date'],\n",
382 |     "                                    how='outer')\n",
383 |     "merge_crash_all = merge_crash_all.merge(up_residuals,\n",
384 |     "                                        on=['TICKER', 'date'],\n",
385 |     "                                        how='outer')"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": null,
391 |    "metadata": {},
392 |    "outputs": [],
393 |    "source": [
394 |     "cols = ['BIDLO', 'ASKHI', 'residuals', \n",
395 |     "        'annual_std', 'residuals_sq', 'residuals_cubic',\n",
396 |     "        'down', 'up', 'residuals_up_sq', 'residuals_down_sq',\n",
397 |     "        'neg_outliers']\n",
398 |     "merge_crash_all = merge_crash_all.set_index('date')\n",
399 |     "merge_grouped = merge_crash_all.groupby('TICKER')[cols]\\\n",
400 |     "                .resample('1Y').sum().reset_index()\n",
401 |     "merge_grouped['neg_outliers'] = np.where(merge_grouped.neg_outliers >=\n",
402 |     "                                         1, 1, 0)"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": null,
408 |    "metadata": {},
409 |    "outputs": [],
410 |    "source": [
411 |     "merge_grouped = merge_grouped.set_index('date')\n",
412 |     "merge_all = merge_grouped.groupby('TICKER')\\\n",
413 |     "            .resample('1Y').agg({'down':['sum', 'count'],\n",
414 |     "                                 'up':['sum', 'count']})\\\n",
415 |     "            .reset_index()\n",
416 |     "merge_all.head()"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": null,
422 |    "metadata": {},
423 |    "outputs": [],
424 |    "source": [
425 |     "merge_grouped['down'] = merge_all['down']['sum'].values\n",
426 |     "merge_grouped['up'] = merge_all['up']['sum'].values\n",
427 |     "merge_grouped['count'] = merge_grouped['down'] + merge_grouped['up']"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": null,
433 |    "metadata": {},
434 |    "outputs": [],
435 |    "source": [
436 |     "merge_grouped = merge_grouped.reset_index()"
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": null,
442 |    "metadata": {},
443 |    "outputs": [],
444 |    "source": [
445 |     "merge_grouped['duvol'] = np.log(((merge_grouped['up'] - 1) * \n",
446 |     "                                 merge_grouped['residuals_down_sq']) /\n",
447 |     "                                ((merge_grouped['down'] - 1) * \n",
448 |     "                                 merge_grouped['residuals_up_sq']))"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "code",
453 |    "execution_count": null,
454 |    "metadata": {},
455 |    "outputs": [],
456 |    "source": [
457 |     "merge_grouped['duvol'].mean()"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "code",
462 |    "execution_count": null,
463 |    "metadata": {},
464 |    "outputs": [],
465 |    "source": [
466 |     "merge_grouped['ncskew'] = - (((merge_grouped['count'] * \n",
467 |     "                               (merge_grouped['count'] - 1) **\n",
468 |     "                               (3 / 2)) * \n",
469 |     "                             merge_grouped['residuals_cubic']) / \n",
470 |     "                             (((merge_grouped['count'] - 1) * \n",
471 |     "                               (merge_grouped['count'] - 2)) * \n",
472 |     "                              merge_grouped['residuals_sq'] **\n",
473 |     "                              (3 / 2)))"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": null,
479 |    "metadata": {},
480 |    "outputs": [],
481 |    "source": [
482 |     "merge_grouped['ncskew'].mean()"
483 |    ]
484 |   },
485 |   {
486 |    "cell_type": "code",
487 |    "execution_count": null,
488 |    "metadata": {
489 |     "scrolled": true
490 |    },
491 |    "outputs": [],
492 |    "source": [
493 |     "merge_grouped['crash_risk'] = merge_crash_annual['crash_risk']\n",
494 |     "merge_grouped['crash_risk'] = np.where(merge_grouped.crash_risk >= \n",
495 |     "                                       1, 1, 0)"
496 |    ]
497 |   },
498 |   {
499 |    "cell_type": "code",
500 |    "execution_count": null,
501 |    "metadata": {},
502 |    "outputs": [],
503 |    "source": [
504 |     "merge_crash_all_grouped2 = merge_crash_all.groupby('TICKER')\\\n",
505 |     "                            [['VOL', 'PRC']]\\\n",
506 |     "                           .resample('1Y').mean().reset_index()\n",
507 |     "merge_grouped[['VOL', 'PRC']] = merge_crash_all_grouped2[['VOL', 'PRC']]"
508 |    ]
509 |   },
510 |   {
511 |    "cell_type": "code",
512 |    "execution_count": null,
513 |    "metadata": {},
514 |    "outputs": [],
515 |    "source": [
516 |     "merge_grouped[['ncskew','duvol']].corr()"
517 |    ]
518 |   },
519 |   {
520 |    "cell_type": "markdown",
521 |    "metadata": {},
522 |    "source": [
523 |     "## Balance Sheet Data"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "code",
528 |    "execution_count": null,
529 |    "metadata": {},
530 |    "outputs": [],
531 |    "source": [
532 |     "bs = pd.read_csv('datasets/bs_v.3.txt')\n",
533 |     "bs['Date'] = pd.to_datetime(bs.datadate, format='%Y%m%d')\n",
534 |     "bs['annual_date'] = bs['Date'].dt.year"
535 |    ]
536 |   },
537 |   {
538 |    "cell_type": "code",
539 |    "execution_count": null,
540 |    "metadata": {},
541 |    "outputs": [],
542 |    "source": [
543 |     "bs['RoA'] = bs['ni'] / bs['at']\n",
544 |     "bs['leverage'] = bs['lt'] / bs['at']"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "code",
549 |    "execution_count": null,
550 |    "metadata": {},
551 |    "outputs": [],
552 |    "source": [
553 |     "merge_grouped['annual_date'] = merge_grouped['date'].dt.year\n",
554 |     "bs['TICKER'] = bs.tic\n",
555 |     "del bs['tic']"
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "code",
560 |    "execution_count": null,
561 |    "metadata": {},
562 |    "outputs": [],
563 |    "source": [
564 |     "merge_ret_bs = pd.merge(bs, merge_grouped,\n",
565 |     "                        on=['TICKER', 'annual_date'])"
566 |    ]
567 |   },
568 |   {
569 |    "cell_type": "code",
570 |    "execution_count": null,
571 |    "metadata": {},
572 |    "outputs": [],
573 |    "source": [
574 |     "merge_ret_bs2 = merge_ret_bs.set_index('Date')\n",
575 |     "merge_ret_bs2 = merge_ret_bs2.groupby('TICKER').resample('Y').mean()\n",
576 |     "merge_ret_bs2.reset_index(inplace=True)"
577 |    ]
578 |   },
579 |   {
580 |    "cell_type": "code",
581 |    "execution_count": null,
582 |    "metadata": {},
583 |    "outputs": [],
584 |    "source": [
585 |     "merge_ret_bs2['vol_csho_diff'] = (merge_ret_bs2.groupby('TICKER')\n",
586 |     "                                  ['VOL'].shift(-1) / \n",
587 |     "                                  merge_ret_bs2.groupby('TICKER')\n",
588 |     "                                  ['csho'].shift(-1))\n",
589 |     "merge_ret_bs2['dturn1'] = merge_ret_bs2['VOL'] / merge_ret_bs2['csho']\n",
590 |     "merge_ret_bs2['dturn'] = merge_ret_bs2['vol_csho_diff'] - \\\n",
591 |     "                         merge_ret_bs2['dturn1']"
592 |    ]
593 |   },
594 |   {
595 |    "cell_type": "code",
596 |    "execution_count": null,
597 |    "metadata": {},
598 |    "outputs": [],
599 |    "source": [
600 |     "merge_ret_bs2['p/e'] = merge_ret_bs2['PRC'] / merge_ret_bs2['ni']\n",
601 |     "merge_ret_bs2['turnover_rate'] = merge_ret_bs2['VOL'] / \\\n",
602 |     "                                 merge_ret_bs2['csho']\n",
603 |     "merge_ret_bs2['equity_share'] = merge_ret_bs2['ceq'] / \\\n",
604 |     "                                (merge_ret_bs2['ceq'] +\n",
605 |     "                                 merge_ret_bs2['dt'])\n",
606 |     "merge_ret_bs2['firm_size'] = np.log(merge_ret_bs2['at'])\n",
607 |     "merge_ret_bs2['cefd'] = (((merge_ret_bs2['at'] -\n",
608 |     "                           merge_ret_bs2['lt']) / merge_ret_bs2['csho']) - \n",
609 |     "                           merge_ret_bs2['PRC']) / ((merge_ret_bs2['at'] - \n",
610 |     "                           merge_ret_bs2['lt']) / merge_ret_bs2['csho'])"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": null,
616 |    "metadata": {},
617 |    "outputs": [],
618 |    "source": [
619 |     "merge_ret_bs2 = merge_ret_bs2.set_index('Date')\n",
620 |     "merge_ret_bs2['buying_volume'] = merge_ret_bs2['VOL'] * \\\n",
621 |     "                                 (merge_ret_bs2['PRC'] - \n",
622 |     "                                  merge_ret_bs2['BIDLO']) / \\\n",
623 |     "                                 (merge_ret_bs2['ASKHI'] - \n",
624 |     "                                  merge_ret_bs2['BIDLO'])\n",
625 |     "merge_ret_bs2['selling_volume'] = merge_ret_bs2['VOL'] * \\\n",
626 |     "                                  (merge_ret_bs2['ASKHI'] - \n",
627 |     "                                   merge_ret_bs2['PRC']) / \\\n",
628 |     "                                  (merge_ret_bs2['ASKHI'] - \n",
629 |     "                                   merge_ret_bs2['BIDLO'])\n",
630 |     "buying_volume = merge_ret_bs2.groupby('TICKER')['buying_volume'] \\\n",
631 |     "                .resample('Y').sum().reset_index()\n",
632 |     "selling_volume = merge_ret_bs2.groupby('TICKER')['selling_volume'] \\\n",
633 |     "                .resample('Y').sum().reset_index()\n",
634 |     "del buying_volume['TICKER']\n",
635 |     "del buying_volume['Date']"
636 |    ]
637 |   },
638 |   {
639 |    "cell_type": "code",
640 |    "execution_count": null,
641 |    "metadata": {},
642 |    "outputs": [],
643 |    "source": [
644 |     "buy_sel_vol = pd.concat([buying_volume,selling_volume], axis=1)\n",
645 |     "buy_sel_vol['bsi'] = (buy_sel_vol.buying_volume - \n",
646 |     "                      buy_sel_vol.selling_volume) / \\\n",
647 |     "                     (buy_sel_vol.buying_volume + \n",
648 |     "                      buy_sel_vol.selling_volume)"
649 |    ]
650 |   },
651 |   {
652 |    "cell_type": "code",
653 |    "execution_count": null,
654 |    "metadata": {},
655 |    "outputs": [],
656 |    "source": [
657 |     "merge_ret_bs2 = merge_ret_bs2.reset_index()\n",
658 |     "merge_ret_bs2 = pd.merge(buy_sel_vol ,merge_ret_bs2,\n",
659 |     "                         on=['TICKER', 'Date'])"
660 |    ]
661 |   },
662 |   {
663 |    "cell_type": "markdown",
664 |    "metadata": {},
665 |    "source": [
666 |     "## Firm Sentiment via PCA"
667 |    ]
668 |   },
669 |   {
670 |    "cell_type": "code",
671 |    "execution_count": null,
672 |    "metadata": {},
673 |    "outputs": [],
674 |    "source": [
675 |     "from sklearn.preprocessing import StandardScaler\n",
676 |     "from sklearn.decomposition import PCA"
677 |    ]
678 |   },
679 |   {
680 |    "cell_type": "code",
681 |    "execution_count": null,
682 |    "metadata": {},
683 |    "outputs": [],
684 |    "source": [
685 |     "firm_sentiment = merge_ret_bs2[['p/e', 'turnover_rate',\n",
686 |     "                                'equity_share', 'cefd',\n",
687 |     "                                'leverage', 'bsi']]\n",
688 |     "firm_sentiment = firm_sentiment.apply(lambda x: x.fillna(x.mean()),\n",
689 |     "                                      axis=0)"
690 |    ]
691 |   },
692 |   {
693 |    "cell_type": "code",
694 |    "execution_count": null,
695 |    "metadata": {
696 |     "scrolled": true
697 |    },
698 |    "outputs": [],
699 |    "source": [
700 |     "firm_sentiment_std = StandardScaler().fit_transform(firm_sentiment)\n",
701 |     "pca = PCA(n_components=6)\n",
702 |     "pca_market_sentiment = pca.fit_transform(firm_sentiment_std)\n",
703 |     "print('Explained Variance Ratios per Component are:\\n {}'\\\n",
704 |     "      .format(pca.explained_variance_ratio_))"
705 |    ]
706 |   },
707 |   {
708 |    "cell_type": "code",
709 |    "execution_count": null,
710 |    "metadata": {},
711 |    "outputs": [],
712 |    "source": [
713 |     "loadings_1 = pd.DataFrame(pca.components_.T * \n",
714 |     "                          np.sqrt(pca.explained_variance_), \n",
715 |     "                          columns=['PC1', 'PC2', 'PC3',\n",
716 |     "                                   'PC4', 'PC5', 'PC6'],\n",
717 |     "                          index=firm_sentiment.columns)\n",
718 |     "loadings_1"
719 |    ]
720 |   },
721 |   {
722 |    "cell_type": "code",
723 |    "execution_count": null,
724 |    "metadata": {},
725 |    "outputs": [],
726 |    "source": [
727 |     "df_loading1 = pd.DataFrame(loadings_1.mean(axis=1))\n",
728 |     "df_loading1"
729 |    ]
730 |   },
731 |   {
732 |    "cell_type": "code",
733 |    "execution_count": null,
734 |    "metadata": {},
735 |    "outputs": [],
736 |    "source": [
737 |     "firm_sentiment = pd.DataFrame(np.dot(pca_market_sentiment,\n",
738 |     "                                     np.array(df_loading1)))\n",
739 |     "merge_ret_bs2['firm_sent'] = firm_sentiment"
740 |    ]
741 |   },
742 |   {
743 |    "cell_type": "markdown",
744 |    "metadata": {},
745 |    "source": [
746 |     "## Panel Data Application"
747 |    ]
748 |   },
749 |   {
750 |    "cell_type": "code",
751 |    "execution_count": null,
752 |    "metadata": {},
753 |    "outputs": [],
754 |    "source": [
755 |     "merge_ret_bs2['log_size'] = np.log(merge_ret_bs2['at'])"
756 |    ]
757 |   },
758 |   {
759 |    "cell_type": "code",
760 |    "execution_count": null,
761 |    "metadata": {},
762 |    "outputs": [],
763 |    "source": [
764 |     "merge_ret_bs2.set_index(['TICKER', 'Date'], inplace=True)"
765 |    ]
766 |   },
767 |   {
768 |    "cell_type": "code",
769 |    "execution_count": null,
770 |    "metadata": {},
771 |    "outputs": [],
772 |    "source": [
773 |     "X = (merge_ret_bs2[['log_size', 'rect', 'ppegt', 'dturn',\n",
774 |     "                'ncskew', 'residuals', 'RoA', 'annual_std',\n",
775 |     "                'firm_sent']]).shift(1)\n",
776 |     "X['neg_outliers'] = merge_ret_bs2['neg_outliers']"
777 |    ]
778 |   },
779 |   {
780 |    "cell_type": "code",
781 |    "execution_count": null,
782 |    "metadata": {},
783 |    "outputs": [],
784 |    "source": [
785 |     "from pyeconometrics.panel_discrete_models import FixedEffectPanelModel\n",
786 |     "from sklearn.model_selection import train_test_split\n",
787 |     "from sklearn.metrics import accuracy_score"
788 |    ]
789 |   },
790 |   {
791 |    "cell_type": "code",
792 |    "execution_count": null,
793 |    "metadata": {
794 |     "scrolled": true
795 |    },
796 |    "outputs": [],
797 |    "source": [
798 |     "FE_ML = FixedEffectPanelModel()\n",
799 |     "FE_ML.fit(X, 'neg_outliers')\n",
800 |     "FE_ML.summary()"
801 |    ]
802 |   },
803 |   {
804 |    "cell_type": "code",
805 |    "execution_count": null,
806 |    "metadata": {},
807 |    "outputs": [],
808 |    "source": [
809 |     "del X['neg_outliers']\n",
810 |     "X['crash_risk'] = merge_ret_bs2['crash_risk']"
811 |    ]
812 |   },
813 |   {
814 |    "cell_type": "code",
815 |    "execution_count": null,
816 |    "metadata": {
817 |     "scrolled": false
818 |    },
819 |    "outputs": [],
820 |    "source": [
821 |     "FE_crash = FixedEffectPanelModel()\n",
822 |     "FE_crash.fit(X, 'crash_risk')\n",
823 |     "FE_crash.summary()"
824 |    ]
825 |   }
826 |  ],
827 |  "metadata": {
828 |   "kernelspec": {
829 |    "display_name": "Python 3",
830 |    "language": "python",
831 |    "name": "python3"
832 |   },
833 |   "language_info": {
834 |    "codemirror_mode": {
835 |     "name": "ipython",
836 |     "version": 3
837 |    },
838 |    "file_extension": ".py",
839 |    "mimetype": "text/x-python",
840 |    "name": "python",
841 |    "nbconvert_exporter": "python",
842 |    "pygments_lexer": "ipython3",
843 |    "version": "3.8.8"
844 |   },
845 |   "latex_envs": {
846 |    "LaTeX_envs_menu_present": true,
847 |    "autoclose": false,
848 |    "autocomplete": true,
849 |    "bibliofile": "biblio.bib",
850 |    "cite_by": "apalike",
851 |    "current_citInitial": 1,
852 |    "eqLabelWithNumbers": true,
853 |    "eqNumInitial": 1,
854 |    "hotkeys": {
855 |     "equation": "Ctrl-E",
856 |     "itemize": "Ctrl-I"
857 |    },
858 |    "labels_anchors": false,
859 |    "latex_user_defs": false,
860 |    "report_style_numbering": false,
861 |    "user_envs_cfg": false
862 |   },
863 |   "toc": {
864 |    "base_numbering": 1,
865 |    "nav_menu": {},
866 |    "number_sections": false,
867 |    "sideBar": true,
868 |    "skip_h1_title": false,
869 |    "title_cell": "Table of Contents",
870 |    "title_sidebar": "Contents",
871 |    "toc_cell": false,
872 |    "toc_position": {},
873 |    "toc_section_display": true,
874 |    "toc_window_display": false
875 |   }
876 |  },
877 |  "nbformat": 4,
878 |  "nbformat_minor": 4
879 | }
880 | 


--------------------------------------------------------------------------------