├── .gitignore
├── Procfile
├── README.md
├── References.md
├── correlations
│   └── correlation_plot.ipynb
├── decision_trees
│   ├── decision_tree.JPG
│   ├── decision_tree.pptx
│   └── decision_trees.ipynb
├── dynamically_controlled_kernel_estimation
│   ├── dcke
│   │   ├── __init__.py
│   │   ├── dcke.py
│   │   └── test_dcke.py
│   ├── dynamically_controlled_kernel_estimation.ipynb
│   ├── locreg
│   │   ├── __init__.py
│   │   ├── local_regression.py
│   │   └── test_local_regression.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── black_scholes.py
│   │   └── test_black_scholes.py
│   └── pics
│       ├── american_option_pricing.png
│       ├── conditional_expectation_orthogonal_projection.png
│       ├── dcke_basket_heston.jpg
│       ├── dcke_performance.jpg
│       └── dcke_rbergomi.jpg
├── ensemble
│   ├── adaboost_classifier.ipynb
│   └── adaboost_regressor.ipynb
├── environment-explicit.txt
├── environment.yml
├── gaussian_process_regression
│   └── gaussian_process_regression.ipynb
├── lda_qda
│   └── linear_quadratic_discriminant_analysis.ipynb
├── local_regression
│   ├── local_regression.ipynb
│   └── locreg
│       ├── __init__.py
│       ├── local_regression.py
│       └── test_local_regression.py
├── logistic_regression
│   └── logistic_regression.ipynb
├── lstm_intro
│   ├── lstm.pdf
│   ├── lstm_cell.pdf
│   ├── lstm_cell.png
│   └── lstm_intro.ipynb
├── naive_bayes
│   └── naive_bayes.ipynb
├── network_topology_selection
│   ├── data_how_deep_financial_models.zip
│   ├── data_surgery.zip
│   ├── how_deep_are_financial_models.ipynb
│   ├── keras_grid
│   │   ├── __init__.py
│   │   ├── model_grid.py
│   │   └── test_model_grid.py
│   ├── network_topology_selection.ipynb
│   ├── networks_financial_models_brain_surgery.ipynb
│   └── pricinglib
│       ├── __init__.py
│       ├── black_scholes.py
│       └── heston.py
├── neural_network_intro
│   └── neural_network_intro_model_setup.ipynb
├── newton_gradient_backprop
│   ├── adjoint.ipynb
│   ├── backpropagation.ipynb
│   ├── gradient_descent.ipynb
│   └── newton.ipynb
├── regression_revisited
│   └── regression_revisited.ipynb
├── requirements.txt
├── runtime.txt
└── start.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | .idea/
107 |
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: voila --port=$PORT --no-browser --enable_nbextensions=True --strip_sources=False
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine Learning Fundamentals
2 |
3 | ## Probability
4 | * [Correlation Matrices](https://github.com/niknow/machine-learning-examples/blob/master/correlations/correlation_plot.ipynb): A Plotly visualization of the space of 3x3 correlation matrices using a convenient parametrization
5 |
6 |
7 | ## Classification
8 | * [Naive Bayes](https://github.com/niknow/machine-learning-examples/blob/master/naive_bayes/naive_bayes.ipynb): Introduction, derivation and reconciliation of Naive Bayes - a baseline model for classification.
9 |
10 | * [Linear / Quadratic Discriminant Analysis](https://github.com/niknow/machine-learning-examples/blob/master/lda_qda/linear_quadratic_discriminant_analysis.ipynb): Introduction, derivation, properties and examples of LDA/QDA classification
11 |
12 | * [Logistic Regression](https://github.com/niknow/machine-learning-examples/blob/master/logistic_regression/logistic_regression.ipynb): Definitions, Binary and multi-class case, Sigmoid and Softmax functions, Cross-Entropy Loss, Regularization, Examples
13 |
14 | * [Decision Tree Classifiers](https://github.com/niknow/machine-learning-examples/blob/master/decision_trees/decision_trees.ipynb): Graph theory, binary rooted trees, impurity functions, minimal cost-complexity pruning
15 |
16 |
17 | ## Advanced Regression Techniques
18 |
19 | * [Linear Regression](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/regression_revisited/regression_revisited.ipynb): A recap of linear regression - a cornerstone of machine learning.
20 |
21 | * [Local Regression](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/local_regression/local_regression.ipynb): Local regression is a refinement of linear regression that adapts the model at each point of the prediction.
22 |
23 | * [Gaussian Process Regression (GPR)](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/gaussian_process_regression/gaussian_process_regression.ipynb): An advanced regression technique that produces not only predictions, but also confidence bounds around them.
24 |
25 | * [Dynamically Controlled Kernel Estimation (DCKE)](https://github.com/niknow/machine-learning-examples/blob/master/dynamically_controlled_kernel_estimation/dynamically_controlled_kernel_estimation.ipynb): A combination of local regression, control variates and Gaussian process regression to estimate conditional expectations. The method is model free, data-driven and particularly suited for financial applications.
26 |
27 | * [Decision Tree Regressors](https://github.com/niknow/machine-learning-examples/blob/master/decision_trees/decision_trees.ipynb): Decision trees can be used for regression as well
28 |
29 |
30 | ## Neural Network Topologies
31 |
32 | * [Multilayer Perceptrons (MLP)](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/neural_network_intro/neural_network_intro_model_setup.ipynb): Introduction to the most common form of artificial neural networks (ANN).
33 |
34 | * [Long Short-Term Memory networks (LSTM)](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/lstm_intro/lstm_intro.ipynb): Introduction to LSTMs, a popular form of recurrent neural networks (RNNs).
35 |
36 | * [Network Topology Selection](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/network_topology_selection/network_topology_selection.ipynb): A methodology to choose a topology for a neural network, e.g. the number of hidden layers and units.
37 |
38 |
39 | ## Ensemble Learning
40 |
41 | * [Boosting Classifications](https://github.com/niknow/machine-learning-examples/blob/master/ensemble/adaboost_classifier.ipynb): Boosting decision tree classifiers is a very common form of ensemble learning. We discuss the famous SAMME algorithm including the weak classifier training, and motivate its weighting and the exponential loss function.
42 |
43 | * [Boosting Regressions](https://github.com/niknow/machine-learning-examples/blob/master/ensemble/adaboost_regressor.ipynb): Boosting regressors is possible, but slightly different from classifier boosting. We discuss the popular AdaBoost.R2 algorithm including the bootstrap sampling and its differences from SAMME classifier boosting.
44 |
45 |
46 | ## Training Networks & Optimization Techniques
47 |
48 | * [Newton's Method](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/newton_gradient_backprop/newton.ipynb): A recap of Newton's method.
49 |
50 | * [Gradient Descent - Basics](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/newton_gradient_backprop/gradient_descent.ipynb): Mathematical foundations and basics of gradient descent.
51 |
52 | * [Gradient Descent - Advanced](https://nbviewer.jupyter.org/github/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--/blob/master/stochastic_gradient_descent.ipynb) [[@lapsilago]](https://github.com/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--): Some illustrations, background and examples of gradient descent.
53 |
54 | * [Backpropagation](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/newton_gradient_backprop/backpropagation.ipynb): Derivation of the backpropagation algorithm.
55 |
56 | * [Adjoint Method](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/newton_gradient_backprop/adjoint.ipynb): Relationship between backpropagation and the adjoint method.
57 |
58 | ## Basic Examples
59 |
60 | * [Learning the Sine](https://nbviewer.jupyter.org/github/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--/blob/master/LearnSine_JK.ipynb) [[@lapsilago]](https://github.com/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--): A simple example of how to use Keras and TensorFlow to learn a curve.
61 |
62 | * [Learning a 2D function](https://nbviewer.jupyter.org/github/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--/blob/master/Learn2dFunction.ipynb) [[@lapsilago]](https://github.com/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--): A slightly more complex example of how to learn a surface.
63 |
64 |
65 | # Machine Learning & Quantitative Finance
66 | * [How deep are financial models?](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/network_topology_selection/how_deep_are_financial_models.ipynb): Learn the pricing functions of the Black-Scholes and Heston models. An application of [network topology selection](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/network_topology_selection/network_topology_selection.ipynb).
67 |
68 | * [Neural Network Brain Surgery](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/network_topology_selection/networks_financial_models_brain_surgery.ipynb): Can the difference between the Black-Scholes and the Heston model be visualized as the brains of the networks that learn their pricing function?
69 |
70 | * [Calibrating Heston](https://nbviewer.jupyter.org/github/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--/blob/master/Calibration_Illustration.ipynb) [[@lapsilago]](https://github.com/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--): Learn the calibration function of a Heston model using a neural network.
71 |
72 | * [Calibrating Hull-White](https://nbviewer.jupyter.org/github/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--/blob/master/HW_1F_Pricing.ipynb) [[@lapsilago]](https://github.com/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--): Learn the calibration function of a Hull-White model using a neural network.
73 |
74 | * [Autograd](https://nbviewer.jupyter.org/github/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--/blob/master/Autograd.ipynb) [[@lapsilago]](https://github.com/Lapsilago/Machine-Learning---Option-Pricing-Calibration-Hedging--): A small example of how to automatically differentiate the Black-Scholes pricing formula.
75 |
76 |
77 | [References](https://github.com/niknow/machine-learning-examples/blob/master/References.md)
78 |
--------------------------------------------------------------------------------
/References.md:
--------------------------------------------------------------------------------
1 | # Machine Learning & Quant Finance
2 |
3 | ## Deep Pricing
4 | * [[URL]](https://arxiv.org/abs/1901.08943) Liu, Oosterlee, Bohte: Pricing options and computing implied volatilities using neural networks.
5 | * [[URL]](https://arxiv.org/abs/1809.02233) Ferguson, Green: Deeply Learning Derivatives.
6 | * [[URL]](https://ssrn.com/abstract=3288882) McGhee: An Artificial Neural Network Representation of the SABR Stochastic Volatility Model.
7 | * [[URL]](https://ssrn.com/abstract=236673) Hutchinson, Lo, Poggio: A Nonparametric Approach to Pricing and Hedging Derivative Securities Via Learning Networks.
8 | * [[URL]](https://ssrn.com/abstract=3191050) Spiegeleer, Madan, Reyners, Schoutens: Machine Learning for Quantitative Finance.
9 | * [[URL]](https://aaltodoc.aalto.fi/handle/123456789/30398) Stark: Machine Learning and Options Pricing.
10 |
11 | ## Deep Calibration
12 | * [[URL]](https://www.researchgate.net/publication/220505020_Machine_Learning_Vasicek_Model_Calibration_with_Gaussian_Processes) Sousa, Esquivel, Gaspar: Machine learning Vasicek model calibration with Gaussian processes.
13 | * [[URL]](https://arxiv.org/abs/1810.03399) Bayer, Stemper: Deep calibration of rough stochastic volatility models.
14 | * [[URL]](https://ssrn.com/abstract=3252432) Dimitroff, Roeder, Fries: Volatility model calibration with convolutional neural networks.
15 | * [[URL]](http://ssrn.com/abstract=2812140) Hernandez: Model Calibration with Neural Networks.
16 | * [[URL]](https://arxiv.org/abs/1901.09647) Horvath, Muguruza, Tomas: Deep Learning Volatility.
17 |
18 | ## Deep Hedging
19 | * [[URL]](https://arxiv.org/abs/1802.03042) Bühler, Gonon, Teichmann, Wood: Deep Hedging.
20 |
21 | ## Curve Dynamics & Term Structures
22 | * [[URL]](https://ssrn.com/abstract=3041232) Kondratyev: Learning Curve Dynamics with Artificial Neural Networks.
23 | * [[URL]](https://arxiv.org/abs/1703.01536) Sambasivan, Das: A Statistical Machine Learning Approach to Yield Curve Forecasting.
24 | * [[URL]](https://arxiv.org/abs/1604.02237) Cousin, Maatouk, Rulliere: Kriging of financial term-structures.
25 |
26 | ## CDS Spreads
27 | * [[URL]](https://arxiv.org/abs/1705.06899) Brummelhuis, Luo: CDS rate construction methods by machine learning techniques.
28 |
29 | ## XVA
30 | * [[URL]](https://arxiv.org/abs/1901.11081) Crépey, Dixon: Gaussian Process Regression for Derivative Portfolio Modeling and Application to CVA Computations
31 | * [[URL]](https://ssrn.com/abstract=3357626) Ma, Spinner, Venditti, Li, Tang: Initial Margin Simulation with Deep Learning
32 |
33 |
34 | # Quantitative Finance
35 |
36 | ## Risk Factor Models
37 | * [[URL]](https://doi.org/10.1093/rfs/3.4.573) Hull, White: Pricing Interest-Rate-Derivative Securities
38 | * [[URL]](https://www.scribd.com/doc/198899911/Evaluating-and-Hedging-Exotic-Swap-Instruments-via-LGM) Hagan: Evaluating and Hedging Exotic Swap Instruments via LGM
39 | * [[URL]](https://doi.org/10.1093/rfs/6.2.327) Heston: A Closed-Form Solution for Options with Stochastic Volatility with Applications to Bond and Currency Options
40 | * [[URL]](https://ssrn.com/abstract=946405) Andersen: Efficient Simulation of the Heston Stochastic Volatility Model
41 | * [[URL]](https://www.researchgate.net/profile/Patrick_Hagan3/publication/300789919_Probability_Distribution_in_the_SABR_Model_of_Stochastic_Volatility/links/5c91734a299bf11169395d8f/Probability-Distribution-in-the-SABR-Model-of-Stochastic-Volatility.pdf) Hagan, Lesniewski, Woodward: Probability Distribution in the SABR Model of Stochastic Volatility
42 | * [[URL]](https://ssrn.com/abstract=966364) Trolle, Schwartz: A General Stochastic Volatility Model for the Pricing of Interest Rate Derivatives
43 |
44 | ## American Monte Carlo
45 | * [[URL]](https://escholarship.org/uc/item/43n1k4jb) Longstaff, Schwartz: Valuing American Options by Simulation: A Simple Least-Squares Approach
46 | * [[URL]](https://www.mit.edu/~jnt/Papers/J086-01-bvr-options.pdf) Tsitsiklis, Van Roy: Regression Methods for Pricing Complex American-Style Options
47 | * [[URL]](https://www.mit.edu/~jnt/Papers/J074-99-bvr-stop.pdf) Tsitsiklis, Van Roy: Optimal Stopping of Markov Processes: Hilbert Space Theory, Approximation Algorithms, and an Application to Pricing Financial Derivatives
48 |
49 | ## Counterparty Risk
50 | * [[URL]](https://ssrn.com/abstract=1032522) Pykhtin, Zhu: A Guide to Modeling Counterparty Credit Risk
51 |
52 | ## CVA
53 | * [[URL]](https://ssrn.com/abstract=1782063) Pykhtin, Rosen: Pricing Counterparty Risk at the Trade Level and CVA Allocations
54 |
55 | ## FVA
56 | * [[URL]](https://ssrn.com/abstract=2027195) Burgard, Kjaer: Funding Costs, Funding Strategies
57 | * [[URL]](https://ssrn.com/abstract=2157634) Burgard, Kjaer: The FVA Debate: In Theory and Practice
58 | * [[URL]](https://ssrn.com/abstract=1785262) Burgard, Kjaer: In the Balance
59 | * [[URL]](https://www.risk.net/derivatives/1589992/funding-beyond-discounting-collateral-agreements-and-derivatives-pricing) Piterbarg: Funding Beyond Discounting
60 | * [[URL]](https://ssrn.com/abstract=2746010) Andersen, Duffie, Song: Funding Value Adjustments
61 |
62 |
63 |
64 | ## DIM and MVA
65 | * [[URL]](https://ssrn.com/abstract=2716279) Anfuso, Aziz, Giltinan, Loukopoulos: A Sound Modelling and Backtesting Framework for Forecasting Initial Margin Requirements
66 | * [[URL]](https://ssrn.com/abstract=2911167) Caspers, Giltinan, Lichters, Nowaczyk: Forecasting Initial Margin Requirements - A Model Evaluation
67 | * [[URL]](https://arxiv.org/abs/1808.08221) Ruiz, Zeron: Dynamic Initial Margin via Chebyshev Spectral Decomposition
68 | * [[URL]](https://ssrn.com/abstract=3147811) McWalter, Kienitz, Nowaczyk, Rudd, Acar: Dynamic Initial Margin Estimation Based on Quantiles of Johnson Distributions
69 | * [[URL]](https://ssrn.com/abstract=2806156) Andersen, Pykhtin, Sokol: Credit Exposure in the Presence of Initial Margin
70 | * [[URL]](https://ssrn.com/abstract=2902737) Andersen, Pykhtin, Sokol: Rethinking the Margin Period of Risk
71 | * [[URL]](https://ssrn.com/abstract=3040061) Antonov, Issakov, McClelland: Efficient SIMM-MVA Calculations for Callable Exotics
72 | * [[URL]](https://ssrn.com/abstract=3018165) Fries: Fast Stochastic Forward Sensitivities in Monte-Carlo Simulations Using Stochastic Automatic Differentiation (with Applications to Initial Margin Valuation Adjustments (MVA))
73 | * [[URL]](https://arxiv.org/abs/1512.07337) Lou: MVA Transfer Pricing
74 |
75 |
76 |
--------------------------------------------------------------------------------
/decision_trees/decision_tree.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niknow/machine-learning-examples/b147133c7b0115b591739fa4a600462cbaee6225/decision_trees/decision_tree.JPG
--------------------------------------------------------------------------------
/decision_trees/decision_tree.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niknow/machine-learning-examples/b147133c7b0115b591739fa4a600462cbaee6225/decision_trees/decision_tree.pptx
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/dcke/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niknow/machine-learning-examples/b147133c7b0115b591739fa4a600462cbaee6225/dynamically_controlled_kernel_estimation/dcke/__init__.py
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/dcke/dcke.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from inspect import isfunction
3 | import numpy as np
4 | from sklearn.base import RegressorMixin
5 | from copy import deepcopy
6 | from GPy.models import GPRegression
7 |
8 |
9 | class DCKE(RegressorMixin):
10 | """ Dynamically Controlled Kernel Estimation
11 | Computes the conditional expectation $E[Y \mid X=x]$ from
12 | a training set $X_i$, $y_i$, $i=1, \ldots, N$ of joint
13 | realizations of $X$ and $Y$ for an arbitrary prediction
14 | set of $x$'s. The DCKE regressor first uses local regression
15 | on a mesh grid to solve the problem on the mesh grid and then
16 | uses GPR to evaluate in between the points on the mesh grid.
17 | Optionally, a control variate $Z$ can be supplied together
18 | with $\mu_Z = E[Z \mid X=x_k]$ for the points $x_k$ on the
19 | mesh grid. In that case, the expectation
20 | $E[Y +\beta (Z-\mu_Z) \mid X=x_k]$ is computed on the
21 | mesh grid with variance reduced by the correlation between
22 | $Y$ and $Z$.
23 | """
24 |
25 | def __init__(self, locreg, gpr_kernel):
26 | """
27 | Initializes the DCKE object.
28 | :param locreg: an instance of LocalRegression
29 | :param gpr_kernel: an instance of GPy.kern
30 | """
31 | self.locreg = locreg
32 | self.gpr_kernel = gpr_kernel
33 | self.gpr_ = None
34 | self.X_train_ = None
35 | self.y_train_ = None
36 | self.x_mesh_ = None
37 | self.y_mesh_ = None
38 | self.Z_ = None
39 | self.mz_ = None
40 | self.cov_ = None
41 | self.var_ = None
42 | self.beta_ = None
43 |
44 | def fit(self, X, y, x_mesh, Z=None, mz=None, bandwidth=None):
45 | """
46 | Fits the DCKE to training data.
47 |
48 | :param X: a numpy array of shape (num_samples, num_dimensions)
49 | :param y: a numpy array of shape (num_samples,)
50 | :param Z: a numpy array of shape (num_samples,)
51 | :param x_mesh: a numpy array of shape (num_meshes, num_dimensions)
52 | :param mz: a numpy array of shape (num_meshes,) where any mz[k]
53 | satisfies $mz[k] = E[Z \mid X=x_k]$ and x_k are the
54 | points in x_mesh
55 | :param bandwidth: bandwidth parameter for the local regression
56 | :return:
57 | """
58 | self.X_train_ = X
59 | self.y_train_ = y
60 | self.x_mesh_ = x_mesh
61 | if Z is None and mz is None:
62 | self.Z_ = np.zeros_like(self.y_train_)
63 | self.mz_ = np.zeros(self.x_mesh_.shape[0])
64 | elif (Z is None and mz is not None) or (Z is not None and mz is None):
65 | raise ValueError('Parameter Z and mz have to be either both None or both not None.')
66 | else:
67 | self.Z_ = Z
68 | self.mz_ = mz
69 | self.locreg.warm_start = True
70 | self.locreg.fit(X, y, bandwidth)
71 |
72 | def _calculate_locregs(self):
73 | """
74 | Uses the approximate conditional expectation operator
75 | $\tilde E[\,\cdot\, \mid X=x]$ defined by the local regression in self.locreg
76 | to compute the approximate optimal beta for the control variate $Z$
77 | defined by $\beta_x = - \tfrac{\Cov[Y, Z \mid X=x]}{\Var[Z \mid X=x]}$
78 | for all $x$ in self.x_mesh.
79 |
80 | :return: beta, a numpy array of shape (num_mesh_points, )
81 | """
82 | h = self.locreg.bandwidth
83 | n = self.x_mesh_.shape[0]
84 | self.cov_ = np.zeros(n)
85 | self.var_ = np.zeros(n)
86 | self.y_mesh_ = np.zeros(n)
87 | self.beta_ = np.zeros(n)
88 | m_y = np.zeros(n)
89 | m_z = np.zeros(n)
90 | for i in range(n):
91 | m_y[i] = self.locreg.predict(np.atleast_2d(self.x_mesh_[i]).T).squeeze()
92 | self.locreg.fit_partial(np.atleast_2d(self.Z_).T, h)
93 | m_z[i] = self.locreg.predict_partial().squeeze()
94 | self.locreg.fit_partial((self.y_train_ - m_y[i]) * (self.Z_ - m_z[i]), h)
95 | self.cov_[i] = self.locreg.predict_partial().squeeze()
96 | self.locreg.fit_partial((self.Z_ - m_z[i]) ** 2, h)
97 | self.var_[i] = self.locreg.predict_partial().squeeze()
98 | self.beta_[i] = - self.cov_[i] / self.var_[i]
99 | self.locreg.fit_partial(self.y_train_ + self.beta_[i] * (self.Z_ - self.mz_[i]), h)
100 | self.y_mesh_[i] = self.locreg.predict_partial()
101 |
102 | def predict(self, X):
103 | """
104 | Predicts the conditional expectation $E[Y \mid X=x]$ for all x in $X$.
105 |
106 | :param X: a numpy array of shape (num_predictions, num_dimensions)
107 | :return: a numpy array of shape (num_predictions,)
108 | """
109 |
110 | self._calculate_locregs()
111 | self.gpr_ = GPRegression(self.x_mesh_,
112 | np.atleast_2d(self.y_mesh_).T,
113 | self.gpr_kernel)
114 | self.gpr_.optimize(messages=False)
115 | #self.gpr_.optimize_restarts(num_restarts = 10)
116 | y_pred, self.gp_var_ = self.gpr_.predict(X)
117 | self.gp_var_ = self.gp_var_.squeeze()
118 | return y_pred.squeeze()
119 |
120 |
121 | class DCKEGrid(ABC):
122 |
123 | def __init__(self, locreg, gpr):
124 | self.locreg = locreg
125 | self.gpr = gpr
126 | self.dckes = []
127 |
128 | @abstractmethod
129 | def fit(self, X, y, x_mesh, Z=None, mz=None, bandwidth=None):
130 | pass
131 |
132 | @abstractmethod
133 | def predict(self, X):
134 | pass
135 |
136 | def __getitem__(self, key):
137 | return self.dckes[key]
138 |
139 | @property
140 | def cov_(self):
141 | return np.array([dcke.cov_ for dcke in self.dckes])
142 |
143 | @property
144 | def var_(self):
145 | return np.array([dcke.var_ for dcke in self.dckes])
146 |
147 | @property
148 | def beta_(self):
149 | return np.array([dcke.beta_ for dcke in self.dckes])
150 |
151 |
152 | class DCKEGridIndependent(DCKEGrid):
153 | """
154 | Provides a wrapper for consistently estimating conditional expectations
155 | via DCKE on a grid of random variables, e.g. from a stochastic process.
156 | """
157 |
158 | def _get_bandwidths(self, bandwidth, m):
159 | if bandwidth is None:
160 | return [None for _ in range(m)]
161 | elif isinstance(bandwidth, (list, tuple, np.ndarray)):
162 | return bandwidth
163 | else:
164 | return np.array([bandwidth for _ in range(m)])
165 |
166 | def fit(self, X, y, x_mesh, Z=None, mz=None, bandwidth=None):
167 | """
168 | Fits the DCKE to training data.
169 |
170 | :param X: a numpy array of shape (num_grid_points, num_samples, num_dimensions)
171 | :param y: a numpy array of shape (num_grid_points, num_samples,)
172 | :param Z: a numpy array of shape (num_grid_points, num_samples,)
173 | :param x_mesh: a numpy array of shape (num_grid_points, num_meshes, num_dimensions)
174 | :param mz: a numpy array of shape (num_grid_points, num_meshes) where any
175 | mz[i, k] satisfies $mz[i, k] = E[Z \mid X=x_k]$ and x_k are the
176 | points in x_mesh[i]
177 | :param bandwidth: bandwidth parameters for the local regression
178 | if None, then bandwidth will be selected automatically
179 | if scalar, then the bandwidth will be the same for all
180 | if array, then each DCKE uses its own bandwidth
181 | :return:
182 | """
183 | m = X.shape[0]
184 | self.dckes = [DCKE(deepcopy(self.locreg), deepcopy(self.gpr)) for _ in range(m)]
185 | bandwidths = self._get_bandwidths(bandwidth, m)
186 | for i in range(m):
187 | self.dckes[i].fit(
188 | np.atleast_2d(X[i]),
189 | y[i],
190 | x_mesh[i],
191 | Z[i],
192 | mz[i],
193 | bandwidths[i])
194 |
195 | def predict(self, X):
196 | """
197 | Predicts the conditional expectation $E[Y \mid X=x]$ for all x in $X$.
198 |
199 | :param X: a numpy array of shape (num_grid_points, num_predictions, num_dimensions)
200 | :return: a numpy array of shape (num_grid_points, num_predictions,)
201 | """
202 | m = X.shape[0]
203 | return np.array([self.dckes[i].predict(np.atleast_2d(X[i])) for i in range(m)])
204 |
205 | @property
206 | def X_(self):
207 | return np.array([dcke.X_train_ for dcke in self.dckes])
208 |
209 | @property
210 | def y_(self):
211 | return np.array([dcke.y_train_ for dcke in self.dckes])
212 |
213 | @property
214 | def x_mesh_(self):
215 | return np.array([dcke.x_mesh_ for dcke in self.dckes])
216 |
217 | @property
218 | def Z_(self):
219 | return np.array([dcke.Z_ for dcke in self.dckes])
220 |
221 | @property
222 | def mz_(self):
223 | return np.array([dcke.mz_ for dcke in self.dckes])
224 |
225 |
226 | class DCKEGridRecursive(DCKEGrid):
227 |
228 | def __init__(self, locreg, gpr):
229 | super().__init__(locreg, gpr)
230 | self.X_train_ = None
231 | self.y_train_ = None
232 | self.x_mesh_ = None
233 | self.Z_ = None
234 | self.mz_ = None
235 | self.bandwidths_ = None
236 | self.recursion_functions_ = None
237 | self.y_rec_ = None
238 |
239 | def fit(self, X, y, x_mesh, Z=None, mz=None, bandwidth=None, recursion_functions=None):
240 | """
241 | Fits the DCKE to training data.
242 |
243 | :param X: a numpy array of shape (num_grid_points, num_samples, num_dimensions)
244 | :param y: a numpy array of shape (num_samples,) with the targets for the
245 | last grid point; earlier grid points are fitted recursively in predict()
246 | :param Z: a numpy array of shape (num_grid_points, num_samples,)
247 | :param x_mesh: a numpy array of shape (num_grid_points, num_meshes, num_dimensions)
248 | :param mz: a numpy array of shape (num_grid_points, num_meshes) where any
249 | mz[i, k] satisfies $mz[i, k] = E[Z \mid X=x_k]$ and x_k are the
250 | points in x_mesh[i]
251 | :param bandwidth: bandwidth parameters for the local regression
252 | if None, then bandwidth will be selected automatically
253 | if scalar, then the bandwidth will be the same for all
254 | if array of scalars, then each DCKE uses its own bandwidth
255 | if array of functions, then each DCKE computes its own bandwidth
256 | by evaluating the function on y_train_
257 | :return:
258 | """
259 | self.X_train_ = X
260 | self.y_train_ = y
261 | self.x_mesh_ = x_mesh
262 | self.Z_ = Z
263 | self.mz_ = mz
264 | self.bandwidths_ = self._get_bandwidths(bandwidth)
265 | m = X.shape[0]
266 | self.dckes = [DCKE(deepcopy(self.locreg), deepcopy(self.gpr)) for _ in range(m)]
267 | self.bandwidths_ = self._get_bandwidths(bandwidth)
268 | self.recursion_functions_ = self._get_recursion_functions(recursion_functions)
269 |
270 | def _get_bandwidths(self, bandwidth):
271 | m = self.X_train_.shape[0]
272 | if bandwidth is None:
273 | bw = [lambda x: None for _ in range(m)]
274 | elif np.isscalar(bandwidth):
275 | bw = [lambda x: bandwidth for _ in range(m)]
276 | elif isinstance(bandwidth, (list, tuple, np.ndarray)):
277 | if np.isscalar(bandwidth[0]):
278 | bw = [lambda x, b=b: b for b in bandwidth]
279 | elif isfunction(bandwidth[0]):
280 | bw = bandwidth
281 | else:
282 | raise ValueError("Bandwidths not recognized.")
283 | else:
284 | raise ValueError("Bandwidths not recognized..")
285 | return bw
286 |
287 | def _get_recursion_functions(self, recursion_functions):
288 | m = self.X_train_.shape[0]
289 | if recursion_functions is None:
290 | rf = [lambda x: x for _ in range(m)]
291 | elif isinstance(recursion_functions, (list, tuple, np.ndarray)):
292 | if isfunction(recursion_functions[0]):
293 | rf = recursion_functions
294 | else:
295 | raise ValueError("Recursion functions not recognized.")
296 | else:
297 | raise ValueError("Recursion functions not recognized..")
298 | return rf
299 |
300 | def predict(self, X=None):
301 | """
302 | Predicts the conditional expectation $E[Y \mid X=x]$ for all x in $X$.
303 |
304 | :param X: a numpy array of shape (num_grid_points, num_predictions, num_dimensions)
305 | or None; if None, only the recursive targets self.y_rec_ are returned.
306 | Note: only self[-1] is fitted to y_train_ directly. Traversing the list
307 | of DCKEs backwards, in step i, self[i] is fitted to
308 | f(self[i+1].predict(self.X_train_[i+1])) instead of self[i].y_train_,
309 | where f = self.recursion_functions_[i].
310 | :return: a numpy array of shape (num_grid_points, num_predictions,)
311 | """
312 |
313 | num_grid_points = self.X_train_.shape[0]
314 | num_samples = self.X_train_.shape[1]
315 | self.y_rec_ = np.zeros((num_grid_points, num_samples))
316 | if X is not None:
317 | num_predictions = X.shape[1]
318 | y_pred = np.zeros((num_grid_points, num_predictions))
319 | self[-1].fit(
320 | self.X_train_[-1],
321 | self.y_train_,
322 | self.x_mesh_[-1],
323 | self.Z_[-1],
324 | self.mz_[-1],
325 | self.bandwidths_[-1](self.y_train_))
326 | self.y_rec_[-1, :] = self[-1].predict(self.X_train_[-1])
327 | if X is not None:
328 | y_pred[-1, :] = self[-1].predict(X[-1])
329 | for i in range(num_grid_points-2, -1, -1):
330 | y = self.recursion_functions_[i](self.y_rec_[i+1])
331 | self[i].fit(
332 | self.X_train_[i],
333 | y,
334 | self.x_mesh_[i],
335 | self.Z_[i],
336 | self.mz_[i],
337 | self.bandwidths_[i](y))
338 | self.y_rec_[i, :] = self[i].predict(self.X_train_[i])
339 | if X is not None:
340 | y_pred[i, :] = self[i].predict(X[i])
341 | if X is not None:
342 | return y_pred
343 | else:
344 | return self.y_rec_
345 |
--------------------------------------------------------------------------------
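The class above is easiest to see end-to-end. The following minimal sketch condenses the Black-Scholes test from test_dcke.py (below): Y is a discounted call payoff at maturity T=1, X is the spot at t=0.5, and the discounted terminal spot serves as control variate Z, whose conditional mean on the mesh is the mesh value itself by the martingale property. It assumes the dcke, locreg and models packages from this directory are importable and that GPy is installed.

import numpy as np
import GPy

from dcke import DCKE
from locreg import LocalRegression
from models.black_scholes import BlackScholes

np.random.seed(1)

# Simulate Black-Scholes paths on the time grid t = 0, 0.5, 1.
r, sigma = 0.01, 0.3
bs = BlackScholes(r=r, sigma=sigma)
S = bs.paths(s0=100, time_grid=np.array([0., 0.5, 1.]), num_sims=1000)  # shape (3, num_sims)

# Y: discounted call payoff at T = 1; Z: discounted terminal spot.
# Z is a martingale given S_t, hence mz = E[Z | S_t = x_k] = x_k on the mesh.
df = np.exp(-r * 0.5)
y = df * np.maximum(S[-1] - 95, 0)
Z = df * S[-1]
x_mesh = np.percentile(S[1], np.linspace(0.1, 99.0, 100))

dcke = DCKE(locreg=LocalRegression(degree=0),
            gpr_kernel=GPy.kern.RBF(input_dim=1))
dcke.fit(X=np.atleast_2d(S[1]).T,         # conditioning variable S_t, shape (num_sims, 1)
         y=y,
         x_mesh=np.atleast_2d(x_mesh).T,  # points x_k at which E[Y | S_t = x_k] is wanted
         Z=Z,
         mz=x_mesh)
y_pred = dcke.predict(np.atleast_2d(x_mesh).T)  # approximates the Black-Scholes call price at t = 0.5

The DCKEGridIndependent and DCKEGridRecursive wrappers apply the same fit/predict interface across a grid of conditioning times; the recursive variant refits each earlier grid point on a function of the prediction from the next one (see recursion_functions), in the spirit of backward induction for American Monte Carlo.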
/dynamically_controlled_kernel_estimation/dcke/test_dcke.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from unittest import TestCase
3 | import numpy as np
4 | import GPy
5 | import matplotlib.pyplot as plt
6 |
7 | from dcke import DCKE, DCKEGridIndependent, DCKEGridRecursive
8 | from locreg import LocalRegression
9 | from models.black_scholes import BlackScholes
10 |
11 |
12 | class TestDCKE(TestCase):
13 |
14 | def setUp(self):
15 | self.locreg = LocalRegression(degree=0)
16 | self.gpr_kernel = GPy.kern.RBF(input_dim=1)
17 |
18 | def test_black_scholes(self):
19 | self.r = 0.01
20 | self.sigma = 0.3
21 | self.bs = BlackScholes(r=self.r, sigma=self.sigma)
22 | self.time_grid = np.array([0, 0.5, 1.])
23 | self.num_sims = 1000
24 | np.random.seed(1)
25 | self.X = self.bs.paths(s0=100, time_grid=self.time_grid, num_sims=self.num_sims)
26 | self.T = self.time_grid[-1]
27 | self.t = self.time_grid[-2]
28 | self.K = 95
29 | self.df = np.exp(-(self.T-self.t) * self.r)
30 | self.y = self.df * np.maximum(self.X[-1] - self.K, 0)
31 | self.h = (4 / (3 * self.num_sims)) ** (1 / 5) * np.std(self.y)
32 | self.eps = 1 / (2 * self.h **2)
33 | self.num_quantiles = 100
34 | self.quantile_grid = np.linspace(0.1, 99.0, num=self.num_quantiles)
35 | self.x_mesh = np.percentile(self.X[1], self.quantile_grid)
36 | self.beta = np.zeros(self.num_quantiles)
37 | self.mz = np.zeros(self.num_quantiles)
38 | self.my = np.zeros(self.num_quantiles)
39 | self.var = np.zeros(self.num_quantiles)
40 | self.cov = np.zeros(self.num_quantiles)
41 | for i in range(self.x_mesh.shape[0]):
42 | x = self.x_mesh[i]
43 | k = np.exp(-self.eps * (self.X[1] - x)**2)
44 | self.mz[i] = np.sum(self.df * self.X[2] * k) / np.sum(k)
45 | self.my[i] = np.sum(self.y * k) / np.sum(k)
46 | cov = (self.y - self.my[i]) * (self.df * self.X[2] - self.mz[i])
47 | self.cov[i] = np.sum(cov * k) / np.sum(k)
48 | var = (self.df * self.X[2] - self.mz[i])**2
49 | self.var[i] = np.sum(var * k) / np.sum(k)
50 | self.beta[i] = - self.cov[i] / self.var[i]
51 | self.y_mesh = self.my + self.beta * (self.mz - self.x_mesh)
52 | self.gpr = GPy.models.GPRegression(np.atleast_2d(self.x_mesh).T,
53 | np.atleast_2d(self.y_mesh).T,
54 | deepcopy(self.gpr_kernel))
55 | self.gpr.optimize(messages=False)
56 | y_pred = self.gpr.predict(np.atleast_2d(self.x_mesh).T)[0].squeeze()
57 | self.dcke = DCKE(locreg=deepcopy(self.locreg), gpr_kernel=deepcopy(self.gpr_kernel))
58 | self.dcke.fit(X=np.atleast_2d(self.X[1]).T,
59 | y=self.y,
60 | Z=self.df * self.X[2],
61 | x_mesh=np.atleast_2d(self.x_mesh).T,
62 | mz=self.x_mesh)
63 | y_pred_dcke = self.dcke.predict(np.atleast_2d(self.x_mesh).T)
64 | y_true = np.array([self.bs.option_price(s, self.T - self.t, self.K) for s in self.x_mesh])
65 | # plt.plot(self.x_mesh, y_true, label="truth")
66 | # plt.plot(self.x_mesh, y_pred, label="pred")
67 | # plt.plot(self.x_mesh, y_pred_dcke, label="pred dcke")
68 | # plt.legend()
69 | # plt.show()
70 | np.testing.assert_array_almost_equal(self.y_mesh.squeeze(), self.dcke.y_mesh_.squeeze())
71 | np.testing.assert_array_almost_equal(self.x_mesh.squeeze(), self.dcke.x_mesh_.squeeze())
72 | np.testing.assert_array_almost_equal(y_pred, y_pred_dcke)
73 | self.assertTrue(np.all(np.abs(y_pred - y_true) < 1))
74 |
75 |
76 | class TestDCKEGridIndependent(TestCase):
77 |
78 | def setUp(self):
79 | self.mu = np.array([1, 2, 0])
80 | self.Sigma = np.array([[3, 0, 0],
81 | [0, 4, 0],
82 | [0, 0, 5]])
83 | quantile_levels = np.linspace(0.1, 99, 10)
84 | N = 100
85 | np.random.seed(1)
86 | W = np.random.multivariate_normal(self.mu, self.Sigma, N)
87 | self.X = W[:, 0]
88 | self.Y = W[:, 1]
89 | self.Z = W[:, 2]
90 | self.x_mesh = np.percentile(self.X, quantile_levels)
91 | self.mz = np.zeros_like(self.x_mesh)
92 | self.locreg = LocalRegression(degree=0)
93 | self.gpr_kernel = GPy.kern.RBF(input_dim=1)
94 |
95 | def test_singleton(self):
96 | self.dcke = DCKE(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
97 | self.dcke.fit(np.atleast_2d(self.X).T, self.Y, np.atleast_2d(self.x_mesh).T, self.Z, self.mz)
98 | y_pred = self.dcke.predict(np.atleast_2d(self.x_mesh).T)
99 | self.dcke_grid = DCKEGridIndependent(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
100 | self.dcke_grid.fit(self.X[np.newaxis, : , np.newaxis],
101 | self.Y[np.newaxis, :],
102 | self.x_mesh[np.newaxis, :, np.newaxis],
103 | self.Z[np.newaxis, :],
104 | self.mz[np.newaxis, :])
105 | y_pred_grid = self.dcke_grid.predict(self.x_mesh[np.newaxis, :, np.newaxis])
106 | np.testing.assert_array_almost_equal(y_pred, y_pred_grid.squeeze())
107 |
108 | def test_grid_components(self):
109 | self.dcke1 = DCKE(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
110 | self.dcke2 = DCKE(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
111 | self.dcke1.fit(np.atleast_2d(self.X).T,
112 | self.Y,
113 | np.atleast_2d(self.x_mesh).T,
114 | self.Z,
115 | self.mz)
116 | bandwidth = self.dcke1.locreg.bandwidth
117 | self.dcke2.fit(2 * np.atleast_2d(self.X).T,
118 | 2 * self.Y,
119 | 2 * np.atleast_2d(self.x_mesh).T,
120 | 2 * self.Z,
121 | 2 * self.mz,
122 | bandwidth)
123 | y_pred1 = self.dcke1.predict(np.atleast_2d(self.x_mesh).T)
124 | y_pred2 = self.dcke2.predict(2 * np.atleast_2d(self.x_mesh).T)
125 | self.dcke_grid = DCKEGridIndependent(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
126 | X = np.concatenate((np.atleast_2d(self.X).T, 2 * np.atleast_2d(self.X).T), axis=1).T[:, :, np.newaxis]
127 | y = np.concatenate((np.atleast_2d(self.Y).T, 2 * np.atleast_2d(self.Y).T), axis=1).T
128 | Z = np.concatenate((np.atleast_2d(self.Z).T, 2 * np.atleast_2d(self.Z).T), axis=1).T
129 | mz = np.concatenate((np.atleast_2d(self.mz).T, 2 * np.atleast_2d(self.mz).T), axis=1).T
130 | x_mesh = np.concatenate((np.atleast_2d(self.x_mesh).T, 2 * np.atleast_2d(self.x_mesh).T), axis=1).T[:, :, np.newaxis]
131 | self.dcke_grid.fit(X, y, x_mesh, Z, mz, bandwidth)
132 | y_pred_grid = self.dcke_grid.predict(x_mesh)
133 | y_pred = {0: y_pred1, 1: y_pred2}
134 | dcke = {0: self.dcke1, 1: self.dcke2}
135 | for i in range(2):
136 | np.testing.assert_array_almost_equal(y_pred[i], y_pred_grid[i].squeeze())
137 | np.testing.assert_array_almost_equal(self.dcke_grid.X_[i], dcke[i].X_train_)
138 | np.testing.assert_array_almost_equal(self.dcke_grid.y_[i], dcke[i].y_train_)
139 | np.testing.assert_array_almost_equal(self.dcke_grid.x_mesh_[i], dcke[i].x_mesh_)
140 | np.testing.assert_array_almost_equal(self.dcke_grid.Z_[i], dcke[i].Z_)
141 | np.testing.assert_array_almost_equal(self.dcke_grid.mz_[i], dcke[i].mz_)
142 | np.testing.assert_array_almost_equal(self.dcke_grid.cov_[i], dcke[i].cov_)
143 | np.testing.assert_array_almost_equal(self.dcke_grid.var_[i], dcke[i].var_)
144 | np.testing.assert_array_almost_equal(self.dcke_grid.beta_[i], dcke[i].beta_)
145 |
146 | def test_bandwidths(self):
147 | self.dcke = DCKEGridIndependent(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
148 | self.assertListEqual(self.dcke._get_bandwidths(None, 3), [None, None, None])
149 | np.testing.assert_array_almost_equal(self.dcke._get_bandwidths(2.7, 3),
150 | np.array([2.7, 2.7, 2.7]))
151 | np.testing.assert_array_almost_equal(self.dcke._get_bandwidths(np.array([1., 2., 3.]), 3),
152 | np.array([1., 2., 3.]))
153 |
154 |
155 | class TestDCKEGridRecursive(TestCase):
156 |
157 | def setUp(self):
158 | self.mu = np.array([1, 2, 0])
159 | self.Sigma = np.array([[3, 0, 0],
160 | [0, 4, 0],
161 | [0, 0, 5]])
162 | quantile_levels = np.linspace(0.1, 99, 10)
163 | N = 100
164 | np.random.seed(1)
165 | W = np.random.multivariate_normal(self.mu, self.Sigma, N)
166 | self.X = W[:, 0]
167 | self.Y = W[:, 1]
168 | self.Z = W[:, 2]
169 | self.x_mesh = np.percentile(self.X, quantile_levels)
170 | self.mz = np.zeros_like(self.x_mesh)
171 | self.locreg = LocalRegression(degree=0)
172 | self.gpr_kernel = GPy.kern.RBF(input_dim=1)
173 |
174 | def test_singleton(self):
175 | self.dcke = DCKE(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
176 | self.dcke.fit(np.atleast_2d(self.X).T, self.Y, np.atleast_2d(self.x_mesh).T, self.Z, self.mz)
177 | y_pred = self.dcke.predict(np.atleast_2d(self.x_mesh).T)
178 | y_pred = self.dcke.predict(np.atleast_2d(self.x_mesh).T)
179 | h = self.dcke.locreg.bandwidth
180 | self.dcke_grid = DCKEGridRecursive(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
181 | self.dcke_grid.fit(self.X[np.newaxis, : , np.newaxis],
182 | self.Y[np.newaxis, :],
183 | self.x_mesh[np.newaxis, :, np.newaxis],
184 | self.Z[np.newaxis, :],
185 | self.mz[np.newaxis, :],
186 | h)
187 | y_pred_grid = self.dcke_grid.predict(self.x_mesh[np.newaxis, :, np.newaxis])
188 | np.testing.assert_array_almost_equal(y_pred, y_pred_grid.squeeze())
189 |
190 | def test_grid_components(self):
191 | self.dcke2 = DCKE(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
192 | self.dcke1 = DCKE(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
193 | self.dcke2.fit(2 * np.atleast_2d(self.X).T,
194 | 2 * self.Y,
195 | 2 * np.atleast_2d(self.x_mesh).T,
196 | 2 * self.Z,
197 | 2 * self.mz)
198 | y_pred2 = self.dcke2.predict(2 * np.atleast_2d(self.X).T)
199 | df = np.exp(-0.5)
200 | self.dcke1.fit(np.atleast_2d(self.X).T,
201 | df * y_pred2,
202 | np.atleast_2d(self.x_mesh).T,
203 | self.Z,
204 | self.mz)
205 | y_pred1 = self.dcke1.predict(np.atleast_2d(self.X).T)
206 | self.dcke_grid = DCKEGridRecursive(deepcopy(self.locreg), deepcopy(self.gpr_kernel))
207 | X = np.concatenate((np.atleast_2d(self.X).T, 2 * np.atleast_2d(self.X).T), axis=1).T[:, :, np.newaxis]
208 | Z = np.concatenate((np.atleast_2d(self.Z).T, 2 * np.atleast_2d(self.Z).T), axis=1).T
209 | mz = np.concatenate((np.atleast_2d(self.mz).T, 2 * np.atleast_2d(self.mz).T), axis=1).T
210 | x_mesh = np.concatenate((np.atleast_2d(self.x_mesh).T, 2 * np.atleast_2d(self.x_mesh).T), axis=1).T[:, :, np.newaxis]
211 | bandwidths = np.array([self.dcke1.locreg.bandwidth, self.dcke2.locreg.bandwidth])
212 | self.dcke_grid.fit(X, 2 * self.Y, x_mesh, Z, mz, bandwidths, recursion_functions=[lambda x: x * df])
213 | y_pred_grid = self.dcke_grid.predict()
214 | y_pred = {0: y_pred1, 1: y_pred2}
215 | dcke = {0: self.dcke1, 1: self.dcke2}
216 | for i in range(2):
217 | np.testing.assert_array_almost_equal(self.dcke_grid[i].X_train_, dcke[i].X_train_)
218 | np.testing.assert_array_almost_equal(self.dcke_grid[i].y_train_, dcke[i].y_train_)
219 | np.testing.assert_array_almost_equal(self.dcke_grid[i].x_mesh_, dcke[i].x_mesh_)
220 | np.testing.assert_array_almost_equal(self.dcke_grid[i].Z_, dcke[i].Z_)
221 | np.testing.assert_array_almost_equal(self.dcke_grid[i].mz_, dcke[i].mz_)
222 | np.testing.assert_array_almost_equal(self.dcke_grid[i].var_, dcke[i].var_)
223 | np.testing.assert_array_almost_equal(self.dcke_grid[i].cov_, dcke[i].cov_)
224 | np.testing.assert_array_almost_equal(self.dcke_grid[i].beta_, dcke[i].beta_)
225 | np.testing.assert_array_almost_equal(self.dcke_grid[i].y_mesh_, dcke[i].y_mesh_)
226 | np.testing.assert_array_almost_equal(y_pred[i], y_pred_grid[i].squeeze())
227 |
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/locreg/__init__.py:
--------------------------------------------------------------------------------
1 | from locreg.local_regression import LocalRegression
2 | __all__ = ['LocalRegression',
3 | ]
4 |
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/locreg/local_regression.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import RegressorMixin
2 | from sklearn.preprocessing import PolynomialFeatures
3 | from scipy.optimize import leastsq
4 | import numpy as np
5 | from scipy.linalg import solve_triangular
6 |
7 |
8 | def exp_kernel(z):
9 | """
10 | Implements the exponential kernel $$ e^{-|z|^2/2}$$
11 | as a vectorized function.
12 | :param z: a numpy array of any dimension
13 | :return: the exponential kernel evaluated on the array z assuming
14 | the last axis is the dimension of the elements of z
15 | """
16 | return np.exp(-np.linalg.norm(np.atleast_1d(z), axis=-1) ** 2 / 2)
17 |
18 |
19 | class LocalRegression(RegressorMixin):
20 | """
21 | This class performs local polynomial regression of dimension $d$
22 | and degree $p$, i.e. given a training set $(X,y)$ of $N$ samples
23 | $x_i \in \R^d$ and $y_i \in \R$, it makes a new prediction at
24 | $x \in \R^d$ as $x=\beta^0$, where $\beta$ is the value minimizing
25 | the cost functional
26 | \begin{align*}
27 | J(\beta) & := \sum_{i=1}^{N}{(y_i - \beta ^0 + j_1(x_i))^2 w_i},
28 | j_1(x_i) & := \sum_{1 \leq |\alpha| \leq p}{(x-x_i)^\alpha}, \\
29 | w_i & := K_h(x-x_i),
30 | \end{align*}
31 | where $K_h$ is a kernel function scaled with bandwith $h$. The minimum
32 | of the cost function is computed via QR decomposition (or analytically).
33 | """
34 |
35 | def __init__(self, degree, kernel=exp_kernel, warm_start=True):
36 | """
37 | :param degree: an int specifying the degree of the polynomial
38 | :param kernel: a kernel function for the weight calculation
39 | :param warm_start: if True the fitted kernel is preserved enabling
40 | fit_partial() for new y
41 | """
42 | self.degree = degree
43 | self.kernel = kernel
44 | self.warm_start = warm_start
45 | self.bandwidth = None
46 | self.fitted_kernel = None
47 | self.method = None
48 | self.X_train_ = None
49 | self.y_train = None
50 |
51 | def fit(self, X, y, bandwidth=None):
52 | """
53 | Fits the regressor to the data. As the concept of local regression is
54 | to fit the data to each prediction, this function only stores the data
55 | and either sets a fixed bandwidth or estimates an optimal one.
56 | :param X: a numpy array of shape (N, d)
57 | :param y: a numpy array of shape (N,)
58 | :param bandwidth: a scalar or None
59 | :return: self
60 | """
61 | self.X_train_ = X
62 | self.y_train = y
63 | self._set_bandwidth(bandwidth)
64 | return self
65 |
66 | def predict(self, X, method=None):
67 | """
68 | Performs the prediction for each value x in the prediction set X.
69 | If $d=1$ and $p=0,1$ the cost functional can be minimized analytically.
70 | :param X: a numpy array of dimension (M, d)
71 | :param method: can be 'analytic' or 'qr' or 'leastsq' or None. Method is
72 | set automatically if None.
73 | :return: a numpy array y of dimension (M,) with the predictions
74 | """
75 | if method is None:
76 | method = self._determine_method()
77 | if self.fitted_kernel is not None:
78 | self.fitted_kernel = None
79 | self.method = None
80 | return self._predict_with_method(X, method)
81 |
82 | def fit_partial(self, y, bandwidth=None):
83 | """ Re-fits only the y values of the regression. Only works if
84 | warm_start==True and a previous (full) fit and predict
85 | has already been performed.
86 | """
87 | if self.warm_start and self.fitted_kernel is not None and self.method in ['analytic', 'qr']:
88 | self.y_train = y
89 | self._set_bandwidth(bandwidth)
90 | else:
91 | raise ValueError("The fit_partial method can only be invoked\
92 | if fit and predict have been invoked previously with method \
93 | `analytic´ or `qr` and warm_start is set to True")
94 |
95 | def predict_partial(self):
96 | """ Predicts on the last value of y_train set by fit() or fit_partial()
97 | and the last X that has been used for prediction. """
98 | if self.fitted_kernel is not None and self.method is not None:
99 | return self._predict_with_method(X=None, method=self.method)
100 | else:
101 | raise ValueError("The method predict_partial requires a full\
102 | prior run of predict with method `analytic´ \
103 | or `qr´.")
104 |
105 | def _set_bandwidth(self, bandwidth):
106 | """ Sets the bandwidth in the fitting."""
107 | if bandwidth is None:
108 | self.bandwidth = self._silverman()
109 | else:
110 | self.bandwidth = bandwidth
111 |
112 | def _silverman(self):
113 | """
114 | This function implements Silverman's Rule of Thumb
115 | \begin{align*}
116 | h = \Big( \frac{4}{3n} \Big)^{\frac{1}{5}} \hat \sigma_Y
117 | \end{align*}
118 | to estimate the optimal bandwidth of the training data y.
119 | :return: bandwidth h
120 | """
121 | sigma_y = np.std(self.y_train)
122 | n = self.y_train.shape[0]
123 | return (4 / (3 * n)) ** (1 / 5) * sigma_y
124 |
125 | def _scaled_kernel(self):
126 | """
127 | Scales the kernel function self.kernel by the
128 | bandwidth self.bandwidth.
129 | :return: scaled kernel function
130 | """
131 | def kh(z):
132 | d = self.X_train_.shape[1]
133 | return self.kernel(z / self.bandwidth) / self.bandwidth ** d
134 | return kh
135 |
136 | def _predict_with_method(self, X, method):
137 | """ Performs the prediction based on the `method´ flag.
138 | :param method: a string chosen from 'analytic', 'leastsq', 'qr'
139 | """
140 | self.method = method
141 | num_dims = self.X_train_.shape[1]
142 | if method == 'analytic':
143 | if self.degree == 0:
144 | return self._predict_nadaraya_watson(X)
145 | elif num_dims == 1 and self.degree == 1:
146 | return self._predict_locally_linear(X)
147 | else:
148 | raise ValueError('Method `analytic´ is only available if \
149 | self.degree=0 or self.degree=1 and X.shape[1] == 1.')
150 | elif method == 'qr':
151 | return self._predict_qr(X)
152 | elif method == 'leastsq':
153 | return self._predict_leastsq(X)
154 | else:
155 | raise ValueError('Parameter `method´ has to be `analytic´ or\
156 | `qr´ or `leastsq´ or None, but is currently set to: %s' % method)
157 |
158 | def _determine_method(self):
159 | """ Automatically selects the method for prediction based on the dimension
160 | of the training data self_X_train_.
161 | """
162 | num_dims = self.X_train_.shape[1]
163 | if self.degree == 0 or (self.degree == 1 and num_dims == 1):
164 | return 'analytic'
165 | else:
166 | return 'qr'
167 |
168 | def _predict_nadaraya_watson(self, X):
169 | """
170 | Performs local regression of degree $p=0$ (Nadaraya-Watson) in arbitrary dimension $d$.
171 | In this case, the cost functional can be minimized analytically
172 | and for any $x \in \R^d$, the estimate $y$ is given by
173 | \begin{align*}
174 | y &= \sum_{i=1}^{N}{W^0_i(x) y_i}, \\
175 | W^0_i(x) &= \frac{K_h(x - x_i)}{\sum_{j=1}^N{K_h(x - x_j)}}
176 | \end{align*}
177 | :param X: a numpy array of dimension (M, d) at which to predict
178 | :return: a numpy array of dimension (M,) with the M predicted y's
179 | """
180 | if (self.warm_start and self.fitted_kernel is None) or (not self.warm_start):
181 | k_h = self._scaled_kernel()
182 | nwk = k_h(np.array([x - self.X_train_ for x in X]))
183 | self.fitted_kernel = nwk / np.sum(nwk, axis=1)[:, np.newaxis]
184 | return np.sum(self.fitted_kernel * self.y_train.squeeze(), axis=1)
185 |
186 | def _predict_locally_linear(self, X):
187 | """
188 | Performs local regression of degree $p=1$ in dimension $d=1$.
189 | In this case, the cost functional can be minimized analytically
190 | and for any $x \in \R$, the estimate $y$ is given by
191 | \begin{align*}
192 | y &= \sum_{i=1}^{N}{W^1_i(x) y_i}, \\
193 | W^1_i(x) &:= \frac{K_h(x-x_i)}{N}\frac{s_2(x)-s_1(x)(x-x_i)}{s_2(x)s_0(x)-s_1(x)^2}, \\
194 | s_r(x) &:= \frac{1}{N} \sum_{i=1}^N{(x-x_i)^rK_h(x-x_i)}.
195 | \end{align*}
196 | :param X: a numpy array of dimension (M, d) at which to predict
197 | :return: a numpy array of dimension (M,) with the M predicted y's
198 | """
199 | if (self.warm_start and self.fitted_kernel is None) or (not self.warm_start):
200 | n = self.X_train_.shape[0]
201 | m = X.shape[0]
202 | self.fitted_kernel = np.zeros((m, n))
203 | k_h = self._scaled_kernel()
204 | X_ = self.X_train_.squeeze()
205 | for i in range(m):
206 | x = X[i]
207 | llk = k_h(x - self.X_train_).squeeze()
208 | s0 = np.mean(llk)
209 | s1 = np.mean((x - X_) * llk)
210 | s2 = np.mean((x - X_) ** 2 * llk)
211 | s = (s2 - s1 * (x - X_)) / (s2 * s0 - s1 ** 2) / n
212 | self.fitted_kernel[i, :] = s * llk
213 | return np.sum(self.fitted_kernel * self.y_train.squeeze(), axis=1)
214 |
215 | def _predict_qr(self, X):
216 | """
217 | Performs a prediction for each x in X by solving the associated
218 | normal equations via QR decomposition.
219 |
220 | :param X: a numpy array of dimension (M, d) at which to predict
221 | :return: a numpy array of dimension (M,) with the M predicted y's
222 | """
223 | if (self.warm_start and self.fitted_kernel is None) or (not self.warm_start):
224 | self.fitted_kernel = []
225 | poly = PolynomialFeatures(degree=self.degree)
226 | for i in range(X.shape[0]):
227 | x = X[i, :]
228 | phi = poly.fit_transform(self.X_train_ - x)
229 | m = phi.shape[1]
230 | kh = self._scaled_kernel()
231 | w = kh(x - self.X_train_)
232 | w_mat = np.diag(np.sqrt(w))
233 | a = w_mat @ phi
234 | q, r = np.linalg.qr(a, mode='complete')
235 | r = r[:m, :m]
236 | self.fitted_kernel.append((q.transpose() @ w_mat, r))
237 | n = len(self.fitted_kernel)
238 | y_pred = np.zeros(n)
239 | for i in range(n):
240 | qw, r = self.fitted_kernel[i]
241 | m = r.shape[1]
242 | c = (qw @ self.y_train)[:m]
243 | beta = solve_triangular(r, c)
244 | y_pred[i] = beta[0]
245 | return y_pred
246 |
247 | def _predict_leastsq(self, X):
248 | """
249 | Performs a prediction for each x in X by minimizing the cost
250 | functional $J$.
251 |
252 | :param X: a numpy array of dimension (M, d) at which to predict
253 | :return: a numpy array of dimension (M,) with the M predicted y's
254 | """
255 | poly = PolynomialFeatures(degree=self.degree)
256 | x0 = poly.fit_transform(np.zeros((1, X[0].shape[0])))[0]
257 | return np.array([leastsq(func=self._cost_functional(poly, x),
258 | x0=x0)[0][0] for x in X])
259 |
260 | def _cost_functional(self, poly, x):
261 | """
262 | Creates the cost functional $J$ for optimization using the parameters.
263 | :param poly: an instance of PolynomialFeatures
264 | :param x: a numpy array of shape (d,)
265 | :return: cost functional $J$
266 | """
267 | def cost(beta):
268 | res = np.sum(poly.fit_transform(self.X_train_ - x) * beta, axis=1) - self.y_train
269 | kh = self._scaled_kernel()
270 | w = kh(x - self.X_train_)
271 | res *= np.sqrt(np.abs(w))
272 | return res
273 | return cost
274 |
--------------------------------------------------------------------------------
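As a quick sanity check of the estimator documented above, the sketch below fits a degree-0 (Nadaraya-Watson) local regression to noisy samples of a parabola and compares the 'analytic' prediction to a direct evaluation of the kernel-weighted average using the Silverman bandwidth $h = (4/(3N))^{1/5} \hat\sigma_y$ from _silverman(). It assumes the locreg package from this directory is importable.

import numpy as np

from locreg import LocalRegression
from locreg.local_regression import exp_kernel

np.random.seed(1)
x = np.linspace(-10, 10, 50)
X = x[:, np.newaxis]                                 # shape (N, d) with d = 1
y = x ** 2 + np.random.normal(0, 0.1, x.shape[0])

# Fit with the automatic (Silverman) bandwidth and predict on a finer grid.
nw = LocalRegression(degree=0)
nw.fit(X, y)
X_eval = np.linspace(-10, 10, 101)[:, np.newaxis]
y_pred = nw.predict(X_eval, method='analytic')

# Direct Nadaraya-Watson estimate with the same bandwidth for comparison:
# y(x) = sum_i K_h(x - x_i) y_i / sum_j K_h(x - x_j).
h = (4 / (3 * y.shape[0])) ** (1 / 5) * np.std(y)    # Silverman's rule of thumb
w = exp_kernel((X_eval[:, np.newaxis, :] - X[np.newaxis, :, :]) / h)
y_manual = (w * y).sum(axis=1) / w.sum(axis=1)

np.testing.assert_allclose(y_pred, y_manual)         # the two estimates coincide

For degree >= 1 or higher dimensions, predict() falls back to the QR-based or least-squares minimization of the cost functional $J$ defined in the class docstring; see test_local_regression.py below for cross-checks between the three methods.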
/dynamically_controlled_kernel_estimation/locreg/test_local_regression.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from local_regression import LocalRegression
3 | import numpy as np
4 |
5 |
6 | class TestLocalRegression(TestCase):
7 |
8 | def setUp(self):
9 | self.n = 10
10 | self.x = np.linspace(-10, 10, self.n)
11 | self.X = self.x[:, np.newaxis]
12 | np.random.seed(1)
13 | self.e = np.random.normal(0, 0.01, self.n)
14 |
15 | def test_nw_analytic_vs_qr_vs_leastsq_1d(self):
16 | self.y = self.x**2 + self.e
17 | self.nw = LocalRegression(degree=0, warm_start=False)
18 | self.nw.fit(self.X, self.y)
19 | X_eval = np.linspace(-10, 10, 2 * self.n)[:, np.newaxis]
20 | y_pred_analytic = self.nw.predict(X_eval, method='analytic')
21 | y_pred_qr = self.nw.predict(X_eval, method='qr')
22 | y_pred_leastsq = self.nw.predict(X_eval, method='leastsq')
23 | np.testing.assert_almost_equal(y_pred_analytic, y_pred_qr, decimal=6)
24 | np.testing.assert_almost_equal(y_pred_qr, y_pred_leastsq, decimal=6)
25 |
26 | def test_nw_analytic_vs_qr_vs_leastsq_2d(self):
27 | grid = np.linspace(-10, 10, self.n)
28 | x1, x2 = np.meshgrid(grid, grid)
29 | self.X = np.vstack((x1.flatten(), x2.flatten())).T
30 | self.y = self.X[:, 0] * self.X[:, 1] + np.random.normal(0, 0.01, self.X.shape[0])
31 | self.nw = LocalRegression(degree=0, warm_start=False)
32 | self.nw.fit(self.X, self.y)
33 | grid_eval = np.linspace(-10, 10, 2 * self.n)[:, np.newaxis]
34 | x1e, x2e = np.meshgrid(grid_eval, grid_eval)
35 | X_eval = np.vstack((x1e.flatten(), x2e.flatten())).T
36 | y_pred_analytic = self.nw.predict(X_eval, method='analytic')
37 | y_pred_qr = self.nw.predict(X_eval, method='qr')
38 | y_pred_leastsq = self.nw.predict(X_eval, method='leastsq')
39 | np.testing.assert_almost_equal(y_pred_qr, y_pred_leastsq, decimal=5)
40 | np.testing.assert_almost_equal(y_pred_analytic, y_pred_qr, decimal=5)
41 |
42 | def test_ll_analytic_vs_cost(self):
43 | self.y = self.x + self.e
44 | self.ll = LocalRegression(degree=1, warm_start=False)
45 | self.ll.fit(self.X, self.y)
46 | X_eval = np.linspace(-10, 10, 2 * self.n)[:, np.newaxis]
47 | y_pred_analytic = self.ll.predict(X_eval, method='analytic')
48 | y_pred_qr = self.ll.predict(X_eval, method='qr')
49 | y_pred_leastsq = self.ll.predict(X_eval, method='leastsq')
50 | np.testing.assert_almost_equal(y_pred_analytic, y_pred_qr)
51 | np.testing.assert_almost_equal(y_pred_qr, y_pred_leastsq)
52 |
53 | def test_nw_analytic_loc_const_const_1d(self):
54 | self.y = np.ones_like(self.x) + self.e
55 | self.nw = LocalRegression(degree=0)
56 | self.nw.fit(self.X, self.y)
57 | np.testing.assert_almost_equal(self.nw.predict(self.X), self.y)
58 |
59 | def test_ll_analytic_loc_lin_lin_1d(self):
60 | self.y = self.x + self.e
61 | self.ll = LocalRegression(degree=1)
62 | self.ll.fit(self.X, self.y)
63 | np.testing.assert_almost_equal(self.ll.predict(self.X), self.y, decimal=1)
64 |
65 | def test_2d_loc_const_const(self):
66 | n = 5
67 | c = 7.
68 | x = np.linspace(-5, 5, n)
69 | y = np.linspace(-5, 5, n)
70 | z = np.array([[c for xx in x] for yy in y])
71 | x, y = np.meshgrid(x, y)
72 | X = np.array(list(zip(x.flatten(), y.flatten())))
73 | np.random.seed(1)
74 | e = np.random.normal(0, 0.01, (n, n))
75 | z = z + e
76 | locreg3d = LocalRegression(degree=0).fit(X, z.flatten())
77 | z_pred = locreg3d.predict(X)
78 | self.assertTrue(np.all(np.abs(z_pred / c - 1) <= 0.01))
79 |
80 | def test_2d_loc_lin_lin(self):
81 | n = 5
82 | x = np.linspace(1, 10, n)
83 | y = np.linspace(1, 10, n)
84 | z = np.array([[xx + yy for xx in x] for yy in y])
85 | res = z.flatten()
86 | x, y = np.meshgrid(x, y)
87 | X = np.array(list(zip(x.flatten(), y.flatten())))
88 | np.random.seed(1)
89 | e = np.random.normal(0, 0.01, (n, n))
90 | z = z + e
91 | locreg3d = LocalRegression(degree=1).fit(X, z.flatten())
92 | z_pred = locreg3d.predict(X)
93 | self.assertTrue(np.all(np.abs(z_pred / res - 1) <= 0.01))
94 |
95 | def test_2d_least_sq_vs_qr(self):
96 | n = 5
97 | x1 = np.linspace(1, 10, n)
98 | x2 = np.linspace(1, 10, n)
99 | y = np.array([[xx1 ** 2 + xx2 ** 2 for xx1 in x1] for xx2 in x2])
100 | x1, x2 = np.meshgrid(x1, x2)
101 | X = np.array(list(zip(x1.flatten(), x2.flatten())))
102 | np.random.seed(1)
103 | e = np.random.normal(0, 0.01, (n, n))
104 | y = y + e
105 | locreg3d = LocalRegression(degree=2).fit(X, y.flatten())
106 | y_pred_leastsq = locreg3d.predict(X, method='leastsq')
107 | y_pred_qr = locreg3d.predict(X, method='qr')
108 | np.testing.assert_array_almost_equal(y_pred_leastsq, y_pred_qr)
109 |
110 | def test_fit_partial(self):
111 | self.y = self.x + self.e
112 | self.ll = LocalRegression(degree=1, warm_start=True)
113 | self.ll.fit(self.X, self.y)
114 | X_eval = np.linspace(-10, 10, 2 * self.n)[:, np.newaxis]
115 | _ = self.ll.predict(X_eval, method='analytic')
116 | self.y = self.x ** 2 + self.e
117 | h = self.ll.bandwidth
118 | self.ll.fit_partial(self.y, h)
119 | y_pred_partial = self.ll.predict(X_eval, method='analytic')
120 | self.ll2 = LocalRegression(degree=1, warm_start=False)
121 | self.ll2.fit(self.X, self.y, h)
122 | y_pred = self.ll2.predict(X_eval, method='analytic')
123 | np.testing.assert_almost_equal(y_pred, y_pred_partial)
124 | # test against new instance to validate correct state
125 | self.ll3 = LocalRegression(degree=1, warm_start=False)
126 | self.ll3.fit(self.X, self.y, h)
127 | y_pred_new = self.ll3.predict(X_eval, method='analytic')
128 | np.testing.assert_array_almost_equal(y_pred_new, y_pred)
129 |
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niknow/machine-learning-examples/b147133c7b0115b591739fa4a600462cbaee6225/dynamically_controlled_kernel_estimation/models/__init__.py
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/models/black_scholes.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.stats import norm
3 | from scipy.optimize import minimize_scalar, newton
4 |
5 |
6 | class BlackScholes:
7 | """
8 | Implements the Black-Scholes model $dS_t = r S_t dt + \sigma S_t dW_t$.
9 | """
10 |
11 | def __init__(self, sigma, r):
12 | self.sigma = sigma
13 | self.r = r
14 |
15 | @staticmethod
16 | def _d1(sigma, r, s0, tm, sk):
17 | """
18 | Implements the d1 in the Black-Scholes option price formula.
19 | """
20 | d1 = np.log(s0 / sk) + (r + sigma ** 2 / 2) * tm
21 | return d1 / (sigma * np.sqrt(tm))
22 |
23 | @staticmethod
24 | def _option_price(sigma, r, s0, tm, sk, call):
25 | """
26 | Implements Black-Scholes option price formula.
27 | :param sigma: instantaneous volatility
28 | :param r: risk-free rate
29 | :param s0: value of underlying stock price at t=0
30 | :param tm: time to maturity of the option
31 | :param sk: strike of the option
32 | :param call: True if call option, False if put
33 | :return: option price
34 | """
35 | d1 = BlackScholes._d1(sigma, r, s0, tm, sk)
36 | d2 = d1 - sigma * np.sqrt(tm)
37 | pvk = sk * np.exp(-r * tm)
38 | phi = norm.cdf
39 | if call:
40 | return phi(d1) * s0 - phi(d2) * pvk
41 | else:
42 | return phi(-d2) * pvk - phi(-d1) * s0
43 |
44 | @staticmethod
45 | def _paths(sigma, r, s0, time_grid, num_sims, seed=1):
46 | """
47 | Create random paths of the underlying.
48 |
49 | :param sigma: instantaneous volatility
50 | :param r: risk-free rate
51 | :param time_grid: time grid of shape (num_time_steps) on which to simulate
52 | :param s0: initial value of stock at time_grid[0]
53 | :param num_sims: number of paths to generate
54 | :param seed: seed value of random number generator
55 |
56 |         :return: an array `paths` of shape (num_time_steps, num_sims) where paths[i, j] is the j-th
57 | realization of the underlying at time_grid[i]
58 | """
59 | delta = time_grid[1:] - time_grid[:-1]
60 | num_steps = delta.shape[0]
61 | np.random.seed(seed)
62 | dw = np.random.randn(num_sims, num_steps)
63 | paths = s0 * np.cumprod(np.exp((r - sigma ** 2 / 2) * delta + sigma * np.sqrt(delta) * dw), axis=1)
64 | return np.transpose(np.c_[np.ones(num_sims) * s0, paths])
65 |
66 | @staticmethod
67 | def _delta(sigma, r, s0, tm, sk, call=True):
68 | """
69 | Computes the Delta of a European call/put option.
70 |
71 | :param sigma: instantaneous volatility
72 | :param r: risk-free rate
73 | :param s0: value of underlying stock price at t=0
74 | :param tm: time to maturity of the option
75 | :param sk: strike of the option
76 | :param call: True if call option, False if put
77 | """
78 | phi = norm.cdf
79 | delta = phi(BlackScholes._d1(sigma, r, s0, tm, sk))
80 | if call:
81 | return delta
82 | else:
83 | return delta - 1
84 |
85 | @staticmethod
86 | def _vega(sigma, r, s0, tm, sk):
87 | """
88 | Computes the Vega of a European call/put option.
89 |
90 | :param sigma: instantaneous volatility
91 | :param r: risk-free rate
92 | :param s0: value of underlying stock price at t=0
93 | :param tm: time to maturity of the option
94 | :param sk: strike of the option
95 | """
96 | d1 = BlackScholes._d1(sigma, r, s0, tm, sk)
97 | return s0 * norm.pdf(d1) * np.sqrt(tm)
98 |
99 | @staticmethod
100 | def calibrate(vol_quotes):
101 |
102 | def cost(sigma):
103 | num_quotes = vol_quotes.shape[0]
104 | c = np.zeros(num_quotes)
105 | for i in range(num_quotes):
106 | tm, sk, iv = vol_quotes[i]
107 | c[i] = (iv - sigma) ** 2
108 | return np.sum(c) / 2
109 |
110 | return minimize_scalar(cost, bounds=(0, 1), method='bounded', options={'xatol': 1e-8}).x
111 |
112 | def option_price(self, s0, tm, sk, call=True):
113 | return BlackScholes._option_price(self.sigma, self.r, s0, tm, sk, call)
114 |
115 | def paths(self, s0, time_grid, num_sims, seed=1):
116 | return BlackScholes._paths(self.sigma, self.r, s0, time_grid, num_sims, seed)
117 |
118 | def delta(self, s0, tm, sk, call=True):
119 | return BlackScholes._delta(self.sigma, self.r, s0, tm, sk, call)
120 |
121 | def vega(self, s0, tm, sk):
122 | return BlackScholes._vega(self.sigma, self.r, s0, tm, sk)
123 |
--------------------------------------------------------------------------------
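The test file that follows calls `BlackScholes._implied_volatility(r, s0, tm, sk, call, price)`, a method that does not appear in `black_scholes.py` as reproduced above, although the otherwise unused `newton` import hints at it. A minimal standalone sketch of that inversion is given below as an assumption about the intended behaviour; it brackets the root with `scipy.optimize.brentq` instead of a Newton iteration.

```python
from scipy.optimize import brentq

from models.black_scholes import BlackScholes


def implied_volatility(r, s0, tm, sk, call, price, sigma_max=10.0):
    """Invert the Black-Scholes price for sigma.

    The Black-Scholes price is strictly increasing in sigma, so the root of
    price(sigma) - target is unique whenever the quote is arbitrage-free.
    """
    def objective(sigma):
        return BlackScholes._option_price(sigma, r, s0, tm, sk, call) - price

    return brentq(objective, 1e-9, sigma_max, xtol=1e-12)


# Round trip: recover the volatility that generated a call price
price = BlackScholes._option_price(0.2, 0.03, 100., 1., 105., True)
print(implied_volatility(0.03, 100., 1., 105., True, price))  # approximately 0.2
```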
/dynamically_controlled_kernel_estimation/models/test_black_scholes.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from unittest import TestCase
3 | import numpy as np
4 | from py_vollib.black_scholes import black_scholes
5 | from py_vollib.black_scholes.implied_volatility import implied_volatility
6 |
7 | from models.black_scholes import BlackScholes
8 |
9 |
10 | class TestBlackScholes(TestCase):
11 |
12 | def setUp(self):
13 | self.sigma = 0.2
14 | self.r = 0.03
15 | self.bs = BlackScholes(self.sigma, self.r)
16 | self.maturities = np.array([3/12, 9/12, 1., 5., 10.])
17 | self.strikes = np.array([80., 95., 105., 120.3])
18 | self.rates = np.array([-0.03, -0.01, 0, 0.01, 0.03])
19 | self.sigmas = np.array([0.01, 0.05, 0.2, 0.5, 1.5])
20 | self.spots = np.array([80., 90., 100., 110., 120])
21 | self.call = [True, False]
22 | self.s0 = 100.
23 | self.sk = 103.
24 | self.tm = 2.4
25 |
26 | def test_vs_py_vollib(self):
27 | for sigma, r, s0, tm, sk, call in itertools.product(self.sigmas, self.rates, self.spots, self.maturities, self.strikes, self.call):
28 | with self.subTest():
29 | np.testing.assert_almost_equal(BlackScholes._option_price(sigma, r, s0, tm, sk, call),
30 | black_scholes(flag='c' if call else 'p', S=s0, K=sk, t=tm, r=r, sigma=sigma))
31 |
32 | def test_put_call_parity(self):
33 | call = self.bs.option_price(self.s0, self.tm, self.sk, call=True)
34 | put = self.bs.option_price(self.s0, self.tm, self.sk, call=False)
35 | df = np.exp(-self.r * self.tm)
36 | np.testing.assert_almost_equal(call - put, self.s0 - self.sk * df, decimal=6)
37 |
38 | def test_path_distribution(self):
39 | self.time_grid = np.array([0., 1., 5.])
40 | self.num_sims = 10000
41 | self.seed = 1
42 | paths = self.bs.paths(self.s0, self.time_grid, self.num_sims, self.seed)
43 | #np.testing.assert_array_almost_equal(paths.mean(axis=1), np.exp(self.time_grid * self.r) * self.s0)
44 | #print(paths.mean(axis=1), np.exp(self.time_grid * self.r) * self.s0)
45 | #print(paths.std(axis=1)**2, self.s0**2 * np.exp(2 * self.time_grid * self.r) * (np.exp(self.sigma**2 * self.time_grid) - 1))
46 |
47 | def test_vega(self):
48 | bump = 1. / 10000
49 | for tm, sk in itertools.product(self.maturities, self.strikes):
50 | with self.subTest():
51 | price = BlackScholes._option_price(self.sigma, self.r, self.s0, tm, sk, True)
52 | price_bumped = BlackScholes._option_price(self.sigma + bump, self.r, self.s0, tm, sk, True)
53 | vega_df = (price_bumped - price) / bump
54 | vega = BlackScholes._vega(self.sigma, self.r, self.s0, tm, sk)
55 | np.testing.assert_almost_equal(vega, vega_df, decimal=2)
56 |
57 | def test_delta(self):
58 | bump = 1. / 10000
59 | for tm, sk, call in itertools.product(self.maturities, self.strikes, self.call):
60 | with self.subTest():
61 |                 price = BlackScholes._option_price(self.sigma, self.r, self.s0, tm, sk, call)
62 |                 price_bumped = BlackScholes._option_price(self.sigma, self.r, self.s0 + bump, tm, sk, call)
63 |                 delta_fd = (price_bumped - price) / bump
64 |                 delta = BlackScholes._delta(self.sigma, self.r, self.s0, tm, sk, call)
65 | np.testing.assert_almost_equal(delta, delta_fd, decimal=5)
66 |
67 | def test_implied_volatility(self):
68 | for sigma, r, s0, tm, sk, call in itertools.product(self.sigmas, self.rates, self.spots, self.maturities, self.strikes, self.call):
69 | with self.subTest():
70 | price = BlackScholes._option_price(sigma, r, s0, tm, sk, call)
71 | iv = BlackScholes._implied_volatility(r, s0, tm, sk, call, price)
72 | np.testing.assert_almost_equal(sigma, iv)
73 |
74 | def test_implied_volatility_vs_py_vollib(self):
75 | for sigma, r, s0, tm, sk, call in itertools.product(self.sigmas, self.rates, self.spots, self.maturities, self.strikes, self.call):
76 | with self.subTest():
77 |                 price = BlackScholes._option_price(sigma, r, s0, tm, sk, call)
78 | np.testing.assert_almost_equal(BlackScholes._implied_volatility(r, s0, tm, sk, call, price),
79 | implied_volatility(price=price, S=s0, K=sk, t=tm, r=r, flag='c' if call else 'p'))
80 |
81 | def test_calibrate(self):
82 |         self.bs = BlackScholes(self.sigma, self.r)
83 | self.vol_quotes = np.array([[1., 100., 0.23]])
84 | sigma = self.bs.calibrate(self.vol_quotes)
85 | np.testing.assert_almost_equal(sigma, 0.23)
86 |
87 | def test_calibrate_multiple(self):
88 | self.vol_quotes = np.array([[1., 100., 0.23], [2., 100., 0.27]])
89 | sigma = self.bs.calibrate(self.vol_quotes)
90 | self.assertTrue(sigma <= 0.27)
91 | self.assertTrue(sigma >= 0.23)
92 |
--------------------------------------------------------------------------------
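`test_path_distribution` above leaves its moment checks commented out, presumably because a finite Monte Carlo sample never matches the lognormal moments to several decimals. One way to make such a check meaningful is to compare against a sampling-error band; the sketch below is an illustration with arbitrary choices (10,000 paths, a 4-standard-error band), not part of the original test suite.

```python
import numpy as np

from models.black_scholes import BlackScholes

sigma, r, s0 = 0.2, 0.03, 100.
time_grid = np.array([0., 1., 5.])
num_sims = 10000

paths = BlackScholes(sigma, r).paths(s0, time_grid, num_sims, seed=1)

# Exact lognormal moments of geometric Brownian motion on the time grid
mean_exact = s0 * np.exp(r * time_grid)
var_exact = s0 ** 2 * np.exp(2 * r * time_grid) * (np.exp(sigma ** 2 * time_grid) - 1)

# Standardize the sample-mean error by its Monte Carlo standard error
std_err = np.sqrt(var_exact / num_sims)
z = (paths.mean(axis=1) - mean_exact) / np.where(std_err > 0, std_err, 1.)
print(np.round(z, 2))          # should be of order one for an unbiased simulation
assert np.all(np.abs(z) < 4.)  # 4 standard errors leaves ample room for sampling noise
```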
/dynamically_controlled_kernel_estimation/pics/american_option_pricing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niknow/machine-learning-examples/b147133c7b0115b591739fa4a600462cbaee6225/dynamically_controlled_kernel_estimation/pics/american_option_pricing.png
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/pics/conditional_expectation_orthogonal_projection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niknow/machine-learning-examples/b147133c7b0115b591739fa4a600462cbaee6225/dynamically_controlled_kernel_estimation/pics/conditional_expectation_orthogonal_projection.png
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/pics/dcke_basket_heston.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niknow/machine-learning-examples/b147133c7b0115b591739fa4a600462cbaee6225/dynamically_controlled_kernel_estimation/pics/dcke_basket_heston.jpg
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/pics/dcke_performance.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niknow/machine-learning-examples/b147133c7b0115b591739fa4a600462cbaee6225/dynamically_controlled_kernel_estimation/pics/dcke_performance.jpg
--------------------------------------------------------------------------------
/dynamically_controlled_kernel_estimation/pics/dcke_rbergomi.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niknow/machine-learning-examples/b147133c7b0115b591739fa4a600462cbaee6225/dynamically_controlled_kernel_estimation/pics/dcke_rbergomi.jpg
--------------------------------------------------------------------------------
/environment-explicit.txt:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name <env> --file <this file>
--------------------------------------------------------------------------------
/newton_gradient_backprop/backpropagation.ipynb:
--------------------------------------------------------------------------------
# Artificial Neural Networks: Derivation of the Backpropagation Algorithm

Assume we are given a neural network $\operatorname{NN}$ with feed forward $F = F_{\Theta}:\mathbb{R}^{n_i} \to \mathbb{R}^{n_o}$, where $\Theta$ is the collection of the weights in all the layers. If we want to train this network using [gradient descent](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/newton_gradient_backprop/gradient_descent.ipynb), we need to calculate the derivative $\nabla_{\Theta} F_{\Theta}$. Because $F = F_L \circ \ldots \circ F_1$ is a composition of the feed forwards of the various layers and each layer has its own weights, computing this derivative is not entirely trivial.

Backpropagation is an algorithm based on a clever computation of the derivative $\nabla_{\Theta}F_{\Theta}$, which - as the name suggests - starts from the back of the network, i.e. the output layer $F_L$, and works its way backwards to the first layer.

In this notebook, we provide the mathematical foundations of backpropagation and derive the key equations.

# Recall Definition & Notation for Neural Networks

In order to pin down the precise equations for backpropagation, we first have to pin down the definition of a neural network. Even for multilayer perceptrons (MLPs), there are various formulations in the literature. We will use the following:

**Definition (neural network):** A *neural network* $\operatorname{NN}$ is a tuple $\operatorname{NN}=(A_l, b_l, \sigma_l)_{1 \leq l \leq L}$ defined by
* a number $n_i$ of *inputs*,
* a number $n_o$ of *outputs*,
* a number $L$ of *layers* and
* for each layer $1 \leq l \leq L$
    * a number $n_l$ of *neurons* (or *units*),
    * a matrix $A_l \in \mathbb{R}^{n_{l} \times n_{l-1}}$ and a vector $b_l \in \mathbb{R}^{n_l}$ of *weights* such that $n_0 = n_i$, $n_{L}=n_o$ and
    * an *activation function* $\sigma_l:\mathbb{R} \to \mathbb{R}$.

For any $1 \leq l \leq L$, the tuple $(A_l, b_l, \sigma_l)$ is called a *layer*. For $l=L$, the layer is called the *output layer* and for $1 \leq l < L$, the layer is called a *hidden layer*. We denote by $\Theta_l := (b_l, A_l) \in \mathbb{R}^{n_l \times (n_{l-1}+1)}$ the total weights of layer $l$ and set $\Theta := (\Theta_1, \ldots, \Theta_L)$.

A graphical representation of the layers can be found in the [introduction to MLPs](https://nbviewer.jupyter.org/github/niknow/machine-learning-examples/blob/master/neural_network_intro/neural_network_intro_model_setup.ipynb).

# Feed Forward
The *feed forward* of a neural network is the process of feeding an input data sample into the network and computing the output.

The following notation will be convenient:

**Definition (affine linear map):** Let $A \in \mathbb{R}^{m \times n}$ be a matrix and $b \in \mathbb{R}^{m}$ be a vector. Then we denote by
\begin{align*}
    f_{A,b}:\mathbb{R}^{n} \to \mathbb{R}^m, && v \mapsto Av + b
\end{align*}
the *affine linear map with parameters $A$ and $b$*.

**Definition (feed forward function):** Let $\operatorname{NN}=(A_l, b_l, \sigma_l)_{1 \leq l \leq L}$ be a neural network. Then for each $1 \leq l \leq L$, we define a function
\begin{align*}
    F_l := \sigma_l \circ f_{A_l, b_l}: \mathbb{R}^{n_{l-1}} \to \mathbb{R}^{n_l}, && v \mapsto \sigma_l(A_l v + b_l),
\end{align*}
where we employ the convention that $\sigma_l$ is applied componentwise. The composition $F := F_{\Theta}:\mathbb{R}^{n_i} \to \mathbb{R}^{n_o}$, $F_{\Theta} := F_L \circ \ldots \circ F_2 \circ F_1$, is called the *feed forward function* of $\operatorname{NN}$. Any set of inputs $x \in \mathbb{R}^{n_i}$ is called an *input layer*.

**Algorithm (feed forward):** The feed forward of a neural network on an input $x \in \mathbb{R}^{n_i}$ is simply the evaluation of the feed forward function $F$ on $x$, i.e. the computation of $y=F(x)$. As $F$ is a composition of the various $F_l$, this evaluation is computed by evaluating the $F_l$ one by one, feeding the input forward through the network as follows:

\begin{align}
    a_0 &:= x \in \mathbb{R}^{n_i} \\
    z_1 &:= f_{A_1, b_1}(a_0) = A_1 a_0 + b_1 \in \mathbb{R}^{n_1} \\
    a_1 &:= \sigma_1(z_1) \in \mathbb{R}^{n_1} \\
    z_2 &:= f_{A_2, b_2}(a_1) = A_2 a_1 + b_2 \in \mathbb{R}^{n_2} \\
    a_2 &:= \sigma_2(z_2) \in \mathbb{R}^{n_2} \\
    & \vdots \\
    z_l &:= f_{A_l, b_l}(a_{l-1}) = A_l a_{l-1} + b_l \in \mathbb{R}^{n_l}\\
    a_l &:= \sigma_l(z_{l}) \in \mathbb{R}^{n_l}\\
    & \vdots \\
    z_L &:= f_{A_L, b_L}(a_{L-1}) = A_L a_{L-1} + b_L \in \mathbb{R}^{n_L} \\
    a_L &:= \sigma_L(z_L) \in \mathbb{R}^{n_L} \\
    y &:= a_L \in \mathbb{R}^{n_o}
\end{align}

# Backpropagation

## Cost Functions

The final result $y=F(x)$ of the feed forward depends on all the weights in all the layers. In supervised learning, we are typically given a labeled training set $(x_1, y_1), \ldots, (x_N, y_N)$, $x_k \in \mathbb{R}^{n_i}$, $y_k \in \mathbb{R}^{n_o}$, and we are interested in how well the network fits the data set, i.e. how close the $F(x_k)$ are to the given $y_k$. In order to measure this, we need a *cost function* $J$ that measures the distance between the vectors $(F(x_1), \ldots, F(x_N))$ and $(y_1, \ldots, y_N)$. While in theory this function can have an arbitrary shape, the most common choice is to pick a cost function $C_k$ that only measures the distance between $F(x_k)$ and $y_k$ and to aggregate these into the total cost via
\begin{align*}
    J_{\Theta}(x_1, \ldots, x_N, y_1, \ldots, y_N) = \frac{1}{N} \sum_{k=1}^{N}{C_k(F_{\Theta}(x_k))}.
\end{align*}
One of the most common choices is the least squares cost $C_k(y) := \|y - y_k\|^2$.

When training the neural network we want to minimize the cost function $J_{\Theta}$ by changing the parameters $\Theta$ - usually via gradient descent. Obviously, gradient descent requires the gradient of the function it is trying to minimize. The big advantage of assuming that the cost function $J_{\Theta}$ can be written as a sum of cost functions $C_k$ is that instead of having to compute the gradient $\nabla_{\Theta} J_{\Theta}(x_1, \ldots, x_N, y_1, \ldots, y_N)$ at once, we can compute the gradients $\nabla_{\Theta}C_k(F_{\Theta}(x_k))$ separately. Thus, instead of working on the whole training set, we restrict our attention to a single sample $(x,y)$ with $x \in \mathbb{R}^{n_i}$ and $y \in \mathbb{R}^{n_o}$. Our aim is to compute the gradient of a single cost function $C$ on that sample, i.e. to compute
\begin{align*}
    \nabla_{\Theta}(C \circ F_{\Theta})(x).
\end{align*}
This means we assume that
\begin{align*}
    C:\mathbb{R}^{n_o} \to \mathbb{R}, && a \mapsto C(a)
\end{align*}
is a differentiable function.

## Reminder of Calculus: Nabla, Grad and Chain Rule

To derive the backpropagation algorithm, we employ the following notation from calculus:

**Nabla:** For any differentiable function $g:\mathbb{R}^{n} \to \mathbb{R}^{m}$ and any $x \in \mathbb{R}^n$, we denote by $\nabla g(x) \in \mathbb{R}^{m \times n}$ the matrix of partial derivatives, i.e.
\begin{align*}
    (\nabla g(x))_{ij} = \partial_{x_j} g_i(x).
\end{align*}
In particular, for a function $g: \mathbb{R}^n \to \mathbb{R}$, we denote by $\nabla g(x) \in \mathbb{R}^{1 \times n}$ the row vector of partial derivatives.

**Gradient:** For a differentiable function $g: \mathbb{R}^n \to \mathbb{R}$ and an $x \in \mathbb{R}^n$, we denote by $\operatorname{grad} g(x) \in \mathbb{R}^{n \times 1}$ the column vector of partial derivatives, i.e.
\begin{align*}
    \operatorname{grad} g(x) = \nabla g(x)^{\top}.
\end{align*}

We generally regard $\mathbb{R}^n$ as a space of column vectors.

**Transpose:** For any matrix $A \in \mathbb{R}^{m \times n}$, we denote its transpose by $A^{\top} \in \mathbb{R}^{n \times m}$.

**Chain Rule:** For two differentiable functions $g:\mathbb{R}^n \to \mathbb{R}^m$ and $h:\mathbb{R}^m \to \mathbb{R}^{k}$, the derivative of the composition $h \circ g$ is related to the derivatives of the components via
\begin{align*}
    \forall x \in \mathbb{R}^n: \nabla(h \circ g)(x) = \nabla h(g(x)) \bullet \nabla g(x),
\end{align*}
where $\bullet$ denotes the matrix product.

## Plan of Attack
In order to compute the gradient $\nabla_{\Theta}(C(F_\Theta(x)))$, we proceed in two steps:
1. Compute $\nabla_x (C(F_{\Theta}(x)))$ step by step, working backwards through the network.
2. Relate the result to $\nabla_\Theta (C(F_{\Theta}(x)))$.

## Backwards Recursion
The key idea for the first step is the following insight: the function $F_{\Theta} = F_L \circ \ldots \circ F_1$ is a composition of many functions $F_l$, so computing $\nabla F$ requires many applications of the chain rule. However, computing only the last gradient $\nabla F_L$ is easy. Therefore, the idea is to work backwards by computing the derivatives of increasingly complex compositions. To that end, the following definition is helpful.

**Definition:** Let $\operatorname{NN}$ be a neural network with feed forward function $F = F_{\Theta} = F_L \circ \ldots \circ F_1$ and let $C$ be a cost function for a single sample. We define the functions
\begin{align*}
    G_l := C \circ F_L \circ \ldots \circ F_{l+1} \circ \sigma_l : \mathbb{R}^{n_l} \to \mathbb{R}
\end{align*}
for $1 \leq l \leq L$.

The main insight into these functions is the following:

**Lemma:** Let $G_l$ be as above and assume that the $z_l$ are computed via the feed forward as above. Then the sequence of error terms
\begin{align*}
    \varepsilon_l := \operatorname{grad} G_l(z_l) \in \mathbb{R}^{n_l}
\end{align*}
satisfies the backward recursion
\begin{align*}
    \varepsilon_L = \nabla \sigma_L (z_L) \bullet \operatorname{grad} C(a_L), && \varepsilon_l = \nabla \sigma_l (z_l) \bullet A_{l+1}^{\top} \bullet \varepsilon_{l+1}.
\end{align*}

**Proof:** For $l=L$, this follows from the definitions and the chain rule, as
\begin{align*}
    \nabla G_L(z_L)
    = \nabla (C \circ \sigma_L)(z_L)
    = \nabla C(\sigma_L(z_L)) \bullet \nabla \sigma_L(z_L)
\end{align*}
and thus
\begin{align*}
    \varepsilon_L = \operatorname{grad} G_L (z_L) = (\nabla G_L(z_L))^{\top} = \nabla \sigma_L(z_L) \bullet \operatorname{grad} C(\sigma_L(z_L)).
\end{align*}
Here, we use the above mentioned convention that we identify the scalar function $\sigma_l:\mathbb{R} \to \mathbb{R}$ with the vector valued function $\sigma_l:\mathbb{R}^{n_l} \to \mathbb{R}^{n_l}$, $v \mapsto (\sigma_l(v_1), \ldots, \sigma_l(v_{n_l}))$. The derivative of this vector valued function is the diagonal matrix $\nabla \sigma_l(v)$ with diagonal entries $\sigma_l'(v_1), \ldots, \sigma_l'(v_{n_l})$. In particular, this matrix is symmetric, i.e. $\nabla \sigma_l (v) = \nabla \sigma_l (v)^{\top}$.

For the step $l+1 \to l$, notice that by definition the functions $G_l$ satisfy
\begin{align*}
    G_l &= C \circ F_L \circ \ldots \circ F_{l+2} \circ F_{l+1} \circ \sigma_l \\
    &= C \circ F_L \circ \ldots \circ F_{l+2} \circ \sigma_{l+1} \circ f_{A_{l+1},b_{l+1}} \circ \sigma_l \\
    &= G_{l+1} \circ f_{A_{l+1},b_{l+1}} \circ \sigma_l.
\end{align*}

Thus,
\begin{align*}
    \nabla G_l(z_l) & = \nabla G_{l+1}(f_{A_{l+1},b_{l+1}}(\sigma_l(z_l))) \bullet \nabla f_{A_{l+1},b_{l+1}}(\sigma_l(z_l)) \bullet \nabla \sigma_l(z_l) \\
    &= \nabla G_{l+1}(z_{l+1}) \bullet A_{l+1} \bullet \nabla \sigma_l(z_l),
\end{align*}
which implies
\begin{align*}
    \varepsilon_l
    = \operatorname{grad} G_l(z_l)
    = \nabla G_l(z_l)^{\top}
    = \nabla \sigma_l(z_l) \bullet A_{l+1}^{\top} \bullet \varepsilon_{l+1}.
\end{align*}

## Backwards Gradient Computation

Finally, we use the result of the previous lemma to compute the derivative $\nabla_{\Theta}(C(F_{\Theta}(x)))$.

**Theorem (backpropagation):** Let $\operatorname{NN} = (\Theta_l, \sigma_l)_{1 \leq l \leq L}$, $\Theta_l=(A_l, b_l)$, be an MLP and $x \in \mathbb{R}^{n_i}$ be an input. Let $C:\mathbb{R}^{n_o} \to \mathbb{R}$ be a differentiable cost function. Let $(\varepsilon_l)_{1 \leq l \leq L}$ be the sequence of error terms of the previous lemma. Then
\begin{align*}
    \operatorname{grad}_{b_l}(C(F_{\Theta}(x))) &= \varepsilon_{l}, \\
    \operatorname{grad}_{A_l}(C(F_{\Theta}(x))) &= \varepsilon_{l}\, a_{l-1}^{\top},
\end{align*}
where the $a_l$ are defined as above (feed forward).

**Proof:** Analogously to the previous lemma, we define the functions
\begin{align*}
    G_{A,b}^l := C \circ F_L \circ \ldots \circ F_{l+1} \circ \sigma_l \circ f_{A,b}: \mathbb{R}^{n_{l-1}} \to \mathbb{R}.
\end{align*}
By construction $G_{A,b}^l = G_l \circ f_{A,b}$. Therefore,
\begin{align*}
    \nabla_b \big(G_{A_l,b}^l(a_{l-1})\big)(b_l) = \nabla G_l (f_{A_l,b_l}(a_{l-1})) \bullet \nabla_b \big(f_{A_l,b}(a_{l-1})\big)(b_l) = \nabla G_l(z_l),
\end{align*}
as $\nabla_b \big(f_{A,b}(a_{l-1})\big)$ is the identity matrix. Therefore,
\begin{align*}
    \nabla_{b_l} C(F_{\Theta}(x))
    & = \nabla_b\big(C \circ F_L \circ \ldots \circ F_1(x)\big)(b_l) \\
    & = \nabla_b\big(G_{A_l,b}^l(F_{l-1} \circ \ldots \circ F_1(x))\big)(b_l) = \nabla_b\big(G_{A_l,b}^l(a_{l-1})\big)(b_l)\\
    &= \nabla G_l(z_l) = \varepsilon_l^{\top},
\end{align*}
which implies the first claim after transposing.

To see the second claim, notice that as a function of $A$, we have $f_{\_,b_l}(a_{l-1}):\mathbb{R}^{n_l \times n_{l-1}} \to \mathbb{R}^{n_l}$ and hence, analogously, $G_{\_,b_l}^l(a_{l-1}) = (G_l \circ f_{\_,b_l})(a_{l-1}):\mathbb{R}^{n_l \times n_{l-1}} \to \mathbb{R}$. Hence, we can calculate in coordinates using the chain rule
\begin{align*}
    \frac{\partial \big(G_{\_,b_l}^l(a_{l-1})\big)(A_l)}{\partial A_{\nu \mu}}
    = \sum_{k=1}^{n_l}{\nabla G_l\big(f_{A_l,b_l}(a_{l-1})\big)_k \, \frac{\partial (A a_{l-1}+b_l)_k}{\partial A_{\nu \mu}}\bigg|_{A = A_l}}
    = \sum_{k=1}^{n_l}{\nabla G_l(z_l)_k \, \delta_{\nu k} \, a_{l-1;\mu}}
    = \varepsilon_{l;\nu}\, a_{l-1;\mu}
    = (\varepsilon_{l}\, a_{l-1}^{\top})_{\nu \mu}.
\end{align*}

## Algorithm
Putting everything together, the backpropagation algorithm works as follows:

**Algorithm (backpropagation):**

**Inputs:**
* A neural network $\operatorname{NN} = (A_l, b_l, \sigma_l)_{1 \leq l \leq L}$,
* a single input $x \in \mathbb{R}^{n_i}$,
* a cost function $C:\mathbb{R}^{n_o} \to \mathbb{R}$ for that input.

**Outputs:**
The gradients
* $\nabla_{b_l}(C(F_{\Theta}(x)))$ and
* $\nabla_{A_l}(C(F_{\Theta}(x)))$.

**Steps:**
1. Compute the feed forward $F_{\Theta}(x)$:
    * Initialize: $a_0 := x$.
    * For $l=1, \ldots, L$:
        * $z_l := f_{A_l,b_l}(a_{l-1})$
        * $a_l := \sigma_l(z_l)$
2. Compute the errors $\varepsilon_l$:
    * Initialize: $\varepsilon_L := \nabla \sigma_L(z_L) \operatorname{grad} C(a_L)$.
    * For $l=L-1, \ldots, 1$: $\varepsilon_l := \nabla \sigma_l(z_l) A_{l+1}^{\top} \varepsilon_{l+1}$.
3. Compute the gradients: for $l=1, \ldots, L$ (in any order):
    * $\operatorname{grad}_{b_l}(C(F_{\Theta}(x))) = \varepsilon_l$
    * $\operatorname{grad}_{A_l}(C(F_{\Theta}(x))) = \varepsilon_l\, a_{l-1}^{\top}$

In case we have multiple training samples $x_k$ (which we usually have), the above is repeated for every training sample and the gradient of the total cost function $J$ is then the average of the per-sample gradients. A minimal NumPy sketch of this procedure is given below, after the references.

# References
There are various other sources on backpropagation you might find helpful (list not exhaustive):

* http://neuralnetworksanddeeplearning.com/chap2.html
* https://brilliant.org/wiki/backpropagation/
* https://datascience.stackexchange.com/questions/44703/how-does-gradient-descent-and-backpropagation-work-together
* https://stackoverflow.com/questions/47416861/backward-propagation-in-keras
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Newton's Method
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Machine Learning in Finance
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "