├── .gitignore
├── Alphas
├── Filter_Rules.ipynb
├── Market_Return_by_Dates.ipynb
├── Market_Return_by_Payday.ipynb
├── Market_Return_by_Political_Party.ipynb
├── Retail_Crypto_Universe_Return_by_Meme_Coin.ipynb
└── Retail_Crypto_Universe_Return_by_Ticker_Rank.ipynb
├── LICENSE
├── Miscellaneous
├── Continuous_Time_Finance.ipynb
├── Crypto_Degrees_of_Freedom.ipynb
├── Direction_Statistics.ipynb
├── Generating_Multivariate_Random_Numbers.ipynb
├── Grinold-Kahn Notes.md
├── Index_Pairwise_Correlations.ipynb
├── Jarque-Bera Tests.ipynb
├── Longer_Term_Crypto_Degrees_of_Freedom.ipynb
├── Normality Tests.ipynb
└── S&P_500_Pairwise_Correlations.ipynb
├── README.md
├── The Market's Not Normal
├── Fundamental_Law.ipynb
├── Non_Stationarity_in_the_First_Two_Moments.ipynb
├── The_Autocorrelation_of_Variance.ipynb
├── The_Market's_Not_Normal.ipynb
├── The_Variance_is_Not_Stationary.ipynb
├── These_Two_Things_are_Not_the_Same.ipynb
├── my_library.py
└── not_the_same.tex
├── my_library.py
└── nlls.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2024 Giller Investments (New Jersey), LLC
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to use
5 | the Software, including without limitation the rights to use, copy, modify,
6 | and distribute, and to permit persons to whom the Software is furnished to do so,
7 | subject to the following conditions:
8 |
9 | The above copyright notice and the following notices shall be included in all
10 | copies or substantial portions of the Software.
11 |
12 | PERMISSION TO COMMERCIALLY EXPLOIT THIS SOFTWARE BY SELLING OR LICENSING
13 | UNMODIFIED VERSIONS OF IT TO THIRD PARTIES IS EXPLICITLY WITHHELD. THE
14 | TERM "UNMODIFIED" SHALL EXPRESSLY INCLUDE ANY MODIFICATIONS THAT A REASONABLE
15 | PERSON WOULD REGARD AS "TRIVIAL" OR "SLIGHT," SUCH AS CHANGES TO FORMATTING,
16 | PRESENTATION, SPECIFIC CHOICES OF NON-CONSEQUENTIAL WORDS, PUNCTUATION, LAYOUT ETC.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE.
25 |
26 | NOTHING IN THIS SOFTWARE, OR THE DOCUMENTATION ASSOCIATED WITH IT, SHOULD BE
27 | TAKEN AS REPRESENTING INVESTMENT ADVICE, EITHER EXPLICITLY OR IMPLICITLY.
28 | THE AUTHOR EXPRESSLY AND UNAMBIGUOUSLY WITHHOLDS ANY ENDORSEMENT OF THE USE
29 | OF THIS SOFTWARE AS PART OF A TRADING OR INVESTMENT STRATEGY. ALL INVESTMENTS
30 | ARE MADE AT THE SOLE RISK OF THE INVESTOR AND YOU SHOULD CONSULT A REGISTERED
31 | INVESTMENT PROFESSIONAL FOR ANY ADVICE REGARDING SPECIFIC INVESTMENTS OR
32 | INVESTMENT STRATEGIES PRIOR TO MAKING AN INVESTMENT OF ANY KIND.
33 |
--------------------------------------------------------------------------------
/Miscellaneous/Grinold-Kahn Notes.md:
--------------------------------------------------------------------------------
1 | # Grinold-Kahn Notes
2 |
3 | For asset returns $r_{it}\ :\ i\in[1,N], t\in\mathbb{Z}^+$, define the matrix $G_N(\rho)$ as the correlation matrix of returns where all pairwise correlations are equal, i.e.
4 |
5 | $$
6 | \mathbb{V}[r_{it},r_{jt}]=\sigma_{i}\sigma_{j}\rho \Leftrightarrow V=SG_N(\rho)S\ \mathrm{where}\ G_N(\rho)=\begin{pmatrix}
7 | 1&\rho&\cdots&\rho\\
8 | \rho&1&\cdots&\rho\\
9 | \vdots&&\ddots&\vdots\\
10 | \rho&\rho&\cdots&1
11 | \end{pmatrix}\ \mathrm{and}
12 | \ S=\begin{pmatrix}
13 | \sigma_1&0&\cdots&0\\
14 | 0&\sigma_2&\cdots&0\\
15 | \vdots&&\ddots&\vdots\\
16 | 0&0&\cdots&\sigma_N
17 | \end{pmatrix}.
18 | $$
19 |
20 | As a symmetric positive definite matrix, $G_N(\rho)$ may always be diagonalized by a similarity transformation. The eigenvalues are:
21 |
22 | 1. one eigenvalue of $1+(N-1)\rho$; and,
23 | 2. $N-1$ eigenvalues of magnitude $1-\rho$.
24 |
25 | and the associated eigenvectors are:
26 |
27 | 1. one eigenvector of all ones: $(1\,1\dots 1)^T=\mathbf{1}_N$ where $\mathbf{1}_N$ is the vector of ones of dimension $N$; and,
28 | 2. $N-1$ vectors of the form: $(1\,-1\,0 \dots 0)^T$, $(1\,0\,-1\,0 \dots 0)^T$ through $(1\,0 \dots 0\,-1)^T$.
29 |
--------------------------------------------------------------------------------
/Miscellaneous/Jarque-Bera Tests.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyN9PjJ29ioVj0KHDZQuwmOh",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | ""
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 1,
32 | "metadata": {
33 | "colab": {
34 | "base_uri": "https://localhost:8080/"
35 | },
36 | "id": "s34EvR0_ax-c",
37 | "outputId": "783a5304-11b7-4494-c4a8-f60eba41ceea"
38 | },
39 | "outputs": [
40 | {
41 | "output_type": "stream",
42 | "name": "stdout",
43 | "text": [
44 | "16:39:14 Starting...\n",
45 | "16:39:14 Installing yfinance into Google notebook...\n",
46 | "16:39:18 Installing arch into Google notebook...\n",
47 | "16:39:26 Initialized.\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "from my_library import *"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "source": [
58 | "from scipy.stats import norm,gennorm,jarque_bera as jbtest\n",
59 | "from itertools import product\n",
60 | "\n",
61 | "results=pd.DataFrame()\n",
62 | "results.index=pd.MultiIndex.from_tuples(list(product(list(range(10,1001,1)),list(np.arange(one,four,0.1)))))\n",
63 | "results.index.names=[\"Sample\",\"Beta\"]\n",
64 | "\n",
65 | "for n,beta in results.index:\n",
66 | " sample=pd.Series(gennorm(beta,zero,one).rvs(n))\n",
67 | " results.loc[(n,beta),\"Mean\"]=sample.mean()\n",
68 | " results.loc[(n,beta),\"St.Dev.\"]=sample.var()\n",
69 | " results.loc[(n,beta),\"Skewness\"]=sample.skew()\n",
70 | " results.loc[(n,beta),\"Kurtosis\"]=sample.kurt()+three\n",
71 | " results.loc[(n,beta),\"JB Test\"]=jbtest(sample)[1]\n",
72 | "\n",
73 | "results[\"JB 5%\"]=results[\"JB Test\"]<0.05\n",
74 | "results[\"JB 1%\"]=results[\"JB Test\"]<0.01\n",
75 | "results[\"JB 0.1%\"]=results[\"JB Test\"]<0.001\n",
76 | "\n",
77 | "Z=(df:=results.reset_index()).pivot_table(index=\"Beta\",columns=\"Sample\",values=\"JB Test\").values\n",
78 | "X,Y=np.meshgrid(np.sort(df[\"Sample\"].unique()),np.sort(df[\"Beta\"].unique()))\n",
79 | "figure,plot=pl.subplots(figsize=(ten*GoldenRatio,ten))\n",
80 | "plot.pcolormesh(X,Y,Z,cmap='Oranges')\n",
81 | "plot.set_xscale('log')"
82 | ],
83 | "metadata": {
84 | "id": "6HqH-iEra29s",
85 | "outputId": "5c06f8ea-2496-4f38-8628-526eae659881",
86 | "colab": {
87 | "base_uri": "https://localhost:8080/",
88 | "height": 835
89 | }
90 | },
91 | "execution_count": 14,
92 | "outputs": [
93 | {
94 | "output_type": "display_data",
95 | "data": {
96 | "text/plain": [
97 | ""
98 | ],
99 | "image/png": "\n"
100 | },
101 | "metadata": {}
102 | }
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "source": [
108 | "figure,plot=pl.subplots(figsize=(ten*GoldenRatio,ten))\n",
109 | "plot.pcolormesh(X,Y,Z,cmap='terrain')\n",
110 | "plot.set_xscale('log')\n",
111 | "plot.xaxis.set_major_formatter(CountLabels(0))"
112 | ],
113 | "metadata": {
114 | "id": "FniA_fHbfy14",
115 | "outputId": "499ec5ae-e085-4b08-e209-dce1c6ae22db",
116 | "colab": {
117 | "base_uri": "https://localhost:8080/",
118 | "height": 830
119 | }
120 | },
121 | "execution_count": 13,
122 | "outputs": [
123 | {
124 | "output_type": "display_data",
125 | "data": {
126 | "text/plain": [
127 | ""
128 | ],
129 | "image/png": "\n"
130 | },
131 | "metadata": {}
132 | }
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "source": [],
138 | "metadata": {
139 | "id": "cj0nyxfdoNjY"
140 | },
141 | "execution_count": null,
142 | "outputs": []
143 | }
144 | ]
145 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Financial Data Science in Python
2 | This GitHub repository
3 | https://github.com/Farmhouse121/Financial-Data-Science-in-Python
4 | collects the scripts and notebooks required to reproduce my published work. This includes both the articles that I have written in _Wilmott_ magazine and my forthcoming book, which will also be titled _Financial Data Science in Python_.
5 |
6 | I will be featuring _mostly_ notebooks prepared for the Google _colab_ system, although I **strongly** recommend using a more "procedural" workflow than most notebook users adopt. For analytical work it is important that the _internal state_ of the analytical system in use be _well known_ when an _inferential procedure_ is executed. Notebooks tend to encourage a "spaghetti" workflow that is not conducive to the internal state being well known. It is my intention that the notebooks, therefore, always be executed _from the beginning to the end_ in one session. Breaking the work into "cells" is provided for narrative convenience only.
7 |
8 | ## my_library.py
9 | A key file for all of my work will be the `my_library.py` _mini-package_. I write "mini" because it's simply a file, and it does not provide a novel software suite. It just imports most of the stuff I always import and provides a few, key, custom written extensions and utilities. That file will exist here, in the top level of this repository, and every independent analysis will sit in a folder below. When imaged onto a computer (with `git clone https://github.com/Farmhouse121/Financial-Data-Science-in-Python.git`, for example) each subfolder should _symlink_ to the parent version. On Unix systems, this is done via the command line with `ln -s ../my_library.py` from within the sub-folder, although `git` should copy all of those links automatically. (On Windows systems there is the `mklink my_library.py ..\my_library.py` command that can be used to execute this functionality in the shell, although I don't have a Windows machine and haven't tried that personally.)
10 |
11 | To run this file from within a Python script, just include the line
12 | ```
13 | from my_library import *
14 | ```
15 | at the beginning. This will produce an identical internal state to having pasted all of the included code into a single file or cell and then executed it.
16 |
17 | ### Changes and Idiosyncrasies
18 | All of the scripts are going to include usage of four "standard" packages, and three are imported wholly in `my_library.py`.
19 |
20 | ```
21 | import numpy as np
22 | import pandas as pd
23 | import matplotlib.pyplot as pl ; plt=pl
24 | ```
25 |
26 | I know that `pyplot` is usually imported as `plt`, not `pl` as I have, but I use it so much that having a two-letter namespace prefix is worthwhile to me and it's my computer, I get to choose the abstractions I like. For those who wish to follow the conventional usage, I have aliased it to `plt`. Feel free to use either prefix -- it's your computer -- and if you wish to "take back" the variable `pl`, Python will let you do that too.
27 |
28 | For the record, I am also going to invoke `matplotlib` _mostly_ via the construct
29 |
30 | ```
31 | figure,plot=pl.subplots(figsize=(ten*GoldenRatio,ten))
32 | ```
33 |
34 | You will find the _friendly numbers_ `ten` and `GoldenRatio` defined to their correct values inside `my_library.py`. My experience with numerical computation, over many years, in many languages and on many platforms, has taught me that typing one of `10`, `10.`, and `10e0` **will not guarantee** that you end up with the same IEEE floating point number stored in a CPU register on your computer. My assignment `ten=10e0` and usage as just `ten` guarantees that I will always get the same number. You may not find this particularly important immediately, but you will as soon as you try to represent `one/ten` in binary on any computer of your choice.
35 |
36 | Also, I'm using `figure,plot` not `fig,ax` in my code. I like _long descriptive names_ and these describe what I am making: a figure that contains a plot. (An axe is something that I use to split wood but, earlier in my career, it represented the bonds that a trading desk wanted to sell, occasionally _desperately_.)
37 |
38 | ### Statsmodels
39 | I will use `statsmodels` as my primary inference engine, but will not usually import the entire package into the namespace. I _really_ try to decrease namespace pollution wherever possible! Estimation is optimization of statistical models with inference about the presented results. The chain of actions -- modeling, estimation, inference -- represents empirical science. Mere optimization, such as that provided by `scikit-learn`, _without inference_ is not science. It is not merely necessary to know which set of parameters, whether latent, explicit or "hyper," give the best performance of your system _in sample_, we also need to know, or have some indication of, whether the selected performance differs significantly from that accessible by chance. What `statsmodels` does is the _hard work_ of adding the metrics of statistical inference to the output of optimization. I value that greatly.
40 |
41 | ### arch
42 | I'm going to use Kevin Sheppard's `arch` package as well, with some modifications to lessen the bounds on various regressions. This is an excellent and useful tool, although I feel Dr. Sheppard has erred in doing things like excluding the Laplace distribution via parametric constraints, etc. After a brief discussion with him, I follow his recommended path of deriving new versions of his classes `GeneralizedError` and `GARCH`, which I call `GeneralizedError2` and `GARCH2` (somewhat unoriginally). These are defined within `my_library.py`.
43 |
44 | ### scipy
45 | I'm going to use code from `scipy`, mostly `scipy.stats` and, occasionally, `scipy.optimize` directly (rather than via `statsmodels` or `arch`). I use the distributions and tests from `scipy.stats`, but often find that they have annoyingly short or annoyingly long names. Thus you will see things like:
46 | ```
47 | from scipy.stats import t as density
48 | ```
49 | and
50 | ```
51 | from scipy.stats import ttest_1samp as ttest
52 | ```
53 | **I** would much rather import only the functions I need, rather than the entire package, or give myself the ability to switch distributions easily, in later code...
54 | ```
55 | #from scipy.stats import t as density
56 | from scipy.stats import norm as density
57 | ```
58 | etc.
59 |
60 | ### nprint
61 | I find that, in notebooks, knowing _when_ code executed is so useful that attaching times to print statements is very useful. Thus `my_library.py` includes the following function definitions:
62 | ```
63 | from datetime import datetime ; date_format,time_format="%Y-%m-%d","%H:%M:%S" ; datetime_format=date_format+" "+time_format
64 |
65 | def now():
66 | """Quickly return the time."""
67 | return datetime.now().strftime(time_format)
68 |
69 | def nprint(*args,**kwargs):
70 | """Decorate the print statement with the time."""
71 | print(now(),*args,**kwargs)
72 | stdout.flush()
73 | ```
74 | To use it, use `nprint` as you would `print`. e.g.
75 | ```
76 | nprint("Hello world!")
77 | ```
78 | should output:
79 |
80 | 
81 |
82 | ## The Articles
83 | This repository is going to include all of the code supporting the new book I am writing, _Financial Data Science in Python_. **But**, I am also going to be writing on Medium at [https://medium.com/@stattrader](https://medium.com/@stattrader). I will include links to each article _and_ the relevant folder within this repository below. Since this `README` is fairly long, it will also serve as the first article.
84 |
85 | | Title | GitHub | Medium |
86 | |-------|--------|--------|
87 | | Financial Data Science in Python | [README](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/main/README.md) | [Financial Data Science in Python](https://stattrader.medium.com/financial-data-science-in-python-ee66dab460cf) |
88 | | The Market's Not Normal: Part 1 | [The_Market's_Not_Normal.ipynb](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/2fe3ae6dc08dc80d2f5d0c38ba0562e01f1c7415/The%20Market's%20Not%20Normal/The_Market's_Not_Normal.ipynb)| [The Market's Not Normal, Part 1](https://medium.com/adventures-in-data-science/the-markets-not-normal-part-1-bbba8dad2807) |
89 | | The Market's Not Normal: Part 2| [These_Two_Things_are_Not_the_Same.ipynb](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/2fe3ae6dc08dc80d2f5d0c38ba0562e01f1c7415/The%20Market's%20Not%20Normal/These_Two_Things_are_Not_the_Same.ipynb) | [The Market's Not Normal: Part 2](https://medium.com/adventures-in-data-science/the-markets-not-normal-part-2-cf8c4060f6b4)|
90 | | Can Non-Stationarity Rescue the Normal Distribution? | [Non_Stationarity_in_the_First_Two_Moments.ipynb](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/2fe3ae6dc08dc80d2f5d0c38ba0562e01f1c7415/The%20Market's%20Not%20Normal/Non_Stationarity_in_the_First_Two_Moments.ipynb) | [Can Non-Stationarity Rescue the Normal Distribution?](https://medium.com/adventures-in-data-science/can-non-stationarity-rescue-the-normal-distribution-4af9f708b26a)|
91 | | Let's Talk About Heteroskedasticity | [The_Variance_is_Not_Stationary.ipynb](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/main/The%20Market's%20Not%20Normal/The_Variance_is_Not_Stationary.ipynb) | [Let's Talk About Heteroskedasticity](https://medium.com/@stattrader/lets-talk-about-heteroskedasticity-f1443d628da0)|
92 |
93 | ## The Data
94 | I am going to use public domain data sources, which will be mostly _Yahoo! Finance_, accessed via the `yfinance` package, and the _Fred_ depository, operated by the Federal Reserve Bank of St. Louis with data downloaded directly via the web service they provide. This is mostly daily and slower cadence data. Most of what I will write about lives in that space.
95 |
96 | ## Making Proper Time Indices for Pandas
97 | It's been my experience that many codes return Pandas dataframes with a timestamp field for an axis but that the system is _not properly told_ that the data is, in fact, temporal in nature. (I'm looking at you **everybody** who uses textual dates as their timestamps!) This error can be remedied with the following construct, which you will see _extensively_ in my code:
98 | ```
99 | df=pd.DataFrame(...)
100 | df.index=pd.DatetimeIndex(df.index).to_period('B')
101 | ```
102 | which delivers an index of _business days_, for example. (I am grateful to [Alex De Castro](https://github.com/decastro-alex) for pointing out the existence of the `B` argument to me.)
103 |
104 | ## $\LaTeX$ "Scratchpads"
105 | Since my writing contains _a lot_ of mathematics, which I generally render in $\LaTeX$ and then cut'n'paste into less civil document preparation systems, I've decided to add documents that include the _math mode code_ to generate the equations. These will not render as full $\LaTeX$ documents (as I am not writing the Medium articles in $\LaTeX$, I'm not going to go through the bother of preparing an analogue document for another format). If Medium would support $\LaTeX$ markup, in the way that GitHub does, then that would change.
106 |
107 | ## There Will be Many Commits
108 | I have learned from three decades doing scientific research & development work that it is very hard to predict which of the many edits to a script will be the _final_ one that makes it all work. In addition, I've learned that memorializing that "first working version" with a weighty editorial commit will be immediately followed by an "oh yea, also" commit to follow a few minutes later. So I don't really try, I commit when I think I've done something useful and don't shy away from committing frequently. This is particularly useful if one uses GitHub, _as I do_, to synchronize code between different physical locations (e.g. my desktop and an AWS server, for example). This may be "bad practice," but it is my practice.
109 |
110 | ## Support
111 | I appreciate the many positive comments I receive regarding my work and my attempts to explain aspects of the scientific analysis of financial markets to people. If you would like to _directly_ support this work, you can _Buy me a Coffee_ via the link below. [](https://ko-fi.com/H2H7EC7Z5)
112 |
113 | You may also buy my books via Amazon and other booksellers.
114 |
115 | * [Adventures in Financial Data Science](https://medium.com/r/?url=https%3A%2F%2Famzn.to%2F3P66fyK)
116 | * [Essays on Trading Strategy](https://medium.com/r/?url=https%3A%2F%2Fwww.amazon.com%2FEssays-Trading-Strategy-Scientific-Finance%2Fdp%2F9811273812)
117 |
--------------------------------------------------------------------------------
/The Market's Not Normal/my_library.py:
--------------------------------------------------------------------------------
1 | my_library.py
--------------------------------------------------------------------------------
/The Market's Not Normal/not_the_same.tex:
--------------------------------------------------------------------------------
1 | This is a ``scratchpad'' to generate the \LaTeX{} equations used in my articles. It will not run as a stand-alone \LaTeX{} source document.
2 |
3 | \DeclareMathOperator*{\argmax}{arg~max}
4 | \DeclareMathOperator*{\argmin}{arg~min}
5 |
6 | \begin{equation}
7 | \ln\mathbb{P}[\mathrm{model}|\mathrm{data}]=\ln\mathbb{P}[\mathrm{data}|\mathrm{model}]+\ln\mathbb{P}[\mathrm{model}]-
8 | \ln\mathbb{P}[\mathrm{data}]
9 | \end{equation}
10 |
11 | \begin{equation}
12 | \Rightarrow\argmax_{\boldsymbol{\theta}}\ln\mathbb{P}[\mathrm{model}(\boldsymbol{\theta})|\mathrm{data}]=\argmax_{\boldsymbol{\theta}}\mathbb{L}[\mathrm{data}|\boldsymbol{\theta}]
13 | \end{equation}
14 |
15 | \begin{align}
16 | \hat{\boldsymbol{\theta}}&=\argmin_{\boldsymbol{\theta}}\sum_i\Big|x_i-x(\boldsymbol{\theta})\Big|\\
17 | \textit{or}\;
18 | \hat{\boldsymbol{\theta}}&=\argmin_{\boldsymbol{\theta}}\sum_i\Big(x_i-x(\boldsymbol{\theta})\Big)^2
19 | \end{align}
20 |
21 | \begin{align}
22 | y_i&=\alpha+\beta x_i+\varepsilon_i : \varepsilon_i\sim\mathcal{D}\\
23 | \Rightarrow\;\varepsilon_i&=y_i-\alpha-\beta x_i
24 | \end{align}
25 |
26 | \begin{align}
27 | (\hat{\alpha},\hat{\beta})&=\argmax_{(\alpha,\beta)}\sum_i\ln f(\varepsilon_i)\\
28 | &=\argmax_{(\alpha,\beta)}\sum_i\ln f(y_i-\alpha-\beta x_i)
29 | \end{align}
30 |
31 | \begin{equation}
32 | f(\varepsilon,\boldsymbol{\theta})=g(\varepsilon)h(\boldsymbol{\theta})e^{-q\left|\frac{\varepsilon}{\sigma}\right|^\kappa}\;\mathrm{where}\;\{q,\sigma,\kappa\}\subseteq\boldsymbol{\theta}
33 | \end{equation}
34 |
35 | \begin{equation*}
36 | (\hat{\alpha},\hat{\beta})=\argmin_{(\alpha,\beta)}\left\{
37 | \sum_{i=1}^Nq\left|\frac{y_i-\alpha-\beta x_i}{\sigma}\right|^\kappa
38 | -\sum_{i=1}^N\ln g(y_i-\alpha-\beta x_i)-N\,\ln h(\boldsymbol{\theta})\right\}
39 | \end{equation*}
40 |
41 | \begin{equation}
42 | f(y_i-\alpha-\beta x_i,\sigma)=\frac{e^{-\frac{1}{2}\left(\frac{y_i-\alpha-\beta x_i}{\sigma}\right)^2}}{\sqrt{2\pi}\sigma}\Rightarrow
43 | \begin{cases}
44 | q=\frac{1}{2}\\
45 | \kappa=2\\
46 | \ln g(y_i-\alpha-\beta x_i)=0\\
47 | \ln h(\boldsymbol{\theta})=-\ln(\sqrt{2\pi}\sigma)
48 | \end{cases}
49 | \end{equation}
50 |
51 | \begin{equation}
52 | (\hat{\alpha},\hat{\beta})=\mathop{\mathrm{arg~min}}_{(\alpha,\beta)}\sum_{i=1}^N(y_i-\alpha-\beta x_i)^2
53 | \end{equation}
54 |
55 | \begin{equation}
56 | f(y_i-\alpha-\beta x_i,\sigma)=\frac{e^{-\left|\frac{y_i-\alpha-\beta x_i}{\sigma}\right|}}{2\sigma}\Rightarrow
57 | \begin{cases}
58 | q=1\\
59 | \kappa=1\\
60 | \ln g(y_i-\alpha-\beta x_i)=0\\
61 | \ln h(\boldsymbol{\theta})=-\ln(2\sigma)
62 | \end{cases}
63 | \end{equation}
64 |
65 | \begin{equation}
66 | (\hat{\alpha},\hat{\beta})=\mathop{\mathrm{arg~min}}_{(\alpha,\beta)}\sum_{i=1}^N|y_i-\alpha-\beta x_i|
67 | \end{equation}
68 |
69 | \begin{align}
70 | \mathcal{D}=\mathrm{Normal}&\Leftrightarrow\hat{\alpha}=
71 | \mathop{\mathrm{mean}}\{x_i\}\\
72 | \mathcal{D}=\mathrm{Laplace}&\Leftrightarrow\hat{\alpha}=
73 | \mathop{\mathrm{median}}\{x_i\}
74 | \end{align}
75 |
76 |
--------------------------------------------------------------------------------
/my_library.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | # this is designed to set up every notebook or executable script properly with common resources
4 | from sys import stdout,stderr,executable,version_info
5 | assert version_info.major>=3
6 |
7 | from datetime import datetime
# strftime patterns used by now() and today(): month/day/year date, 24-hour time,
# and the two joined by a single space
DATE_FORMAT='%m/%d/%Y'
TIME_FORMAT='%H:%M:%S'
DATETIME_FORMAT=" ".join((DATE_FORMAT,TIME_FORMAT))
10 |
def now(format=TIME_FORMAT):
    """Return the current local time as a string rendered with the strftime pattern `format`."""
    moment=datetime.now()
    return moment.strftime(format)
14 |
def today(format=DATE_FORMAT):
    """Return the current local date as a string rendered with the strftime pattern `format`."""
    moment=datetime.now()
    return moment.strftime(format)
18 |
def nprint(*args,**kwargs):
    """Print like the built-in print(), prefixing the output with the time returned by now().

    All positional and keyword arguments are forwarded to print(). The output
    is always flushed immediately, whatever value of `flush` the caller passes.
    """
    # ask print() itself to flush: it flushes the stream it actually wrote to, which also
    # covers a file= argument and a sys.stdout that was replaced after this module was imported
    print(now(),*args,**{**kwargs,"flush":True})
23 |
24 | nprint("Starting...")
25 |
# allow code to use the SIGALRM functionality to interrupt itself in a controlled fashion
from signal import signal

class Timeout(Exception):
    """Raised by the SIGALRM handler so that code can impose time limits on itself."""
    def __str__(self):
        return "Timeout."

def sigalrm(x,y):
    """Handle SIGALRM by raising a Timeout exception.

    The two arguments are whatever the signal machinery passes to a handler;
    both are ignored.
    """
    raise Timeout

try:
    from signal import SIGALRM,alarm
except ImportError:
    # SIGALRM and alarm() are only available on Unix; elsewhere the rest of the library must
    # still import, so the time limit facility is simply left out (SIGALRM and alarm stay undefined)
    pass
else:
    signal(SIGALRM,sigalrm) # register the handler
39 |
40 | # allow deep breaks, the python break syntax can only break out one level
class Break(Exception):
    """Exception that can be raised inside nested loops and caught outside them, giving a multi-level break."""
    def __str__(self):
        text="Break"
        return text
45 |
46 | # passive wrapper for with clauses for objects that don't provide __enter__ and __exit__
class With:
    """Do-nothing context manager that holds an arbitrary object.

    Entering the context yields the wrapper itself and leaving it does nothing.
    Call the wrapper to get the held object back; str() and repr() are those
    of the held object.
    """
    def __init__(self,object=None):
        self.object=object

    def __enter__(self):
        return self

    def __exit__(self,*args):
        # nothing to release; returning None lets any exception propagate
        return None

    def __call__(self):
        return self.object

    def __str__(self):
        return str(self.object)

    def __repr__(self):
        return repr(self.object)
66 |
# deal with some Google colab specific stuff
try:
    from IPython import get_ipython
except ModuleNotFoundError:
    # if IPython not installed, we're definitely not in a notebook
    ip=None
else:
    ip=get_ipython() # tested against None below: no shell object means we are not inside IPython

if ip is not None and 'google' in str(ip):
    for package in 'yfinance','arch':
        nprint("Installing %s into Google notebook..." % package)
        ip.system("pip install --upgrade %s 1>/dev/null" % package)

# pick the progress bar on whether a shell is actually running, not on whether IPython is
# merely installed: a plain script must get the console bar, not the notebook widget
if ip is None:
    from tqdm import tqdm
else:
    from tqdm.notebook import tqdm
82 |
83 | from warnings import filterwarnings
84 | filterwarnings("ignore",category=RuntimeWarning) # I don't care
85 |
86 | # yfinance - download function
87 | from yfinance import download
88 |
89 | # the usual suspects
90 | import pandas as pd
91 | import numpy as np
92 | import matplotlib.pyplot as pl ; plt=pl
93 |
# some friendly numbers: named floating point constants, each defined exactly once
zero,one,two,three,four,five=0e0,1e0,2e0,3e0,4e0,5e0
ten,hundred,thousand=1e1,1e2,1e3
annualize=252e0
half=one/two
GoldenRatio=(one+np.sqrt(five))/two
97 |
98 | # import arch classes
99 | from arch.univariate import ConstantMean,ARX,GeneralizedError,Normal,GARCH
100 | from arch.typing import Float64Array
101 |
102 | # change the constraints for GeneralizedError
class GeneralizedError2(GeneralizedError):
    """Generalized Error Distribution with the fitting range of nu widened to [0, 100].

    The stock bounds are replaced so that the fit can reach every feasible
    distribution (the GED is valid for all positive nu values).
    """

    def bounds(self,*args)->list[tuple[float,float]]:
        """Return the single (lower, upper) pair for nu; any arguments are ignored."""
        lower,upper=0e0,1e2
        return [(lower,upper)]

    def constraints(self,*args)->tuple[Float64Array,Float64Array]:
        """Return the same range as a pair of arrays (A, b); any arguments are ignored.

        The rows of A are +1 and -1 and b is (lower, -upper), which presumably
        encodes nu >= lower and -nu >= -upper under arch's A.x >= b convention.
        """
        (lower,upper),=self.bounds()
        a=np.array([[1], [-1]])
        b=np.array([lower,-upper])
        return a,b
111 |
112 | # change the constraints for GARCH
class GARCH2(GARCH):
    """GARCH with relaxed bounds and constraints, leading to models that don't bind on constraints."""

    def bounds(self,resids:Float64Array)->list[tuple[float,float]]:
        """Return relaxed parameter bounds.

        The first pair is scaled by the mean of |resids|**power; each of the
        p+o+q remaining coefficients is allowed to range over (-1, 2).
        """
        scale=float(np.mean(abs(resids)**self.power))
        n_coefficients=self.p+self.o+self.q
        return [(1e-8*scale,ten*scale)]+[(-one,two)]*n_coefficients

    def constraints(self,*args)->tuple[Float64Array,Float64Array]:
        """Return the parent's constraints with the limits for A and D etc. lowered to -1, permitting negative values."""
        a,b=super().constraints()
        stop=self.p+self.o+1 # entries 1 .. p+o of b are the ones overwritten
        b[1:stop]=-one
        return a,b
127 |
128 | # some special axis formatters for matplotlib
129 | from matplotlib.ticker import Formatter
130 |
class DirectionalLabels(Formatter):
    """Base class for matplotlib axis formatters that render positive, negative and zero values differently.

    Subclasses must set four attributes: `plus` and `minus` (format templates),
    `zero` (the literal text for zero) and `scale` (multiplier applied before formatting).
    """

    def __init__(self):
        """Refuse direct instantiation: this is an abstract base class."""
        raise NotImplementedError("DirectionalLabels is an abstract base class. You cannot instantiate it directly.")

    def __call__(self,datum,pos=None):
        """Render the provided number as a string; `pos` is accepted but unused."""
        if datum>0e0:
            return self.plus.format(datum*self.scale)

        if datum<0e0:
            # the magnitude is formatted, any sign decoration comes from the minus template
            return self.minus.format(-datum*self.scale)

        # zero, and anything that compares neither above nor below zero
        return self.zero
141 |
class PercentLabels(DirectionalLabels):
    """Excel style percent labels: negative values are shown in parentheses."""

    def __init__(self,precision=2,zero="0",scale=1e0):
        """Set the decimal precision, the string to use for zeros, and the scale (its absolute value is used)."""
        number="{:,.%df}" % precision # comma-grouped fixed point template
        self.plus=number+" %"
        self.minus="("+number+") %"
        self.zero=str(zero)
        self.scale=abs(scale)
150 |
class CurrencyLabels(DirectionalLabels):
    """Excel style currency labels for matplotlib axes: negative amounts are shown in parentheses."""

    def __init__(self,precision=2,zero="0",symbol="$",suffix="",scale=1e0):
        """Set the decimal precision, zero string, currency symbol, trailing suffix, and scale (its absolute value is used)."""
        amount="%s {:,.%df}%s" % (symbol,precision,suffix) # symbol, comma-grouped number, suffix
        self.plus=amount
        self.minus="("+amount+")"
        self.zero=str(zero)
        self.scale=abs(scale)
159 |
class CountLabels(DirectionalLabels):
    """Matplotlib formatter that renders whole numbers with comma grouping."""

    def __init__(self,zero="0",scale=1e0):
        """Set the string to use for zeros and the scale (its absolute value is used)."""
        # one template serves both directions
        self.plus=self.minus="{:,.0f}"
        self.zero=str(zero)
        self.scale=abs(scale)
168 |
169 | # loads index membership from Wikipedia
def _sp_members_and_last_change(url):
    """Fetch an S&P constituents page once; return (members table, date of the latest change as a string)."""
    tables=pd.read_html(url) # a single request supplies both the members and the changes tables
    members=tables[0].rename(columns={"Symbol":"Ticker"}).set_index("Ticker")
    changes=tables[1].set_index(("Date","Date"))

    # anything from the first '[' onwards (e.g. a footnote marker) is cut off before parsing the date
    changes.index=[pd.Period(x.split('[')[0],'D') for x in changes.index]
    return members,str(changes.index.max())

def _stack_members(first,second):
    """Stack two member tables on their shared columns, kept in the first table's order, sorted by ticker."""
    shared=[column for column in first.columns if column in second.columns]
    return pd.concat([first[shared],second[shared]]).sort_index()

def loadindex(indexname):
    """
    Load the specified index and return the members and the first date for data extraction.

    Supported names are 'S&P 500', 'NASDAQ-100', 'S&P MidCap 400', 'S&P SmallCap 600', 'S&P 900', 'S&P 1500',
    'Dow' and 'FTSE 250'. Membership is scraped from Wikipedia, so network access is needed, and each table
    scraped directly is also shown via `display()`. The composite indices ('S&P 900', 'S&P 1500') are built by
    loading their components and keeping only the columns the components share.

    Returns a tuple `(members,first_date)` where `members` is a DataFrame indexed by "Ticker" with repeated
    tickers dropped (the first occurrence is kept).

    Raises ValueError if the index name is not recognized.
    """
    if indexname=='S&P 500':
        index=pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0].rename(columns={"Symbol":"Ticker"}).set_index("Ticker")
        display(index)
        first_date=index['Date added'].max() # add date is in table returned

    elif indexname=='NASDAQ-100':
        index=pd.read_html('https://en.wikipedia.org/wiki/Nasdaq-100')[4].rename(columns={"Symbol":"Ticker"}).set_index("Ticker")
        display(index)
        first_date=datetime.now().strftime("%Y-01-02") # NASDAQ rebalances (normally) on the first day of the year. Jan'1st. is *always* a holiday

    elif indexname=='S&P MidCap 400':
        index,first_date=_sp_members_and_last_change('https://en.wikipedia.org/wiki/List_of_S%26P_400_companies')
        display(index)

    elif indexname=='S&P SmallCap 600':
        index,first_date=_sp_members_and_last_change('https://en.wikipedia.org/wiki/List_of_S%26P_600_companies')
        display(index)

    elif indexname=='S&P 900': # union of S&P 500 and S&P MidCap 400
        sp500,dt500=loadindex('S&P 500')
        sp400,dt400=loadindex('S&P MidCap 400')
        index=_stack_members(sp500,sp400)
        first_date=max([dt500,dt400])

    elif indexname=='S&P 1500': # union of S&P 500, S&P MidCap 400, and S&P SmallCap 600
        sp900,dt900=loadindex('S&P 900')
        sp600,dt600=loadindex('S&P SmallCap 600')
        index=_stack_members(sp900,sp600)
        first_date=max([dt900,dt600])

    elif indexname=='Dow': # Dow Jones
        index=pd.read_html('https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average')[1].rename(columns={"Symbol":"Ticker"}).set_index("Ticker")
        display(index)
        first_date=index['Date added'].max()

    elif indexname=='FTSE 250':
        index=pd.read_html('https://en.wikipedia.org/wiki/FTSE_250_Index')[3]
        index["Ticker"]=index["Ticker"].apply(lambda x:x+".L") # set to Reuter's style tickers
        index.set_index("Ticker",inplace=True)
        display(index)
        first_date=((pd.Period(datetime.now(),'Q')-1).asfreq('B')+1).strftime("%Y-%m-%d") # first date of current quarter

    else:
        raise ValueError("Don't know how to load members of %s Index!" % indexname)

    return index[~index.index.duplicated()],first_date # drop duplicates in case some exist
222 |
223 | # download data from FRED
224 | from requests import get
225 | from os import environ
226 | from getpass import getpass
227 |
def get_fred(series_id,FRED_API_KEY=None):
    """
    Download a series from the FRED API and return it as a DataFrame, together with the series metadata.

    If `FRED_API_KEY` is given it is stored in the environment variable of the same name; otherwise the
    environment value is used, and the user is prompted for a key when none is stored.

    Returns a tuple `(df,metadata)`: `df` has one column named after `series_id`, indexed by periods at the
    frequency reported in the metadata, with missing observations dropped; `metadata` is the first entry of
    the series record returned by the API.

    Raises ValueError if either request returns a status code outside the 2xx range.
    """
    if FRED_API_KEY is not None:
        environ['FRED_API_KEY']=FRED_API_KEY

    elif 'FRED_API_KEY' not in environ:
        environ['FRED_API_KEY']=getpass("You need to enter a FRED API key (your keys are stored here: https://fredaccount.stlouisfed.org/apikeys): ")

    def fetch(url):
        """Request the URL template filled in with the series id and key; return the decoded JSON."""
        response=get(url.format(series_id,environ['FRED_API_KEY']))

        if response.status_code//100!=2:
            # the unfilled template is reported, so the key never appears in the message
            raise ValueError("Get status_code={:d} from {:s}".format(response.status_code,url))

        return response.json()

    def to_number(text):
        """Convert an observation to float; a lone '.' stands for a missing value."""
        return np.nan if text=='.' else float(text)

    # observations first
    observations=fetch("https://api.stlouisfed.org/fred/series/observations?series_id={}&api_key={}&file_type=json")['observations']
    df=pd.DataFrame(observations)[['date','value']]
    df=df.rename(columns={"date":"Date","value":series_id}).set_index("Date")
    df[series_id]=df[series_id].apply(to_number)

    # then the description of the series, which supplies the frequency of the index
    metadata=fetch("https://api.stlouisfed.org/fred/series?series_id={}&api_key={}&file_type=json")['seriess'][0]
    df.index=pd.DatetimeIndex(df.index).to_period(metadata['frequency_short'])
    return df.dropna(),metadata
253 |
254 | # that's all folks
255 | nprint("Initialized.")
256 |
--------------------------------------------------------------------------------
/nlls.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from statsmodels.base.model import GenericLikelihoodModel
3 | from statsmodels.api import add_constant
4 | from scipy.stats import norm
5 | from scipy.optimize import Bounds
6 | from my_library import zero,one
7 |
class NLLS(GenericLikelihoodModel):
    """
    Fit a Non-Linear Least Squares model to data via MLE.

    This is a maximum likelihood regression to a Normal distribution with a non-linear mean process,
    with an added regressor being the standard deviation of errors. If you do not override the `.predict()`
    method it will actually use a linear mean process and so is equivalent to OLS, but slower! You can change
    the distribution used by supplying a pdf function that matches the scipy.stats framework.

    The code follows the usual statsmodels conventions for a model defined in terms of a dependent variable named `endog` and independent
    variables named `exog`, and with the `add_constant()` function expected if an intercept is required. If you do not supply any data for
    `exog` a constant column will be used. The system fits the standard deviation of the data (which is required by the method). You can supply
    any `scipy.stats` format univariate distribution via the `distribution` argument and any extra variables via `extra_params_names`. The
    regression is performed via the `scipy.optimize.minimize` function and (in my experience) can be a little brittle. You may need to supply
    user bounds for the latent variables if the default ones chosen are incorrect.
    """
    def __init__(self,endog,exog=None,distribution=norm,extra_params_names=None,**kwargs):
        """
        Initialize the object, setting helpful data, and then call the base constructor.

        `endog` must be one dimensional, otherwise NotImplementedError is raised. `exog` defaults to a column of
        ones. `distribution` is called as `distribution(*extra,loc=0,scale=sigma)`, and `extra_params_names`
        names its extra parameters (none when omitted). Other keyword arguments go to the base class.
        """
        # a small number
        self.epsilon=1e-7

        # check for univariate problem
        if (ndim:=np.ndim(endog))>1:
            raise NotImplementedError("Only univariate processes are supported, you supplied a %d-d array for endog." % ndim)

        # if no exogenous series, put in a series of ones for the constant
        if exog is None:
            exog=np.ones(len(endog))

        # add extra distribution process parameters, which are "hidden" from the exog vector input
        self.distribution=distribution
        self.latent_variables=['sigma']+list(extra_params_names or [])

        # initialize the base class with the added variables as "extra_params"
        super().__init__(endog,exog,extra_params_names=self.latent_variables,**kwargs)

        # adjust d.o.f., it seems that the code doesn't get this right as given by the examples online
        self.df_resid-=len(self.latent_variables)
        self.df_model=self.nparams-self.k_constant

    def _pick_params(self,params):
        """
        Helper function to divide the parameters into linear process, dispersion and other latent variable parameters.

        Returns `(beta,sigma,extra)`. As a side effect `params` is stored on the model as `self.params`, which is
        what `.predict()` falls back on when called without parameters.
        """
        assert len(params)==self.nparams

        # memorialize parameters
        self.params=params

        # the latent variables occupy the last slots: sigma first, then any extras
        n=len(self.latent_variables)

        if n<1:
            raise ValueError("The number of latent variables cannot be zero.")

        k=len(params)-n
        return params[:k],params[k],list(params[k+1:])

    def predict(self,exog=None,params=None):
        """
        Returns the mean process prediction for the given inputs.

        If exog or params are None then self.exog, and/or self.params, will be used respectively.
        If you want to do something other than OLS, you should override this method with some other
        formula. If you don't want to use the Normal Distribution for the error process you should
        supply some other distribution when you instantiate the class.
        """
        beta,sigma,extra=self._pick_params(params if params is not None else self.params)
        _exog=exog if exog is not None else self.exog
        mean=np.dot(_exog,beta) # if you want something other than OLS, change this formula

        return mean

    def nloglikeobs(self,params):
        """
        Returns a vector of negative log-likelihood values for each observation.

        Also stores the fitted mean as `self.mean` and the standardized residuals as `self.innovation`.
        """
        # get the parameters divided into those for the mean process and the latent variables
        beta,sigma,extra=self._pick_params(params)

        # compute the mean model; predict() picks up the parameters memorialized just above
        self.mean=self.predict()
        residual=self.endog-self.mean
        self.innovation=residual/sigma

        # set the distribution with the location set to zero as we specify the mean-model explicitly
        density=self.distribution(*extra,loc=zero,scale=sigma)

        # compute vector of negative log likelihood of vector of observations
        return -density.logpdf(residual)

    def fit(self,start_params=None,bounds=None,maxiter=1000,**kwargs):
        """
        Perform a regression using trust-constrained gradient minimization.

        NOTE: This code will try to guess values for `start_params` and `bounds` if you don't supply them. In particular,
        it will guess `(-np.inf,+np.inf)` for the limits of any latent variables added by the user. If this assumption is wrong,
        the regression may fail. In which case, user supplied values should be substituted. It's my experience that this regression
        method is "slow" and "fragile," but it is the required one.

        Returns the results object from the base class with two added attributes: `named_params`, a dictionary
        of the fitted parameters keyed by name, and `num_params`, its length. Raises ValueError if `start_params`
        or `bounds` do not have one entry per parameter.
        """
        n_exog=self.exog.shape[1]
        n_extra=len(self.latent_variables)-1

        # the regression problem parameters are defined HERE by the start_params vector, weird huh?
        if start_params is None:
            start_params=[zero]*n_exog+[self.endog.std()]+[one]*n_extra

            if self.k_constant:
                start_params[0]=self.endog.mean()

        # set the bounds: sigma must stay positive, everything else is unrestricted
        if bounds is None:
            bounds=[(-np.inf,np.inf)]*n_exog+[(self.epsilon,np.inf)]+[(-np.inf,np.inf)]*n_extra

        # check we set the right number of variables
        if len(start_params)!=self.nparams:
            raise ValueError("start_params has %d entries but the model has %d parameters." % (len(start_params),self.nparams))

        if len(bounds)!=self.nparams:
            raise ValueError("bounds has %d entries but the model has %d parameters." % (len(bounds),self.nparams))

        # now let the base class do the regression, specifying the trust-constraint method for scipy.optimize with bounds
        f=super().fit(
            start_params=start_params,
            method='minimize',
            min_method='trust-constr',
            bounds=Bounds(*zip(*bounds)), # transpose the (lower,upper) pairs into the two vectors Bounds expects
            maxiter=maxiter,
            **kwargs
        )

        # self.params holds whatever vector the likelihood was last evaluated at, which need not be the optimum,
        # so report the fitted values from the results object and leave the model memorialized at them
        self.params=f.params
        f.named_params=dict(zip(self.exog_names,f.params))
        f.num_params=len(f.named_params)
        return f
139 |
140 |
--------------------------------------------------------------------------------