├── .gitignore ├── Alphas ├── Filter_Rules.ipynb ├── Market_Return_by_Dates.ipynb ├── Market_Return_by_Payday.ipynb ├── Market_Return_by_Political_Party.ipynb ├── Retail_Crypto_Universe_Return_by_Meme_Coin.ipynb └── Retail_Crypto_Universe_Return_by_Ticker_Rank.ipynb ├── LICENSE ├── Miscellaneous ├── Continuous_Time_Finance.ipynb ├── Crypto_Degrees_of_Freedom.ipynb ├── Direction_Statistics.ipynb ├── Generating_Multivariate_Random_Numbers.ipynb ├── Grinold-Kahn Notes.md ├── Index_Pairwise_Correlations.ipynb ├── Jarque-Bera Tests.ipynb ├── Longer_Term_Crypto_Degrees_of_Freedom.ipynb ├── Normality Tests.ipynb └── S&P_500_Pairwise_Correlations.ipynb ├── README.md ├── The Market's Not Normal ├── Fundamental_Law.ipynb ├── Non_Stationarity_in_the_First_Two_Moments.ipynb ├── The_Autocorrelation_of_Variance.ipynb ├── The_Market's_Not_Normal.ipynb ├── The_Variance_is_Not_Stationary.ipynb ├── These_Two_Things_are_Not_the_Same.ipynb ├── my_library.py └── not_the_same.tex ├── my_library.py └── nlls.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Giller Investments (New Jersey), LLC 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to use 5 | the Software, including without limitation the rights to use, copy, modify, 6 | and distribute, and to permit persons to whom the Software is furnished to do so, 7 | subject to the following conditions: 8 | 9 | The above copyright notice and the following notices shall be included in all 10 | copies or substantial portions of the Software. 11 | 12 | PERMISSION TO COMMERCIALLY EXPLOIT THIS SOFTWARE BY SELLING OR LICENSING 13 | UNMODIFIED VERSIONS OF IT TO THIRD PARTIES IS EXPLICITLY WITHHELD. THE 14 | TERM "UNMODIFIED" SHALL EXPRESSLY INCLUDE ANY MODIFICATIONS THAT A REASONABLE 15 | PERSON WOULD REGARD AS "TRIVIAL" OR "SLIGHT," SUCH AS CHANGES TO FORMATTING, 16 | PRESENTATION, SPECIFIC CHOICES OF NON-CONSEQUENTIAL WORDS, PUNCTUATION, LAYOUT ETC. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | 26 | NOTHING IN THIS SOFTWARE, OR THE DOCUMENTATION ASSOCIATED WITH IT, SHOULD BE 27 | TAKEN AS REPRESENTING INVESTMENT ADVICE, EITHER EXPLICITLY OR IMPLICITLY. 28 | THE AUTHOR EXPRESSLY AND UNAMBIGUOUSLY WITHHOLDS ANY ENDORSEMENT OF THE USE 29 | OF THIS SOFTWARE AS PART OF A TRADING OR INVESTMENT STRATEGY. 
ALL INVESTMENTS 30 | ARE MADE AT THE SOLE RISK OF THE INVESTOR AND YOU SHOULD CONSULT A REGISTERED 31 | INVESTMENT PROFESSIONAL FOR ANY ADVICE REGARDING SPECIFIC INVESTMENTS OR 32 | INVESTMENT STRATEGIES PRIOR TO MAKING AN INVESTMENT OF ANY KIND. 33 | -------------------------------------------------------------------------------- /Miscellaneous/Grinold-Kahn Notes.md: -------------------------------------------------------------------------------- 1 | # Grinold-Kahn Notes 2 | 3 | For asset returns $r_{it}\ :\ i\in[1,N], t\in\mathbb{Z}^+$, define the matrix $G_N(\rho)$ as the correlation matrix of returns where all pairwise correlations are equal to $\rho$, i.e. 4 | 5 | $$ 6 | \mathbb{V}[r_{it},r_{jt}]=\sigma_{i}\sigma_{j}\rho \Leftrightarrow V=SG_N(\rho)S\ \mathrm{where}\ G_N(\rho)=\begin{pmatrix} 7 | 1&\rho&\cdots&\rho\\ 8 | \rho&1&\cdots&\rho\\ 9 | \vdots&&\ddots&\vdots\\ 10 | \rho&\rho&\cdots&1 11 | \end{pmatrix}\ \mathrm{and} 12 | \ S=\begin{pmatrix} 13 | \sigma_1&0&\cdots&0\\ 14 | 0&\sigma_2&\cdots&0\\ 15 | \vdots&&\ddots&\vdots\\ 16 | 0&0&\cdots&\sigma_N 17 | \end{pmatrix}. 18 | $$ 19 | 20 | As a symmetric matrix, $G_N(\rho)$ may always be diagonalized by a similarity transformation (and it is positive definite when $-1/(N-1)<\rho<1$). The eigenvalues are: 21 | 22 | 1. one eigenvalue of $1+(N-1)\rho$; and, 23 | 2. $N-1$ eigenvalues of magnitude $1-\rho$. 24 | 25 | and the associated eigenvectors are: 26 | 27 | 1. one eigenvector of all ones: $(1\,1\dots 1)^T=\mathbf{1}_N$ where $\mathbf{1}_N$ is the vector of ones of dimension $N$; and, 28 | 2. $N-1$ vectors of the form: $(1\,-1\,0 \dots 0)^T$, $(1\,0\,-1\,0 \dots 0)^T$ through $(1\,0 \dots 0\,-1)^T$.
29 | -------------------------------------------------------------------------------- /Miscellaneous/Jarque-Bera Tests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyN9PjJ29ioVj0KHDZQuwmOh", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "id": "s34EvR0_ax-c", 37 | "outputId": "783a5304-11b7-4494-c4a8-f60eba41ceea" 38 | }, 39 | "outputs": [ 40 | { 41 | "output_type": "stream", 42 | "name": "stdout", 43 | "text": [ 44 | "16:39:14 Starting...\n", 45 | "16:39:14 Installing yfinance into Google notebook...\n", 46 | "16:39:18 Installing arch into Google notebook...\n", 47 | "16:39:26 Initialized.\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "from my_library import *" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "source": [ 58 | "from scipy.stats import norm,gennorm,jarque_bera as jbtest\n", 59 | "from itertools import product\n", 60 | "\n", 61 | "results=pd.DataFrame()\n", 62 | "results.index=pd.MultiIndex.from_tuples(list(product(list(range(10,1001,1)),list(np.arange(one,four,0.1)))))\n", 63 | "results.index.names=[\"Sample\",\"Beta\"]\n", 64 | "\n", 65 | "for n,beta in results.index:\n", 66 | " sample=pd.Series(gennorm(beta,zero,one).rvs(n))\n", 67 | " results.loc[(n,beta),\"Mean\"]=sample.mean()\n", 68 | " results.loc[(n,beta),\"St.Dev.\"]=sample.var()\n", 69 | " results.loc[(n,beta),\"Skewness\"]=sample.skew()\n", 70 | " 
results.loc[(n,beta),\"Kurtosis\"]=sample.kurt()+three\n", 71 | " results.loc[(n,beta),\"JB Test\"]=jbtest(sample)[1]\n", 72 | "\n", 73 | "results[\"JB 5%\"]=results[\"JB Test\"]<0.05\n", 74 | "results[\"JB 1%\"]=results[\"JB Test\"]<0.01\n", 75 | "results[\"JB 0.1%\"]=results[\"JB Test\"]<0.001\n", 76 | "\n", 77 | "Z=(df:=results.reset_index()).pivot_table(index=\"Beta\",columns=\"Sample\",values=\"JB Test\").values\n", 78 | "X,Y=np.meshgrid(np.sort(df[\"Sample\"].unique()),np.sort(df[\"Beta\"].unique()))\n", 79 | "figure,plot=pl.subplots(figsize=(ten*GoldenRatio,ten))\n", 80 | "plot.pcolormesh(X,Y,Z,cmap='Oranges')\n", 81 | "plot.set_xscale('log')" 82 | ], 83 | "metadata": { 84 | "id": "6HqH-iEra29s", 85 | "outputId": "5c06f8ea-2496-4f38-8628-526eae659881", 86 | "colab": { 87 | "base_uri": "https://localhost:8080/", 88 | "height": 835 89 | } 90 | }, 91 | "execution_count": 14, 92 | "outputs": [ 93 | { 94 | "output_type": "display_data", 95 | "data": { 96 | "text/plain": [ 97 | "
" 98 | ], 99 | "image/png": "\n" 100 | }, 101 | "metadata": {} 102 | } 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "source": [ 108 | "figure,plot=pl.subplots(figsize=(ten*GoldenRatio,ten))\n", 109 | "plot.pcolormesh(X,Y,Z,cmap='terrain')\n", 110 | "plot.set_xscale('log')\n", 111 | "plot.xaxis.set_major_formatter(CountLabels(0))" 112 | ], 113 | "metadata": { 114 | "id": "FniA_fHbfy14", 115 | "outputId": "499ec5ae-e085-4b08-e209-dce1c6ae22db", 116 | "colab": { 117 | "base_uri": "https://localhost:8080/", 118 | "height": 830 119 | } 120 | }, 121 | "execution_count": 13, 122 | "outputs": [ 123 | { 124 | "output_type": "display_data", 125 | "data": { 126 | "text/plain": [ 127 | "
" 128 | ], 129 | "image/png": "\n" 130 | }, 131 | "metadata": {} 132 | } 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "source": [], 138 | "metadata": { 139 | "id": "cj0nyxfdoNjY" 140 | }, 141 | "execution_count": null, 142 | "outputs": [] 143 | } 144 | ] 145 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Financial Data Science in Python 2 | This GitHub repository 3 | https://github.com/Farmhouse121/Financial-Data-Science-in-Python 4 | collects the scripts and notebooks required to reproduce my published work. This includes both the articles that I have written in _Wilmott_ magazine and my forthcoming book, which will also be titled _Financial Data Science in Python_. 5 | 6 | I will be featuring _mostly_ notebooks prepared for the Google _colab_ system, although I **strongly** recommend using a more "procedural" workflow than most notebook users adopt. For analytical work it is important that the _internal state_ of the analytical system in use be _well known_ when an _inferential procedure_ is executed. Notebooks tend to encourage a "spaghetti" workflow that is not conducive to the internal state being well known. It is my intention that the notebooks, therefore, always be executed _from the beginning to the end_ in one session. Breaking the work into "cells" is provided for narrative convenience only. 7 | 8 | ## my_library.py 9 | A key file for all of my work will be the `my_library.py` _mini-package_. I write "mini" because it's simply a file, and it does not provide a novel software suite. It just imports most of the stuff I always import and provides a few, key, custom-written extensions and utilities. That file will exist here, in the top level of this repository, and every independent analysis will sit in a folder below.
When imaged onto a computer (with `git clone https://github.com/Farmhouse121/Financial-Data-Science-in-Python.git`, for example) each subfolder should _symlink_ to the parent version. On Unix systems, this is done via the command line with `ln -s ../my_library.py` from within the sub-folder, although `git` should copy all of those links automatically. (On Windows systems there is the `mklink my_library.py ..\my_library.py` command that can be used to execute this functionality in the shell, although I don't have a Windows machine and haven't tried that personally.) 10 | 11 | To run this file from within a Python script, just include the line 12 | ``` 13 | from my_library import * 14 | ``` 15 | at the beginning. This will produce an identical internal state to having pasted all of the included code into a single file or cell and then executed it. 16 | 17 | ### Changes and Idiosyncrasies 18 | All of the scripts are going to include usage of four "standard" packages, and three are imported wholly in `my_library.py`. 19 | 20 | ``` 21 | import numpy as np 22 | import pandas as pd 23 | import matplotlib.pyplot as pl ; plt=pl 24 | ``` 25 | 26 | I know that `pyplot` is usually imported as `plt`, not `pl` as I have, but I use it so much that having a two-letter namespace prefix is worthwhile to me and it's my computer, I get to choose the abstractions I like. For those who wish to follow the conventional usage, I have aliased it to `plt`. Feel free to use either prefix -- it's your computer -- and if you wish to "take back" the variable `pl`, Python will let you do that too. 27 | 28 | For the record, I am also going to invoke `matplotlib` _mostly_ via the construct 29 | 30 | ``` 31 | figure,plot=pl.subplots(figsize=(ten*GoldenRatio,ten)) 32 | ``` 33 | 34 | You will find the _friendly numbers_ `ten` and `GoldenRatio` defined to their correct values inside `my_library.py`.
My experience with numerical computation, over many years, in many languages and on many platforms, has taught me that typing one of `10`, `10.`, and `10e0` **will not guarantee** that you end up with the same IEEE floating point number stored in a CPU register on your computer. My assignment `ten=1e1` and usage as just `ten` guarantees that I will always get the same number. You may not find this particularly important immediately, but you will as soon as you try to represent `one/ten` in binary on any computer of your choice. 35 | 36 | Also, I'm using `figure,plot` not `fig,ax` in my code. I like _long descriptive names_ and these describe what I am making: a figure that contains a plot. (An axe is something that I use to split wood but, earlier in my career, it represented the bonds that a trading desk wanted to sell, occasionally _desperately_.) 37 | 38 | ### Statsmodels 39 | I will use `statsmodels` as my primary inference engine, but will not usually import the entire package into the namespace. I _really_ try to decrease namespace pollution wherever possible! Estimation is optimization of statistical models with inference about the presented results. The chain of actions -- modeling, estimation, inference -- represents empirical science. Mere optimization, such as that provided by `scikit-learn`, _without inference_ is not science. It is not sufficient to know which set of parameters, whether latent, explicit or "hyper," give the best performance of your system _in sample_; we also need to know, or have some indication of, whether the selected performance differs significantly from that accessible by chance. What `statsmodels` does is the _hard work_ of adding the metrics of statistical inference to the output of optimization. I value that greatly. 40 | 41 | ### arch 42 | I'm going to use Kevin Sheppard's `arch` package as well, with some modifications to lessen the bounds on various regressions.
This is an excellent and useful tool, although I feel Dr. Sheppard has erred in doing things like excluding the Laplace distribution via parametric constraints, etc. After a brief discussion with him, I follow his recommended path of deriving new versions of his classes `GeneralizedError` and `GARCH`, which I call `GeneralizedError2` and `GARCH2` (somewhat unoriginally). These are defined within `my_library.py`. 43 | 44 | ### scipy 45 | I'm going to use code from `scipy`, mostly `scipy.stats` and, occasionally, `scipy.optimize` directly (rather than via `statsmodels` or `arch`). I use the distributions and tests from `scipy.stats`, but often find that they have annoyingly short or annoyingly long names. Thus you will see things like: 46 | ``` 47 | from scipy.stats import t as density 48 | ``` 49 | and 50 | ``` 51 | from scipy.stats import ttest_1samp as ttest 52 | ``` 53 | **I** would much rather import only the functions I need, rather than the entire package, or give myself the ability to switch distributions easily, in later code... 54 | ``` 55 | #from scipy.stats import t as density 56 | from scipy.stats import norm as density 57 | ``` 58 | etc. 59 | 60 | ### nprint 61 | I find that, in notebooks, knowing _when_ code executed is so useful that attaching times to print statements is very useful. Thus `my_library.py` includes the following function definitions: 62 | ``` 63 | from datetime import datetime ; date_format,time_format="%Y-%m-%d","%H:%M:%S" ; datetime_format=date_format+" "+time_format 64 | 65 | def now(): 66 | """Quickly return the time.""" 67 | return datetime.now().strftime(time_format) 68 | 69 | def nprint(*args,**kwargs): 70 | """Decorate the print statement with the time.""" 71 | print(now(),*args,**kwargs) 72 | stdout.flush() 73 | ``` 74 | To use it, use `nprint` as you would `print`. e.g.
75 | ``` 76 | nprint("Hello world!") 77 | ``` 78 | should output: 79 | 80 | ![image](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/assets/469106/f111b0ec-57e8-4acf-b97d-0b838ee13170) 81 | 82 | ## The Articles 83 | This repository is going to include all of the code supporting the new book I am writing, _Financial Data Science in Python_. **But**, I am also going to be writing on Medium at [https://medium.com/@stattrader](https://medium.com/@stattrader). I will include links to each article _and_ the relevant folder within this repository below. Since this `README` is fairly long, it will also serve as the first article. 84 | 85 | | Title | GitHub | Medium | 86 | |-------|--------|--------| 87 | | Financial Data Science in Python | [README](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/main/README.md) | [Financial Data Science in Python](https://stattrader.medium.com/financial-data-science-in-python-ee66dab460cf) | 88 | | The Market's Not Normal: Part 1 | [The_Market's_Not_Normal.ipynb](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/2fe3ae6dc08dc80d2f5d0c38ba0562e01f1c7415/The%20Market's%20Not%20Normal/The_Market's_Not_Normal.ipynb)| [The Market's Not Normal, Part 1](https://medium.com/adventures-in-data-science/the-markets-not-normal-part-1-bbba8dad2807) | 89 | | The Market's Not Normal: Part 2| [These_Two_Things_are_Not_the_Same.ipynb](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/2fe3ae6dc08dc80d2f5d0c38ba0562e01f1c7415/The%20Market's%20Not%20Normal/These_Two_Things_are_Not_the_Same.ipynb) | [The Market's Not Normal: Part 2](https://medium.com/adventures-in-data-science/the-markets-not-normal-part-2-cf8c4060f6b4)| 90 | | Can Non-Stationarity Rescue the Normal Distribution?
| [Non_Stationarity_in_the_First_Two_Moments.ipynb](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/2fe3ae6dc08dc80d2f5d0c38ba0562e01f1c7415/The%20Market's%20Not%20Normal/Non_Stationarity_in_the_First_Two_Moments.ipynb) | [Can Non-Stationarity Rescue the Normal Distribution?](https://medium.com/adventures-in-data-science/can-non-stationarity-rescue-the-normal-distribution-4af9f708b26a)| 91 | | Let's Talk About Heteroskedasticity | [The_Variance_is_Not_Stationary.ipynb](https://github.com/Farmhouse121/Financial-Data-Science-in-Python/blob/main/The%20Market's%20Not%20Normal/The_Variance_is_Not_Stationary.ipynb) | [Let's Talk About Heteroskedasticity](https://medium.com/@stattrader/lets-talk-about-heteroskedasticity-f1443d628da0)| 92 | 93 | ## The Data 94 | I am going to use public domain data sources, which will be mostly _Yahoo! Finance_, accessed via the `yfinance` package, and the _Fred_ depository, operated by the Federal Reserve Bank of St. Louis with data downloaded directly via the web service they provide. This is mostly daily and slower cadence data. Most of what I will write about lives in that space. 95 | 96 | ## Making Proper Time Indices for Pandas 97 | It's been my experience that many codes return Pandas dataframes with a timestamp field for an axis but that the system is _not properly told_ that the data is, in fact, temporal in nature. (I'm looking at you **everybody** who uses textual dates as their timestamps!) This error can be remedied with the following construct, which you will see _extensively_ in my code: 98 | ``` 99 | df=pd.DataFrame(...) 100 | df.index=pd.DatetimeIndex(df.index).to_period('B') 101 | ``` 102 | which delivers an index of _business days_, for example. (I am grateful to [Alex De Castro](https://github.com/decastro-alex) for pointing out the existence of the `B` argument to me.) 
103 | 104 | ## $\LaTeX$ "Scratchpads" 105 | Since my writing contains _a lot_ of mathematics, which I generally render in $\LaTeX$ and then cut'n'paste into less civil document preparation systems, I've decided to add documents that include the _math mode code_ to generate the equations. These will not render as full $\LaTeX$ documents (as I am not writing the Medium articles in $\LaTeX$, I'm not going to go through the bother of preparing an analogue document for another format). If Medium supported $\LaTeX$ markup, in the way that GitHub does, then that would change. 106 | 107 | ## There Will be Many Commits 108 | I have learned from three decades doing scientific research & development work that it is very hard to predict which of the many edits to a script will be the _final_ one that makes it all work. In addition, I've learned that memorializing that "first working version" with a weighty editorial commit will be followed, a few minutes later, by an "oh yea, also" commit. So I don't really try, I commit when I think I've done something useful and don't shy away from committing frequently. This is particularly useful if one uses GitHub, _as I do_, to synchronize code between different physical locations (e.g. my desktop and an AWS server). This may be "bad practice," but it is my practice. 109 | 110 | ## Support 111 | I appreciate the many positive comments I receive regarding my work and my attempts to explain aspects of the scientific analysis of financial markets to people. If you would like to _directly_ support this work, you can _Buy me a Coffee_ via the link below. [![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/H2H7EC7Z5) 112 | 113 | You may also buy my books via Amazon and other booksellers.
114 | 115 | * [Adventures in Financial Data Science](https://medium.com/r/?url=https%3A%2F%2Famzn.to%2F3P66fyK) 116 | * [Essays on Trading Strategy](https://medium.com/r/?url=https%3A%2F%2Fwww.amazon.com%2FEssays-Trading-Strategy-Scientific-Finance%2Fdp%2F9811273812) 117 | -------------------------------------------------------------------------------- /The Market's Not Normal/my_library.py: -------------------------------------------------------------------------------- 1 | my_library.py -------------------------------------------------------------------------------- /The Market's Not Normal/not_the_same.tex: -------------------------------------------------------------------------------- 1 | This is a ``scratchpad'' to generate the \LaTeX equations used in my articles. It will not run as a stand-alone LaTeX source document. 2 | 3 | \DeclareMathOperator*{\argmax}{arg~max} 4 | \DeclareMathOperator*{\argmin}{arg~min} 5 | 6 | \begin{equation} 7 | \ln\mathbb{P}[\mathrm{model}|\mathrm{data}]=\ln\mathbb{P}[\mathrm{data}|\mathrm{model}]+\ln\mathbb{P}[\mathrm{model}]- 8 | \ln\mathbb{P}[\mathrm{data}] 9 | \end{equation} 10 | 11 | \begin{equation} 12 | \Rightarrow\argmax_{\boldsymbol{\theta}}\ln\mathbb{P}[\mathrm{model}(\boldsymbol{\theta})|\mathrm{data}]=\argmax_{\boldsymbol{\theta}}\mathbb{L}[\mathrm{data}|\boldsymbol{\theta}] 13 | \end{equation} 14 | 15 | \begin{align} 16 | \hat{\boldsymbol{\theta}}&=\argmin_{\boldsymbol{\theta}}\sum_i\Big|x_i-x(\boldsymbol{\theta})\Big|\\ 17 | \textit{or}\; 18 | \hat{\boldsymbol{\theta}}&=\argmin_{\boldsymbol{\theta}}\sum_i\Big(x_i-x(\boldsymbol{\theta})\Big)^2 19 | \end{align} 20 | 21 | \begin{align} 22 | y_i&=\alpha+\beta x_i+\varepsilon_i : \varepsilon_i\sim\mathcal{D}\\ 23 | \Rightarrow\;\varepsilon_i&=y_i-\alpha-\beta x_i 24 | \end{align} 25 | 26 | \begin{align} 27 | (\hat{\alpha},\hat{\beta})&=\argmax_{(\alpha,\beta)}\sum_i\ln f(\varepsilon_i)\\ 28 | &=\argmax_{(\alpha,\beta)}\sum_i\ln f(y_i-\alpha-\beta x_i) 29 | \end{align} 30 | 
31 | \begin{equation} 32 | f(\varepsilon,\boldsymbol{\theta})=g(\varepsilon)h(\boldsymbol{\theta})e^{-q\left|\frac{\varepsilon}{\sigma}\right|^\kappa}\;\mathrm{where}\;\{q,\sigma,\kappa\}\subseteq\boldsymbol{\theta} 33 | \end{equation} 34 | 35 | \begin{equation*} 36 | (\hat{\alpha},\hat{\beta})=\argmin_{(\alpha,\beta)}\left\{ 37 | \sum_{i=1}^Nq\left|\frac{y_i-\alpha-\beta x_i}{\sigma}\right|^\kappa 38 | -\sum_{i=1}^N\ln g(y_i-\alpha-\beta x_i)-N\,\ln h(\boldsymbol{\theta})\right\} 39 | \end{equation*} 40 | 41 | \begin{equation} 42 | f(y_i-\alpha-\beta x_i,\sigma)=\frac{e^{-\frac{1}{2}\left(\frac{y_i-\alpha-\beta x_i}{\sigma}\right)^2}}{\sqrt{2\pi}\sigma}\Rightarrow 43 | \begin{cases} 44 | q=\frac{1}{2}\\ 45 | \kappa=2\\ 46 | \ln g(y_i-\alpha-\beta x_i)=0\\ 47 | \ln h(\boldsymbol{\theta})=-\ln(\sqrt{2\pi}\sigma) 48 | \end{cases} 49 | \end{equation} 50 | 51 | \begin{equation} 52 | (\hat{\alpha},\hat{\beta})=\mathop{\mathrm{arg~min}}_{(\alpha,\beta)}\sum_{i=1}^N(y_i-\alpha-\beta x_i)^2 53 | \end{equation} 54 | 55 | \begin{equation} 56 | f(y_i-\alpha-\beta x_i,\sigma)=\frac{e^{-\left|\frac{y_i-\alpha-\beta x_i}{\sigma}\right|}}{2\sigma}\Rightarrow 57 | \begin{cases} 58 | q=1\\ 59 | \kappa=1\\ 60 | \ln g(y_i-\alpha-\beta x_i)=0\\ 61 | \ln h(\boldsymbol{\theta})=-\ln(2\sigma) 62 | \end{cases} 63 | \end{equation} 64 | 65 | \begin{equation} 66 | (\hat{\alpha},\hat{\beta})=\mathop{\mathrm{arg~min}}_{(\alpha,\beta)}\sum_{i=1}^N|y_i-\alpha-\beta x_i| 67 | \end{equation} 68 | 69 | \begin{align} 70 | \mathcal{D}=\mathrm{Normal}&\Leftrightarrow\hat{\alpha}= 71 | \mathop{\mathrm{mean}}\{x_i\}\\ 72 | \mathcal{D}=\mathrm{Laplace}&\Leftrightarrow\hat{\alpha}= 73 | \mathop{\mathrm{median}}\{x_i\} 74 | \end{align} 75 | 76 | -------------------------------------------------------------------------------- /my_library.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # this is designed to set up every 
# notebook or executable script properly with common resources
from sys import stdout, stderr, executable, version_info
assert version_info.major >= 3

from datetime import datetime

# strftime patterns shared by the helpers below
DATE_FORMAT, TIME_FORMAT = '%m/%d/%Y', '%H:%M:%S'
DATETIME_FORMAT = DATE_FORMAT + " " + TIME_FORMAT


def now(format=TIME_FORMAT):
    """Quickly return the current local time as a string rendered with `format`."""
    return datetime.now().strftime(format)


def today(format=DATE_FORMAT):
    """Quickly return the current local date as a string rendered with `format`."""
    return datetime.now().strftime(format)


def nprint(*args, **kwargs):
    """Decorate the print statement with the time.

    All positional and keyword arguments are forwarded to `print` after the
    time stamp; stdout is flushed afterwards so output appears immediately.
    """
    print(now(), *args, **kwargs)
    stdout.flush()


nprint("Starting...")

# allow code to use the SIGALARM functionality to interrupt itself in a controlled fashion
from signal import signal, SIGALRM, alarm


class Timeout(Exception):
    """Permits time limits."""

    def __str__(self):
        return "Timeout."


def sigalrm(x, y):
    """Handle SIGALRM by raising a Timeout exception.

    The two arguments are the ones the `signal` module passes to every handler
    and are not used.
    """
    raise Timeout


signal(SIGALRM, sigalrm)  # register the handler


# allow deep breaks, the python break syntax can only break out one level
class Break(Exception):
    """Permits deep breaks."""

    def __str__(self):
        return "Break"


# passive wrapper for with clauses for objects that don't provide __enter__ and __exit__
class With:
    """Dummy for with clauses.

    Wraps an arbitrary object so it can be used in a `with` statement; entering
    returns the wrapper itself, exiting does nothing, and calling the wrapper
    returns the wrapped object.
    """

    def __init__(self, object=None):
        self.object = object

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # nothing to release; returning None lets exceptions propagate
        pass

    def __call__(self):
        return self.object

    def __str__(self):
        return str(self.object)

    def __repr__(self):
        return repr(self.object)


# deal with some Google colab specific stuff
try:
    from IPython import get_ipython
    ip = get_ipython()

except ModuleNotFoundError:
    # if IPython not installed, we're definitely not in a notebook
    ip = None

if ip is not None and 'google' in str(ip):
    for package in 'yfinance', 'arch':
        nprint("Installing %s into Google notebook..." % package)
        ip.system("pip install --upgrade %s 1>/dev/null" % package)

# Only use the widget based progress bar when an IPython shell actually exists.
# IPython merely being importable (ip is None) means we are running as a plain
# script, where the console progress bar is the right one.
if ip is not None:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from warnings import filterwarnings
filterwarnings("ignore", category=RuntimeWarning)  # I don't care

# yfinance - download function
from yfinance import download

# the usual suspects
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
plt = pl  # both spellings are available to callers

# some friendly numbers
zero, one, two, three, four, five, ten, hundred, annualize, thousand = \
    0e0, 1e0, 2e0, 3e0, 4e0, 5e0, 1e1, 1e2, 252e0, 1e3
half, GoldenRatio = one / two, (one + np.sqrt(five)) / two

# import arch classes
from arch.univariate import ConstantMean, ARX, GeneralizedError, Normal, GARCH
from arch.typing import Float64Array


# change the constraints for GeneralizedError
class GeneralizedError2(GeneralizedError):
    """Version of the GED with bounds on fit adjusted to include all feasible distributions (GED is valid for all positive nu values)."""

    def bounds(self, *args) -> list[tuple[float, float]]:
        """Revised bounds: a single (lower, upper) pair of (0, 100)."""
        return [(0e0, 1e2)]

    def constraints(self, *args) -> tuple[Float64Array, Float64Array]:
        """Return the constraint arrays built from the lower and upper values of `bounds()`."""
        lower, upper = self.bounds()[0]
        return np.array([[1], [-1]]), np.array([lower, -upper])


# change the constraints for GARCH
class GARCH2(GARCH):
    """Version of GARCH with constraints modified to be more relaxed, leads to models that don't bind on constraints."""

    def bounds(self, resids: Float64Array) -> list[tuple[float, float]]:
        """Modify bounds to be more relaxed.

        The first pair scales with the mean of |resids|**power; each of the
        remaining p+o+q parameters is allowed to lie in (-1, 2).
        """
        v = float(np.mean(abs(resids) ** self.power))
        bounds = [(1e-8 * v, ten * v)]
        bounds.extend([(-one, two)] * (self.p + self.o + self.q))
        return bounds

    def constraints(self, *args) -> tuple[Float64Array, Float64Array]:
        """Modify constraints to permit negative values for A and D etc.

        Takes the parent's constraint arrays and sets entries 1..p+o of the
        second array to -1.
        """
        a, b = super(GARCH2, self).constraints()
        b[1:(self.p + self.o + 1)] = -one
        return a, b


# some special axis formatters for matplotlib
from matplotlib.ticker import Formatter


class DirectionalLabels(Formatter):
    """Base class to provide directional formats for matplotlib axes.

    Subclasses must set `plus`, `minus` (format strings), `zero` (literal
    string) and `scale` (multiplier applied before formatting).
    """

    def __init__(self):
        """Abstract base class."""
        raise NotImplementedError("DirectionalLabels is an abstract base class. You cannot instantiate it directly.")

    def __call__(self, datum, pos=None):
        """Render the provided number as a string.

        Positive values use `plus`, negative values use `minus` applied to the
        magnitude, and anything else (including exactly zero) yields `zero`.
        """
        if datum > 0e0:
            return self.plus.format(datum * self.scale)

        if datum < 0e0:
            return self.minus.format(-datum * self.scale)

        return self.zero


class PercentLabels(DirectionalLabels):
    """Output Excel style percent labels."""

    def __init__(self, precision=2, zero="0", scale=1e0):
        """Set decimal precision and string to use for zeros."""
        self.plus = "{:,.%df} %%" % precision
        self.minus = "({:,.%df}) %%" % precision  # negatives are shown in parentheses
        self.zero = str(zero)
        self.scale = abs(scale)


class CurrencyLabels(DirectionalLabels):
    """Matplotlib formatter to provide Excel type currency formats for axes."""

    def __init__(self, precision=2, zero="0", symbol="$", suffix="", scale=1e0):
        """Set decimal precision and string to use for zeros."""
        self.plus = "%s {:,.%df}%s" % (symbol, precision, suffix)
        self.minus = "(%s {:,.%df}%s)" % (symbol, precision, suffix)  # negatives in parentheses
        self.zero = str(zero)
        self.scale = abs(scale)


class CountLabels(DirectionalLabels):
    """Matplotlib formatter to provide integers with commas."""

    def __init__(self, zero="0", scale=1e0):
        """Integers with commas."""
        self.plus = "{:,.0f}"
        self.minus = self.plus  # the magnitude is printed without a sign for negatives
        self.zero = str(zero)
        self.scale = abs(scale)


# loads
# index membership from Wikipedia
def loadindex(indexname):
    """Load the specified index and return the members and the first date for data extraction.

    Supported names are 'S&P 500', 'NASDAQ-100', 'S&P MidCap 400',
    'S&P SmallCap 600', 'S&P 900', 'S&P 1500', 'Dow' and 'FTSE 250'.

    Returns a tuple `(index, first_date)` where `index` is the membership table
    indexed by "Ticker" with duplicated tickers dropped, and `first_date` is the
    value computed for that index (its exact type depends on the branch taken).

    Raises ValueError for an unknown index name.
    """
    # `display` is only predefined inside IPython sessions; fall back to print so
    # that the function can also be used from a plain script
    try:
        from IPython.display import display

    except ModuleNotFoundError:
        display = print

    # pages where table 0 holds the members and table 1 the list of changes
    pages_with_changes = {
        'S&P MidCap 400': 'https://en.wikipedia.org/wiki/List_of_S%26P_400_companies',
        'S&P SmallCap 600': 'https://en.wikipedia.org/wiki/List_of_S%26P_600_companies',
    }

    if indexname == 'S&P 500':
        index = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0].rename(columns={"Symbol": "Ticker"}).set_index("Ticker")
        display(index)
        first_date = index['Date added'].max()  # add data is in table returned

    elif indexname == 'NASDAQ-100':
        index = pd.read_html('https://en.wikipedia.org/wiki/Nasdaq-100')[4].rename(columns={"Symbol": "Ticker"}).set_index("Ticker")
        display(index)
        first_date = datetime.now().strftime("%Y-01-02")  # NASDAQ rebalances (normally) on the first day of the year. Jan'1st. is *always* a holiday

    elif indexname in pages_with_changes:
        tables = pd.read_html(pages_with_changes[indexname])  # download the page once for both tables
        index = tables[0].rename(columns={"Symbol": "Ticker"}).set_index("Ticker")
        display(index)
        updates = tables[1].set_index(("Date", "Date"))
        # drop any "[footnote]" suffix from the date text before parsing it
        updates.index = [pd.Period(x.split('[')[0], 'D') for x in updates.index]
        first_date = str(updates.index.max())

    elif indexname == 'S&P 900':  # union of S&P 500 and S&P MidCap 400
        sp500, dt500 = loadindex('S&P 500')
        sp400, dt400 = loadindex('S&P MidCap 400')
        columns = list(set(sp500.columns).intersection(set(sp400.columns)))
        index = pd.concat([sp500[columns], sp400[columns]]).sort_index()
        first_date = max([dt500, dt400])

    elif indexname == 'S&P 1500':  # union of S&P 500, S&P MidCap 400, and S&P SmallCap 600
        sp900, dt900 = loadindex('S&P 900')
        sp600, dt600 = loadindex('S&P SmallCap 600')
        columns = list(set(sp900.columns).intersection(set(sp600.columns)))
        index = pd.concat([sp900[columns], sp600[columns]]).sort_index()
        first_date = max([dt900, dt600])

    elif indexname == 'Dow':  # Dow Jones
        index = pd.read_html('https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average')[1].rename(columns={"Symbol": "Ticker"}).set_index("Ticker")
        display(index)
        first_date = index['Date added'].max()

    elif indexname == 'FTSE 250':
        index = pd.read_html('https://en.wikipedia.org/wiki/FTSE_250_Index')[3]
        index["Ticker"] = index["Ticker"].apply(lambda x: x + ".L")  # set to Reuter's style tickers
        index.set_index("Ticker", inplace=True)
        display(index)
        first_date = ((pd.Period(datetime.now(), 'Q') - 1).asfreq('B') + 1).strftime("%Y-%m-%d")  # first date of current quarter

    else:
        raise ValueError("Don't know how to load members of %s Index!" % indexname)

    return index[~index.index.duplicated()], first_date  # drop duplicates in case some exist


# download data from FRED
from requests import get
from os import environ
from getpass import getpass


def _fred_json(url, series_id, timeout):
    """Fetch one FRED endpoint and return the decoded JSON, raising ValueError on a non-2xx status."""
    response = get(url.format(series_id, environ['FRED_API_KEY']), timeout=timeout)

    if response.status_code // 100 != 2:
        # report the url *template* so that the API key is not written into the message
        raise ValueError("Get status_code={:d} from {:s}".format(response.status_code, url))

    return response.json()


def get_fred(series_id, FRED_API_KEY=None, timeout=30e0):
    """Function to get data from FRED API and return it as a DataFrame, also returns metadata object.

    Parameters:
        series_id: the FRED series identifier; it also becomes the name of the data column.
        FRED_API_KEY: optional API key. When given it is stored in
            `environ['FRED_API_KEY']`; when omitted and the environment variable
            is not set, the key is requested interactively with `getpass`.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        `(df, metadata)` where `df` is indexed by a period index named "Date"
        with missing observations dropped, and `metadata` is the first entry of
        the 'seriess' list returned by the series endpoint.

    Raises:
        ValueError if either request returns a status code outside 2xx.
    """
    if FRED_API_KEY is not None:
        environ['FRED_API_KEY'] = FRED_API_KEY

    elif 'FRED_API_KEY' not in environ:
        environ['FRED_API_KEY'] = getpass("You need to enter a FRED API key (your keys are stored here: https://fredaccount.stlouisfed.org/apikeys): ")

    # the observations themselves
    payload = _fred_json("https://api.stlouisfed.org/fred/series/observations?series_id={}&api_key={}&file_type=json", series_id, timeout)
    df = pd.DataFrame.from_dict(pd.json_normalize(payload)['observations'][0])[['date', 'value']].rename(columns={"date": "Date", "value": series_id}).set_index("Date")
    df[series_id] = df[series_id].apply(lambda x: float(x) if x != '.' else np.nan)  # '.' marks a missing value

    # the description of the series, needed for the observation frequency
    metadata = _fred_json("https://api.stlouisfed.org/fred/series?series_id={}&api_key={}&file_type=json", series_id, timeout)['seriess'][0]
    df.index = pd.DatetimeIndex(df.index).to_period(metadata['frequency_short'])
    return df.dropna(), metadata


# that's all folks
nprint("Initialized.")

# --------------------------------------------------------------------------------
# /nlls.py:
# --------------------------------------------------------------------------------
import numpy as np
from statsmodels.base.model import GenericLikelihoodModel
from statsmodels.api import add_constant
from scipy.stats import norm
from scipy.optimize import Bounds
from my_library import zero, one


class NLLS(GenericLikelihoodModel):
    """
    Fit a Non-Linear Least Squares model to data via MLE.

    This is a maximum likelihood regression to a Normal distribution with a non-linear mean process,
    with an added regressor being the standard deviation of errors. If you do not override the `.predict()`
    method it will actually use a linear mean process and so is equivalent to OLS, but slower! You can change
    the distribution used by supplying a pdf function that matches the scipy.stats framework.

    The code follows the usual statsmodels conventions for a model defined in terms of a dependent variable named `endog` and independent
    variables named `exog`, and with the `add_constant()` function expected if an intercept is required. If you do not supply any data for
    `exog` a constant column will be used. The system fits the standard deviation of the data (which is required by the method). You can supply
    any `scipy.stats` format univariate distribution via the `distribution` argument and any extra variables via `extra_params_names`. The
    regression is performed via the `scipy.optimize.minimize` function and (in my experience) can be a little brittle. You may need to supply
    user bounds for the latent variables if the default ones chosen are incorrect.
    """

    def __init__(self, endog, exog=None, distribution=norm, extra_params_names=None, **kwargs):
        """Initialize the object, setting helpful data, and then call the base constructor.

        `extra_params_names` is an optional sequence of names for additional
        distribution parameters; 'sigma' is always added in front of them.
        Raises NotImplementedError when `endog` has more than one dimension.
        """
        # a small number, used as the lower bound for sigma in fit()
        self.epsilon = 1e-7

        # check for univariate problem
        if len(endog.shape) > 1:
            raise NotImplementedError("Only univariate processes are supported, you supplied a %d-d array for endog." % len(endog.shape))

        # if no exogenous series, put in a series of ones for the constant
        if exog is None:
            exog = np.ones(endog.shape[0])

        # add extra distribution process parameters, which are "hidden" from the exog vector input
        self.distribution = distribution
        self.latent_variables = ['sigma'] + list(extra_params_names or [])

        # initialize the base class with the added variables as "extra_params"
        super(NLLS, self).__init__(endog, exog, extra_params_names=self.latent_variables, **kwargs)

        # adjust d.o.f., it seems that the code doesn't get this right as given by the examples online
        self.df_resid -= len(self.latent_variables)
        self.df_model = self.nparams - self.k_constant

    def _pick_params(self, params):
        """Helper function to divide the parameters into linear process, dispersion and other latent variable parameters.

        Returns `(beta, sigma, extra)`; `extra` is a (possibly empty) list.
        As a side effect `params` is stored in `self.params`.
        """
        assert len(params) == self.nparams

        # memorialize parameters
        self.params = params

        # pick up parameters from the right slots
        n = len(self.latent_variables)

        if n < 1:
            raise ValueError("The number of latent variables cannot be zero.")

        beta = params[:-n]
        sigma = params[-n]
        # a positive start index is used so that n==1 gives an empty list
        extra = list(params[len(params) - n + 1:])

        return beta, sigma, extra

    def predict(self, exog=None, params=None):
        """
        Returns the mean process prediction for the given inputs.

        If exog or params are None then self.exog, and/or self.params, will be used respectively.
        If you want to do something other than OLS, you should override this method with some other
        formula. If you don't want to use the Normal Distribution for the error process you should
        supply some other distribution when you instantiate the class.
        """
        beta, sigma, extra = self._pick_params(params if params is not None else self.params)
        _exog = exog if exog is not None else self.exog
        mean = np.dot(_exog, beta)  # if you want something other than OLS, change this formula

        return mean

    def nloglikeobs(self, params):
        """Returns a vector of negative log-likelihood values for each observation."""
        # get the parameters divided into those for the mean process and the latent variables
        beta, sigma, extra = self._pick_params(params)

        # compute the mean model; predict() picks up the parameters that
        # _pick_params has just stored in self.params
        self.mean = self.predict()
        self.innovation = (self.endog - self.mean) / sigma

        # set the distribution with the location set to zero as we specify the mean-model explicitly
        density = self.distribution(*extra, loc=zero, scale=sigma)

        # compute vector of negative log likelihood of vector of observations
        return -density.logpdf(self.endog - self.mean)

    def fit(self, start_params=None, bounds=None, maxiter=1000, **kwargs):
        """
        Perform a regression using trust-constrained gradient minimization.

        NOTE: This code will try to guess values for `start_params` and `bounds` if you don't supply them. In particular,
        it will guess `(-np.inf,+np.inf)` for the limits of any latent variables added by the user. If this assumption is wrong,
        the regression may fail. In which case, user supplied values should be substituted. It's my experience that this regression
        method is "slow" and "fragile," but it is the required one.

        Returns the results object produced by the base class, with two extra
        attributes: `named_params` (parameter name -> fitted value) and
        `num_params` (the number of entries in `named_params`).
        """
        # the regression problem parameters are defined HERE by the start_params vector, weird huh?
        if start_params is None:
            start_params = [zero] * self.exog.shape[1] + [self.endog.std()] + [one] * (len(self.latent_variables) - 1)

            # only guessed values are adjusted; user supplied ones are left alone
            if self.k_constant:
                start_params[0] = self.endog.mean()

        # set the bounds
        if bounds is None:
            bounds = [(-np.inf, np.inf)] * self.exog.shape[1] + [(self.epsilon, np.inf)] + [(-np.inf, np.inf)] * (len(self.latent_variables) - 1)

        # check we set the right number of variables
        assert len(start_params) == self.nparams
        assert len(bounds) == self.nparams

        # now let the base class do the regression, specifying the trust-constraint method for scipy.optimize with bounds
        f = super(NLLS, self).fit(
            start_params=start_params,
            method='minimize',
            min_method='trust-constr',
            bounds=Bounds(*list(zip(*bounds))),  # list of (low, high) pairs -> (lows, highs)
            maxiter=maxiter,
            **kwargs
        )

        # label the *fitted* values; self.params only holds the vector most recently
        # passed to _pick_params, which need not be the optimum
        f.named_params = dict(zip(self.exog_names, f.params))
        f.num_params = len(f.named_params)
        return f

# --------------------------------------------------------------------------------