├── .github └── workflows │ ├── pylint.yml │ └── python-package.yml ├── .gitignore ├── .pylintrc ├── .readthedocs.yml ├── CITATION.cff ├── Demo.ipynb ├── LICENSE.txt ├── MANIFEST.in ├── README-pypi.md ├── README.md ├── __init__.py ├── benford ├── __init__.py ├── benford.py ├── checks.py ├── constants.py ├── expected.py ├── reports.py ├── stats.py ├── utils.py └── viz.py ├── data └── SPY.csv ├── docs ├── Makefile ├── build │ ├── doctrees │ │ ├── api.doctree │ │ ├── benford.doctree │ │ ├── environment.pickle │ │ ├── index.doctree │ │ └── modules.doctree │ └── html │ │ ├── .buildinfo │ │ ├── _modules │ │ ├── benford │ │ │ ├── benford.html │ │ │ ├── expected.html │ │ │ ├── stats.html │ │ │ ├── utils.html │ │ │ └── viz.html │ │ └── index.html │ │ ├── _sources │ │ ├── api.rst.txt │ │ ├── index.rst.txt │ │ └── modules.rst.txt │ │ ├── api.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── modules.html │ │ ├── objects.inv │ │ ├── py-modindex.html │ │ ├── search.html │ │ └── searchindex.js ├── make.bat ├── requirements.txt └── source │ ├── api.rst │ ├── conf.py │ ├── index.rst │ └── modules.rst ├── img ├── 2429_Benford-Frank.jpg ├── Benford_Instance.png ├── First.png ├── First_Digits.png ├── SPY-f2d-conf_level-95.png ├── Simon_Newcomb_APS.jpg └── formula.png ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── test_checks.py ├── test_expected.py ├── test_stats.py └── test_utils.py /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 3.8 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: 3.8 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install pylint numpy pandas matplotlib 20 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 21 | - name: 
Analysing the code with pylint 22 | run: | 23 | pylint `ls -R|grep .py$|xargs` 24 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: benford_py 5 | 6 | on: 7 | push: 8 | branches: [ develop ] 9 | pull_request: 10 | branches: [ develop ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [3.6, 3.7, 3.8, 3.9] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest numpy pandas matplotlib 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | pytest 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 
2 | *.pyc 3 | 4 | __pycache__/ 5 | 6 | # ipython notebook checkpoints 7 | *.ipynb_checkpoints 8 | 9 | # text editor backups 10 | *~ 11 | 12 | # VS Code 13 | .vscode/ 14 | 15 | # Jupyter NB Checkpoints 16 | .ipynb_checkpoints/ 17 | 18 | # Setuptools distribution folder. 19 | /dist/ 20 | /build/ 21 | 22 | # Python egg metadata, regenerated from source files by setuptools. 23 | /*.egg-info 24 | 25 | # Sphinx docs rendered files 26 | # /docs/build/ 27 | _build 28 | _static 29 | _templates 30 | 31 | # pytest 32 | .pytest_cache/ 33 | #VSCode 34 | .vscode/ 35 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | disable= 3 | F0001, # No module named XXXXXXXX 4 | 5 | 6 | 7 | ignored-classes=SQLObject,Registrant,scoped_session 8 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Milcent" 5 | given-names: "Marcel" 6 | orcid: 7 | title: "Benford_py: a Python Implementation of Benford's Law Tests" 8 | version: 0.5.0 9 | doi: 10 | date-released: 2017 11 | url: "https://github.com/milcent/benford_py" -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2014-2021, Marcel Milcent. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | * Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include README-pypi.md 3 | include LICENSE.txt -------------------------------------------------------------------------------- /README-pypi.md: -------------------------------------------------------------------------------- 1 | [![Downloads](https://pepy.tech/badge/benford-py)](https://pepy.tech/project/benford-py) 2 | 3 | # Benford for Python 4 | 5 | -------------------------------------------------------------------------------- 6 | 7 | **Citing** 8 | 9 | 10 | If you find *Benford_py* useful in your research, please consider adding the following citation: 11 | 12 | ```bibtex 13 | @misc{benford_py, 14 | author = {Milcent, Marcel}, 15 | title = {{Benford_py: a Python Implementation of Benford's Law Tests}}, 16 | year = {2017}, 17 | publisher = {GitHub}, 18 | journal = {GitHub repository}, 19 | howpublished = {\url{https://github.com/milcent/benford_py}}, 20 | } 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- 24 | 25 | `current version = 0.5.0` 26 | 27 | ### See [release notes](https://github.com/milcent/benford_py/releases/) for features in this and in older versions 28 | 29 | ### Python versions >= 3.6 30 | 31 | ### Installation 32 | 33 | Benford_py is a package in PyPi, so you 
can install with pip: 34 | 35 | `pip install benford_py` 36 | 37 | or 38 | 39 | `pip install benford-py` 40 | 41 | Or you can cd into the site-packages subfolder of your python distribution (or environment) and git clone from there: 42 | 43 | `git clone https://github.com/milcent/benford_py` 44 | 45 | For a quick start, please go to the [Demo notebook](https://github.com/milcent/benford_py/blob/master/Demo.ipynb), in which I show examples of how to run the tests with the SPY (S&P 500 ETF) daily returns. 46 | 47 | For more fine-grained details of the functions and classes, see the [docs](https://benford-py.readthedocs.io/en/latest/index.html). 48 | 49 | ### Background 50 | 51 | The first digit of a number is [its leftmost digit](https://github.com/milcent/benford_py/blob/master/img/First_Digits.png) 52 | 53 | Since the first digit of any number can range from "1" to "9" 54 | (not considering "0"), it would be intuitively expected that the 55 | proportion of each occurrence in a set of numerical records would 56 | be uniformly distributed at 1/9, i.e., approximately 0.1111, 57 | or 11.11%. 58 | 59 | [Benford's Law](https://en.wikipedia.org/wiki/Benford%27s_law), 60 | also known as the Law of First Digits or the Phenomenon of 61 | Significant Digits, is the finding that the first digits of the 62 | numbers found in series of records of the most varied sources do 63 | not display a uniform distribution, but rather are arranged in such 64 | a way that the digit "1" is the most frequent, followed by "2", 65 | "3", and so on, in a successive and decremental way down to "9", 66 | which presents the lowest frequency as the first digit. 
67 | 68 | The expected distributions of the First Digits in a 69 | Benford-compliant data set are the ones shown [here](https://github.com/milcent/benford_py/blob/master/img/First.png) 70 | 71 | The first record on the subject dates from 1881, in the work of 72 | [Simon Newcomb](https://github.com/milcent/benford_py/blob/master/img/Simon_Newcomb_APS.jpg), an American-Canadian astronomer and mathematician, 73 | who noted that in the logarithmic tables the first pages, which 74 | contained logarithms beginning with the numerals "1" and "2", 75 | were more worn out, that is, more consulted. 76 | 77 | In that same article, Newcomb proposed the [formula](https://github.com/milcent/benford_py/blob/master/img/formula.png) for the probability of a certain digit "d" 78 | being the first digit of a number, given by the following equation. 79 | 80 | In 1938, the American physicist [Frank Benford](https://github.com/milcent/benford_py/blob/master/img/2429_Benford-Frank.jpg) revisited the 81 | phenomenon, which he called the "Law of Anomalous Numbers," in 82 | a survey with more than 20,000 observations of empirical data 83 | compiled from various sources, ranging from areas of rivers to 84 | molecular weights of chemical compounds, including cost data, 85 | address numbers, population sizes and physical constants. All 86 | of them, to a greater or lesser extent, followed such 87 | distribution. 88 | 89 | The extent of Benford's work seems to have been one good reason 90 | for the phenomenon to be popularized with his name, though 91 | described by Newcomb 57 years earlier. 92 | 93 | Derivations of the original formula were also applied in the 94 | expected findings of the proportions of digits in other 95 | positions in the number, as in the case of the second digit 96 | (BENFORD, 1938), as well as combinations, such as the first 97 | two digits of a number (NIGRINI, 2012, p.5). 98 | 99 | Only in 1995, however, was the phenomenon proven by Hill. 
100 | His proof was based on the fact that numbers in data series 101 | following the Benford Law are, in effect, "second generation" 102 | distributions, i.e., combinations of other distributions. 103 | The union of randomly drawn samples from various distributions 104 | forms a distribution that respects Benford's Law (HILL, 1995). 105 | 106 | When grouped in ascending order, data that obey Benford's Law 107 | must approximate a geometric sequence (NIGRINI, 2012, page 21). 108 | From this it follows that the logarithms of this ordered series 109 | must form a straight line. In addition, the mantissas (decimal 110 | parts) of the logarithms of these numbers must be uniformly 111 | distributed in the interval [0,1] (NIGRINI, 2012, p.10). 112 | 113 | In general, a series of numerical records follows Benford's Law 114 | when (NIGRINI, 2012, p.21): 115 | * it represents magnitudes of facts or events, such as populations 116 | of cities, flows of water in rivers or sizes of celestial bodies; 117 | * it does not have pre-established minimum or maximum limits; 118 | * it is not made up of numbers used as identifiers, such as 119 | identity or social security numbers, bank accounts, telephone numbers; and 120 | * its mean is greater than the median, and the data is not 121 | concentrated around the mean. 122 | 123 | It follows from this expected distribution that, if the set of 124 | numbers in a series of records that usually respects the Law 125 | shows a deviation in the proportions found, there may be 126 | distortions, whether intentional or not. 127 | 128 | Benford's Law has been used in [several fields](http://www.benfordonline.net/). 129 | After asserting that the usual data type is Benford-compliant, 130 | one can study samples from the same data type in search of 131 | inconsistencies, errors or even [fraud](https://www.amazon.com.br/Benfords-Law-Applications-Accounting-Detection/dp/1118152859). 
132 | 133 | This open source module is an attempt to facilitate the 134 | performance of Benford's Law-related tests by people using 135 | Python, whether interactively or in an automated, scripting way. 136 | 137 | It uses the versatility of numpy and pandas, along with 138 | matplotlib for visualization, to deliver results like [this one](https://github.com/milcent/benford_py/blob/master/img/SPY-f2d-conf_level-95.png) and much more. 139 | 140 | 141 | It has been a long time since I last tested it in Python 2. The death clock has stopped ticking, so officially it is for Python 3 now. It should work on Linux, Windows and Mac, but please file a bug report if you run into some trouble. 142 | 143 | Also, if you have some nice data set that we can run these tests on, let's try it. 144 | 145 | Thanks! 146 | 147 | Milcent 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Downloads](https://pepy.tech/badge/benford-py)](https://pepy.tech/project/benford-py) 2 | 3 | # Benford for Python 4 | 5 | -------------------------------------------------------------------------------- 6 | 7 | **Citing** 8 | 9 | 10 | If you find *Benford_py* useful in your research, please consider adding the following citation: 11 | 12 | ```bibtex 13 | @misc{benford_py, 14 | author = {Milcent, Marcel}, 15 | title = {{Benford_py: a Python Implementation of Benford's Law Tests}}, 16 | year = {2017}, 17 | publisher = {GitHub}, 18 | journal = {GitHub repository}, 19 | howpublished = {\url{https://github.com/milcent/benford_py}}, 20 | } 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- 24 | 25 | `current version = 0.5.0` 26 | 27 | ### See [release notes](https://github.com/milcent/benford_py/releases/) for features in this and in older versions 28 | 29 | ### Python versions >= 3.6 30 | 31 | ### Installation 32 | 33 | 
Benford_py is a package in PyPi, so you can install with pip: 34 | 35 | `pip install benford_py` 36 | 37 | or 38 | 39 | `pip install benford-py` 40 | 41 | Or you can cd into the site-packages subfolder of your python distribution (or environment) and git clone from there: 42 | 43 | `git clone https://github.com/milcent/benford_py` 44 | 45 | For a quick start, please go to the [Demo notebook](https://github.com/milcent/benford_py/blob/master/Demo.ipynb), in which I show examples on how to run the tests with the SPY (S&P 500 ETF) daily returns. 46 | 47 | For more fine-grained details of the functions and classes, see the [docs](https://benford-py.readthedocs.io/en/latest/index.html). 48 | 49 | ### Background 50 | 51 | The first digit of a number is its leftmost digit. 52 |

53 | First Digits 54 |

55 | 56 | Since the first digit of any number can range from "1" to "9" 57 | (not considering "0"), it would be intuitively expected that the 58 | proportion of each occurrence in a set of numerical records would 59 | be uniformly distributed at 1/9, i.e., approximately 0.1111, 60 | or 11.11%. 61 | 62 | [Benford's Law](https://en.wikipedia.org/wiki/Benford%27s_law), 63 | also known as the Law of First Digits or the Phenomenon of 64 | Significant Digits, is the finding that the first digits of the 65 | numbers found in series of records of the most varied sources do 66 | not display a uniform distribution, but rather are arranged in such 67 | a way that the digit "1" is the most frequent, followed by "2", 68 | "3", and so on, in a successive and decremental way down to "9", 69 | which presents the lowest frequency as the first digit. 70 | 71 | The expected distributions of the First Digits in a 72 | Benford-compliant data set are the ones shown below: 73 |

74 | Expected Distributions of First Digits 75 |

76 | 77 | The first record on the subject dates from 1881, in the work of 78 | Simon Newcomb, an American-Canadian astronomer and mathematician, 79 | who noted that in the logarithmic tables the first pages, which 80 | contained logarithms beginning with the numerals "1" and "2", 81 | were more worn out, that is, more consulted. 82 | 83 |

84 | Simon Newcomb 85 |

86 |

87 | Simon Newcomb, 1835-1909. 88 |

89 | 90 | In that same article, Newcomb proposed the formula for the 91 | probability of a certain digit "d" being the first digit of a 92 | number, given by the following equation. 93 | 94 |

95 | First digit equation 96 |

97 |

where: P (D = d) is the probability that 98 | the first digit is equal to d, and d is an integer ranging 99 | from 1 to 9. 100 |

101 | 102 | In 1938, the American physicist Frank Benford revisited the 103 | phenomenon, which he called the "Law of Anomalous Numbers," in 104 | a survey with more than 20,000 observations of empirical data 105 | compiled from various sources, ranging from areas of rivers to 106 | molecular weights of chemical compounds, including cost data, 107 | address numbers, population sizes and physical constants. All 108 | of them, to a greater or lesser extent, followed such 109 | distribution. 110 | 111 |

112 | Frank Benford 113 |

114 |

115 | Frank Albert Benford, Jr., 1883-1948. 116 |

117 | 118 | The extent of Benford's work seems to have been one good reason 119 | for the phenomenon to be popularized with his name, though 120 | described by Newcomb 57 years earlier. 121 | 122 | Derivations of the original formula were also applied in the 123 | expected findings of the proportions of digits in other 124 | positions in the number, as in the case of the second digit 125 | (BENFORD, 1938), as well as combinations, such as the first 126 | two digits of a number (NIGRINI, 2012, p.5). 127 | 128 | Only in 1995, however, was the phenomenon proven by Hill. 129 | His proof was based on the fact that numbers in data series 130 | following the Benford Law are, in effect, "second generation" 131 | distributions, i.e., combinations of other distributions. 132 | The union of randomly drawn samples from various distributions 133 | forms a distribution that respects Benford's Law (HILL, 1995). 134 | 135 | When grouped in ascending order, data that obey Benford's Law 136 | must approximate a geometric sequence (NIGRINI, 2012, page 21). 137 | From this it follows that the logarithms of this ordered series 138 | must form a straight line. In addition, the mantissas (decimal 139 | parts) of the logarithms of these numbers must be uniformly 140 | distributed in the interval [0,1] (NIGRINI, 2012, p.10). 141 | 142 | In general, a series of numerical records follows Benford's Law 143 | when (NIGRINI, 2012, p.21): 144 | * it represents magnitudes of facts or events, such as populations 145 | of cities, flows of water in rivers or sizes of celestial bodies; 146 | * it does not have pre-established minimum or maximum limits; 147 | * it is not made up of numbers used as identifiers, such as 148 | identity or social security numbers, bank accounts, telephone numbers; and 149 | * its mean is greater than the median, and the data is not 150 | concentrated around the mean. 
151 | 152 | It follows from this expected distribution that, if the set of 153 | numbers in a series of records that usually respects the Law 154 | shows a deviation in the proportions found, there may be 155 | distortions, whether intentional or not. 156 | 157 | Benford's Law has been used in [several fields](http://www.benfordonline.net/). 158 | After asserting that the usual data type is Benford-compliant, 159 | one can study samples from the same data type in search of 160 | inconsistencies, errors or even [fraud](https://www.amazon.com.br/Benfords-Law-Applications-Accounting-Detection/dp/1118152859). 161 | 162 | This open source module is an attempt to facilitate the 163 | performance of Benford's Law-related tests by people using 164 | Python, whether interactively or in an automated, scripting way. 165 | 166 | It uses the versatility of numpy and pandas, along with 167 | matplotlib for visualization, to deliver results like the one 168 | below and much more. 169 | 170 | ![Sample Image](https://github.com/milcent/benford_py/blob/master/img/SPY-f2d-conf_level-95.png) 171 | 172 | It has been a long time since I last tested it in Python 2. The death clock has stopped ticking, so officially it is for Python 3 now. It should work on Linux, Windows and Mac, but please file a bug report if you run into some trouble. 173 | 174 | Also, if you have some nice data set that we can run these tests on, let's try it. 175 | 176 | Thanks! 
177 | 178 | Milcent 179 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | '''Benfords law module''' 2 | __version__ = "0.5.0" 3 | -------------------------------------------------------------------------------- /benford/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Benford_py for Python is a module for application of Benford's Law 3 | to a sequence of numbers. 4 | 5 | Dependent on pandas, numpy and matplotlib 6 | 7 | All logarithms ar in base 10: "log10" 8 | 9 | Author: Marcel Milcent 10 | 11 | SDPX-License-Identifier: BSD-3-Clause 12 | """ 13 | 14 | from .benford import * 15 | 16 | __version__ = '0.5.0' 17 | -------------------------------------------------------------------------------- /benford/checks.py: -------------------------------------------------------------------------------- 1 | from pandas import Series 2 | from numpy import array, ndarray 3 | from .constants import DIGS, REV_DIGS, CONFS 4 | 5 | 6 | def _check_digs_(digs): 7 | """Checks the possible values for the digs parameter of the 8 | First Digits tests 9 | """ 10 | if digs not in [1, 2, 3]: 11 | raise ValueError("The value assigned to the parameter -digs- " 12 | f"was {digs}. Value must be 1, 2 or 3.") 13 | 14 | 15 | def _check_test_(test): 16 | """Checks the test chosen, both for int or str values 17 | """ 18 | if isinstance(test, int): 19 | if test in DIGS.keys(): 20 | return test 21 | else: 22 | raise ValueError(f'Test was set to {test}. Should be one of ' 23 | f'{DIGS.keys()}') 24 | elif isinstance(test, str): 25 | if test in REV_DIGS.keys(): 26 | return REV_DIGS[test] 27 | else: 28 | raise ValueError(f'Test was set to {test}. Should be one of ' 29 | f'{REV_DIGS.keys()}') 30 | else: 31 | raise ValueError('Wrong value chosen for test parameter. 
Possible ' 32 | f'values are\n {list(DIGS.keys())} for ints and' 33 | f'\n {list(REV_DIGS.keys())} for strings.') 34 | 35 | 36 | def _check_decimals_(decimals): 37 | """""" 38 | if isinstance(decimals, int): 39 | if (decimals < 0): 40 | raise ValueError( 41 | "Parameter -decimals- must be an int >= 0, or 'infer'.") 42 | else: 43 | if decimals != 'infer': 44 | raise ValueError( 45 | "Parameter -decimals- must be an int >= 0, or 'infer'.") 46 | return decimals 47 | 48 | 49 | def _check_sign_(sign): 50 | """""" 51 | if sign not in ['all', 'pos', 'neg']: 52 | raise ValueError("Parameter -sign- must be one of the following: " 53 | "'all', 'pos' or 'neg'.") 54 | return sign 55 | 56 | 57 | def _check_confidence_(confidence): 58 | """""" 59 | if confidence not in CONFS.keys(): 60 | raise ValueError("Value of parameter -confidence- must be one of the " 61 | f"following:\n {list(CONFS.keys())}") 62 | return confidence 63 | 64 | 65 | def _check_high_Z_(high_Z): 66 | """""" 67 | if not high_Z in ['pos', 'all']: 68 | if not isinstance(high_Z, int): 69 | raise ValueError("The parameter -high_Z- should be 'pos', " 70 | "'all' or an int.") 71 | return high_Z 72 | 73 | 74 | def _check_num_array_(data): 75 | """""" 76 | if (not isinstance(data, ndarray)) & (not isinstance(data, Series)): 77 | print('\n`data` not a numpy NDarray nor a pandas Series.' 78 | ' Trying to convert...') 79 | try: 80 | data = array(data) 81 | except: 82 | raise ValueError('Could not convert data. Check input.') 83 | print('\nConversion successful.') 84 | 85 | try: 86 | data = data.astype(float) 87 | except: 88 | raise ValueError('Could not convert data. Check input.') 89 | else: 90 | if data.dtype not in [int, float]: 91 | try: 92 | data = data.astype(float) 93 | except: 94 | raise ValueError('Could not convert data. 
Check input.') 95 | return data 96 | -------------------------------------------------------------------------------- /benford/constants.py: -------------------------------------------------------------------------------- 1 | DIGS = {1: 'F1D', 2: 'F2D', 3: 'F3D', 22: 'SD', -2: 'L2D'} 2 | 3 | SEC_ORDER_DIGS = {key: f'{val}_sec' for key, val in DIGS.items()} 4 | 5 | REV_DIGS = {'F1D': 1, 'F2D': 2, 'F3D': 3, 'SD': 22, 'L2D': -2} 6 | 7 | LEN_TEST = {1: 9, 2: 90, 3: 900, 22: 10, -2: 100} 8 | 9 | TEST_NAMES = {'F1D': 'First Digit Test', 'F2D': 'First Two Digits Test', 10 | 'F3D': 'First Three Digits Test', 'SD': 'Second Digit Test', 11 | 'L2D': 'Last Two Digits Test', 12 | 'F1D_sec': 'First Digit Second Order Test', 13 | 'F2D_sec': 'First Two Digits Second Order Test', 14 | 'F3D_sec': 'First Three Digits Second Order Test', 15 | 'SD_sec': 'Second Digit Second Order Test', 16 | 'L2D_sec': 'Last Two Digits Second Order Test', 17 | 'F1D_Summ': 'First Digit Summation Test', 18 | 'F2D_Summ': 'First Two Digits Summation Test', 19 | 'F3D_Summ': 'First Three Digits Summation Test', 20 | 'Mantissas': 'Mantissas Test' 21 | } 22 | 23 | # Critical values for Mean Absolute Deviation 24 | MAD_CONFORM = {1: [0.006, 0.012, 0.015], 2: [0.0012, 0.0018, 0.0022], 25 | 3: [0.00036, 0.00044, 0.00050], 22: [0.008, 0.01, 0.012], 26 | -2: None, 'F1D': 'First Digit', 'F2D': 'First Two Digits', 27 | 'F3D': 'First Three Digits', 'SD': 'Second Digits'} 28 | 29 | # Color for the plotting 30 | COLORS = {'m': '#00798c', 'b': '#E2DCD8', 's': '#9c3848', 31 | 'af': '#edae49', 'ab': '#33658a', 'h': '#d1495b', 32 | 'h2': '#f64740', 't': '#16DB93'} 33 | 34 | # Critical Z-scores according to the confindence levels 35 | CONFS = {None: None, 80: 1.285, 85: 1.435, 90: 1.645, 95: 1.96, 36 | 99: 2.576, 99.9: 3.29, 99.99: 3.89, 99.999: 4.417, 37 | 99.9999: 4.892, 99.99999: 5.327} 38 | 39 | P_VALUES = {None: 'None', 80: '0.2', 85: '0.15', 90: '0.1', 95: '0.05', 40 | 99: '0.01', 99.9: '0.001', 99.99: '0.0001', 
99.999: '0.00001', 41 | 99.9999: '0.000001', 99.99999: '0.0000001'} 42 | 43 | # Critical Chi-Square values according to the tests degrees of freedom 44 | # and confidence levels 45 | CRIT_CHI2 = {8: {80: 11.03, 85: 12.027, 90: 13.362, 95: 15.507, 46 | 99: 20.090, 99.9: 26.124, 99.99: 31.827, None: None, 47 | 99.999: 37.332, 99.9999: 42.701, 99.99999: 47.972}, 48 | 9: {80: 12.242, 85: 13.288, 90: 14.684, 95: 16.919, 49 | 99: 21.666, 99.9: 27.877, 99.99: 33.72, None: None, 50 | 99.999: 39.341, 99.9999: 44.811, 99.99999: 50.172}, 51 | 89: {80: 99.991, 85: 102.826, 90: 106.469, 95: 112.022, 52 | 99: 122.942, 99.9: 135.978, 99.99: 147.350, 53 | 99.999: 157.702, 99.9999: 167.348, 99.99999: 176.471, 54 | None: None}, 55 | 99: {80: 110.607, 85: 113.585, 90: 117.407, 56 | 95: 123.225, 99: 134.642, 99.9: 148.230, 57 | 99.99: 160.056, 99.999: 170.798, 99.9999: 180.792, 58 | 99.99999: 190.23, None: None}, 59 | 899: {80: 934.479, 85: 942.981, 90: 953.752, 95: 969.865, 60 | 99: 1000.575, 99.9: 1035.753, 99.99: 1065.314, 61 | 99.999: 1091.422, 99.9999: 1115.141, 62 | 99.99999: 1137.082, None: None} 63 | } 64 | 65 | # Critical Kolmogorov-Smirnov values according to the confidence levels 66 | # These values are yet to be divided by the square root of the sample size 67 | CRIT_KS = {80: 1.073, 85: 1.138, 90: 1.224, 95: 1.358, 99: 1.628, 68 | 99.9: 1.949, 99.99: 2.225, 99.999: 2.47, 69 | 99.9999: 2.693, 99.99999: 2.899, None: None} 70 | -------------------------------------------------------------------------------- /benford/expected.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | from numpy import array, arange, log10 3 | from .checks import _check_digs_ 4 | from .viz import plot_expected 5 | 6 | 7 | class First(DataFrame): 8 | """Holds the expected probabilities of the First, First Two, or 9 | First Three digits according to Benford's distribution. 
class Second(DataFrame):
    """Expected proportions of the Second Digits according to Benford's
    distribution, held in a single `Expected` column indexed by `Sec_Dig`.

    Args:
        plot: option to plot a bar chart of the Expected proportions.
            Defaults to True.
        save_plot: string with the path/name of the file in which the generated
            plot will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred by the file name extension. Only available when
            plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when plot=True and save_plot is a string with the
            figure file path/name.
    """

    def __init__(self, plot=True, save_plot=None, save_plot_kwargs=None):
        probabilities, digits = _gen_second_digits_()
        columns = {'Expected': probabilities, 'Sec_Dig': digits}

        super().__init__(columns)
        # the digits become the index, leaving `Expected` as the only column
        self.set_index("Sec_Dig", inplace=True)

        if not plot:
            return
        # 22 is the code the plotting helper uses for the Second Digit test
        plot_expected(self, 22, save_plot=save_plot,
                      save_plot_kwargs=save_plot_kwargs)
86 | """ 87 | def __init__(self, num=False, plot=True, save_plot=None, save_plot_kwargs=None): 88 | exp, l2d = _gen_last_two_digits_(num=num) 89 | DataFrame.__init__(self, {'Expected': exp, 90 | 'Last_2_Dig': l2d}) 91 | self.set_index('Last_2_Dig', inplace=True) 92 | if plot: 93 | plot_expected(self, -2, save_plot=save_plot, 94 | save_plot_kwargs=save_plot_kwargs) 95 | 96 | 97 | def _get_expected_digits_(digs): 98 | """Chooses the Exxpected class to be used in a test 99 | 100 | Args: 101 | digs: the int corresponding to the Expected class to be instantiated 102 | 103 | Returns: 104 | the Expected instance forthe propoer test to be performed 105 | """ 106 | if digs in [1, 2, 3]: 107 | return First(digs, plot=False) 108 | elif digs == 22: 109 | return Second(plot=False) 110 | else: 111 | return LastTwo(num=True, plot=False) 112 | 113 | 114 | def _gen_last_two_digits_(num=False): 115 | """Creates two arrays, one with the possible last two digits and one with 116 | thei respective probabilities 117 | 118 | Args: 119 | num: returns numeric (ints) values. Defaluts to False, 120 | which returns strings. 121 | 122 | Returns: 123 | exp (np.array): Array with the (constant) probabilities of occurrence of 124 | each pair of last two digits 125 | l2d (np.array): Array of ints or str, in any case representing all 100 126 | possible combinations of last two digits 127 | """ 128 | exp = array([1 / 99.] 
* 100) 129 | l2d = arange(0, 100) 130 | if num: 131 | return exp, l2d 132 | l2d = l2d.astype(str) 133 | l2d[:10] = array(['00', '01', '02', '03', '04', '05', 134 | '06', '07', '08', '09']) 135 | return exp, l2d 136 | 137 | def _gen_first_digits_(digs): 138 | """Creates two arrays, one with the possible digits combinations and the 139 | other with their respective expected probabilities according to Benford 140 | 141 | Args: 142 | digs (int): 1, 2 or 3, for generation of the first, first two, or first 143 | three digits 144 | 145 | Returns: 146 | (tuple of arrays): the expected probabilities array and the digits 147 | combination array. 148 | """ 149 | dig_array = arange(10 ** (digs - 1), 10 ** digs) 150 | exp_prob = log10(1 + (1. / dig_array)) 151 | return exp_prob, dig_array 152 | 153 | def _gen_second_digits_(): 154 | """Creates two arrays, one with he possible second digits combinations and 155 | the other with their respective expected probabilities according to Benford 156 | 157 | Returns: 158 | (tuple of arrays): the expected probabilities array and the second 159 | digits array. 160 | """ 161 | exp_f2d, _ = _gen_first_digits_(2) 162 | sec_digs = range(10) 163 | sec_digs_in_f2d = array(list(range(10)) * 9) 164 | exp = array([exp_f2d[sec_digs_in_f2d == i].sum() for i in sec_digs]) 165 | return exp, array(sec_digs) -------------------------------------------------------------------------------- /benford/reports.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .constants import MAD_CONFORM 3 | 4 | 5 | def _inform_(df, high_Z, conf): 6 | """Selects and sorts by the Z_stats chosen to be considered, informing or not. 
7 | """ 8 | 9 | if isinstance(high_Z, int): 10 | if conf is not None: 11 | dd = df[['Expected', 'Found', 'Z_score' 12 | ]].sort_values('Z_score', ascending=False).head(high_Z) 13 | print(f'\nThe entries with the top {high_Z} Z scores are:\n') 14 | # Summation Test 15 | else: 16 | dd = df[['Expected', 'Found', 'AbsDif' 17 | ]].sort_values('AbsDif', ascending=False 18 | ).head(high_Z) 19 | print(f'\nThe entries with the top {high_Z} absolute deviations ' 20 | 'are:\n') 21 | else: 22 | if high_Z == 'pos': 23 | m1 = df.Dif > 0 24 | m2 = df.Z_score > conf 25 | dd = df[['Expected', 'Found', 'Z_score' 26 | ]].loc[m1 & m2].sort_values('Z_score', ascending=False) 27 | print('\nThe entries with the significant positive ' 28 | 'deviations are:\n') 29 | elif high_Z == 'neg': 30 | m1 = df.Dif < 0 31 | m2 = df.Z_score > conf 32 | dd = df[['Expected', 'Found', 'Z_score' 33 | ]].loc[m1 & m2].sort_values('Z_score', ascending=False) 34 | print('\nThe entries with the significant negative ' 35 | 'deviations are:\n') 36 | else: 37 | dd = df[['Expected', 'Found', 'Z_score' 38 | ]].loc[df.Z_score > conf].sort_values('Z_score', 39 | ascending=False) 40 | print('\nThe entries with the significant deviations are:\n') 41 | print(dd) 42 | 43 | 44 | def _report_mad_(digs, MAD): 45 | """Reports the test Mean Absolut Deviation and compares it to critical values 46 | """ 47 | print(f'Mean Absolute Deviation: {MAD:.6f}') 48 | if digs != -2: 49 | mads = MAD_CONFORM[digs] 50 | if MAD <= mads[0]: 51 | print(f'MAD <= {mads[0]:.6f}: Close conformity.\n') 52 | elif MAD <= mads[1]: 53 | print(f'{mads[0]:.6f} < MAD <= {mads[1]:.6f}: ' 54 | 'Acceptable conformity.\n') 55 | elif MAD <= mads[2]: 56 | print(f'{mads[1]:.6f} < MAD <= {mads[2]:.6f}: ' 57 | 'Marginally Acceptable conformity.\n') 58 | else: 59 | print(f'MAD > {mads[2]:.6f}: Nonconformity.\n') 60 | else: 61 | print("There is no conformity check for this test's MAD.\n") 62 | 63 | 64 | def _report_KS_(KS, crit_KS): 65 | """Reports the test 
Kolmogorov-Smirnov statistic and compares it to critical 66 | values, depending on the confidence level 67 | """ 68 | result = 'PASS' if KS <= crit_KS else 'FAIL' 69 | print(f"\n\tKolmogorov-Smirnov: {KS:.6f}", 70 | f"\n\tCritical value: {crit_KS:.6f} -- {result}") 71 | 72 | 73 | def _report_chi2_(chi2, CRIT_CHI2): 74 | """Reports the test Chi-square statistic and compares it to critical values, 75 | depending on the confidence level 76 | """ 77 | result = 'PASS' if chi2 <= CRIT_CHI2 else 'FAIL' 78 | print(f"\n\tChi square: {chi2:.6f}", 79 | f"\n\tCritical value: {CRIT_CHI2:.6f} -- {result}") 80 | 81 | 82 | def _report_Z_(df, high_Z, crit_Z): 83 | """Reports the test Z scores and compares them to a critical value, 84 | depending on the confidence level 85 | """ 86 | print(f"\n\tCritical Z-score:{crit_Z}.") 87 | _inform_(df, high_Z, crit_Z) 88 | 89 | 90 | def _report_summ_(test, high_diff): 91 | """Reports the Summation Test Absolute Differences between the Found and 92 | the Expected proportions 93 | 94 | """ 95 | if high_diff is not None: 96 | print(f'\nThe top {high_diff} Absolute Differences are:\n') 97 | print(test.sort_values('AbsDif', ascending=False).head(high_diff)) 98 | else: 99 | print('\nThe top Absolute Differences are:\n') 100 | print(test.sort_values('AbsDif', ascending=False)) 101 | 102 | 103 | def _report_bhattac_coeff_(bhattac_coeff): 104 | """ 105 | """ 106 | print(f"Bhattacharyya Coefficient: {bhattac_coeff:6f}\n") 107 | 108 | 109 | def _report_bhattac_dist_(bhattac_dist): 110 | """ 111 | """ 112 | print(f"Bhattacharyya Distance: {bhattac_dist:6f}\n") 113 | 114 | 115 | def _report_kl_diverg_(kl_diverg): 116 | """ 117 | """ 118 | print(f"Kullback-Leibler Divergence: {kl_diverg:6f}\n") 119 | 120 | 121 | def _report_test_(test, high=None, crit_vals=None): 122 | """Main report function. Receives the Args: to report with, initiates 123 | the process, and calls the right reporting helper function(s), depending 124 | on the Test. 
125 | """ 126 | print('\n', f' {test.name} '.center(50, '#'), '\n') 127 | if not 'Summation' in test.name: 128 | _report_mad_(test.digs, test.MAD) 129 | _report_bhattac_coeff_(test.bhattacharyya_coefficient) 130 | _report_bhattac_dist_(test.bhattacharyya_distance) 131 | _report_kl_diverg_(test.kullback_leibler_divergence) 132 | if test.confidence is not None: 133 | print(f"For confidence level {test.confidence}%: ") 134 | _report_KS_(test.KS, crit_vals['KS']) 135 | _report_chi2_(test.chi_square, crit_vals['chi2']) 136 | _report_Z_(test, high, crit_vals['Z']) 137 | else: 138 | print('Confidence is currently `None`. Set the confidence level, ' 139 | 'so as to generate comparable critical values.') 140 | if isinstance(high, int): 141 | _inform_(test, high, None) 142 | else: 143 | _report_summ_(test, high) 144 | 145 | 146 | def _report_mantissa_(stats, confidence): 147 | """Prints the mantissas statistics and their respective reference values 148 | 149 | Args: 150 | stats (dict): 151 | """ 152 | print("\n", ' Mantissas Test '.center(52, '#')) 153 | print(f"\nThe Mantissas MEAN is {stats['Mean']:.6f}." 154 | "\tRef: 0.5") 155 | print(f"The Mantissas VARIANCE is {stats['Var']:.6f}." 156 | "\tRef: 0.08333") 157 | print(f"The Mantissas SKEWNESS is {stats['Skew']:.6f}." 158 | "\tRef: 0.0") 159 | print(f"The Mantissas KURTOSIS is {stats['Kurt']:.6f}." 160 | "\tRef: -1.2") 161 | print("\nThe Kolmogorov-Smirnov statistic for the Mantissas distribution" 162 | f" is {stats['KS']:.6f}.\nThe critical value for the confidence " 163 | f"level of {confidence}% is {stats['KS_critical']:.6f} -- " 164 | f"{'PASS' if stats['KS'] < stats['KS_critical'] else 'FAIL'}\n") 165 | 166 | 167 | def _deprecate_inform_(verbose, inform): 168 | """ 169 | Raises: 170 | FutureWarning: if the arg `inform` is used (to be deprecated). 171 | """ 172 | if inform is None: 173 | return verbose 174 | else: 175 | warnings.warn('The parameter `inform` will be deprecated in future ' 176 | 'versions. 
def Z_score(frame, N):
    """Computes the Z statistic of each proportion studied.

    The absolute difference is reduced by the term 1 / (2 * N) and then
    divided by the standard error of the expected proportion for a sample
    of size N.

    Args:
        frame: DataFrame with the `Expected` proportions and the already
            calculated `AbsDif` column (absolute differences between the
            found and the expected proportions)
        N: sample size

    Returns:
        Series of computed Z scores
    """
    correction = 1 / (2 * N)
    std_error = sqrt((frame.Expected * (1. - frame.Expected)) / N)
    return (frame.AbsDif - correction) / std_error
def chi_sq_2(frame):
    """Computes the chi-square statistic of the found distribution.

    The expected counts are the `Expected` proportions scaled by the total
    of the `Counts` column.

    Args:
        frame: DataFrame with the `Counts` and `Expected` columns.

    Returns:
        The computed Chi square statistic
    """
    total = frame.Counts.sum()
    expected_counts = total * frame.Expected
    squared_dev = (frame.Counts - expected_counts) ** 2
    return (squared_dev / expected_counts).sum()
76 | 77 | Returns: 78 | The Suprem, which is the greatest absolute difference between the 79 | Found and the expected proportions, and the Kolmogorov-Smirnov 80 | critical value according to the confidence level, for ccomparison 81 | """ 82 | if confidence is None: 83 | print('\nKolmogorov-Smirnov test needs confidence other than None.') 84 | return 85 | else: 86 | # sorting and calculating the cumulative distribution 87 | ks_frame = frame.sort_index()[['Found', 'Expected']].cumsum() 88 | # finding the supremum - the largest cumul dist difference 89 | suprem = ((ks_frame.Found - ks_frame.Expected).abs()).max() 90 | # calculating the crittical value according to confidence 91 | crit_KS = CRIT_KS[confidence] / sqrt(N) 92 | 93 | if verbose: 94 | print(f"\nThe Kolmogorov-Smirnov statistic is {suprem:.4f}.\n" 95 | f"Critical K-S for this series: {crit_KS:.4f}") 96 | return (suprem, crit_KS) 97 | 98 | 99 | def kolmogorov_smirnov_2(frame): 100 | """Computes the Kolmogorov-Smirnov test of the found distributions 101 | 102 | Args: 103 | frame: DataFrame with Foud and Expected distributions. 104 | 105 | Returns: 106 | The Suprem, which is the greatest absolute difference between the 107 | Found end th expected proportions 108 | """ 109 | # sorting and calculating the cumulative distribution 110 | ks_frame = frame.sort_index()[['Found', 'Expected']].cumsum() 111 | # finding the supremum - the largest cumul dist difference 112 | return ((ks_frame.Found - ks_frame.Expected).abs()).max() 113 | 114 | 115 | def _two_dist_ks_(dist1, dist2, cummulative=True): 116 | """Computes the Kolmogorov-Smirnov statistic between two distributions, 117 | a found one (dist2) and an expected one (dist1). 118 | 119 | Args: 120 | dist1 (np.arrat): array with the expected distribution 121 | dist2 (np.array): array with the found distribution 122 | cummulative (bool): makes apply cummulutative sum to the 123 | distributions (empirical cdf). 
124 | 125 | Returns: 126 | tuple(floats): the KS statistic 127 | """ 128 | dist2.sort(); dist1.sort() 129 | if not cummulative: 130 | return nabs(dist2 - dist1).max() 131 | return nabs(dist2.cumsum() - dist1.cumsum()).max() 132 | 133 | 134 | def _mantissas_ks_(mant_dist, confidence, sample_size): 135 | """Computes the Kolmogorov-Smirnof statistic for the Mantissas, also 136 | providing the KS critical value according the the sample size and 137 | confidence level provided 138 | 139 | Args: 140 | mant_dist (np.array): array with the mantissas distribution found 141 | confidence (float, int): level of confidence to compute the critical 142 | value 143 | 144 | Returns: 145 | tuple(floats): the KS statistic and the critical value 146 | """ 147 | crit_ks = CRIT_KS[confidence] * sqrt(2 * sample_size / sample_size ** 2)\ 148 | if confidence else None 149 | # non-cummulative, uniformly distributed 150 | expected = linspace(0, 1, len(mant_dist), endpoint=False) 151 | ks = _two_dist_ks_(expected, mant_dist, cummulative=False) 152 | return ks, crit_ks 153 | 154 | 155 | def mad(frame, test, verbose=True): 156 | """Computes the Mean Absolute Deviation (MAD) between the found and the 157 | expected proportions. 158 | 159 | Args: 160 | frame: DataFrame with the Absolute Deviations already calculated. 161 | test: Test to compute the MAD from (F1D, SD, F2D...) 162 | verbose: prints the MAD result and compares to limit values of 163 | conformity. Defaults to True. 164 | 165 | Returns: 166 | The Mean of the Absolute Deviations between the found and expected 167 | proportions. 
def mse(frame, verbose=True):
    """Computes the test's Mean Square Error

    Args:
        frame: DataFrame with the already computed `AbsDif` column (absolute
            deviations between the found and expected proportions)
        verbose: Prints the MSE. Defaults to True.

    Returns:
        Mean of the squared differences between the found and the expected
        proportions.
    """
    squared_dev = frame.AbsDif ** 2
    result = squared_dev.mean()
    if verbose:
        print(f"\nMean Square Error = {result}")
    return result
226 | dist_2 (np.array): The older/ establhished distribution with which 227 | the new one will be compared. 228 | 229 | Returns: 230 | bhat_dist (float) 231 | """ 232 | with errstate(divide='ignore'): 233 | bhat_dist = -log(_bhattacharyya_coefficient(dist_1, dist_2)) 234 | return bhat_dist 235 | 236 | 237 | def _kullback_leibler_divergence_(dist_1, dist_2): 238 | """Computes the Kullback-Leibler Divergence between two probability 239 | distributions. 240 | 241 | Args: 242 | dist_1 (np.array): The newly gathered distribution, to be compared 243 | with an older / established distribution. 244 | dist_2 (np.array): The older/ establhished distribution with which 245 | the new one will be compared. 246 | 247 | Returns: 248 | kulb_leib_diverg (float) 249 | """ 250 | # ignore divide by zero warning in np.where 251 | with errstate(divide='ignore'): 252 | kl_d = (log((dist_1 / dist_2), where=(dist_1 != 0)) * dist_1).sum() 253 | return kl_d 254 | -------------------------------------------------------------------------------- /benford/utils.py: -------------------------------------------------------------------------------- 1 | from pandas import Series, DataFrame 2 | from numpy import array, arange, log10, ndarray 3 | from .expected import _get_expected_digits_ 4 | from .constants import DIGS, REV_DIGS 5 | from .stats import Z_score 6 | from .checks import _check_num_array_, _check_sign_, _check_decimals_ 7 | 8 | 9 | def _set_N_(len_df, limit_N): 10 | """""" 11 | # Assigning to N the superior limit or the lenght of the series 12 | if limit_N is None or limit_N > len_df: 13 | return max(1, len_df) 14 | # Check on limit_N being a positive integer 15 | else: 16 | if limit_N < 0 or not isinstance(limit_N, int): 17 | raise ValueError("limit_N must be None or a positive integer.") 18 | else: 19 | return max(1, limit_N) 20 | 21 | 22 | def get_mantissas(arr): 23 | """Computes the mantissas, the non-integer part of the log of a number. 
def input_data(given):
    """Internalizes and transforms the input data

    Args:
        given: ndarray, Series or tuple with DataFrame and name of the
            column to analyze

    Returns:
        The raw inputed data and the result of its first pre-processing,
            when required: a Series is returned twice, as is; an ndarray
            is returned along with a Series built from it; a tuple yields
            its DataFrame and the chosen column of it.

    Raises:
        TypeError: if `given` is none of the accepted types, or is a tuple
            that does not start with a DataFrame followed by a str.
    """
    # isinstance, rather than exact type comparison, also accepts subclasses
    if isinstance(given, Series):
        return given, given
    if isinstance(given, ndarray):
        return given, Series(given)
    if isinstance(given, tuple):
        # the length is checked first, so that a short tuple gets the same
        # TypeError instead of an IndexError
        if (len(given) < 2 or not isinstance(given[0], DataFrame)
                or not isinstance(given[1], str)):
            raise TypeError('The data tuple must be composed of a pandas '
                            'DataFrame and the name (str) of the chosen '
                            'column, in that order')
        frame, column = given[0], given[1]
        return frame, frame[column]
    raise TypeError("Wrong data input type. Check docstring.")
def get_found_proportions(data):
    """Counts the occurrences of each distinct value in `data` and computes
    their relative frequencies.

    Args:
        data: Series with the values (digits) to be counted.

    Returns:
        DataFrame with the columns `Counts` (absolute frequencies) and
            `Found` (relative frequencies), sorted by its index, which
            holds the distinct values.
    """
    frame = DataFrame({
        'Counts': data.value_counts(),
        'Found': data.value_counts(normalize=True),
    })
    return frame.sort_index()
def prep_to_roll(start, test):
    """Used by the rolling mad and rolling mean, prepares each test and
    respective expected proportions for later application to the Series subset

    Adds to `start`, in place, a column named DIGS[test] with the digits of
    the chosen test, extracted from the `ZN` column.

    Args:
        start: DataFrame with a numeric `ZN` column. Modified in place.
        test: 1, 2 or 3 for the First, First Two or First Three Digits;
            22 for the Second Digit; any other value is treated as the
            Last Two Digits.

    Returns:
        Exp (np.array): expected proportions of each digits combination
        ind (np.array): the digits combinations, in the same order as Exp
    """
    # NOTE(review): each branch used to end by rebinding the local name
    # `start` to the rows with ZN at or above the test's minimum (10 ** (test
    # - 1), 10 or 1000). That never reached the caller's DataFrame, so the
    # dead statements were dropped; rows below the minimum are NOT filtered
    # out here -- confirm whether the caller is expected to do so.
    if test in [1, 2, 3]:
        start[DIGS[test]] = start.ZN // 10 ** ((
            log10(start.ZN).astype(int)) - (test - 1))

        ind = arange(10 ** (test - 1), 10 ** test)
        Exp = log10(1 + (1. / ind))

    elif test == 22:
        start[DIGS[test]] = (start.ZN // 10 ** ((
            log10(start.ZN)).astype(int) - 1)) % 10

        # second digit proportions: the First Two Digits proportions summed
        # over the nine possible first digits
        Expec = log10(1 + (1. / arange(10, 100)))
        temp = DataFrame({'Expected': Expec, 'Sec_Dig':
                          array(list(range(10)) * 9)})
        Exp = temp.groupby('Sec_Dig').sum().values.reshape(10,)
        ind = arange(0, 10)

    else:
        start[DIGS[test]] = start.ZN % 100

        ind = arange(0, 100)
        # 100 equally likely combinations (00 to 99): 1/100 each, so that
        # the expected proportions add up to 1
        Exp = array([1 / 100.] * 100)

    return Exp, ind
25 | figsize = 14, 10.5 26 | elif digs == -2: 27 | y_max = 1.1 28 | figsize = 15, 8 29 | fig, ax = plt.subplots(figsize=figsize) 30 | plt.title('Expected Benford Distributions', size='xx-large') 31 | plt.xlabel(df.index.name, size='x-large') 32 | plt.ylabel('Distribution (%)', size='x-large') 33 | ax.set_facecolor(COLORS['b']) 34 | ax.set_ylim(0, y_max) 35 | ax.bar(df.index, df.Expected * 100, color=COLORS['t'], align='center') 36 | ax.set_xticks(df.index) 37 | ax.set_xticklabels(df.index) 38 | 39 | if save_plot: 40 | if not save_plot_kwargs: 41 | save_plot_kwargs = {} 42 | plt.savefig(save_plot, **save_plot_kwargs) 43 | 44 | plt.show(block=False) 45 | 46 | 47 | def _get_plot_args(digs): 48 | """Selects the correct arguments for the plotting functions, depending on the 49 | the test (digs) chosen. 50 | """ 51 | if digs in [1, 2, 3]: 52 | text_x = False 53 | n, m = 10 ** (digs - 1), 10 ** (digs) 54 | x = arange(n, m) 55 | figsize = (2 * (digs ** 2 + 5), 1.5 * (digs ** 2 + 5)) 56 | elif digs == 22: 57 | text_x = False 58 | x = arange(10) 59 | figsize = (14, 10) 60 | else: 61 | text_x = True 62 | x = arange(100) 63 | figsize = (15, 7) 64 | return x, figsize, text_x 65 | 66 | def plot_digs(df, x, y_Exp, y_Found, N, figsize, conf_Z, text_x=False, 67 | save_plot=None, save_plot_kwargs=None): 68 | """Plots the digits tests results 69 | 70 | Args: 71 | df: DataFrame with the data to be plotted 72 | x: sequence to be used in the x axis 73 | y_Exp: sequence of the expected proportions to be used in the y axis 74 | (line) 75 | y_Found: sequence of the found proportions to be used in the y axis 76 | (bars) 77 | N: lenght of sequence, to be used when plotting the confidence levels 78 | figsize: tuple to state the size of the plot figure 79 | conf_Z: Confidence level 80 | save_pic: file path to save figure 81 | text_x: Forces to show all x ticks labels. Defaluts to True. 82 | save_plot: string with the path/name of the file in which the generated 83 | plot will be saved. 
def plot_sum(df, figsize, li, text_x=False, save_plot=None, save_plot_kwargs=None):
    """Plots the summation test results: the found sums as bars and the
    expected value as a horizontal line.

    Args:
        df: DataFrame with the data to be plotted. Its index goes to the
            x axis and its `Percent` column sets the bars heights.
        figsize: sets the dimensions of the plot figure
        li: value with which to draw the horizontal line
        text_x: replaces the x tick labels with the index values as strings,
            the first ten of them being '00' to '09'. Defaults to False.
        save_plot: string with the path/name of the file in which the generated
            plot will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred by the file name extension.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
    """
    digits = df.index
    # many ticks only fit when their labels are drawn vertically
    if len(digits) > 10:
        tick_rotation = 90
    else:
        tick_rotation = 0

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    plt.title('Expected vs. Found Sums')
    plt.xlabel('Digits')
    plt.ylabel('Sums')
    ax.bar(digits, df.Percent, color=COLORS['m'],
           label='Found Sums', zorder=3, align='center')
    ax.set_xlim(digits[0] - 1, digits[-1] + 1)
    ax.axhline(li, color=COLORS['s'], linewidth=2, label='Expected', zorder=4)
    ax.set_xticks(digits)
    ax.set_xticklabels(digits, rotation=tick_rotation)
    ax.set_facecolor(COLORS['b'])
    if text_x:
        labels = array(digits).astype(str)
        labels[:10] = array([f'0{dig}' for dig in range(10)])
        plt.xticks(digits, labels, rotation='vertical')
    ax.legend()

    if save_plot:
        plt.savefig(save_plot, **(save_plot_kwargs or {}))

    plt.show(block=False)
187 | save_plot: string with the path/name of the file in which the generated 188 | plot will be saved. Uses matplotlib.pyplot.savefig(). File format 189 | is infered by the file name extension. 190 | save_plot_kwargs: dict with any of the kwargs accepted by 191 | matplotlib.pyplot.savefig() 192 | https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html 193 | 194 | """ 195 | ld = len(col) 196 | x = arange(1, ld + 1) 197 | n = ones(ld) / ld 198 | fig = plt.figure(figsize=figsize) 199 | ax = fig.add_subplot(111) 200 | ax.plot(x, col.sort_values(), linestyle='--', 201 | color=COLORS['s'], linewidth=3, label='Mantissas') 202 | ax.plot(x, n.cumsum(), color=COLORS['m'], 203 | linewidth=2, label='Expected') 204 | plt.ylim((0, 1.)) 205 | plt.xlim((1, ld + 1)) 206 | ax.set_facecolor(COLORS['b']) 207 | ax.set_title("Ordered Mantissas") 208 | plt.legend(loc='upper left') 209 | 210 | if save_plot: 211 | if not save_plot_kwargs: 212 | save_plot_kwargs = {} 213 | plt.savefig(save_plot, **save_plot_kwargs) 214 | 215 | plt.show(block=False); 216 | 217 | def plot_mantissa_arc_test(df, gravity_center, grid=True, figsize=12, 218 | save_plot=None, save_plot_kwargs=None): 219 | """Draws thee Mantissa Arc Test after computing X and Y circular coordinates 220 | for every mantissa and the center of gravity for the set 221 | 222 | Args: 223 | df (DataFrame): pandas DataFrame with the mantissas and the X and Y 224 | coordinates. 225 | gravity_center (tuple): coordinates for plottling the gravity center 226 | grid (bool): show grid. Defaults to True. 227 | figsize (int): figure dimensions. No need to be a tuple, since the 228 | figure is a square. 229 | save_plot: string with the path/name of the file in which the generated 230 | plot will be saved. Uses matplotlib.pyplot.savefig(). File format 231 | is infered by the file name extension. 
232 | save_plot_kwargs: dict with any of the kwargs accepted by 233 | matplotlib.pyplot.savefig() 234 | https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html 235 | """ 236 | fig = plt.figure(figsize=(figsize, figsize)) 237 | ax = plt.subplot() 238 | ax.set_facecolor(COLORS['b']) 239 | ax.scatter(df.mant_x, df.mant_y, label="ARC TEST", 240 | color=COLORS['m']) 241 | ax.scatter(gravity_center[0], gravity_center[1], 242 | color=COLORS['s']) 243 | text_annotation = Annotation( 244 | " Gravity Center: " 245 | f"x({round(gravity_center[0], 3)})," 246 | f" y({round(gravity_center[1], 3)})", 247 | xy=(gravity_center[0] - 0.65, 248 | gravity_center[1] - 0.1), 249 | xycoords='data') 250 | ax.add_artist(text_annotation) 251 | ax.grid(True, which='both') 252 | ax.axhline(y=0, color='k') 253 | ax.axvline(x=0, color='k') 254 | ax.legend(loc='lower left') 255 | ax.set_title("Mantissas Arc Test") 256 | 257 | if save_plot: 258 | if not save_plot_kwargs: 259 | save_plot_kwargs = {} 260 | plt.savefig(save_plot, **save_plot_kwargs) 261 | 262 | plt.show(block=False); 263 | 264 | def plot_roll_mse(roll_series, figsize, save_plot=None, save_plot_kwargs=None): 265 | """Shows the rolling MSE plot 266 | 267 | Args: 268 | roll_series: pd.Series resultant form rolling mse. 269 | figsize: the figure dimensions. 270 | save_plot: string with the path/name of the file in which the generated 271 | plot will be saved. Uses matplotlib.pyplot.savefig(). File format 272 | is infered by the file name extension. 
273 | save_plot_kwargs: dict with any of the kwargs accepted by 274 | matplotlib.pyplot.savefig() 275 | https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html 276 | """ 277 | fig, ax = plt.subplots(figsize=figsize) 278 | ax.set_facecolor(COLORS['b']) 279 | ax.plot(roll_series, color=COLORS['m']) 280 | 281 | if save_plot: 282 | if not save_plot_kwargs: 283 | save_plot_kwargs = {} 284 | plt.savefig(save_plot, **save_plot_kwargs) 285 | 286 | plt.show(block=False) 287 | 288 | def plot_roll_mad(roll_mad, figsize, save_plot=None, save_plot_kwargs=None): 289 | """Shows the rolling MAD plot 290 | 291 | Args: 292 | roll_mad: pd.Series resultant form rolling mad. 293 | figsize: the figure dimensions. 294 | save_plot: string with the path/name of the file in which the generated 295 | plot will be saved. Uses matplotlib.pyplot.savefig(). File format 296 | is infered by the file name extension. 297 | save_plot_kwargs: dict with any of the kwargs accepted by 298 | matplotlib.pyplot.savefig() 299 | https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html 300 | """ 301 | fig, ax = plt.subplots(figsize=figsize) 302 | ax.set_facecolor(COLORS['b']) 303 | ax.plot(roll_mad.roll_series, color=COLORS['m']) 304 | 305 | if roll_mad.test != -2: 306 | plt.axhline(y=MAD_CONFORM[roll_mad.test][0], color=COLORS['af'], linewidth=3) 307 | plt.axhline(y=MAD_CONFORM[roll_mad.test][1], color=COLORS['h2'], linewidth=3) 308 | plt.axhline(y=MAD_CONFORM[roll_mad.test][2], color=COLORS['s'], linewidth=3) 309 | 310 | if save_plot: 311 | if not save_plot_kwargs: 312 | save_plot_kwargs = {} 313 | plt.savefig(save_plot, **save_plot_kwargs) 314 | 315 | plt.show(block=False) 316 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from 
the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/build/doctrees/api.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/api.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/benford.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/benford.doctree -------------------------------------------------------------------------------- /docs/build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/index.doctree -------------------------------------------------------------------------------- 
/docs/build/doctrees/modules.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/modules.doctree -------------------------------------------------------------------------------- /docs/build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 1ab0e725c448968d4851f0b695542647 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/build/html/_modules/benford/expected.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | benford.expected — benford_py 0.3.3 documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
47 | 48 | 96 | 97 |
98 | 99 | 100 | 106 | 107 | 108 |
109 | 110 |
111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 |
131 | 132 |
    133 | 134 |
  • »
  • 135 | 136 |
  • Module code »
  • 137 | 138 |
  • benford.expected
  • 139 | 140 | 141 |
  • 142 | 143 |
  • 144 | 145 |
146 | 147 | 148 |
149 |
150 |
151 |
152 | 153 |

Source code for benford.expected

154 | from pandas import DataFrame
155 | from numpy import array, arange, log10
156 | from .checks import _check_digs_
157 | from .viz import plot_expected
158 | 
159 | 
160 | 
class First(DataFrame):
    """Expected Benford proportions of the First, First Two or First Three
    digits, stored in a single 'Expected' column indexed by the digits.

    Args:
        digs: 1, 2 or 3 - selects which leading digits to consider:
            1 for the First Digit, 2 for the First Two Digits and 3 for
            the First Three Digits.
        plot: when True, draws a bar chart of the Expected proportions.
            Defaults to True.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension. Only
            used when plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only used when plot=True and save_plot is a string with the
            figure file path/name.
    """

    def __init__(self, digs, plot=True, save_plot=None, save_plot_kwargs=None):
        _check_digs_(digs)
        probabilities, digits = _gen_first_digits_(digs)

        # Build the frame first, then label the index after the chosen test.
        super().__init__({'Expected': probabilities}, index=digits)
        self.index.names = [f'First_{digs}_Dig']

        if not plot:
            return
        plot_expected(self, digs, save_plot=save_plot,
                      save_plot_kwargs=save_plot_kwargs)
192 | 193 | 194 |
class Second(DataFrame):
    """Expected Benford proportions of the Second Digits, stored in a single
    'Expected' column indexed by 'Sec_Dig'.

    Args:
        plot: when True, draws a bar chart of the Expected proportions.
            Defaults to True.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension. Only
            used when plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only used when plot=True and save_plot is a string with the
            figure file path/name.
    """

    def __init__(self, plot=True, save_plot=None, save_plot_kwargs=None):
        probabilities, second_digits = _gen_second_digits_()

        columns = {'Expected': probabilities, 'Sec_Dig': second_digits}
        super().__init__(columns)
        # Promote the digits column to the index, in place.
        self.set_index("Sec_Dig", inplace=True)

        if not plot:
            return
        # 22 is the code plot_expected receives for the second-digit test.
        plot_expected(self, 22, save_plot=save_plot,
                      save_plot_kwargs=save_plot_kwargs)
221 | 222 | 223 |
class LastTwo(DataFrame):
    """Expected proportions of the Last Two Digits, stored in a single
    'Expected' column indexed by 'Last_2_Dig'.

    Args:
        num: forwarded to _gen_last_two_digits_ to choose between a numeric
            and a string representation of the digits. Defaults to False.
        plot: when True, draws a bar chart of the Expected proportions.
            Defaults to True.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension. Only
            used when plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only used when plot=True and save_plot is a string with the
            figure file path/name.
    """

    def __init__(self, num=False, plot=True, save_plot=None, save_plot_kwargs=None):
        probabilities, last_two = _gen_last_two_digits_(num=num)

        columns = {'Expected': probabilities, 'Last_2_Dig': last_two}
        super().__init__(columns)
        # Promote the digits column to the index, in place.
        self.set_index('Last_2_Dig', inplace=True)

        if not plot:
            return
        # -2 is the code plot_expected receives for the last-two-digits test.
        plot_expected(self, -2, save_plot=save_plot,
                      save_plot_kwargs=save_plot_kwargs)
248 | 249 | 250 | def _get_expected_digits_(digs): 251 | """Chooses the Exxpected class to be used in a test 252 | 253 | Args: 254 | digs: the int corresponding to the Expected class to be instantiated 255 | 256 | Returns: 257 | the Expected instance forthe propoer test to be performed 258 | """ 259 | if digs in [1, 2, 3]: 260 | return First(digs, plot=False) 261 | elif digs == 22: 262 | return Second(plot=False) 263 | else: 264 | return LastTwo(num=True, plot=False) 265 | 266 | 267 | def _gen_last_two_digits_(num=False): 268 | """Creates two arrays, one with the possible last two digits and one with 269 | thei respective probabilities 270 | 271 | Args: 272 | num: returns numeric (ints) values. Defaluts to False, 273 | which returns strings. 274 | 275 | Returns: 276 | exp (np.array): Array with the (constant) probabilities of occurrence of 277 | each pair of last two digits 278 | l2d (np.array): Array of ints or str, in any case representing all 100 279 | possible combinations of last two digits 280 | """ 281 | exp = array([1 / 99.] * 100) 282 | l2d = arange(0, 100) 283 | if num: 284 | return exp, l2d 285 | l2d = l2d.astype(str) 286 | l2d[:10] = array(['00', '01', '02', '03', '04', '05', 287 | '06', '07', '08', '09']) 288 | return exp, l2d 289 | 290 | def _gen_first_digits_(digs): 291 | """Creates two arrays, one with the possible digits combinations and the 292 | other with their respective expected probabilities according to Benford 293 | 294 | Args: 295 | digs (int): 1, 2 or 3, for generation of the first, first two, or first 296 | three digits 297 | 298 | Returns: 299 | (tuple of arrays): the expected probabilities array and the digits 300 | combination array. 301 | """ 302 | dig_array = arange(10 ** (digs - 1), 10 ** digs) 303 | exp_prob = log10(1 + (1. 
/ dig_array)) 304 | return exp_prob, dig_array 305 | 306 | def _gen_second_digits_(): 307 | """Creates two arrays, one with he possible second digits combinations and 308 | the other with their respective expected probabilities according to Benford 309 | 310 | Returns: 311 | (tuple of arrays): the expected probabilities array and the second 312 | digits array. 313 | """ 314 | exp_f2d, _ = _gen_first_digits_(2) 315 | sec_digs = range(10) 316 | sec_digs_in_f2d = array(list(range(10)) * 9) 317 | exp = array([exp_f2d[sec_digs_in_f2d == i].sum() for i in sec_digs]) 318 | return exp, array(sec_digs) 319 |
320 | 321 |
322 | 323 |
324 | 344 |
345 |
346 | 347 |
348 | 349 |
350 | 351 | 352 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | -------------------------------------------------------------------------------- /docs/build/html/_modules/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Overview: module code — benford_py 0.3.3 documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
47 | 48 | 96 | 97 |
98 | 99 | 100 | 106 | 107 | 108 |
109 | 110 |
111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 |
131 | 132 |
    133 | 134 |
  • »
  • 135 | 136 |
  • Overview: module code
  • 137 | 138 | 139 |
  • 140 | 141 |
  • 142 | 143 |
144 | 145 | 146 |
147 |
148 |
149 |
150 | 151 |

All modules for which code is available

152 | 157 | 158 |
159 | 160 |
161 | 181 |
182 |
183 | 184 |
185 | 186 |
187 | 188 | 189 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | -------------------------------------------------------------------------------- /docs/build/html/_sources/api.rst.txt: -------------------------------------------------------------------------------- 1 | benford package 2 | =============== 3 | 4 | benford.benford module 5 | ---------------------- 6 | 7 | .. automodule:: benford.benford 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | 13 | benford.expected module 14 | ----------------------- 15 | 16 | .. automodule:: benford.expected 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | 22 | benford.stats module 23 | -------------------- 24 | 25 | .. automodule:: benford.stats 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | 31 | benford.viz module 32 | ------------------ 33 | 34 | .. automodule:: benford.viz 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | -------------------------------------------------------------------------------- /docs/build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | Welcome to benford_py's documentation! 2 | ====================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | :caption: Contents: 7 | 8 | modules 9 | 10 | 11 | Indices and tables 12 | ================== 13 | 14 | * :ref:`genindex` 15 | * :ref:`modindex` 16 | * :ref:`search` 17 | 18 | On GitHub 19 | --------- 20 | 21 | `Package `_ 22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | 24 | `Demo Jupyter Notebook `_ 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | -------------------------------------------------------------------------------- /docs/build/html/_sources/modules.rst.txt: -------------------------------------------------------------------------------- 1 | benford 2 | ======= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | api 8 | -------------------------------------------------------------------------------- /docs/build/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Index — benford_py 0.3.3 documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
47 | 48 | 96 | 97 |
98 | 99 | 100 | 106 | 107 | 108 |
109 | 110 |
111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 |
131 | 132 |
    133 | 134 |
  • »
  • 135 | 136 |
  • Index
  • 137 | 138 | 139 |
  • 140 | 141 | 142 | 143 |
  • 144 | 145 |
146 | 147 | 148 |
149 |
150 |
151 |
152 | 153 | 154 |

Index

155 | 156 |
157 | A 158 | | B 159 | | C 160 | | D 161 | | F 162 | | K 163 | | L 164 | | M 165 | | N 166 | | P 167 | | R 168 | | S 169 | | T 170 | | U 171 | | V 172 | | Z 173 | 174 |
175 |

A

176 | 177 | 181 | 185 |
186 | 187 |

B

188 | 189 | 211 | 229 |
230 | 231 |

C

232 | 233 | 243 | 255 |
256 | 257 |

D

258 | 259 | 269 | 279 |
280 | 281 |

F

282 | 283 | 287 | 295 |
296 | 297 |

K

298 | 299 | 305 | 311 |
312 | 313 |

L

314 | 315 | 323 | 329 |
330 | 331 |

M

332 | 333 | 359 | 380 |
381 | 382 |

N

383 | 384 | 388 |
389 | 390 |

P

391 | 392 | 400 | 410 |
411 | 412 |

R

413 | 414 | 424 | 434 |
435 | 436 |

S

437 | 438 | 466 | 484 |
485 | 486 |

T

487 | 488 | 492 | 498 |
499 | 500 |

U

501 | 502 | 512 |
513 | 514 |

V

515 | 516 | 524 |
525 | 526 |

Z

527 | 528 | 532 |
533 | 534 | 535 | 536 |
537 | 538 |
539 |
540 | 541 |
542 | 543 |
544 |

545 | © Copyright 2020, Marcel Milcent. 546 | 547 |

548 |
549 | 550 | 551 | 552 | Built with Sphinx using a 553 | 554 | theme 555 | 556 | provided by Read the Docs. 557 | 558 |
559 |
560 |
561 | 562 |
563 | 564 |
565 | 566 | 567 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | -------------------------------------------------------------------------------- /docs/build/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Welcome to benford_py’s documentation! — benford_py 0.3.3 documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
48 | 49 | 97 | 98 |
99 | 100 | 101 | 107 | 108 | 109 |
110 | 111 |
112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 |
132 | 133 |
    134 | 135 |
  • »
  • 136 | 137 |
  • Welcome to benford_py’s documentation!
  • 138 | 139 | 140 |
  • 141 | 142 | 143 | View page source 144 | 145 | 146 |
  • 147 | 148 |
149 | 150 | 151 |
152 |
153 |
154 |
155 | 156 |
157 |

Welcome to benford_py’s documentation!

158 |
159 |

Contents:

160 | 172 |
173 |
174 |
175 |

Indices and tables

176 | 181 |
182 |

On GitHub

183 |
184 |

Package

185 |
186 | 189 |
190 |
191 | 192 | 193 |
194 | 195 |
196 |
197 | 200 | 201 |
202 | 203 |
204 |

205 | © Copyright 2020, Marcel Milcent. 206 | 207 |

208 |
209 | 210 | 211 | 212 | Built with Sphinx using a 213 | 214 | theme 215 | 216 | provided by Read the Docs. 217 | 218 |
219 |
220 |
221 | 222 |
223 | 224 |
225 | 226 | 227 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | -------------------------------------------------------------------------------- /docs/build/html/modules.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | benford — benford_py 0.3.3 documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 |
49 | 50 | 101 | 102 |
103 | 104 | 105 | 111 | 112 | 113 |
114 | 115 |
116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 |
136 | 137 |
    138 | 139 |
  • »
  • 140 | 141 |
  • benford
  • 142 | 143 | 144 |
  • 145 | 146 | 147 | View page source 148 | 149 | 150 |
  • 151 | 152 |
153 | 154 | 155 |
156 |
157 |
158 |
159 | 160 |
161 |

benford

162 |
163 | 172 |
173 |
174 | 175 | 176 |
177 | 178 |
179 |
180 | 184 | 185 |
186 | 187 |
188 |

189 | © Copyright 2020, Marcel Milcent. 190 | 191 |

192 |
193 | 194 | 195 | 196 | Built with Sphinx using a 197 | 198 | theme 199 | 200 | provided by Read the Docs. 201 | 202 |
203 |
204 |
205 | 206 |
207 | 208 |
209 | 210 | 211 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | -------------------------------------------------------------------------------- /docs/build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/html/objects.inv -------------------------------------------------------------------------------- /docs/build/html/py-modindex.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Python Module Index — benford_py 0.3.3 documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 |
50 | 51 | 99 | 100 |
101 | 102 | 103 | 109 | 110 | 111 |
112 | 113 |
114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 |
134 | 135 |
    136 | 137 |
  • »
  • 138 | 139 |
  • Python Module Index
  • 140 | 141 | 142 |
  • 143 | 144 |
  • 145 | 146 |
147 | 148 | 149 |
150 |
151 |
152 |
153 | 154 | 155 |

Python Module Index

156 | 157 |
158 | b 159 |
160 | 161 | 162 | 163 | 165 | 166 | 168 | 171 | 172 | 173 | 176 | 177 | 178 | 181 | 182 | 183 | 186 | 187 | 188 | 191 |
 
164 | b
169 | benford 170 |
    174 | benford.benford 175 |
    179 | benford.expected 180 |
    184 | benford.stats 185 |
    189 | benford.viz 190 |
192 | 193 | 194 |
195 | 196 |
197 |
198 | 199 |
200 | 201 |
202 |

203 | © Copyright 2020, Marcel Milcent. 204 | 205 |

206 |
207 | 208 | 209 | 210 | Built with Sphinx using a 211 | 212 | theme 213 | 214 | provided by Read the Docs. 215 | 216 |
217 |
218 |
219 | 220 |
221 | 222 |
223 | 224 | 225 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /docs/build/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Search — benford_py 0.3.3 documentation 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 |
50 | 51 | 99 | 100 |
101 | 102 | 103 | 109 | 110 | 111 |
112 | 113 |
114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 |
134 | 135 |
    136 | 137 |
  • »
  • 138 | 139 |
  • Search
  • 140 | 141 | 142 |
  • 143 | 144 |
  • 145 | 146 |
147 | 148 | 149 |
150 |
151 |
152 |
153 | 154 | 161 | 162 | 163 |
164 | 165 |
166 | 167 |
168 | 169 |
170 |
171 | 172 |
173 | 174 |
175 |

176 | © Copyright 2020, Marcel Milcent. 177 | 178 |

179 |
180 | 181 | 182 | 183 | Built with Sphinx using a 184 | 185 | theme 186 | 187 | provided by Read the Docs. 188 | 189 |
190 |
191 |
192 | 193 |
194 | 195 |
196 | 197 | 198 | 203 | 204 | 205 | 206 | 207 | 208 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | -------------------------------------------------------------------------------- /docs/build/html/searchindex.js: -------------------------------------------------------------------------------- 1 | Search.setIndex({docnames:["api","index","modules"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":3,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":2,"sphinx.domains.rst":2,"sphinx.domains.std":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["api.rst","index.rst","modules.rst"],objects:{"benford.benford":{Base:[0,1,1,""],Benford:[0,1,1,""],Mantissas:[0,1,1,""],Roll_mad:[0,1,1,""],Roll_mse:[0,1,1,""],Source:[0,1,1,""],Summ:[0,1,1,""],Test:[0,1,1,""],bhattacharyya_distance:[0,4,1,""],duplicates:[0,4,1,""],first_digits:[0,4,1,""],kullback_leibler_divergence:[0,4,1,""],last_two_digits:[0,4,1,""],mad:[0,4,1,""],mad_summ:[0,4,1,""],mantissas:[0,4,1,""],mse:[0,4,1,""],rolling_mad:[0,4,1,""],rolling_mse:[0,4,1,""],second_digit:[0,4,1,""],second_order:[0,4,1,""],summation:[0,4,1,""]},"benford.benford.Benford":{all_confidences:[0,2,1,""],base:[0,3,1,""],chosen:[0,3,1,""],confidence:[0,3,1,""],data:[0,3,1,""],limit_N:[0,3,1,""],mantissas:[0,2,1,""],sec_order:[0,2,1,""],sign:[0,3,1,""],summation:[0,2,1,""],tests:[0,3,1,""],update_confidence:[0,2,1,""],verbose:[0,3,1,""]},"benford.benford.Mantissas":{arc_test:[0,2,1,""],data:[0,3,1,""],report:[0,2,1,""],show_plot:[0,2,1,""],stats:[0,2,1,""],update_confidence:[0,2,1,""]},"benford.benford.Roll_mad":{show_plot:[0,2,1,""],test:[0,3,1,""]},"benford.benford.Roll_mse":{show_plot:[0,2,1,""]},"benford.benford.Source":{duplicates:[0,2,1,""],first_digits:[0,2,1,""],last_two_digits:[0,2,1,""],mantissas:[0,2,1,""],second_digit:[0,2,1,""],summation:[0,2,1,""],verbose:[0,3,1,""]},"benford.benford.Summ":{MAD:[0,3,1,""],confidence:[0,3,1,
""],report:[0,2,1,""],show_plot:[0,2,1,""]},"benford.benford.Test":{KS:[0,3,1,""],MAD:[0,3,1,""],N:[0,3,1,""],chi_square:[0,3,1,""],confidence:[0,3,1,""],critical_values:[0,2,1,""],ddf:[0,3,1,""],digs:[0,3,1,""],report:[0,2,1,""],sec_order:[0,3,1,""],show_plot:[0,2,1,""],update_confidence:[0,2,1,""]},"benford.expected":{First:[0,1,1,""],LastTwo:[0,1,1,""],Second:[0,1,1,""]},"benford.stats":{Z_score:[0,4,1,""],chi_sq:[0,4,1,""],chi_sq_2:[0,4,1,""],kolmogorov_smirnov:[0,4,1,""],kolmogorov_smirnov_2:[0,4,1,""],mad:[0,4,1,""],mse:[0,4,1,""]},"benford.viz":{plot_digs:[0,4,1,""],plot_expected:[0,4,1,""],plot_mantissa_arc_test:[0,4,1,""],plot_ordered_mantissas:[0,4,1,""],plot_roll_mad:[0,4,1,""],plot_roll_mse:[0,4,1,""],plot_sum:[0,4,1,""]},benford:{benford:[0,0,0,"-"],expected:[0,0,0,"-"],stats:[0,0,0,"-"],viz:[0,0,0,"-"]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","method","Python method"],"3":["py","attribute","Python attribute"],"4":["py","function","Python 
function"]},objtypes:{"0":"py:module","1":"py:class","2":"py:method","3":"py:attribute","4":"py:function"},terms:{"case":0,"class":0,"default":0,"float":0,"function":0,"int":0,"new":0,"return":0,"true":0,The:0,Uses:0,_as_gen:0,about:0,absolut:0,accept:0,accord:0,add:0,after:0,aldo:0,all:0,all_confid:0,along:0,alreadi:0,also:0,analys:0,analysi:0,ani:0,anoth:0,api:0,appli:0,applic:0,arc:0,arc_test:0,arg:0,argument:0,arrai:0,atribut:0,attribut:0,avail:0,axi:0,bar:0,base:0,befor:0,beford:0,begin:0,being:0,benford:1,between:0,bhat_coeff:0,bhat_dist:0,bhattacharrya:[],bhattacharyya:0,bhattacharyya_dist:0,big:0,bool:0,both:0,calcul:0,call:0,can:0,capabl:0,ccomparison:0,center:0,chart:0,check:0,chi:0,chi_sq:0,chi_sq_2:0,chi_squar:0,choic:0,choos:0,chose:0,chosen:0,circl:0,circular:0,cmpliant:0,coeffici:0,col:0,column:0,comnput:0,compar:0,comparison:0,compliant:0,comput:0,conf_z:0,confid:0,conform:0,consid:0,content:1,coordin:0,core:0,count:0,creat:0,critic:0,critical_valu:0,cumput:0,cumul:0,current:0,data:0,datafram:0,ddf:0,decim:0,defalut:0,defaulst:0,defaut:0,degre:0,descend:0,deviat:0,dict:0,dictionari:0,difer:0,differ:0,dig:0,digit:0,dimens:0,discard:0,displai:0,distanc:0,distribut:0,diverg:0,down:0,draw:0,duplic:0,each:0,element:0,end:0,entri:0,equal:0,error:0,especif:0,evalu:0,everi:0,expecct:0,expect:[1,2],expext:0,extens:0,extraxt:0,extrem:0,f1d:0,f2d:0,f3d:0,fals:0,fasl:0,fifth:0,figsiz:0,figur:0,file:0,first:0,first_digit:0,fisrt:0,follow:0,forc:0,form:0,format:0,foud:0,foudn:[],found:0,frame:0,freedom:0,frequenc:0,from:0,furst:0,futur:0,gener:0,get:0,give:0,given:0,graviti:0,gravity_cent:0,greatest:0,grid:0,hand:0,handl:0,has:0,have:0,henc:0,high_diff:0,high_z:0,higher:0,highest:0,highlight:0,hold:0,horizont:0,how:0,html:0,http:0,inclin:0,includ:0,index:1,infer:0,inform:0,initi:0,input:0,instanc:0,integ:0,intern:0,its:0,itself:0,keep:0,kl_diverg:0,kolmogorov:0,kolmogorov_smirnov:0,kolmogorov_smirnov_2:0,kulback:0,kullback_leibler_diverg:0,kurtosi:0,kwarg:0,l2d:0,
label:0,laibler:0,last:0,last_two_digit:0,lasttwo:0,leibler:0,len:0,lenght:0,level:0,like:0,limit:0,limit_n:0,line:0,list:0,logarithm:0,look:0,loos:0,lower:0,mad:0,mad_summ:0,mamtissa:0,mani:0,mantissa:0,map:0,matplotlib:0,mean:0,method:0,minu:0,modul:[1,2],mse:0,must:0,name:0,ndarrai:0,need:0,neg:0,new_conf:0,none:0,num:0,number:0,numer:0,numpi:0,obj:0,object:0,occurr:0,ocurr:0,one:0,onli:0,opt:0,option:0,order:0,org:0,origin:0,other:0,output:0,packag:2,page:1,panda:0,paramet:0,path:0,perform:0,place:0,plot:0,plot_dig:0,plot_expect:0,plot_mantissa_arc_test:0,plot_ordered_mantissa:0,plot_roll_mad:0,plot_roll_ms:0,plot_sum:0,plottl:0,popul:0,portion:0,pos:0,posit:0,pre:0,prepar:0,print:0,probabl:0,process:0,produc:0,properti:0,proport:0,provid:0,pyplot:0,rais:0,raw:0,receiv:0,record:0,reduc:0,refer:0,regardless:0,registri:0,relev:[],remov:0,repetit:0,repitit:0,report:0,represent:0,requir:0,respect:0,respectt:[],result:0,ret_df:0,right:0,roll:0,roll_mad:0,roll_ms:0,roll_seri:0,rolling_mad:0,rolling_ms:0,run:0,same:0,sampl:0,save:0,save_p:0,save_plot:0,save_plot_kwarg:0,savefig:0,scatter:0,score:0,search:1,sec_ord:0,second:0,second_digit:0,second_ord:0,select:0,separ:0,sequenc:0,sequenti:0,seri:0,set:0,should:0,show:0,show_plot:0,shown:0,sign:0,simpl:0,sinc:0,size:0,skew:0,smirnov:0,some:0,sort:0,sourc:0,squar:0,squre:0,stat:[1,2],state:0,statist:0,str:0,straight:0,string:0,studi:0,subclass:0,subject:0,subset:0,subtract:0,sum:0,summ:0,summat:0,suprem:0,take:0,tell:0,tend:0,termin:0,test:0,tet:0,text_x:0,tha:0,than:0,thee:0,them:0,thi:0,three:0,through:0,tick:0,toe:0,too:0,top:0,top_rep:0,track:0,transform:0,tupl:0,two:0,type:0,typeerror:0,updat:0,update_confid:0,upper:0,use:0,used:0,usual:0,valu:0,valueerror:0,varianc:0,verbos:0,viz:[1,2],well:0,were:0,when:0,whether:0,which:0,window:0,withe:0,work:0,y_exp:0,y_found:0,z_score:0,zero:0},titles:["benford package","Welcome to benford_py\u2019s 
documentation!","benford"],titleterms:{benford:[0,2],benford_pi:1,demo:1,document:1,expect:0,github:1,indic:1,jupyt:1,modul:0,notebook:1,packag:[0,1],stat:0,tabl:1,viz:0,welcom:1}}) -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | matplotlib -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | benford package 2 | =============== 3 | 4 | benford.benford module 5 | ---------------------- 6 | 7 | .. automodule:: benford.benford 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | 13 | benford.expected module 14 | ----------------------- 15 | 16 | .. 
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
# Two levels up from docs/source, so autodoc can import the package.
sys.path.insert(0, os.path.abspath('../..'))


# -- Project information -----------------------------------------------------

project = 'benford_py'
copyright = '2020, Marcel Milcent'
author = 'Marcel Milcent'

# The full version, including alpha/beta/rc tags.
# Keep this in sync with the `version` argument in setup.py; it previously
# lagged behind at '0.3.3' while the package was published as 0.5.0.
release = '0.5.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.viewcode',
    'sphinx.ext.napoleon'
]

master_doc = 'index'

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Show in order of code source, not alphabetical
autodoc_member_order = 'bysource'
toctree:: 5 | :maxdepth: 3 6 | :caption: Contents: 7 | 8 | modules 9 | 10 | 11 | Indices and tables 12 | ================== 13 | 14 | * :ref:`genindex` 15 | * :ref:`modindex` 16 | * :ref:`search` 17 | 18 | On GitHub 19 | --------- 20 | 21 | `Package `_ 22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | 24 | `Demo Jupyter Notebook `_ 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | benford 2 | ======= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | api 8 | -------------------------------------------------------------------------------- /img/2429_Benford-Frank.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/img/2429_Benford-Frank.jpg -------------------------------------------------------------------------------- /img/Benford_Instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/img/Benford_Instance.png -------------------------------------------------------------------------------- /img/First.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/img/First.png -------------------------------------------------------------------------------- /img/First_Digits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/img/First_Digits.png -------------------------------------------------------------------------------- 
'''Setup script for the benford_py distribution.'''
from os import path
from setuptools import setup

# The PyPI long description is read from README-pypi.md, located next to
# this file, so the build works regardless of the current working directory.
this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, 'README-pypi.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(name='benford_py',
      version='0.5.0',
      # Fixed spelling: the law is named after Frank Benford ("Benford").
      description='A library for testing data sets with Benford\'s Law',
      long_description=long_description,
      long_description_content_type='text/markdown',
      url='https://github.com/milcent/benford_py',
      download_url='https://github.com/milcent/benford_py/archive/v0.5.0.tar.gz',
      author='Marcel Milcent',
      author_email='marcelmilcent@gmail.com',
      license='BSD 3-Clause',
      packages=['benford'],
      install_requires=[
          'pandas',
          'numpy',
          'matplotlib',
      ],
      zip_safe=False,
      classifiers=[
          'Programming Language :: Python :: 3',
          'License :: OSI Approved :: BSD License',
          'Natural Language :: English',
          'Operating System :: OS Independent',
          'Development Status :: 3 - Alpha',
          'Intended Audience :: Financial and Insurance Industry',
          'Intended Audience :: Science/Research',
          'Intended Audience :: Education',
          'Intended Audience :: Other Audience',
          'Topic :: Office/Business :: Financial :: Accounting',
          'Topic :: Scientific/Engineering :: Mathematics',
      ],)
@pytest.fixture
def choose_confidence():
    """Return one randomly chosen key of ``CONFS``, never ``None``.

    ``None`` is excluded explicitly instead of by slicing off the first
    key, so the fixture does not depend on the ordering of ``CONFS``.
    """
    return choice([conf for conf in CONFS if conf is not None])
# Ten parameter sets for the KS return-type test: a random mantissa sample,
# a random flag, and the type the result is compared against.
# ``np.float64`` replaces the ``np.float_`` alias, which was removed in
# NumPy 2.0; on older NumPy versions both names refer to the same type.
mant_ks_dists_types = [
    (gen_mantissa_distribution(), np.random.choice([True, False]), np.float64)
    for _ in range(10)
]
class TestCheckTest():
    """Tests for ``checks._check_test_``."""

    # Every key of DIGS is expected to map to itself, and every value of
    # DIGS is expected to map back to its key.
    digs_tests = [
        *((key, key) for key in DIGS),
        *((val, key) for key, val in DIGS.items()),
    ]

    @pytest.mark.parametrize("dig, expected", digs_tests)
    def test_choose(self, dig, expected):
        outcome = ch._check_test_(dig)
        assert outcome == expected

    # Invalid inputs must raise ValueError; every key and every value of
    # DIGS must be accepted without raising.
    test_check_raise = (
        [(bad, raises(ValueError)) for bad in (4, -3, 2.0, "F4D", False)]
        + [(ok, do_not_raise()) for ok in list(DIGS) + list(DIGS.values())]
    )

    @pytest.mark.parametrize("dig, expectation", test_check_raise)
    def test_raise(self, dig, expectation):
        with expectation:
            outcome = ch._check_test_(dig)
            assert outcome is not None

    def test_None(self):
        with raises(ValueError):
            ch._check_test_(None)
class TestCheckConfidence():
    """Tests for ``checks._check_confidence_``."""

    _invalid_confs = (93, "95", 76, "80", "99", 84, 99.8)

    # The first key of CONFS (None) is left out of the non-raising cases,
    # because the test below asserts that the result is not None.
    conf_errors = (
        [(bad, raises(ValueError)) for bad in _invalid_confs]
        + [(good, do_not_raise()) for good in list(CONFS)[1:]]
    )

    @pytest.mark.parametrize("conf, expectation", conf_errors)
    def test_conf_raises(self, conf, expectation):
        with expectation:
            checked = ch._check_confidence_(conf)
            assert checked is not None

    # Each key of CONFS (including the first one) must be returned unchanged.
    all_confidences = [(key, key) for key in CONFS]

    @pytest.mark.parametrize("conf, expected", all_confidences)
    def test_all_confidences(self, conf, expected):
        checked = ch._check_confidence_(conf)
        assert checked == expected
class TestCheckNunmArray():
    """Tests for ``checks._check_num_array_``."""

    # Plain lists (numeric strings, ints, mixed numbers, booleans) whose
    # checked result is expected to have a float dtype.
    arrays = [
        ['1', '2', '3', '4', '5', '6', '7'],
        [1, 2, 3, 4, 5, 6, 7],
        [1, 2, 3, 4, 5.0, 6.3, .17],
        [True, False, False, True, True, True, False, False]
    ]

    @pytest.mark.parametrize("arr", arrays)
    def test_arrays_to_float(self, arr):
        assert ch._check_num_array_(arr).dtype == float

    def test_small_arrays(self, get_small_arrays):
        # The fixture yields (input, expected dtype) pairs.
        arr, expected = get_small_arrays
        assert ch._check_num_array_(arr).dtype == expected

    def test_np_array_str(self, small_str_foo_array):
        with pytest.raises(ValueError):
            ch._check_num_array_(small_str_foo_array)

    # Inputs paired with the context manager stating whether a ValueError
    # is expected for them.
    num_arr_raise = [
        ({1, 2, 3, 4}, raises(ValueError)),
        ({'a': 1, 'b': 2, 'c': 3, 'd': 4}, raises(ValueError)),
        ([1, 2, 3, 4, 5.0, 6.3, .17], do_not_raise()),
        (['foo', 'baar', 'baz', 'jinks'], raises(ValueError)),
        ('alocdwneceo;u', raises(ValueError))
    ]

    @pytest.mark.parametrize("num_array, expectation", num_arr_raise)
    def test_num_array_raises(self, num_array, expectation):
        # The leftover debug print of the input was removed; pytest already
        # reports the parametrized value when a case fails.
        with expectation:
            assert ch._check_num_array_(num_array) is not None
class TestChiSquare():
    """Tests for ``stats.chi_sq`` across digit tests and confidence levels."""

    @staticmethod
    def _quiet_chi_sq(frame, conf):
        """Call ``st.chi_sq`` with ``verbose=False`` and run the shared checks.

        Asserts that the second returned item equals the ``CRIT_CHI2`` entry
        for the degrees of freedom and confidence used, and that the first
        item is a non-negative float. Returns the result for further checks.
        """
        ddf = len(frame) - 1
        result = st.chi_sq(frame, ddf, conf, verbose=False)
        assert result[1] == CRIT_CHI2[ddf][conf]
        assert result[0] >= 0
        assert isinstance(result[0], float)
        return result

    def test_conf_None(self, gen_join_expect_found_diff_F1D, capsys):
        frame = gen_join_expect_found_diff_F1D
        chi = st.chi_sq(frame, len(frame) - 1, None)
        printed, _ = capsys.readouterr()
        assert "Chi-square test needs confidence other than None." in printed
        assert chi is None

    def test_random_conf_F1D(self, gen_join_expect_found_diff_F1D,
                             choose_confidence):
        self._quiet_chi_sq(gen_join_expect_found_diff_F1D, choose_confidence)

    def test_random_conf_F2D(self, gen_join_expect_found_diff_F2D,
                             choose_confidence):
        self._quiet_chi_sq(gen_join_expect_found_diff_F2D, choose_confidence)

    def test_random_conf_F3D(self, gen_join_expect_found_diff_F3D,
                             choose_confidence):
        self._quiet_chi_sq(gen_join_expect_found_diff_F3D, choose_confidence)

    def test_random_conf_SD(self, gen_join_expect_found_diff_SD,
                            choose_confidence):
        self._quiet_chi_sq(gen_join_expect_found_diff_SD, choose_confidence)

    def test_random_conf_L2D(self, gen_join_expect_found_diff_L2D,
                             choose_confidence):
        self._quiet_chi_sq(gen_join_expect_found_diff_L2D, choose_confidence)

    def test_rand_test_rand_conf_verbose(self, choose_confidence,
                                         gen_join_expect_found_diff_random_test,
                                         capsys):
        # Default verbosity: both values are expected in the captured output.
        frame = gen_join_expect_found_diff_random_test
        chis = st.chi_sq(frame, len(frame) - 1, choose_confidence)
        out, _ = capsys.readouterr()
        assert f"The Chi-square statistic is {chis[0]:.4f}." in out
        assert f"Critical Chi-square for this series: {chis[1]}." in out

    def test_rand_test_conf_80(self, gen_join_expect_found_diff_random_test):
        self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 80)

    def test_rand_test_conf_85(self, gen_join_expect_found_diff_random_test):
        self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 85)

    def test_rand_test_conf_90(self, gen_join_expect_found_diff_random_test):
        self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 90)

    def test_rand_test_conf_95(self, gen_join_expect_found_diff_random_test,
                               capsys):
        # With verbose=False neither message may appear in the output.
        chis = self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 95)
        out, _ = capsys.readouterr()
        assert f"The Chi-square statistic is {chis[0]:.4f}." not in out
        assert f"Critical Chi-square for this series: {chis[1]}." not in out

    def test_rand_test_conf_99(self, gen_join_expect_found_diff_random_test):
        self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 99)

    def test_rand_test_conf_999(self, gen_join_expect_found_diff_random_test):
        self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 99.9)

    def test_rand_test_conf_9999(self, gen_join_expect_found_diff_random_test):
        self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 99.99)

    def test_rand_test_conf_99999(self, gen_join_expect_found_diff_random_test):
        self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 99.999)

    def test_rand_test_conf_999999(self, gen_join_expect_found_diff_random_test):
        self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 99.9999)

    def test_rand_test_conf_9999999(self, gen_join_expect_found_diff_random_test):
        self._quiet_chi_sq(gen_join_expect_found_diff_random_test, 99.99999)
class Test_input_data():
    """Checks on how ``ut.input_data`` handles each kind of input."""

    def test_Series(self, gen_series):
        """For a Series, both elements of the result are the same object."""
        result = ut.input_data(gen_series)
        assert result[0] is result[1]

    def test_array(self, gen_array):
        """For an array, the array itself is returned next to a Series."""
        result = ut.input_data(gen_array)
        assert result[0] is gen_array
        # Exact type match on purpose: a subclass would not satisfy this.
        assert type(result[1]) is pd.Series

    def test_wrong_tuple(self, gen_array, gen_series, gen_data_frame):
        """Each malformed tuple must raise ``TypeError``.

        Every call gets its own ``pytest.raises`` block: the context
        manager exits on the first exception, so statements placed after a
        raising call inside the same block would never run.
        """
        with pytest.raises(TypeError):
            ut.input_data((gen_array, 'seq'))
        with pytest.raises(TypeError):
            ut.input_data((gen_series, 'col1'))
        with pytest.raises(TypeError):
            ut.input_data((gen_data_frame, 2))

    def test_df(self, gen_data_frame):
        """A ``(DataFrame, 'seq')`` tuple yields a DataFrame and a Series."""
        result = ut.input_data((gen_data_frame, 'seq'))
        assert type(result[0]) is pd.DataFrame
        assert type(result[1]) is pd.Series

    def test_wrong_input_type(self, gen_array):
        """A plain list is rejected with ``TypeError``."""
        with pytest.raises(TypeError):
            ut.input_data(gen_array.tolist())
class Test_get_times_10_power():
    """Checks on the ``ZN`` column produced by ``ut.get_times_10_power``."""

    def test_2(self, gen_data_frame):
        """Called without a power argument, ``ZN`` has an integer dtype."""
        # NOTE(review): the test name suggests the default power is 2 - confirm.
        scaled = ut.get_times_10_power(gen_data_frame)
        assert scaled.ZN.dtype == int

    def test_8(self, gen_data_frame):
        """With power 8, ``ZN`` equals ``abs(seq) * 10 ** 8`` cast to int."""
        scaled = ut.get_times_10_power(gen_data_frame, 8)
        assert scaled.ZN.dtype == int
        by_hand = (scaled.seq.abs() * 10 ** 8).astype(int)
        assert (scaled.ZN == by_hand).all()

    def test_0(self, gen_int_df):
        """For the integer fixture, ``ZN`` equals the absolute ``seq`` values."""
        scaled = ut.get_times_10_power(gen_int_df)
        assert scaled.ZN.dtype == int
        magnitudes = scaled.seq.abs()
        assert (scaled.ZN == magnitudes).all()

    def test_infer(self, gen_data_frame):
        """With ``'infer'``, every ``ZN`` value prints as five characters."""
        scaled = ut.get_times_10_power(gen_data_frame, 'infer')
        assert scaled.ZN.dtype == int
        widths = scaled.ZN.astype(str).str.len()
        assert (widths == 5).all()
class Test_get_found_proportions():
    """Checks on the proportion frames supplied by the fixtures."""

    @staticmethod
    def _assert_valid_proportions(proportions):
        """Assert the three properties every proportions frame must have."""
        # NOTE(review): each test originally carried a commented-out range
        # check on the frame's index; confirm whether it should be reinstated.
        assert proportions.Found.sum() > .99999
        assert (proportions.Found >= 0).all()
        assert proportions.Counts.dtype == int

    def test_F1D(self, gen_proportions_F1D):
        """Validate the F1D proportions fixture."""
        self._assert_valid_proportions(gen_proportions_F1D)

    def test_F2D(self, gen_proportions_F2D):
        """Validate the F2D proportions fixture."""
        self._assert_valid_proportions(gen_proportions_F2D)

    def test_F3D(self, gen_proportions_F3D):
        """Validate the F3D proportions fixture."""
        self._assert_valid_proportions(gen_proportions_F3D)

    def test_SD(self, gen_proportions_SD):
        """Validate the SD proportions fixture."""
        self._assert_valid_proportions(gen_proportions_SD)

    def test_L2D(self, gen_proportions_L2D):
        """Validate the L2D proportions fixture."""
        self._assert_valid_proportions(gen_proportions_L2D)
class Test_prepare():
    """Checks on ``ut.prepare`` in simple, full and ``limit_N`` modes.

    The digit-test codes passed to ``ut.prepare`` (1, 2, 3, 22 and -2) are
    the ones used by the F1D, F2D, F3D, SD and L2D tests respectively.
    """

    @staticmethod
    def _check_simple(series, digs):
        """With ``simple=True`` the result has no ``"Dif"`` column."""
        prepared = ut.prepare(series, digs, simple=True)
        assert "Dif" not in prepared.columns

    @staticmethod
    def _check_full(series, digs, expected_n, **prepare_kwargs):
        """The full result is a ``(count, frame)`` pair.

        The frame must have a ``"Z_score"`` column and the count must equal
        *expected_n*. Extra keyword arguments go straight to ``ut.prepare``.
        """
        count, prepared = ut.prepare(series, digs, **prepare_kwargs)
        assert "Z_score" in prepared.columns
        assert count == expected_n

    # --- simple=True -----------------------------------------------------

    def test_F1D_simple(self, gen_series):
        self._check_simple(gen_series, 1)

    def test_F2D_simple(self, gen_series):
        self._check_simple(gen_series, 2)

    def test_F3D_simple(self, gen_series):
        self._check_simple(gen_series, 3)

    def test_SD_simple(self, gen_series):
        self._check_simple(gen_series, 22)

    def test_L2D_simple(self, gen_series):
        self._check_simple(gen_series, -2)

    # --- full output: the count equals the length of the series ----------

    def test_F1D(self, gen_series):
        self._check_full(gen_series, 1, len(gen_series))

    def test_F2D(self, gen_series):
        self._check_full(gen_series, 2, len(gen_series))

    def test_F3D(self, gen_series):
        self._check_full(gen_series, 3, len(gen_series))

    def test_SD(self, gen_series):
        self._check_full(gen_series, 22, len(gen_series))

    def test_L2D(self, gen_series):
        self._check_full(gen_series, -2, len(gen_series))

    # --- full output with limit_N: the count equals limit_N --------------

    def test_F1D_N(self, gen_N, gen_series):
        self._check_full(gen_series, 1, gen_N, limit_N=gen_N)

    def test_F2D_N(self, gen_N, gen_series):
        self._check_full(gen_series, 2, gen_N, limit_N=gen_N)

    def test_F3D_N(self, gen_N, gen_series):
        self._check_full(gen_series, 3, gen_N, limit_N=gen_N)

    def test_SD_N(self, gen_N, gen_series):
        self._check_full(gen_series, 22, gen_N, limit_N=gen_N)

    def test_L2D_N(self, gen_N, gen_series):
        self._check_full(gen_series, -2, gen_N, limit_N=gen_N)