├── .github
└── workflows
│ ├── pylint.yml
│ └── python-package.yml
├── .gitignore
├── .pylintrc
├── .readthedocs.yml
├── CITATION.cff
├── Demo.ipynb
├── LICENSE.txt
├── MANIFEST.in
├── README-pypi.md
├── README.md
├── __init__.py
├── benford
├── __init__.py
├── benford.py
├── checks.py
├── constants.py
├── expected.py
├── reports.py
├── stats.py
├── utils.py
└── viz.py
├── data
└── SPY.csv
├── docs
├── Makefile
├── build
│ ├── doctrees
│ │ ├── api.doctree
│ │ ├── benford.doctree
│ │ ├── environment.pickle
│ │ ├── index.doctree
│ │ └── modules.doctree
│ └── html
│ │ ├── .buildinfo
│ │ ├── _modules
│ │ ├── benford
│ │ │ ├── benford.html
│ │ │ ├── expected.html
│ │ │ ├── stats.html
│ │ │ ├── utils.html
│ │ │ └── viz.html
│ │ └── index.html
│ │ ├── _sources
│ │ ├── api.rst.txt
│ │ ├── index.rst.txt
│ │ └── modules.rst.txt
│ │ ├── api.html
│ │ ├── genindex.html
│ │ ├── index.html
│ │ ├── modules.html
│ │ ├── objects.inv
│ │ ├── py-modindex.html
│ │ ├── search.html
│ │ └── searchindex.js
├── make.bat
├── requirements.txt
└── source
│ ├── api.rst
│ ├── conf.py
│ ├── index.rst
│ └── modules.rst
├── img
├── 2429_Benford-Frank.jpg
├── Benford_Instance.png
├── First.png
├── First_Digits.png
├── SPY-f2d-conf_level-95.png
├── Simon_Newcomb_APS.jpg
└── formula.png
├── setup.cfg
├── setup.py
└── tests
├── __init__.py
├── conftest.py
├── test_checks.py
├── test_expected.py
├── test_stats.py
└── test_utils.py
/.github/workflows/pylint.yml:
--------------------------------------------------------------------------------
# Runs pylint over every tracked Python file on each push.
name: Pylint

on: [push]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python 3.8
      uses: actions/setup-python@v1
      with:
        # Quoted so the version stays a string instead of a YAML float.
        python-version: "3.8"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install pylint numpy pandas matplotlib
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Analysing the code with pylint
      run: |
        # `ls -R | grep .py$` yields bare file names (no directory part), so
        # files inside sub-packages could not be found, and the unescaped dot
        # also matched names merely ending in "py". Ask git for the tracked
        # *.py files with their repo-relative paths instead.
        pylint $(git ls-files '*.py')
24 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: benford_py

on:
  push:
    branches: [ develop ]
  pull_request:
    branches: [ develop ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted so YAML keeps them as strings; an unquoted
        # 3.10 would be loaded as the float 3.1.
        python-version: ["3.6", "3.7", "3.8", "3.9"]

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest numpy pandas matplotlib
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled python modules.
2 | *.pyc
3 |
4 | __pycache__/
5 |
6 | # ipython notebook checkpoints
7 | *.ipynb_checkpoints
8 |
9 | # text editor backups
10 | *~
11 |
12 | # VS Code
13 | .vscode/
14 |
15 | # Jupyter NB Checkpoints
16 | .ipynb_checkpoints/
17 |
18 | # Setuptools distribution folder.
19 | /dist/
20 | /build/
21 |
22 | # Python egg metadata, regenerated from source files by setuptools.
23 | /*.egg-info
24 |
25 | # Sphinx docs rendered files
26 | # /docs/build/
27 | _build
28 | _static
29 | _templates
30 |
31 | # pytest
32 | .pytest_cache/
33 | #VSCode
34 | .vscode/
35 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MASTER]
2 | disable=
3 | F0001, # No module named XXXXXXXX
4 |
5 |
6 |
7 | ignored-classes=SQLObject,Registrant,scoped_session
8 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Build documentation with MkDocs
#mkdocs:
#  configuration: mkdocs.yml

# Optionally build your docs in additional formats such as PDF and ePub
formats: all

# Optionally set the version of Python and requirements required to build your docs
# NOTE(review): newer Read the Docs builds may expect `build.os` and
# `build.tools.python` instead of `python.version` - confirm against the
# current configuration reference before the next docs build.
python:
  # Quoted so the version is kept as a string rather than a YAML float.
  version: "3.7"
  install:
    - requirements: docs/requirements.txt
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: "Milcent"
5 | given-names: "Marcel"
6 | orcid:
7 | title: "Benford_py: a Python Implementation of Benford's Law Tests"
8 | version: 0.5.0
9 | doi:
10 | date-released: 2017
11 | url: "https://github.com/milcent/benford_py"
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2014-2021, Marcel Milcent.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | * Neither the name of the copyright holder nor the names of its
16 | contributors may be used to endorse or promote products derived from
17 | this software without specific prior written permission.
18 |
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include README-pypi.md
3 | include LICENSE.txt
--------------------------------------------------------------------------------
/README-pypi.md:
--------------------------------------------------------------------------------
1 | [![Downloads](https://pepy.tech/badge/benford-py)](https://pepy.tech/project/benford-py)
2 |
3 | # Benford for Python
4 |
5 | --------------------------------------------------------------------------------
6 |
7 | **Citing**
8 |
9 |
10 | If you find *Benford_py* useful in your research, please consider adding the following citation:
11 |
12 | ```bibtex
13 | @misc{benford_py,
14 | author = {Marcel, Milcent},
15 | title = {{Benford_py: a Python Implementation of Benford's Law Tests}},
16 | year = {2017},
17 | publisher = {GitHub},
18 | journal = {GitHub repository},
19 | howpublished = {\url{https://github.com/milcent/benford_py}},
20 | }
21 | ```
22 |
23 | --------------------------------------------------------------------------------
24 |
25 | `current version = 0.5.0`
26 |
27 | ### See [release notes](https://github.com/milcent/benford_py/releases/) for features in this and in older versions
28 |
29 | ### Python versions >= 3.6
30 |
31 | ### Installation
32 |
33 | Benford_py is a package in PyPi, so you can install with pip:
34 |
35 | `pip install benford_py`
36 |
37 | or
38 |
39 | `pip install benford-py`
40 |
41 | Or you can cd into the site-packages subfolder of your python distribution (or environment) and git clone from there:
42 |
43 | `git clone https://github.com/milcent/benford_py`
44 |
45 | For a quick start, please go to the [Demo notebook](https://github.com/milcent/benford_py/blob/master/Demo.ipynb), in which I show examples on how to run the tests with the SPY (S&P 500 ETF) daily returns.
46 |
47 | For more fine-grained details of the functions and classes, see the [docs](https://benford-py.readthedocs.io/en/latest/index.html).
48 |
49 | ### Background
50 |
51 | The first digit of a number is [its leftmost digit](https://github.com/milcent/benford_py/blob/master/img/First_Digits.png)
52 |
53 | Since the first digit of any number can range from "1" to "9"
54 | (not considering "0"), it would be intuitively expected that the
55 | proportion of each occurrence in a set of numerical records would
56 | be uniformly distributed at 1/9, i.e., approximately 0.1111,
57 | or 11.11%.
58 |
59 | [Benford's Law](https://en.wikipedia.org/wiki/Benford%27s_law),
60 | also known as the Law of First Digits or the Phenomenon of
61 | Significant Digits, is the finding that the first digits of the
62 | numbers found in series of records of the most varied sources do
63 | not display a uniform distribution, but rather are arranged in such
64 | a way that the digit "1" is the most frequent, followed by "2",
65 | "3", and so in a successive and decremental way down to "9",
66 | which presents the lowest frequency as the first digit.
67 |
68 | The expected distributions of the First Digits in a
69 | Benford-compliant data set are the ones shown [here](https://github.com/milcent/benford_py/blob/master/img/First.png)
70 |
71 | The first record on the subject dates from 1881, in the work of
72 | [Simon Newcomb](https://github.com/milcent/benford_py/blob/master/img/Simon_Newcomb_APS.jpg), an American-Canadian astronomer and mathematician,
73 | who noted that in the logarithmic tables the first pages, which
74 | contained logarithms beginning with the numerals "1" and "2",
75 | were more worn out, that is, more consulted.
76 |
77 | In that same article, Newcomb proposed the [formula](https://github.com/milcent/benford_py/blob/master/img/formula.png) for the probability of a certain digit "d"
78 | being the first digit of a number, given by the following equation.
79 |
80 | In 1938, the American physicist [Frank Benford](https://github.com/milcent/benford_py/blob/master/img/2429_Benford-Frank.jpg) revisited the
81 | phenomenon, which he called the "Law of Anomalous Numbers," in
82 | a survey with more than 20,000 observations of empirical data
83 | compiled from various sources, ranging from areas of rivers to
84 | molecular weights of chemical compounds, including cost data,
85 | address numbers, population sizes and physical constants. All
86 | of them, to a greater or lesser extent, followed such
87 | distribution.
88 |
89 | The extent of Benford's work seems to have been one good reason
90 | for the phenomenon to be popularized with his name, though
91 | described by Newcomb 57 years earlier.
92 |
93 | Derivations of the original formula were also applied in the
94 | expected findings of the proportions of digits in other
95 | positions in the number, as in the case of the second digit
96 | (BENFORD, 1938), as well as combinations, such as the first
97 | two digits of a number (NIGRINI, 2012, p.5).
98 |
99 | Only in 1995, however, was the phenomenon proven by Hill.
100 | His proof was based on the fact that numbers in data series
101 | following the Benford Law are, in effect, "second generation"
102 | distributions, i.e., combinations of other distributions.
103 | The union of randomly drawn samples from various distributions
104 | forms a distribution that respects Benford's Law (HILL, 1995).
105 |
106 | When grouped in ascending order, data that obey Benford's Law
107 | must approximate a geometric sequence (NIGRINI, 2012, page 21).
108 | From this it follows that the logarithms of this ordered series
109 | must form a straight line. In addition, the mantissas (decimal
110 | parts) of the logarithms of these numbers must be uniformly
111 | distributed in the interval [0,1] (NIGRINI, 2012, p.10).
112 |
113 | In general, a series of numerical records follows Benford's Law
114 | when (NIGRINI, 2012, p.21):
115 | * it represents magnitudes of events or events, such as populations
116 | of cities, flows of water in rivers or sizes of celestial bodies;
117 | * it does not have pre-established minimum or maximum limits;
118 | * it is not made up of numbers used as identifiers, such as
119 | identity or social security numbers, bank accounts, telephone numbers; and
120 | * its mean is greater than the median, and the data is not
121 | concentrated around the mean.
122 |
123 | It follows from this expected distribution that, if the set of
124 | numbers in a series of records that usually respects the Law
125 | shows a deviation in the proportions found, there may be
126 | distortions, whether intentional or not.
127 |
128 | Benford's Law has been used in [several fields](http://www.benfordonline.net/).
129 | After asserting that the usual data type is Benford-compliant,
130 | one can study samples from the same data type in search of
131 | inconsistencies, errors or even [fraud](https://www.amazon.com.br/Benfords-Law-Applications-Accounting-Detection/dp/1118152859).
132 |
133 | This open source module is an attempt to facilitate the
134 | performance of Benford's Law-related tests by people using
135 | Python, whether interactively or in an automated, scripting way.
136 |
137 | It uses the versatility of numpy and pandas, along with
138 | matplotlib for visualization, to deliver results like [this one](https://github.com/milcent/benford_py/blob/master/img/SPY-f2d-conf_level-95.png) and much more.
139 |
140 |
141 | It has been a long time since I last tested it in Python 2. The death clock has stopped ticking, so officially it is for Python 3 now. It should work on Linux, Windows and Mac, but please file a bug report if you run into some trouble.
142 |
143 | Also, if you have some nice data set that we can run these tests on, let's try it.
144 |
145 | Thanks!
146 |
147 | Milcent
148 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Downloads](https://pepy.tech/badge/benford-py)](https://pepy.tech/project/benford-py)
2 |
3 | # Benford for Python
4 |
5 | --------------------------------------------------------------------------------
6 |
7 | **Citing**
8 |
9 |
10 | If you find *Benford_py* useful in your research, please consider adding the following citation:
11 |
12 | ```bibtex
13 | @misc{benford_py,
14 | author = {Marcel, Milcent},
15 | title = {{Benford_py: a Python Implementation of Benford's Law Tests}},
16 | year = {2017},
17 | publisher = {GitHub},
18 | journal = {GitHub repository},
19 | howpublished = {\url{https://github.com/milcent/benford_py}},
20 | }
21 | ```
22 |
23 | --------------------------------------------------------------------------------
24 |
25 | `current version = 0.5.0`
26 |
27 | ### See [release notes](https://github.com/milcent/benford_py/releases/) for features in this and in older versions
28 |
29 | ### Python versions >= 3.6
30 |
31 | ### Installation
32 |
33 | Benford_py is a package in PyPi, so you can install with pip:
34 |
35 | `pip install benford_py`
36 |
37 | or
38 |
39 | `pip install benford-py`
40 |
41 | Or you can cd into the site-packages subfolder of your python distribution (or environment) and git clone from there:
42 |
43 | `git clone https://github.com/milcent/benford_py`
44 |
45 | For a quick start, please go to the [Demo notebook](https://github.com/milcent/benford_py/blob/master/Demo.ipynb), in which I show examples on how to run the tests with the SPY (S&P 500 ETF) daily returns.
46 |
47 | For more fine-grained details of the functions and classes, see the [docs](https://benford-py.readthedocs.io/en/latest/index.html).
48 |
49 | ### Background
50 |
51 | The first digit of a number is its leftmost digit.
52 |
53 |
54 |
55 |
56 | Since the first digit of any number can range from "1" to "9"
57 | (not considering "0"), it would be intuitively expected that the
58 | proportion of each occurrence in a set of numerical records would
59 | be uniformly distributed at 1/9, i.e., approximately 0.1111,
60 | or 11.11%.
61 |
62 | [Benford's Law](https://en.wikipedia.org/wiki/Benford%27s_law),
63 | also known as the Law of First Digits or the Phenomenon of
64 | Significant Digits, is the finding that the first digits of the
65 | numbers found in series of records of the most varied sources do
66 | not display a uniform distribution, but rather are arranged in such
67 | a way that the digit "1" is the most frequent, followed by "2",
68 | "3", and so in a successive and decremental way down to "9",
69 | which presents the lowest frequency as the first digit.
70 |
71 | The expected distributions of the First Digits in a
72 | Benford-compliant data set are the ones shown below:
73 |
74 |
75 |
76 |
77 | The first record on the subject dates from 1881, in the work of
78 | Simon Newcomb, an American-Canadian astronomer and mathematician,
79 | who noted that in the logarithmic tables the first pages, which
80 | contained logarithms beginning with the numerals "1" and "2",
81 | were more worn out, that is, more consulted.
82 |
83 |
84 |
85 |
86 |
87 | Simon Newcomb, 1835-1909.
88 |
89 |
90 | In that same article, Newcomb proposed the formula for the
91 | probability of a certain digit "d" being the first digit of a
92 | number, given by the following equation.
93 |
94 |
95 |
96 | $$P(D = d) = \log_{10}\left(1 + \frac{1}{d}\right)$$
97 |
where: P (D = d) is the probability that
98 | the first digit is equal to d, and d is an integer ranging
99 | from 1 to 9.
100 |
101 |
102 | In 1938, the American physicist Frank Benford revisited the
103 | phenomenon, which he called the "Law of Anomalous Numbers," in
104 | a survey with more than 20,000 observations of empirical data
105 | compiled from various sources, ranging from areas of rivers to
106 | molecular weights of chemical compounds, including cost data,
107 | address numbers, population sizes and physical constants. All
108 | of them, to a greater or lesser extent, followed such
109 | distribution.
110 |
111 |
112 |
113 |
114 |
115 | Frank Albert Benford, Jr., 1883-1948.
116 |
117 |
118 | The extent of Benford's work seems to have been one good reason
119 | for the phenomenon to be popularized with his name, though
120 | described by Newcomb 57 years earlier.
121 |
122 | Derivations of the original formula were also applied in the
123 | expected findings of the proportions of digits in other
124 | positions in the number, as in the case of the second digit
125 | (BENFORD, 1938), as well as combinations, such as the first
126 | two digits of a number (NIGRINI, 2012, p.5).
127 |
128 | Only in 1995, however, was the phenomenon proven by Hill.
129 | His proof was based on the fact that numbers in data series
130 | following the Benford Law are, in effect, "second generation"
131 | distributions, i.e., combinations of other distributions.
132 | The union of randomly drawn samples from various distributions
133 | forms a distribution that respects Benford's Law (HILL, 1995).
134 |
135 | When grouped in ascending order, data that obey Benford's Law
136 | must approximate a geometric sequence (NIGRINI, 2012, page 21).
137 | From this it follows that the logarithms of this ordered series
138 | must form a straight line. In addition, the mantissas (decimal
139 | parts) of the logarithms of these numbers must be uniformly
140 | distributed in the interval [0,1] (NIGRINI, 2012, p.10).
141 |
142 | In general, a series of numerical records follows Benford's Law
143 | when (NIGRINI, 2012, p.21):
144 | * it represents magnitudes of events or events, such as populations
145 | of cities, flows of water in rivers or sizes of celestial bodies;
146 | * it does not have pre-established minimum or maximum limits;
147 | * it is not made up of numbers used as identifiers, such as
148 | identity or social security numbers, bank accounts, telephone numbers; and
149 | * its mean is greater than the median, and the data is not
150 | concentrated around the mean.
151 |
152 | It follows from this expected distribution that, if the set of
153 | numbers in a series of records that usually respects the Law
154 | shows a deviation in the proportions found, there may be
155 | distortions, whether intentional or not.
156 |
157 | Benford's Law has been used in [several fields](http://www.benfordonline.net/).
158 | After asserting that the usual data type is Benford-compliant,
159 | one can study samples from the same data type in search of
160 | inconsistencies, errors or even [fraud](https://www.amazon.com.br/Benfords-Law-Applications-Accounting-Detection/dp/1118152859).
161 |
162 | This open source module is an attempt to facilitate the
163 | performance of Benford's Law-related tests by people using
164 | Python, whether interactively or in an automated, scripting way.
165 |
166 | It uses the versatility of numpy and pandas, along with
167 | matplotlib for visualization, to deliver results like the one
168 | below and much more.
169 |
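170 | ![First Two Digits test on SPY, 95% confidence level](https://github.com/milcent/benford_py/blob/master/img/SPY-f2d-conf_level-95.png)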
171 |
172 | It has been a long time since I last tested it in Python 2. The death clock has stopped ticking, so officially it is for Python 3 now. It should work on Linux, Windows and Mac, but please file a bug report if you run into some trouble.
173 |
174 | Also, if you have some nice data set that we can run these tests on, let's try it.
175 |
176 | Thanks!
177 |
178 | Milcent
179 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
'''Benford's Law module'''
__version__ = "0.5.0"
3 |
--------------------------------------------------------------------------------
/benford/__init__.py:
--------------------------------------------------------------------------------
"""
Benford_py for Python is a module for application of Benford's Law
to a sequence of numbers.

Dependent on pandas, numpy and matplotlib

All logarithms are in base 10: "log10"

Author: Marcel Milcent

SPDX-License-Identifier: BSD-3-Clause
"""

# Re-export everything the .benford module exposes at package level.
from .benford import *

__version__ = '0.5.0'
17 |
--------------------------------------------------------------------------------
/benford/checks.py:
--------------------------------------------------------------------------------
1 | from pandas import Series
2 | from numpy import array, ndarray
3 | from .constants import DIGS, REV_DIGS, CONFS
4 |
5 |
6 | def _check_digs_(digs):
7 | """Checks the possible values for the digs parameter of the
8 | First Digits tests
9 | """
10 | if digs not in [1, 2, 3]:
11 | raise ValueError("The value assigned to the parameter -digs- "
12 | f"was {digs}. Value must be 1, 2 or 3.")
13 |
14 |
def _check_test_(test):
    """Resolve the -test- parameter to its integer code.

    Args:
        test: an int that must be a key of DIGS, or a str that must be a
            key of REV_DIGS.

    Returns:
        The int itself when given an int, or the REV_DIGS value mapped to
        the given string.

    Raises:
        ValueError: if the value is not a known key, or is neither an int
            nor a str.
    """
    if isinstance(test, int):
        if test not in DIGS.keys():
            raise ValueError(f'Test was set to {test}. Should be one of '
                             f'{DIGS.keys()}')
        return test

    if isinstance(test, str):
        if test not in REV_DIGS.keys():
            raise ValueError(f'Test was set to {test}. Should be one of '
                             f'{REV_DIGS.keys()}')
        return REV_DIGS[test]

    # Neither an int nor a str: list both sets of accepted values.
    raise ValueError('Wrong value chosen for test parameter. Possible '
                     f'values are\n {list(DIGS.keys())} for ints and'
                     f'\n {list(REV_DIGS.keys())} for strings.')
34 |
35 |
36 | def _check_decimals_(decimals):
37 | """"""
38 | if isinstance(decimals, int):
39 | if (decimals < 0):
40 | raise ValueError(
41 | "Parameter -decimals- must be an int >= 0, or 'infer'.")
42 | else:
43 | if decimals != 'infer':
44 | raise ValueError(
45 | "Parameter -decimals- must be an int >= 0, or 'infer'.")
46 | return decimals
47 |
48 |
49 | def _check_sign_(sign):
50 | """"""
51 | if sign not in ['all', 'pos', 'neg']:
52 | raise ValueError("Parameter -sign- must be one of the following: "
53 | "'all', 'pos' or 'neg'.")
54 | return sign
55 |
56 |
def _check_confidence_(confidence):
    """Validate the -confidence- parameter against the keys of CONFS.

    Args:
        confidence: value that must be one of the keys of CONFS.

    Returns:
        The value received, unchanged.

    Raises:
        ValueError: if confidence is not a key of CONFS.
    """
    if confidence in CONFS.keys():
        return confidence
    raise ValueError("Value of parameter -confidence- must be one of the "
                     f"following:\n {list(CONFS.keys())}")
63 |
64 |
65 | def _check_high_Z_(high_Z):
66 | """"""
67 | if not high_Z in ['pos', 'all']:
68 | if not isinstance(high_Z, int):
69 | raise ValueError("The parameter -high_Z- should be 'pos', "
70 | "'all' or an int.")
71 | return high_Z
72 |
73 |
74 | def _check_num_array_(data):
75 | """"""
76 | if (not isinstance(data, ndarray)) & (not isinstance(data, Series)):
77 | print('\n`data` not a numpy NDarray nor a pandas Series.'
78 | ' Trying to convert...')
79 | try:
80 | data = array(data)
81 | except:
82 | raise ValueError('Could not convert data. Check input.')
83 | print('\nConversion successful.')
84 |
85 | try:
86 | data = data.astype(float)
87 | except:
88 | raise ValueError('Could not convert data. Check input.')
89 | else:
90 | if data.dtype not in [int, float]:
91 | try:
92 | data = data.astype(float)
93 | except:
94 | raise ValueError('Could not convert data. Check input.')
95 | return data
96 |
--------------------------------------------------------------------------------
/benford/constants.py:
--------------------------------------------------------------------------------
# Integer test codes mapped to the short names of the tests
DIGS = {
    1: 'F1D',
    2: 'F2D',
    3: 'F3D',
    22: 'SD',
    -2: 'L2D',
}

# Same codes, with the short names carrying the '_sec' suffix
SEC_ORDER_DIGS = {code: f'{name}_sec' for code, name in DIGS.items()}

# Short names mapped back to their integer codes
REV_DIGS = {name: code for code, name in DIGS.items()}

# Integer test codes mapped to the length associated with each test
LEN_TEST = {
    1: 9,
    2: 90,
    3: 900,
    22: 10,
    -2: 100,
}
8 |
# Full display names of the tests, keyed by their short names (plain,
# '_sec' second order, '_Summ' summation, and 'Mantissas')
TEST_NAMES = {'F1D': 'First Digit Test', 'F2D': 'First Two Digits Test',
              'F3D': 'First Three Digits Test', 'SD': 'Second Digit Test',
              'L2D': 'Last Two Digits Test',
              'F1D_sec': 'First Digit Second Order Test',
              'F2D_sec': 'First Two Digits Second Order Test',
              'F3D_sec': 'First Three Digits Second Order Test',
              'SD_sec': 'Second Digit Second Order Test',
              'L2D_sec': 'Last Two Digits Second Order Test',
              'F1D_Summ': 'First Digit Summation Test',
              'F2D_Summ': 'First Two Digits Summation Test',
              'F3D_Summ': 'First Three Digits Summation Test',
              'Mantissas': 'Mantissas Test'
              }

# Critical values for Mean Absolute Deviation
# Integer keys hold a list of three ascending cut-off values (None for -2);
# the string keys hold the readable name of the corresponding test.
MAD_CONFORM = {1: [0.006, 0.012, 0.015], 2: [0.0012, 0.0018, 0.0022],
               3: [0.00036, 0.00044, 0.00050], 22: [0.008, 0.01, 0.012],
               -2: None, 'F1D': 'First Digit', 'F2D': 'First Two Digits',
               'F3D': 'First Three Digits', 'SD': 'Second Digits'}

# Color for the plotting
# Hex color strings keyed by short codes.
COLORS = {'m': '#00798c', 'b': '#E2DCD8', 's': '#9c3848',
          'af': '#edae49', 'ab': '#33658a', 'h': '#d1495b',
          'h2': '#f64740', 't': '#16DB93'}

# Critical Z-scores according to the confidence levels
CONFS = {None: None, 80: 1.285, 85: 1.435, 90: 1.645, 95: 1.96,
         99: 2.576, 99.9: 3.29, 99.99: 3.89, 99.999: 4.417,
         99.9999: 4.892, 99.99999: 5.327}

# p-values, as strings, for the same confidence level keys used in CONFS
P_VALUES = {None: 'None', 80: '0.2', 85: '0.15', 90: '0.1', 95: '0.05',
            99: '0.01', 99.9: '0.001', 99.99: '0.0001', 99.999: '0.00001',
            99.9999: '0.000001', 99.99999: '0.0000001'}

# Critical Chi-Square values according to the tests degrees of freedom
# and confidence levels
# Outer keys are the degrees of freedom, inner keys the confidence levels.
CRIT_CHI2 = {8: {80: 11.03, 85: 12.027, 90: 13.362, 95: 15.507,
                 99: 20.090, 99.9: 26.124, 99.99: 31.827, None: None,
                 99.999: 37.332, 99.9999: 42.701, 99.99999: 47.972},
             9: {80: 12.242, 85: 13.288, 90: 14.684, 95: 16.919,
                 99: 21.666, 99.9: 27.877, 99.99: 33.72, None: None,
                 99.999: 39.341, 99.9999: 44.811, 99.99999: 50.172},
             89: {80: 99.991, 85: 102.826, 90: 106.469, 95: 112.022,
                  99: 122.942, 99.9: 135.978, 99.99: 147.350,
                  99.999: 157.702, 99.9999: 167.348, 99.99999: 176.471,
                  None: None},
             99: {80: 110.607, 85: 113.585, 90: 117.407,
                  95: 123.225, 99: 134.642, 99.9: 148.230,
                  99.99: 160.056, 99.999: 170.798, 99.9999: 180.792,
                  99.99999: 190.23, None: None},
             899: {80: 934.479, 85: 942.981, 90: 953.752, 95: 969.865,
                   99: 1000.575, 99.9: 1035.753, 99.99: 1065.314,
                   99.999: 1091.422, 99.9999: 1115.141,
                   99.99999: 1137.082, None: None}
             }

# Critical Kolmogorov-Smirnov values according to the confidence levels
# These values are yet to be divided by the square root of the sample size
CRIT_KS = {80: 1.073, 85: 1.138, 90: 1.224, 95: 1.358, 99: 1.628,
           99.9: 1.949, 99.99: 2.225, 99.999: 2.47,
           99.9999: 2.693, 99.99999: 2.899, None: None}
70 |
--------------------------------------------------------------------------------
/benford/expected.py:
--------------------------------------------------------------------------------
1 | from pandas import DataFrame
2 | from numpy import array, arange, log10
3 | from .checks import _check_digs_
4 | from .viz import plot_expected
5 |
6 |
class First(DataFrame):
    """DataFrame with the expected Benford proportions of the First,
    First Two or First Three digits.

    The frame has a single 'Expected' column and its index is named
    'First_{digs}_Dig'.

    Args:
        digs: 1, 2 or 3 - selects which leading digits to consider:
            1 for the First Digit, 2 for the First Two Digits and 3 for
            the First Three Digits.
        plot: whether to draw a bar chart of the Expected proportions.
            Defaults to True.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension. Only
            available when plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when plot=True and save_plot is a string with
            the figure file path/name.
    """

    def __init__(self, digs, plot=True, save_plot=None, save_plot_kwargs=None):
        _check_digs_(digs)
        expected, digits = _gen_first_digits_(digs)

        super().__init__({'Expected': expected}, index=digits)
        self.index.names = [f'First_{digs}_Dig']

        if not plot:
            return
        plot_expected(self, digs, save_plot=save_plot,
                      save_plot_kwargs=save_plot_kwargs)
39 |
40 |
class Second(DataFrame):
    """DataFrame with the expected Benford proportions of the Second
    Digits.

    The frame has a single 'Expected' column, indexed by 'Sec_Dig'.

    Args:
        plot: whether to draw a bar chart of the Expected proportions.
            Defaults to True.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension. Only
            available when plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when plot=True and save_plot is a string with
            the figure file path/name.
    """

    def __init__(self, plot=True, save_plot=None, save_plot_kwargs=None):
        expected, digits = _gen_second_digits_()

        super().__init__({'Expected': expected, 'Sec_Dig': digits})
        self.set_index("Sec_Dig", inplace=True)

        if not plot:
            return
        # 22 is the code used for the Second Digit test.
        plot_expected(self, 22, save_plot=save_plot,
                      save_plot_kwargs=save_plot_kwargs)
68 |
69 |
class LastTwo(DataFrame):
    """Expected proportions of the Last Two Digits, stored as a DataFrame
    with an ``Expected`` column indexed by ``Last_2_Dig``.

    Args:
        num: forwarded to _gen_last_two_digits_. When True the index holds
            ints; when False (the default) it holds two-character strings.
        plot: when True (the default), draws a bar chart of the Expected
            proportions.
        save_plot: path/name of the file in which to save the generated
            plot, through matplotlib.pyplot.savefig(). The file format is
            inferred from the file name extension. Only used when plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only used when plot=True and save_plot is a string with the
            figure file path/name.
    """

    def __init__(self, num=False, plot=True, save_plot=None, save_plot_kwargs=None):
        expected, last_two = _gen_last_two_digits_(num=num)

        super().__init__({'Expected': expected, 'Last_2_Dig': last_two})
        self.set_index('Last_2_Dig', inplace=True)

        if not plot:
            return
        # -2 is the code used for the Last Two Digits test
        plot_expected(self, -2, save_plot=save_plot,
                      save_plot_kwargs=save_plot_kwargs)
95 |
96 |
def _get_expected_digits_(digs):
    """Instantiates, without plotting, the Expected class matching a test.

    Args:
        digs: the int code of the test: 1, 2 or 3 select First; 22 selects
            Second; any other value falls through to LastTwo.

    Returns:
        The Expected instance for the proper test to be performed.
    """
    if digs in [1, 2, 3]:
        return First(digs, plot=False)
    if digs == 22:
        return Second(plot=False)
    # every remaining code is treated as the Last Two Digits test
    return LastTwo(num=True, plot=False)
112 |
113 |
114 | def _gen_last_two_digits_(num=False):
115 | """Creates two arrays, one with the possible last two digits and one with
116 | thei respective probabilities
117 |
118 | Args:
119 | num: returns numeric (ints) values. Defaluts to False,
120 | which returns strings.
121 |
122 | Returns:
123 | exp (np.array): Array with the (constant) probabilities of occurrence of
124 | each pair of last two digits
125 | l2d (np.array): Array of ints or str, in any case representing all 100
126 | possible combinations of last two digits
127 | """
128 | exp = array([1 / 99.] * 100)
129 | l2d = arange(0, 100)
130 | if num:
131 | return exp, l2d
132 | l2d = l2d.astype(str)
133 | l2d[:10] = array(['00', '01', '02', '03', '04', '05',
134 | '06', '07', '08', '09'])
135 | return exp, l2d
136 |
137 | def _gen_first_digits_(digs):
138 | """Creates two arrays, one with the possible digits combinations and the
139 | other with their respective expected probabilities according to Benford
140 |
141 | Args:
142 | digs (int): 1, 2 or 3, for generation of the first, first two, or first
143 | three digits
144 |
145 | Returns:
146 | (tuple of arrays): the expected probabilities array and the digits
147 | combination array.
148 | """
149 | dig_array = arange(10 ** (digs - 1), 10 ** digs)
150 | exp_prob = log10(1 + (1. / dig_array))
151 | return exp_prob, dig_array
152 |
def _gen_second_digits_():
    """Builds the second digits (0 to 9) and their expected Benford
    probabilities.

    Each probability is the sum of the First Two Digits probabilities of
    the nine combinations (10 to 99) that share that second digit.

    Returns:
        (tuple of arrays): the expected probabilities array and the second
            digits array, in that order.
    """
    exp_f2d, _ = _gen_first_digits_(2)
    # second digit of each first-two-digits value: 0..9 repeated nine times
    second_of_f2d = arange(90) % 10
    exp = array([exp_f2d[second_of_f2d == dig].sum() for dig in range(10)])
    return exp, array(range(10))
--------------------------------------------------------------------------------
/benford/reports.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from .constants import MAD_CONFORM
3 |
4 |
5 | def _inform_(df, high_Z, conf):
6 | """Selects and sorts by the Z_stats chosen to be considered, informing or not.
7 | """
8 |
9 | if isinstance(high_Z, int):
10 | if conf is not None:
11 | dd = df[['Expected', 'Found', 'Z_score'
12 | ]].sort_values('Z_score', ascending=False).head(high_Z)
13 | print(f'\nThe entries with the top {high_Z} Z scores are:\n')
14 | # Summation Test
15 | else:
16 | dd = df[['Expected', 'Found', 'AbsDif'
17 | ]].sort_values('AbsDif', ascending=False
18 | ).head(high_Z)
19 | print(f'\nThe entries with the top {high_Z} absolute deviations '
20 | 'are:\n')
21 | else:
22 | if high_Z == 'pos':
23 | m1 = df.Dif > 0
24 | m2 = df.Z_score > conf
25 | dd = df[['Expected', 'Found', 'Z_score'
26 | ]].loc[m1 & m2].sort_values('Z_score', ascending=False)
27 | print('\nThe entries with the significant positive '
28 | 'deviations are:\n')
29 | elif high_Z == 'neg':
30 | m1 = df.Dif < 0
31 | m2 = df.Z_score > conf
32 | dd = df[['Expected', 'Found', 'Z_score'
33 | ]].loc[m1 & m2].sort_values('Z_score', ascending=False)
34 | print('\nThe entries with the significant negative '
35 | 'deviations are:\n')
36 | else:
37 | dd = df[['Expected', 'Found', 'Z_score'
38 | ]].loc[df.Z_score > conf].sort_values('Z_score',
39 | ascending=False)
40 | print('\nThe entries with the significant deviations are:\n')
41 | print(dd)
42 |
43 |
44 | def _report_mad_(digs, MAD):
45 | """Reports the test Mean Absolut Deviation and compares it to critical values
46 | """
47 | print(f'Mean Absolute Deviation: {MAD:.6f}')
48 | if digs != -2:
49 | mads = MAD_CONFORM[digs]
50 | if MAD <= mads[0]:
51 | print(f'MAD <= {mads[0]:.6f}: Close conformity.\n')
52 | elif MAD <= mads[1]:
53 | print(f'{mads[0]:.6f} < MAD <= {mads[1]:.6f}: '
54 | 'Acceptable conformity.\n')
55 | elif MAD <= mads[2]:
56 | print(f'{mads[1]:.6f} < MAD <= {mads[2]:.6f}: '
57 | 'Marginally Acceptable conformity.\n')
58 | else:
59 | print(f'MAD > {mads[2]:.6f}: Nonconformity.\n')
60 | else:
61 | print("There is no conformity check for this test's MAD.\n")
62 |
63 |
64 | def _report_KS_(KS, crit_KS):
65 | """Reports the test Kolmogorov-Smirnov statistic and compares it to critical
66 | values, depending on the confidence level
67 | """
68 | result = 'PASS' if KS <= crit_KS else 'FAIL'
69 | print(f"\n\tKolmogorov-Smirnov: {KS:.6f}",
70 | f"\n\tCritical value: {crit_KS:.6f} -- {result}")
71 |
72 |
73 | def _report_chi2_(chi2, CRIT_CHI2):
74 | """Reports the test Chi-square statistic and compares it to critical values,
75 | depending on the confidence level
76 | """
77 | result = 'PASS' if chi2 <= CRIT_CHI2 else 'FAIL'
78 | print(f"\n\tChi square: {chi2:.6f}",
79 | f"\n\tCritical value: {CRIT_CHI2:.6f} -- {result}")
80 |
81 |
def _report_Z_(df, high_Z, crit_Z):
    """Prints the critical Z score and hands the selection and printing of
    the entries over to _inform_.

    Args:
        df: DataFrame with the test results.
        high_Z: selector of the entries to show (see _inform_).
        crit_Z: critical Z score for the confidence level in use.
    """
    print("\n\tCritical Z-score:{}.".format(crit_Z))
    _inform_(df, high_Z, crit_Z)
88 |
89 |
90 | def _report_summ_(test, high_diff):
91 | """Reports the Summation Test Absolute Differences between the Found and
92 | the Expected proportions
93 |
94 | """
95 | if high_diff is not None:
96 | print(f'\nThe top {high_diff} Absolute Differences are:\n')
97 | print(test.sort_values('AbsDif', ascending=False).head(high_diff))
98 | else:
99 | print('\nThe top Absolute Differences are:\n')
100 | print(test.sort_values('AbsDif', ascending=False))
101 |
102 |
103 | def _report_bhattac_coeff_(bhattac_coeff):
104 | """
105 | """
106 | print(f"Bhattacharyya Coefficient: {bhattac_coeff:6f}\n")
107 |
108 |
109 | def _report_bhattac_dist_(bhattac_dist):
110 | """
111 | """
112 | print(f"Bhattacharyya Distance: {bhattac_dist:6f}\n")
113 |
114 |
115 | def _report_kl_diverg_(kl_diverg):
116 | """
117 | """
118 | print(f"Kullback-Leibler Divergence: {kl_diverg:6f}\n")
119 |
120 |
def _report_test_(test, high=None, crit_vals=None):
    """Main report function. Prints the test banner and calls the reporting
    helper(s) that apply to the Test received.

    Args:
        test: the Test to report. Its `name` is always read; unless the name
            contains 'Summation', `digs`, `MAD`, the Bhattacharyya and
            Kullback-Leibler attributes and `confidence` are read as well.
        high: selector of the entries to show, passed on to the helpers.
        crit_vals: mapping with the 'KS', 'chi2' and 'Z' critical values;
            only read when test.confidence is not None.
    """
    print('\n', f' {test.name} '.center(50, '#'), '\n')

    if 'Summation' in test.name:
        _report_summ_(test, high)
        return

    _report_mad_(test.digs, test.MAD)
    _report_bhattac_coeff_(test.bhattacharyya_coefficient)
    _report_bhattac_dist_(test.bhattacharyya_distance)
    _report_kl_diverg_(test.kullback_leibler_divergence)

    if test.confidence is None:
        print('Confidence is currently `None`. Set the confidence level, '
              'so as to generate comparable critical values.')
        # without critical values, only a top-N ranking can be shown
        if isinstance(high, int):
            _inform_(test, high, None)
        return

    print(f"For confidence level {test.confidence}%: ")
    _report_KS_(test.KS, crit_vals['KS'])
    _report_chi2_(test.chi_square, crit_vals['chi2'])
    _report_Z_(test, high, crit_vals['Z'])
144 |
145 |
146 | def _report_mantissa_(stats, confidence):
147 | """Prints the mantissas statistics and their respective reference values
148 |
149 | Args:
150 | stats (dict):
151 | """
152 | print("\n", ' Mantissas Test '.center(52, '#'))
153 | print(f"\nThe Mantissas MEAN is {stats['Mean']:.6f}."
154 | "\tRef: 0.5")
155 | print(f"The Mantissas VARIANCE is {stats['Var']:.6f}."
156 | "\tRef: 0.08333")
157 | print(f"The Mantissas SKEWNESS is {stats['Skew']:.6f}."
158 | "\tRef: 0.0")
159 | print(f"The Mantissas KURTOSIS is {stats['Kurt']:.6f}."
160 | "\tRef: -1.2")
161 | print("\nThe Kolmogorov-Smirnov statistic for the Mantissas distribution"
162 | f" is {stats['KS']:.6f}.\nThe critical value for the confidence "
163 | f"level of {confidence}% is {stats['KS_critical']:.6f} -- "
164 | f"{'PASS' if stats['KS'] < stats['KS_critical'] else 'FAIL'}\n")
165 |
166 |
167 | def _deprecate_inform_(verbose, inform):
168 | """
169 | Raises:
170 | FutureWarning: if the arg `inform` is used (to be deprecated).
171 | """
172 | if inform is None:
173 | return verbose
174 | else:
175 | warnings.warn('The parameter `inform` will be deprecated in future '
176 | 'versions. Use `verbose` instead.',
177 | FutureWarning)
178 | return inform
179 |
--------------------------------------------------------------------------------
/benford/stats.py:
--------------------------------------------------------------------------------
1 | from numpy import abs as nabs, errstate, linspace, log, sqrt, where
2 | from .constants import CRIT_CHI2, CRIT_KS, MAD_CONFORM, DIGS
3 |
4 |
def Z_score(frame, N):
    """Computes the Z statistics for the proportions studied.

    The 1 / (2N) term is subtracted from the absolute differences before
    they are divided by the standard error sqrt(p * (1 - p) / N).

    Args:
        frame: DataFrame with the Expected proportions and the already
            calculated Absolute Differences (AbsDif) between the found and
            expected proportions.
        N: sample size.

    Returns:
        Series of computed Z scores.
    """
    correction = 1 / (2 * N)
    std_error = sqrt((frame.Expected * (1. - frame.Expected)) / N)
    return (frame.AbsDif - correction) / std_error
18 |
19 |
def chi_sq(frame, ddf, confidence, verbose=True):
    """Computes the chi-square statistic of the found distribution and looks
    up the critical chi-square for the degrees of freedom and confidence
    level given.

    Args:
        frame: DataFrame with the Counts and Expected columns.
        ddf: Degrees of freedom to consider.
        confidence: Confidence level to look up the critical value.
        verbose: prints the chi-square result and the critical chi-square
            for the sample. Defaults to True.

    Returns:
        Tuple with the computed Chi square statistic and the critical chi
        square, for comparison. None if confidence is None.
    """
    if confidence is None:
        print('\nChi-square test needs confidence other than None.')
        return None

    exp_counts = frame.Counts.sum() * frame.Expected
    found_chi = ((frame.Counts - exp_counts) ** 2 / exp_counts).sum()
    crit_chi = CRIT_CHI2[ddf][confidence]
    if verbose:
        print(f"\nThe Chi-square statistic is {found_chi:.4f}.\n"
              f"Critical Chi-square for this series: {crit_chi}.")
    return (found_chi, crit_chi)
49 |
50 |
def chi_sq_2(frame):
    """Computes the chi-square statistic of the found distribution.

    Args:
        frame: DataFrame with the Counts and Expected columns.

    Returns:
        The computed Chi square statistic.
    """
    total = frame.Counts.sum()
    exp_counts = total * frame.Expected
    squared_dif = (frame.Counts - exp_counts) ** 2
    return (squared_dif / exp_counts).sum()
63 |
64 |
def kolmogorov_smirnov(frame, confidence, N, verbose=True):
    """Computes the Kolmogorov-Smirnov statistic of the found distribution
    and the critical value for the sample, according to the confidence
    level chosen.

    Args:
        frame: DataFrame with the Found and Expected distributions.
        confidence: Confidence level to look up the critical value.
        N: Sample size.
        verbose: prints the KS result and the critical value for the sample.
            Defaults to True.

    Returns:
        Tuple with the supremum, which is the greatest absolute difference
        between the Found and the Expected cumulative proportions, and the
        Kolmogorov-Smirnov critical value according to the confidence
        level, for comparison. None if confidence is None.
    """
    if confidence is None:
        print('\nKolmogorov-Smirnov test needs confidence other than None.')
        return None

    # cumulative distributions, in index order
    cumulative = frame.sort_index()[['Found', 'Expected']].cumsum()
    # the supremum is the largest gap between the cumulative distributions
    gaps = (cumulative.Found - cumulative.Expected).abs()
    suprem = gaps.max()
    # the tabulated value is scaled by the square root of the sample size
    crit_KS = CRIT_KS[confidence] / sqrt(N)

    if verbose:
        print(f"\nThe Kolmogorov-Smirnov statistic is {suprem:.4f}.\n"
              f"Critical K-S for this series: {crit_KS:.4f}")
    return (suprem, crit_KS)
97 |
98 |
def kolmogorov_smirnov_2(frame):
    """Computes the Kolmogorov-Smirnov statistic of the found distribution.

    Args:
        frame: DataFrame with the Found and Expected distributions.

    Returns:
        The supremum, which is the greatest absolute difference between the
        Found and the Expected cumulative proportions.
    """
    # cumulative distributions, in index order
    cumulative = frame.sort_index()[['Found', 'Expected']].cumsum()
    # largest gap between the two cumulative distributions
    gaps = (cumulative.Found - cumulative.Expected).abs()
    return gaps.max()
113 |
114 |
115 | def _two_dist_ks_(dist1, dist2, cummulative=True):
116 | """Computes the Kolmogorov-Smirnov statistic between two distributions,
117 | a found one (dist2) and an expected one (dist1).
118 |
119 | Args:
120 | dist1 (np.arrat): array with the expected distribution
121 | dist2 (np.array): array with the found distribution
122 | cummulative (bool): makes apply cummulutative sum to the
123 | distributions (empirical cdf).
124 |
125 | Returns:
126 | tuple(floats): the KS statistic
127 | """
128 | dist2.sort(); dist1.sort()
129 | if not cummulative:
130 | return nabs(dist2 - dist1).max()
131 | return nabs(dist2.cumsum() - dist1.cumsum()).max()
132 |
133 |
def _mantissas_ks_(mant_dist, confidence, sample_size):
    """Computes the Kolmogorov-Smirnov statistic for the Mantissas, also
    providing the KS critical value according to the sample size and
    confidence level provided.

    Args:
        mant_dist (np.array): array with the mantissas distribution found
        confidence (float, int): level of confidence to compute the critical
            value; a falsy value (e.g. None) yields no critical value.
        sample_size (int): sample size used to scale the critical value.

    Returns:
        tuple: the KS statistic and the critical value (None when
            confidence is falsy).
    """
    if confidence:
        crit_ks = CRIT_KS[confidence] * sqrt(
            2 * sample_size / sample_size ** 2)
    else:
        crit_ks = None
    # non-cumulative comparison against evenly spaced values in [0, 1)
    uniform = linspace(0, 1, len(mant_dist), endpoint=False)
    return _two_dist_ks_(uniform, mant_dist, cummulative=False), crit_ks
153 |
154 |
def mad(frame, test, verbose=True):
    """Computes the Mean Absolute Deviation (MAD) between the found and the
    expected proportions.

    Args:
        frame: DataFrame with the Absolute Deviations (AbsDif) already
            calculated.
        test: int code of the Test to compute the MAD from. -2 (Last Two
            Digits) has no conformity limits to show.
        verbose: prints the MAD result and, except for test -2, the limit
            values of conformity. Defaults to True.

    Returns:
        The Mean of the Absolute Deviations between the found and expected
            proportions.
    """
    mad_value = frame.AbsDif.mean()

    if verbose:
        print(f"\nThe Mean Absolute Deviation is {mad_value}")

        if test != -2:
            limits = MAD_CONFORM[test]
            # adjacent literals instead of backslash continuations, so the
            # printed text does not depend on this file's indentation
            print(f"For the {MAD_CONFORM[DIGS[test]]}:\n"
                  f"- 0.0000 to {limits[0]}: Close Conformity\n"
                  f"- {limits[0]} to {limits[1]}: Acceptable Conformity\n"
                  f"- {limits[1]} to {limits[2]}: "
                  "Marginally Acceptable Conformity\n"
                  f"- Above {limits[2]}: Nonconformity")
    return mad_value
183 |
184 |
def mse(frame, verbose=True):
    """Computes the test's Mean Square Error.

    Args:
        frame: DataFrame with the already computed Absolute Deviations
            (AbsDif) between the found and expected proportions.
        verbose: Prints the MSE. Defaults to True.

    Returns:
        Mean of the squared differences between the found and the expected
            proportions.
    """
    squared = frame.AbsDif ** 2
    mean_sq_err = squared.mean()
    if verbose:
        print(f"\nMean Square Error = {mean_sq_err}")
    return mean_sq_err
202 |
203 | def _bhattacharyya_coefficient(dist_1, dist_2):
204 | """Computes the Bhattacharyya Coeficient between two probability
205 | distributions, to be letar used to compute the Bhattacharyya Distance
206 |
207 | Args:
208 | dist_1 (np.array): The newly gathered distribution, to be compared
209 | with an older / established distribution.
210 | dist_2 (np.array): The older/ establhished distribution with which
211 | the new one will be compared.
212 |
213 | Returns:
214 | bhat_coef (float)
215 | """
216 | return sqrt(dist_1 * dist_2).sum()
217 |
218 |
def _bhattacharyya_distance_(dist_1, dist_2):
    """Computes the Bhattacharyya Distance between two probability
    distributions: the negative natural log of their Bhattacharyya
    Coefficient.

    Args:
        dist_1 (np.array): The newly gathered distribution, to be compared
            with an older / established distribution.
        dist_2 (np.array): The older / established distribution with which
            the new one will be compared.

    Returns:
        bhat_dist (float)
    """
    # a zero coefficient makes log() divide by zero; silence that warning
    with errstate(divide='ignore'):
        return -log(_bhattacharyya_coefficient(dist_1, dist_2))
235 |
236 |
237 | def _kullback_leibler_divergence_(dist_1, dist_2):
238 | """Computes the Kullback-Leibler Divergence between two probability
239 | distributions.
240 |
241 | Args:
242 | dist_1 (np.array): The newly gathered distribution, to be compared
243 | with an older / established distribution.
244 | dist_2 (np.array): The older/ establhished distribution with which
245 | the new one will be compared.
246 |
247 | Returns:
248 | kulb_leib_diverg (float)
249 | """
250 | # ignore divide by zero warning in np.where
251 | with errstate(divide='ignore'):
252 | kl_d = (log((dist_1 / dist_2), where=(dist_1 != 0)) * dist_1).sum()
253 | return kl_d
254 |
--------------------------------------------------------------------------------
/benford/utils.py:
--------------------------------------------------------------------------------
1 | from pandas import Series, DataFrame
2 | from numpy import array, arange, log10, ndarray
3 | from .expected import _get_expected_digits_
4 | from .constants import DIGS, REV_DIGS
5 | from .stats import Z_score
6 | from .checks import _check_num_array_, _check_sign_, _check_decimals_
7 |
8 |
9 | def _set_N_(len_df, limit_N):
10 | """"""
11 | # Assigning to N the superior limit or the lenght of the series
12 | if limit_N is None or limit_N > len_df:
13 | return max(1, len_df)
14 | # Check on limit_N being a positive integer
15 | else:
16 | if limit_N < 0 or not isinstance(limit_N, int):
17 | raise ValueError("limit_N must be None or a positive integer.")
18 | else:
19 | return max(1, limit_N)
20 |
21 |
def get_mantissas(arr):
    """Computes the mantissas, the fractional part of the base-10 log of
    the absolute value of each number.

    The mantissa is log10(|x|) - floor(log10(|x|)), so numbers that differ
    only by a power of ten (e.g. 0.5, 5 and 50) share the same mantissa.

    Args:
        arr: array of integers or floats

    Returns:
        Array of floats with the logs mantissas
    """
    # plain sequences (e.g. lists) do not support abs(); arrays and Series do
    values = arr if hasattr(arr, '__abs__') else array(arr)
    log_a = log10(abs(values))
    # Modulo 1 keeps the fractional part counted from the floor. Taking
    # abs() of the log instead mirrored the mantissas of numbers in (0, 1).
    return log_a % 1
33 |
34 |
def input_data(given):
    """Internalizes and transforms the input data

    Args:
        given: ndarray, Series or tuple with DataFrame and name of the
            column to analyze. Subclasses of these types are accepted.

    Returns:
        The raw inputted data and the result of its first pre-processing,
            when required: the Series itself (twice), the ndarray and a
            Series built from it, or the DataFrame and its chosen column.

    Raises:
        TypeError: if `given` is of none of the accepted types, or is a
            tuple not made of a DataFrame followed by a column name (str).
    """
    if isinstance(given, Series):
        return given, given
    if isinstance(given, ndarray):
        return given, Series(given)
    if isinstance(given, tuple):
        # also rejects tuples that are too short, which used to surface as
        # an IndexError
        if (len(given) < 2 or not isinstance(given[0], DataFrame)
                or not isinstance(given[1], str)):
            raise TypeError('The data tuple must be composed of a pandas '
                            'DataFrame and the name (str) of the chosen '
                            'column, in that order')
        return given[0], given[0][given[1]]
    raise TypeError("Wrong data input type. Check docstring.")
61 |
62 |
def set_sign(data, sign="all"):
    """Keeps only the `seq` entries with the sign chosen, always discarding
    zeros.

    The filtered values are assigned back to the `seq` column of the frame
    received, and the rows left with missing values are dropped.

    Args:
        data: DataFrame with a `seq` column.
        sign: 'all' keeps every non-zero entry, 'pos' keeps the positive
            ones; the remaining value accepted by _check_sign_ keeps the
            negative ones. Defaults to 'all'.

    Returns:
        The DataFrame without the rows that hold missing values.
    """
    sign = _check_sign_(sign)

    if sign == 'all':
        keep = data.seq != 0
    elif sign == 'pos':
        keep = data.seq > 0
    else:
        keep = data.seq < 0
    data.seq = data.seq.loc[keep]

    return data.dropna()
76 |
77 |
def get_times_10_power(data, decimals=2):
    """Adds the `ZN` column, with integers from which the digits are to be
    extracted, to the frame received.

    Args:
        data: DataFrame with a `seq` column.
        decimals: number of decimal places to account for, so the absolute
            values are multiplied by 10 ** decimals before being truncated
            to int; or 'infer', which strips the decimal point and the
            leading zeros from the text form of each value and keeps up to
            its first five characters. Not used when `seq` has the 'int'
            dtype. Defaults to 2.

    Returns:
        The same DataFrame, with the `ZN` column set.
    """
    decimals = _check_decimals_(decimals)

    ab = data.seq.abs()

    if data.seq.dtype == 'int':
        # integers are used as they are
        data['ZN'] = ab
        return data

    if decimals == 'infer':
        as_text = ab.astype(str)
        no_point = as_text.str.replace('.', '', regex=False)
        significant = no_point.str.lstrip('0')
        data['ZN'] = significant.str[:5].astype(int)
    else:
        data['ZN'] = (ab * (10 ** decimals)).astype(int)
    return data
95 |
96 |
def get_digs(data, decimals=2, sign="all"):
    """Builds a DataFrame with the number sequence and the digits extracted
    from it: first (F1D), first two (F2D), first three (F3D), second (SD)
    and last two (L2D).

    Entries without enough digits for a given column get -1 in it, a
    non-usable value for digits, to be discarded later.

    Args:
        data: number sequence, checked by _check_num_array_.
        decimals: passed on to get_times_10_power. Defaults to 2.
        sign: passed on to set_sign. Defaults to 'all'.

    Returns:
        DataFrame with the `seq`, `ZN`, `F1D`, `F2D`, `F3D`, `SD` and `L2D`
        columns.
    """
    df = DataFrame({'seq': _check_num_array_(data)})
    df = set_sign(df, sign=sign)
    df = get_times_10_power(df, decimals=decimals)

    # First digits
    for col in ['F1D', 'F2D', 'F3D']:
        n_digs = REV_DIGS[col]
        # only the numbers with at least n_digs digits
        temp = df.ZN.loc[df.ZN >= 10 ** (n_digs - 1)]
        shift = (log10(temp).astype(int)) - (n_digs - 1)
        df[col] = (temp // 10 ** shift)
        # fill NANs with -1, which is a non-usable value for digits,
        # to be discarded later.
        df[col] = df[col].fillna(-1).astype(int)

    # Second digit
    two_or_more = df.loc[df.ZN >= 10]
    first_two = two_or_more.ZN // 10**((log10(two_or_more.ZN)).astype(int) - 1)
    df['SD'] = first_two % 10
    df['SD'] = df['SD'].fillna(-1).astype(int)

    # Last two digits
    four_or_more = df.loc[df.ZN >= 1000]
    df['L2D'] = four_or_more.ZN % 100
    df['L2D'] = df['L2D'].fillna(-1).astype(int)
    return df
124 |
125 |
def get_found_proportions(data):
    """Counts the occurrences of each value and computes their relative
    frequencies.

    Args:
        data: Series with the values (digits) found.

    Returns:
        DataFrame with the `Counts` and `Found` (proportions) columns,
        sorted by its index, which holds the distinct values.
    """
    found = {
        'Counts': data.value_counts(),
        # relative frequencies
        'Found': data.value_counts(normalize=True),
    }
    return DataFrame(found).sort_index()
134 |
135 |
def join_expect_found_diff(data, digs):
    """Joins the found proportions to the expected ones and computes their
    differences.

    Args:
        data: DataFrame with the found data, holding a `Found` column.
        digs: int code of the test, used to pick the Expected proportions.

    Returns:
        DataFrame indexed as the Expected one, with missing values filled
        with 0 and the added `Dif` (Found - Expected) and `AbsDif` columns.
    """
    expected = _get_expected_digits_(digs)
    dd = expected.join(data).fillna(0)
    # signed and absolute differences
    dd['Dif'] = dd.Found - dd.Expected
    dd['AbsDif'] = dd.Dif.abs()
    return dd
144 |
145 |
def prepare(data, digs, limit_N=None, simple=False):
    """Transforms the original number sequence into a DataFrame reduced
    by the occurrences of the chosen digits, creating other computed
    columns.

    Args:
        data: Series with the digits found.
        digs: int code of the test.
        limit_N: superior limit for the sample size used in the Z scores;
            only used when simple is False. Defaults to None.
        simple: when True, drops the `Dif` column and skips the Z scores.
            Defaults to False.

    Returns:
        The DataFrame alone when simple is True; otherwise a tuple with the
        sample size N and the DataFrame, with its `Z_score` column.
    """
    dd = join_expect_found_diff(get_found_proportions(data), digs)
    if simple:
        del dd['Dif']
        return dd

    N = _set_N_(len(data), limit_N=limit_N)
    dd['Z_score'] = Z_score(dd, N)
    return N, dd
160 |
161 |
def subtract_sorted(data):
    """Sorts the sequence and subtracts each element from the following
    one, discarding the zero differences. Used in the Second Order test.

    Args:
        data: Series with the number sequence; it is not modified.

    Returns:
        Series with the non-zero differences between consecutive sorted
        elements.
    """
    ordered = data.copy().sort_values(ignore_index=True)
    # the first element has no predecessor, so its NaN is dropped
    gaps = (ordered - ordered.shift(1)).dropna()
    return gaps.loc[gaps != 0]
169 |
170 |
def prep_to_roll(start, test):
    """Used by the rolling mad and rolling mean, prepares each test and
    respective expected proportions for later application to the Series
    subset.

    A column named after the test (DIGS[test]) is added to `start`, holding
    the digits extracted from its `ZN` column.

    Args:
        start: DataFrame with a `ZN` column; it is modified in place.
        test: int code of the test: 1, 2 or 3 for the first digits, 22 for
            the second digit; any other value is treated as the last two
            digits.

    Returns:
        Tuple with the array of expected proportions and the array of the
        digits they refer to.
    """
    # NOTE(review): this function used to end each branch by filtering
    # `start` into a local name that was never read again, so the frame
    # received was never reduced here. Those no-op statements were removed;
    # callers presumably filter rows with too few digits - confirm.
    if test in [1, 2, 3]:
        start[DIGS[test]] = start.ZN // 10 ** ((
            log10(start.ZN).astype(int)) - (test - 1))

        ind = arange(10 ** (test - 1), 10 ** test)
        Exp = log10(1 + (1. / ind))

    elif test == 22:
        start[DIGS[test]] = (start.ZN // 10 ** ((
            log10(start.ZN)).astype(int) - 1)) % 10

        # second digit probabilities: sum of the first-two-digits ones
        # sharing each second digit
        Expec = log10(1 + (1. / arange(10, 100)))
        temp = DataFrame({'Expected': Expec, 'Sec_Dig':
                          array(list(range(10)) * 9)})
        Exp = temp.groupby('Sec_Dig').sum().values.reshape(10,)
        ind = arange(0, 10)

    else:
        start[DIGS[test]] = start.ZN % 100

        ind = arange(0, 100)
        # uniform over the 100 possible endings (previously 1/99, which
        # made the probabilities add up to more than 1)
        Exp = array([1 / 100.] * 100)

    return Exp, ind
202 |
203 |
def mad_to_roll(arr, Exp, ind):
    """Mean Absolute Deviation used in the rolling function.

    Args:
        arr: Series with the digits found in the subset.
        Exp: array with the expected proportions.
        ind: digits to which the expected proportions refer; used to fill
            the digits missing from the subset with a zero proportion.

    Returns:
        Mean of the absolute differences between the found and the expected
        proportions.
    """
    found = arr.value_counts(normalize=True).sort_index()
    # digits absent from the subset enter with a zero proportion
    if len(found) < len(Exp):
        found = found.reindex(ind).fillna(0)
    deviations = abs(found - Exp)
    return deviations.mean()
213 |
214 |
def mse_to_roll(arr, Exp, ind):
    """Mean Squared Error used in the rolling function.

    Args:
        arr: Series with the digits found in the subset.
        Exp: array with the expected proportions.
        ind: digits to which the expected proportions refer; used to fill
            the digits missing from the subset with a zero proportion.

    Returns:
        Mean of the squared differences between the found and the expected
        proportions.
    """
    found = arr.value_counts(normalize=True).sort_index()
    # digits absent from the subset enter with a zero proportion
    if len(found) < len(Exp):
        found = found.reindex(ind).fillna(0)
    squared = (found - Exp) ** 2
    return squared.mean()
224 |
--------------------------------------------------------------------------------
/benford/viz.py:
--------------------------------------------------------------------------------
1 | from numpy import array, arange, maximum, sqrt, ones
2 | import matplotlib.pyplot as plt
3 | from matplotlib.text import Annotation
4 | from .constants import COLORS, MAD_CONFORM
5 |
6 |
def plot_expected(df, digs, save_plot=None, save_plot_kwargs=None):
    """Plots the Expected Benford Distributions as a bar chart.

    Args:
        df: DataFrame with the Expected Proportions
        digs: Test's digit code: 1, 2 or 3 for the first digits, 22 for the
            second digit and -2 for the last two digits.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
    """
    # vertical limit (in %) and figure size depend on the test
    if digs in [1, 2, 3]:
        side = digs ** 2 + 5
        y_max = (df.Expected.max() + (10 ** -(digs) / 3)) * 100
        figsize = 2 * side, 1.5 * side
    elif digs == 22:
        y_max, figsize = 13., (14, 10.5)
    elif digs == -2:
        y_max, figsize = 1.1, (15, 8)

    _, ax = plt.subplots(figsize=figsize)
    plt.title('Expected Benford Distributions', size='xx-large')
    plt.xlabel(df.index.name, size='x-large')
    plt.ylabel('Distribution (%)', size='x-large')
    ax.set_facecolor(COLORS['b'])
    ax.set_ylim(0, y_max)
    ax.bar(df.index, df.Expected * 100, color=COLORS['t'], align='center')
    ax.set_xticks(df.index)
    ax.set_xticklabels(df.index)

    if save_plot:
        plt.savefig(save_plot, **(save_plot_kwargs or {}))

    plt.show(block=False)
45 |
46 |
47 | def _get_plot_args(digs):
48 | """Selects the correct arguments for the plotting functions, depending on the
49 | the test (digs) chosen.
50 | """
51 | if digs in [1, 2, 3]:
52 | text_x = False
53 | n, m = 10 ** (digs - 1), 10 ** (digs)
54 | x = arange(n, m)
55 | figsize = (2 * (digs ** 2 + 5), 1.5 * (digs ** 2 + 5))
56 | elif digs == 22:
57 | text_x = False
58 | x = arange(10)
59 | figsize = (14, 10)
60 | else:
61 | text_x = True
62 | x = arange(100)
63 | figsize = (15, 7)
64 | return x, figsize, text_x
65 |
def plot_digs(df, x, y_Exp, y_Found, N, figsize, conf_Z, text_x=False,
              save_plot=None, save_plot_kwargs=None):
    """Plots the digits tests results: bars for the found proportions, a
    line for the expected ones and, when conf_Z is given, the confidence
    band around the expected proportions.

    Args:
        df: DataFrame with the data to be plotted
        x: sequence to be used in the x axis
        y_Exp: sequence of the expected proportions to be used in the y axis
            (line)
        y_Found: sequence of the found proportions to be used in the y axis
            (bars)
        N: length of sequence, to be used when plotting the confidence levels
        figsize: tuple to state the size of the plot figure
        conf_Z: critical Z score for the confidence band; None draws no
            band.
        text_x: replaces the x tick labels by the DataFrame index as text,
            drawn vertically, with its first ten labels shown as '00' to
            '09'. Defaults to False.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
    """
    # many ticks only fit when their labels are turned sideways
    rotation = 90 if len(x) > 10 else 0

    _, ax = plt.subplots(figsize=figsize)
    plt.title('Expected vs. Found Distributions', size='xx-large')
    plt.xlabel('Digits', size='x-large')
    plt.ylabel('Distribution (%)', size='x-large')

    if conf_Z is None:
        c = COLORS['m']
    else:
        sig = conf_Z * sqrt(y_Exp * (1 - y_Exp) / N)
        upper = y_Exp + sig + (1 / (2 * N))
        # the lower limit is floored at zero
        lower = maximum(y_Exp - sig - (1 / (2 * N)),
                        array([0]*len(upper)))
        # bars outside the band get a different color
        u = (y_Found < lower) | (y_Found > upper)
        c = array([COLORS['m']] * len(u))
        c[u] = COLORS['af']
        # proportions to percentages
        lower *= 100.
        upper *= 100.
        ax.plot(x, upper, color=COLORS['s'], zorder=5)
        ax.plot(x, lower, color=COLORS['s'], zorder=5)
        ax.fill_between(x, upper, lower, color=COLORS['s'],
                        alpha=.3, label='Conf')

    ax.bar(x, y_Found * 100., color=c, label='Found', zorder=3, align='center')
    ax.plot(x, y_Exp * 100., color=COLORS['s'], linewidth=2.5,
            label='Benford', zorder=4)
    ax.set_xticks(x)
    ax.set_xticklabels(x, rotation=rotation)
    ax.set_facecolor(COLORS['b'])
    if text_x:
        ind = array(df.index).astype(str)
        ind[:10] = array(['00', '01', '02', '03', '04', '05',
                          '06', '07', '08', '09'])
        plt.xticks(x, ind, rotation='vertical')
    ax.legend()
    tallest = max([y_Exp.max() * 100, y_Found.max() * 100])
    ax.set_ylim(0, tallest + 10 / len(x))
    ax.set_xlim(x[0] - 1, x[-1] + 1)

    if save_plot:
        plt.savefig(save_plot, **(save_plot_kwargs or {}))

    plt.show(block=False)
135 |
136 |
def plot_sum(df, figsize, li, text_x=False, save_plot=None, save_plot_kwargs=None):
    """Draws the Summation test bars against the expected level.

    Args:
        df: DataFrame to plot. Its index supplies the x positions and its
            ``Percent`` column supplies the bar heights.
        figsize: sets the dimensions of the plot figure.
        li: y value at which the horizontal 'Expected' line is drawn.
        text_x: when truthy, the x tick labels are replaced by the index
            converted to strings, with the first ten entries overwritten
            by '00' through '09', and drawn vertically.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
    """
    digits = df.index
    # Crowded axes get their tick labels turned sideways.
    tick_rotation = 0
    if len(digits) > 10:
        tick_rotation = 90

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    plt.title('Expected vs. Found Sums')
    plt.xlabel('Digits')
    plt.ylabel('Sums')

    ax.bar(digits, df.Percent, color=COLORS['m'],
           label='Found Sums', zorder=3, align='center')
    ax.set_xlim(digits[0] - 1, digits[-1] + 1)
    ax.axhline(li, color=COLORS['s'], linewidth=2, label='Expected', zorder=4)
    ax.set_xticks(digits)
    ax.set_xticklabels(digits, rotation=tick_rotation)
    ax.set_facecolor(COLORS['b'])

    if text_x:
        tick_text = array(digits).astype(str)
        # Single digits are shown zero-padded to two characters.
        tick_text[:10] = array(['00', '01', '02', '03', '04',
                                '05', '06', '07', '08', '09'])
        plt.xticks(digits, tick_text, rotation='vertical')
    ax.legend()

    if save_plot:
        plt.savefig(save_plot, **(save_plot_kwargs or {}))

    plt.show(block=False)
178 |
def plot_ordered_mantissas(col, figsize=(12, 12),
                           save_plot=None, save_plot_kwargs=None):
    """Plots the sorted mantissas next to the straight line they are
    expected to form in a Benford-compliant set.

    Args:
        col (Series): column of mantissas to plot.
        figsize (tuple): sets the dimensions of the plot figure.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html

    """
    size = len(col)
    positions = arange(1, size + 1)
    # Equal steps of 1/size accumulate into the reference line ending at 1.
    step = ones(size) / size

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    ax.plot(positions, col.sort_values(), linestyle='--',
            color=COLORS['s'], linewidth=3, label='Mantissas')
    ax.plot(positions, step.cumsum(), color=COLORS['m'],
            linewidth=2, label='Expected')
    plt.ylim((0, 1.))
    plt.xlim((1, size + 1))
    ax.set_facecolor(COLORS['b'])
    ax.set_title("Ordered Mantissas")
    plt.legend(loc='upper left')

    if save_plot:
        plt.savefig(save_plot, **(save_plot_kwargs or {}))

    plt.show(block=False)
216 |
def plot_mantissa_arc_test(df, gravity_center, grid=True, figsize=12,
                           save_plot=None, save_plot_kwargs=None):
    """Draws the Mantissa Arc Test from the X and Y circular coordinates of
    every mantissa, together with the center of gravity of the set.

    Args:
        df (DataFrame): pandas DataFrame providing the ``mant_x`` and
            ``mant_y`` columns with the coordinates to scatter.
        gravity_center (tuple): (x, y) coordinates for plotting the gravity
            center.
        grid (bool): show grid. Defaults to True.
        figsize (int): figure dimensions. No need to be a tuple, since the
            figure is a square.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
    """
    center_x, center_y = gravity_center[0], gravity_center[1]

    plt.figure(figsize=(figsize, figsize))
    ax = plt.subplot()
    ax.set_facecolor(COLORS['b'])
    ax.scatter(df.mant_x, df.mant_y, label="ARC TEST",
               color=COLORS['m'])
    ax.scatter(center_x, center_y,
               color=COLORS['s'])
    # The label is offset from the point so it does not cover the marker.
    text_annotation = Annotation(
        "  Gravity Center: "
        f"x({round(center_x, 3)}),"
        f" y({round(center_y, 3)})",
        xy=(center_x - 0.65,
            center_y - 0.1),
        xycoords='data')
    ax.add_artist(text_annotation)
    # Honor the caller's choice; this used to be hard-coded to True, which
    # made the ``grid`` argument a no-op.
    ax.grid(grid, which='both')
    ax.axhline(y=0, color='k')
    ax.axvline(x=0, color='k')
    ax.legend(loc='lower left')
    ax.set_title("Mantissas Arc Test")

    if save_plot:
        if not save_plot_kwargs:
            save_plot_kwargs = {}
        plt.savefig(save_plot, **save_plot_kwargs)

    plt.show(block=False)
263 |
def plot_roll_mse(roll_series, figsize, save_plot=None, save_plot_kwargs=None):
    """Draws the rolling MSE as a single line plot.

    Args:
        roll_series: pd.Series resulting from the rolling MSE.
        figsize: the figure dimensions.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
    """
    _, axes = plt.subplots(figsize=figsize)
    axes.set_facecolor(COLORS['b'])
    axes.plot(roll_series, color=COLORS['m'])

    if save_plot:
        plt.savefig(save_plot, **(save_plot_kwargs or {}))

    plt.show(block=False)
287 |
def plot_roll_mad(roll_mad, figsize, save_plot=None, save_plot_kwargs=None):
    """Draws the rolling MAD line, plus three horizontal reference lines
    taken from MAD_CONFORM when they apply.

    Args:
        roll_mad: object resulting from the rolling MAD. Its ``roll_series``
            attribute is the data plotted and its ``test`` attribute selects
            the MAD_CONFORM entry; the reference lines are skipped when
            ``test`` equals -2.
        figsize: the figure dimensions.
        save_plot: string with the path/name of the file in which the
            generated plot will be saved. Uses matplotlib.pyplot.savefig().
            File format is inferred from the file name extension.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
    """
    _, axes = plt.subplots(figsize=figsize)
    axes.set_facecolor(COLORS['b'])
    axes.plot(roll_mad.roll_series, color=COLORS['m'])

    if roll_mad.test != -2:
        limits = MAD_CONFORM[roll_mad.test]
        # The first three limits are drawn, each in its own color.
        for position, color_key in enumerate(('af', 'h2', 's')):
            plt.axhline(y=limits[position], color=COLORS[color_key],
                        linewidth=3)

    if save_plot:
        plt.savefig(save_plot, **(save_plot_kwargs or {}))

    plt.show(block=False)
316 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/build/doctrees/api.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/api.doctree
--------------------------------------------------------------------------------
/docs/build/doctrees/benford.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/benford.doctree
--------------------------------------------------------------------------------
/docs/build/doctrees/environment.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/environment.pickle
--------------------------------------------------------------------------------
/docs/build/doctrees/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/index.doctree
--------------------------------------------------------------------------------
/docs/build/doctrees/modules.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/milcent/benford_py/0126c606ae9c27cba43e6dc83b73bb329f839ae4/docs/build/doctrees/modules.doctree
--------------------------------------------------------------------------------
/docs/build/html/.buildinfo:
--------------------------------------------------------------------------------
1 | # Sphinx build info version 1
2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3 | config: 1ab0e725c448968d4851f0b695542647
4 | tags: 645f666f9bcd5a90fca523b33c5a78b7
5 |
--------------------------------------------------------------------------------
/docs/build/html/_modules/benford/expected.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | benford.expected — benford_py 0.3.3 documentation
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
class First(DataFrame):
    """Holds the expected probabilities of the First, First Two, or
    First Three digits according to Benford's distribution.

    Args:
        digs: 1, 2 or 3 - tells which of the first digits to consider:
            1 for the First Digit, 2 for the First Two Digits and 3 for
            the First Three Digits.
        plot: option to plot a bar chart of the Expected proportions.
            Defaults to True.
        save_plot: string with the path/name of the file in which the generated
            plot will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. Only available when
            plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when plot=True and save_plot is a string with the
            figure file path/name.
    """

    def __init__(self, digs, plot=True, save_plot=None, save_plot_kwargs=None):
        # NOTE(review): _check_digs_ is defined elsewhere; presumably it
        # rejects unsupported values of digs - confirm.
        _check_digs_(digs)
        dig_name = f'First_{digs}_Dig'
        exp_array, dig_array = _gen_first_digits_(digs)

        # The digits become the index, which is then named after the test.
        DataFrame.__init__(self, {'Expected': exp_array}, index=dig_array)
        self.index.names = [dig_name]

        if plot:
            plot_expected(self, digs, save_plot=save_plot,
                          save_plot_kwargs=save_plot_kwargs)
192 |
193 |
194 |
class Second(DataFrame):
    """Holds the expected probabilities of the Second Digits
    according to Benford's distribution.

    Args:
        plot: option to plot a bar chart of the Expected proportions.
            Defaults to True.
        save_plot: string with the path/name of the file in which the generated
            plot will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. Only available when
            plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when plot=True and save_plot is a string with the
            figure file path/name.
    """

    def __init__(self, plot=True, save_plot=None, save_plot_kwargs=None):

        exp, sec_digs = _gen_second_digits_()

        DataFrame.__init__(self, {'Expected': exp, 'Sec_Dig': sec_digs})
        self.set_index("Sec_Dig", inplace=True)

        if plot:
            # 22 is the code passed to plot_expected for the second digit.
            plot_expected(self, 22, save_plot=save_plot,
                          save_plot_kwargs=save_plot_kwargs)
221 |
222 |
223 |
class LastTwo(DataFrame):
    """Holds the expected probabilities of the Last Two Digits
    according to Benford's distribution.

    Args:
        num: forwarded to _gen_last_two_digits_; when True the index holds
            ints, otherwise strings. Defaults to False.
        plot: option to plot a bar chart of the Expected proportions.
            Defaults to True.
        save_plot: string with the path/name of the file in which the generated
            plot will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. Only available when
            plot=True.
        save_plot_kwargs: dict with any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when plot=True and save_plot is a string with the
            figure file path/name.
    """

    def __init__(self, num=False, plot=True, save_plot=None, save_plot_kwargs=None):
        exp, l2d = _gen_last_two_digits_(num=num)
        DataFrame.__init__(self, {'Expected': exp,
                                  'Last_2_Dig': l2d})
        self.set_index('Last_2_Dig', inplace=True)
        if plot:
            # -2 is the code passed to plot_expected for the last two digits.
            plot_expected(self, -2, save_plot=save_plot,
                          save_plot_kwargs=save_plot_kwargs)
248 |
249 |
def _get_expected_digits_(digs):
    """Chooses the Expected class to be used in a test

    Args:
        digs: the int corresponding to the Expected class to be instantiated

    Returns:
        the Expected instance for the proper test to be performed, always
            built with plot=False: First for 1, 2 or 3, Second for 22 and
            LastTwo (numeric index) for any other value.
    """
    if digs in [1, 2, 3]:
        return First(digs, plot=False)
    elif digs == 22:
        return Second(plot=False)
    else:
        return LastTwo(num=True, plot=False)
265 |
266 |
267 | def_gen_last_two_digits_(num=False):
268 | """Creates two arrays, one with the possible last two digits and one with
269 | thei respective probabilities
270 |
271 | Args:
272 | num: returns numeric (ints) values. Defaluts to False,
273 | which returns strings.
274 |
275 | Returns:
276 | exp (np.array): Array with the (constant) probabilities of occurrence of
277 | each pair of last two digits
278 | l2d (np.array): Array of ints or str, in any case representing all 100
279 | possible combinations of last two digits
280 | """
281 | exp=array([1/99.]*100)
282 | l2d=arange(0,100)
283 | ifnum:
284 | returnexp,l2d
285 | l2d=l2d.astype(str)
286 | l2d[:10]=array(['00','01','02','03','04','05',
287 | '06','07','08','09'])
288 | returnexp,l2d
289 |
290 | def_gen_first_digits_(digs):
291 | """Creates two arrays, one with the possible digits combinations and the
292 | other with their respective expected probabilities according to Benford
293 |
294 | Args:
295 | digs (int): 1, 2 or 3, for generation of the first, first two, or first
296 | three digits
297 |
298 | Returns:
299 | (tuple of arrays): the expected probabilities array and the digits
300 | combination array.
301 | """
302 | dig_array=arange(10**(digs-1),10**digs)
303 | exp_prob=log10(1+(1./dig_array))
304 | returnexp_prob,dig_array
305 |
def _gen_second_digits_():
    """Creates two arrays, one with the possible second digits and the other
    with their respective expected probabilities according to Benford

    Returns:
        (tuple of arrays): the expected probabilities array and the second
            digits array.
    """
    exp_f2d, _ = _gen_first_digits_(2)
    sec_digs = range(10)
    # Second digit of each first-two-digits value 10..99: 0-9 repeated
    # once per leading digit (nine times).
    sec_digs_in_f2d = array(list(range(10)) * 9)
    # A second digit's probability is the sum over every first-two-digits
    # combination that ends in it.
    exp = array([exp_f2d[sec_digs_in_f2d == i].sum() for i in sec_digs])
    return exp, array(sec_digs)
319 |
157 | A
158 | | B
159 | | C
160 | | D
161 | | F
162 | | K
163 | | L
164 | | M
165 | | N
166 | | P
167 | | R
168 | | S
169 | | T
170 | | U
171 | | V
172 | | Z
173 |
174 |